[ngrams, tests] understanding ngramsByDoc better

parent 35c2d0b0
Pipeline #7172 failed in 14 minutes and 48 seconds
@@ -45,7 +45,7 @@ importCLI (ImportArgs fun user name settingsPath limit corpusPath) = do
     format = TsvGargV3
     corpus :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
     mkCorpusUser = MkCorpusUserNormalCorpusName (UserName $ cs user) (cs name :: Text)
-    corpus = flowCorpusFile mkCorpusUser limit tt format Plain corpusPath Nothing DevJobHandle
+    corpus = flowCorpusFile mkCorpusUser tt format Plain corpusPath Nothing DevJobHandle
     corpusTsvHal :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
     corpusTsvHal = flowCorpusFile mkCorpusUser limit tt TsvHal Plain corpusPath Nothing DevJobHandle
@@ -96,18 +96,14 @@ termsInText lang pats (manipulateText lang -> txt) =
 -- | Manipulates the input 'Text' before passing it to 'termsInText'.
 -- In particular, if the language is Chinese (ZH), we add spaces.
 manipulateText :: Lang -> Text -> Text
-manipulateText lang txt = case lang of
-  ZH -> addSpaces txt
-  _  -> txt
+manipulateText ZH txt = addSpaces txt
+manipulateText _  txt = txt

 --------------------------------------------------------------------------
 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
 extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence

 -- | Extract terms
--- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
--- extractTermsWithList' (buildPatterns termList) "Le chat blanc"
--- ["chat blanc"]
 extractTermsWithList' :: Patterns -> Text -> [Text]
 extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
                            . monoTextsBySentence
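The doctest removed above is not lost: it reappears as testExtractTermsWithList'01 in the test-suite hunk further down. A GHCi-style sketch of the same behaviour, assuming TermList, buildPatterns and extractTermsWithList' are in scope as in that test:

    >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
    >>> extractTermsWithList' (buildPatterns termList) "Le chat blanc"
    ["chat blanc"]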
@@ -40,6 +40,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
   , addDocumentsToHyperCorpus
   , reIndexWith
+  , ngramsByDoc
   , getOrMkRoot
   , getOrMkRootWithCorpus
@@ -81,15 +82,13 @@ import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
 import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms), Ngrams(_ngramsTerms))
 import Gargantext.Core.Text.Terms
 import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
-import Gargantext.Core.Text.Terms.WithList (MatchedText)
 import Gargantext.Core.Types (HasValidationError, TermsCount)
 import Gargantext.Core.Types.Individu (User(..))
 import Gargantext.Core.Types.Main ( ListType(MapTerm) )
-import Gargantext.Core.Types.Query (Limit)
 import Gargantext.Database.Action.Flow.Extract () -- ExtractNgramsT instances
 import Gargantext.Database.Action.Flow.List ( flowList_DbRepo, toNodeNgramsW' )
 import Gargantext.Database.Action.Flow.Types ( do_api, DataOrigin(..), DataText(..), FlowCorpus )
-import Gargantext.Database.Action.Flow.Utils (docNgrams, documentIdWithNgrams, insertDocNgrams, insertDocs, mapNodeIdNgrams)
+import Gargantext.Database.Action.Flow.Utils (documentIdWithNgrams, insertDocNgrams, insertDocs, mapNodeIdNgrams, ngramsByDoc)
 import Gargantext.Database.Action.Metrics (updateNgramsOccurrences, updateContextScore)
 import Gargantext.Database.Action.Search (searchDocInDatabase)
 import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact )
@@ -105,8 +104,7 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
 import Gargantext.Database.Query.Table.NodeContext (selectDocNodesOnlyId)
 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
 import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser)
-import Gargantext.Database.Schema.Context (context_oid_id)
-import Gargantext.Database.Schema.Ngrams ( indexNgrams, text2ngrams, NgramsId )
+import Gargantext.Database.Schema.Ngrams ( indexNgrams, NgramsId )
 import Gargantext.Database.Schema.Node (node_hyperdata)
 import Gargantext.Prelude hiding (to)
 import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG), MonadLogger )
@@ -222,7 +220,6 @@ flowCorpusFile :: ( IsDBCmd env err m
                   , MonadJobStatus m
                   , HasCentralExchangeNotification env )
                => MkCorpusUser
-               -> Limit -- Limit the number of docs (for dev purpose)
                -> TermType Lang
                -> FileType
                -> FileFormat
@@ -230,7 +227,7 @@ flowCorpusFile :: ( IsDBCmd env err m
                -> Maybe FlowSocialListWith
                -> JobHandle m
                -> m CorpusId
-flowCorpusFile mkCorpusUser _l la ft ff fp mfslw jobHandle = do
+flowCorpusFile mkCorpusUser la ft ff fp mfslw jobHandle = do
   eParsed <- liftBase $ parseFile ft ff fp
   case eParsed of
     Right parsed -> do
@@ -496,34 +493,21 @@ reIndexWith :: ( HasNodeStory env err m )
 reIndexWith cId lId nt lts = do
   -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
   corpus_node <- getNodeWith cId (Proxy @HyperdataCorpus)
+  -- NOTE: This assumes a single language for the whole corpus
   let corpusLang = withDefaultLanguage $ view (node_hyperdata . to _hc_lang) corpus_node

   -- Getting [NgramsTerm]
   (ts :: [NgramsTerm]) <- List.concat
-                      <$> map (\(k,vs) -> k:vs)
+                      <$> map (\(k, vs) -> k:vs) -- this is concatenating parent with their children, 1st level only
                       <$> HashMap.toList
                       <$> getTermsWith identity [lId] nt lts

   -- Get all documents of the corpus
   (docs :: [ContextOnlyId HyperdataDocument]) <- selectDocNodesOnlyId cId

-  let
-    docNgrams' :: [([(MatchedText, TermsCount)], NodeId)]
-    docNgrams' = map (\doc -> (docNgrams corpusLang ts doc, doc ^. context_oid_id)) docs
-
-    withExtractedNgrams :: [[(ExtractedNgrams, Map NgramsType (Map NodeId (Int, TermsCount)))]]
-    withExtractedNgrams =
-      map (\(matched, nId) ->
-             map (\(matchedText, cnt) ->
-                    ( SimpleNgrams (text2ngrams matchedText)
-                    , Map.singleton nt $ Map.singleton nId (1, cnt) ) ) matched)
-          $ docNgrams'
-
-    -- TODO Is this weight always equal to 1?
-    ngramsByDoc :: [HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))]
-    ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
-                      $ withExtractedNgrams
+  let ngramsByDoc' = ngramsByDoc corpusLang nt ts docs

   -- Saving the indexation in database
-  mapM_ (saveDocNgramsWith lId) ngramsByDoc
+  mapM_ (saveDocNgramsWith lId) ngramsByDoc'

   pure ()
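A small illustration of the comment added next to `map (\(k, vs) -> k:vs)`: per that comment, getTermsWith yields a map from a parent term to its direct children, and the pipeline flattens it one level deep before re-indexing. With plain strings standing in for NgramsTerm values (hypothetical data, not part of the patch):

    >>> concatMap (\(k, vs) -> k : vs) [("cat", ["cats", "kitty"]), ("dog", [])]
    ["cat","cats","kitty","dog"]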
@@ -14,7 +14,8 @@ module Gargantext.Database.Action.Flow.Utils
   , documentIdWithNgrams
   , insertDocNgrams
   , insertDocs
-  , mapNodeIdNgrams )
+  , mapNodeIdNgrams
+  , ngramsByDoc )
   where

 import Data.HashMap.Strict (HashMap)
@@ -25,6 +26,7 @@ import Gargantext.API.Ngrams.Types qualified as NT
 import Gargantext.Core (Lang, toDBid)
 import Gargantext.Core.Flow.Types (UniqId, uniqId)
 import Gargantext.Core.Text.Ngrams ( Ngrams, NgramsType )
+import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
 import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatternsWith, termsInText)
 import Gargantext.Core.Types (TermsCount)
 import Gargantext.Core.Utils (addTuples)
@@ -37,8 +39,8 @@ import Gargantext.Database.Query.Table.ContextNodeNgrams ( ContextNodeNgramsPoly
 import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
 import Gargantext.Database.Query.Table.Node.Document.Insert (ReturnId, addUniqId, insertDb, reId, reInserted, reUniqId)
 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
-import Gargantext.Database.Schema.Context (context_oid_hyperdata)
-import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTypeId(..))
+import Gargantext.Database.Schema.Context (context_oid_hyperdata, context_oid_id)
+import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTypeId(..), text2ngrams)
 import Gargantext.Database.Types ( Indexed(..), index )
 import Gargantext.Prelude
 import Gargantext.Prelude.Crypto.Hash (Hash)
@@ -182,3 +184,35 @@ toInserted =
 --   docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
 --   _ <- mapM extractInsert (splitEvery 1000 docs)
 --   pure ()
+
+
+ngramsByDoc :: Lang
+            -> NgramsType
+            -> [NT.NgramsTerm]
+            -> [ContextOnlyId HyperdataDocument]
+            -> [HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (Int, TermsCount)))]
+ngramsByDoc l nt ts docs =
+  ngramsByDoc' l nt ts <$> docs
+
+-- | Given list of terms and a document, produce a map for this doc's
+--   terms count and weights
+ngramsByDoc' :: Lang
+             -> NgramsType
+             -> [NT.NgramsTerm]
+             -> ContextOnlyId HyperdataDocument
+             -> HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (Int, TermsCount)))
+ngramsByDoc' l nt ts doc =
+  HashMap.fromListWith (DM.unionWith (DM.unionWith (\(_a,b) (_a',b') -> (1,b+b')))) withExtractedNgrams
+  where
+    docNgrams' :: ([(MatchedText, TermsCount)], NodeId)
+    docNgrams' = (docNgrams l ts doc, doc ^. context_oid_id)
+
+    (matched, nId) = docNgrams'
+
+    withExtractedNgrams :: [(ExtractedNgrams, Map NgramsType (Map NodeId (Int, TermsCount)))]
+    withExtractedNgrams =
+      map (\(matchedText, cnt) ->
+             ( SimpleNgrams (text2ngrams matchedText)
+             , DM.singleton nt $ DM.singleton nId (1, cnt) ) ) matched
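A usage sketch of the new helper, mirroring testNgramsByDoc01 added below (GHCi-style; assumes OverloadedStrings so the string literals are read as NgramsTerm, and the result rendering is abbreviated):

    >>> let hd  = emptyHyperdataDocument { _hd_title = Just "hello world, kaboom", _hd_abstract = Nothing }
    >>> let ctx = ContextOnlyId 1 hd
    >>> ngramsByDoc EN NgramsTerms ["hello", "world"] [ctx]
    -- one HashMap per document; here a single map, roughly:
    --   SimpleNgrams "hello" -> Map.singleton NgramsTerms (Map.singleton 1 (1, 1))
    --   SimpleNgrams "world" -> Map.singleton NgramsTerms (Map.singleton 1 (1, 1))

Within one document, duplicates of the same ngram are merged by the fromListWith/unionWith chain: the weight stays at 1 while the TermsCounts are summed (see the `\(_a,b) (_a',b') -> (1,b+b')` combiner).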
@@ -4,11 +4,17 @@
 module Test.Ngrams.Count (tests) where

+import Data.HashMap.Strict qualified as HashMap
+import Data.Map.Strict qualified as Map
 import Gargantext.API.Ngrams
 import Gargantext.Core (Lang(..))
-import Gargantext.Core.Text.Terms.WithList (buildPatternsWith, termsInText, Pattern(..))
-import Gargantext.Database.Action.Flow.Utils (docNgrams)
+import Gargantext.Core.Text.Context (TermList)
+import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms), Ngrams(..))
+import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
+import Gargantext.Core.Text.Terms.WithList (buildPatterns, buildPatternsWith, extractTermsWithList', termsInText, Pattern(..))
+import Gargantext.Database.Action.Flow.Utils (docNgrams, ngramsByDoc)
 import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..), emptyHyperdataDocument )
+import Gargantext.Database.Admin.Types.Node (NodeId(..))
 import Gargantext.Database.Schema.Context ( ContextPolyOnlyId(..) )
 import Gargantext.Prelude
 import Test.Tasty
@@ -29,7 +35,12 @@ unitTests = testGroup "Count tests"
   , testCase "termsInText works 03" testTermsInText03
   , testCase "termsInText works 04 (related to issue #221)" testTermsInText04
+  , testCase "extractTermsWithList' works 01" testExtractTermsWithList'01
   , testCase "docNgrams works 01" testDocNgrams01
+  , testCase "docNgrams works 02" testDocNgrams02
+  , testCase "ngramsByDoc works 01" testNgramsByDoc01
   ]

 -- | Let's document how the `buildPatternsWith` function works.
@@ -93,11 +104,16 @@ testTermsInText04 :: Assertion
 testTermsInText04 = do
   let terms = ["feuilles de basilic"] :: [NgramsTerm]
   let frPatterns = buildPatternsWith FR terms
-  let tit = termsInText FR frPatterns "Infos pratiques Nombre de personnes 1 personne Quantité1 verre Temps de préparation 5 minutes Degré de difficulté Très facile Coût Abordable Les ingrédients de la recette 4 feuilles de basilic 1 branche de romarin 15 ml de citron jaune 60 ml d'eau gazeuse au mastiqua 90 ml de Bulles de Muscat Jaillance La préparation de la recette Verser dans un verre type long drink le citron jaune, les feuilles de basilic et l'eau gazeuse."
+  let tit = termsInText FR frPatterns "Infos pratiques Nombre de personnes 1 personne Quantité 1 verre Temps de préparation 5 minutes Degré de difficulté Très facile Coût Abordable Les ingrédients de la recette 4 feuilles de basilic 1 branche de romarin 15 ml de citron jaune 60 ml d'eau gazeuse au mastiqua 90 ml de Bulles de Muscat Jaillance La préparation de la recette Verser dans un verre type long drink le citron jaune, les feuilles de basilic et l'eau gazeuse."
   length tit @?= 1
   let [tit1] = tit
   tit1 @?= ("feuilles de basilic", 2)

+testExtractTermsWithList'01 :: Assertion
+testExtractTermsWithList'01 = do
+  let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
+  extractTermsWithList' (buildPatterns termList) "Le chat blanc" @?= ["chat blanc"]
+
 testDocNgrams01 :: Assertion
 testDocNgrams01 = do
   let terms = ["hello", "world"] :: [NgramsTerm]
@@ -106,3 +122,38 @@ testDocNgrams01 = do
   let ctx = ContextOnlyId 1 hd
   let dNgrams = docNgrams EN terms ctx
   length dNgrams @?= 2
+
+testDocNgrams02 :: Assertion
+testDocNgrams02 = do
+  let terms = ["hello", "world"] :: [NgramsTerm]
+  let hd = emptyHyperdataDocument { _hd_title = Just "hello world, kaboom"
+                                  , _hd_abstract = Nothing }
+  let ctx = ContextOnlyId 1 hd
+  let dNgrams = docNgrams EN terms ctx
+  length dNgrams @?= 2
+
+testNgramsByDoc01 :: Assertion
+testNgramsByDoc01 = do
+  let terms = ["hello", "world"] :: [NgramsTerm]
+  let hd1 = emptyHyperdataDocument { _hd_title = Just "hello world, kaboom"
+                                   , _hd_abstract = Nothing }
+  let ctx1 = ContextOnlyId 1 hd1
+  let hd2 = emptyHyperdataDocument { _hd_title = Just "world, boom"
+                                   , _hd_abstract = Nothing }
+  let ctx2 = ContextOnlyId 2 hd2
+
+  ngramsByDoc EN NgramsTerms terms [ctx1, ctx2] @?=
+    [ HashMap.fromList
+        [ ( SimpleNgrams $ UnsafeNgrams { _ngramsTerms = "hello", _ngramsSize = 1 }
+          , Map.singleton NgramsTerms $ Map.singleton (UnsafeMkNodeId 1) (1, 1) )
+        , ( SimpleNgrams $ UnsafeNgrams { _ngramsTerms = "world", _ngramsSize = 1 }
+          , Map.singleton NgramsTerms $ Map.singleton (UnsafeMkNodeId 1) (1, 1) )
+        ]
+    , HashMap.fromList
+        [ ( SimpleNgrams $ UnsafeNgrams { _ngramsTerms = "world", _ngramsSize = 1 }
+          , Map.singleton NgramsTerms $ Map.singleton (UnsafeMkNodeId 2) (1, 1) )
+        ]
+    ]
+
+  ngramsByDoc EN NgramsTerms terms [ctx1, ctx2] @?=
+    (ngramsByDoc EN NgramsTerms terms [ctx1]) <> (ngramsByDoc EN NgramsTerms terms [ctx2])
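The second assertion is really a small algebraic property: since ngramsByDoc l nt ts docs = ngramsByDoc' l nt ts <$> docs, the result distributes over any split of the document list. Stated as a property-style sketch (hypothetical helper; QuickCheck is not wired up in this test module):

    -- Sketch: mapping per document and concatenating afterwards gives the same
    -- result for any split of the document list (lang, type and terms fixed).
    prop_ngramsByDoc_append :: [ContextOnlyId HyperdataDocument]
                            -> [ContextOnlyId HyperdataDocument]
                            -> Bool
    prop_ngramsByDoc_append ds1 ds2 =
      ngramsByDoc EN NgramsTerms terms (ds1 <> ds2)
        == ngramsByDoc EN NgramsTerms terms ds1 <> ngramsByDoc EN NgramsTerms terms ds2
      where
        terms = ["hello", "world"] :: [NgramsTerm]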