[ngrams, tests] understanding ngramsByDoc better

parent 35c2d0b0
Pipeline #7172 failed with stages
in 14 minutes and 48 seconds
......@@ -45,7 +45,7 @@ importCLI (ImportArgs fun user name settingsPath limit corpusPath) = do
format = TsvGargV3
corpus :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
mkCorpusUser = MkCorpusUserNormalCorpusName (UserName $ cs user) (cs name :: Text)
corpus = flowCorpusFile mkCorpusUser limit tt format Plain corpusPath Nothing DevJobHandle
corpus = flowCorpusFile mkCorpusUser tt format Plain corpusPath Nothing DevJobHandle
corpusTsvHal :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
corpusTsvHal = flowCorpusFile mkCorpusUser limit tt TsvHal Plain corpusPath Nothing DevJobHandle
......
......@@ -96,18 +96,14 @@ termsInText lang pats (manipulateText lang -> txt) =
-- | Manipulates the input 'Text' before passing it to 'termsInText'.
-- In particular, if the language is Chinese (ZH), we add spaces.
manipulateText :: Lang -> Text -> Text
manipulateText lang txt = case lang of
ZH -> addSpaces txt
_ -> txt
manipulateText ZH txt = addSpaces txt
manipulateText _ txt = txt
--------------------------------------------------------------------------
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence
-- | Extract terms from a 'Text', flattening each sentence's matches into
-- a single list of matched term strings.
--
-- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
-- >>> extractTermsWithList' (buildPatterns termList) "Le chat blanc"
-- ["chat blanc"]
extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concatMap concat . replaceTerms KeepAll pats)
                           . monoTextsBySentence
......
......@@ -40,6 +40,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
, addDocumentsToHyperCorpus
, reIndexWith
, ngramsByDoc
, getOrMkRoot
, getOrMkRootWithCorpus
......@@ -81,15 +82,13 @@ import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms), Ngrams(_ngramsTerms))
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Text.Terms.WithList (MatchedText)
import Gargantext.Core.Types (HasValidationError, TermsCount)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main ( ListType(MapTerm) )
import Gargantext.Core.Types.Query (Limit)
import Gargantext.Database.Action.Flow.Extract () -- ExtractNgramsT instances
import Gargantext.Database.Action.Flow.List ( flowList_DbRepo, toNodeNgramsW' )
import Gargantext.Database.Action.Flow.Types ( do_api, DataOrigin(..), DataText(..), FlowCorpus )
import Gargantext.Database.Action.Flow.Utils (docNgrams, documentIdWithNgrams, insertDocNgrams, insertDocs, mapNodeIdNgrams)
import Gargantext.Database.Action.Flow.Utils (documentIdWithNgrams, insertDocNgrams, insertDocs, mapNodeIdNgrams, ngramsByDoc)
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences, updateContextScore)
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact )
......@@ -105,8 +104,7 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeContext (selectDocNodesOnlyId)
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser)
import Gargantext.Database.Schema.Context (context_oid_id)
import Gargantext.Database.Schema.Ngrams ( indexNgrams, text2ngrams, NgramsId )
import Gargantext.Database.Schema.Ngrams ( indexNgrams, NgramsId )
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude hiding (to)
import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG), MonadLogger )
......@@ -222,7 +220,6 @@ flowCorpusFile :: ( IsDBCmd env err m
, MonadJobStatus m
, HasCentralExchangeNotification env )
=> MkCorpusUser
-> Limit -- Limit the number of docs (for dev purpose)
-> TermType Lang
-> FileType
-> FileFormat
......@@ -230,7 +227,7 @@ flowCorpusFile :: ( IsDBCmd env err m
-> Maybe FlowSocialListWith
-> JobHandle m
-> m CorpusId
flowCorpusFile mkCorpusUser _l la ft ff fp mfslw jobHandle = do
flowCorpusFile mkCorpusUser la ft ff fp mfslw jobHandle = do
eParsed <- liftBase $ parseFile ft ff fp
case eParsed of
Right parsed -> do
......@@ -496,34 +493,21 @@ reIndexWith :: ( HasNodeStory env err m )
reIndexWith cId lId nt lts = do
-- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
corpus_node <- getNodeWith cId (Proxy @HyperdataCorpus)
-- NOTE: This assumes a single language for the whole corpus
let corpusLang = withDefaultLanguage $ view (node_hyperdata . to _hc_lang) corpus_node
-- Getting [NgramsTerm]
(ts :: [NgramsTerm]) <- List.concat
<$> map (\(k,vs) -> k:vs)
<$> map (\(k, vs) -> k:vs) -- this is concatenating parent with their children, 1st level only
<$> HashMap.toList
<$> getTermsWith identity [lId] nt lts
-- Get all documents of the corpus
(docs :: [ContextOnlyId HyperdataDocument]) <- selectDocNodesOnlyId cId
let
docNgrams' :: [([(MatchedText, TermsCount)], NodeId)]
docNgrams' = map (\doc -> (docNgrams corpusLang ts doc, doc ^. context_oid_id)) docs
withExtractedNgrams :: [[(ExtractedNgrams, Map NgramsType (Map NodeId (Int, TermsCount)))]]
withExtractedNgrams =
map (\(matched, nId) ->
map (\(matchedText, cnt) ->
( SimpleNgrams (text2ngrams matchedText)
, Map.singleton nt $ Map.singleton nId (1, cnt) ) ) matched)
$ docNgrams'
-- TODO Is this weight always equal to 1?
ngramsByDoc :: [HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))]
ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
$ withExtractedNgrams
let ngramsByDoc' = ngramsByDoc corpusLang nt ts docs
-- Saving the indexation in database
mapM_ (saveDocNgramsWith lId) ngramsByDoc
mapM_ (saveDocNgramsWith lId) ngramsByDoc'
pure ()
......@@ -14,7 +14,8 @@ module Gargantext.Database.Action.Flow.Utils
, documentIdWithNgrams
, insertDocNgrams
, insertDocs
, mapNodeIdNgrams )
, mapNodeIdNgrams
, ngramsByDoc )
where
import Data.HashMap.Strict (HashMap)
......@@ -25,6 +26,7 @@ import Gargantext.API.Ngrams.Types qualified as NT
import Gargantext.Core (Lang, toDBid)
import Gargantext.Core.Flow.Types (UniqId, uniqId)
import Gargantext.Core.Text.Ngrams ( Ngrams, NgramsType )
import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatternsWith, termsInText)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (addTuples)
......@@ -37,8 +39,8 @@ import Gargantext.Database.Query.Table.ContextNodeNgrams ( ContextNodeNgramsPoly
import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
import Gargantext.Database.Query.Table.Node.Document.Insert (ReturnId, addUniqId, insertDb, reId, reInserted, reUniqId)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Schema.Context (context_oid_hyperdata)
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTypeId(..))
import Gargantext.Database.Schema.Context (context_oid_hyperdata, context_oid_id)
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTypeId(..), text2ngrams)
import Gargantext.Database.Types ( Indexed(..), index )
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
......@@ -182,3 +184,35 @@ toInserted =
-- docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
-- _ <- mapM extractInsert (splitEvery 1000 docs)
-- pure ()
-- | For every document in the corpus, compute its per-document map of
-- extracted ngrams together with their counts and weights.
ngramsByDoc :: Lang
            -> NgramsType
            -> [NT.NgramsTerm]
            -> [ContextOnlyId HyperdataDocument]
            -> [HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (Int, TermsCount)))]
ngramsByDoc l nt ts = map (ngramsByDoc' l nt ts)
-- | Given a list of terms and a single document, build this document's
-- map from each extracted ngram to its terms count and weight.
ngramsByDoc' :: Lang
             -> NgramsType
             -> [NT.NgramsTerm]
             -> ContextOnlyId HyperdataDocument
             -> HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (Int, TermsCount)))
ngramsByDoc' l nt ts doc = HashMap.fromListWith mergeNgrams keyed
  where
    -- Node id of this document's context.
    nId = doc ^. context_oid_id
    -- Terms matched in this document, with their occurrence counts.
    matches :: [(MatchedText, TermsCount)]
    matches = docNgrams l ts doc
    -- One (ngram, nested count map) pair per matched term.
    keyed :: [(ExtractedNgrams, Map NgramsType (Map NodeId (Int, TermsCount)))]
    keyed = [ ( SimpleNgrams (text2ngrams txt)
              , DM.singleton nt (DM.singleton nId (1, cnt)) )
            | (txt, cnt) <- matches ]
    -- Same ngram seen several times in one doc: sum the terms counts and
    -- keep the weight at 1.  TODO: is this weight always equal to 1?
    mergeNgrams = DM.unionWith (DM.unionWith (\(_w, c) (_w', c') -> (1, c + c')))
......@@ -4,11 +4,17 @@
module Test.Ngrams.Count (tests) where
import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as Map
import Gargantext.API.Ngrams
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Terms.WithList (buildPatternsWith, termsInText, Pattern(..))
import Gargantext.Database.Action.Flow.Utils (docNgrams)
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms), Ngrams(..))
import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
import Gargantext.Core.Text.Terms.WithList (buildPatterns, buildPatternsWith, extractTermsWithList', termsInText, Pattern(..))
import Gargantext.Database.Action.Flow.Utils (docNgrams, ngramsByDoc)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..), emptyHyperdataDocument )
import Gargantext.Database.Admin.Types.Node (NodeId(..))
import Gargantext.Database.Schema.Context ( ContextPolyOnlyId(..) )
import Gargantext.Prelude
import Test.Tasty
......@@ -29,7 +35,12 @@ unitTests = testGroup "Count tests"
, testCase "termsInText works 03" testTermsInText03
, testCase "termsInText works 04 (related to issue #221)" testTermsInText04
, testCase "extractTermsWithList' works 01" testExtractTermsWithList'01
, testCase "docNgrams works 01" testDocNgrams01
, testCase "docNgrams works 02" testDocNgrams02
, testCase "ngramsByDoc works 01" testNgramsByDoc01
]
-- | Let's document how the `buildPatternsWith` function works.
......@@ -93,11 +104,16 @@ testTermsInText04 :: Assertion
testTermsInText04 = do
let terms = ["feuilles de basilic"] :: [NgramsTerm]
let frPatterns = buildPatternsWith FR terms
let tit = termsInText FR frPatterns "Infos pratiques Nombre de personnes 1 personne Quantité1 verre Temps de préparation 5 minutes Degré de difficulté Très facile Coût Abordable Les ingrédients de la recette 4 feuilles de basilic 1 branche de romarin 15 ml de citron jaune 60 ml d'eau gazeuse au mastiqua 90 ml de Bulles de Muscat Jaillance La préparation de la recette Verser dans un verre type long drink le citron jaune, les feuilles de basilic et l'eau gazeuse."
let tit = termsInText FR frPatterns "Infos pratiques Nombre de personnes 1 personne Quantité 1 verre Temps de préparation 5 minutes Degré de difficulté Très facile Coût Abordable Les ingrédients de la recette 4 feuilles de basilic 1 branche de romarin 15 ml de citron jaune 60 ml d'eau gazeuse au mastiqua 90 ml de Bulles de Muscat Jaillance La préparation de la recette Verser dans un verre type long drink le citron jaune, les feuilles de basilic et l'eau gazeuse."
length tit @?= 1
let [tit1] = tit
tit1 @?= ("feuilles de basilic", 2)
-- | A single-term pattern list should extract the matching bigram.
testExtractTermsWithList'01 :: Assertion
testExtractTermsWithList'01 =
  extractTermsWithList' (buildPatterns termList) "Le chat blanc" @?= ["chat blanc"]
  where
    termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
testDocNgrams01 :: Assertion
testDocNgrams01 = do
let terms = ["hello", "world"] :: [NgramsTerm]
......@@ -106,3 +122,38 @@ testDocNgrams01 = do
let ctx = ContextOnlyId 1 hd
let dNgrams = docNgrams EN terms ctx
length dNgrams @?= 2
-- | Both terms occur in the title, so two matches are expected even with
-- trailing noise ("kaboom") in the text.
testDocNgrams02 :: Assertion
testDocNgrams02 = length matched @?= 2
  where
    searchTerms = ["hello", "world"] :: [NgramsTerm]
    doc = emptyHyperdataDocument { _hd_title = Just "hello world, kaboom"
                                 , _hd_abstract = Nothing }
    matched = docNgrams EN searchTerms (ContextOnlyId 1 doc)
-- | 'ngramsByDoc' returns one per-document map per input document, and
-- distributes over the document list (mapping docs independently).
testNgramsByDoc01 :: Assertion
testNgramsByDoc01 = do
  let terms = ["hello", "world"] :: [NgramsTerm]
  -- Doc 1 contains both search terms; doc 2 only "world".
  let hd1 = emptyHyperdataDocument { _hd_title = Just "hello world, kaboom"
                                   , _hd_abstract = Nothing }
  let ctx1 = ContextOnlyId 1 hd1
  let hd2 = emptyHyperdataDocument { _hd_title = Just "world, boom"
                                   , _hd_abstract = Nothing }
  let ctx2 = ContextOnlyId 2 hd2
  -- Expect one HashMap per document, keyed by the extracted ngram, with
  -- a (weight, count) of (1, 1) for each single occurrence.
  ngramsByDoc EN NgramsTerms terms [ctx1, ctx2] @?=
    [ HashMap.fromList
      [ ( SimpleNgrams $ UnsafeNgrams { _ngramsTerms = "hello", _ngramsSize = 1 }
        , Map.singleton NgramsTerms $ Map.singleton (UnsafeMkNodeId 1) (1, 1) )
      , ( SimpleNgrams $ UnsafeNgrams { _ngramsTerms = "world", _ngramsSize = 1 }
        , Map.singleton NgramsTerms $ Map.singleton (UnsafeMkNodeId 1) (1, 1) )
      ]
    , HashMap.fromList
      [ ( SimpleNgrams $ UnsafeNgrams { _ngramsTerms = "world", _ngramsSize = 1 }
        , Map.singleton NgramsTerms $ Map.singleton (UnsafeMkNodeId 2) (1, 1) )
      ]
    ]
  -- Per-doc independence: processing docs together equals processing them
  -- separately and concatenating the results.
  ngramsByDoc EN NgramsTerms terms [ctx1, ctx2] @?=
    (ngramsByDoc EN NgramsTerms terms [ctx1]) <> (ngramsByDoc EN NgramsTerms terms [ctx2])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment