[ngrams] fix postag_algo (#169)

We had CoreNLP hardcoded, however with Spacy this resulted in selectLems returning no entries, because the algo_id from DB didn't match. Hence no singulars grouping.

[ngrams] fix postag_algo (#169)
We had CoreNLP hardcoded, however with Spacy this resulted in selectLems returning no entries, because the algo_id from DB didn't match. Hence no singulars grouping.
445ba8f1 · Przemyslaw Kaminski · 5b22c72b · 445ba8f1 · 445ba8f1 · 445ba8f1
Verified Commit 445ba8f1 authored Mar 22, 2024 by Przemyslaw Kaminski
8 changed files
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -100,9 +100,9 @@ library
      Gargantext.API
      Gargantext.API.Admin.Auth.Types
      Gargantext.API.Admin.EnvTypes
+      Gargantext.API.Admin.Orchestrator.Types
      Gargantext.API.Admin.Settings
      Gargantext.API.Admin.Settings.CORS
-      Gargantext.API.Admin.Orchestrator.Types
      Gargantext.API.Admin.Types
      Gargantext.API.Auth.PolicyCheck
      Gargantext.API.Dev
@@ -141,13 +141,15 @@ library
      Gargantext.Core.Text.Corpus.API
      Gargantext.Core.Text.Corpus.API.Arxiv
      Gargantext.Core.Text.Corpus.API.EPO
-      Gargantext.Core.Text.Corpus.API.Pubmed
      Gargantext.Core.Text.Corpus.API.OpenAlex
-      Gargantext.Core.Text.Corpus.Query
+      Gargantext.Core.Text.Corpus.API.Pubmed
      Gargantext.Core.Text.Corpus.Parsers
      Gargantext.Core.Text.Corpus.Parsers.CSV
      Gargantext.Core.Text.Corpus.Parsers.Date
      Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
+      Gargantext.Core.Text.Corpus.Query
+      Gargantext.Core.Text.List
+      Gargantext.Core.Text.List.Group.WithStem
      Gargantext.Core.Text.List.Formats.CSV
      Gargantext.Core.Text.Metrics
      Gargantext.Core.Text.Metrics.CharByChar
@@ -171,8 +173,8 @@ library
      Gargantext.Core.Types
      Gargantext.Core.Types.Individu
      Gargantext.Core.Types.Main
-      Gargantext.Core.Types.Query
      Gargantext.Core.Types.Phylo
+      Gargantext.Core.Types.Query
      Gargantext.Core.Utils
      Gargantext.Core.Utils.Prefix
      Gargantext.Core.Viz.Graph
@@ -190,6 +192,7 @@ library
      Gargantext.Core.Viz.Types
      Gargantext.Database.Action.Flow
      Gargantext.Database.Action.Flow.Types
+      Gargantext.Database.Action.Metrics.TFICF
      Gargantext.Database.Action.Search
      Gargantext.Database.Action.User
      Gargantext.Database.Action.User.New
@@ -206,8 +209,8 @@ library
      Gargantext.Database.Query.Table.Node
      Gargantext.Database.Query.Table.Node.Error
      Gargantext.Database.Query.Table.Node.UpdateOpaleye
-      Gargantext.Database.Query.Tree.Root
      Gargantext.Database.Query.Table.User
+      Gargantext.Database.Query.Tree.Root
      Gargantext.Database.Schema.Ngrams
      Gargantext.Database.Schema.Node
      Gargantext.Database.Schema.User
@@ -310,11 +313,9 @@ library
      Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
      Gargantext.Core.Text.Corpus.Parsers.Wikimedia
      Gargantext.Core.Text.Learn
-      Gargantext.Core.Text.List
      Gargantext.Core.Text.List.Group
      Gargantext.Core.Text.List.Group.Prelude
      Gargantext.Core.Text.List.Group.WithScores
-      Gargantext.Core.Text.List.Group.WithStem
      Gargantext.Core.Text.List.Learn
      Gargantext.Core.Text.List.Merge
      Gargantext.Core.Text.List.Social
@@ -366,7 +367,6 @@ library
      Gargantext.Database.Action.Metrics
      Gargantext.Database.Action.Metrics.Lists
      Gargantext.Database.Action.Metrics.NgramsByContext
-      Gargantext.Database.Action.Metrics.TFICF
      Gargantext.Database.Action.Node
      Gargantext.Database.Action.Share
      Gargantext.Database.Action.TSQuery

--- a/src/Gargantext/Core/Text/List.hs
+++ b/src/Gargantext/Core/Text/List.hs
@@ -139,8 +139,8 @@ getGroupParams :: ( HasNodeError err
                  , HasTreeError err
                  )
               => GroupParams -> HashSet Ngrams -> DBCmd err GroupParams
-getGroupParams gp@(GroupWithPosTag l nsc _m) ng = do
+getGroupParams gp@(GroupWithPosTag { .. }) ng = do
-  !hashMap <- HashMap.fromList <$> selectLems l nsc (HashSet.toList ng)
+  !hashMap <- HashMap.fromList <$> selectLems _gwl_lang _gwl_nlp_config (HashSet.toList ng)
  -- printDebug "hashMap" hashMap
  pure $ over gwl_map (<> hashMap) gp
 getGroupParams gp _ = pure gp
@@ -174,7 +174,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
  !(socialLists :: FlowCont NgramsTerm FlowListScores)
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                                      $ HashMap.fromList
-                                                      $ List.zip (HashMap.keys   allTerms)
+                                                      $ List.zip (HashMap.keys allTerms)
                                                                 (repeat mempty)
                                    )
  -- printDebug "[buildNgramsTermsList: Flow Social List / end]" nt
@@ -188,8 +188,6 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
  !groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
-  -- printDebug "[buildNgramsTermsList: groupParams']" ("" :: Text)
  let
    !socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
    !groupedWithList = toGroupedTree socialLists_Stemmed allTerms

--- a/src/Gargantext/Core/Text/List/Group/WithStem.hs
+++ b/src/Gargantext/Core/Text/List/Group/WithStem.hs
@@ -44,7 +44,7 @@ addScoreStem groupParams ngrams fl = foldl' addScorePatch fl
 ------------------------------------------------------------------------
 -- | Main Types
 newtype StopSize = StopSize {unStopSize :: Int}
-  deriving (Eq)
+  deriving (Eq, Show)
 -- | TODO: group with 2 terms only can be
 -- discussed. Main purpose of this is offering
@@ -61,7 +61,7 @@ data GroupParams = GroupParams { unGroupParams_lang     :: !Lang
                                   , _gwl_nlp_config :: !NLPServerConfig
                                   , _gwl_map        :: !(HashMap Form Lem)
                                   }
-  deriving (Eq)
+  deriving (Eq, Show)
 ------------------------------------------------------------------------
 groupWith :: GroupParams

--- a/src/Gargantext/Core/Text/List/Social.hs
+++ b/src/Gargantext/Core/Text/List/Social.hs
@@ -23,7 +23,7 @@ import Data.Swagger ( ToSchema(..), genericDeclareNamedSchema, defaultSchemaOpti
 import Data.Text qualified as T
 import Data.Vector qualified as V
 import Gargantext.API.Ngrams.Types (NgramsTerm, NgramsPatch)
-import Gargantext.Core.NodeStory (getNodesArchiveHistory)
+import Gargantext.Core.NodeStory.DB ( getNodesArchiveHistory )
 import Gargantext.Core.Text.List.Social.Find (findListsId)
 import Gargantext.Core.Text.List.Social.Patch (addScorePatches)
 import Gargantext.Core.Text.List.Social.Prelude (FlowCont, FlowListScores)

--- a/src/Gargantext/Core/Text/Terms.hs
+++ b/src/Gargantext/Core/Text/Terms.hs
@@ -125,7 +125,11 @@ class ExtractNgramsT h
 ------------------------------------------------------------------------
 enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
 enrichedTerms l pa po (Terms ng1 ng2) =
-  NgramsPostag l pa po form lem
+  NgramsPostag { _np_lang = l
+               , _np_algo = pa
+               , _np_postag = po
+               , _np_form = form
+               , _np_lem = lem }
    where
      form = text2ngrams $ Text.intercalate " " ng1
      lem  = text2ngrams $ Text.intercalate " " $ Set.toList ng2
@@ -137,7 +141,7 @@ cleanNgrams s ng
      | otherwise                           = text2ngrams (Text.take s (ng ^. ngramsTerms))
 cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
-cleanExtractedNgrams s (SimpleNgrams   ng) = SimpleNgrams $ (cleanNgrams s) ng
+cleanExtractedNgrams s (SimpleNgrams   ng) = SimpleNgrams $ cleanNgrams s ng
 cleanExtractedNgrams s (EnrichedNgrams ng) = EnrichedNgrams $ over np_form (cleanNgrams s)
                                                            $ over np_lem  (cleanNgrams s) ng
@@ -155,8 +159,7 @@ insertExtractedNgrams ngs = do
  m2 <- insertNgramsPostag (map unEnrichedNgrams e)
  --printDebug "terms" m2
-  let result = HashMap.union m1 m2
+  pure $ HashMap.union m1 m2
-  pure result
 isSimpleNgrams :: ExtractedNgrams -> Bool
 isSimpleNgrams (SimpleNgrams _) = True
@@ -188,10 +191,10 @@ type MinNgramSize = Int
 termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
 termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panicTrace "[termsUnsupervised] no model"
 termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) =
-               map (\(t, cnt) -> (text2term _tt_lang t, cnt))
+               map (first (text2term _tt_lang))
             . groupWithCounts
             -- . List.nub
-             . (List.filter (\l' -> List.length l' >= _tt_windowSize))
+             . List.filter (\l' -> List.length l' >= _tt_windowSize)
             . List.concat
             . mainEleveWith _tt_model _tt_ngramsSize
             . uniText
@@ -199,19 +202,18 @@ termsUnsupervised _ = undefined
 newTries :: Int -> Text -> Tries Token ()
-newTries n t = buildTries n (fmap toToken $ uniText t)
+newTries n t = buildTries n (toToken <$> uniText t)
 -- | TODO removing long terms > 24
 uniText :: Text -> [[Text]]
-uniText = map (List.filter (not . isPunctuation))
+uniText = map (List.filter (not . isPunctuation) . tokenize)
-        . map tokenize
+            . sentences       -- TODO get sentences according to lang
-        . sentences       -- TODO get sentences according to lang
+            . Text.toLower
-        . Text.toLower
 text2term :: Lang -> [Text] -> Terms
 text2term _ [] = Terms [] Set.empty
 text2term lang txt = Terms txt (Set.fromList $ map (stem lang PorterAlgorithm) txt)
 isPunctuation :: Text -> Bool
-isPunctuation x = List.elem x $  (Text.pack . pure)
+isPunctuation x = List.elem x $  Text.pack . pure
                             <$> ("!?(),;.:" :: String)
--- a/src/Gargantext/Database/Action/Flow.hs
+++ b/src/Gargantext/Database/Action/Flow.hs
@@ -367,6 +367,7 @@ flowCorpusUser l user userCorpusId listId ctype mfslw = do
  pure userCorpusId
+-- | This function is responsible for contructing terms.
 buildSocialList :: ( HasNodeError err
                   , HasValidationError err
                   , HasNLPServer env
@@ -430,7 +431,7 @@ insertMasterDocs ncs c lang hs  =  do
                      (extractNgramsT ncs $ withLang lang documentsWithId)
                      (map (B.first contextId2NodeId) documentsWithId)
-  lId      <- getOrMkList masterCorpusId masterUserId
+  lId <- getOrMkList masterCorpusId masterUserId
  -- _ <- saveDocNgramsWith lId mapNgramsDocs'
  _ <- saveDocNgramsWith lId mapNgramsDocs'
@@ -455,8 +456,8 @@ saveDocNgramsWith lId mapNgramsDocs' = do
  --printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
-  let ngrams2insert =  catMaybes [ ContextNodeNgrams2 <$> Just (nodeId2ContextId nId)
+  let ngrams2insert =  catMaybes [ ContextNodeNgrams2 (nodeId2ContextId nId)
-                                            <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
+                                            <$> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                            <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes

--- a/src/Gargantext/Database/Action/Flow/Extract.hs
+++ b/src/Gargantext/Database/Action/Flow/Extract.hs
@@ -20,7 +20,7 @@ module Gargantext.Database.Action.Flow.Extract
 import Control.Lens ((^.), _Just, view)
 import Data.HashMap.Strict qualified as HashMap
 import Data.Map.Strict qualified as DM
-import Gargantext.Core (Lang, NLPServerConfig, PosTagAlgo(CoreNLP))
+import Gargantext.Core (Lang, NLPServerConfig(server))
 import Gargantext.Core.Text (HasText(..))
 import Gargantext.Core.Text.Corpus.Parsers (splitOn)
 import Gargantext.Core.Text.Ngrams (NgramsType(..))
@@ -77,7 +77,7 @@ instance ExtractNgramsT HyperdataDocument
                         $ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
                         $ doc ^. hd_authors
-          termsWithCounts' <- map (first (enrichedTerms (lang ^. tt_lang) CoreNLP NP)) . concat <$>
+          termsWithCounts' <- map (first (enrichedTerms (lang ^. tt_lang) (server ncs) NP)) . concat <$>
                                  liftBase (extractTerms ncs lang $ hasText doc)
          pure $ HashMap.fromList

--- a/src/Gargantext/Database/Query/Table/NgramsPostag.hs
+++ b/src/Gargantext/Database/Query/Table/NgramsPostag.hs
@@ -88,7 +88,7 @@ insertNgramsPostag' :: [NgramsPostagInsert] -> DBCmd err [Indexed Text Int]
 insertNgramsPostag' ns = runPGSQuery queryInsertNgramsPostag (PGS.Only $ Values fields ns)
  where
-    fields = map (\t -> QualifiedIdentifier Nothing t) $ snd fields_name
+    fields = map (QualifiedIdentifier Nothing) $ snd fields_name
    fields_name :: ( [Text], [Text])
    fields_name = ( ["lang_id", "algo_id", "postag", "form", "form_n", "lem" , "lem_n"]