[ngrams] fix postag_algo (#169)

We had CoreNLP hardcoded, however with Spacy this resulted in
selectLems returning no entries, because the algo_id from DB didn't
match. Hence no singulars grouping.
parent 5b22c72b
...@@ -100,9 +100,9 @@ library ...@@ -100,9 +100,9 @@ library
Gargantext.API Gargantext.API
Gargantext.API.Admin.Auth.Types Gargantext.API.Admin.Auth.Types
Gargantext.API.Admin.EnvTypes Gargantext.API.Admin.EnvTypes
Gargantext.API.Admin.Orchestrator.Types
Gargantext.API.Admin.Settings Gargantext.API.Admin.Settings
Gargantext.API.Admin.Settings.CORS Gargantext.API.Admin.Settings.CORS
Gargantext.API.Admin.Orchestrator.Types
Gargantext.API.Admin.Types Gargantext.API.Admin.Types
Gargantext.API.Auth.PolicyCheck Gargantext.API.Auth.PolicyCheck
Gargantext.API.Dev Gargantext.API.Dev
...@@ -141,13 +141,15 @@ library ...@@ -141,13 +141,15 @@ library
Gargantext.Core.Text.Corpus.API Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API.Arxiv Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.EPO Gargantext.Core.Text.Corpus.API.EPO
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.API.OpenAlex Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.Query Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Parsers Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV Gargantext.Core.Text.Corpus.Parsers.CSV
Gargantext.Core.Text.Corpus.Parsers.Date Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.List
Gargantext.Core.Text.List.Group.WithStem
Gargantext.Core.Text.List.Formats.CSV Gargantext.Core.Text.List.Formats.CSV
Gargantext.Core.Text.Metrics Gargantext.Core.Text.Metrics
Gargantext.Core.Text.Metrics.CharByChar Gargantext.Core.Text.Metrics.CharByChar
...@@ -171,8 +173,8 @@ library ...@@ -171,8 +173,8 @@ library
Gargantext.Core.Types Gargantext.Core.Types
Gargantext.Core.Types.Individu Gargantext.Core.Types.Individu
Gargantext.Core.Types.Main Gargantext.Core.Types.Main
Gargantext.Core.Types.Query
Gargantext.Core.Types.Phylo Gargantext.Core.Types.Phylo
Gargantext.Core.Types.Query
Gargantext.Core.Utils Gargantext.Core.Utils
Gargantext.Core.Utils.Prefix Gargantext.Core.Utils.Prefix
Gargantext.Core.Viz.Graph Gargantext.Core.Viz.Graph
...@@ -190,6 +192,7 @@ library ...@@ -190,6 +192,7 @@ library
Gargantext.Core.Viz.Types Gargantext.Core.Viz.Types
Gargantext.Database.Action.Flow Gargantext.Database.Action.Flow
Gargantext.Database.Action.Flow.Types Gargantext.Database.Action.Flow.Types
Gargantext.Database.Action.Metrics.TFICF
Gargantext.Database.Action.Search Gargantext.Database.Action.Search
Gargantext.Database.Action.User Gargantext.Database.Action.User
Gargantext.Database.Action.User.New Gargantext.Database.Action.User.New
...@@ -206,8 +209,8 @@ library ...@@ -206,8 +209,8 @@ library
Gargantext.Database.Query.Table.Node Gargantext.Database.Query.Table.Node
Gargantext.Database.Query.Table.Node.Error Gargantext.Database.Query.Table.Node.Error
Gargantext.Database.Query.Table.Node.UpdateOpaleye Gargantext.Database.Query.Table.Node.UpdateOpaleye
Gargantext.Database.Query.Tree.Root
Gargantext.Database.Query.Table.User Gargantext.Database.Query.Table.User
Gargantext.Database.Query.Tree.Root
Gargantext.Database.Schema.Ngrams Gargantext.Database.Schema.Ngrams
Gargantext.Database.Schema.Node Gargantext.Database.Schema.Node
Gargantext.Database.Schema.User Gargantext.Database.Schema.User
...@@ -310,11 +313,9 @@ library ...@@ -310,11 +313,9 @@ library
Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
Gargantext.Core.Text.Corpus.Parsers.Wikimedia Gargantext.Core.Text.Corpus.Parsers.Wikimedia
Gargantext.Core.Text.Learn Gargantext.Core.Text.Learn
Gargantext.Core.Text.List
Gargantext.Core.Text.List.Group Gargantext.Core.Text.List.Group
Gargantext.Core.Text.List.Group.Prelude Gargantext.Core.Text.List.Group.Prelude
Gargantext.Core.Text.List.Group.WithScores Gargantext.Core.Text.List.Group.WithScores
Gargantext.Core.Text.List.Group.WithStem
Gargantext.Core.Text.List.Learn Gargantext.Core.Text.List.Learn
Gargantext.Core.Text.List.Merge Gargantext.Core.Text.List.Merge
Gargantext.Core.Text.List.Social Gargantext.Core.Text.List.Social
...@@ -366,7 +367,6 @@ library ...@@ -366,7 +367,6 @@ library
Gargantext.Database.Action.Metrics Gargantext.Database.Action.Metrics
Gargantext.Database.Action.Metrics.Lists Gargantext.Database.Action.Metrics.Lists
Gargantext.Database.Action.Metrics.NgramsByContext Gargantext.Database.Action.Metrics.NgramsByContext
Gargantext.Database.Action.Metrics.TFICF
Gargantext.Database.Action.Node Gargantext.Database.Action.Node
Gargantext.Database.Action.Share Gargantext.Database.Action.Share
Gargantext.Database.Action.TSQuery Gargantext.Database.Action.TSQuery
......
...@@ -139,8 +139,8 @@ getGroupParams :: ( HasNodeError err ...@@ -139,8 +139,8 @@ getGroupParams :: ( HasNodeError err
, HasTreeError err , HasTreeError err
) )
=> GroupParams -> HashSet Ngrams -> DBCmd err GroupParams => GroupParams -> HashSet Ngrams -> DBCmd err GroupParams
getGroupParams gp@(GroupWithPosTag l nsc _m) ng = do getGroupParams gp@(GroupWithPosTag { .. }) ng = do
!hashMap <- HashMap.fromList <$> selectLems l nsc (HashSet.toList ng) !hashMap <- HashMap.fromList <$> selectLems _gwl_lang _gwl_nlp_config (HashSet.toList ng)
-- printDebug "hashMap" hashMap -- printDebug "hashMap" hashMap
pure $ over gwl_map (<> hashMap) gp pure $ over gwl_map (<> hashMap) gp
getGroupParams gp _ = pure gp getGroupParams gp _ = pure gp
...@@ -174,7 +174,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi ...@@ -174,7 +174,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
!(socialLists :: FlowCont NgramsTerm FlowListScores) !(socialLists :: FlowCont NgramsTerm FlowListScores)
<- flowSocialList mfslw user nt ( FlowCont HashMap.empty <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
$ HashMap.fromList $ HashMap.fromList
$ List.zip (HashMap.keys allTerms) $ List.zip (HashMap.keys allTerms)
(repeat mempty) (repeat mempty)
) )
-- printDebug "[buildNgramsTermsList: Flow Social List / end]" nt -- printDebug "[buildNgramsTermsList: Flow Social List / end]" nt
...@@ -188,8 +188,6 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi ...@@ -188,8 +188,6 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
!groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys) !groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
-- printDebug "[buildNgramsTermsList: groupParams']" ("" :: Text)
let let
!socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists !socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
!groupedWithList = toGroupedTree socialLists_Stemmed allTerms !groupedWithList = toGroupedTree socialLists_Stemmed allTerms
......
...@@ -44,7 +44,7 @@ addScoreStem groupParams ngrams fl = foldl' addScorePatch fl ...@@ -44,7 +44,7 @@ addScoreStem groupParams ngrams fl = foldl' addScorePatch fl
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Main Types -- | Main Types
newtype StopSize = StopSize {unStopSize :: Int} newtype StopSize = StopSize {unStopSize :: Int}
deriving (Eq) deriving (Eq, Show)
-- | TODO: group with 2 terms only can be -- | TODO: group with 2 terms only can be
-- discussed. Main purpose of this is offering -- discussed. Main purpose of this is offering
...@@ -61,7 +61,7 @@ data GroupParams = GroupParams { unGroupParams_lang :: !Lang ...@@ -61,7 +61,7 @@ data GroupParams = GroupParams { unGroupParams_lang :: !Lang
, _gwl_nlp_config :: !NLPServerConfig , _gwl_nlp_config :: !NLPServerConfig
, _gwl_map :: !(HashMap Form Lem) , _gwl_map :: !(HashMap Form Lem)
} }
deriving (Eq) deriving (Eq, Show)
------------------------------------------------------------------------ ------------------------------------------------------------------------
groupWith :: GroupParams groupWith :: GroupParams
......
...@@ -23,7 +23,7 @@ import Data.Swagger ( ToSchema(..), genericDeclareNamedSchema, defaultSchemaOpti ...@@ -23,7 +23,7 @@ import Data.Swagger ( ToSchema(..), genericDeclareNamedSchema, defaultSchemaOpti
import Data.Text qualified as T import Data.Text qualified as T
import Data.Vector qualified as V import Data.Vector qualified as V
import Gargantext.API.Ngrams.Types (NgramsTerm, NgramsPatch) import Gargantext.API.Ngrams.Types (NgramsTerm, NgramsPatch)
import Gargantext.Core.NodeStory (getNodesArchiveHistory) import Gargantext.Core.NodeStory.DB ( getNodesArchiveHistory )
import Gargantext.Core.Text.List.Social.Find (findListsId) import Gargantext.Core.Text.List.Social.Find (findListsId)
import Gargantext.Core.Text.List.Social.Patch (addScorePatches) import Gargantext.Core.Text.List.Social.Patch (addScorePatches)
import Gargantext.Core.Text.List.Social.Prelude (FlowCont, FlowListScores) import Gargantext.Core.Text.List.Social.Prelude (FlowCont, FlowListScores)
......
...@@ -125,7 +125,11 @@ class ExtractNgramsT h ...@@ -125,7 +125,11 @@ class ExtractNgramsT h
------------------------------------------------------------------------ ------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms l pa po (Terms ng1 ng2) = enrichedTerms l pa po (Terms ng1 ng2) =
NgramsPostag l pa po form lem NgramsPostag { _np_lang = l
, _np_algo = pa
, _np_postag = po
, _np_form = form
, _np_lem = lem }
where where
form = text2ngrams $ Text.intercalate " " ng1 form = text2ngrams $ Text.intercalate " " ng1
lem = text2ngrams $ Text.intercalate " " $ Set.toList ng2 lem = text2ngrams $ Text.intercalate " " $ Set.toList ng2
...@@ -137,7 +141,7 @@ cleanNgrams s ng ...@@ -137,7 +141,7 @@ cleanNgrams s ng
| otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms)) | otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms))
cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
cleanExtractedNgrams s (SimpleNgrams ng) = SimpleNgrams $ (cleanNgrams s) ng cleanExtractedNgrams s (SimpleNgrams ng) = SimpleNgrams $ cleanNgrams s ng
cleanExtractedNgrams s (EnrichedNgrams ng) = EnrichedNgrams $ over np_form (cleanNgrams s) cleanExtractedNgrams s (EnrichedNgrams ng) = EnrichedNgrams $ over np_form (cleanNgrams s)
$ over np_lem (cleanNgrams s) ng $ over np_lem (cleanNgrams s) ng
...@@ -155,8 +159,7 @@ insertExtractedNgrams ngs = do ...@@ -155,8 +159,7 @@ insertExtractedNgrams ngs = do
m2 <- insertNgramsPostag (map unEnrichedNgrams e) m2 <- insertNgramsPostag (map unEnrichedNgrams e)
--printDebug "terms" m2 --printDebug "terms" m2
let result = HashMap.union m1 m2 pure $ HashMap.union m1 m2
pure result
isSimpleNgrams :: ExtractedNgrams -> Bool isSimpleNgrams :: ExtractedNgrams -> Bool
isSimpleNgrams (SimpleNgrams _) = True isSimpleNgrams (SimpleNgrams _) = True
...@@ -188,10 +191,10 @@ type MinNgramSize = Int ...@@ -188,10 +191,10 @@ type MinNgramSize = Int
termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount] termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panicTrace "[termsUnsupervised] no model" termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panicTrace "[termsUnsupervised] no model"
termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) = termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) =
map (\(t, cnt) -> (text2term _tt_lang t, cnt)) map (first (text2term _tt_lang))
. groupWithCounts . groupWithCounts
-- . List.nub -- . List.nub
. (List.filter (\l' -> List.length l' >= _tt_windowSize)) . List.filter (\l' -> List.length l' >= _tt_windowSize)
. List.concat . List.concat
. mainEleveWith _tt_model _tt_ngramsSize . mainEleveWith _tt_model _tt_ngramsSize
. uniText . uniText
...@@ -199,19 +202,18 @@ termsUnsupervised _ = undefined ...@@ -199,19 +202,18 @@ termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token () newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t) newTries n t = buildTries n (toToken <$> uniText t)
-- | TODO removing long terms > 24 -- | TODO removing long terms > 24
uniText :: Text -> [[Text]] uniText :: Text -> [[Text]]
uniText = map (List.filter (not . isPunctuation)) uniText = map (List.filter (not . isPunctuation) . tokenize)
. map tokenize . sentences -- TODO get sentences according to lang
. sentences -- TODO get sentences according to lang . Text.toLower
. Text.toLower
text2term :: Lang -> [Text] -> Terms text2term :: Lang -> [Text] -> Terms
text2term _ [] = Terms [] Set.empty text2term _ [] = Terms [] Set.empty
text2term lang txt = Terms txt (Set.fromList $ map (stem lang PorterAlgorithm) txt) text2term lang txt = Terms txt (Set.fromList $ map (stem lang PorterAlgorithm) txt)
isPunctuation :: Text -> Bool isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure) isPunctuation x = List.elem x $ Text.pack . pure
<$> ("!?(),;.:" :: String) <$> ("!?(),;.:" :: String)
...@@ -367,6 +367,7 @@ flowCorpusUser l user userCorpusId listId ctype mfslw = do ...@@ -367,6 +367,7 @@ flowCorpusUser l user userCorpusId listId ctype mfslw = do
pure userCorpusId pure userCorpusId
-- | This function is responsible for contructing terms.
buildSocialList :: ( HasNodeError err buildSocialList :: ( HasNodeError err
, HasValidationError err , HasValidationError err
, HasNLPServer env , HasNLPServer env
...@@ -430,7 +431,7 @@ insertMasterDocs ncs c lang hs = do ...@@ -430,7 +431,7 @@ insertMasterDocs ncs c lang hs = do
(extractNgramsT ncs $ withLang lang documentsWithId) (extractNgramsT ncs $ withLang lang documentsWithId)
(map (B.first contextId2NodeId) documentsWithId) (map (B.first contextId2NodeId) documentsWithId)
lId <- getOrMkList masterCorpusId masterUserId lId <- getOrMkList masterCorpusId masterUserId
-- _ <- saveDocNgramsWith lId mapNgramsDocs' -- _ <- saveDocNgramsWith lId mapNgramsDocs'
_ <- saveDocNgramsWith lId mapNgramsDocs' _ <- saveDocNgramsWith lId mapNgramsDocs'
...@@ -455,8 +456,8 @@ saveDocNgramsWith lId mapNgramsDocs' = do ...@@ -455,8 +456,8 @@ saveDocNgramsWith lId mapNgramsDocs' = do
--printDebug "saveDocNgramsWith" mapCgramsId --printDebug "saveDocNgramsWith" mapCgramsId
-- insertDocNgrams -- insertDocNgrams
let ngrams2insert = catMaybes [ ContextNodeNgrams2 <$> Just (nodeId2ContextId nId) let ngrams2insert = catMaybes [ ContextNodeNgrams2 (nodeId2ContextId nId)
<*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'') <$> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
<*> Just (fromIntegral w :: Double) <*> Just (fromIntegral w :: Double)
| (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
......
...@@ -20,7 +20,7 @@ module Gargantext.Database.Action.Flow.Extract ...@@ -20,7 +20,7 @@ module Gargantext.Database.Action.Flow.Extract
import Control.Lens ((^.), _Just, view) import Control.Lens ((^.), _Just, view)
import Data.HashMap.Strict qualified as HashMap import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as DM import Data.Map.Strict qualified as DM
import Gargantext.Core (Lang, NLPServerConfig, PosTagAlgo(CoreNLP)) import Gargantext.Core (Lang, NLPServerConfig(server))
import Gargantext.Core.Text (HasText(..)) import Gargantext.Core.Text (HasText(..))
import Gargantext.Core.Text.Corpus.Parsers (splitOn) import Gargantext.Core.Text.Corpus.Parsers (splitOn)
import Gargantext.Core.Text.Ngrams (NgramsType(..)) import Gargantext.Core.Text.Ngrams (NgramsType(..))
...@@ -77,7 +77,7 @@ instance ExtractNgramsT HyperdataDocument ...@@ -77,7 +77,7 @@ instance ExtractNgramsT HyperdataDocument
$ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd)) $ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
$ doc ^. hd_authors $ doc ^. hd_authors
termsWithCounts' <- map (first (enrichedTerms (lang ^. tt_lang) CoreNLP NP)) . concat <$> termsWithCounts' <- map (first (enrichedTerms (lang ^. tt_lang) (server ncs) NP)) . concat <$>
liftBase (extractTerms ncs lang $ hasText doc) liftBase (extractTerms ncs lang $ hasText doc)
pure $ HashMap.fromList pure $ HashMap.fromList
......
...@@ -88,7 +88,7 @@ insertNgramsPostag' :: [NgramsPostagInsert] -> DBCmd err [Indexed Text Int] ...@@ -88,7 +88,7 @@ insertNgramsPostag' :: [NgramsPostagInsert] -> DBCmd err [Indexed Text Int]
insertNgramsPostag' ns = runPGSQuery queryInsertNgramsPostag (PGS.Only $ Values fields ns) insertNgramsPostag' ns = runPGSQuery queryInsertNgramsPostag (PGS.Only $ Values fields ns)
where where
fields = map (\t -> QualifiedIdentifier Nothing t) $ snd fields_name fields = map (QualifiedIdentifier Nothing) $ snd fields_name
fields_name :: ( [Text], [Text]) fields_name :: ( [Text], [Text])
fields_name = ( ["lang_id", "algo_id", "postag", "form", "form_n", "lem" , "lem_n"] fields_name = ( ["lang_id", "algo_id", "postag", "form", "form_n", "lem" , "lem_n"]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment