[ngrams] fix postag_algo (#169)

We had CoreNLP hardcoded, however with Spacy this resulted in
selectLems returning no entries, because the algo_id from DB didn't
match. Hence no singulars grouping.
parent 5b22c72b
......@@ -100,9 +100,9 @@ library
Gargantext.API
Gargantext.API.Admin.Auth.Types
Gargantext.API.Admin.EnvTypes
Gargantext.API.Admin.Orchestrator.Types
Gargantext.API.Admin.Settings
Gargantext.API.Admin.Settings.CORS
Gargantext.API.Admin.Orchestrator.Types
Gargantext.API.Admin.Types
Gargantext.API.Auth.PolicyCheck
Gargantext.API.Dev
......@@ -141,13 +141,15 @@ library
Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.EPO
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV
Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.List
Gargantext.Core.Text.List.Group.WithStem
Gargantext.Core.Text.List.Formats.CSV
Gargantext.Core.Text.Metrics
Gargantext.Core.Text.Metrics.CharByChar
......@@ -171,8 +173,8 @@ library
Gargantext.Core.Types
Gargantext.Core.Types.Individu
Gargantext.Core.Types.Main
Gargantext.Core.Types.Query
Gargantext.Core.Types.Phylo
Gargantext.Core.Types.Query
Gargantext.Core.Utils
Gargantext.Core.Utils.Prefix
Gargantext.Core.Viz.Graph
......@@ -190,6 +192,7 @@ library
Gargantext.Core.Viz.Types
Gargantext.Database.Action.Flow
Gargantext.Database.Action.Flow.Types
Gargantext.Database.Action.Metrics.TFICF
Gargantext.Database.Action.Search
Gargantext.Database.Action.User
Gargantext.Database.Action.User.New
......@@ -206,8 +209,8 @@ library
Gargantext.Database.Query.Table.Node
Gargantext.Database.Query.Table.Node.Error
Gargantext.Database.Query.Table.Node.UpdateOpaleye
Gargantext.Database.Query.Tree.Root
Gargantext.Database.Query.Table.User
Gargantext.Database.Query.Tree.Root
Gargantext.Database.Schema.Ngrams
Gargantext.Database.Schema.Node
Gargantext.Database.Schema.User
......@@ -310,11 +313,9 @@ library
Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
Gargantext.Core.Text.Corpus.Parsers.Wikimedia
Gargantext.Core.Text.Learn
Gargantext.Core.Text.List
Gargantext.Core.Text.List.Group
Gargantext.Core.Text.List.Group.Prelude
Gargantext.Core.Text.List.Group.WithScores
Gargantext.Core.Text.List.Group.WithStem
Gargantext.Core.Text.List.Learn
Gargantext.Core.Text.List.Merge
Gargantext.Core.Text.List.Social
......@@ -366,7 +367,6 @@ library
Gargantext.Database.Action.Metrics
Gargantext.Database.Action.Metrics.Lists
Gargantext.Database.Action.Metrics.NgramsByContext
Gargantext.Database.Action.Metrics.TFICF
Gargantext.Database.Action.Node
Gargantext.Database.Action.Share
Gargantext.Database.Action.TSQuery
......
......@@ -139,8 +139,8 @@ getGroupParams :: ( HasNodeError err
, HasTreeError err
)
=> GroupParams -> HashSet Ngrams -> DBCmd err GroupParams
getGroupParams gp@(GroupWithPosTag l nsc _m) ng = do
!hashMap <- HashMap.fromList <$> selectLems l nsc (HashSet.toList ng)
getGroupParams gp@(GroupWithPosTag { .. }) ng = do
!hashMap <- HashMap.fromList <$> selectLems _gwl_lang _gwl_nlp_config (HashSet.toList ng)
-- printDebug "hashMap" hashMap
pure $ over gwl_map (<> hashMap) gp
getGroupParams gp _ = pure gp
......@@ -188,8 +188,6 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
!groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
-- printDebug "[buildNgramsTermsList: groupParams']" ("" :: Text)
let
!socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
!groupedWithList = toGroupedTree socialLists_Stemmed allTerms
......
......@@ -44,7 +44,7 @@ addScoreStem groupParams ngrams fl = foldl' addScorePatch fl
------------------------------------------------------------------------
-- | Main Types
newtype StopSize = StopSize {unStopSize :: Int}
deriving (Eq)
deriving (Eq, Show)
-- | TODO: group with 2 terms only can be
-- discussed. Main purpose of this is offering
......@@ -61,7 +61,7 @@ data GroupParams = GroupParams { unGroupParams_lang :: !Lang
, _gwl_nlp_config :: !NLPServerConfig
, _gwl_map :: !(HashMap Form Lem)
}
deriving (Eq)
deriving (Eq, Show)
------------------------------------------------------------------------
groupWith :: GroupParams
......
......@@ -23,7 +23,7 @@ import Data.Swagger ( ToSchema(..), genericDeclareNamedSchema, defaultSchemaOpti
import Data.Text qualified as T
import Data.Vector qualified as V
import Gargantext.API.Ngrams.Types (NgramsTerm, NgramsPatch)
import Gargantext.Core.NodeStory (getNodesArchiveHistory)
import Gargantext.Core.NodeStory.DB ( getNodesArchiveHistory )
import Gargantext.Core.Text.List.Social.Find (findListsId)
import Gargantext.Core.Text.List.Social.Patch (addScorePatches)
import Gargantext.Core.Text.List.Social.Prelude (FlowCont, FlowListScores)
......
......@@ -125,7 +125,11 @@ class ExtractNgramsT h
------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms l pa po (Terms ng1 ng2) =
NgramsPostag l pa po form lem
NgramsPostag { _np_lang = l
, _np_algo = pa
, _np_postag = po
, _np_form = form
, _np_lem = lem }
where
form = text2ngrams $ Text.intercalate " " ng1
lem = text2ngrams $ Text.intercalate " " $ Set.toList ng2
......@@ -137,7 +141,7 @@ cleanNgrams s ng
| otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms))
cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
cleanExtractedNgrams s (SimpleNgrams ng) = SimpleNgrams $ (cleanNgrams s) ng
cleanExtractedNgrams s (SimpleNgrams ng) = SimpleNgrams $ cleanNgrams s ng
cleanExtractedNgrams s (EnrichedNgrams ng) = EnrichedNgrams $ over np_form (cleanNgrams s)
$ over np_lem (cleanNgrams s) ng
......@@ -155,8 +159,7 @@ insertExtractedNgrams ngs = do
m2 <- insertNgramsPostag (map unEnrichedNgrams e)
--printDebug "terms" m2
let result = HashMap.union m1 m2
pure result
pure $ HashMap.union m1 m2
isSimpleNgrams :: ExtractedNgrams -> Bool
isSimpleNgrams (SimpleNgrams _) = True
......@@ -188,10 +191,10 @@ type MinNgramSize = Int
termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panicTrace "[termsUnsupervised] no model"
termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) =
map (\(t, cnt) -> (text2term _tt_lang t, cnt))
map (first (text2term _tt_lang))
. groupWithCounts
-- . List.nub
. (List.filter (\l' -> List.length l' >= _tt_windowSize))
. List.filter (\l' -> List.length l' >= _tt_windowSize)
. List.concat
. mainEleveWith _tt_model _tt_ngramsSize
. uniText
......@@ -199,12 +202,11 @@ termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t)
newTries n t = buildTries n (toToken <$> uniText t)
-- | TODO removing long terms > 24
uniText :: Text -> [[Text]]
uniText = map (List.filter (not . isPunctuation))
. map tokenize
uniText = map (List.filter (not . isPunctuation) . tokenize)
. sentences -- TODO get sentences according to lang
. Text.toLower
......@@ -213,5 +215,5 @@ text2term _ [] = Terms [] Set.empty
text2term lang txt = Terms txt (Set.fromList $ map (stem lang PorterAlgorithm) txt)
isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure)
isPunctuation x = List.elem x $ Text.pack . pure
<$> ("!?(),;.:" :: String)
......@@ -367,6 +367,7 @@ flowCorpusUser l user userCorpusId listId ctype mfslw = do
pure userCorpusId
-- | This function is responsible for contructing terms.
buildSocialList :: ( HasNodeError err
, HasValidationError err
, HasNLPServer env
......@@ -455,8 +456,8 @@ saveDocNgramsWith lId mapNgramsDocs' = do
--printDebug "saveDocNgramsWith" mapCgramsId
-- insertDocNgrams
let ngrams2insert = catMaybes [ ContextNodeNgrams2 <$> Just (nodeId2ContextId nId)
<*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
let ngrams2insert = catMaybes [ ContextNodeNgrams2 (nodeId2ContextId nId)
<$> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
<*> Just (fromIntegral w :: Double)
| (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
......
......@@ -20,7 +20,7 @@ module Gargantext.Database.Action.Flow.Extract
import Control.Lens ((^.), _Just, view)
import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as DM
import Gargantext.Core (Lang, NLPServerConfig, PosTagAlgo(CoreNLP))
import Gargantext.Core (Lang, NLPServerConfig(server))
import Gargantext.Core.Text (HasText(..))
import Gargantext.Core.Text.Corpus.Parsers (splitOn)
import Gargantext.Core.Text.Ngrams (NgramsType(..))
......@@ -77,7 +77,7 @@ instance ExtractNgramsT HyperdataDocument
$ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
$ doc ^. hd_authors
termsWithCounts' <- map (first (enrichedTerms (lang ^. tt_lang) CoreNLP NP)) . concat <$>
termsWithCounts' <- map (first (enrichedTerms (lang ^. tt_lang) (server ncs) NP)) . concat <$>
liftBase (extractTerms ncs lang $ hasText doc)
pure $ HashMap.fromList
......
......@@ -88,7 +88,7 @@ insertNgramsPostag' :: [NgramsPostagInsert] -> DBCmd err [Indexed Text Int]
insertNgramsPostag' ns = runPGSQuery queryInsertNgramsPostag (PGS.Only $ Values fields ns)
where
fields = map (\t -> QualifiedIdentifier Nothing t) $ snd fields_name
fields = map (QualifiedIdentifier Nothing) $ snd fields_name
fields_name :: ( [Text], [Text])
fields_name = ( ["lang_id", "algo_id", "postag", "form", "form_n", "lem" , "lem_n"]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment