[ngrams] more types annotations

parent 0d8a77c4
Pipeline #7187 passed with stages
in 53 minutes and 43 seconds
...@@ -176,7 +176,7 @@ terms :: NLPServerConfig -> TermType Lang -> Text -> IO [TermsWithCount] ...@@ -176,7 +176,7 @@ terms :: NLPServerConfig -> TermType Lang -> Text -> IO [TermsWithCount]
terms _ (Mono lang) txt = pure $ monoTerms lang txt terms _ (Mono lang) txt = pure $ monoTerms lang txt
terms ncs (Multi lang) txt = multiterms ncs lang txt terms ncs (Multi lang) txt = multiterms ncs lang txt
terms ncs (MonoMulti lang) txt = terms ncs (Multi lang) txt terms ncs (MonoMulti lang) txt = terms ncs (Multi lang) txt
terms _ (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt terms _ (Unsupervised { .. }) txt = pure $ termsUnsupervised _tt_lang m' _tt_windowSize _tt_ngramsSize txt
where where
m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
...@@ -189,17 +189,15 @@ type MinNgramSize = Int ...@@ -189,17 +189,15 @@ type MinNgramSize = Int
-- | Unsupervised ngrams extraction -- | Unsupervised ngrams extraction
-- language agnostic extraction -- language agnostic extraction
-- TODO: newtype BlockText -- TODO: newtype BlockText
termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount] termsUnsupervised :: Lang -> Tries Token () -> Int -> Int -> Text -> [TermsWithCount]
termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panicTrace "[termsUnsupervised] no model" termsUnsupervised lang model windowSize ngramsSize =
termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) = map (first (text2term lang))
map (first (text2term _tt_lang))
. groupWithCounts . groupWithCounts
-- . List.nub -- . List.nub
. List.filter (\l' -> List.length l' >= _tt_windowSize) . List.filter (\l' -> List.length l' >= windowSize)
. List.concat . List.concat
. mainEleveWith _tt_model _tt_ngramsSize . mainEleveWith model ngramsSize
. uniText . uniText
termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token () newTries :: Int -> Text -> Tries Token ()
......
...@@ -12,6 +12,7 @@ Portability : POSIX ...@@ -12,6 +12,7 @@ Portability : POSIX
{-# OPTIONS_GHC -fno-warn-orphans #-} {-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE InstanceSigs #-} {-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Database.Action.Flow.Extract module Gargantext.Database.Action.Flow.Extract
...@@ -30,6 +31,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact, cw_ ...@@ -30,6 +31,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact, cw_
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument, hd_authors, hd_bdd, hd_institutes, hd_source ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument, hd_authors, hd_bdd, hd_institutes, hd_source )
import Gargantext.Database.Admin.Types.Node ( Node ) import Gargantext.Database.Admin.Types.Node ( Node )
import Gargantext.Database.Prelude (DBCmd) import Gargantext.Database.Prelude (DBCmd)
import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag)
import Gargantext.Database.Schema.Ngrams ( text2ngrams ) import Gargantext.Database.Schema.Ngrams ( text2ngrams )
import Gargantext.Database.Schema.Node (NodePoly(..)) import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Prelude import Gargantext.Prelude
...@@ -77,7 +79,8 @@ instance ExtractNgramsT HyperdataDocument ...@@ -77,7 +79,8 @@ instance ExtractNgramsT HyperdataDocument
$ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd)) $ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
$ doc ^. hd_authors $ doc ^. hd_authors
termsWithCounts' <- map (first (enrichedTerms (lang ^. tt_lang) (server ncs) NP)) . concat <$> termsWithCounts' :: [(NgramsPostag, TermsCount)] <-
map (first (enrichedTerms (lang ^. tt_lang) (server ncs) NP)) . concat <$>
liftBase (extractTerms ncs lang $ hasText doc) liftBase (extractTerms ncs lang $ hasText doc)
pure $ HashMap.fromList pure $ HashMap.fromList
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment