[ngrams] more types annotations

parent 0d8a77c4
Pipeline #7187 passed with stages
in 53 minutes and 43 seconds
......@@ -176,7 +176,7 @@ terms :: NLPServerConfig -> TermType Lang -> Text -> IO [TermsWithCount]
terms _ (Mono lang) txt = pure $ monoTerms lang txt
terms ncs (Multi lang) txt = multiterms ncs lang txt
terms ncs (MonoMulti lang) txt = terms ncs (Multi lang) txt
terms _ (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
terms _ (Unsupervised { .. }) txt = pure $ termsUnsupervised _tt_lang m' _tt_windowSize _tt_ngramsSize txt
where
m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
......@@ -189,17 +189,15 @@ type MinNgramSize = Int
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: newtype BlockText
termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panicTrace "[termsUnsupervised] no model"
termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) =
map (first (text2term _tt_lang))
termsUnsupervised :: Lang -> Tries Token () -> Int -> Int -> Text -> [TermsWithCount]
termsUnsupervised lang model windowSize ngramsSize =
map (first (text2term lang))
. groupWithCounts
-- . List.nub
. List.filter (\l' -> List.length l' >= _tt_windowSize)
. List.filter (\l' -> List.length l' >= windowSize)
. List.concat
. mainEleveWith _tt_model _tt_ngramsSize
. mainEleveWith model ngramsSize
. uniText
termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token ()
......
......@@ -12,6 +12,7 @@ Portability : POSIX
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Database.Action.Flow.Extract
......@@ -30,6 +31,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact, cw_
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument, hd_authors, hd_bdd, hd_institutes, hd_source )
import Gargantext.Database.Admin.Types.Node ( Node )
import Gargantext.Database.Prelude (DBCmd)
import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag)
import Gargantext.Database.Schema.Ngrams ( text2ngrams )
import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Prelude
......@@ -77,7 +79,8 @@ instance ExtractNgramsT HyperdataDocument
$ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
$ doc ^. hd_authors
termsWithCounts' <- map (first (enrichedTerms (lang ^. tt_lang) (server ncs) NP)) . concat <$>
termsWithCounts' :: [(NgramsPostag, TermsCount)] <-
map (first (enrichedTerms (lang ^. tt_lang) (server ncs) NP)) . concat <$>
liftBase (extractTerms ncs lang $ hasText doc)
pure $ HashMap.fromList
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment