[ngrams] refactor opaque Int into TermsWeight newtype

parent a81ea049
Pipeline #7176 passed with stages
in 47 minutes and 37 seconds
......@@ -52,7 +52,7 @@ import Gargantext.Core.Text.Terms.Mono (monoTerms)
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms)
import Gargantext.Core.Types ( TermsCount, POS, Terms(..), TermsWithCount )
import Gargantext.Core.Types ( TermsCount, TermsWeight, POS, Terms(..), TermsWithCount )
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Database.Prelude (DBCmd)
import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
......@@ -122,7 +122,7 @@ class ExtractNgramsT h
=> NLPServerConfig
-> TermType Lang
-> h
-> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
-> DBCmd err (HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms l pa po (Terms { .. }) =
......
......@@ -20,7 +20,7 @@ commentary with @some markup@.
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Database.Admin.Types.Node
, DebugMode(..), withDebugMode
, Term(..), Terms(..), TermsCount, TermsWithCount
, Term(..), Terms(..), TermsCount, TermsWeight(..), TermsWithCount
, TokenTag(..), POS(..), NER(..)
, Label, Stems
, HasValidationError(..), assertValid
......@@ -74,6 +74,9 @@ type TermsCount = Int
type TermsWithCount = (Terms, TermsCount)
newtype TermsWeight = TermsWeight { unTermsWeight :: Int }
deriving newtype (Eq, Ord, Num, Show)
------------------------------------------------------------------------
data Tag = POS | NER
deriving (Show, Eq)
......
......@@ -82,7 +82,7 @@ import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms), Ngrams(_ngramsTerms))
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types (HasValidationError, TermsCount)
import Gargantext.Core.Types (HasValidationError, TermsCount, TermsWeight(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main ( ListType(MapTerm) )
import Gargantext.Database.Action.Flow.Extract () -- ExtractNgramsT instances
......@@ -430,7 +430,7 @@ insertMasterDocs ncs c lang hs = do
-- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs' :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
mapNgramsDocs' :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
<- mapNodeIdNgrams
<$> documentIdWithNgrams
(extractNgramsT ncs $ withLang lang documentsWithId)
......@@ -445,7 +445,7 @@ insertMasterDocs ncs c lang hs = do
saveDocNgramsWith :: (IsDBCmd env err m)
=> ListId
-> HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
-> HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
-> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
--printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
......@@ -453,7 +453,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
-- mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
(terms2id :: HashMap.HashMap Text NgramsId) <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
let mapNgramsDocs :: HashMap.HashMap Ngrams (Map NgramsType (Map NodeId (Int, TermsCount)))
let mapNgramsDocs :: HashMap.HashMap Ngrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
-- new
......@@ -465,7 +465,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
-- insertDocNgrams
let ngrams2insert = catMaybes [ ContextNodeNgrams2 (nodeId2ContextId nId)
<$> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
<*> Just (fromIntegral w :: Double)
<*> Just (fromIntegral $ unTermsWeight w :: Double)
| (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
, (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight
......
......@@ -25,7 +25,7 @@ import Gargantext.Core.Text (HasText(..))
import Gargantext.Core.Text.Corpus.Parsers (splitOn)
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms (ExtractNgramsT, ExtractedNgrams(..), TermType, cleanExtractedNgrams, enrichedTerms, extractNgramsT, extractTerms, tt_lang)
import Gargantext.Core.Types (POS(NP), TermsCount)
import Gargantext.Core.Types (POS(NP), TermsCount, TermsWeight)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact, cw_lastName, hc_who )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument, hd_authors, hd_bdd, hd_institutes, hd_source )
import Gargantext.Database.Admin.Types.Node ( Node )
......@@ -42,7 +42,7 @@ instance ExtractNgramsT HyperdataContact
extractNgramsT _ncs l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
where
extract :: TermType Lang -> HyperdataContact
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extract _l hc' = do
let authors = map text2ngrams
$ maybe ["Nothing"] (\a -> [a])
......@@ -59,11 +59,11 @@ instance ExtractNgramsT HyperdataDocument
extractNgramsT :: NLPServerConfig
-> TermType Lang
-> HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgramsT ncs lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' hd
where
extractNgramsT' :: HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgramsT' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
......
......@@ -26,7 +26,7 @@ import Gargantext.Core.Text ( HasText )
import Gargantext.API.Admin.Orchestrator.Types qualified as API
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms ( ExtractNgramsT )
import Gargantext.Core.Types (HasValidationError, TermsCount)
import Gargantext.Core.Types (HasValidationError, TermsCount, TermsWeight)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument )
import Gargantext.Database.Admin.Types.Node (NodeId)
......@@ -67,7 +67,7 @@ type FlowInsertDB a = ( AddUniqId a
data DocumentIdWithNgrams a b =
DocumentIdWithNgrams
{ documentWithId :: Indexed NodeId a
, documentNgrams :: HashMap b (Map NgramsType Int, TermsCount)
, documentNgrams :: HashMap b (Map NgramsType TermsWeight, TermsCount)
} deriving (Show)
......
......@@ -28,7 +28,7 @@ import Gargantext.Core.Flow.Types (UniqId, uniqId)
import Gargantext.Core.Text.Ngrams ( Ngrams, NgramsType )
import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatternsWith, termsInText)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Types (TermsCount, TermsWeight(..))
import Gargantext.Core.Utils (addTuples)
import Gargantext.Data.HashMap.Strict.Utils qualified as HashMap
import Gargantext.Database.Action.Flow.Types (DocumentIdWithNgrams(..), FlowInsertDB)
......@@ -47,7 +47,7 @@ import Gargantext.Prelude.Crypto.Hash (Hash)
insertDocNgrams :: ListId
-> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId (Int, TermsCount)))
-> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId (TermsWeight, TermsCount)))
-> DBCmd err Int
insertDocNgrams lId m = do
-- printDebug "[insertDocNgrams] ns" ns
......@@ -56,11 +56,11 @@ insertDocNgrams lId m = do
ns = [ ContextNodeNgrams (nodeId2ContextId docId)
lId (ng^.index)
(NgramsTypeId $ toDBid t)
(fromIntegral i)
(fromIntegral $ unTermsWeight w)
cnt
| (ng, t2n2i) <- HashMap.toList m
, (t, n2i) <- DM.toList t2n2i
, (docId, (i, cnt)) <- DM.toList n2i
, (docId, (w, cnt)) <- DM.toList n2i
]
-- [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})]
......@@ -88,8 +88,8 @@ docNgrams lang ts doc =
documentIdWithNgrams :: HasNodeError err
=> (a
-> DBCmd err (HashMap.HashMap b (Map NgramsType Int, TermsCount)))
=> ( a
-> DBCmd err (HashMap.HashMap b (Map NgramsType TermsWeight, TermsCount)) )
-> [Indexed NodeId a]
-> DBCmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
......@@ -104,7 +104,7 @@ mapNodeIdNgrams :: (Ord b, Hashable b)
=> [DocumentIdWithNgrams a b]
-> HashMap.HashMap b
(Map NgramsType
(Map NodeId (Int, TermsCount))
(Map NodeId (TermsWeight, TermsCount))
)
mapNodeIdNgrams = HashMap.unionsWith (DM.unionWith (DM.unionWith addTuples)) . fmap f
where
......@@ -113,8 +113,8 @@ mapNodeIdNgrams = HashMap.unionsWith (DM.unionWith (DM.unionWith addTuples)) . f
-- for it (which is the number of times the terms appears in a
-- document) is copied over to all its types.
f :: DocumentIdWithNgrams a b
-> HashMap.HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> DM.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
-> HashMap.HashMap b (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\w -> DM.singleton nId (w, cnt)) ngramsTypeMap) $ documentNgrams d
where
nId = _index $ documentWithId d
......@@ -192,35 +192,17 @@ ngramsByDoc :: Lang
-> NgramsType
-> [NT.NgramsTerm]
-> [ContextOnlyId HyperdataDocument]
-> [HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (Int, TermsCount)))]
-> [HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (TermsWeight, TermsCount)))]
ngramsByDoc l nt ts docs =
ngramsByDoc' l nt ts <$> docs
-- | Given list of terms and a document, produce a map for this doc's
-- terms count and weights. Notice that the weight is always 1 here.
-- ngramsByDoc' :: Lang
-- -> NgramsType
-- -> [NT.NgramsTerm]
-- -> ContextOnlyId HyperdataDocument
-- -> HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (Int, TermsCount)))
-- ngramsByDoc' l nt ts doc =
-- HashMap.fromListWith (DM.unionWith (DM.unionWith (\(_a,b) (_a',b') -> (1,b+b')))) withExtractedNgrams
-- where
-- _docNgrams' :: ([(MatchedText, TermsCount)], NodeId)
-- _docNgrams'@(matched, nId) = (docNgrams l ts doc, doc ^. context_oid_id)
-- withExtractedNgrams :: [(ExtractedNgrams, Map NgramsType (Map NodeId (Int, TermsCount)))]
-- withExtractedNgrams =
-- map (\(matchedText, cnt) ->
-- ( SimpleNgrams (text2ngrams matchedText)
-- , DM.singleton nt $ DM.singleton nId (1, cnt) ) ) matched
ngramsByDoc' :: Lang
-> NgramsType
-> [NT.NgramsTerm]
-> ContextOnlyId HyperdataDocument
-> HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (Int, TermsCount)))
-> HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
ngramsByDoc' l nt ts doc =
HashMap.map (\cnt -> DM.singleton nt $ DM.singleton nId (1, cnt)) extractedMap
where
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment