[ngrams] refactor opaque Int into TermsWeight newtype

parent a81ea049
Pipeline #7176 passed with stages
in 47 minutes and 37 seconds
...@@ -52,7 +52,7 @@ import Gargantext.Core.Text.Terms.Mono (monoTerms) ...@@ -52,7 +52,7 @@ import Gargantext.Core.Text.Terms.Mono (monoTerms)
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..)) import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize) import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms) import Gargantext.Core.Text.Terms.Multi (multiterms)
import Gargantext.Core.Types ( TermsCount, POS, Terms(..), TermsWithCount ) import Gargantext.Core.Types ( TermsCount, TermsWeight, POS, Terms(..), TermsWithCount )
import Gargantext.Core.Utils (groupWithCounts) import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Database.Prelude (DBCmd) import Gargantext.Database.Prelude (DBCmd)
import Gargantext.Database.Query.Table.Ngrams (insertNgrams) import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
...@@ -122,7 +122,7 @@ class ExtractNgramsT h ...@@ -122,7 +122,7 @@ class ExtractNgramsT h
=> NLPServerConfig => NLPServerConfig
-> TermType Lang -> TermType Lang
-> h -> h
-> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount)) -> DBCmd err (HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
------------------------------------------------------------------------ ------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms l pa po (Terms { .. }) = enrichedTerms l pa po (Terms { .. }) =
......
...@@ -20,7 +20,7 @@ commentary with @some markup@. ...@@ -20,7 +20,7 @@ commentary with @some markup@.
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Database.Admin.Types.Node , module Gargantext.Database.Admin.Types.Node
, DebugMode(..), withDebugMode , DebugMode(..), withDebugMode
, Term(..), Terms(..), TermsCount, TermsWithCount , Term(..), Terms(..), TermsCount, TermsWeight(..), TermsWithCount
, TokenTag(..), POS(..), NER(..) , TokenTag(..), POS(..), NER(..)
, Label, Stems , Label, Stems
, HasValidationError(..), assertValid , HasValidationError(..), assertValid
...@@ -74,6 +74,9 @@ type TermsCount = Int ...@@ -74,6 +74,9 @@ type TermsCount = Int
type TermsWithCount = (Terms, TermsCount) type TermsWithCount = (Terms, TermsCount)
newtype TermsWeight = TermsWeight { unTermsWeight :: Int }
deriving newtype (Eq, Ord, Num, Show)
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Tag = POS | NER data Tag = POS | NER
deriving (Show, Eq) deriving (Show, Eq)
......
...@@ -82,7 +82,7 @@ import Gargantext.Core.Text.List.Social (FlowSocialListWith(..)) ...@@ -82,7 +82,7 @@ import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms), Ngrams(_ngramsTerms)) import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms), Ngrams(_ngramsTerms))
import Gargantext.Core.Text.Terms import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..)) import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types (HasValidationError, TermsCount) import Gargantext.Core.Types (HasValidationError, TermsCount, TermsWeight(..))
import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main ( ListType(MapTerm) ) import Gargantext.Core.Types.Main ( ListType(MapTerm) )
import Gargantext.Database.Action.Flow.Extract () -- ExtractNgramsT instances import Gargantext.Database.Action.Flow.Extract () -- ExtractNgramsT instances
...@@ -430,7 +430,7 @@ insertMasterDocs ncs c lang hs = do ...@@ -430,7 +430,7 @@ insertMasterDocs ncs c lang hs = do
-- this will enable global database monitoring -- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int)) -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs' :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount))) mapNgramsDocs' :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
<- mapNodeIdNgrams <- mapNodeIdNgrams
<$> documentIdWithNgrams <$> documentIdWithNgrams
(extractNgramsT ncs $ withLang lang documentsWithId) (extractNgramsT ncs $ withLang lang documentsWithId)
...@@ -445,7 +445,7 @@ insertMasterDocs ncs c lang hs = do ...@@ -445,7 +445,7 @@ insertMasterDocs ncs c lang hs = do
saveDocNgramsWith :: (IsDBCmd env err m) saveDocNgramsWith :: (IsDBCmd env err m)
=> ListId => ListId
-> HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount))) -> HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
-> m () -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do saveDocNgramsWith lId mapNgramsDocs' = do
--printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs' --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
...@@ -453,7 +453,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do ...@@ -453,7 +453,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
-- mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs' -- mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
(terms2id :: HashMap.HashMap Text NgramsId) <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs' (terms2id :: HashMap.HashMap Text NgramsId) <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
let mapNgramsDocs :: HashMap.HashMap Ngrams (Map NgramsType (Map NodeId (Int, TermsCount))) let mapNgramsDocs :: HashMap.HashMap Ngrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs' mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
-- new -- new
...@@ -465,7 +465,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do ...@@ -465,7 +465,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
-- insertDocNgrams -- insertDocNgrams
let ngrams2insert = catMaybes [ ContextNodeNgrams2 (nodeId2ContextId nId) let ngrams2insert = catMaybes [ ContextNodeNgrams2 (nodeId2ContextId nId)
<$> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'') <$> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
<*> Just (fromIntegral w :: Double) <*> Just (fromIntegral $ unTermsWeight w :: Double)
| (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
, (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight , (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight
......
...@@ -25,7 +25,7 @@ import Gargantext.Core.Text (HasText(..)) ...@@ -25,7 +25,7 @@ import Gargantext.Core.Text (HasText(..))
import Gargantext.Core.Text.Corpus.Parsers (splitOn) import Gargantext.Core.Text.Corpus.Parsers (splitOn)
import Gargantext.Core.Text.Ngrams (NgramsType(..)) import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms (ExtractNgramsT, ExtractedNgrams(..), TermType, cleanExtractedNgrams, enrichedTerms, extractNgramsT, extractTerms, tt_lang) import Gargantext.Core.Text.Terms (ExtractNgramsT, ExtractedNgrams(..), TermType, cleanExtractedNgrams, enrichedTerms, extractNgramsT, extractTerms, tt_lang)
import Gargantext.Core.Types (POS(NP), TermsCount) import Gargantext.Core.Types (POS(NP), TermsCount, TermsWeight)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact, cw_lastName, hc_who ) import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact, cw_lastName, hc_who )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument, hd_authors, hd_bdd, hd_institutes, hd_source ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument, hd_authors, hd_bdd, hd_institutes, hd_source )
import Gargantext.Database.Admin.Types.Node ( Node ) import Gargantext.Database.Admin.Types.Node ( Node )
...@@ -42,7 +42,7 @@ instance ExtractNgramsT HyperdataContact ...@@ -42,7 +42,7 @@ instance ExtractNgramsT HyperdataContact
extractNgramsT _ncs l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc extractNgramsT _ncs l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
where where
extract :: TermType Lang -> HyperdataContact extract :: TermType Lang -> HyperdataContact
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType Int, TermsCount)) -> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extract _l hc' = do extract _l hc' = do
let authors = map text2ngrams let authors = map text2ngrams
$ maybe ["Nothing"] (\a -> [a]) $ maybe ["Nothing"] (\a -> [a])
...@@ -59,11 +59,11 @@ instance ExtractNgramsT HyperdataDocument ...@@ -59,11 +59,11 @@ instance ExtractNgramsT HyperdataDocument
extractNgramsT :: NLPServerConfig extractNgramsT :: NLPServerConfig
-> TermType Lang -> TermType Lang
-> HyperdataDocument -> HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType Int, TermsCount)) -> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgramsT ncs lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' hd extractNgramsT ncs lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' hd
where where
extractNgramsT' :: HyperdataDocument extractNgramsT' :: HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType Int, TermsCount)) -> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgramsT' doc = do extractNgramsT' doc = do
let source = text2ngrams let source = text2ngrams
$ maybe "Nothing" identity $ maybe "Nothing" identity
......
...@@ -26,7 +26,7 @@ import Gargantext.Core.Text ( HasText ) ...@@ -26,7 +26,7 @@ import Gargantext.Core.Text ( HasText )
import Gargantext.API.Admin.Orchestrator.Types qualified as API import Gargantext.API.Admin.Orchestrator.Types qualified as API
import Gargantext.Core.Text.Ngrams (NgramsType(..)) import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms ( ExtractNgramsT ) import Gargantext.Core.Text.Terms ( ExtractNgramsT )
import Gargantext.Core.Types (HasValidationError, TermsCount) import Gargantext.Core.Types (HasValidationError, TermsCount, TermsWeight)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument )
import Gargantext.Database.Admin.Types.Node (NodeId) import Gargantext.Database.Admin.Types.Node (NodeId)
...@@ -67,7 +67,7 @@ type FlowInsertDB a = ( AddUniqId a ...@@ -67,7 +67,7 @@ type FlowInsertDB a = ( AddUniqId a
data DocumentIdWithNgrams a b = data DocumentIdWithNgrams a b =
DocumentIdWithNgrams DocumentIdWithNgrams
{ documentWithId :: Indexed NodeId a { documentWithId :: Indexed NodeId a
, documentNgrams :: HashMap b (Map NgramsType Int, TermsCount) , documentNgrams :: HashMap b (Map NgramsType TermsWeight, TermsCount)
} deriving (Show) } deriving (Show)
......
...@@ -28,7 +28,7 @@ import Gargantext.Core.Flow.Types (UniqId, uniqId) ...@@ -28,7 +28,7 @@ import Gargantext.Core.Flow.Types (UniqId, uniqId)
import Gargantext.Core.Text.Ngrams ( Ngrams, NgramsType ) import Gargantext.Core.Text.Ngrams ( Ngrams, NgramsType )
import Gargantext.Core.Text.Terms (ExtractedNgrams(..)) import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatternsWith, termsInText) import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatternsWith, termsInText)
import Gargantext.Core.Types (TermsCount) import Gargantext.Core.Types (TermsCount, TermsWeight(..))
import Gargantext.Core.Utils (addTuples) import Gargantext.Core.Utils (addTuples)
import Gargantext.Data.HashMap.Strict.Utils qualified as HashMap import Gargantext.Data.HashMap.Strict.Utils qualified as HashMap
import Gargantext.Database.Action.Flow.Types (DocumentIdWithNgrams(..), FlowInsertDB) import Gargantext.Database.Action.Flow.Types (DocumentIdWithNgrams(..), FlowInsertDB)
...@@ -47,7 +47,7 @@ import Gargantext.Prelude.Crypto.Hash (Hash) ...@@ -47,7 +47,7 @@ import Gargantext.Prelude.Crypto.Hash (Hash)
insertDocNgrams :: ListId insertDocNgrams :: ListId
-> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId (Int, TermsCount))) -> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId (TermsWeight, TermsCount)))
-> DBCmd err Int -> DBCmd err Int
insertDocNgrams lId m = do insertDocNgrams lId m = do
-- printDebug "[insertDocNgrams] ns" ns -- printDebug "[insertDocNgrams] ns" ns
...@@ -56,11 +56,11 @@ insertDocNgrams lId m = do ...@@ -56,11 +56,11 @@ insertDocNgrams lId m = do
ns = [ ContextNodeNgrams (nodeId2ContextId docId) ns = [ ContextNodeNgrams (nodeId2ContextId docId)
lId (ng^.index) lId (ng^.index)
(NgramsTypeId $ toDBid t) (NgramsTypeId $ toDBid t)
(fromIntegral i) (fromIntegral $ unTermsWeight w)
cnt cnt
| (ng, t2n2i) <- HashMap.toList m | (ng, t2n2i) <- HashMap.toList m
, (t, n2i) <- DM.toList t2n2i , (t, n2i) <- DM.toList t2n2i
, (docId, (i, cnt)) <- DM.toList n2i , (docId, (w, cnt)) <- DM.toList n2i
] ]
-- [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})] -- [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})]
...@@ -88,8 +88,8 @@ docNgrams lang ts doc = ...@@ -88,8 +88,8 @@ docNgrams lang ts doc =
documentIdWithNgrams :: HasNodeError err documentIdWithNgrams :: HasNodeError err
=> (a => ( a
-> DBCmd err (HashMap.HashMap b (Map NgramsType Int, TermsCount))) -> DBCmd err (HashMap.HashMap b (Map NgramsType TermsWeight, TermsCount)) )
-> [Indexed NodeId a] -> [Indexed NodeId a]
-> DBCmd err [DocumentIdWithNgrams a b] -> DBCmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams documentIdWithNgrams f = traverse toDocumentIdWithNgrams
...@@ -104,7 +104,7 @@ mapNodeIdNgrams :: (Ord b, Hashable b) ...@@ -104,7 +104,7 @@ mapNodeIdNgrams :: (Ord b, Hashable b)
=> [DocumentIdWithNgrams a b] => [DocumentIdWithNgrams a b]
-> HashMap.HashMap b -> HashMap.HashMap b
(Map NgramsType (Map NgramsType
(Map NodeId (Int, TermsCount)) (Map NodeId (TermsWeight, TermsCount))
) )
mapNodeIdNgrams = HashMap.unionsWith (DM.unionWith (DM.unionWith addTuples)) . fmap f mapNodeIdNgrams = HashMap.unionsWith (DM.unionWith (DM.unionWith addTuples)) . fmap f
where where
...@@ -113,8 +113,8 @@ mapNodeIdNgrams = HashMap.unionsWith (DM.unionWith (DM.unionWith addTuples)) . f ...@@ -113,8 +113,8 @@ mapNodeIdNgrams = HashMap.unionsWith (DM.unionWith (DM.unionWith addTuples)) . f
-- for it (which is the number of times the terms appears in a -- for it (which is the number of times the terms appears in a
-- document) is copied over to all its types. -- document) is copied over to all its types.
f :: DocumentIdWithNgrams a b f :: DocumentIdWithNgrams a b
-> HashMap.HashMap b (Map NgramsType (Map NodeId (Int, TermsCount))) -> HashMap.HashMap b (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> DM.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\w -> DM.singleton nId (w, cnt)) ngramsTypeMap) $ documentNgrams d
where where
nId = _index $ documentWithId d nId = _index $ documentWithId d
...@@ -192,35 +192,17 @@ ngramsByDoc :: Lang ...@@ -192,35 +192,17 @@ ngramsByDoc :: Lang
-> NgramsType -> NgramsType
-> [NT.NgramsTerm] -> [NT.NgramsTerm]
-> [ContextOnlyId HyperdataDocument] -> [ContextOnlyId HyperdataDocument]
-> [HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (Int, TermsCount)))] -> [HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (TermsWeight, TermsCount)))]
ngramsByDoc l nt ts docs = ngramsByDoc l nt ts docs =
ngramsByDoc' l nt ts <$> docs ngramsByDoc' l nt ts <$> docs
-- | Given list of terms and a document, produce a map for this doc's -- | Given list of terms and a document, produce a map for this doc's
-- terms count and weights. Notice that the weight is always 1 here. -- terms count and weights. Notice that the weight is always 1 here.
-- ngramsByDoc' :: Lang
-- -> NgramsType
-- -> [NT.NgramsTerm]
-- -> ContextOnlyId HyperdataDocument
-- -> HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (Int, TermsCount)))
-- ngramsByDoc' l nt ts doc =
-- HashMap.fromListWith (DM.unionWith (DM.unionWith (\(_a,b) (_a',b') -> (1,b+b')))) withExtractedNgrams
-- where
-- _docNgrams' :: ([(MatchedText, TermsCount)], NodeId)
-- _docNgrams'@(matched, nId) = (docNgrams l ts doc, doc ^. context_oid_id)
-- withExtractedNgrams :: [(ExtractedNgrams, Map NgramsType (Map NodeId (Int, TermsCount)))]
-- withExtractedNgrams =
-- map (\(matchedText, cnt) ->
-- ( SimpleNgrams (text2ngrams matchedText)
-- , DM.singleton nt $ DM.singleton nId (1, cnt) ) ) matched
ngramsByDoc' :: Lang ngramsByDoc' :: Lang
-> NgramsType -> NgramsType
-> [NT.NgramsTerm] -> [NT.NgramsTerm]
-> ContextOnlyId HyperdataDocument -> ContextOnlyId HyperdataDocument
-> HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (Int, TermsCount))) -> HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
ngramsByDoc' l nt ts doc = ngramsByDoc' l nt ts doc =
HashMap.map (\cnt -> DM.singleton nt $ DM.singleton nId (1, cnt)) extractedMap HashMap.map (\cnt -> DM.singleton nt $ DM.singleton nId (1, cnt)) extractedMap
where where
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment