Commit 3b163685 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[ngrams] first version of ngrams counting in docs

parent 0d6a5eaa
Pipeline #3329 passed with stage
in 91 minutes and 4 seconds
......@@ -196,6 +196,8 @@ To build documentation, run:
stack --docker build --haddock --no-haddock-deps --fast
```
(in `.stack-work/dist/x86_64-linux-nix/Cabal-3.2.1.0/doc/html/gargantext`).
## GraphQL
Some introspection information.
......
......@@ -33,7 +33,6 @@ import Gargantext.Database.Admin.Trigger.Init
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataCorpus)
import Gargantext.Database.Admin.Types.Node (NodeType(NodeDocument, NodeContact))
import Gargantext.Database.Prelude (Cmd'', Cmd, execPGSQuery)
import Gargantext.Database.Query.Table.Node (getOrMkList)
import Gargantext.Prelude
import Gargantext.Prelude.Config (GargConfig(..), readConfig)
import Prelude (getLine)
......@@ -67,7 +66,7 @@ main = do
_ok <- getLine
cfg <- readConfig iniPath
let secret = _gc_secretkey cfg
let _secret = _gc_secretkey cfg
withDevEnv iniPath $ \env -> do
-- First upgrade the Database Schema
......
......@@ -24,7 +24,7 @@ services:
network_mode: host
#command: ["postgres", "-c", "log_statement=all"]
#ports:
#- 5432:5432
# - 5432:5432
environment:
POSTGRES_USER: gargantua
POSTGRES_PASSWORD: C8kdcUrAQy66U
......
......@@ -164,6 +164,7 @@ CREATE TABLE public.context_node_ngrams (
ngrams_id INTEGER NOT NULL REFERENCES public.ngrams (id) ON DELETE CASCADE,
ngrams_type INTEGER ,
weight double precision,
doc_count INTEGER,
PRIMARY KEY (context_id, node_id, ngrams_id, ngrams_type)
);
ALTER TABLE public.context_node_ngrams OWNER TO gargantua;
......
ALTER TABLE context_node_ngrams
ADD COLUMN doc_count INTEGER;
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.0.
-- This file has been generated from package.yaml by hpack version 0.34.7.
--
-- see: https://github.com/sol/hpack
name: gargantext
version: 0.0.6.8.2
version: 0.0.6.8.2
synopsis: Search, map, share
description: Please see README.md
category: Data
......@@ -50,6 +50,7 @@ library
Gargantext.Core.Types
Gargantext.Core.Types.Individu
Gargantext.Core.Types.Main
Gargantext.Core.Utils
Gargantext.Core.Utils.Prefix
Gargantext.Utils.Jobs
Gargantext.Utils.Jobs.API
......@@ -87,6 +88,7 @@ library
Gargantext.Core.Text.Prepare
Gargantext.Core.Text.Search
Gargantext.Core.Text.Terms
Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono
Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr
......@@ -211,7 +213,6 @@ library
Gargantext.Core.Text.Samples.EN
Gargantext.Core.Text.Samples.FR
Gargantext.Core.Text.Samples.SP
Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono.Stem
Gargantext.Core.Text.Terms.Mono.Stem.En
Gargantext.Core.Text.Terms.Mono.Token
......@@ -221,7 +222,6 @@ library
Gargantext.Core.Text.Terms.Multi.PosTagging
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Types.Phylo
Gargantext.Core.Utils
Gargantext.Core.Utils.DateUtils
Gargantext.Core.Viz
Gargantext.Core.Viz.Chart
......@@ -809,6 +809,7 @@ test-suite garg-test
Core.Text
Core.Text.Examples
Core.Text.Flow
Core.Utils
Graph.Clustering
Graph.Distance
Ngrams.Lang
......
......@@ -74,6 +74,7 @@ library:
- Gargantext.Core.Types
- Gargantext.Core.Types.Individu
- Gargantext.Core.Types.Main
- Gargantext.Core.Utils
- Gargantext.Core.Utils.Prefix
- Gargantext.Utils.Jobs
- Gargantext.Utils.Jobs.API
......@@ -111,6 +112,7 @@ library:
- Gargantext.Core.Text.Prepare
- Gargantext.Core.Text.Search
- Gargantext.Core.Text.Terms
- Gargantext.Core.Text.Terms.Eleve
- Gargantext.Core.Text.Terms.Mono
- Gargantext.Core.Text.Terms.Multi.Lang.En
- Gargantext.Core.Text.Terms.Multi.Lang.Fr
......
{-|
Module : Graph.Clustering
Description : Basic tests to avoid quick regression
......
{-|
Module : Core.Utils
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Core.Utils where
import Test.Hspec
import Gargantext.Prelude
import Gargantext.Core.Utils
-- | Core.Utils tests
test :: IO ()
test = hspec $ do
describe "check if groupWithCounts works" $ do
it "simple integer array" $ do
(groupWithCounts [1, 2, 3, 1, 2, 3]) `shouldBe` [(1, 2), (2, 2), (3, 2)]
it "string" $ do
(groupWithCounts "abccba") `shouldBe` [('a', 2), ('b', 2), ('c', 2)]
{-|
Module : Graph.Clustering
Description : Basic tests to avoid quick regression
......@@ -40,4 +39,3 @@ test = hspec $ do
let
result = List.length partitions > 1
shouldBe True result
......@@ -11,6 +11,8 @@ Portability : POSIX
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import qualified Core.Utils as Utils
--import qualified Ngrams.Lang.Fr as Fr
--import qualified Ngrams.Lang as Lang
import qualified Ngrams.Lang.Occurrences as Occ
......@@ -22,11 +24,12 @@ import qualified Utils.Crypto as Crypto
main :: IO ()
main = do
Utils.test
-- Occ.parsersTest
-- Lang.ngramsExtractionTest FR
-- Lang.ngramsExtractionTest EN
-- Metrics.main
Graph.test
Graph.test
PD.testFromRFC3339
-- GD.test
Crypto.test
{-|
Module : Utils.Crypto
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -43,4 +42,3 @@ test = hspec $ do
let hash2 = hash (["b","a"] :: [Text])
it "compare" $ do
hash1 `shouldBe` hash2
......@@ -184,9 +184,9 @@ saveNodeStory :: ( MonadReader env m, MonadBase IO m, HasNodeStorySaver env )
saveNodeStory = do
saver <- view hasNodeStorySaver
liftBase $ do
Gargantext.Prelude.putStrLn "---- Running node story saver ----"
--Gargantext.Prelude.putStrLn "---- Running node story saver ----"
saver
Gargantext.Prelude.putStrLn "---- Node story saver finished ----"
--Gargantext.Prelude.putStrLn "---- Node story saver finished ----"
saveNodeStoryImmediate :: ( MonadReader env m, MonadBase IO m, HasNodeStoryImmediateSaver env )
......@@ -194,9 +194,9 @@ saveNodeStoryImmediate :: ( MonadReader env m, MonadBase IO m, HasNodeStoryImmed
saveNodeStoryImmediate = do
saver <- view hasNodeStoryImmediateSaver
liftBase $ do
Gargantext.Prelude.putStrLn "---- Running node story immediate saver ----"
--Gargantext.Prelude.putStrLn "---- Running node story immediate saver ----"
saver
Gargantext.Prelude.putStrLn "---- Node story immediate saver finished ----"
--Gargantext.Prelude.putStrLn "---- Node story immediate saver finished ----"
listTypeConflictResolution :: ListType -> ListType -> ListType
......
......@@ -159,15 +159,15 @@ reIndexWith cId lId nt lts = do
-- TODO Tests here
let
ngramsByDoc = map (HashMap.fromList)
$ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
$ map (\doc -> List.zip
(termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
$ Text.unlines $ catMaybes
[ doc ^. context_hyperdata . hd_title
, doc ^. context_hyperdata . hd_abstract
]
$ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
$ map (\doc -> List.zip
(termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
$ Text.unlines $ catMaybes
[ doc ^. context_hyperdata . hd_title
, doc ^. context_hyperdata . hd_abstract
]
)
(List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
(List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
) docs
-- printDebug "ngramsByDoc" ngramsByDoc
......
......@@ -195,7 +195,7 @@ getCoocByNgrams' f (Diagonal diag) m =
listToCombi identity ks
]
where
where
ks = HM.keys m
-- TODO k could be either k1 or k2 here
......@@ -220,7 +220,7 @@ getCoocByNgrams'' (Diagonal diag) (f1,f2) (m1,m2) =
-- TODO check optim
-- listToCombi identity ks1
]
where
where
ks1 = HM.keys m1
ks2 = HM.keys m2
......
......@@ -455,7 +455,7 @@ insertArchiveList c nodeId a = do
where
query :: PGS.Query
query = [sql| WITH s as (SELECT ? as sid, ? sversion, ? sngrams_type_id, ngrams.id as sngrams_id, ?::jsonb as srepo FROM ngrams WHERE terms = ?)
INSERT INTO node_stories(node_id, version, ngrams_type_id, ngrams_id, ngrams_repo_element)
INSERT INTO node_stories(node_id, version, ngrams_type_id, ngrams_id, ngrams_repo_element)
SELECT s.sid, s.sversion, s.sngrams_type_id, s.sngrams_id, s.srepo from s s join nodes n on s.sid = n.id
|]
......@@ -505,22 +505,22 @@ updateNodeStory c nodeId@(NodeId _nId) currentArchive newArchive = do
--printDebug "[updateNodeStory] updates" $ Text.unlines $ (Text.pack . show) <$> updates
-- 2. Perform inserts/deletes/updates
printDebug "[updateNodeStory] applying insert" ()
--printDebug "[updateNodeStory] applying insert" ()
insertArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
, _a_history = []
, _a_state = archiveStateFromList inserts }
printDebug "[updateNodeStory] insert applied" ()
--printDebug "[updateNodeStory] insert applied" ()
--TODO Use currentArchive ^. a_version in delete and report error
-- if entries with (node_id, ngrams_type_id, ngrams_id) but
-- different version are found.
deleteArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
, _a_history = []
, _a_state = archiveStateFromList deletes }
printDebug "[updateNodeStory] delete applied" ()
--printDebug "[updateNodeStory] delete applied" ()
updateArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
, _a_history = []
, _a_state = archiveStateFromList updates }
printDebug "[updateNodeStory] update applied" ()
--printDebug "[updateNodeStory] update applied" ()
pure ()
-- where
......
......@@ -28,8 +28,9 @@ compute graph
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.Terms
where
......@@ -47,6 +48,7 @@ import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Data.HashMap.Strict as HashMap
import Gargantext.Core
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Core.Text (sentences, HasText(..))
import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Core.Text.Terms.Mono (monoTerms)
......@@ -70,24 +72,23 @@ data TermType lang
, _tt_model :: !(Maybe (Tries Token ()))
}
deriving (Generic)
deriving instance (Show lang) => Show (TermType lang)
makeLenses ''TermType
--group :: [Text] -> [Text]
--group = undefined
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
-- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------
-- | Sugar to extract terms from text (hiddeng mapM from end user).
-- | Sugar to extract terms from text (hidding 'mapM' from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms :: TermType Lang -> [Text] -> IO [[TermsWithCount]]
extractTerms (Unsupervised {..}) xs = mapM (terms (Unsupervised { _tt_model = Just m', .. })) xs
where
m' = case _tt_model of
Just m''-> m''
Nothing -> newTries _tt_windowSize (Text.intercalate " " xs)
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
......@@ -116,12 +117,13 @@ data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
instance Hashable ExtractedNgrams
-- | A typeclass that represents extracting ngrams from an entity.
class ExtractNgramsT h
where
extractNgramsT :: HasText h
=> TermType Lang
-> h
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms l pa po (Terms ng1 ng2) =
......@@ -132,7 +134,7 @@ enrichedTerms l pa po (Terms ng1 ng2) =
------------------------------------------------------------------------
cleanNgrams :: Int -> Ngrams -> Ngrams
cleanNgrams s ng
cleanNgrams s ng
| Text.length (ng ^. ngramsTerms) < s = ng
| otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms))
......@@ -151,10 +153,10 @@ insertExtractedNgrams ngs = do
let (s, e) = List.partition isSimpleNgrams ngs
m1 <- insertNgrams (map unSimpleNgrams s)
--printDebug "others" m1
m2 <- insertNgramsPostag (map unEnrichedNgrams e)
--printDebug "terms" m2
let result = HashMap.union m1 m2
pure result
......@@ -163,43 +165,48 @@ isSimpleNgrams (SimpleNgrams _) = True
isSimpleNgrams _ = False
------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
-- Multi : multi terms
-- MonoMulti : mono and multi
-- | Terms from 'Text'
-- 'Mono' : mono terms
-- 'Multi' : multi terms
-- 'MonoMulti' : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (Unsupervised { .. }) txt = termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
terms :: TermType Lang -> Text -> IO [TermsWithCount]
terms tt txt = do
printDebug "[terms] tt" tt
printDebug "[terms] txt" txt
out <- termsNoLog tt txt
printDebug "[terms] out" out
pure out
termsNoLog :: TermType Lang -> Text -> IO [TermsWithCount]
termsNoLog (Mono lang) txt = pure $ monoTerms lang txt
termsNoLog (Multi lang) txt = multiterms lang txt
termsNoLog (MonoMulti lang) txt = terms (Multi lang) txt
termsNoLog (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
where
m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: remove IO
-- TODO: newtype BlockText
type WindowSize = Int
type MinNgramSize = Int
termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
termsUnsupervised (Unsupervised l n s m) =
pure
. map (text2term l)
. List.nub
. (List.filter (\l' -> List.length l' >= s))
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: newtype BlockText
termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panic "[termsUnsupervised] no model"
termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) =
map (\(t, cnt) -> (text2term _tt_lang t, cnt))
. groupWithCounts
-- . List.nub
. (List.filter (\l' -> List.length l' >= _tt_windowSize))
. List.concat
. mainEleveWith (maybe (panic "no model") identity m) n
. mainEleveWith _tt_model _tt_ngramsSize
. uniText
termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t)
......@@ -217,5 +224,3 @@ text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure)
<$> ("!?(),;.:" :: String)
......@@ -32,6 +32,7 @@ Notes for current implementation:
-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
......@@ -278,6 +279,8 @@ data Tries k e = Tries
makeLenses ''Tries
deriving instance (Show k, Show e) => Show (Tries k e)
buildTries :: Int -> [[Token]] -> Tries Token ()
buildTries n sentences = Tries
{ _fwd = buildTrie Forward n sentences
......
......@@ -40,8 +40,8 @@ words = monoTexts
isSep :: Char -> Bool
isSep = (`elem` (",.:;?!(){}[]\"\'" :: String))
monoTerms :: Lang -> Text -> [Terms]
monoTerms l txt = map (monoText2term l) $ monoTexts txt
monoTerms :: Lang -> Text -> [TermsWithCount]
monoTerms l txt = map (\t -> (monoText2term l t, 1)) $ monoTexts txt
monoTexts :: Text -> [Text]
monoTexts = L.concat . monoTextsBySentence
......
......@@ -21,6 +21,7 @@ import Data.List (concat)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Core.Text.Terms.Multi.PosTagging
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
......@@ -37,14 +38,16 @@ import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
type NLP_API = Lang -> Text -> IO PosSentences
-------------------------------------------------------------------
multiterms :: Lang -> Text -> IO [Terms]
multiterms = multiterms' tokenTag2terms
multiterms :: Lang -> Text -> IO [TermsWithCount]
multiterms l txt = do
ret <- multiterms' tokenTag2terms l txt
pure $ groupWithCounts ret
where
multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
multiterms' f lang txt = concat
multiterms' f lang txt' = concat
<$> map (map f)
<$> map (filter (\t -> _my_token_pos t == Just NP))
<$> tokenTags lang txt
<$> tokenTags lang txt'
-------------------------------------------------------------------
tokenTag2terms :: TokenTag -> Terms
......@@ -57,10 +60,10 @@ tokenTags l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <
tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
tokenTagsWith lang txt nlp = map (groupTokens lang)
<$> map tokens2tokensTags
<$> map _sentenceTokens
<$> _sentences
<$> nlp lang txt
<$> map tokens2tokensTags
<$> map _sentenceTokens
<$> _sentences
<$> nlp lang txt
---- | This function analyses and groups (or not) ngrams according to
......
......@@ -30,4 +30,3 @@ group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
......@@ -40,4 +40,3 @@ groupTokens ntags = group2 NP NP
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
--
--groupNgrams (x:xs) = (x:(groupNgrams xs))
......@@ -45,19 +45,22 @@ tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------
tokenTag :: Token -> TokenTag
tokenTag (Token _ w _ l _ _ p n _ _) = TokenTag w' l' p n
tokenTag (Token { .. }) = TokenTag { _my_token_word = w'
, _my_token_lemma = l'
, _my_token_pos = _tokenPos
, _my_token_ner = _tokenNer }
where
w' = split w
l' = fromList (split l)
w' = split _tokenWord
l' = fromList (split _tokenLemma)
split = splitOn (pack " ") . toLower
filter' :: [TokenTag] -> [TokenTag]
filter' xs = filter isNgrams xs
where
isNgrams (TokenTag _ _ p n) = isJust p || isJust n
isNgrams (TokenTag { .. }) = isJust _my_token_pos || isJust _my_token_ner
------------------------------------------------------------------------
-- request =
-- request =
-- "fr" : {
-- "tokenize.language" : "fr",
-- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
......@@ -66,9 +69,9 @@ filter' xs = filter isNgrams xs
-- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
-- "depparse.language" : "french",
-- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
-- "ssplit.newlineIsSentenceBreak": "always"
-- "ssplit.newlineIsSentenceBreak": "always"
-- },
--
--
corenlp' :: ( FromJSON a
, ConvertibleStrings p ByteString
......@@ -80,6 +83,7 @@ corenlp' lang txt = do
FR -> "{\"annotators\": \"tokenize,ssplit,pos,lemma,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
_ -> panic $ pack "not implemented yet"
url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
let request = setRequestBodyLBS (cs txt) url
httpJSON request
......
......@@ -52,4 +52,3 @@ data PosSentences = PosSentences { _sentences :: [Sentence]}
deriving (Show, Generic)
$(deriveJSON (unPrefix "_") ''PosSentences)
{-|
Module : Gargantext.Core.Text.Terms.WithList
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -21,6 +21,8 @@ import Data.Text (Text, concat, unwords)
import Gargantext.Prelude
import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (groupWithCounts)
import Prelude (error)
import qualified Data.Algorithms.KMP as KMP
import qualified Data.IntMap.Strict as IntMap
......@@ -72,8 +74,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
-- Utils
type BlockText = Text
type MatchedText = Text
termsInText :: Patterns -> BlockText -> [MatchedText]
termsInText pats txt = List.nub
termsInText :: Patterns -> BlockText -> [(MatchedText, TermsCount)]
termsInText pats txt = groupWithCounts
$ List.concat
$ map (map unwords)
$ extractTermsWithList pats txt
......@@ -96,7 +98,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
{- | Not used
filterWith :: TermList
-> (a -> Text)
-> [a]
-> [a]
-> [(a, [Text])]
filterWith termList f xs = filterWith' termList f zip xs
......@@ -104,7 +106,7 @@ filterWith termList f xs = filterWith' termList f zip xs
filterWith' :: TermList
-> (a -> Text)
-> ([a] -> [[Text]] -> [b])
-> [a]
-> [a]
-> [b]
filterWith' termList f f' xs = f' xs
$ map (extractTermsWithList' pats)
......
{-|
Module : Gargantext.Types
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -17,7 +17,7 @@ commentary with @some markup@.
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Database.Admin.Types.Node
, DebugMode(..), withDebugMode
, Term, Terms(..)
, Term, Terms(..), TermsCount, TermsWithCount
, TokenTag(..), POS(..), NER(..)
, Label, Stems
, HasInvalidError(..), assertValid
......@@ -70,10 +70,13 @@ type Label = [Text]
data Terms = Terms { _terms_label :: Label
, _terms_stem :: Stems
} deriving (Ord, Show)
instance Eq Terms where
(==) (Terms _ s1) (Terms _ s2) = s1 == s2
type TermsCount = Int
type TermsWithCount = (Terms, TermsCount)
------------------------------------------------------------------------
data Tag = POS | NER
deriving (Show, Eq)
......@@ -208,5 +211,3 @@ data TODO = TODO
instance ToSchema TODO where
instance ToParamSchema TODO where
----------------------------------------------------------------------------
{-|
Module : Gargantext.Utils
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -12,16 +12,19 @@ commentary with @some markup@.
-}
module Gargantext.Core.Utils (
module Gargantext.Core.Utils (
-- module Gargantext.Utils.Chronos
module Gargantext.Core.Utils.Prefix
, something
, alphanum
, choices
, randomString
, groupWithCounts
, addTuples
) where
import Data.Char (chr, ord)
import qualified Data.List as List
import Data.Maybe
import Data.Monoid
import Data.Text (Text, pack)
......@@ -57,3 +60,17 @@ randomString :: Int -> IO Text
randomString num = do
str <- choices num alphanum
pure $ pack str
-- | Given a list of items of type 'a', return list with unique items
-- (like List.nub) but tuple-d with their counts in the original list
groupWithCounts :: (Ord a, Eq a) => [a] -> [(a, Int)]
groupWithCounts = map f
. List.group
. List.sort
where
f [] = panic "[groupWithCounts] impossible"
f ts@(t:_) = (t, length ts)
addTuples :: (Num a, Num b) => (a, b) -> (a, b) -> (a, b)
addTuples (a1, b1) (a2, b2) = (a1 + a2, b1 + b2)
......@@ -65,7 +65,7 @@ flowPhylo cId = do
patterns = buildPatterns termList
-- | To filter the Ngrams of a document based on the termList
filterTerms :: Patterns -> (Date, Text) -> (Date, [Text])
filterTerms patterns' (y,d) = (y,termsInText patterns' d)
filterTerms patterns' (y,d) = (y, fst <$> termsInText patterns' d)
docs = map ((\(y,t) -> Document y t) . filterTerms patterns) docs'
......@@ -123,4 +123,3 @@ writePhylo _fp _phview = undefined
-- refactor 2021
-- viewPhylo2Svg :: PhyloView -> IO DB.ByteString
-- viewPhylo2Svg p = graphvizWithHandle Dot (viewToDot p) Svg DB.hGetContents
......@@ -48,7 +48,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
where
import Conduit
import Control.Lens ((^.), view, _Just, makeLenses)
import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
import Data.Aeson.TH (deriveJSON)
import Data.Conduit.Internal (zipSources)
import Data.Either
......@@ -60,7 +60,6 @@ import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import qualified Data.Text as T
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import Servant.Client (ClientError)
......@@ -83,9 +82,10 @@ import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(.
import Gargantext.Core.Text.List.Social (FlowSocialListWith)
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (POS(NP))
import Gargantext.Core.Types (POS(NP), TermsCount)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils (addTuples)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
......@@ -284,7 +284,7 @@ flow c u cn la mfslw (mLength, docsC) logStatus = do
, _scst_events = Just []
}
pure $ Prelude.head id
------------------------------------------------------------------------
......@@ -357,25 +357,27 @@ insertMasterDocs c lang hs = do
-- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
<- mapNodeIdNgrams
<$> documentIdWithNgrams
(extractNgramsT $ withLang lang documentsWithId)
documentsWithId
(extractNgramsT $ withLang lang documentsWithId)
documentsWithId
lId <- getOrMkList masterCorpusId masterUserId
-- _ <- saveDocNgramsWith lId mapNgramsDocs'
_ <- saveDocNgramsWith lId mapNgramsDocs'
-- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
pure ids'
saveDocNgramsWith :: ( FlowCmdM env err m)
saveDocNgramsWith :: (FlowCmdM env err m)
=> ListId
-> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
-> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
-> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
--printDebug "terms2id" terms2id
printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount
let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
......@@ -392,7 +394,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
<*> Just (fromIntegral w :: Double)
| (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
, (nId, w) <- Map.toList mapNodeIdWeight
, (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight
]
-- to be removed
......@@ -451,28 +453,32 @@ mergeData rs = catMaybes . map toDocumentWithId . Map.toList
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
=> (a
-> Cmd err (HashMap b (Map NgramsType Int)))
-> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
-> [Indexed NodeId a]
-> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
e <- f $ _unIndex d
pure $ DocumentIdWithNgrams d e
pure $ DocumentIdWithNgrams d e
-- | TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
=> [DocumentIdWithNgrams a b]
-> HashMap b
(Map NgramsType
(Map NodeId Int)
(Map NgramsType
(Map NodeId (Int, TermsCount))
)
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
where
-- | NOTE We are somehow multiplying 'TermsCount' here: If the
-- same ngrams term has different ngrams types, the 'TermsCount'
-- for it (which is the number of times the terms appears in a
-- document) is copied over to all its types.
f :: DocumentIdWithNgrams a b
-> HashMap b (Map NgramsType (Map NodeId Int))
f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
-> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
where
nId = _index $ documentWithId d
......@@ -483,25 +489,25 @@ instance ExtractNgramsT HyperdataContact
extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
where
extract :: TermType Lang -> HyperdataContact
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extract _l hc' = do
let authors = map text2ngrams
$ maybe ["Nothing"] (\a -> [a])
$ view (hc_who . _Just . cw_lastName) hc'
pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
pure $ HashMap.fromList $ [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
instance ExtractNgramsT HyperdataDocument
where
extractNgramsT :: TermType Lang
-> HyperdataDocument
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
where
extractNgramsT' :: TermType Lang
-> HyperdataDocument
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extractNgramsT' lang' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
......@@ -515,23 +521,25 @@ instance ExtractNgramsT HyperdataDocument
$ maybe ["Nothing"] (T.splitOn ", ")
$ _hd_authors doc
terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
<$> concat
<$> liftBase (extractTerms lang' $ hasText doc)
termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
<$> concat
<$> liftBase (extractTerms lang' $ hasText doc)
printDebug "[extractNgramsT HyperdataDocument] termsWithCounts'" termsWithCounts'
printDebug "[extractNgramsT HyperdataDocument] termsWithLargerCounts" $ filter (\(_, cnt) -> cnt > 1) termsWithCounts'
pure $ HashMap.fromList
$ [(SimpleNgrams source, Map.singleton Sources 1) ]
<> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
<> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
<> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
$ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ]
<> [(SimpleNgrams i', (Map.singleton Institutes 1, 1)) | i' <- institutes ]
<> [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
<> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
where
extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h
instance HasText a => HasText (Node a)
where
hasText (Node _ _ _ _ _ _ _ h) = hasText h
hasText (Node { _node_hyperdata = h }) = hasText h
......
......@@ -15,6 +15,7 @@ module Gargantext.Database.Action.Flow.Utils
import Data.Map (Map)
import Data.HashMap.Strict (HashMap)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Query.Table.ContextNodeNgrams
......@@ -29,24 +30,24 @@ import qualified Data.HashMap.Strict as HashMap
data DocumentIdWithNgrams a b =
DocumentIdWithNgrams
{ documentWithId :: Indexed NodeId a
, documentNgrams :: HashMap b (Map NgramsType Int)
, documentNgrams :: HashMap b (Map NgramsType Int, TermsCount)
} deriving (Show)
insertDocNgrams :: ListId
-> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId Int))
-> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId (Int, TermsCount)))
-> Cmd err Int
insertDocNgrams lId m = insertContextNodeNgrams ns
insertDocNgrams lId m = do
printDebug "[insertDocNgrams] ns" ns
insertContextNodeNgrams ns
where
ns = [ ContextNodeNgrams docId lId (ng^.index)
(ngramsTypeId t)
(fromIntegral i)
cnt
| (ng, t2n2i) <- HashMap.toList m
, (t, n2i) <- DM.toList t2n2i
, (docId, i) <- DM.toList n2i
, (docId, (i, cnt)) <- DM.toList n2i
]
-- [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})]
-- {Ngrams: {NgramsType: {NodeId: (Int, TermsCount)}}}
......@@ -36,12 +36,13 @@ queryContextNodeNgramsTable = selectTable contextNodeNgramsTable
-- | Insert utils
insertContextNodeNgrams :: [ContextNodeNgrams] -> Cmd err Int
insertContextNodeNgrams = insertContextNodeNgramsW
. map (\(ContextNodeNgrams c n ng nt w) ->
. map (\(ContextNodeNgrams c n ng nt w dc) ->
ContextNodeNgrams (pgContextId c)
(pgNodeId n)
(sqlInt4 ng)
(pgNgramsTypeId nt)
(sqlDouble w)
(sqlInt4 dc)
)
insertContextNodeNgramsW :: [ContextNodeNgramsWrite] -> Cmd err Int
......
......@@ -35,8 +35,9 @@ updateHyperdata i h = mkCmd $ \c -> putStrLn "before runUpdate_" >>
updateHyperdataQuery :: ToJSON a => NodeId -> a -> Update Int64
updateHyperdataQuery i h = seq h' $ trace "updateHyperdataQuery: encoded JSON" $ Update
{ uTable = nodeTable
, uUpdateWith = updateEasy (\ (Node _ni _nh _nt _nu _np _nn _nd _h)
-> trace "updating mate" $ Node _ni _nh _nt _nu _np _nn _nd h'
, uUpdateWith = updateEasy (\ (Node { .. })
-> Node { _node_hyperdata = h', .. }
-- -> trace "updating mate" $ Node _ni _nh _nt _nu _np _nn _nd h'
)
, uWhere = (\row -> {-trace "uWhere" $-} _node_id row .== pgNodeId i )
, uReturning = rCount
......@@ -63,5 +64,3 @@ updateNodesWithType_ :: ( HasNodeError err
updateNodesWithType_ nt h = do
ns <- getNodesIdWithType nt
mapM (\n -> updateHyperdata n h) ns
......@@ -19,6 +19,7 @@ module Gargantext.Database.Schema.ContextNodeNgrams
where
import Prelude
import Gargantext.Core.Types (TermsCount)
import Gargantext.Database.Schema.Prelude
import Gargantext.Database.Schema.Ngrams (NgramsTypeId, NgramsId)
import Gargantext.Database.Admin.Types.Node
......@@ -26,15 +27,16 @@ import Gargantext.Database.Admin.Types.Node
type ContextNodeNgrams =
ContextNodeNgramsPoly ContextId ListId NgramsId NgramsTypeId Double
ContextNodeNgramsPoly ContextId ListId NgramsId NgramsTypeId Double TermsCount
data ContextNodeNgramsPoly c n ngrams_id ngt w
data ContextNodeNgramsPoly c n ngrams_id ngt w dc
= ContextNodeNgrams { _cnng_context_id :: !c
, _cnng_node_id :: !n
, _cnng_ngrams_id :: !ngrams_id
, _cnng_ngramsType :: !ngt
, _cnng_weight :: !w
, _cnng_doc_count :: !dc
} deriving (Show)
type ContextNodeNgramsWrite =
......@@ -43,6 +45,7 @@ type ContextNodeNgramsWrite =
(Column SqlInt4 )
(Column SqlInt4 )
(Column SqlFloat8)
(Column SqlInt4 )
type ContextNodeNgramsRead =
ContextNodeNgramsPoly (Column SqlInt4 )
......@@ -50,6 +53,7 @@ type ContextNodeNgramsRead =
(Column SqlInt4 )
(Column SqlInt4 )
(Column SqlFloat8)
(Column SqlInt4 )
type ContextNodeNgramsReadNull =
ContextNodeNgramsPoly (Column (Nullable SqlInt4 ))
......@@ -57,6 +61,7 @@ type ContextNodeNgramsReadNull =
(Column (Nullable SqlInt4 ))
(Column (Nullable SqlInt4 ))
(Column (Nullable SqlFloat8))
(Column (Nullable SqlInt4 ))
$(makeAdaptorAndInstance "pContextNodeNgrams" ''ContextNodeNgramsPoly)
makeLenses ''ContextNodeNgramsPoly
......@@ -70,5 +75,6 @@ contextNodeNgramsTable = Table "context_node_ngrams"
, _cnng_ngrams_id = requiredTableField "ngrams_id"
, _cnng_ngramsType = requiredTableField "ngrams_type"
, _cnng_weight = requiredTableField "weight"
, _cnng_doc_count = requiredTableField "doc_count"
}
)
{-|
Module : Gargantext.Database.Schema.NodeNgrams
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -86,7 +86,7 @@ type NgramsClass = Int
type NgramsText = Text
-- Example of list Ngrams
-- type ListNgrams = NodeNgramsPoly (Maybe Int) ListType Text
-- type ListNgrams = NodeNgramsPoly (Maybe Int) ListType Text
type NodeNgramsW =
NodeNgramsPoly (Maybe NodeNgramsId) NodeId ListType NgramsText
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment