Commit 3b163685 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[ngrams] first version of ngrams counting in docs

parent 0d6a5eaa
Pipeline #3329 passed with stage
in 91 minutes and 4 seconds
...@@ -196,6 +196,8 @@ To build documentation, run: ...@@ -196,6 +196,8 @@ To build documentation, run:
stack --docker build --haddock --no-haddock-deps --fast stack --docker build --haddock --no-haddock-deps --fast
``` ```
(in `.stack-work/dist/x86_64-linux-nix/Cabal-3.2.1.0/doc/html/gargantext`).
## GraphQL ## GraphQL
Some introspection information. Some introspection information.
......
...@@ -33,7 +33,6 @@ import Gargantext.Database.Admin.Trigger.Init ...@@ -33,7 +33,6 @@ import Gargantext.Database.Admin.Trigger.Init
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataCorpus) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataCorpus)
import Gargantext.Database.Admin.Types.Node (NodeType(NodeDocument, NodeContact)) import Gargantext.Database.Admin.Types.Node (NodeType(NodeDocument, NodeContact))
import Gargantext.Database.Prelude (Cmd'', Cmd, execPGSQuery) import Gargantext.Database.Prelude (Cmd'', Cmd, execPGSQuery)
import Gargantext.Database.Query.Table.Node (getOrMkList)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Prelude.Config (GargConfig(..), readConfig) import Gargantext.Prelude.Config (GargConfig(..), readConfig)
import Prelude (getLine) import Prelude (getLine)
...@@ -67,7 +66,7 @@ main = do ...@@ -67,7 +66,7 @@ main = do
_ok <- getLine _ok <- getLine
cfg <- readConfig iniPath cfg <- readConfig iniPath
let secret = _gc_secretkey cfg let _secret = _gc_secretkey cfg
withDevEnv iniPath $ \env -> do withDevEnv iniPath $ \env -> do
-- First upgrade the Database Schema -- First upgrade the Database Schema
......
...@@ -24,7 +24,7 @@ services: ...@@ -24,7 +24,7 @@ services:
network_mode: host network_mode: host
#command: ["postgres", "-c", "log_statement=all"] #command: ["postgres", "-c", "log_statement=all"]
#ports: #ports:
#- 5432:5432 # - 5432:5432
environment: environment:
POSTGRES_USER: gargantua POSTGRES_USER: gargantua
POSTGRES_PASSWORD: C8kdcUrAQy66U POSTGRES_PASSWORD: C8kdcUrAQy66U
......
...@@ -164,6 +164,7 @@ CREATE TABLE public.context_node_ngrams ( ...@@ -164,6 +164,7 @@ CREATE TABLE public.context_node_ngrams (
ngrams_id INTEGER NOT NULL REFERENCES public.ngrams (id) ON DELETE CASCADE, ngrams_id INTEGER NOT NULL REFERENCES public.ngrams (id) ON DELETE CASCADE,
ngrams_type INTEGER , ngrams_type INTEGER ,
weight double precision, weight double precision,
doc_count INTEGER,
PRIMARY KEY (context_id, node_id, ngrams_id, ngrams_type) PRIMARY KEY (context_id, node_id, ngrams_id, ngrams_type)
); );
ALTER TABLE public.context_node_ngrams OWNER TO gargantua; ALTER TABLE public.context_node_ngrams OWNER TO gargantua;
......
ALTER TABLE context_node_ngrams
ADD COLUMN doc_count INTEGER;
cabal-version: 1.12 cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.0. -- This file has been generated from package.yaml by hpack version 0.34.7.
-- --
-- see: https://github.com/sol/hpack -- see: https://github.com/sol/hpack
...@@ -50,6 +50,7 @@ library ...@@ -50,6 +50,7 @@ library
Gargantext.Core.Types Gargantext.Core.Types
Gargantext.Core.Types.Individu Gargantext.Core.Types.Individu
Gargantext.Core.Types.Main Gargantext.Core.Types.Main
Gargantext.Core.Utils
Gargantext.Core.Utils.Prefix Gargantext.Core.Utils.Prefix
Gargantext.Utils.Jobs Gargantext.Utils.Jobs
Gargantext.Utils.Jobs.API Gargantext.Utils.Jobs.API
...@@ -87,6 +88,7 @@ library ...@@ -87,6 +88,7 @@ library
Gargantext.Core.Text.Prepare Gargantext.Core.Text.Prepare
Gargantext.Core.Text.Search Gargantext.Core.Text.Search
Gargantext.Core.Text.Terms Gargantext.Core.Text.Terms
Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono Gargantext.Core.Text.Terms.Mono
Gargantext.Core.Text.Terms.Multi.Lang.En Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr Gargantext.Core.Text.Terms.Multi.Lang.Fr
...@@ -211,7 +213,6 @@ library ...@@ -211,7 +213,6 @@ library
Gargantext.Core.Text.Samples.EN Gargantext.Core.Text.Samples.EN
Gargantext.Core.Text.Samples.FR Gargantext.Core.Text.Samples.FR
Gargantext.Core.Text.Samples.SP Gargantext.Core.Text.Samples.SP
Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono.Stem Gargantext.Core.Text.Terms.Mono.Stem
Gargantext.Core.Text.Terms.Mono.Stem.En Gargantext.Core.Text.Terms.Mono.Stem.En
Gargantext.Core.Text.Terms.Mono.Token Gargantext.Core.Text.Terms.Mono.Token
...@@ -221,7 +222,6 @@ library ...@@ -221,7 +222,6 @@ library
Gargantext.Core.Text.Terms.Multi.PosTagging Gargantext.Core.Text.Terms.Multi.PosTagging
Gargantext.Core.Text.Terms.Multi.PosTagging.Types Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Types.Phylo Gargantext.Core.Types.Phylo
Gargantext.Core.Utils
Gargantext.Core.Utils.DateUtils Gargantext.Core.Utils.DateUtils
Gargantext.Core.Viz Gargantext.Core.Viz
Gargantext.Core.Viz.Chart Gargantext.Core.Viz.Chart
...@@ -809,6 +809,7 @@ test-suite garg-test ...@@ -809,6 +809,7 @@ test-suite garg-test
Core.Text Core.Text
Core.Text.Examples Core.Text.Examples
Core.Text.Flow Core.Text.Flow
Core.Utils
Graph.Clustering Graph.Clustering
Graph.Distance Graph.Distance
Ngrams.Lang Ngrams.Lang
......
...@@ -74,6 +74,7 @@ library: ...@@ -74,6 +74,7 @@ library:
- Gargantext.Core.Types - Gargantext.Core.Types
- Gargantext.Core.Types.Individu - Gargantext.Core.Types.Individu
- Gargantext.Core.Types.Main - Gargantext.Core.Types.Main
- Gargantext.Core.Utils
- Gargantext.Core.Utils.Prefix - Gargantext.Core.Utils.Prefix
- Gargantext.Utils.Jobs - Gargantext.Utils.Jobs
- Gargantext.Utils.Jobs.API - Gargantext.Utils.Jobs.API
...@@ -111,6 +112,7 @@ library: ...@@ -111,6 +112,7 @@ library:
- Gargantext.Core.Text.Prepare - Gargantext.Core.Text.Prepare
- Gargantext.Core.Text.Search - Gargantext.Core.Text.Search
- Gargantext.Core.Text.Terms - Gargantext.Core.Text.Terms
- Gargantext.Core.Text.Terms.Eleve
- Gargantext.Core.Text.Terms.Mono - Gargantext.Core.Text.Terms.Mono
- Gargantext.Core.Text.Terms.Multi.Lang.En - Gargantext.Core.Text.Terms.Multi.Lang.En
- Gargantext.Core.Text.Terms.Multi.Lang.Fr - Gargantext.Core.Text.Terms.Multi.Lang.Fr
......
{-| {-|
Module : Graph.Clustering Module : Graph.Clustering
Description : Basic tests to avoid quick regression Description : Basic tests to avoid quick regression
......
{-|
Module : Core.Utils
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Core.Utils where
import Test.Hspec
import Gargantext.Prelude
import Gargantext.Core.Utils
-- | Core.Utils tests
test :: IO ()
test = hspec $ do
describe "check if groupWithCounts works" $ do
it "simple integer array" $ do
(groupWithCounts [1, 2, 3, 1, 2, 3]) `shouldBe` [(1, 2), (2, 2), (3, 2)]
it "string" $ do
(groupWithCounts "abccba") `shouldBe` [('a', 2), ('b', 2), ('c', 2)]
{-| {-|
Module : Graph.Clustering Module : Graph.Clustering
Description : Basic tests to avoid quick regression Description : Basic tests to avoid quick regression
...@@ -40,4 +39,3 @@ test = hspec $ do ...@@ -40,4 +39,3 @@ test = hspec $ do
let let
result = List.length partitions > 1 result = List.length partitions > 1
shouldBe True result shouldBe True result
...@@ -11,6 +11,8 @@ Portability : POSIX ...@@ -11,6 +11,8 @@ Portability : POSIX
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import qualified Core.Utils as Utils
--import qualified Ngrams.Lang.Fr as Fr --import qualified Ngrams.Lang.Fr as Fr
--import qualified Ngrams.Lang as Lang --import qualified Ngrams.Lang as Lang
import qualified Ngrams.Lang.Occurrences as Occ import qualified Ngrams.Lang.Occurrences as Occ
...@@ -22,6 +24,7 @@ import qualified Utils.Crypto as Crypto ...@@ -22,6 +24,7 @@ import qualified Utils.Crypto as Crypto
main :: IO () main :: IO ()
main = do main = do
Utils.test
-- Occ.parsersTest -- Occ.parsersTest
-- Lang.ngramsExtractionTest FR -- Lang.ngramsExtractionTest FR
-- Lang.ngramsExtractionTest EN -- Lang.ngramsExtractionTest EN
......
{-| {-|
Module : Utils.Crypto Module : Utils.Crypto
Description : Description :
...@@ -43,4 +42,3 @@ test = hspec $ do ...@@ -43,4 +42,3 @@ test = hspec $ do
let hash2 = hash (["b","a"] :: [Text]) let hash2 = hash (["b","a"] :: [Text])
it "compare" $ do it "compare" $ do
hash1 `shouldBe` hash2 hash1 `shouldBe` hash2
...@@ -184,9 +184,9 @@ saveNodeStory :: ( MonadReader env m, MonadBase IO m, HasNodeStorySaver env ) ...@@ -184,9 +184,9 @@ saveNodeStory :: ( MonadReader env m, MonadBase IO m, HasNodeStorySaver env )
saveNodeStory = do saveNodeStory = do
saver <- view hasNodeStorySaver saver <- view hasNodeStorySaver
liftBase $ do liftBase $ do
Gargantext.Prelude.putStrLn "---- Running node story saver ----" --Gargantext.Prelude.putStrLn "---- Running node story saver ----"
saver saver
Gargantext.Prelude.putStrLn "---- Node story saver finished ----" --Gargantext.Prelude.putStrLn "---- Node story saver finished ----"
saveNodeStoryImmediate :: ( MonadReader env m, MonadBase IO m, HasNodeStoryImmediateSaver env ) saveNodeStoryImmediate :: ( MonadReader env m, MonadBase IO m, HasNodeStoryImmediateSaver env )
...@@ -194,9 +194,9 @@ saveNodeStoryImmediate :: ( MonadReader env m, MonadBase IO m, HasNodeStoryImmed ...@@ -194,9 +194,9 @@ saveNodeStoryImmediate :: ( MonadReader env m, MonadBase IO m, HasNodeStoryImmed
saveNodeStoryImmediate = do saveNodeStoryImmediate = do
saver <- view hasNodeStoryImmediateSaver saver <- view hasNodeStoryImmediateSaver
liftBase $ do liftBase $ do
Gargantext.Prelude.putStrLn "---- Running node story immediate saver ----" --Gargantext.Prelude.putStrLn "---- Running node story immediate saver ----"
saver saver
Gargantext.Prelude.putStrLn "---- Node story immediate saver finished ----" --Gargantext.Prelude.putStrLn "---- Node story immediate saver finished ----"
listTypeConflictResolution :: ListType -> ListType -> ListType listTypeConflictResolution :: ListType -> ListType -> ListType
......
...@@ -159,7 +159,7 @@ reIndexWith cId lId nt lts = do ...@@ -159,7 +159,7 @@ reIndexWith cId lId nt lts = do
-- TODO Tests here -- TODO Tests here
let let
ngramsByDoc = map (HashMap.fromList) ngramsByDoc = map (HashMap.fromList)
$ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v))) $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
$ map (\doc -> List.zip $ map (\doc -> List.zip
(termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans) (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
$ Text.unlines $ catMaybes $ Text.unlines $ catMaybes
......
...@@ -505,22 +505,22 @@ updateNodeStory c nodeId@(NodeId _nId) currentArchive newArchive = do ...@@ -505,22 +505,22 @@ updateNodeStory c nodeId@(NodeId _nId) currentArchive newArchive = do
--printDebug "[updateNodeStory] updates" $ Text.unlines $ (Text.pack . show) <$> updates --printDebug "[updateNodeStory] updates" $ Text.unlines $ (Text.pack . show) <$> updates
-- 2. Perform inserts/deletes/updates -- 2. Perform inserts/deletes/updates
printDebug "[updateNodeStory] applying insert" () --printDebug "[updateNodeStory] applying insert" ()
insertArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version insertArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
, _a_history = [] , _a_history = []
, _a_state = archiveStateFromList inserts } , _a_state = archiveStateFromList inserts }
printDebug "[updateNodeStory] insert applied" () --printDebug "[updateNodeStory] insert applied" ()
--TODO Use currentArchive ^. a_version in delete and report error --TODO Use currentArchive ^. a_version in delete and report error
-- if entries with (node_id, ngrams_type_id, ngrams_id) but -- if entries with (node_id, ngrams_type_id, ngrams_id) but
-- different version are found. -- different version are found.
deleteArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version deleteArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
, _a_history = [] , _a_history = []
, _a_state = archiveStateFromList deletes } , _a_state = archiveStateFromList deletes }
printDebug "[updateNodeStory] delete applied" () --printDebug "[updateNodeStory] delete applied" ()
updateArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version updateArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
, _a_history = [] , _a_history = []
, _a_state = archiveStateFromList updates } , _a_state = archiveStateFromList updates }
printDebug "[updateNodeStory] update applied" () --printDebug "[updateNodeStory] update applied" ()
pure () pure ()
-- where -- where
......
...@@ -28,8 +28,9 @@ compute graph ...@@ -28,8 +28,9 @@ compute graph
-} -}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE ConstrainedClassMethods #-} {-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.Terms module Gargantext.Core.Text.Terms
where where
...@@ -47,6 +48,7 @@ import qualified Data.Set as Set ...@@ -47,6 +48,7 @@ import qualified Data.Set as Set
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Data.HashMap.Strict as HashMap import qualified Data.HashMap.Strict as HashMap
import Gargantext.Core import Gargantext.Core
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Core.Text (sentences, HasText(..)) import Gargantext.Core.Text (sentences, HasText(..))
import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken) import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Core.Text.Terms.Mono (monoTerms) import Gargantext.Core.Text.Terms.Mono (monoTerms)
...@@ -70,6 +72,7 @@ data TermType lang ...@@ -70,6 +72,7 @@ data TermType lang
, _tt_model :: !(Maybe (Tries Token ())) , _tt_model :: !(Maybe (Tries Token ()))
} }
deriving (Generic) deriving (Generic)
deriving instance (Show lang) => Show (TermType lang)
makeLenses ''TermType makeLenses ''TermType
--group :: [Text] -> [Text] --group :: [Text] -> [Text]
...@@ -78,16 +81,14 @@ makeLenses ''TermType ...@@ -78,16 +81,14 @@ makeLenses ''TermType
-- remove Stop Words -- remove Stop Words
-- map (filter (\t -> not . elem t)) $ -- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Sugar to extract terms from text (hiddeng mapM from end user). -- | Sugar to extract terms from text (hidding 'mapM' from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms]) --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]] extractTerms :: TermType Lang -> [Text] -> IO [[TermsWithCount]]
extractTerms (Unsupervised {..}) xs = mapM (terms (Unsupervised { _tt_model = Just m', .. })) xs extractTerms (Unsupervised {..}) xs = mapM (terms (Unsupervised { _tt_model = Just m', .. })) xs
where where
m' = case _tt_model of m' = case _tt_model of
Just m''-> m'' Just m''-> m''
Nothing -> newTries _tt_windowSize (Text.intercalate " " xs) Nothing -> newTries _tt_windowSize (Text.intercalate " " xs)
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
...@@ -116,12 +117,13 @@ data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams } ...@@ -116,12 +117,13 @@ data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
instance Hashable ExtractedNgrams instance Hashable ExtractedNgrams
-- | A typeclass that represents extracting ngrams from an entity.
class ExtractNgramsT h class ExtractNgramsT h
where where
extractNgramsT :: HasText h extractNgramsT :: HasText h
=> TermType Lang => TermType Lang
-> h -> h
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int)) -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
------------------------------------------------------------------------ ------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms l pa po (Terms ng1 ng2) = enrichedTerms l pa po (Terms ng1 ng2) =
...@@ -163,43 +165,48 @@ isSimpleNgrams (SimpleNgrams _) = True ...@@ -163,43 +165,48 @@ isSimpleNgrams (SimpleNgrams _) = True
isSimpleNgrams _ = False isSimpleNgrams _ = False
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Terms from Text -- | Terms from 'Text'
-- Mono : mono terms -- 'Mono' : mono terms
-- Multi : multi terms -- 'Multi' : multi terms
-- MonoMulti : mono and multi -- 'MonoMulti' : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet) -- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType Lang -> Text -> IO [Terms] terms :: TermType Lang -> Text -> IO [TermsWithCount]
terms (Mono lang) txt = pure $ monoTerms lang txt terms tt txt = do
terms (Multi lang) txt = multiterms lang txt printDebug "[terms] tt" tt
terms (MonoMulti lang) txt = terms (Multi lang) txt printDebug "[terms] txt" txt
terms (Unsupervised { .. }) txt = termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt out <- termsNoLog tt txt
printDebug "[terms] out" out
pure out
termsNoLog :: TermType Lang -> Text -> IO [TermsWithCount]
termsNoLog (Mono lang) txt = pure $ monoTerms lang txt
termsNoLog (Multi lang) txt = multiterms lang txt
termsNoLog (MonoMulti lang) txt = terms (Multi lang) txt
termsNoLog (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
where where
m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: remove IO
-- TODO: newtype BlockText
type WindowSize = Int type WindowSize = Int
type MinNgramSize = Int type MinNgramSize = Int
termsUnsupervised :: TermType Lang -> Text -> IO [Terms] -- | Unsupervised ngrams extraction
termsUnsupervised (Unsupervised l n s m) = -- language agnostic extraction
pure -- TODO: newtype BlockText
. map (text2term l) termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
. List.nub termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panic "[termsUnsupervised] no model"
. (List.filter (\l' -> List.length l' >= s)) termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) =
map (\(t, cnt) -> (text2term _tt_lang t, cnt))
. groupWithCounts
-- . List.nub
. (List.filter (\l' -> List.length l' >= _tt_windowSize))
. List.concat . List.concat
. mainEleveWith (maybe (panic "no model") identity m) n . mainEleveWith _tt_model _tt_ngramsSize
. uniText . uniText
termsUnsupervised _ = undefined termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token () newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t) newTries n t = buildTries n (fmap toToken $ uniText t)
...@@ -217,5 +224,3 @@ text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt) ...@@ -217,5 +224,3 @@ text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
isPunctuation :: Text -> Bool isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure) isPunctuation x = List.elem x $ (Text.pack . pure)
<$> ("!?(),;.:" :: String) <$> ("!?(),;.:" :: String)
...@@ -32,6 +32,7 @@ Notes for current implementation: ...@@ -32,6 +32,7 @@ Notes for current implementation:
-} -}
{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-} {-# LANGUAGE TypeFamilies #-}
...@@ -278,6 +279,8 @@ data Tries k e = Tries ...@@ -278,6 +279,8 @@ data Tries k e = Tries
makeLenses ''Tries makeLenses ''Tries
deriving instance (Show k, Show e) => Show (Tries k e)
buildTries :: Int -> [[Token]] -> Tries Token () buildTries :: Int -> [[Token]] -> Tries Token ()
buildTries n sentences = Tries buildTries n sentences = Tries
{ _fwd = buildTrie Forward n sentences { _fwd = buildTrie Forward n sentences
......
...@@ -40,8 +40,8 @@ words = monoTexts ...@@ -40,8 +40,8 @@ words = monoTexts
isSep :: Char -> Bool isSep :: Char -> Bool
isSep = (`elem` (",.:;?!(){}[]\"\'" :: String)) isSep = (`elem` (",.:;?!(){}[]\"\'" :: String))
monoTerms :: Lang -> Text -> [Terms] monoTerms :: Lang -> Text -> [TermsWithCount]
monoTerms l txt = map (monoText2term l) $ monoTexts txt monoTerms l txt = map (\t -> (monoText2term l t, 1)) $ monoTexts txt
monoTexts :: Text -> [Text] monoTexts :: Text -> [Text]
monoTexts = L.concat . monoTextsBySentence monoTexts = L.concat . monoTextsBySentence
......
...@@ -21,6 +21,7 @@ import Data.List (concat) ...@@ -21,6 +21,7 @@ import Data.List (concat)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Core.Text.Terms.Multi.PosTagging import Gargantext.Core.Text.Terms.Multi.PosTagging
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
...@@ -37,14 +38,16 @@ import qualified Gargantext.Utils.SpacyNLP as SpacyNLP ...@@ -37,14 +38,16 @@ import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
type NLP_API = Lang -> Text -> IO PosSentences type NLP_API = Lang -> Text -> IO PosSentences
------------------------------------------------------------------- -------------------------------------------------------------------
multiterms :: Lang -> Text -> IO [Terms] multiterms :: Lang -> Text -> IO [TermsWithCount]
multiterms = multiterms' tokenTag2terms multiterms l txt = do
ret <- multiterms' tokenTag2terms l txt
pure $ groupWithCounts ret
where where
multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a] multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
multiterms' f lang txt = concat multiterms' f lang txt' = concat
<$> map (map f) <$> map (map f)
<$> map (filter (\t -> _my_token_pos t == Just NP)) <$> map (filter (\t -> _my_token_pos t == Just NP))
<$> tokenTags lang txt <$> tokenTags lang txt'
------------------------------------------------------------------- -------------------------------------------------------------------
tokenTag2terms :: TokenTag -> Terms tokenTag2terms :: TokenTag -> Terms
......
...@@ -30,4 +30,3 @@ group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y) ...@@ -30,4 +30,3 @@ group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x] group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z)) group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = [] group2 _ _ [] = []
...@@ -40,4 +40,3 @@ groupTokens ntags = group2 NP NP ...@@ -40,4 +40,3 @@ groupTokens ntags = group2 NP NP
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs) --groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
-- --
--groupNgrams (x:xs) = (x:(groupNgrams xs)) --groupNgrams (x:xs) = (x:(groupNgrams xs))
...@@ -45,16 +45,19 @@ tokens2tokensTags :: [Token] -> [TokenTag] ...@@ -45,16 +45,19 @@ tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags ts = filter' $ map tokenTag ts tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------ ------------------------------------------------------------------------
tokenTag :: Token -> TokenTag tokenTag :: Token -> TokenTag
tokenTag (Token _ w _ l _ _ p n _ _) = TokenTag w' l' p n tokenTag (Token { .. }) = TokenTag { _my_token_word = w'
, _my_token_lemma = l'
, _my_token_pos = _tokenPos
, _my_token_ner = _tokenNer }
where where
w' = split w w' = split _tokenWord
l' = fromList (split l) l' = fromList (split _tokenLemma)
split = splitOn (pack " ") . toLower split = splitOn (pack " ") . toLower
filter' :: [TokenTag] -> [TokenTag] filter' :: [TokenTag] -> [TokenTag]
filter' xs = filter isNgrams xs filter' xs = filter isNgrams xs
where where
isNgrams (TokenTag _ _ p n) = isJust p || isJust n isNgrams (TokenTag { .. }) = isJust _my_token_pos || isJust _my_token_ner
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- request = -- request =
...@@ -80,6 +83,7 @@ corenlp' lang txt = do ...@@ -80,6 +83,7 @@ corenlp' lang txt = do
FR -> "{\"annotators\": \"tokenize,ssplit,pos,lemma,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}" FR -> "{\"annotators\": \"tokenize,ssplit,pos,lemma,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
_ -> panic $ pack "not implemented yet" _ -> panic $ pack "not implemented yet"
url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
let request = setRequestBodyLBS (cs txt) url let request = setRequestBodyLBS (cs txt) url
httpJSON request httpJSON request
......
...@@ -52,4 +52,3 @@ data PosSentences = PosSentences { _sentences :: [Sentence]} ...@@ -52,4 +52,3 @@ data PosSentences = PosSentences { _sentences :: [Sentence]}
deriving (Show, Generic) deriving (Show, Generic)
$(deriveJSON (unPrefix "_") ''PosSentences) $(deriveJSON (unPrefix "_") ''PosSentences)
...@@ -21,6 +21,8 @@ import Data.Text (Text, concat, unwords) ...@@ -21,6 +21,8 @@ import Data.Text (Text, concat, unwords)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Context import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence) import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (groupWithCounts)
import Prelude (error) import Prelude (error)
import qualified Data.Algorithms.KMP as KMP import qualified Data.Algorithms.KMP as KMP
import qualified Data.IntMap.Strict as IntMap import qualified Data.IntMap.Strict as IntMap
...@@ -72,8 +74,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern ...@@ -72,8 +74,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
-- Utils -- Utils
type BlockText = Text type BlockText = Text
type MatchedText = Text type MatchedText = Text
termsInText :: Patterns -> BlockText -> [MatchedText] termsInText :: Patterns -> BlockText -> [(MatchedText, TermsCount)]
termsInText pats txt = List.nub termsInText pats txt = groupWithCounts
$ List.concat $ List.concat
$ map (map unwords) $ map (map unwords)
$ extractTermsWithList pats txt $ extractTermsWithList pats txt
......
...@@ -17,7 +17,7 @@ commentary with @some markup@. ...@@ -17,7 +17,7 @@ commentary with @some markup@.
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Database.Admin.Types.Node , module Gargantext.Database.Admin.Types.Node
, DebugMode(..), withDebugMode , DebugMode(..), withDebugMode
, Term, Terms(..) , Term, Terms(..), TermsCount, TermsWithCount
, TokenTag(..), POS(..), NER(..) , TokenTag(..), POS(..), NER(..)
, Label, Stems , Label, Stems
, HasInvalidError(..), assertValid , HasInvalidError(..), assertValid
...@@ -70,10 +70,13 @@ type Label = [Text] ...@@ -70,10 +70,13 @@ type Label = [Text]
data Terms = Terms { _terms_label :: Label data Terms = Terms { _terms_label :: Label
, _terms_stem :: Stems , _terms_stem :: Stems
} deriving (Ord, Show) } deriving (Ord, Show)
instance Eq Terms where instance Eq Terms where
(==) (Terms _ s1) (Terms _ s2) = s1 == s2 (==) (Terms _ s1) (Terms _ s2) = s1 == s2
type TermsCount = Int
type TermsWithCount = (Terms, TermsCount)
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Tag = POS | NER data Tag = POS | NER
deriving (Show, Eq) deriving (Show, Eq)
...@@ -208,5 +211,3 @@ data TODO = TODO ...@@ -208,5 +211,3 @@ data TODO = TODO
instance ToSchema TODO where instance ToSchema TODO where
instance ToParamSchema TODO where instance ToParamSchema TODO where
---------------------------------------------------------------------------- ----------------------------------------------------------------------------
...@@ -19,9 +19,12 @@ module Gargantext.Core.Utils ( ...@@ -19,9 +19,12 @@ module Gargantext.Core.Utils (
, alphanum , alphanum
, choices , choices
, randomString , randomString
, groupWithCounts
, addTuples
) where ) where
import Data.Char (chr, ord) import Data.Char (chr, ord)
import qualified Data.List as List
import Data.Maybe import Data.Maybe
import Data.Monoid import Data.Monoid
import Data.Text (Text, pack) import Data.Text (Text, pack)
...@@ -57,3 +60,17 @@ randomString :: Int -> IO Text ...@@ -57,3 +60,17 @@ randomString :: Int -> IO Text
randomString num = do randomString num = do
str <- choices num alphanum str <- choices num alphanum
pure $ pack str pure $ pack str
-- | Given a list of items of type 'a', return list with unique items
-- (like List.nub) but tuple-d with their counts in the original list
groupWithCounts :: (Ord a, Eq a) => [a] -> [(a, Int)]
groupWithCounts = map f
. List.group
. List.sort
where
f [] = panic "[groupWithCounts] impossible"
f ts@(t:_) = (t, length ts)
addTuples :: (Num a, Num b) => (a, b) -> (a, b) -> (a, b)
addTuples (a1, b1) (a2, b2) = (a1 + a2, b1 + b2)
...@@ -65,7 +65,7 @@ flowPhylo cId = do ...@@ -65,7 +65,7 @@ flowPhylo cId = do
patterns = buildPatterns termList patterns = buildPatterns termList
-- | To filter the Ngrams of a document based on the termList -- | To filter the Ngrams of a document based on the termList
filterTerms :: Patterns -> (Date, Text) -> (Date, [Text]) filterTerms :: Patterns -> (Date, Text) -> (Date, [Text])
filterTerms patterns' (y,d) = (y,termsInText patterns' d) filterTerms patterns' (y,d) = (y, fst <$> termsInText patterns' d)
docs = map ((\(y,t) -> Document y t) . filterTerms patterns) docs' docs = map ((\(y,t) -> Document y t) . filterTerms patterns) docs'
...@@ -123,4 +123,3 @@ writePhylo _fp _phview = undefined ...@@ -123,4 +123,3 @@ writePhylo _fp _phview = undefined
-- refactor 2021 -- refactor 2021
-- viewPhylo2Svg :: PhyloView -> IO DB.ByteString -- viewPhylo2Svg :: PhyloView -> IO DB.ByteString
-- viewPhylo2Svg p = graphvizWithHandle Dot (viewToDot p) Svg DB.hGetContents -- viewPhylo2Svg p = graphvizWithHandle Dot (viewToDot p) Svg DB.hGetContents
...@@ -48,7 +48,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list) ...@@ -48,7 +48,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
where where
import Conduit import Conduit
import Control.Lens ((^.), view, _Just, makeLenses) import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
import Data.Aeson.TH (deriveJSON) import Data.Aeson.TH (deriveJSON)
import Data.Conduit.Internal (zipSources) import Data.Conduit.Internal (zipSources)
import Data.Either import Data.Either
...@@ -60,7 +60,6 @@ import Data.Maybe (catMaybes) ...@@ -60,7 +60,6 @@ import Data.Maybe (catMaybes)
import Data.Monoid import Data.Monoid
import Data.Swagger import Data.Swagger
import qualified Data.Text as T import qualified Data.Text as T
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second) import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Servant.Client (ClientError) import Servant.Client (ClientError)
...@@ -83,9 +82,10 @@ import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(. ...@@ -83,9 +82,10 @@ import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(.
import Gargantext.Core.Text.List.Social (FlowSocialListWith) import Gargantext.Core.Text.List.Social (FlowSocialListWith)
import Gargantext.Core.Text.Terms import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt) import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (POS(NP)) import Gargantext.Core.Types (POS(NP), TermsCount)
import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main import Gargantext.Core.Types.Main
import Gargantext.Core.Utils (addTuples)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types import Gargantext.Database.Action.Flow.Types
...@@ -357,25 +357,27 @@ insertMasterDocs c lang hs = do ...@@ -357,25 +357,27 @@ insertMasterDocs c lang hs = do
-- this will enable global database monitoring -- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int)) -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int)) mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
<- mapNodeIdNgrams <- mapNodeIdNgrams
<$> documentIdWithNgrams <$> documentIdWithNgrams
(extractNgramsT $ withLang lang documentsWithId) (extractNgramsT $ withLang lang documentsWithId)
documentsWithId documentsWithId
lId <- getOrMkList masterCorpusId masterUserId lId <- getOrMkList masterCorpusId masterUserId
-- _ <- saveDocNgramsWith lId mapNgramsDocs'
_ <- saveDocNgramsWith lId mapNgramsDocs' _ <- saveDocNgramsWith lId mapNgramsDocs'
-- _cooc <- insertDefaultNode NodeListCooc lId masterUserId -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
pure ids' pure ids'
saveDocNgramsWith :: ( FlowCmdM env err m) saveDocNgramsWith :: (FlowCmdM env err m)
=> ListId => ListId
-> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int)) -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
-> m () -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do saveDocNgramsWith lId mapNgramsDocs' = do
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs' printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
--printDebug "terms2id" terms2id let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount
let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs' let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
...@@ -392,7 +394,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do ...@@ -392,7 +394,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
<*> Just (fromIntegral w :: Double) <*> Just (fromIntegral w :: Double)
| (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
, (nId, w) <- Map.toList mapNodeIdWeight , (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight
] ]
-- to be removed -- to be removed
...@@ -451,7 +453,7 @@ mergeData rs = catMaybes . map toDocumentWithId . Map.toList ...@@ -451,7 +453,7 @@ mergeData rs = catMaybes . map toDocumentWithId . Map.toList
------------------------------------------------------------------------ ------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err documentIdWithNgrams :: HasNodeError err
=> (a => (a
-> Cmd err (HashMap b (Map NgramsType Int))) -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
-> [Indexed NodeId a] -> [Indexed NodeId a]
-> Cmd err [DocumentIdWithNgrams a b] -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams documentIdWithNgrams f = traverse toDocumentIdWithNgrams
...@@ -466,13 +468,17 @@ mapNodeIdNgrams :: (Ord b, Hashable b) ...@@ -466,13 +468,17 @@ mapNodeIdNgrams :: (Ord b, Hashable b)
=> [DocumentIdWithNgrams a b] => [DocumentIdWithNgrams a b]
-> HashMap b -> HashMap b
(Map NgramsType (Map NgramsType
(Map NodeId Int) (Map NodeId (Int, TermsCount))
) )
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
where where
-- | NOTE We are somehow multiplying 'TermsCount' here: If the
-- same ngrams term has different ngrams types, the 'TermsCount'
-- for it (which is the number of times the terms appears in a
-- document) is copied over to all its types.
f :: DocumentIdWithNgrams a b f :: DocumentIdWithNgrams a b
-> HashMap b (Map NgramsType (Map NodeId Int)) -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
where where
nId = _index $ documentWithId d nId = _index $ documentWithId d
...@@ -483,25 +489,25 @@ instance ExtractNgramsT HyperdataContact ...@@ -483,25 +489,25 @@ instance ExtractNgramsT HyperdataContact
extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
where where
extract :: TermType Lang -> HyperdataContact extract :: TermType Lang -> HyperdataContact
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int)) -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extract _l hc' = do extract _l hc' = do
let authors = map text2ngrams let authors = map text2ngrams
$ maybe ["Nothing"] (\a -> [a]) $ maybe ["Nothing"] (\a -> [a])
$ view (hc_who . _Just . cw_lastName) hc' $ view (hc_who . _Just . cw_lastName) hc'
pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ] pure $ HashMap.fromList $ [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
instance ExtractNgramsT HyperdataDocument instance ExtractNgramsT HyperdataDocument
where where
extractNgramsT :: TermType Lang extractNgramsT :: TermType Lang
-> HyperdataDocument -> HyperdataDocument
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int)) -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
where where
extractNgramsT' :: TermType Lang extractNgramsT' :: TermType Lang
-> HyperdataDocument -> HyperdataDocument
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int)) -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extractNgramsT' lang' doc = do extractNgramsT' lang' doc = do
let source = text2ngrams let source = text2ngrams
$ maybe "Nothing" identity $ maybe "Nothing" identity
...@@ -515,23 +521,25 @@ instance ExtractNgramsT HyperdataDocument ...@@ -515,23 +521,25 @@ instance ExtractNgramsT HyperdataDocument
$ maybe ["Nothing"] (T.splitOn ", ") $ maybe ["Nothing"] (T.splitOn ", ")
$ _hd_authors doc $ _hd_authors doc
terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP) termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
<$> concat <$> concat
<$> liftBase (extractTerms lang' $ hasText doc) <$> liftBase (extractTerms lang' $ hasText doc)
printDebug "[extractNgramsT HyperdataDocument] termsWithCounts'" termsWithCounts'
printDebug "[extractNgramsT HyperdataDocument] termsWithLargerCounts" $ filter (\(_, cnt) -> cnt > 1) termsWithCounts'
pure $ HashMap.fromList pure $ HashMap.fromList
$ [(SimpleNgrams source, Map.singleton Sources 1) ] $ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ]
<> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ] <> [(SimpleNgrams i', (Map.singleton Institutes 1, 1)) | i' <- institutes ]
<> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ] <> [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
<> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ] <> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a) instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
where where
extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h
instance HasText a => HasText (Node a) instance HasText a => HasText (Node a)
where where
hasText (Node _ _ _ _ _ _ _ h) = hasText h hasText (Node { _node_hyperdata = h }) = hasText h
......
...@@ -15,6 +15,7 @@ module Gargantext.Database.Action.Flow.Utils ...@@ -15,6 +15,7 @@ module Gargantext.Database.Action.Flow.Utils
import Data.Map (Map) import Data.Map (Map)
import Data.HashMap.Strict (HashMap) import Data.HashMap.Strict (HashMap)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Database.Admin.Types.Node import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude (Cmd) import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Query.Table.ContextNodeNgrams import Gargantext.Database.Query.Table.ContextNodeNgrams
...@@ -29,24 +30,24 @@ import qualified Data.HashMap.Strict as HashMap ...@@ -29,24 +30,24 @@ import qualified Data.HashMap.Strict as HashMap
data DocumentIdWithNgrams a b = data DocumentIdWithNgrams a b =
DocumentIdWithNgrams DocumentIdWithNgrams
{ documentWithId :: Indexed NodeId a { documentWithId :: Indexed NodeId a
, documentNgrams :: HashMap b (Map NgramsType Int) , documentNgrams :: HashMap b (Map NgramsType Int, TermsCount)
} deriving (Show) } deriving (Show)
insertDocNgrams :: ListId insertDocNgrams :: ListId
-> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId Int)) -> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId (Int, TermsCount)))
-> Cmd err Int -> Cmd err Int
insertDocNgrams lId m = insertContextNodeNgrams ns insertDocNgrams lId m = do
printDebug "[insertDocNgrams] ns" ns
insertContextNodeNgrams ns
where where
ns = [ ContextNodeNgrams docId lId (ng^.index) ns = [ ContextNodeNgrams docId lId (ng^.index)
(ngramsTypeId t) (ngramsTypeId t)
(fromIntegral i) (fromIntegral i)
cnt
| (ng, t2n2i) <- HashMap.toList m | (ng, t2n2i) <- HashMap.toList m
, (t, n2i) <- DM.toList t2n2i , (t, n2i) <- DM.toList t2n2i
, (docId, i) <- DM.toList n2i , (docId, (i, cnt)) <- DM.toList n2i
] ]
-- [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})]
-- {Ngrams: {NgramsType: {NodeId: (Int, TermsCount)}}}
...@@ -36,12 +36,13 @@ queryContextNodeNgramsTable = selectTable contextNodeNgramsTable ...@@ -36,12 +36,13 @@ queryContextNodeNgramsTable = selectTable contextNodeNgramsTable
-- | Insert utils -- | Insert utils
insertContextNodeNgrams :: [ContextNodeNgrams] -> Cmd err Int insertContextNodeNgrams :: [ContextNodeNgrams] -> Cmd err Int
insertContextNodeNgrams = insertContextNodeNgramsW insertContextNodeNgrams = insertContextNodeNgramsW
. map (\(ContextNodeNgrams c n ng nt w) -> . map (\(ContextNodeNgrams c n ng nt w dc) ->
ContextNodeNgrams (pgContextId c) ContextNodeNgrams (pgContextId c)
(pgNodeId n) (pgNodeId n)
(sqlInt4 ng) (sqlInt4 ng)
(pgNgramsTypeId nt) (pgNgramsTypeId nt)
(sqlDouble w) (sqlDouble w)
(sqlInt4 dc)
) )
insertContextNodeNgramsW :: [ContextNodeNgramsWrite] -> Cmd err Int insertContextNodeNgramsW :: [ContextNodeNgramsWrite] -> Cmd err Int
......
...@@ -35,8 +35,9 @@ updateHyperdata i h = mkCmd $ \c -> putStrLn "before runUpdate_" >> ...@@ -35,8 +35,9 @@ updateHyperdata i h = mkCmd $ \c -> putStrLn "before runUpdate_" >>
updateHyperdataQuery :: ToJSON a => NodeId -> a -> Update Int64 updateHyperdataQuery :: ToJSON a => NodeId -> a -> Update Int64
updateHyperdataQuery i h = seq h' $ trace "updateHyperdataQuery: encoded JSON" $ Update updateHyperdataQuery i h = seq h' $ trace "updateHyperdataQuery: encoded JSON" $ Update
{ uTable = nodeTable { uTable = nodeTable
, uUpdateWith = updateEasy (\ (Node _ni _nh _nt _nu _np _nn _nd _h) , uUpdateWith = updateEasy (\ (Node { .. })
-> trace "updating mate" $ Node _ni _nh _nt _nu _np _nn _nd h' -> Node { _node_hyperdata = h', .. }
-- -> trace "updating mate" $ Node _ni _nh _nt _nu _np _nn _nd h'
) )
, uWhere = (\row -> {-trace "uWhere" $-} _node_id row .== pgNodeId i ) , uWhere = (\row -> {-trace "uWhere" $-} _node_id row .== pgNodeId i )
, uReturning = rCount , uReturning = rCount
...@@ -63,5 +64,3 @@ updateNodesWithType_ :: ( HasNodeError err ...@@ -63,5 +64,3 @@ updateNodesWithType_ :: ( HasNodeError err
updateNodesWithType_ nt h = do updateNodesWithType_ nt h = do
ns <- getNodesIdWithType nt ns <- getNodesIdWithType nt
mapM (\n -> updateHyperdata n h) ns mapM (\n -> updateHyperdata n h) ns
...@@ -19,6 +19,7 @@ module Gargantext.Database.Schema.ContextNodeNgrams ...@@ -19,6 +19,7 @@ module Gargantext.Database.Schema.ContextNodeNgrams
where where
import Prelude import Prelude
import Gargantext.Core.Types (TermsCount)
import Gargantext.Database.Schema.Prelude import Gargantext.Database.Schema.Prelude
import Gargantext.Database.Schema.Ngrams (NgramsTypeId, NgramsId) import Gargantext.Database.Schema.Ngrams (NgramsTypeId, NgramsId)
import Gargantext.Database.Admin.Types.Node import Gargantext.Database.Admin.Types.Node
...@@ -26,15 +27,16 @@ import Gargantext.Database.Admin.Types.Node ...@@ -26,15 +27,16 @@ import Gargantext.Database.Admin.Types.Node
type ContextNodeNgrams = type ContextNodeNgrams =
ContextNodeNgramsPoly ContextId ListId NgramsId NgramsTypeId Double ContextNodeNgramsPoly ContextId ListId NgramsId NgramsTypeId Double TermsCount
data ContextNodeNgramsPoly c n ngrams_id ngt w data ContextNodeNgramsPoly c n ngrams_id ngt w dc
= ContextNodeNgrams { _cnng_context_id :: !c = ContextNodeNgrams { _cnng_context_id :: !c
, _cnng_node_id :: !n , _cnng_node_id :: !n
, _cnng_ngrams_id :: !ngrams_id , _cnng_ngrams_id :: !ngrams_id
, _cnng_ngramsType :: !ngt , _cnng_ngramsType :: !ngt
, _cnng_weight :: !w , _cnng_weight :: !w
, _cnng_doc_count :: !dc
} deriving (Show) } deriving (Show)
type ContextNodeNgramsWrite = type ContextNodeNgramsWrite =
...@@ -43,6 +45,7 @@ type ContextNodeNgramsWrite = ...@@ -43,6 +45,7 @@ type ContextNodeNgramsWrite =
(Column SqlInt4 ) (Column SqlInt4 )
(Column SqlInt4 ) (Column SqlInt4 )
(Column SqlFloat8) (Column SqlFloat8)
(Column SqlInt4 )
type ContextNodeNgramsRead = type ContextNodeNgramsRead =
ContextNodeNgramsPoly (Column SqlInt4 ) ContextNodeNgramsPoly (Column SqlInt4 )
...@@ -50,6 +53,7 @@ type ContextNodeNgramsRead = ...@@ -50,6 +53,7 @@ type ContextNodeNgramsRead =
(Column SqlInt4 ) (Column SqlInt4 )
(Column SqlInt4 ) (Column SqlInt4 )
(Column SqlFloat8) (Column SqlFloat8)
(Column SqlInt4 )
type ContextNodeNgramsReadNull = type ContextNodeNgramsReadNull =
ContextNodeNgramsPoly (Column (Nullable SqlInt4 )) ContextNodeNgramsPoly (Column (Nullable SqlInt4 ))
...@@ -57,6 +61,7 @@ type ContextNodeNgramsReadNull = ...@@ -57,6 +61,7 @@ type ContextNodeNgramsReadNull =
(Column (Nullable SqlInt4 )) (Column (Nullable SqlInt4 ))
(Column (Nullable SqlInt4 )) (Column (Nullable SqlInt4 ))
(Column (Nullable SqlFloat8)) (Column (Nullable SqlFloat8))
(Column (Nullable SqlInt4 ))
$(makeAdaptorAndInstance "pContextNodeNgrams" ''ContextNodeNgramsPoly) $(makeAdaptorAndInstance "pContextNodeNgrams" ''ContextNodeNgramsPoly)
makeLenses ''ContextNodeNgramsPoly makeLenses ''ContextNodeNgramsPoly
...@@ -70,5 +75,6 @@ contextNodeNgramsTable = Table "context_node_ngrams" ...@@ -70,5 +75,6 @@ contextNodeNgramsTable = Table "context_node_ngrams"
, _cnng_ngrams_id = requiredTableField "ngrams_id" , _cnng_ngrams_id = requiredTableField "ngrams_id"
, _cnng_ngramsType = requiredTableField "ngrams_type" , _cnng_ngramsType = requiredTableField "ngrams_type"
, _cnng_weight = requiredTableField "weight" , _cnng_weight = requiredTableField "weight"
, _cnng_doc_count = requiredTableField "doc_count"
} }
) )
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment