Commit dc52c519 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'dev' into 151-dev-pubmed-api-key

parents c38275d0 93e711b1
## Version 0.0.6.8.5.1
* [BACK][FIX] Indexing issue: taking all terms instead of longest of terms in case of ngrams included in others
* [FRONT][FIX][Disconnection of instance causes a blank page (#464)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/464)
* [BACK][FIX] ArXiv search in Abstracts by default
## Version 0.0.6.8.5
* [BACK][FIX][Ngrams Table, page sort / limit (#149)](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/149)
* [FRONT][FIX][Security Issue with Teams (#452)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/452)
......@@ -9,6 +15,7 @@
## Version 0.0.6.8.3
* [INFO][UPGRADE] To upgrade use the 0.0.6.8.3.sql script
* [FRONT][SECU][Security Issue with Teams (#452)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/452)
* [FRONT][FEAT][Graph Explorer: disable controls when ForceAtlas is running (#451)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/451)
* [FRONT][FIX][[Doc date filter] Filter by multiple dates (#450)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/450)
......
......@@ -196,6 +196,8 @@ To build documentation, run:
stack --docker build --haddock --no-haddock-deps --fast
```
(in `.stack-work/dist/x86_64-linux-nix/Cabal-3.2.1.0/doc/html/gargantext`).
## GraphQL
Some introspection information.
......
......@@ -33,7 +33,6 @@ import Gargantext.Database.Admin.Trigger.Init
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataCorpus)
import Gargantext.Database.Admin.Types.Node (NodeType(NodeDocument, NodeContact))
import Gargantext.Database.Prelude (Cmd'', Cmd, execPGSQuery)
import Gargantext.Database.Query.Table.Node (getOrMkList)
import Gargantext.Prelude
import Gargantext.Prelude.Config (GargConfig(..), readConfig)
import Prelude (getLine)
......@@ -67,7 +66,7 @@ main = do
_ok <- getLine
cfg <- readConfig iniPath
let secret = _gc_secretkey cfg
let _secret = _gc_secretkey cfg
withDevEnv iniPath $ \env -> do
-- First upgrade the Database Schema
......
......@@ -21,10 +21,11 @@ services:
postgres:
#image: 'postgres:latest'
image: 'postgres:14'
shm_size: 1g # https://stackoverflow.com/a/56754077
network_mode: host
#command: ["postgres", "-c", "log_statement=all"]
#ports:
#- 5432:5432
# - 5432:5432
environment:
POSTGRES_USER: gargantua
POSTGRES_PASSWORD: C8kdcUrAQy66U
......
......@@ -164,6 +164,7 @@ CREATE TABLE public.context_node_ngrams (
ngrams_id INTEGER NOT NULL REFERENCES public.ngrams (id) ON DELETE CASCADE,
ngrams_type INTEGER ,
weight double precision,
doc_count INTEGER DEFAULT 0,
PRIMARY KEY (context_id, node_id, ngrams_id, ngrams_type)
);
ALTER TABLE public.context_node_ngrams OWNER TO gargantua;
......
ALTER TABLE context_node_ngrams
ADD COLUMN doc_count INTEGER DEFAULT 0;
......@@ -5,7 +5,7 @@ cabal-version: 1.12
-- see: https://github.com/sol/hpack
name: gargantext
version: 0.0.6.8.5
version: 0.0.6.8.5.1
synopsis: Search, map, share
description: Please see README.md
category: Data
......@@ -50,6 +50,7 @@ library
Gargantext.Core.Types
Gargantext.Core.Types.Individu
Gargantext.Core.Types.Main
Gargantext.Core.Utils
Gargantext.Core.Utils.Prefix
Gargantext.Utils.Jobs
Gargantext.Utils.Jobs.API
......@@ -87,6 +88,7 @@ library
Gargantext.Core.Text.Prepare
Gargantext.Core.Text.Search
Gargantext.Core.Text.Terms
Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono
Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr
......@@ -211,7 +213,6 @@ library
Gargantext.Core.Text.Samples.EN
Gargantext.Core.Text.Samples.FR
Gargantext.Core.Text.Samples.SP
Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono.Stem
Gargantext.Core.Text.Terms.Mono.Stem.En
Gargantext.Core.Text.Terms.Mono.Token
......@@ -221,7 +222,6 @@ library
Gargantext.Core.Text.Terms.Multi.PosTagging
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Types.Phylo
Gargantext.Core.Utils
Gargantext.Core.Utils.DateUtils
Gargantext.Core.Viz
Gargantext.Core.Viz.Chart
......@@ -809,6 +809,7 @@ test-suite garg-test
Core.Text
Core.Text.Examples
Core.Text.Flow
Core.Utils
Graph.Clustering
Graph.Distance
Ngrams.Lang
......
......@@ -6,7 +6,7 @@ name: gargantext
# | | | +----- Layers * : New versions with API additions
# | | | | +--- Layers * : New versions without API breaking changes
# | | | | |
version: '0.0.6.8.5'
version: '0.0.6.8.5.1'
synopsis: Search, map, share
description: Please see README.md
category: Data
......@@ -74,6 +74,7 @@ library:
- Gargantext.Core.Types
- Gargantext.Core.Types.Individu
- Gargantext.Core.Types.Main
- Gargantext.Core.Utils
- Gargantext.Core.Utils.Prefix
- Gargantext.Utils.Jobs
- Gargantext.Utils.Jobs.API
......@@ -111,6 +112,7 @@ library:
- Gargantext.Core.Text.Prepare
- Gargantext.Core.Text.Search
- Gargantext.Core.Text.Terms
- Gargantext.Core.Text.Terms.Eleve
- Gargantext.Core.Text.Terms.Mono
- Gargantext.Core.Text.Terms.Multi.Lang.En
- Gargantext.Core.Text.Terms.Multi.Lang.Fr
......
{-|
Module : Graph.Clustering
Description : Basic tests to avoid quick regression
......
{-|
Module : Core.Utils
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Core.Utils where
import Test.Hspec
import Gargantext.Prelude
import Gargantext.Core.Utils
-- | Core.Utils tests
test :: IO ()
test = hspec $ do
describe "check if groupWithCounts works" $ do
it "simple integer array" $ do
(groupWithCounts [1, 2, 3, 1, 2, 3]) `shouldBe` [(1, 2), (2, 2), (3, 2)]
it "string" $ do
(groupWithCounts "abccba") `shouldBe` [('a', 2), ('b', 2), ('c', 2)]
{-|
Module : Graph.Clustering
Description : Basic tests to avoid quick regression
......@@ -40,4 +39,3 @@ test = hspec $ do
let
result = List.length partitions > 1
shouldBe True result
......@@ -11,6 +11,8 @@ Portability : POSIX
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import qualified Core.Utils as Utils
--import qualified Ngrams.Lang.Fr as Fr
--import qualified Ngrams.Lang as Lang
import qualified Ngrams.Lang.Occurrences as Occ
......@@ -22,11 +24,12 @@ import qualified Utils.Crypto as Crypto
main :: IO ()
main = do
Utils.test
-- Occ.parsersTest
-- Lang.ngramsExtractionTest FR
-- Lang.ngramsExtractionTest EN
-- Metrics.main
Graph.test
Graph.test
PD.testFromRFC3339
-- GD.test
Crypto.test
{-|
Module : Utils.Crypto
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -43,4 +42,3 @@ test = hspec $ do
let hash2 = hash (["b","a"] :: [Text])
it "compare" $ do
hash1 `shouldBe` hash2
......@@ -187,9 +187,9 @@ saveNodeStory :: ( MonadReader env m, MonadBase IO m, HasNodeStorySaver env )
saveNodeStory = do
saver <- view hasNodeStorySaver
liftBase $ do
Gargantext.Prelude.putStrLn "---- Running node story saver ----"
--Gargantext.Prelude.putStrLn "---- Running node story saver ----"
saver
Gargantext.Prelude.putStrLn "---- Node story saver finished ----"
--Gargantext.Prelude.putStrLn "---- Node story saver finished ----"
saveNodeStoryImmediate :: ( MonadReader env m, MonadBase IO m, HasNodeStoryImmediateSaver env )
......@@ -197,9 +197,9 @@ saveNodeStoryImmediate :: ( MonadReader env m, MonadBase IO m, HasNodeStoryImmed
saveNodeStoryImmediate = do
saver <- view hasNodeStoryImmediateSaver
liftBase $ do
Gargantext.Prelude.putStrLn "---- Running node story immediate saver ----"
--Gargantext.Prelude.putStrLn "---- Running node story immediate saver ----"
saver
Gargantext.Prelude.putStrLn "---- Node story immediate saver finished ----"
--Gargantext.Prelude.putStrLn "---- Node story immediate saver finished ----"
listTypeConflictResolution :: ListType -> ListType -> ListType
......
......@@ -130,26 +130,23 @@ reIndexWith :: ( HasNodeStory env err m
-> Set ListType
-> m ()
reIndexWith cId lId nt lts = do
printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
-- Getting [NgramsTerm]
ts <- List.concat
<$> map (\(k,vs) -> k:vs)
<$> HashMap.toList
<$> getTermsWith identity [lId] nt lts
-- printDebug "ts" ts
-- Taking the ngrams with 0 occurrences only (orphans)
-- occs <- getOccByNgramsOnlyFast' cId lId nt ts
-- printDebug "occs" occs
let orphans = ts {- List.concat
$ map (\t -> case HashMap.lookup t occs of
Nothing -> [t]
Just n -> if n <= 1 then [t] else [ ]
) ts
-}
-- printDebug "orphans" orphans
-}
printDebug "orphans" orphans
-- Get all documents of the corpus
docs <- selectDocNodes cId
......@@ -158,24 +155,25 @@ reIndexWith cId lId nt lts = do
-- Checking Text documents where orphans match
-- TODO Tests here
let
ngramsByDoc = map (HashMap.fromListWith (<>))
$ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
$ map (\doc -> List.zip
(termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
$ Text.unlines $ catMaybes
[ doc ^. context_hyperdata . hd_title
, doc ^. context_hyperdata . hd_abstract
]
-- fromListWith (<>)
ngramsByDoc = map (HashMap.fromList)
$ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
$ map (\doc -> List.zip
(termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
$ Text.unlines $ catMaybes
[ doc ^. context_hyperdata . hd_title
, doc ^. context_hyperdata . hd_abstract
]
)
(List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
(List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
) docs
-- printDebug "ngramsByDoc" ngramsByDoc
printDebug "ngramsByDoc: " ngramsByDoc
-- Saving the indexation in database
_ <- mapM (saveDocNgramsWith lId) ngramsByDoc
pure () -- ngramsByDoc
pure ()
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n
......
......@@ -195,7 +195,7 @@ getCoocByNgrams' f (Diagonal diag) m =
listToCombi identity ks
]
where
where
ks = HM.keys m
-- TODO k could be either k1 or k2 here
......@@ -220,7 +220,7 @@ getCoocByNgrams'' (Diagonal diag) (f1,f2) (m1,m2) =
-- TODO check optim
-- listToCombi identity ks1
]
where
where
ks1 = HM.keys m1
ks2 = HM.keys m2
......
......@@ -29,6 +29,7 @@ import Control.Monad.Reader (ReaderT)
import Control.Monad.Error.Class (MonadError(..))
import Crypto.JOSE.Error as Jose
import Data.Aeson.Types
import qualified Data.Text as Text
import Data.Typeable
import Data.Validity
import Gargantext.API.Admin.Orchestrator.Types
......@@ -115,7 +116,7 @@ data GargError
makePrisms ''GargError
instance ToJSON GargError where
toJSON _ = String "SomeGargErrorPleaseReport"
toJSON err = object [("error", String $ Text.pack $ show err)]
instance Exception GargError
......
......@@ -289,7 +289,7 @@ addCorpusWithForm user cid =
serveJobsAPI AddCorpusFormJob $ \i log' ->
let
log'' x = do
printDebug "[addToCorpusWithForm] " x
--printDebug "[addToCorpusWithForm] " x
liftBase $ log' x
in New.addToCorpusWithForm user cid i log'' (jobLogInit 3)
......
......@@ -18,6 +18,7 @@ module Gargantext.API.Search
where
import Data.Aeson hiding (defaultTaggedObject)
-- import Data.List (concat)
import Data.Maybe (fromMaybe)
import Data.Swagger hiding (fieldLabelModifier, Contact)
import Data.Text (Text)
......@@ -56,6 +57,7 @@ api nId (SearchQuery q SearchDoc) o l order =
SearchResult <$> SearchResultDoc
<$> map (toRow nId)
<$> searchInCorpus nId False q o l order
-- <$> searchInCorpus nId False (concat q) o l order
api nId (SearchQuery q SearchContact) o l order = do
printDebug "isPairedWith" nId
......@@ -69,11 +71,13 @@ api nId (SearchQuery q SearchContact) o l order = do
<$> map (toRow aId)
<$> searchInCorpusWithContacts nId aId q o l order
api _nId (SearchQuery _q SearchDocWithNgrams) _o _l _order = undefined
-----------------------------------------------------------------------
-----------------------------------------------------------------------
-- | Main Types
-----------------------------------------------------------------------
data SearchType = SearchDoc | SearchContact
data SearchType = SearchDoc | SearchContact | SearchDocWithNgrams
deriving (Generic)
instance FromJSON SearchType where
parseJSON = genericParseJSON (defaultOptions { sumEncoding = ObjectWithSingleField })
......@@ -132,7 +136,7 @@ instance FromJSON SearchResultTypes where
parseJSON = genericParseJSON (defaultOptions { sumEncoding = defaultTaggedObject })
instance ToJSON SearchResultTypes where
toJSON = genericToJSON (defaultOptions { sumEncoding = defaultTaggedObject })
instance Arbitrary SearchResultTypes where
arbitrary = do
srd <- SearchResultDoc <$> arbitrary
......@@ -163,7 +167,7 @@ data Row =
deriving (Generic)
instance FromJSON Row
where
parseJSON = genericParseJSON
parseJSON = genericParseJSON
( defaultOptions { sumEncoding = defaultTaggedObject } )
instance ToJSON Row
where
......
......@@ -455,7 +455,7 @@ insertArchiveList c nodeId a = do
where
query :: PGS.Query
query = [sql| WITH s as (SELECT ? as sid, ? sversion, ? sngrams_type_id, ngrams.id as sngrams_id, ?::jsonb as srepo FROM ngrams WHERE terms = ?)
INSERT INTO node_stories(node_id, version, ngrams_type_id, ngrams_id, ngrams_repo_element)
INSERT INTO node_stories(node_id, version, ngrams_type_id, ngrams_id, ngrams_repo_element)
SELECT s.sid, s.sversion, s.sngrams_type_id, s.sngrams_id, s.srepo from s s join nodes n on s.sid = n.id
|]
......@@ -505,22 +505,22 @@ updateNodeStory c nodeId@(NodeId _nId) currentArchive newArchive = do
--printDebug "[updateNodeStory] updates" $ Text.unlines $ (Text.pack . show) <$> updates
-- 2. Perform inserts/deletes/updates
printDebug "[updateNodeStory] applying insert" ()
--printDebug "[updateNodeStory] applying insert" ()
insertArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
, _a_history = []
, _a_state = archiveStateFromList inserts }
printDebug "[updateNodeStory] insert applied" ()
--printDebug "[updateNodeStory] insert applied" ()
--TODO Use currentArchive ^. a_version in delete and report error
-- if entries with (node_id, ngrams_type_id, ngrams_id) but
-- different version are found.
deleteArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
, _a_history = []
, _a_state = archiveStateFromList deletes }
printDebug "[updateNodeStory] delete applied" ()
--printDebug "[updateNodeStory] delete applied" ()
updateArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
, _a_history = []
, _a_state = archiveStateFromList updates }
printDebug "[updateNodeStory] update applied" ()
--printDebug "[updateNodeStory] update applied" ()
pure ()
-- where
......
......@@ -28,8 +28,9 @@ compute graph
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.Terms
where
......@@ -47,6 +48,7 @@ import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Data.HashMap.Strict as HashMap
import Gargantext.Core
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Core.Text (sentences, HasText(..))
import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Core.Text.Terms.Mono (monoTerms)
......@@ -70,24 +72,23 @@ data TermType lang
, _tt_model :: !(Maybe (Tries Token ()))
}
deriving (Generic)
deriving instance (Show lang) => Show (TermType lang)
makeLenses ''TermType
--group :: [Text] -> [Text]
--group = undefined
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
-- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------
-- | Sugar to extract terms from text (hiddeng mapM from end user).
-- | Sugar to extract terms from text (hidding 'mapM' from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms :: TermType Lang -> [Text] -> IO [[TermsWithCount]]
extractTerms (Unsupervised {..}) xs = mapM (terms (Unsupervised { _tt_model = Just m', .. })) xs
where
m' = case _tt_model of
Just m''-> m''
Nothing -> newTries _tt_windowSize (Text.intercalate " " xs)
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
......@@ -116,12 +117,13 @@ data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
instance Hashable ExtractedNgrams
-- | A typeclass that represents extracting ngrams from an entity.
class ExtractNgramsT h
where
extractNgramsT :: HasText h
=> TermType Lang
-> h
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms l pa po (Terms ng1 ng2) =
......@@ -132,7 +134,7 @@ enrichedTerms l pa po (Terms ng1 ng2) =
------------------------------------------------------------------------
cleanNgrams :: Int -> Ngrams -> Ngrams
cleanNgrams s ng
cleanNgrams s ng
| Text.length (ng ^. ngramsTerms) < s = ng
| otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms))
......@@ -151,10 +153,10 @@ insertExtractedNgrams ngs = do
let (s, e) = List.partition isSimpleNgrams ngs
m1 <- insertNgrams (map unSimpleNgrams s)
--printDebug "others" m1
m2 <- insertNgramsPostag (map unEnrichedNgrams e)
--printDebug "terms" m2
let result = HashMap.union m1 m2
pure result
......@@ -163,43 +165,41 @@ isSimpleNgrams (SimpleNgrams _) = True
isSimpleNgrams _ = False
------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
-- Multi : multi terms
-- MonoMulti : mono and multi
-- | Terms from 'Text'
-- 'Mono' : mono terms
-- 'Multi' : multi terms
-- 'MonoMulti' : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType Lang -> Text -> IO [Terms]
terms :: TermType Lang -> Text -> IO [TermsWithCount]
terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (Unsupervised { .. }) txt = termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
terms (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
where
m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: remove IO
-- TODO: newtype BlockText
type WindowSize = Int
type MinNgramSize = Int
termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
termsUnsupervised (Unsupervised l n s m) =
pure
. map (text2term l)
. List.nub
. (List.filter (\l' -> List.length l' >= s))
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: newtype BlockText
termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panic "[termsUnsupervised] no model"
termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) =
map (\(t, cnt) -> (text2term _tt_lang t, cnt))
. groupWithCounts
-- . List.nub
. (List.filter (\l' -> List.length l' >= _tt_windowSize))
. List.concat
. mainEleveWith (maybe (panic "no model") identity m) n
. mainEleveWith _tt_model _tt_ngramsSize
. uniText
termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t)
......@@ -217,5 +217,3 @@ text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure)
<$> ("!?(),;.:" :: String)
......@@ -32,6 +32,7 @@ Notes for current implementation:
-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
......@@ -278,6 +279,8 @@ data Tries k e = Tries
makeLenses ''Tries
deriving instance (Show k, Show e) => Show (Tries k e)
buildTries :: Int -> [[Token]] -> Tries Token ()
buildTries n sentences = Tries
{ _fwd = buildTrie Forward n sentences
......
......@@ -40,8 +40,8 @@ words = monoTexts
isSep :: Char -> Bool
isSep = (`elem` (",.:;?!(){}[]\"\'" :: String))
monoTerms :: Lang -> Text -> [Terms]
monoTerms l txt = map (monoText2term l) $ monoTexts txt
monoTerms :: Lang -> Text -> [TermsWithCount]
monoTerms l txt = map (\t -> (monoText2term l t, 1)) $ monoTexts txt
monoTexts :: Text -> [Text]
monoTexts = L.concat . monoTextsBySentence
......
......@@ -21,6 +21,7 @@ import Data.List (concat)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Core.Text.Terms.Multi.PosTagging
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
......@@ -37,14 +38,16 @@ import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
type NLP_API = Lang -> Text -> IO PosSentences
-------------------------------------------------------------------
multiterms :: Lang -> Text -> IO [Terms]
multiterms = multiterms' tokenTag2terms
multiterms :: Lang -> Text -> IO [TermsWithCount]
multiterms l txt = do
ret <- multiterms' tokenTag2terms l txt
pure $ groupWithCounts ret
where
multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
multiterms' f lang txt = concat
multiterms' f lang txt' = concat
<$> map (map f)
<$> map (filter (\t -> _my_token_pos t == Just NP))
<$> tokenTags lang txt
<$> tokenTags lang txt'
-------------------------------------------------------------------
tokenTag2terms :: TokenTag -> Terms
......@@ -57,10 +60,10 @@ tokenTags l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <
tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
tokenTagsWith lang txt nlp = map (groupTokens lang)
<$> map tokens2tokensTags
<$> map _sentenceTokens
<$> _sentences
<$> nlp lang txt
<$> map tokens2tokensTags
<$> map _sentenceTokens
<$> _sentences
<$> nlp lang txt
---- | This function analyses and groups (or not) ngrams according to
......
......@@ -30,4 +30,3 @@ group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
......@@ -40,4 +40,3 @@ groupTokens ntags = group2 NP NP
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
--
--groupNgrams (x:xs) = (x:(groupNgrams xs))
......@@ -45,19 +45,22 @@ tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------
tokenTag :: Token -> TokenTag
tokenTag (Token _ w _ l _ _ p n _ _) = TokenTag w' l' p n
tokenTag (Token { .. }) = TokenTag { _my_token_word = w'
, _my_token_lemma = l'
, _my_token_pos = _tokenPos
, _my_token_ner = _tokenNer }
where
w' = split w
l' = fromList (split l)
w' = split _tokenWord
l' = fromList (split _tokenLemma)
split = splitOn (pack " ") . toLower
filter' :: [TokenTag] -> [TokenTag]
filter' xs = filter isNgrams xs
where
isNgrams (TokenTag _ _ p n) = isJust p || isJust n
isNgrams (TokenTag { .. }) = isJust _my_token_pos || isJust _my_token_ner
------------------------------------------------------------------------
-- request =
-- request =
-- "fr" : {
-- "tokenize.language" : "fr",
-- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
......@@ -66,9 +69,9 @@ filter' xs = filter isNgrams xs
-- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
-- "depparse.language" : "french",
-- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
-- "ssplit.newlineIsSentenceBreak": "always"
-- "ssplit.newlineIsSentenceBreak": "always"
-- },
--
--
corenlp' :: ( FromJSON a
, ConvertibleStrings p ByteString
......@@ -80,6 +83,7 @@ corenlp' lang txt = do
FR -> "{\"annotators\": \"tokenize,ssplit,pos,lemma,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
_ -> panic $ pack "not implemented yet"
url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
let request = setRequestBodyLBS (cs txt) url
httpJSON request
......
......@@ -52,4 +52,3 @@ data PosSentences = PosSentences { _sentences :: [Sentence]}
deriving (Show, Generic)
$(deriveJSON (unPrefix "_") ''PosSentences)
{-|
Module : Gargantext.Core.Text.Terms.WithList
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -21,6 +21,8 @@ import Data.Text (Text, concat, unwords)
import Gargantext.Prelude
import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (groupWithCounts)
import Prelude (error)
import qualified Data.Algorithms.KMP as KMP
import qualified Data.IntMap.Strict as IntMap
......@@ -35,8 +37,11 @@ data Pattern = Pattern
type Patterns = [Pattern]
------------------------------------------------------------------------
replaceTerms :: Patterns -> [Text] -> [[Text]]
replaceTerms pats terms = go 0
data ReplaceTerms = KeepAll | LongestOnly
replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]]
replaceTerms rplaceTerms pats terms = go 0
where
terms_len = length terms
......@@ -47,15 +52,17 @@ replaceTerms pats terms = go 0
Just (len, term) ->
term : go (ix + len)
merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2)
m =
IntMap.fromListWith merge
m = toMap
[ (ix, (len, term))
| Pattern pat len term <- pats, ix <- KMP.match pat terms ]
toMap = case rplaceTerms of
KeepAll -> IntMap.fromList
LongestOnly -> IntMap.fromListWith merge
where
merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2)
buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
where
......@@ -71,8 +78,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
--------------------------------------------------------------------------
-- Utils
type MatchedText = Text
termsInText :: Patterns -> Text -> [MatchedText]
termsInText pats txt = List.nub
termsInText :: Patterns -> Text -> [(MatchedText, TermsCount)]
termsInText pats txt = groupWithCounts
$ List.concat
$ map (map unwords)
$ extractTermsWithList pats txt
......@@ -80,14 +87,14 @@ termsInText pats txt = List.nub
--------------------------------------------------------------------------
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence
-- | Extract terms
-- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
-- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
-- ["chat blanc"]
extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
. monoTextsBySentence
--------------------------------------------------------------------------
......@@ -95,7 +102,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
{- | Not used
filterWith :: TermList
-> (a -> Text)
-> [a]
-> [a]
-> [(a, [Text])]
filterWith termList f xs = filterWith' termList f zip xs
......@@ -103,7 +110,7 @@ filterWith termList f xs = filterWith' termList f zip xs
filterWith' :: TermList
-> (a -> Text)
-> ([a] -> [[Text]] -> [b])
-> [a]
-> [a]
-> [b]
filterWith' termList f f' xs = f' xs
$ map (extractTermsWithList' pats)
......
{-|
Module : Gargantext.Types
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -17,7 +17,7 @@ commentary with @some markup@.
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Database.Admin.Types.Node
, DebugMode(..), withDebugMode
, Term, Terms(..)
, Term, Terms(..), TermsCount, TermsWithCount
, TokenTag(..), POS(..), NER(..)
, Label, Stems
, HasInvalidError(..), assertValid
......@@ -70,10 +70,13 @@ type Label = [Text]
data Terms = Terms { _terms_label :: Label
, _terms_stem :: Stems
} deriving (Ord, Show)
instance Eq Terms where
(==) (Terms _ s1) (Terms _ s2) = s1 == s2
type TermsCount = Int
type TermsWithCount = (Terms, TermsCount)
------------------------------------------------------------------------
data Tag = POS | NER
deriving (Show, Eq)
......@@ -208,5 +211,3 @@ data TODO = TODO
instance ToSchema TODO where
instance ToParamSchema TODO where
----------------------------------------------------------------------------
{-|
Module : Gargantext.Utils
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -12,16 +12,19 @@ commentary with @some markup@.
-}
module Gargantext.Core.Utils (
module Gargantext.Core.Utils (
-- module Gargantext.Utils.Chronos
module Gargantext.Core.Utils.Prefix
, something
, alphanum
, choices
, randomString
, groupWithCounts
, addTuples
) where
import Data.Char (chr, ord)
import qualified Data.List as List
import Data.Maybe
import Data.Monoid
import Data.Text (Text, pack)
......@@ -57,3 +60,17 @@ randomString :: Int -> IO Text
randomString num = do
str <- choices num alphanum
pure $ pack str
-- | Given a list of items of type 'a', return list with unique items
-- (like List.nub) but tuple-d with their counts in the original list
groupWithCounts :: (Ord a, Eq a) => [a] -> [(a, Int)]
groupWithCounts = map f
. List.group
. List.sort
where
f [] = panic "[groupWithCounts] impossible"
f ts@(t:_) = (t, length ts)
addTuples :: (Num a, Num b) => (a, b) -> (a, b) -> (a, b)
addTuples (a1, b1) (a2, b2) = (a1 + a2, b1 + b2)
......@@ -65,7 +65,7 @@ flowPhylo cId = do
patterns = buildPatterns termList
-- | To filter the Ngrams of a document based on the termList
filterTerms :: Patterns -> (Date, Text) -> (Date, [Text])
filterTerms patterns' (y,d) = (y,termsInText patterns' d)
filterTerms patterns' (y,d) = (y, fst <$> termsInText patterns' d)
docs = map ((\(y,t) -> Document y t) . filterTerms patterns) docs'
......@@ -123,4 +123,3 @@ writePhylo _fp _phview = undefined
-- refactor 2021
-- viewPhylo2Svg :: PhyloView -> IO DB.ByteString
-- viewPhylo2Svg p = graphvizWithHandle Dot (viewToDot p) Svg DB.hGetContents
......@@ -48,7 +48,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
where
import Conduit
import Control.Lens ((^.), view, _Just, makeLenses)
import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
import Control.Monad.Reader (MonadReader)
import Data.Aeson.TH (deriveJSON)
import Data.Conduit.Internal (zipSources)
......@@ -61,7 +61,6 @@ import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import qualified Data.Text as T
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import Servant.Client (ClientError)
......@@ -84,9 +83,10 @@ import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(.
import Gargantext.Core.Text.List.Social (FlowSocialListWith)
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (POS(NP))
import Gargantext.Core.Types (POS(NP), TermsCount)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils (addTuples)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
......@@ -362,25 +362,27 @@ insertMasterDocs c lang hs = do
-- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
<- mapNodeIdNgrams
<$> documentIdWithNgrams
(extractNgramsT $ withLang lang documentsWithId)
documentsWithId
(extractNgramsT $ withLang lang documentsWithId)
documentsWithId
lId <- getOrMkList masterCorpusId masterUserId
-- _ <- saveDocNgramsWith lId mapNgramsDocs'
_ <- saveDocNgramsWith lId mapNgramsDocs'
-- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
pure ids'
saveDocNgramsWith :: ( FlowCmdM env err m)
saveDocNgramsWith :: (FlowCmdM env err m)
=> ListId
-> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
-> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
-> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
--printDebug "terms2id" terms2id
--printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount
let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
......@@ -397,7 +399,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
<*> Just (fromIntegral w :: Double)
| (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
, (nId, w) <- Map.toList mapNodeIdWeight
, (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight
]
-- to be removed
......@@ -456,14 +458,14 @@ mergeData rs = catMaybes . map toDocumentWithId . Map.toList
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
=> (a
-> Cmd err (HashMap b (Map NgramsType Int)))
-> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
-> [Indexed NodeId a]
-> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
e <- f $ _unIndex d
pure $ DocumentIdWithNgrams d e
pure $ DocumentIdWithNgrams d e
-- | TODO check optimization
......@@ -471,13 +473,17 @@ mapNodeIdNgrams :: (Ord b, Hashable b)
=> [DocumentIdWithNgrams a b]
-> HashMap b
(Map NgramsType
(Map NodeId Int)
(Map NodeId (Int, TermsCount))
)
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
where
-- | NOTE We are somehow multiplying 'TermsCount' here: If the
-- same ngrams term has different ngrams types, the 'TermsCount'
-- for it (which is the number of times the terms appears in a
-- document) is copied over to all its types.
f :: DocumentIdWithNgrams a b
-> HashMap b (Map NgramsType (Map NodeId Int))
f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
-> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
where
nId = _index $ documentWithId d
......@@ -488,25 +494,25 @@ instance ExtractNgramsT HyperdataContact
extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
where
extract :: TermType Lang -> HyperdataContact
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extract _l hc' = do
let authors = map text2ngrams
$ maybe ["Nothing"] (\a -> [a])
$ view (hc_who . _Just . cw_lastName) hc'
pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
pure $ HashMap.fromList $ [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
instance ExtractNgramsT HyperdataDocument
where
extractNgramsT :: TermType Lang
-> HyperdataDocument
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
where
extractNgramsT' :: TermType Lang
-> HyperdataDocument
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extractNgramsT' lang' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
......@@ -520,23 +526,23 @@ instance ExtractNgramsT HyperdataDocument
$ maybe ["Nothing"] (T.splitOn ", ")
$ _hd_authors doc
terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
<$> concat
<$> liftBase (extractTerms lang' $ hasText doc)
termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
<$> concat
<$> liftBase (extractTerms lang' $ hasText doc)
pure $ HashMap.fromList
$ [(SimpleNgrams source, Map.singleton Sources 1) ]
<> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
<> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
<> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
$ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ]
<> [(SimpleNgrams i', (Map.singleton Institutes 1, 1)) | i' <- institutes ]
<> [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
<> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
where
extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h
instance HasText a => HasText (Node a)
where
hasText (Node _ _ _ _ _ _ _ h) = hasText h
hasText (Node { _node_hyperdata = h }) = hasText h
......
......@@ -15,6 +15,7 @@ module Gargantext.Database.Action.Flow.Utils
import Data.Map (Map)
import Data.HashMap.Strict (HashMap)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Query.Table.ContextNodeNgrams
......@@ -29,24 +30,24 @@ import qualified Data.HashMap.Strict as HashMap
data DocumentIdWithNgrams a b =
DocumentIdWithNgrams
{ documentWithId :: Indexed NodeId a
, documentNgrams :: HashMap b (Map NgramsType Int)
, documentNgrams :: HashMap b (Map NgramsType Int, TermsCount)
} deriving (Show)
insertDocNgrams :: ListId
-> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId Int))
-> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId (Int, TermsCount)))
-> Cmd err Int
insertDocNgrams lId m = insertContextNodeNgrams ns
insertDocNgrams lId m = do
printDebug "[insertDocNgrams] ns" ns
insertContextNodeNgrams ns
where
ns = [ ContextNodeNgrams docId lId (ng^.index)
(ngramsTypeId t)
(fromIntegral i)
cnt
| (ng, t2n2i) <- HashMap.toList m
, (t, n2i) <- DM.toList t2n2i
, (docId, i) <- DM.toList n2i
, (docId, (i, cnt)) <- DM.toList n2i
]
-- [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})]
-- {Ngrams: {NgramsType: {NodeId: (Int, TermsCount)}}}
......@@ -121,6 +121,7 @@ getOccByNgramsOnlyFast cId lId nt = do
-> Cmd err [(Text, DPST.PGArray Int)]
run cId' lId' nt' = runPGSQuery query
( cId'
, cId'
, lId'
, ngramsTypeId nt'
)
......@@ -130,7 +131,14 @@ getOccByNgramsOnlyFast cId lId nt = do
SELECT ng.terms
-- , ng.id
--, round(nng.weight)
, ARRAY(SELECT DISTINCT context_node_ngrams.context_id FROM context_node_ngrams WHERE ng.id = ngrams_id) AS context_ids
, ARRAY(
SELECT DISTINCT context_node_ngrams.context_id
FROM context_node_ngrams
JOIN nodes_contexts
ON context_node_ngrams.context_id = nodes_contexts.context_id
WHERE ng.id = context_node_ngrams.ngrams_id
AND nodes_contexts.node_id = ?
) AS context_ids
-- , ns.version
-- , nng.ngrams_type
-- , ns.ngrams_type_id
......
......@@ -14,7 +14,10 @@ module Gargantext.Database.Action.Search where
import Control.Arrow (returnA)
import Control.Lens ((^.))
import qualified Data.List as List
import qualified Data.Map as Map
import Data.Maybe
import qualified Data.Set as Set
import Data.Text (Text, unpack, intercalate)
import Data.Time (UTCTime)
import Gargantext.Core
......@@ -26,8 +29,11 @@ import Gargantext.Database.Query.Filter
import Gargantext.Database.Query.Join (leftJoin5)
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Context
import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
import Gargantext.Database.Query.Table.NodeContext
import Gargantext.Database.Query.Table.NodeContext_NodeContext
import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Schema.Node
import Gargantext.Database.Schema.Context
import Gargantext.Prelude
......@@ -43,7 +49,7 @@ searchDocInDatabase :: HasDBid NodeType
-> Cmd err [(NodeId, HyperdataDocument)]
searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
where
-- | Global search query where ParentId is Master Node Corpus Id
-- | Global search query where ParentId is Master Node Corpus Id
queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
queryDocInDatabase _p q = proc () -> do
row <- queryNodeSearchTable -< ()
......@@ -51,6 +57,74 @@ searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
returnA -< (_ns_id row, _ns_hyperdata row)
------------------------------------------------------------------------
-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
-- search only to map/candidate terms.
searchInCorpusWithNgrams :: HasDBid NodeType
=> CorpusId
-> ListId
-> IsTrash
-> NgramsType
-> [[Text]]
-> Maybe Offset
-> Maybe Limit
-> Maybe OrderBy
-> Cmd err [FacetDoc]
searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
-- case only the "TF" part makes sense and so we only compute the
-- ratio of "number of times our terms appear in given document" and
-- "number of all terms in document" and return a sorted list of
-- document ids
tfidfAll :: CorpusId -> [Int] -> Cmd err [Int]
tfidfAll cId ngramIds = do
let ngramIdsSet = Set.fromList ngramIds
docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)]
-- NOTE The query returned docs with ANY ngramIds. We need to further
-- restrict to ALL ngramIds.
let docsNgramsM =
Map.fromListWith (Set.union)
[ (ctxId, Set.singleton ngrams_id)
| (ctxId, ngrams_id, _) <- docsWithNgrams]
let docsWithAllNgramsS = Set.fromList $ List.map fst $
List.filter (\(_, docNgrams) ->
ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
let docsWithAllNgrams =
List.filter (\(ctxId, _, _) ->
Set.member ctxId docsWithAllNgramsS) docsWithNgrams
printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
| (ctxId, _, doc_count) <- docsWithAllNgrams]
printDebug "[tfidfAll] docsWithCounts" docsWithCounts
let totals = [ ( ctxId
, ngrams_id
, fromIntegral doc_count :: Double
, fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
| (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
| (ctxId, _, doc_count, s) <- totals]
pure $ List.map fst $ List.reverse tfidf_sorted
-- | Query for searching the 'context_node_ngrams' table so that we
-- find docs with ANY given 'ngramIds'.
queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
queryCorpusWithNgrams cId ngramIds = proc () -> do
row <- queryContextNodeNgramsTable -< ()
restrict -< (_cnng_node_id row) .== (pgNodeId cId)
restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
returnA -< ( _cnng_context_id row
, _cnng_ngrams_id row
, _cnng_doc_count row)
--returnA -< row
-- returnA -< ( _cnng_context_id row
-- , _cnng_node_id row
-- , _cnng_ngrams_id row
-- , _cnng_ngramsType row
-- , _cnng_weight row
-- , _cnng_doc_count row)
------------------------------------------------------------------------
-- | todo add limit and offset and order
searchInCorpus :: HasDBid NodeType
......@@ -209,4 +283,3 @@ queryContactViaDoc =
)
) -> Column SqlBool
cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id
......@@ -103,7 +103,7 @@ data Facet id created title hyperdata category ngramCount score =
, facetDoc_score :: score
} deriving (Show, Generic)
{- | TODO after demo
data Facet id date hyperdata score =
data Facet id date hyperdata score =
FacetDoc { facetDoc_id :: id
, facetDoc_date :: date
, facetDoc_hyperdata :: hyperdata
......@@ -163,7 +163,7 @@ type FacetPairedReadNull = FacetPaired (Column (Nullable SqlInt4) )
(Column (Nullable SqlInt4) )
type FacetPairedReadNullAgg = FacetPaired (Aggregator (Column (Nullable SqlInt4) )
(Column (Nullable SqlInt4) )
(Column (Nullable SqlInt4) )
)
(Aggregator (Column (Nullable SqlTimestamptz))
(Column (Nullable SqlTimestamptz))
......
......@@ -43,7 +43,7 @@ import Opaleye.Internal.Join (NullMaker(..))
import qualified Opaleye.Internal.Unpackspec()
keepWhen :: (a -> Field SqlBool) -> SelectArr a a
keepWhen :: (a -> Field SqlBool) -> SelectArr a a
keepWhen p = proc a -> do
restrict -< p a
returnA -< a
......@@ -61,7 +61,7 @@ leftJoin2 = leftJoin
------------------------------------------------------------------------
-- | LeftJoin3 in two ways to write it
leftJoin3 :: Select columnsA -> Select columnsB -> Select columnsC
-> ((columnsA, columnsB, columnsC) -> Column SqlBool)
-> ((columnsA, columnsB, columnsC) -> Column SqlBool)
-> Select (columnsA, columnsB, columnsC)
leftJoin3 q1 q2 q3 cond = ((,,) <$> q1 <*> q2 <*> q3) >>> keepWhen cond
......@@ -82,7 +82,7 @@ leftJoin4' :: Select columnsA
-> Select columnsB
-> Select columnsC
-> Select columnsD
-> ((columnsA, columnsB, columnsC, columnsD) -> Column SqlBool)
-> ((columnsA, columnsB, columnsC, columnsD) -> Column SqlBool)
-> Select (columnsA, columnsB, columnsC, columnsD)
leftJoin4' q1 q2 q3 q4 cond = ((,,,) <$> q1 <*> q2 <*> q3 <*> q4) >>> keepWhen cond
......@@ -375,4 +375,3 @@ leftJoin9 q1 q2 q3 q4 q5 q6 q7 q8 q9
) cond67
) cond78
) cond89
......@@ -36,12 +36,13 @@ queryContextNodeNgramsTable = selectTable contextNodeNgramsTable
-- | Insert utils
insertContextNodeNgrams :: [ContextNodeNgrams] -> Cmd err Int
insertContextNodeNgrams = insertContextNodeNgramsW
. map (\(ContextNodeNgrams c n ng nt w) ->
. map (\(ContextNodeNgrams c n ng nt w dc) ->
ContextNodeNgrams (pgContextId c)
(pgNodeId n)
(sqlInt4 ng)
(pgNgramsTypeId nt)
(sqlDouble w)
(sqlInt4 dc)
)
insertContextNodeNgramsW :: [ContextNodeNgramsWrite] -> Cmd err Int
......
......@@ -35,8 +35,9 @@ updateHyperdata i h = mkCmd $ \c -> putStrLn "before runUpdate_" >>
updateHyperdataQuery :: ToJSON a => NodeId -> a -> Update Int64
updateHyperdataQuery i h = seq h' $ trace "updateHyperdataQuery: encoded JSON" $ Update
{ uTable = nodeTable
, uUpdateWith = updateEasy (\ (Node _ni _nh _nt _nu _np _nn _nd _h)
-> trace "updating mate" $ Node _ni _nh _nt _nu _np _nn _nd h'
, uUpdateWith = updateEasy (\ (Node { .. })
-> Node { _node_hyperdata = h', .. }
-- -> trace "updating mate" $ Node _ni _nh _nt _nu _np _nn _nd h'
)
, uWhere = (\row -> {-trace "uWhere" $-} _node_id row .== pgNodeId i )
, uReturning = rCount
......@@ -63,5 +64,3 @@ updateNodesWithType_ :: ( HasNodeError err
updateNodesWithType_ nt h = do
ns <- getNodesIdWithType nt
mapM (\n -> updateHyperdata n h) ns
......@@ -19,6 +19,7 @@ module Gargantext.Database.Schema.ContextNodeNgrams
where
import Prelude
import Gargantext.Core.Types (TermsCount)
import Gargantext.Database.Schema.Prelude
import Gargantext.Database.Schema.Ngrams (NgramsTypeId, NgramsId)
import Gargantext.Database.Admin.Types.Node
......@@ -26,15 +27,16 @@ import Gargantext.Database.Admin.Types.Node
type ContextNodeNgrams =
ContextNodeNgramsPoly ContextId ListId NgramsId NgramsTypeId Double
ContextNodeNgramsPoly ContextId ListId NgramsId NgramsTypeId Double TermsCount
data ContextNodeNgramsPoly c n ngrams_id ngt w
data ContextNodeNgramsPoly c n ngrams_id ngt w dc
= ContextNodeNgrams { _cnng_context_id :: !c
, _cnng_node_id :: !n
, _cnng_ngrams_id :: !ngrams_id
, _cnng_ngramsType :: !ngt
, _cnng_weight :: !w
, _cnng_doc_count :: !dc
} deriving (Show)
type ContextNodeNgramsWrite =
......@@ -43,6 +45,7 @@ type ContextNodeNgramsWrite =
(Column SqlInt4 )
(Column SqlInt4 )
(Column SqlFloat8)
(Column SqlInt4 )
type ContextNodeNgramsRead =
ContextNodeNgramsPoly (Column SqlInt4 )
......@@ -50,6 +53,7 @@ type ContextNodeNgramsRead =
(Column SqlInt4 )
(Column SqlInt4 )
(Column SqlFloat8)
(Column SqlInt4 )
type ContextNodeNgramsReadNull =
ContextNodeNgramsPoly (Column (Nullable SqlInt4 ))
......@@ -57,6 +61,7 @@ type ContextNodeNgramsReadNull =
(Column (Nullable SqlInt4 ))
(Column (Nullable SqlInt4 ))
(Column (Nullable SqlFloat8))
(Column (Nullable SqlInt4 ))
$(makeAdaptorAndInstance "pContextNodeNgrams" ''ContextNodeNgramsPoly)
makeLenses ''ContextNodeNgramsPoly
......@@ -70,5 +75,6 @@ contextNodeNgramsTable = Table "context_node_ngrams"
, _cnng_ngrams_id = requiredTableField "ngrams_id"
, _cnng_ngramsType = requiredTableField "ngrams_type"
, _cnng_weight = requiredTableField "weight"
, _cnng_doc_count = requiredTableField "doc_count"
}
)
{-|
Module : Gargantext.Database.Schema.NodeNgrams
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -86,7 +86,7 @@ type NgramsClass = Int
type NgramsText = Text
-- Example of list Ngrams
-- type ListNgrams = NodeNgramsPoly (Maybe Int) ListType Text
-- type ListNgrams = NodeNgramsPoly (Maybe Int) ListType Text
type NodeNgramsW =
NodeNgramsPoly (Maybe NodeNgramsId) NodeId ListType NgramsText
......
......@@ -83,7 +83,7 @@ extra-deps:
- git: https://gitlab.iscpif.fr/gargantext/crawlers/isidore.git
commit: 3db385e767d2100d8abe900833c6e7de3ac55e1b
- git: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git
commit: 4d6ae5aad435c00cdae1d47ebb5281d13d7b172c
commit: a2d78abeaec9315be765b90d5e51a4a50c48e7b8
#- git: https://gitlab.iscpif.fr/cgenie/arxiv-api.git
#- arxiv-0.0.3@sha256:02de1114091d11f1f3ab401d104d125ad4301260806feb7f63b3dcefc7db88cf,1588
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment