Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
153
Issues
153
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
da954a33
Commit
da954a33
authored
Nov 24, 2022
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/145-dev-graph-explorer-search-tfidf' into dev
parents
23d2a12f
00f726d1
Changes
38
Hide whitespace changes
Inline
Side-by-side
Showing
38 changed files
with
310 additions
and
163 deletions
+310
-163
README.md
README.md
+2
-0
Main.hs
bin/gargantext-upgrade/Main.hs
+1
-2
docker-compose.yaml
devops/docker/docker-compose.yaml
+1
-1
schema.sql
devops/postgres/schema.sql
+1
-0
0.0.6.7.3.sql
devops/postgres/upgrade/0.0.6.7.3.sql
+2
-0
gargantext.cabal
gargantext.cabal
+4
-3
package.yaml
package.yaml
+2
-0
Text.hs
src-test/Core/Text.hs
+0
-1
Utils.hs
src-test/Core/Utils.hs
+28
-0
Clustering.hs
src-test/Graph/Clustering.hs
+0
-2
Main.hs
src-test/Main.hs
+4
-1
Crypto.hs
src-test/Utils/Crypto.hs
+1
-3
Ngrams.hs
src/Gargantext/API/Ngrams.hs
+4
-4
List.hs
src/Gargantext/API/Ngrams/List.hs
+10
-9
Tools.hs
src/Gargantext/API/Ngrams/Tools.hs
+2
-2
Search.hs
src/Gargantext/API/Search.hs
+10
-7
NodeStory.hs
src/Gargantext/Core/NodeStory.hs
+5
-5
Terms.hs
src/Gargantext/Core/Text/Terms.hs
+29
-31
Eleve.hs
src/Gargantext/Core/Text/Terms/Eleve.hs
+3
-0
Mono.hs
src/Gargantext/Core/Text/Terms/Mono.hs
+2
-2
Multi.hs
src/Gargantext/Core/Text/Terms/Multi.hs
+11
-8
Group.hs
src/Gargantext/Core/Text/Terms/Multi/Group.hs
+0
-1
En.hs
src/Gargantext/Core/Text/Terms/Multi/Lang/En.hs
+0
-1
PosTagging.hs
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
+11
-7
Types.hs
src/Gargantext/Core/Text/Terms/Multi/PosTagging/Types.hs
+0
-1
WithList.hs
src/Gargantext/Core/Text/Terms/WithList.hs
+7
-5
Types.hs
src/Gargantext/Core/Types.hs
+6
-5
Utils.hs
src/Gargantext/Core/Utils.hs
+19
-2
LegacyMain.hs
src/Gargantext/Core/Viz/Phylo/Legacy/LegacyMain.hs
+1
-2
Flow.hs
src/Gargantext/Database/Action/Flow.hs
+38
-32
Utils.hs
src/Gargantext/Database/Action/Flow/Utils.hs
+10
-9
Search.hs
src/Gargantext/Database/Action/Search.hs
+76
-2
Facet.hs
src/Gargantext/Database/Query/Facet.hs
+2
-2
Join.hs
src/Gargantext/Database/Query/Join.hs
+3
-4
ContextNodeNgrams.hs
src/Gargantext/Database/Query/Table/ContextNodeNgrams.hs
+2
-1
UpdateOpaleye.hs
src/Gargantext/Database/Query/Table/Node/UpdateOpaleye.hs
+3
-4
ContextNodeNgrams.hs
src/Gargantext/Database/Schema/ContextNodeNgrams.hs
+8
-2
NodeNgrams.hs
src/Gargantext/Database/Schema/NodeNgrams.hs
+2
-2
No files found.
README.md
View file @
da954a33
...
...
@@ -196,6 +196,8 @@ To build documentation, run:
stack
--docker
build
--haddock
--no-haddock-deps
--fast
```
(in
`.stack-work/dist/x86_64-linux-nix/Cabal-3.2.1.0/doc/html/gargantext`
).
## GraphQL
Some introspection information.
...
...
bin/gargantext-upgrade/Main.hs
View file @
da954a33
...
...
@@ -33,7 +33,6 @@ import Gargantext.Database.Admin.Trigger.Init
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataCorpus
)
import
Gargantext.Database.Admin.Types.Node
(
NodeType
(
NodeDocument
,
NodeContact
))
import
Gargantext.Database.Prelude
(
Cmd
''
,
Cmd
,
execPGSQuery
)
import
Gargantext.Database.Query.Table.Node
(
getOrMkList
)
import
Gargantext.Prelude
import
Gargantext.Prelude.Config
(
GargConfig
(
..
),
readConfig
)
import
Prelude
(
getLine
)
...
...
@@ -67,7 +66,7 @@ main = do
_ok
<-
getLine
cfg
<-
readConfig
iniPath
let
secret
=
_gc_secretkey
cfg
let
_
secret
=
_gc_secretkey
cfg
withDevEnv
iniPath
$
\
env
->
do
-- First upgrade the Database Schema
...
...
devops/docker/docker-compose.yaml
View file @
da954a33
...
...
@@ -25,7 +25,7 @@ services:
network_mode
:
host
#command: ["postgres", "-c", "log_statement=all"]
#ports:
#
- 5432:5432
#
- 5432:5432
environment
:
POSTGRES_USER
:
gargantua
POSTGRES_PASSWORD
:
C8kdcUrAQy66U
...
...
devops/postgres/schema.sql
View file @
da954a33
...
...
@@ -164,6 +164,7 @@ CREATE TABLE public.context_node_ngrams (
ngrams_id
INTEGER
NOT
NULL
REFERENCES
public
.
ngrams
(
id
)
ON
DELETE
CASCADE
,
ngrams_type
INTEGER
,
weight
double
precision
,
doc_count
INTEGER
DEFAULT
0
,
PRIMARY
KEY
(
context_id
,
node_id
,
ngrams_id
,
ngrams_type
)
);
ALTER
TABLE
public
.
context_node_ngrams
OWNER
TO
gargantua
;
...
...
devops/postgres/upgrade/0.0.6.7.3.sql
0 → 100644
View file @
da954a33
ALTER
TABLE
context_node_ngrams
ADD
COLUMN
doc_count
INTEGER
DEFAULT
0
;
gargantext.cabal
View file @
da954a33
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.3
5.0
.
-- This file has been generated from package.yaml by hpack version 0.3
4.7
.
--
-- see: https://github.com/sol/hpack
...
...
@@ -50,6 +50,7 @@ library
Gargantext.Core.Types
Gargantext.Core.Types.Individu
Gargantext.Core.Types.Main
Gargantext.Core.Utils
Gargantext.Core.Utils.Prefix
Gargantext.Utils.Jobs
Gargantext.Utils.Jobs.API
...
...
@@ -87,6 +88,7 @@ library
Gargantext.Core.Text.Prepare
Gargantext.Core.Text.Search
Gargantext.Core.Text.Terms
Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono
Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr
...
...
@@ -211,7 +213,6 @@ library
Gargantext.Core.Text.Samples.EN
Gargantext.Core.Text.Samples.FR
Gargantext.Core.Text.Samples.SP
Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono.Stem
Gargantext.Core.Text.Terms.Mono.Stem.En
Gargantext.Core.Text.Terms.Mono.Token
...
...
@@ -221,7 +222,6 @@ library
Gargantext.Core.Text.Terms.Multi.PosTagging
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Types.Phylo
Gargantext.Core.Utils
Gargantext.Core.Utils.DateUtils
Gargantext.Core.Viz
Gargantext.Core.Viz.Chart
...
...
@@ -809,6 +809,7 @@ test-suite garg-test
Core.Text
Core.Text.Examples
Core.Text.Flow
Core.Utils
Graph.Clustering
Graph.Distance
Ngrams.Lang
...
...
package.yaml
View file @
da954a33
...
...
@@ -74,6 +74,7 @@ library:
-
Gargantext.Core.Types
-
Gargantext.Core.Types.Individu
-
Gargantext.Core.Types.Main
-
Gargantext.Core.Utils
-
Gargantext.Core.Utils.Prefix
-
Gargantext.Utils.Jobs
-
Gargantext.Utils.Jobs.API
...
...
@@ -111,6 +112,7 @@ library:
-
Gargantext.Core.Text.Prepare
-
Gargantext.Core.Text.Search
-
Gargantext.Core.Text.Terms
-
Gargantext.Core.Text.Terms.Eleve
-
Gargantext.Core.Text.Terms.Mono
-
Gargantext.Core.Text.Terms.Multi.Lang.En
-
Gargantext.Core.Text.Terms.Multi.Lang.Fr
...
...
src-test/Core/Text.hs
View file @
da954a33
{-|
Module : Graph.Clustering
Description : Basic tests to avoid quick regression
...
...
src-test/Core/Utils.hs
0 → 100644
View file @
da954a33
{-|
Module : Core.Utils
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module
Core.Utils
where
import
Test.Hspec
import
Gargantext.Prelude
import
Gargantext.Core.Utils
-- | Core.Utils tests
test
::
IO
()
test
=
hspec
$
do
describe
"check if groupWithCounts works"
$
do
it
"simple integer array"
$
do
(
groupWithCounts
[
1
,
2
,
3
,
1
,
2
,
3
])
`
shouldBe
`
[(
1
,
2
),
(
2
,
2
),
(
3
,
2
)]
it
"string"
$
do
(
groupWithCounts
"abccba"
)
`
shouldBe
`
[(
'a'
,
2
),
(
'b'
,
2
),
(
'c'
,
2
)]
src-test/Graph/Clustering.hs
View file @
da954a33
{-|
Module : Graph.Clustering
Description : Basic tests to avoid quick regression
...
...
@@ -40,4 +39,3 @@ test = hspec $ do
let
result
=
List
.
length
partitions
>
1
shouldBe
True
result
src-test/Main.hs
View file @
da954a33
...
...
@@ -11,6 +11,8 @@ Portability : POSIX
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
qualified
Core.Utils
as
Utils
--import qualified Ngrams.Lang.Fr as Fr
--import qualified Ngrams.Lang as Lang
import
qualified
Ngrams.Lang.Occurrences
as
Occ
...
...
@@ -22,11 +24,12 @@ import qualified Utils.Crypto as Crypto
main
::
IO
()
main
=
do
Utils
.
test
-- Occ.parsersTest
-- Lang.ngramsExtractionTest FR
-- Lang.ngramsExtractionTest EN
-- Metrics.main
Graph
.
test
Graph
.
test
PD
.
testFromRFC3339
-- GD.test
Crypto
.
test
src-test/Utils/Crypto.hs
View file @
da954a33
{-|
Module : Utils.Crypto
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
...
...
@@ -43,4 +42,3 @@ test = hspec $ do
let
hash2
=
hash
([
"b"
,
"a"
]
::
[
Text
])
it
"compare"
$
do
hash1
`
shouldBe
`
hash2
src/Gargantext/API/Ngrams.hs
View file @
da954a33
...
...
@@ -187,9 +187,9 @@ saveNodeStory :: ( MonadReader env m, MonadBase IO m, HasNodeStorySaver env )
saveNodeStory
=
do
saver
<-
view
hasNodeStorySaver
liftBase
$
do
Gargantext
.
Prelude
.
putStrLn
"---- Running node story saver ----"
--
Gargantext.Prelude.putStrLn "---- Running node story saver ----"
saver
Gargantext
.
Prelude
.
putStrLn
"---- Node story saver finished ----"
--
Gargantext.Prelude.putStrLn "---- Node story saver finished ----"
saveNodeStoryImmediate
::
(
MonadReader
env
m
,
MonadBase
IO
m
,
HasNodeStoryImmediateSaver
env
)
...
...
@@ -197,9 +197,9 @@ saveNodeStoryImmediate :: ( MonadReader env m, MonadBase IO m, HasNodeStoryImmed
saveNodeStoryImmediate
=
do
saver
<-
view
hasNodeStoryImmediateSaver
liftBase
$
do
Gargantext
.
Prelude
.
putStrLn
"---- Running node story immediate saver ----"
--
Gargantext.Prelude.putStrLn "---- Running node story immediate saver ----"
saver
Gargantext
.
Prelude
.
putStrLn
"---- Node story immediate saver finished ----"
--
Gargantext.Prelude.putStrLn "---- Node story immediate saver finished ----"
listTypeConflictResolution
::
ListType
->
ListType
->
ListType
...
...
src/Gargantext/API/Ngrams/List.hs
View file @
da954a33
...
...
@@ -158,16 +158,17 @@ reIndexWith cId lId nt lts = do
-- Checking Text documents where orphans match
-- TODO Tests here
let
ngramsByDoc
=
map
(
HashMap
.
fromListWith
(
<>
))
$
map
(
map
(
\
(
k
,
v
)
->
(
SimpleNgrams
(
text2ngrams
k
),
v
)))
$
map
(
\
doc
->
List
.
zip
(
termsInText
(
buildPatterns
$
map
(
\
k
->
(
Text
.
splitOn
" "
$
unNgramsTerm
k
,
[]
))
orphans
)
$
Text
.
unlines
$
catMaybes
[
doc
^.
context_hyperdata
.
hd_title
,
doc
^.
context_hyperdata
.
hd_abstract
]
-- fromListWith (<>)
ngramsByDoc
=
map
(
HashMap
.
fromList
)
$
map
(
map
(
\
((
k
,
cnt
),
v
)
->
(
SimpleNgrams
(
text2ngrams
k
),
over
(
traverse
.
traverse
)
(
\
p
->
(
p
,
cnt
))
v
)))
$
map
(
\
doc
->
List
.
zip
(
termsInText
(
buildPatterns
$
map
(
\
k
->
(
Text
.
splitOn
" "
$
unNgramsTerm
k
,
[]
))
orphans
)
$
Text
.
unlines
$
catMaybes
[
doc
^.
context_hyperdata
.
hd_title
,
doc
^.
context_hyperdata
.
hd_abstract
]
)
(
List
.
cycle
[
Map
.
fromList
$
[(
nt
,
Map
.
singleton
(
doc
^.
context_id
)
1
)]])
(
List
.
cycle
[
Map
.
fromList
$
[(
nt
,
Map
.
singleton
(
doc
^.
context_id
)
1
)]])
)
docs
-- printDebug "ngramsByDoc" ngramsByDoc
...
...
src/Gargantext/API/Ngrams/Tools.hs
View file @
da954a33
...
...
@@ -195,7 +195,7 @@ getCoocByNgrams' f (Diagonal diag) m =
listToCombi
identity
ks
]
where
where
ks
=
HM
.
keys
m
-- TODO k could be either k1 or k2 here
...
...
@@ -220,7 +220,7 @@ getCoocByNgrams'' (Diagonal diag) (f1,f2) (m1,m2) =
-- TODO check optim
-- listToCombi identity ks1
]
where
where
ks1
=
HM
.
keys
m1
ks2
=
HM
.
keys
m2
...
...
src/Gargantext/API/Search.hs
View file @
da954a33
...
...
@@ -18,6 +18,7 @@ module Gargantext.API.Search
where
import
Data.Aeson
hiding
(
defaultTaggedObject
)
import
Data.List
(
concat
)
import
Data.Maybe
(
fromMaybe
)
import
Data.Swagger
hiding
(
fieldLabelModifier
,
Contact
)
import
Data.Text
(
Text
)
...
...
@@ -55,7 +56,7 @@ api :: NodeId -> GargServer (API SearchResult)
api
nId
(
SearchQuery
q
SearchDoc
)
o
l
order
=
SearchResult
<$>
SearchResultDoc
<$>
map
(
toRow
nId
)
<$>
searchInCorpus
nId
False
q
o
l
order
<$>
searchInCorpus
nId
False
(
concat
q
)
o
l
order
api
nId
(
SearchQuery
q
SearchContact
)
o
l
order
=
do
printDebug
"isPairedWith"
nId
...
...
@@ -67,13 +68,15 @@ api nId (SearchQuery q SearchContact) o l order = do
Just
aId
->
SearchResult
<$>
SearchResultContact
<$>
map
(
toRow
aId
)
<$>
searchInCorpusWithContacts
nId
aId
q
o
l
order
<$>
searchInCorpusWithContacts
nId
aId
(
concat
q
)
o
l
order
api
_nId
(
SearchQuery
_q
SearchDocWithNgrams
)
_o
_l
_order
=
undefined
-----------------------------------------------------------------------
-----------------------------------------------------------------------
-- | Main Types
-----------------------------------------------------------------------
data
SearchType
=
SearchDoc
|
SearchContact
data
SearchType
=
SearchDoc
|
SearchContact
|
SearchDocWithNgrams
deriving
(
Generic
)
instance
FromJSON
SearchType
where
parseJSON
=
genericParseJSON
(
defaultOptions
{
sumEncoding
=
ObjectWithSingleField
})
...
...
@@ -85,7 +88,7 @@ instance Arbitrary SearchType where
-----------------------------------------------------------------------
data
SearchQuery
=
SearchQuery
{
query
::
!
[
Text
]
SearchQuery
{
query
::
!
[
[
Text
]
]
,
expected
::
!
SearchType
}
deriving
(
Generic
)
...
...
@@ -100,7 +103,7 @@ instance ToSchema SearchQuery
-}
instance
Arbitrary
SearchQuery
where
arbitrary
=
elements
[
SearchQuery
[
"electrodes"
]
SearchDoc
]
arbitrary
=
elements
[
SearchQuery
[
[
"electrodes"
]
]
SearchDoc
]
-- arbitrary = elements [SearchQuery "electrodes" 1 ] --SearchDoc]
-----------------------------------------------------------------------
data
SearchResult
=
...
...
@@ -132,7 +135,7 @@ instance FromJSON SearchResultTypes where
parseJSON
=
genericParseJSON
(
defaultOptions
{
sumEncoding
=
defaultTaggedObject
})
instance
ToJSON
SearchResultTypes
where
toJSON
=
genericToJSON
(
defaultOptions
{
sumEncoding
=
defaultTaggedObject
})
instance
Arbitrary
SearchResultTypes
where
arbitrary
=
do
srd
<-
SearchResultDoc
<$>
arbitrary
...
...
@@ -163,7 +166,7 @@ data Row =
deriving
(
Generic
)
instance
FromJSON
Row
where
parseJSON
=
genericParseJSON
parseJSON
=
genericParseJSON
(
defaultOptions
{
sumEncoding
=
defaultTaggedObject
}
)
instance
ToJSON
Row
where
...
...
src/Gargantext/Core/NodeStory.hs
View file @
da954a33
...
...
@@ -455,7 +455,7 @@ insertArchiveList c nodeId a = do
where
query
::
PGS
.
Query
query
=
[
sql
|
WITH s as (SELECT ? as sid, ? sversion, ? sngrams_type_id, ngrams.id as sngrams_id, ?::jsonb as srepo FROM ngrams WHERE terms = ?)
INSERT INTO node_stories(node_id, version, ngrams_type_id, ngrams_id, ngrams_repo_element)
INSERT INTO node_stories(node_id, version, ngrams_type_id, ngrams_id, ngrams_repo_element)
SELECT s.sid, s.sversion, s.sngrams_type_id, s.sngrams_id, s.srepo from s s join nodes n on s.sid = n.id
|]
...
...
@@ -505,22 +505,22 @@ updateNodeStory c nodeId@(NodeId _nId) currentArchive newArchive = do
--printDebug "[updateNodeStory] updates" $ Text.unlines $ (Text.pack . show) <$> updates
-- 2. Perform inserts/deletes/updates
printDebug
"[updateNodeStory] applying insert"
()
--
printDebug "[updateNodeStory] applying insert" ()
insertArchiveList
c
nodeId
$
Archive
{
_a_version
=
newArchive
^.
a_version
,
_a_history
=
[]
,
_a_state
=
archiveStateFromList
inserts
}
printDebug
"[updateNodeStory] insert applied"
()
--
printDebug "[updateNodeStory] insert applied" ()
--TODO Use currentArchive ^. a_version in delete and report error
-- if entries with (node_id, ngrams_type_id, ngrams_id) but
-- different version are found.
deleteArchiveList
c
nodeId
$
Archive
{
_a_version
=
newArchive
^.
a_version
,
_a_history
=
[]
,
_a_state
=
archiveStateFromList
deletes
}
printDebug
"[updateNodeStory] delete applied"
()
--
printDebug "[updateNodeStory] delete applied" ()
updateArchiveList
c
nodeId
$
Archive
{
_a_version
=
newArchive
^.
a_version
,
_a_history
=
[]
,
_a_state
=
archiveStateFromList
updates
}
printDebug
"[updateNodeStory] update applied"
()
--
printDebug "[updateNodeStory] update applied" ()
pure
()
-- where
...
...
src/Gargantext/Core/Text/Terms.hs
View file @
da954a33
...
...
@@ -28,8 +28,9 @@ compute graph
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TemplateHaskell #-}
module
Gargantext.Core.Text.Terms
where
...
...
@@ -47,6 +48,7 @@ import qualified Data.Set as Set
import
qualified
Data.Text
as
Text
import
qualified
Data.HashMap.Strict
as
HashMap
import
Gargantext.Core
import
Gargantext.Core.Utils
(
groupWithCounts
)
import
Gargantext.Core.Text
(
sentences
,
HasText
(
..
))
import
Gargantext.Core.Text.Terms.Eleve
(
mainEleveWith
,
Tries
,
Token
,
buildTries
,
toToken
)
import
Gargantext.Core.Text.Terms.Mono
(
monoTerms
)
...
...
@@ -70,24 +72,23 @@ data TermType lang
,
_tt_model
::
!
(
Maybe
(
Tries
Token
()
))
}
deriving
(
Generic
)
deriving
instance
(
Show
lang
)
=>
Show
(
TermType
lang
)
makeLenses
''
T
ermType
--group :: [Text] -> [Text]
--group = undefined
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
-- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------
-- | Sugar to extract terms from text (hidd
eng mapM
from end user).
-- | Sugar to extract terms from text (hidd
ing 'mapM'
from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms
::
TermType
Lang
->
[
Text
]
->
IO
[[
Terms
]]
extractTerms
::
TermType
Lang
->
[
Text
]
->
IO
[[
TermsWithCount
]]
extractTerms
(
Unsupervised
{
..
})
xs
=
mapM
(
terms
(
Unsupervised
{
_tt_model
=
Just
m'
,
..
}))
xs
where
m'
=
case
_tt_model
of
Just
m''
->
m''
Nothing
->
newTries
_tt_windowSize
(
Text
.
intercalate
" "
xs
)
extractTerms
termTypeLang
xs
=
mapM
(
terms
termTypeLang
)
xs
...
...
@@ -116,12 +117,13 @@ data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
instance
Hashable
ExtractedNgrams
-- | A typeclass that represents extracting ngrams from an entity.
class
ExtractNgramsT
h
where
extractNgramsT
::
HasText
h
=>
TermType
Lang
->
h
->
Cmd
err
(
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
))
->
Cmd
err
(
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
,
TermsCount
))
------------------------------------------------------------------------
enrichedTerms
::
Lang
->
PosTagAlgo
->
POS
->
Terms
->
NgramsPostag
enrichedTerms
l
pa
po
(
Terms
ng1
ng2
)
=
...
...
@@ -132,7 +134,7 @@ enrichedTerms l pa po (Terms ng1 ng2) =
------------------------------------------------------------------------
cleanNgrams
::
Int
->
Ngrams
->
Ngrams
cleanNgrams
s
ng
cleanNgrams
s
ng
|
Text
.
length
(
ng
^.
ngramsTerms
)
<
s
=
ng
|
otherwise
=
text2ngrams
(
Text
.
take
s
(
ng
^.
ngramsTerms
))
...
...
@@ -151,10 +153,10 @@ insertExtractedNgrams ngs = do
let
(
s
,
e
)
=
List
.
partition
isSimpleNgrams
ngs
m1
<-
insertNgrams
(
map
unSimpleNgrams
s
)
--printDebug "others" m1
m2
<-
insertNgramsPostag
(
map
unEnrichedNgrams
e
)
--printDebug "terms" m2
let
result
=
HashMap
.
union
m1
m2
pure
result
...
...
@@ -163,43 +165,41 @@ isSimpleNgrams (SimpleNgrams _) = True
isSimpleNgrams
_
=
False
------------------------------------------------------------------------
-- | Terms from
Text
--
Mono
: mono terms
--
Multi
: multi terms
--
MonoMulti
: mono and multi
-- | Terms from
'Text'
--
'Mono'
: mono terms
--
'Multi'
: multi terms
--
'MonoMulti'
: mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms
::
TermType
Lang
->
Text
->
IO
[
Terms
]
terms
::
TermType
Lang
->
Text
->
IO
[
Terms
WithCount
]
terms
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
terms
(
Multi
lang
)
txt
=
multiterms
lang
txt
terms
(
MonoMulti
lang
)
txt
=
terms
(
Multi
lang
)
txt
terms
(
Unsupervised
{
..
})
txt
=
termsUnsupervised
(
Unsupervised
{
_tt_model
=
Just
m'
,
..
})
txt
terms
(
Unsupervised
{
..
})
txt
=
pure
$
termsUnsupervised
(
Unsupervised
{
_tt_model
=
Just
m'
,
..
})
txt
where
m'
=
maybe
(
newTries
_tt_ngramsSize
txt
)
identity
_tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: remove IO
-- TODO: newtype BlockText
type
WindowSize
=
Int
type
MinNgramSize
=
Int
termsUnsupervised
::
TermType
Lang
->
Text
->
IO
[
Terms
]
termsUnsupervised
(
Unsupervised
l
n
s
m
)
=
pure
.
map
(
text2term
l
)
.
List
.
nub
.
(
List
.
filter
(
\
l'
->
List
.
length
l'
>=
s
))
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: newtype BlockText
termsUnsupervised
::
TermType
Lang
->
Text
->
[
TermsWithCount
]
termsUnsupervised
(
Unsupervised
{
_tt_model
=
Nothing
})
=
panic
"[termsUnsupervised] no model"
termsUnsupervised
(
Unsupervised
{
_tt_model
=
Just
_tt_model
,
..
})
=
map
(
\
(
t
,
cnt
)
->
(
text2term
_tt_lang
t
,
cnt
))
.
groupWithCounts
-- . List.nub
.
(
List
.
filter
(
\
l'
->
List
.
length
l'
>=
_tt_windowSize
))
.
List
.
concat
.
mainEleveWith
(
maybe
(
panic
"no model"
)
identity
m
)
n
.
mainEleveWith
_tt_model
_tt_ngramsSize
.
uniText
termsUnsupervised
_
=
undefined
newTries
::
Int
->
Text
->
Tries
Token
()
newTries
n
t
=
buildTries
n
(
fmap
toToken
$
uniText
t
)
...
...
@@ -217,5 +217,3 @@ text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
isPunctuation
::
Text
->
Bool
isPunctuation
x
=
List
.
elem
x
$
(
Text
.
pack
.
pure
)
<$>
(
"!?(),;.:"
::
String
)
src/Gargantext/Core/Text/Terms/Eleve.hs
View file @
da954a33
...
...
@@ -32,6 +32,7 @@ Notes for current implementation:
-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
...
...
@@ -278,6 +279,8 @@ data Tries k e = Tries
makeLenses
''
T
ries
deriving
instance
(
Show
k
,
Show
e
)
=>
Show
(
Tries
k
e
)
buildTries
::
Int
->
[[
Token
]]
->
Tries
Token
()
buildTries
n
sentences
=
Tries
{
_fwd
=
buildTrie
Forward
n
sentences
...
...
src/Gargantext/Core/Text/Terms/Mono.hs
View file @
da954a33
...
...
@@ -40,8 +40,8 @@ words = monoTexts
isSep
::
Char
->
Bool
isSep
=
(`
elem
`
(
",.:;?!(){}[]
\"\'
"
::
String
))
monoTerms
::
Lang
->
Text
->
[
Terms
]
monoTerms
l
txt
=
map
(
monoText2term
l
)
$
monoTexts
txt
monoTerms
::
Lang
->
Text
->
[
Terms
WithCount
]
monoTerms
l
txt
=
map
(
\
t
->
(
monoText2term
l
t
,
1
)
)
$
monoTexts
txt
monoTexts
::
Text
->
[
Text
]
monoTexts
=
L
.
concat
.
monoTextsBySentence
...
...
src/Gargantext/Core/Text/Terms/Multi.hs
View file @
da954a33
...
...
@@ -21,6 +21,7 @@ import Data.List (concat)
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Types
import
Gargantext.Core.Utils
(
groupWithCounts
)
import
Gargantext.Core.Text.Terms.Multi.PosTagging
import
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
...
...
@@ -37,14 +38,16 @@ import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
type
NLP_API
=
Lang
->
Text
->
IO
PosSentences
-------------------------------------------------------------------
multiterms
::
Lang
->
Text
->
IO
[
Terms
]
multiterms
=
multiterms'
tokenTag2terms
multiterms
::
Lang
->
Text
->
IO
[
TermsWithCount
]
multiterms
l
txt
=
do
ret
<-
multiterms'
tokenTag2terms
l
txt
pure
$
groupWithCounts
ret
where
multiterms'
::
(
TokenTag
->
a
)
->
Lang
->
Text
->
IO
[
a
]
multiterms'
f
lang
txt
=
concat
multiterms'
f
lang
txt
'
=
concat
<$>
map
(
map
f
)
<$>
map
(
filter
(
\
t
->
_my_token_pos
t
==
Just
NP
))
<$>
tokenTags
lang
txt
<$>
tokenTags
lang
txt
'
-------------------------------------------------------------------
tokenTag2terms
::
TokenTag
->
Terms
...
...
@@ -57,10 +60,10 @@ tokenTags l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <
tokenTagsWith
::
Lang
->
Text
->
NLP_API
->
IO
[[
TokenTag
]]
tokenTagsWith
lang
txt
nlp
=
map
(
groupTokens
lang
)
<$>
map
tokens2tokensTags
<$>
map
_sentenceTokens
<$>
_sentences
<$>
nlp
lang
txt
<$>
map
tokens2tokensTags
<$>
map
_sentenceTokens
<$>
_sentences
<$>
nlp
lang
txt
---- | This function analyses and groups (or not) ngrams according to
...
...
src/Gargantext/Core/Text/Terms/Multi/Group.hs
View file @
da954a33
...
...
@@ -30,4 +30,3 @@ group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2
_
_
[
x
@
(
TokenTag
_
_
(
Just
_
)
_
)]
=
[
x
]
group2
p1
p2
(
x
@
(
TokenTag
_
_
(
Just
_
)
_
)
:
y
@
(
TokenTag
_
_
Nothing
_
)
:
z
)
=
(
x
:
y
:
group2
p1
p2
(
y
:
z
))
group2
_
_
[]
=
[]
src/Gargantext/Core/Text/Terms/Multi/Lang/En.hs
View file @
da954a33
...
...
@@ -40,4 +40,3 @@ groupTokens ntags = group2 NP NP
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
--
--groupNgrams (x:xs) = (x:(groupNgrams xs))
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
View file @
da954a33
...
...
@@ -45,19 +45,22 @@ tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags
ts
=
filter'
$
map
tokenTag
ts
------------------------------------------------------------------------
tokenTag
::
Token
->
TokenTag
tokenTag
(
Token
_
w
_
l
_
_
p
n
_
_
)
=
TokenTag
w'
l'
p
n
tokenTag
(
Token
{
..
})
=
TokenTag
{
_my_token_word
=
w'
,
_my_token_lemma
=
l'
,
_my_token_pos
=
_tokenPos
,
_my_token_ner
=
_tokenNer
}
where
w'
=
split
w
l'
=
fromList
(
split
l
)
w'
=
split
_tokenWord
l'
=
fromList
(
split
_tokenLemma
)
split
=
splitOn
(
pack
" "
)
.
toLower
filter'
::
[
TokenTag
]
->
[
TokenTag
]
filter'
xs
=
filter
isNgrams
xs
where
isNgrams
(
TokenTag
_
_
p
n
)
=
isJust
p
||
isJust
n
isNgrams
(
TokenTag
{
..
})
=
isJust
_my_token_pos
||
isJust
_my_token_ner
------------------------------------------------------------------------
-- request =
-- request =
-- "fr" : {
-- "tokenize.language" : "fr",
-- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
...
...
@@ -66,9 +69,9 @@ filter' xs = filter isNgrams xs
-- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
-- "depparse.language" : "french",
-- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
-- "ssplit.newlineIsSentenceBreak": "always"
-- "ssplit.newlineIsSentenceBreak": "always"
-- },
--
--
corenlp'
::
(
FromJSON
a
,
ConvertibleStrings
p
ByteString
...
...
@@ -80,6 +83,7 @@ corenlp' lang txt = do
FR
->
"{
\"
annotators
\"
:
\"
tokenize,ssplit,pos,lemma,ner
\"
,
\"
parse.model
\"
:
\"
edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz
\"
,
\"
pos.model
\"
:
\"
edu/stanford/nlp/models/pos-tagger/french/french.tagger
\"
,
\"
tokenize.language
\"
:
\"
fr
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
_
->
panic
$
pack
"not implemented yet"
url
<-
parseRequest
$
"POST http://localhost:9000/?properties="
<>
properties
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
let
request
=
setRequestBodyLBS
(
cs
txt
)
url
httpJSON
request
...
...
src/Gargantext/Core/Text/Terms/Multi/PosTagging/Types.hs
View file @
da954a33
...
...
@@ -52,4 +52,3 @@ data PosSentences = PosSentences { _sentences :: [Sentence]}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_"
)
''
P
osSentences
)
src/Gargantext/Core/Text/Terms/WithList.hs
View file @
da954a33
{-|
Module : Gargantext.Core.Text.Terms.WithList
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
...
...
@@ -21,6 +21,8 @@ import Data.Text (Text, concat, unwords)
import
Gargantext.Prelude
import
Gargantext.Core.Text.Context
import
Gargantext.Core.Text.Terms.Mono
(
monoTextsBySentence
)
import
Gargantext.Core.Types
(
TermsCount
)
import
Gargantext.Core.Utils
(
groupWithCounts
)
import
Prelude
(
error
)
import
qualified
Data.Algorithms.KMP
as
KMP
import
qualified
Data.IntMap.Strict
as
IntMap
...
...
@@ -71,8 +73,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
--------------------------------------------------------------------------
-- Utils
type
MatchedText
=
Text
termsInText
::
Patterns
->
Text
->
[
MatchedText
]
termsInText
pats
txt
=
List
.
nub
termsInText
::
Patterns
->
Text
->
[
(
MatchedText
,
TermsCount
)
]
termsInText
pats
txt
=
groupWithCounts
$
List
.
concat
$
map
(
map
unwords
)
$
extractTermsWithList
pats
txt
...
...
@@ -95,7 +97,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
{- | Not used
filterWith :: TermList
-> (a -> Text)
-> [a]
-> [a]
-> [(a, [Text])]
filterWith termList f xs = filterWith' termList f zip xs
...
...
@@ -103,7 +105,7 @@ filterWith termList f xs = filterWith' termList f zip xs
filterWith' :: TermList
-> (a -> Text)
-> ([a] -> [[Text]] -> [b])
-> [a]
-> [a]
-> [b]
filterWith' termList f f' xs = f' xs
$ map (extractTermsWithList' pats)
...
...
src/Gargantext/Core/Types.hs
View file @
da954a33
{-|
Module : Gargantext.Types
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
...
...
@@ -17,7 +17,7 @@ commentary with @some markup@.
module
Gargantext.Core.Types
(
module
Gargantext
.
Core
.
Types
.
Main
,
module
Gargantext
.
Database
.
Admin
.
Types
.
Node
,
DebugMode
(
..
),
withDebugMode
,
Term
,
Terms
(
..
)
,
Term
,
Terms
(
..
)
,
TermsCount
,
TermsWithCount
,
TokenTag
(
..
),
POS
(
..
),
NER
(
..
)
,
Label
,
Stems
,
HasInvalidError
(
..
),
assertValid
...
...
@@ -70,10 +70,13 @@ type Label = [Text]
data
Terms
=
Terms
{
_terms_label
::
Label
,
_terms_stem
::
Stems
}
deriving
(
Ord
,
Show
)
instance
Eq
Terms
where
(
==
)
(
Terms
_
s1
)
(
Terms
_
s2
)
=
s1
==
s2
type
TermsCount
=
Int
type
TermsWithCount
=
(
Terms
,
TermsCount
)
------------------------------------------------------------------------
data
Tag
=
POS
|
NER
deriving
(
Show
,
Eq
)
...
...
@@ -208,5 +211,3 @@ data TODO = TODO
instance
ToSchema
TODO
where
instance
ToParamSchema
TODO
where
----------------------------------------------------------------------------
src/Gargantext/Core/Utils.hs
View file @
da954a33
{-|
Module : Gargantext.Utils
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
...
...
@@ -12,16 +12,19 @@ commentary with @some markup@.
-}
module
Gargantext.Core.Utils
(
module
Gargantext.Core.Utils
(
-- module Gargantext.Utils.Chronos
module
Gargantext
.
Core
.
Utils
.
Prefix
,
something
,
alphanum
,
choices
,
randomString
,
groupWithCounts
,
addTuples
)
where
import
Data.Char
(
chr
,
ord
)
import
qualified
Data.List
as
List
import
Data.Maybe
import
Data.Monoid
import
Data.Text
(
Text
,
pack
)
...
...
@@ -57,3 +60,17 @@ randomString :: Int -> IO Text
randomString
num
=
do
str
<-
choices
num
alphanum
pure
$
pack
str
-- | Given a list of items of type 'a', return list with unique items
-- (like List.nub) but tuple-d with their counts in the original list
groupWithCounts
::
(
Ord
a
,
Eq
a
)
=>
[
a
]
->
[(
a
,
Int
)]
groupWithCounts
=
map
f
.
List
.
group
.
List
.
sort
where
f
[]
=
panic
"[groupWithCounts] impossible"
f
ts
@
(
t
:
_
)
=
(
t
,
length
ts
)
addTuples
::
(
Num
a
,
Num
b
)
=>
(
a
,
b
)
->
(
a
,
b
)
->
(
a
,
b
)
addTuples
(
a1
,
b1
)
(
a2
,
b2
)
=
(
a1
+
a2
,
b1
+
b2
)
src/Gargantext/Core/Viz/Phylo/Legacy/LegacyMain.hs
View file @
da954a33
...
...
@@ -65,7 +65,7 @@ flowPhylo cId = do
patterns
=
buildPatterns
termList
-- | To filter the Ngrams of a document based on the termList
filterTerms
::
Patterns
->
(
Date
,
Text
)
->
(
Date
,
[
Text
])
filterTerms
patterns'
(
y
,
d
)
=
(
y
,
termsInText
patterns'
d
)
filterTerms
patterns'
(
y
,
d
)
=
(
y
,
fst
<$>
termsInText
patterns'
d
)
docs
=
map
((
\
(
y
,
t
)
->
Document
y
t
)
.
filterTerms
patterns
)
docs'
...
...
@@ -123,4 +123,3 @@ writePhylo _fp _phview = undefined
-- refactor 2021
-- viewPhylo2Svg :: PhyloView -> IO DB.ByteString
-- viewPhylo2Svg p = graphvizWithHandle Dot (viewToDot p) Svg DB.hGetContents
src/Gargantext/Database/Action/Flow.hs
View file @
da954a33
...
...
@@ -48,7 +48,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
where
import
Conduit
import
Control.Lens
((
^.
),
view
,
_Just
,
makeLenses
)
import
Control.Lens
((
^.
),
view
,
_Just
,
makeLenses
,
over
,
traverse
)
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.Conduit.Internal
(
zipSources
)
import
Data.Either
...
...
@@ -60,7 +60,6 @@ import Data.Maybe (catMaybes)
import
Data.Monoid
import
Data.Swagger
import
qualified
Data.Text
as
T
import
Data.Traversable
(
traverse
)
import
Data.Tuple.Extra
(
first
,
second
)
import
GHC.Generics
(
Generic
)
import
Servant.Client
(
ClientError
)
...
...
@@ -83,9 +82,10 @@ import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(.
import
Gargantext.Core.Text.List.Social
(
FlowSocialListWith
)
import
Gargantext.Core.Text.Terms
import
Gargantext.Core.Text.Terms.Mono.Stem.En
(
stemIt
)
import
Gargantext.Core.Types
(
POS
(
NP
))
import
Gargantext.Core.Types
(
POS
(
NP
)
,
TermsCount
)
import
Gargantext.Core.Types.Individu
(
User
(
..
))
import
Gargantext.Core.Types.Main
import
Gargantext.Core.Utils
(
addTuples
)
import
Gargantext.Core.Utils.Prefix
(
unPrefix
,
unPrefixSwagger
)
import
Gargantext.Database.Action.Flow.List
import
Gargantext.Database.Action.Flow.Types
...
...
@@ -284,7 +284,7 @@ flow c u cn la mfslw (mLength, docsC) logStatus = do
,
_scst_events
=
Just
[]
}
pure
$
Prelude
.
head
id
------------------------------------------------------------------------
...
...
@@ -357,25 +357,27 @@ insertMasterDocs c lang hs = do
-- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs'
::
HashMap
ExtractedNgrams
(
Map
NgramsType
(
Map
NodeId
Int
))
mapNgramsDocs'
::
HashMap
ExtractedNgrams
(
Map
NgramsType
(
Map
NodeId
(
Int
,
TermsCount
)
))
<-
mapNodeIdNgrams
<$>
documentIdWithNgrams
(
extractNgramsT
$
withLang
lang
documentsWithId
)
documentsWithId
(
extractNgramsT
$
withLang
lang
documentsWithId
)
documentsWithId
lId
<-
getOrMkList
masterCorpusId
masterUserId
-- _ <- saveDocNgramsWith lId mapNgramsDocs'
_
<-
saveDocNgramsWith
lId
mapNgramsDocs'
-- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
pure
ids'
saveDocNgramsWith
::
(
FlowCmdM
env
err
m
)
saveDocNgramsWith
::
(
FlowCmdM
env
err
m
)
=>
ListId
->
HashMap
ExtractedNgrams
(
Map
NgramsType
(
Map
NodeId
Int
))
->
HashMap
ExtractedNgrams
(
Map
NgramsType
(
Map
NodeId
(
Int
,
TermsCount
)
))
->
m
()
saveDocNgramsWith
lId
mapNgramsDocs'
=
do
terms2id
<-
insertExtractedNgrams
$
HashMap
.
keys
mapNgramsDocs'
--printDebug "terms2id" terms2id
--printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
let
mapNgramsDocsNoCount
=
over
(
traverse
.
traverse
.
traverse
)
fst
mapNgramsDocs'
terms2id
<-
insertExtractedNgrams
$
HashMap
.
keys
mapNgramsDocsNoCount
let
mapNgramsDocs
=
HashMap
.
mapKeys
extracted2ngrams
mapNgramsDocs'
...
...
@@ -392,7 +394,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
<*>
Just
(
fromIntegral
w
::
Double
)
|
(
terms''
,
mapNgramsTypes
)
<-
HashMap
.
toList
mapNgramsDocs
,
(
ngrams_type
,
mapNodeIdWeight
)
<-
Map
.
toList
mapNgramsTypes
,
(
nId
,
w
)
<-
Map
.
toList
mapNodeIdWeight
,
(
nId
,
(
w
,
_cnt
)
)
<-
Map
.
toList
mapNodeIdWeight
]
-- to be removed
...
...
@@ -451,28 +453,32 @@ mergeData rs = catMaybes . map toDocumentWithId . Map.toList
------------------------------------------------------------------------
documentIdWithNgrams
::
HasNodeError
err
=>
(
a
->
Cmd
err
(
HashMap
b
(
Map
NgramsType
Int
)))
->
Cmd
err
(
HashMap
b
(
Map
NgramsType
Int
,
TermsCount
)))
->
[
Indexed
NodeId
a
]
->
Cmd
err
[
DocumentIdWithNgrams
a
b
]
documentIdWithNgrams
f
=
traverse
toDocumentIdWithNgrams
where
toDocumentIdWithNgrams
d
=
do
e
<-
f
$
_unIndex
d
pure
$
DocumentIdWithNgrams
d
e
pure
$
DocumentIdWithNgrams
d
e
-- | TODO check optimization
mapNodeIdNgrams
::
(
Ord
b
,
Hashable
b
)
=>
[
DocumentIdWithNgrams
a
b
]
->
HashMap
b
(
Map
NgramsType
(
Map
NodeId
Int
)
(
Map
NgramsType
(
Map
NodeId
(
Int
,
TermsCount
)
)
)
mapNodeIdNgrams
=
HashMap
.
unionsWith
(
Map
.
unionWith
(
Map
.
unionWith
(
+
)
))
.
fmap
f
mapNodeIdNgrams
=
HashMap
.
unionsWith
(
Map
.
unionWith
(
Map
.
unionWith
addTuples
))
.
fmap
f
where
-- | NOTE We are somehow multiplying 'TermsCount' here: If the
-- same ngrams term has different ngrams types, the 'TermsCount'
-- for it (which is the number of times the terms appears in a
-- document) is copied over to all its types.
f
::
DocumentIdWithNgrams
a
b
->
HashMap
b
(
Map
NgramsType
(
Map
NodeId
Int
))
f
d
=
fmap
(
fmap
(
Map
.
singleton
nId
)
)
$
documentNgrams
d
->
HashMap
b
(
Map
NgramsType
(
Map
NodeId
(
Int
,
TermsCount
)
))
f
d
=
fmap
(
\
(
ngramsTypeMap
,
cnt
)
->
fmap
(
\
i
->
Map
.
singleton
nId
(
i
,
cnt
))
ngramsTypeMap
)
$
documentNgrams
d
where
nId
=
_index
$
documentWithId
d
...
...
@@ -483,25 +489,25 @@ instance ExtractNgramsT HyperdataContact
extractNgramsT
l
hc
=
HashMap
.
mapKeys
(
cleanExtractedNgrams
255
)
<$>
extract
l
hc
where
extract
::
TermType
Lang
->
HyperdataContact
->
Cmd
err
(
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
))
->
Cmd
err
(
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
,
TermsCount
))
extract
_l
hc'
=
do
let
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
\
a
->
[
a
])
$
view
(
hc_who
.
_Just
.
cw_lastName
)
hc'
pure
$
HashMap
.
fromList
$
[(
SimpleNgrams
a'
,
Map
.
singleton
Authors
1
)
|
a'
<-
authors
]
pure
$
HashMap
.
fromList
$
[(
SimpleNgrams
a'
,
(
Map
.
singleton
Authors
1
,
1
)
)
|
a'
<-
authors
]
instance
ExtractNgramsT
HyperdataDocument
where
extractNgramsT
::
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
))
->
Cmd
err
(
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
,
TermsCount
))
extractNgramsT
lang
hd
=
HashMap
.
mapKeys
(
cleanExtractedNgrams
255
)
<$>
extractNgramsT'
lang
hd
where
extractNgramsT'
::
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
))
->
Cmd
err
(
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
,
TermsCount
))
extractNgramsT'
lang'
doc
=
do
let
source
=
text2ngrams
$
maybe
"Nothing"
identity
...
...
@@ -515,23 +521,23 @@ instance ExtractNgramsT HyperdataDocument
$
maybe
[
"Nothing"
]
(
T
.
splitOn
", "
)
$
_hd_authors
doc
terms
'
<-
map
(
enrichedTerms
(
lang'
^.
tt_lang
)
CoreNLP
NP
)
<$>
concat
<$>
liftBase
(
extractTerms
lang'
$
hasText
doc
)
terms
WithCounts'
<-
map
(
\
(
t
,
cnt
)
->
(
enrichedTerms
(
lang'
^.
tt_lang
)
CoreNLP
NP
t
,
cnt
)
)
<$>
concat
<$>
liftBase
(
extractTerms
lang'
$
hasText
doc
)
pure
$
HashMap
.
fromList
$
[(
SimpleNgrams
source
,
Map
.
singleton
Sources
1
)
]
<>
[(
SimpleNgrams
i'
,
Map
.
singleton
Institutes
1
)
|
i'
<-
institutes
]
<>
[(
SimpleNgrams
a'
,
Map
.
singleton
Authors
1
)
|
a'
<-
authors
]
<>
[(
EnrichedNgrams
t'
,
Map
.
singleton
NgramsTerms
1
)
|
t'
<-
term
s'
]
$
[(
SimpleNgrams
source
,
(
Map
.
singleton
Sources
1
,
1
)
)
]
<>
[(
SimpleNgrams
i'
,
(
Map
.
singleton
Institutes
1
,
1
)
)
|
i'
<-
institutes
]
<>
[(
SimpleNgrams
a'
,
(
Map
.
singleton
Authors
1
,
1
)
)
|
a'
<-
authors
]
<>
[(
EnrichedNgrams
t'
,
(
Map
.
singleton
NgramsTerms
1
,
cnt'
))
|
(
t'
,
cnt'
)
<-
termsWithCount
s'
]
instance
(
ExtractNgramsT
a
,
HasText
a
)
=>
ExtractNgramsT
(
Node
a
)
where
extractNgramsT
l
(
Node
_
_
_
_
_
_
_
h
)
=
extractNgramsT
l
h
extractNgramsT
l
(
Node
{
_node_hyperdata
=
h
}
)
=
extractNgramsT
l
h
instance
HasText
a
=>
HasText
(
Node
a
)
where
hasText
(
Node
_
_
_
_
_
_
_
h
)
=
hasText
h
hasText
(
Node
{
_node_hyperdata
=
h
}
)
=
hasText
h
...
...
src/Gargantext/Database/Action/Flow/Utils.hs
View file @
da954a33
...
...
@@ -15,6 +15,7 @@ module Gargantext.Database.Action.Flow.Utils
import
Data.Map
(
Map
)
import
Data.HashMap.Strict
(
HashMap
)
import
Gargantext.Core.Types
(
TermsCount
)
import
Gargantext.Database.Admin.Types.Node
import
Gargantext.Database.Prelude
(
Cmd
)
import
Gargantext.Database.Query.Table.ContextNodeNgrams
...
...
@@ -29,24 +30,24 @@ import qualified Data.HashMap.Strict as HashMap
data
DocumentIdWithNgrams
a
b
=
DocumentIdWithNgrams
{
documentWithId
::
Indexed
NodeId
a
,
documentNgrams
::
HashMap
b
(
Map
NgramsType
Int
)
,
documentNgrams
::
HashMap
b
(
Map
NgramsType
Int
,
TermsCount
)
}
deriving
(
Show
)
insertDocNgrams
::
ListId
->
HashMap
(
Indexed
NgramsId
Ngrams
)
(
Map
NgramsType
(
Map
DocId
Int
))
->
HashMap
(
Indexed
NgramsId
Ngrams
)
(
Map
NgramsType
(
Map
DocId
(
Int
,
TermsCount
)
))
->
Cmd
err
Int
insertDocNgrams
lId
m
=
insertContextNodeNgrams
ns
insertDocNgrams
lId
m
=
do
printDebug
"[insertDocNgrams] ns"
ns
insertContextNodeNgrams
ns
where
ns
=
[
ContextNodeNgrams
docId
lId
(
ng
^.
index
)
(
ngramsTypeId
t
)
(
fromIntegral
i
)
cnt
|
(
ng
,
t2n2i
)
<-
HashMap
.
toList
m
,
(
t
,
n2i
)
<-
DM
.
toList
t2n2i
,
(
docId
,
i
)
<-
DM
.
toList
n2i
,
(
docId
,
(
i
,
cnt
)
)
<-
DM
.
toList
n2i
]
-- [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})]
-- {Ngrams: {NgramsType: {NodeId: (Int, TermsCount)}}}
src/Gargantext/Database/Action/Search.hs
View file @
da954a33
...
...
@@ -14,7 +14,10 @@ module Gargantext.Database.Action.Search where
import
Control.Arrow
(
returnA
)
import
Control.Lens
((
^.
))
import
qualified
Data.List
as
List
import
qualified
Data.Map
as
Map
import
Data.Maybe
import
qualified
Data.Set
as
Set
import
Data.Text
(
Text
,
unpack
,
intercalate
)
import
Data.Time
(
UTCTime
)
import
Gargantext.Core
...
...
@@ -26,8 +29,11 @@ import Gargantext.Database.Query.Filter
import
Gargantext.Database.Query.Join
(
leftJoin5
)
import
Gargantext.Database.Query.Table.Node
import
Gargantext.Database.Query.Table.Context
import
Gargantext.Database.Query.Table.ContextNodeNgrams
(
queryContextNodeNgramsTable
)
import
Gargantext.Database.Query.Table.NodeContext
import
Gargantext.Database.Query.Table.NodeContext_NodeContext
import
Gargantext.Database.Schema.ContextNodeNgrams
(
ContextNodeNgramsPoly
(
..
))
import
Gargantext.Database.Schema.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Database.Schema.Node
import
Gargantext.Database.Schema.Context
import
Gargantext.Prelude
...
...
@@ -43,7 +49,7 @@ searchDocInDatabase :: HasDBid NodeType
->
Cmd
err
[(
NodeId
,
HyperdataDocument
)]
searchDocInDatabase
p
t
=
runOpaQuery
(
queryDocInDatabase
p
t
)
where
-- | Global search query where ParentId is Master Node Corpus Id
-- | Global search query where ParentId is Master Node Corpus Id
queryDocInDatabase
::
ParentId
->
Text
->
O
.
Select
(
Column
SqlInt4
,
Column
SqlJsonb
)
queryDocInDatabase
_p
q
=
proc
()
->
do
row
<-
queryNodeSearchTable
-<
()
...
...
@@ -51,6 +57,75 @@ searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
restrict
-<
(
_ns_typename
row
)
.==
(
sqlInt4
$
toDBid
NodeDocument
)
returnA
-<
(
_ns_id
row
,
_ns_hyperdata
row
)
------------------------------------------------------------------------
-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
-- search only to map/candidate terms.
searchInCorpusWithNgrams
::
HasDBid
NodeType
=>
CorpusId
->
ListId
->
IsTrash
->
NgramsType
->
[[
Text
]]
->
Maybe
Offset
->
Maybe
Limit
->
Maybe
OrderBy
->
Cmd
err
[
FacetDoc
]
searchInCorpusWithNgrams
_cId
_lId
_t
_ngt
_q
_o
_l
_order
=
undefined
-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
-- case only the "TF" part makes sense and so we only compute the
-- ratio of "number of times our terms appear in given document" and
-- "number of all terms in document" and return a sorted list of
-- document ids
tfidfAll
::
CorpusId
->
[
Int
]
->
Cmd
err
[
Int
]
tfidfAll
cId
ngramIds
=
do
let
ngramIdsSet
=
Set
.
fromList
ngramIds
docsWithNgrams
<-
runOpaQuery
(
queryCorpusWithNgrams
cId
ngramIds
)
::
Cmd
err
[(
Int
,
Int
,
Int
)]
-- NOTE The query returned docs with ANY ngramIds. We need to further
-- restrict to ALL ngramIds.
let
docsNgramsM
=
Map
.
fromListWith
(
Set
.
union
)
[
(
ctxId
,
Set
.
singleton
ngrams_id
)
|
(
ctxId
,
ngrams_id
,
_
)
<-
docsWithNgrams
]
let
docsWithAllNgramsS
=
Set
.
fromList
$
List
.
map
fst
$
List
.
filter
(
\
(
_
,
docNgrams
)
->
ngramIdsSet
==
Set
.
intersection
ngramIdsSet
docNgrams
)
$
Map
.
toList
docsNgramsM
let
docsWithAllNgrams
=
List
.
filter
(
\
(
ctxId
,
_
,
_
)
->
Set
.
member
ctxId
docsWithAllNgramsS
)
docsWithNgrams
printDebug
"[tfidfAll] docsWithAllNgrams"
docsWithAllNgrams
let
docsWithCounts
=
Map
.
fromListWith
(
+
)
[
(
ctxId
,
doc_count
)
|
(
ctxId
,
_
,
doc_count
)
<-
docsWithAllNgrams
]
printDebug
"[tfidfAll] docsWithCounts"
docsWithCounts
let
totals
=
[
(
ctxId
,
ngrams_id
,
fromIntegral
doc_count
::
Double
,
fromIntegral
(
fromMaybe
0
$
Map
.
lookup
ctxId
docsWithCounts
)
::
Double
)
|
(
ctxId
,
ngrams_id
,
doc_count
)
<-
docsWithAllNgrams
]
let
tfidf_sorted
=
List
.
sortOn
snd
[(
ctxId
,
doc_count
/
s
)
|
(
ctxId
,
_
,
doc_count
,
s
)
<-
totals
]
pure
$
List
.
map
fst
$
List
.
reverse
tfidf_sorted
-- | Query for searching the 'context_node_ngrams' table so that we
-- find docs with ANY given 'ngramIds'.
queryCorpusWithNgrams
::
CorpusId
->
[
Int
]
->
Select
(
Column
SqlInt4
,
Column
SqlInt4
,
Column
SqlInt4
)
queryCorpusWithNgrams
cId
ngramIds
=
proc
()
->
do
row
<-
queryContextNodeNgramsTable
-<
()
restrict
-<
(
_cnng_node_id
row
)
.==
(
pgNodeId
cId
)
restrict
-<
in_
(
sqlInt4
<$>
ngramIds
)
(
_cnng_ngrams_id
row
)
returnA
-<
(
_cnng_context_id
row
,
_cnng_ngrams_id
row
,
_cnng_doc_count
row
)
--returnA -< row
-- returnA -< ( _cnng_context_id row
-- , _cnng_node_id row
-- , _cnng_ngrams_id row
-- , _cnng_ngramsType row
-- , _cnng_weight row
-- , _cnng_doc_count row)
------------------------------------------------------------------------
-- | todo add limit and offset and order
searchInCorpus
::
HasDBid
NodeType
...
...
@@ -209,4 +284,3 @@ queryContactViaDoc =
)
)
->
Column
SqlBool
cond45
(
doc
,
(
corpus
,
(
_
,(
_
,
_
))))
=
doc
^.
cs_id
.==
corpus
^.
nc_context_id
src/Gargantext/Database/Query/Facet.hs
View file @
da954a33
...
...
@@ -103,7 +103,7 @@ data Facet id created title hyperdata category ngramCount score =
,
facetDoc_score
::
score
}
deriving
(
Show
,
Generic
)
{- | TODO after demo
data Facet id date hyperdata score =
data Facet id date hyperdata score =
FacetDoc { facetDoc_id :: id
, facetDoc_date :: date
, facetDoc_hyperdata :: hyperdata
...
...
@@ -163,7 +163,7 @@ type FacetPairedReadNull = FacetPaired (Column (Nullable SqlInt4) )
(
Column
(
Nullable
SqlInt4
)
)
type
FacetPairedReadNullAgg
=
FacetPaired
(
Aggregator
(
Column
(
Nullable
SqlInt4
)
)
(
Column
(
Nullable
SqlInt4
)
)
(
Column
(
Nullable
SqlInt4
)
)
)
(
Aggregator
(
Column
(
Nullable
SqlTimestamptz
))
(
Column
(
Nullable
SqlTimestamptz
))
...
...
src/Gargantext/Database/Query/Join.hs
View file @
da954a33
...
...
@@ -43,7 +43,7 @@ import Opaleye.Internal.Join (NullMaker(..))
import
qualified
Opaleye.Internal.Unpackspec
()
keepWhen
::
(
a
->
Field
SqlBool
)
->
SelectArr
a
a
keepWhen
::
(
a
->
Field
SqlBool
)
->
SelectArr
a
a
keepWhen
p
=
proc
a
->
do
restrict
-<
p
a
returnA
-<
a
...
...
@@ -61,7 +61,7 @@ leftJoin2 = leftJoin
------------------------------------------------------------------------
-- | LeftJoin3 in two ways to write it
leftJoin3
::
Select
columnsA
->
Select
columnsB
->
Select
columnsC
->
((
columnsA
,
columnsB
,
columnsC
)
->
Column
SqlBool
)
->
((
columnsA
,
columnsB
,
columnsC
)
->
Column
SqlBool
)
->
Select
(
columnsA
,
columnsB
,
columnsC
)
leftJoin3
q1
q2
q3
cond
=
((,,)
<$>
q1
<*>
q2
<*>
q3
)
>>>
keepWhen
cond
...
...
@@ -82,7 +82,7 @@ leftJoin4' :: Select columnsA
->
Select
columnsB
->
Select
columnsC
->
Select
columnsD
->
((
columnsA
,
columnsB
,
columnsC
,
columnsD
)
->
Column
SqlBool
)
->
((
columnsA
,
columnsB
,
columnsC
,
columnsD
)
->
Column
SqlBool
)
->
Select
(
columnsA
,
columnsB
,
columnsC
,
columnsD
)
leftJoin4'
q1
q2
q3
q4
cond
=
((,,,)
<$>
q1
<*>
q2
<*>
q3
<*>
q4
)
>>>
keepWhen
cond
...
...
@@ -375,4 +375,3 @@ leftJoin9 q1 q2 q3 q4 q5 q6 q7 q8 q9
)
cond67
)
cond78
)
cond89
src/Gargantext/Database/Query/Table/ContextNodeNgrams.hs
View file @
da954a33
...
...
@@ -36,12 +36,13 @@ queryContextNodeNgramsTable = selectTable contextNodeNgramsTable
-- | Insert utils
insertContextNodeNgrams
::
[
ContextNodeNgrams
]
->
Cmd
err
Int
insertContextNodeNgrams
=
insertContextNodeNgramsW
.
map
(
\
(
ContextNodeNgrams
c
n
ng
nt
w
)
->
.
map
(
\
(
ContextNodeNgrams
c
n
ng
nt
w
dc
)
->
ContextNodeNgrams
(
pgContextId
c
)
(
pgNodeId
n
)
(
sqlInt4
ng
)
(
pgNgramsTypeId
nt
)
(
sqlDouble
w
)
(
sqlInt4
dc
)
)
insertContextNodeNgramsW
::
[
ContextNodeNgramsWrite
]
->
Cmd
err
Int
...
...
src/Gargantext/Database/Query/Table/Node/UpdateOpaleye.hs
View file @
da954a33
...
...
@@ -35,8 +35,9 @@ updateHyperdata i h = mkCmd $ \c -> putStrLn "before runUpdate_" >>
updateHyperdataQuery
::
ToJSON
a
=>
NodeId
->
a
->
Update
Int64
updateHyperdataQuery
i
h
=
seq
h'
$
trace
"updateHyperdataQuery: encoded JSON"
$
Update
{
uTable
=
nodeTable
,
uUpdateWith
=
updateEasy
(
\
(
Node
_ni
_nh
_nt
_nu
_np
_nn
_nd
_h
)
->
trace
"updating mate"
$
Node
_ni
_nh
_nt
_nu
_np
_nn
_nd
h'
,
uUpdateWith
=
updateEasy
(
\
(
Node
{
..
})
->
Node
{
_node_hyperdata
=
h'
,
..
}
-- -> trace "updating mate" $ Node _ni _nh _nt _nu _np _nn _nd h'
)
,
uWhere
=
(
\
row
->
{-trace "uWhere" $-}
_node_id
row
.==
pgNodeId
i
)
,
uReturning
=
rCount
...
...
@@ -63,5 +64,3 @@ updateNodesWithType_ :: ( HasNodeError err
updateNodesWithType_
nt
h
=
do
ns
<-
getNodesIdWithType
nt
mapM
(
\
n
->
updateHyperdata
n
h
)
ns
src/Gargantext/Database/Schema/ContextNodeNgrams.hs
View file @
da954a33
...
...
@@ -19,6 +19,7 @@ module Gargantext.Database.Schema.ContextNodeNgrams
where
import
Prelude
import
Gargantext.Core.Types
(
TermsCount
)
import
Gargantext.Database.Schema.Prelude
import
Gargantext.Database.Schema.Ngrams
(
NgramsTypeId
,
NgramsId
)
import
Gargantext.Database.Admin.Types.Node
...
...
@@ -26,15 +27,16 @@ import Gargantext.Database.Admin.Types.Node
type
ContextNodeNgrams
=
ContextNodeNgramsPoly
ContextId
ListId
NgramsId
NgramsTypeId
Double
ContextNodeNgramsPoly
ContextId
ListId
NgramsId
NgramsTypeId
Double
TermsCount
data
ContextNodeNgramsPoly
c
n
ngrams_id
ngt
w
data
ContextNodeNgramsPoly
c
n
ngrams_id
ngt
w
dc
=
ContextNodeNgrams
{
_cnng_context_id
::
!
c
,
_cnng_node_id
::
!
n
,
_cnng_ngrams_id
::
!
ngrams_id
,
_cnng_ngramsType
::
!
ngt
,
_cnng_weight
::
!
w
,
_cnng_doc_count
::
!
dc
}
deriving
(
Show
)
type
ContextNodeNgramsWrite
=
...
...
@@ -43,6 +45,7 @@ type ContextNodeNgramsWrite =
(
Column
SqlInt4
)
(
Column
SqlInt4
)
(
Column
SqlFloat8
)
(
Column
SqlInt4
)
type
ContextNodeNgramsRead
=
ContextNodeNgramsPoly
(
Column
SqlInt4
)
...
...
@@ -50,6 +53,7 @@ type ContextNodeNgramsRead =
(
Column
SqlInt4
)
(
Column
SqlInt4
)
(
Column
SqlFloat8
)
(
Column
SqlInt4
)
type
ContextNodeNgramsReadNull
=
ContextNodeNgramsPoly
(
Column
(
Nullable
SqlInt4
))
...
...
@@ -57,6 +61,7 @@ type ContextNodeNgramsReadNull =
(
Column
(
Nullable
SqlInt4
))
(
Column
(
Nullable
SqlInt4
))
(
Column
(
Nullable
SqlFloat8
))
(
Column
(
Nullable
SqlInt4
))
$
(
makeAdaptorAndInstance
"pContextNodeNgrams"
''
C
ontextNodeNgramsPoly
)
makeLenses
''
C
ontextNodeNgramsPoly
...
...
@@ -70,5 +75,6 @@ contextNodeNgramsTable = Table "context_node_ngrams"
,
_cnng_ngrams_id
=
requiredTableField
"ngrams_id"
,
_cnng_ngramsType
=
requiredTableField
"ngrams_type"
,
_cnng_weight
=
requiredTableField
"weight"
,
_cnng_doc_count
=
requiredTableField
"doc_count"
}
)
src/Gargantext/Database/Schema/NodeNgrams.hs
View file @
da954a33
{-|
Module : Gargantext.Database.Schema.NodeNgrams
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
...
...
@@ -86,7 +86,7 @@ type NgramsClass = Int
type
NgramsText
=
Text
-- Example of list Ngrams
-- type ListNgrams = NodeNgramsPoly (Maybe Int) ListType Text
-- type ListNgrams = NodeNgramsPoly (Maybe Int) ListType Text
type
NodeNgramsW
=
NodeNgramsPoly
(
Maybe
NodeNgramsId
)
NodeId
ListType
NgramsText
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment