haskell-gargantext: Commit 3b163685

Authored Oct 27, 2022 by Przemyslaw Kaminski

    [ngrams] first version of ngrams counting in docs

Parent: 0d6a5eaa
Pipeline #3329 passed in 91 minutes and 4 seconds
Showing 34 changed files with 230 additions and 151 deletions (+230, -151):

  README.md                                                    +2   -0
  bin/gargantext-upgrade/Main.hs                               +1   -2
  devops/docker/docker-compose.yaml                            +1   -1
  devops/postgres/schema.sql                                   +1   -0
  devops/postgres/upgrade/0.0.6.7.3.sql                        +2   -0
  gargantext.cabal                                             +5   -4
  package.yaml                                                 +2   -0
  src-test/Core/Text.hs                                        +0   -1
  src-test/Core/Utils.hs                                       +28  -0
  src-test/Graph/Clustering.hs                                 +0   -2
  src-test/Main.hs                                             +4   -1
  src-test/Utils/Crypto.hs                                     +1   -3
  src/Gargantext/API/Ngrams.hs                                 +4   -4
  src/Gargantext/API/Ngrams/List.hs                            +8   -8
  src/Gargantext/API/Ngrams/Tools.hs                           +2   -2
  src/Gargantext/Core/NodeStory.hs                             +5   -5
  src/Gargantext/Core/Text/Terms.hs                            +39  -34
  src/Gargantext/Core/Text/Terms/Eleve.hs                      +3   -0
  src/Gargantext/Core/Text/Terms/Mono.hs                       +2   -2
  src/Gargantext/Core/Text/Terms/Multi.hs                      +11  -8
  src/Gargantext/Core/Text/Terms/Multi/Group.hs                +0   -1
  src/Gargantext/Core/Text/Terms/Multi/Lang/En.hs              +0   -1
  src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs           +11  -7
  src/Gargantext/Core/Text/Terms/Multi/PosTagging/Types.hs     +0   -1
  src/Gargantext/Core/Text/Terms/WithList.hs                   +7   -5
  src/Gargantext/Core/Types.hs                                 +6   -5
  src/Gargantext/Core/Utils.hs                                 +19  -2
  src/Gargantext/Core/Viz/Phylo/Legacy/LegacyMain.hs           +1   -2
  src/Gargantext/Database/Action/Flow.hs                       +40  -32
  src/Gargantext/Database/Action/Flow/Utils.hs                 +10  -9
  src/Gargantext/Database/Query/Table/ContextNodeNgrams.hs     +2   -1
  src/Gargantext/Database/Query/Table/Node/UpdateOpaleye.hs    +3   -4
  src/Gargantext/Database/Schema/ContextNodeNgrams.hs          +8   -2
  src/Gargantext/Database/Schema/NodeNgrams.hs                 +2   -2
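The change that runs through most of these files is a widening of the ngrams maps so that every entry also records how many times a term occurs in a document. A type-level sketch of the before and after shapes, assembled from the diffs below (illustrative only, not part of the patch):

```haskell
-- New aliases introduced in Gargantext.Core.Types (see the Types.hs diff below):
type TermsCount     = Int
type TermsWithCount = (Terms, TermsCount)

-- Extraction result per entity (Gargantext.Core.Text.Terms):
--   before: Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
--   after : Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
--
-- Aggregated per-document map (Gargantext.Database.Action.Flow):
--   before: HashMap b (Map NgramsType (Map NodeId Int))
--   after : HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
```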
README.md

@@ -196,6 +196,8 @@ To build documentation, run:
  stack --docker build --haddock --no-haddock-deps --fast
  ```
  (in `.stack-work/dist/x86_64-linux-nix/Cabal-3.2.1.0/doc/html/gargantext`).

  ## GraphQL

  Some introspection information.
bin/gargantext-upgrade/Main.hs

@@ -33,7 +33,6 @@ import Gargantext.Database.Admin.Trigger.Init
 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataCorpus)
 import Gargantext.Database.Admin.Types.Node (NodeType(NodeDocument, NodeContact))
 import Gargantext.Database.Prelude (Cmd'', Cmd, execPGSQuery)
 import Gargantext.Database.Query.Table.Node (getOrMkList)
 import Gargantext.Prelude
 import Gargantext.Prelude.Config (GargConfig(..), readConfig)
 import Prelude (getLine)

@@ -67,7 +66,7 @@ main = do
   _ok <- getLine
   cfg <- readConfig iniPath
-  let secret = _gc_secretkey cfg
+  let _secret = _gc_secretkey cfg
   withDevEnv iniPath $ \env -> do
     -- First upgrade the Database Schema
devops/docker/docker-compose.yaml

@@ -24,7 +24,7 @@ services:
     network_mode: host
     #command: ["postgres", "-c", "log_statement=all"]
     #ports:
     #  - 5432:5432
     environment:
       POSTGRES_USER: gargantua
       POSTGRES_PASSWORD: C8kdcUrAQy66U
devops/postgres/schema.sql

@@ -164,6 +164,7 @@ CREATE TABLE public.context_node_ngrams (
     ngrams_id   INTEGER NOT NULL REFERENCES public.ngrams(id) ON DELETE CASCADE,
     ngrams_type INTEGER,
     weight      double precision,
+    doc_count   INTEGER,
     PRIMARY KEY (context_id, node_id, ngrams_id, ngrams_type)
 );
 ALTER TABLE public.context_node_ngrams OWNER TO gargantua;
devops/postgres/upgrade/0.0.6.7.3.sql  (new file, mode 100644)

ALTER TABLE context_node_ngrams
  ADD COLUMN doc_count INTEGER;
gargantext.cabal

 cabal-version: 1.12

- -- This file has been generated from package.yaml by hpack version 0.35.0.
+ -- This file has been generated from package.yaml by hpack version 0.34.7.
 --
 -- see: https://github.com/sol/hpack

 name:           gargantext
 version:        0.0.6.8.2
 synopsis:       Search, map, share
 description:    Please see README.md
 category:       Data

@@ -50,6 +50,7 @@ library
      Gargantext.Core.Types
      Gargantext.Core.Types.Individu
      Gargantext.Core.Types.Main
+     Gargantext.Core.Utils
      Gargantext.Core.Utils.Prefix
      Gargantext.Utils.Jobs
      Gargantext.Utils.Jobs.API

@@ -87,6 +88,7 @@ library
      Gargantext.Core.Text.Prepare
      Gargantext.Core.Text.Search
      Gargantext.Core.Text.Terms
+     Gargantext.Core.Text.Terms.Eleve
      Gargantext.Core.Text.Terms.Mono
      Gargantext.Core.Text.Terms.Multi.Lang.En
      Gargantext.Core.Text.Terms.Multi.Lang.Fr

@@ -211,7 +213,6 @@ library
      Gargantext.Core.Text.Samples.EN
      Gargantext.Core.Text.Samples.FR
      Gargantext.Core.Text.Samples.SP
-     Gargantext.Core.Text.Terms.Eleve
      Gargantext.Core.Text.Terms.Mono.Stem
      Gargantext.Core.Text.Terms.Mono.Stem.En
      Gargantext.Core.Text.Terms.Mono.Token

@@ -221,7 +222,6 @@ library
      Gargantext.Core.Text.Terms.Multi.PosTagging
      Gargantext.Core.Text.Terms.Multi.PosTagging.Types
      Gargantext.Core.Types.Phylo
-     Gargantext.Core.Utils
      Gargantext.Core.Utils.DateUtils
      Gargantext.Core.Viz
      Gargantext.Core.Viz.Chart

@@ -809,6 +809,7 @@ test-suite garg-test
      Core.Text
      Core.Text.Examples
      Core.Text.Flow
+     Core.Utils
      Graph.Clustering
      Graph.Distance
      Ngrams.Lang
package.yaml

@@ -74,6 +74,7 @@ library:
     - Gargantext.Core.Types
     - Gargantext.Core.Types.Individu
     - Gargantext.Core.Types.Main
+    - Gargantext.Core.Utils
     - Gargantext.Core.Utils.Prefix
     - Gargantext.Utils.Jobs
     - Gargantext.Utils.Jobs.API

@@ -111,6 +112,7 @@ library:
     - Gargantext.Core.Text.Prepare
     - Gargantext.Core.Text.Search
     - Gargantext.Core.Text.Terms
+    - Gargantext.Core.Text.Terms.Eleve
     - Gargantext.Core.Text.Terms.Mono
     - Gargantext.Core.Text.Terms.Multi.Lang.En
     - Gargantext.Core.Text.Terms.Multi.Lang.Fr
src-test/Core/Text.hs

{-|
Module      : Graph.Clustering
Description : Basic tests to avoid quick regression
...
src-test/Core/Utils.hs  (new file, mode 100644)

{-|
Module      : Core.Utils
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}

module Core.Utils where

import Test.Hspec

import Gargantext.Prelude
import Gargantext.Core.Utils

-- | Core.Utils tests
test :: IO ()
test = hspec $ do
  describe "check if groupWithCounts works" $ do
    it "simple integer array" $ do
      (groupWithCounts [1, 2, 3, 1, 2, 3]) `shouldBe` [(1, 2), (2, 2), (3, 2)]
    it "string" $ do
      (groupWithCounts "abccba") `shouldBe` [('a', 2), ('b', 2), ('c', 2)]
src-test/Graph/Clustering.hs

{-|
Module      : Graph.Clustering
Description : Basic tests to avoid quick regression
...

@@ -40,4 +39,3 @@ test = hspec $ do
       let result = List.length partitions > 1
       shouldBe True result
src-test/Main.hs

@@ -11,6 +11,8 @@ Portability : POSIX
 import Gargantext.Prelude
 import Gargantext.Core (Lang(..))
+import qualified Core.Utils as Utils
 --import qualified Ngrams.Lang.Fr as Fr
 --import qualified Ngrams.Lang as Lang
 import qualified Ngrams.Lang.Occurrences as Occ

@@ -22,11 +24,12 @@ import qualified Utils.Crypto as Crypto
 main :: IO ()
 main = do
+  Utils.test
   -- Occ.parsersTest
   -- Lang.ngramsExtractionTest FR
   -- Lang.ngramsExtractionTest EN
   -- Metrics.main
   Graph.test
   PD.testFromRFC3339
   -- GD.test
   Crypto.test
src-test/Utils/Crypto.hs

{-|
Module      : Utils.Crypto
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
...

@@ -43,4 +42,3 @@ test = hspec $ do
     let hash2 = hash (["b","a"] :: [Text])
     it "compare" $ do
       hash1 `shouldBe` hash2
src/Gargantext/API/Ngrams.hs

@@ -184,9 +184,9 @@ saveNodeStory :: ( MonadReader env m, MonadBase IO m, HasNodeStorySaver env )
 saveNodeStory = do
   saver <- view hasNodeStorySaver
   liftBase $ do
-    Gargantext.Prelude.putStrLn "---- Running node story saver ----"
+    -- Gargantext.Prelude.putStrLn "---- Running node story saver ----"
     saver
-    Gargantext.Prelude.putStrLn "---- Node story saver finished ----"
+    -- Gargantext.Prelude.putStrLn "---- Node story saver finished ----"

 saveNodeStoryImmediate :: ( MonadReader env m, MonadBase IO m, HasNodeStoryImmediateSaver env )

@@ -194,9 +194,9 @@ saveNodeStoryImmediate :: ( MonadReader env m, MonadBase IO m, HasNodeStoryImmed
 saveNodeStoryImmediate = do
   saver <- view hasNodeStoryImmediateSaver
   liftBase $ do
-    Gargantext.Prelude.putStrLn "---- Running node story immediate saver ----"
+    -- Gargantext.Prelude.putStrLn "---- Running node story immediate saver ----"
     saver
-    Gargantext.Prelude.putStrLn "---- Node story immediate saver finished ----"
+    -- Gargantext.Prelude.putStrLn "---- Node story immediate saver finished ----"

 listTypeConflictResolution :: ListType -> ListType -> ListType
src/Gargantext/API/Ngrams/List.hs

@@ -159,15 +159,15 @@ reIndexWith cId lId nt lts = do
   -- TODO Tests here
   let ngramsByDoc = map (HashMap.fromList)
-                  $ map (map (\(k, v) -> (SimpleNgrams (text2ngrams k), v)))
+                  $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
                   $ map (\doc -> List.zip
                            (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
                               $ Text.unlines $ catMaybes
                               [ doc ^. context_hyperdata . hd_title
                               , doc ^. context_hyperdata . hd_abstract
                               ])
                            (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1)]]))
                         docs

   -- printDebug "ngramsByDoc" ngramsByDoc
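The interesting piece above is the lens expression `over (traverse . traverse) (\p -> (p, cnt)) v`, which pushes the per-document count returned by `termsInText` into every leaf of the nested map `v`. A small self-contained illustration with made-up values (hypothetical, not part of the patch; plain Data.Map stands in for the project types):

```haskell
import Control.Lens (over)
import qualified Data.Map.Strict as Map

-- v plays the role of the {NgramsType -> {NodeId -> Int}} value; 7 stands for
-- the number of times the matched term occurred in the document.
demo :: Map.Map String (Map.Map Int (Int, Int))
demo = over (traverse . traverse) (\p -> (p, 7))
     $ Map.fromList [("NgramsTerms", Map.singleton 42 1)]
-- demo == fromList [("NgramsTerms", fromList [(42, (1, 7))])]

main :: IO ()
main = print demo
```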
src/Gargantext/API/Ngrams/Tools.hs

@@ -195,7 +195,7 @@ getCoocByNgrams' f (Diagonal diag) m =
                      listToCombi identity ks
                   ]
   where
     ks = HM.keys m

 -- TODO k could be either k1 or k2 here

@@ -220,7 +220,7 @@ getCoocByNgrams'' (Diagonal diag) (f1,f2) (m1,m2) =
                  -- TODO check optim
                  -- listToCombi identity ks1
               ]
   where
     ks1 = HM.keys m1
     ks2 = HM.keys m2
src/Gargantext/Core/NodeStory.hs

@@ -455,7 +455,7 @@ insertArchiveList c nodeId a = do
   where
     query :: PGS.Query
     query = [sql| WITH s as (SELECT ? as sid, ? sversion, ? sngrams_type_id, ngrams.id as sngrams_id, ?::jsonb as srepo FROM ngrams WHERE terms = ?)
                   INSERT INTO node_stories(node_id, version, ngrams_type_id, ngrams_id, ngrams_repo_element)
                   SELECT s.sid, s.sversion, s.sngrams_type_id, s.sngrams_id, s.srepo from s s join nodes n on s.sid = n.id
                 |]

@@ -505,22 +505,22 @@ updateNodeStory c nodeId@(NodeId _nId) currentArchive newArchive = do
   --printDebug "[updateNodeStory] updates" $ Text.unlines $ (Text.pack . show) <$> updates

   -- 2. Perform inserts/deletes/updates
-  printDebug "[updateNodeStory] applying insert" ()
+  -- printDebug "[updateNodeStory] applying insert" ()
   insertArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
                                        , _a_history = []
                                        , _a_state = archiveStateFromList inserts }
-  printDebug "[updateNodeStory] insert applied" ()
+  -- printDebug "[updateNodeStory] insert applied" ()
   --TODO Use currentArchive ^. a_version in delete and report error
   -- if entries with (node_id, ngrams_type_id, ngrams_id) but
   -- different version are found.
   deleteArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
                                        , _a_history = []
                                        , _a_state = archiveStateFromList deletes }
-  printDebug "[updateNodeStory] delete applied" ()
+  -- printDebug "[updateNodeStory] delete applied" ()
   updateArchiveList c nodeId $ Archive { _a_version = newArchive ^. a_version
                                        , _a_history = []
                                        , _a_state = archiveStateFromList updates }
-  printDebug "[updateNodeStory] update applied" ()
+  -- printDebug "[updateNodeStory] update applied" ()
   pure ()
   -- where
src/Gargantext/Core/Text/Terms.hs

@@ -28,8 +28,9 @@ compute graph
-}
 {-# LANGUAGE TemplateHaskell #-}
 {-# LANGUAGE ConstrainedClassMethods #-}
+{-# LANGUAGE StandaloneDeriving #-}
 {-# LANGUAGE TemplateHaskell #-}

 module Gargantext.Core.Text.Terms
   where

@@ -47,6 +48,7 @@ import qualified Data.Set as Set
 import qualified Data.Text as Text
 import qualified Data.HashMap.Strict as HashMap
 import Gargantext.Core
+import Gargantext.Core.Utils (groupWithCounts)
 import Gargantext.Core.Text (sentences, HasText(..))
 import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
 import Gargantext.Core.Text.Terms.Mono (monoTerms)

@@ -70,24 +72,23 @@ data TermType lang
               , _tt_model :: !(Maybe (Tries Token ()))
               }
   deriving (Generic)
+deriving instance (Show lang) => Show (TermType lang)
 makeLenses ''TermType

 --group :: [Text] -> [Text]
 --group = undefined

 -- remove Stop Words
 -- map (filter (\t -> not . elem t)) $
 ------------------------------------------------------------------------
- -- | Sugar to extract terms from text (hiddeng mapM from end user).
+ -- | Sugar to extract terms from text (hidding 'mapM' from end user).
 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
-extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
+extractTerms :: TermType Lang -> [Text] -> IO [[TermsWithCount]]
 extractTerms (Unsupervised {..}) xs = mapM (terms (Unsupervised { _tt_model = Just m', .. })) xs
   where
     m' = case _tt_model of
       Just m''-> m''
       Nothing -> newTries _tt_windowSize (Text.intercalate " " xs)
 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs

@@ -116,12 +117,13 @@ data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
 instance Hashable ExtractedNgrams

+-- | A typeclass that represents extracting ngrams from an entity.
 class ExtractNgramsT h
   where
     extractNgramsT :: HasText h
                    => TermType Lang
                    -> h
-                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
+                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
 ------------------------------------------------------------------------
 enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
 enrichedTerms l pa po (Terms ng1 ng2) =

@@ -132,7 +134,7 @@ enrichedTerms l pa po (Terms ng1 ng2) =
 ------------------------------------------------------------------------
 cleanNgrams :: Int -> Ngrams -> Ngrams
 cleanNgrams s ng
   | Text.length (ng ^. ngramsTerms) < s = ng
   | otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms))

@@ -151,10 +153,10 @@ insertExtractedNgrams ngs = do
   let (s, e) = List.partition isSimpleNgrams ngs
   m1 <- insertNgrams (map unSimpleNgrams s)
   --printDebug "others" m1
   m2 <- insertNgramsPostag (map unEnrichedNgrams e)
   --printDebug "terms" m2
   let result = HashMap.union m1 m2
   pure result

@@ -163,43 +165,48 @@ isSimpleNgrams (SimpleNgrams _) = True
 isSimpleNgrams _ = False
 ------------------------------------------------------------------------
- -- | Terms from Text
- -- Mono      : mono terms
- -- Multi     : multi terms
- -- MonoMulti : mono and multi
+ -- | Terms from 'Text'
+ -- 'Mono'      : mono terms
+ -- 'Multi'     : multi terms
+ -- 'MonoMulti' : mono and multi
 -- TODO : multi terms should exclude mono (intersection is not empty yet)
-terms :: TermType Lang -> Text -> IO [Terms]
-terms (Mono lang) txt = pure $ monoTerms lang txt
-terms (Multi lang) txt = multiterms lang txt
-terms (MonoMulti lang) txt = terms (Multi lang) txt
-terms (Unsupervised { .. }) txt = termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
+terms :: TermType Lang -> Text -> IO [TermsWithCount]
+terms tt txt = do
+  printDebug "[terms] tt" tt
+  printDebug "[terms] txt" txt
+  out <- termsNoLog tt txt
+  printDebug "[terms] out" out
+  pure out
+
+termsNoLog :: TermType Lang -> Text -> IO [TermsWithCount]
+termsNoLog (Mono lang) txt = pure $ monoTerms lang txt
+termsNoLog (Multi lang) txt = multiterms lang txt
+termsNoLog (MonoMulti lang) txt = terms (Multi lang) txt
+termsNoLog (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
   where
     m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
 -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
 ------------------------------------------------------------------------
- -- | Unsupervised ngrams extraction
- -- language agnostic extraction
- -- TODO: remove IO
- -- TODO: newtype BlockText

 type WindowSize = Int
 type MinNgramSize = Int

-termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
-termsUnsupervised (Unsupervised l n s m) =
-  pure
-  . map (text2term l)
-  . List.nub
-  . (List.filter (\l' -> List.length l' >= s))
-  . List.concat
-  . mainEleveWith (maybe (panic "no model") identity m) n
-  . uniText
+-- | Unsupervised ngrams extraction
+-- language agnostic extraction
+-- TODO: newtype BlockText
+termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
+termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panic "[termsUnsupervised] no model"
+termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) =
+  map (\(t, cnt) -> (text2term _tt_lang t, cnt))
+  . groupWithCounts
+  -- . List.nub
+  . (List.filter (\l' -> List.length l' >= _tt_windowSize))
+  . List.concat
+  . mainEleveWith _tt_model _tt_ngramsSize
+  . uniText
 termsUnsupervised _ = undefined

 newTries :: Int -> Text -> Tries Token ()
 newTries n t = buildTries n (fmap toToken $ uniText t)

@@ -217,5 +224,3 @@ text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
 isPunctuation :: Text -> Bool
 isPunctuation x = List.elem x $ (Text.pack . pure) <$> ("!?(),;.:" :: String)
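The net effect for callers of this module is that every extraction entry point now yields `TermsWithCount` pairs instead of bare `Terms`. A hedged usage sketch (the input text and `Lang` value are hypothetical, OverloadedStrings is assumed for the literal, and `terms` also emits the printDebug traces shown above):

```haskell
demoTerms :: IO ()
demoTerms = do
  -- Mono extraction tags every occurrence with a count of 1; Multi and
  -- Unsupervised extraction aggregate repeated terms via groupWithCounts.
  ts <- terms (Mono EN) "big data and big models"
  mapM_ (\(t, cnt) -> print (_terms_label t, cnt)) ts
```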
src/Gargantext/Core/Text/Terms/Eleve.hs

@@ -32,6 +32,7 @@ Notes for current implementation:
-}

 {-# LANGUAGE ConstraintKinds    #-}
+{-# LANGUAGE StandaloneDeriving #-}
 {-# LANGUAGE TemplateHaskell    #-}
 {-# LANGUAGE TypeFamilies       #-}

@@ -278,6 +279,8 @@ data Tries k e = Tries
 makeLenses ''Tries

+deriving instance (Show k, Show e) => Show (Tries k e)
+
 buildTries :: Int -> [[Token]] -> Tries Token ()
 buildTries n sentences = Tries { _fwd = buildTrie Forward n sentences
src/Gargantext/Core/Text/Terms/Mono.hs

@@ -40,8 +40,8 @@ words = monoTexts
 isSep :: Char -> Bool
 isSep = (`elem` (",.:;?!(){}[]\"\'" :: String))

-monoTerms :: Lang -> Text -> [Terms]
-monoTerms l txt = map (monoText2term l) $ monoTexts txt
+monoTerms :: Lang -> Text -> [TermsWithCount]
+monoTerms l txt = map (\t -> (monoText2term l t, 1)) $ monoTexts txt

 monoTexts :: Text -> [Text]
 monoTexts = L.concat . monoTextsBySentence
src/Gargantext/Core/Text/Terms/Multi.hs

@@ -21,6 +21,7 @@ import Data.List (concat)
 import Gargantext.Prelude
 import Gargantext.Core (Lang(..))
 import Gargantext.Core.Types
+import Gargantext.Core.Utils (groupWithCounts)

 import Gargantext.Core.Text.Terms.Multi.PosTagging
 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types

@@ -37,14 +38,16 @@ import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
 type NLP_API = Lang -> Text -> IO PosSentences

 -------------------------------------------------------------------
-multiterms :: Lang -> Text -> IO [Terms]
-multiterms = multiterms' tokenTag2terms
-
-multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
-multiterms' f lang txt = concat
-                      <$> map (map f)
-                      <$> map (filter (\t -> _my_token_pos t == Just NP))
-                      <$> tokenTags lang txt
+multiterms :: Lang -> Text -> IO [TermsWithCount]
+multiterms l txt = do
+  ret <- multiterms' tokenTag2terms l txt
+  pure $ groupWithCounts ret
+  where
+    multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
+    multiterms' f lang txt' = concat
+                          <$> map (map f)
+                          <$> map (filter (\t -> _my_token_pos t == Just NP))
+                          <$> tokenTags lang txt'
 -------------------------------------------------------------------
 tokenTag2terms :: TokenTag -> Terms

@@ -57,10 +60,10 @@ tokenTags l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <
 tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
 tokenTagsWith lang txt nlp = map (groupTokens lang)
                          <$> map tokens2tokensTags
                          <$> map _sentenceTokens
                          <$> _sentences
                          <$> nlp lang txt

 ---- | This function analyses and groups (or not) ngrams according to
src/Gargantext/Core/Text/Terms/Multi/Group.hs

@@ -30,4 +30,3 @@ group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
 group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
 group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
 group2 _ _ [] = []
src/Gargantext/Core/Text/Terms/Multi/Lang/En.hs

@@ -40,4 +40,3 @@ groupTokens ntags = group2 NP NP
 --groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
 --
 --groupNgrams (x:xs) = (x:(groupNgrams xs))
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs

@@ -45,19 +45,22 @@ tokens2tokensTags :: [Token] -> [TokenTag]
 tokens2tokensTags ts = filter' $ map tokenTag ts
 ------------------------------------------------------------------------
 tokenTag :: Token -> TokenTag
-tokenTag (Token _ w _ l _ _ p n _ _) = TokenTag w' l' p n
+tokenTag (Token { .. }) = TokenTag { _my_token_word  = w'
+                                   , _my_token_lemma = l'
+                                   , _my_token_pos   = _tokenPos
+                                   , _my_token_ner   = _tokenNer }
   where
-    w' = split w
-    l' = fromList (split l)
+    w' = split _tokenWord
+    l' = fromList (split _tokenLemma)
     split = splitOn (pack " ") . toLower

 filter' :: [TokenTag] -> [TokenTag]
 filter' xs = filter isNgrams xs
   where
-    isNgrams (TokenTag _ _ p n) = isJust p || isJust n
+    isNgrams (TokenTag { .. }) = isJust _my_token_pos || isJust _my_token_ner
 ------------------------------------------------------------------------
 -- request =
 -- "fr" : {
 --     "tokenize.language" : "fr",
 --     "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",

@@ -66,9 +69,9 @@ filter' xs = filter isNgrams xs
 --     "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
 --     "depparse.language" : "french",
 --     "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
 --     "ssplit.newlineIsSentenceBreak": "always"
 --   },
 --

 corenlp' :: ( FromJSON a, ConvertibleStrings p ByteString

@@ -80,6 +83,7 @@ corenlp' lang txt = do
       FR -> "{\"annotators\": \"tokenize,ssplit,pos,lemma,ner\", \"parse.model\": \"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\": \"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\": \"fr\", \"outputFormat\": \"json\"}"
       _  -> panic $ pack "not implemented yet"
   url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
+  -- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
   let request = setRequestBodyLBS (cs txt) url
   httpJSON request
src/Gargantext/Core/Text/Terms/Multi/PosTagging/Types.hs

@@ -52,4 +52,3 @@ data PosSentences = PosSentences { _sentences :: [Sentence]}
   deriving (Show, Generic)

 $(deriveJSON (unPrefix "_") ''PosSentences)
src/Gargantext/Core/Text/Terms/WithList.hs

{-|
Module      : Gargantext.Core.Text.Terms.WithList
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
...

@@ -21,6 +21,8 @@ import Data.Text (Text, concat, unwords)
 import Gargantext.Prelude
 import Gargantext.Core.Text.Context
 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
+import Gargantext.Core.Types (TermsCount)
+import Gargantext.Core.Utils (groupWithCounts)
 import Prelude (error)
 import qualified Data.Algorithms.KMP as KMP
 import qualified Data.IntMap.Strict as IntMap

@@ -72,8 +74,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
 -- Utils
 type BlockText   = Text
 type MatchedText = Text

-termsInText :: Patterns -> BlockText -> [MatchedText]
-termsInText pats txt = List.nub
+termsInText :: Patterns -> BlockText -> [(MatchedText, TermsCount)]
+termsInText pats txt = groupWithCounts
                      $ List.concat
                      $ map (map unwords)
                      $ extractTermsWithList pats txt

@@ -96,7 +98,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
 {- | Not used
 filterWith :: TermList
            -> (a -> Text)
            -> [a]
            -> [(a, [Text])]
 filterWith termList f xs = filterWith' termList f zip xs

@@ -104,7 +106,7 @@ filterWith termList f xs = filterWith' termList f zip xs
 filterWith' :: TermList
             -> (a -> Text)
             -> ([a] -> [[Text]] -> [b])
             -> [a]
             -> [b]
 filterWith' termList f f' xs = f' xs
   $ map (extractTermsWithList' pats)
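`termsInText` used to de-duplicate its matches with `List.nub`; it now keeps the duplicates and folds them into counts with `groupWithCounts`. Callers that only need the matched terms can drop the counts with `fst`, as the LegacyMain.hs change further down does. A hedged sketch of the two views (the pattern and block values are assumed to be in scope; the example match is hypothetical):

```haskell
-- New view: each matched term paired with how often it occurs in the block.
withCounts :: Patterns -> BlockText -> [(MatchedText, TermsCount)]
withCounts pats blk = termsInText pats blk
-- e.g. a block mentioning "big data" twice could yield [("big data", 2)]

-- Old view: just the de-duplicated matches (groupWithCounts already uniquifies).
matchesOnly :: Patterns -> BlockText -> [MatchedText]
matchesOnly pats blk = fst <$> termsInText pats blk
```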
src/Gargantext/Core/Types.hs

{-|
Module      : Gargantext.Types
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
...

@@ -17,7 +17,7 @@ commentary with @some markup@.
 module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
                              , module Gargantext.Database.Admin.Types.Node
                              , DebugMode(..), withDebugMode
-                             , Term, Terms(..)
+                             , Term, Terms(..), TermsCount, TermsWithCount
                              , TokenTag(..), POS(..), NER(..)
                              , Label, Stems
                              , HasInvalidError(..), assertValid

@@ -70,10 +70,13 @@ type Label = [Text]
 data Terms = Terms { _terms_label :: Label
                    , _terms_stem  :: Stems
                    } deriving (Ord, Show)
 instance Eq Terms where
   (==) (Terms _ s1) (Terms _ s2) = s1 == s2

+type TermsCount = Int
+
+type TermsWithCount = (Terms, TermsCount)

 ------------------------------------------------------------------------
 data Tag = POS | NER
   deriving (Show, Eq)

@@ -208,5 +211,3 @@ data TODO = TODO
 instance ToSchema      TODO where
 instance ToParamSchema TODO where
 ----------------------------------------------------------------------------
src/Gargantext/Core/Utils.hs

{-|
Module      : Gargantext.Utils
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
...

@@ -12,16 +12,19 @@ commentary with @some markup@.
-}

-module Gargantext.Core.Utils (
+module Gargantext.Core.Utils ( -- module Gargantext.Utils.Chronos
                                module Gargantext.Core.Utils.Prefix
                              , something
                              , alphanum
                              , choices
                              , randomString
+                             , groupWithCounts
+                             , addTuples
                              ) where

 import Data.Char (chr, ord)
+import qualified Data.List as List
 import Data.Maybe
 import Data.Monoid
 import Data.Text (Text, pack)

@@ -57,3 +60,17 @@ randomString :: Int -> IO Text
 randomString num = do
   str <- choices num alphanum
   pure $ pack str

+-- | Given a list of items of type 'a', return list with unique items
+-- (like List.nub) but tuple-d with their counts in the original list
+groupWithCounts :: (Ord a, Eq a) => [a] -> [(a, Int)]
+groupWithCounts = map f
+                . List.group
+                . List.sort
+  where
+    f []       = panic "[groupWithCounts] impossible"
+    f ts@(t:_) = (t, length ts)
+
+addTuples :: (Num a, Num b) => (a, b) -> (a, b) -> (a, b)
+addTuples (a1, b1) (a2, b2) = (a1 + a2, b1 + b2)
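A quick sanity check of the two new helpers, consistent with the src-test/Core/Utils.hs spec added in this commit (values chosen for illustration):

```haskell
-- groupWithCounts sorts and groups, pairing each distinct item with its count;
-- addTuples sums two pairs component-wise (used later to merge (weight, count) pairs).
checks :: Bool
checks =  groupWithCounts "abccba"                    == [('a', 2), ('b', 2), ('c', 2)]
       && groupWithCounts [1, 2, 3, 1, 2, 3 :: Int]   == [(1, 2), (2, 2), (3, 2)]
       && addTuples (1 :: Int, 2 :: Int) (10, 20)     == (11, 22)
```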
src/Gargantext/Core/Viz/Phylo/Legacy/LegacyMain.hs

@@ -65,7 +65,7 @@ flowPhylo cId = do
         patterns = buildPatterns termList
         -- | To filter the Ngrams of a document based on the termList
         filterTerms :: Patterns -> (Date, Text) -> (Date, [Text])
-        filterTerms patterns' (y, d) = (y, termsInText patterns' d)
+        filterTerms patterns' (y, d) = (y, fst <$> termsInText patterns' d)

         docs = map ((\(y,t) -> Document y t) . filterTerms patterns) docs'

@@ -123,4 +123,3 @@ writePhylo _fp _phview = undefined
 -- refactor 2021
 -- viewPhylo2Svg :: PhyloView -> IO DB.ByteString
 -- viewPhylo2Svg p = graphvizWithHandle Dot (viewToDot p) Svg DB.hGetContents
src/Gargantext/Database/Action/Flow.hs

@@ -48,7 +48,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
     where

 import Conduit
-import Control.Lens ((^.), view, _Just, makeLenses)
+import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
 import Data.Aeson.TH (deriveJSON)
 import Data.Conduit.Internal (zipSources)
 import Data.Either

@@ -60,7 +60,6 @@ import Data.Maybe (catMaybes)
 import Data.Monoid
 import Data.Swagger
 import qualified Data.Text as T
-import Data.Traversable (traverse)
 import Data.Tuple.Extra (first, second)
 import GHC.Generics (Generic)
 import Servant.Client (ClientError)

@@ -83,9 +82,10 @@ import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(.
 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
 import Gargantext.Core.Text.Terms
 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
-import Gargantext.Core.Types (POS(NP))
+import Gargantext.Core.Types (POS(NP), TermsCount)
 import Gargantext.Core.Types.Individu (User(..))
 import Gargantext.Core.Types.Main
+import Gargantext.Core.Utils (addTuples)
 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
 import Gargantext.Database.Action.Flow.List
 import Gargantext.Database.Action.Flow.Types

@@ -284,7 +284,7 @@ flow c u cn la mfslw (mLength, docsC) logStatus = do
                                   , _scst_events    = Just []
                                   }
       pure $ Prelude.head id

 ------------------------------------------------------------------------

@@ -357,25 +357,27 @@ insertMasterDocs c lang hs = do
   -- this will enable global database monitoring

   -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
-  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
+  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
     <- mapNodeIdNgrams
-       <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
+       <$> documentIdWithNgrams
+           (extractNgramsT $ withLang lang documentsWithId)
+           documentsWithId

   lId <- getOrMkList masterCorpusId masterUserId
   -- _ <- saveDocNgramsWith lId mapNgramsDocs'
   _ <- saveDocNgramsWith lId mapNgramsDocs'

   -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
   pure ids'

 saveDocNgramsWith :: (FlowCmdM env err m)
                   => ListId
-                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
+                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
                   -> m ()
 saveDocNgramsWith lId mapNgramsDocs' = do
-  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
   --printDebug "terms2id" terms2id
+  printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
+  let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
+  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount

   let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

@@ -392,7 +394,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
                             <*> Just (fromIntegral w :: Double)
                        | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                        , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
-                       , (nId, w)                       <- Map.toList mapNodeIdWeight
+                       , (nId, (w, _cnt))               <- Map.toList mapNodeIdWeight
                        ]
   -- to be removed

@@ -451,28 +453,32 @@ mergeData rs = catMaybes . map toDocumentWithId . Map.toList
 ------------------------------------------------------------------------
 documentIdWithNgrams :: HasNodeError err
                      => (a
-                         -> Cmd err (HashMap b (Map NgramsType Int)))
+                         -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
                      -> [Indexed NodeId a]
                      -> Cmd err [DocumentIdWithNgrams a b]
 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
   where
     toDocumentIdWithNgrams d = do
       e <- f $ _unIndex d
       pure $ DocumentIdWithNgrams d e

 -- | TODO check optimization
 mapNodeIdNgrams :: (Ord b, Hashable b)
                 => [DocumentIdWithNgrams a b]
                 -> HashMap b
-                       (Map NgramsType (Map NodeId Int))
+                       (Map NgramsType (Map NodeId (Int, TermsCount)))
-mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
+mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
   where
+    -- | NOTE We are somehow multiplying 'TermsCount' here: If the
+    -- same ngrams term has different ngrams types, the 'TermsCount'
+    -- for it (which is the number of times the terms appears in a
+    -- document) is copied over to all its types.
     f :: DocumentIdWithNgrams a b
-      -> HashMap b (Map NgramsType (Map NodeId Int))
-    f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
+      -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
+    f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
       where
         nId = _index $ documentWithId d

@@ -483,25 +489,25 @@ instance ExtractNgramsT HyperdataContact
     extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
       where
         extract :: TermType Lang -> HyperdataContact
-                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
+                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
         extract _l hc' = do
           let authors = map text2ngrams
                       $ maybe ["Nothing"] (\a -> [a])
                       $ view (hc_who . _Just . cw_lastName) hc'

-          pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors]
+          pure $ HashMap.fromList $ [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors]

 instance ExtractNgramsT HyperdataDocument
   where
     extractNgramsT :: TermType Lang
                    -> HyperdataDocument
-                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
+                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
     extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
       where
         extractNgramsT' :: TermType Lang
                         -> HyperdataDocument
-                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
+                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
         extractNgramsT' lang' doc = do
           let source = text2ngrams
                      $ maybe "Nothing" identity

@@ -515,23 +521,25 @@ instance ExtractNgramsT HyperdataDocument
                      $ maybe ["Nothing"] (T.splitOn ", ")
                      $ _hd_authors doc

-          terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
-                 <$> concat
-                 <$> liftBase (extractTerms lang' $ hasText doc)
+          termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
+                           <$> concat
+                           <$> liftBase (extractTerms lang' $ hasText doc)
+
+          printDebug "[extractNgramsT HyperdataDocument] termsWithCounts'" termsWithCounts'
+          printDebug "[extractNgramsT HyperdataDocument] termsWithLargerCounts" $ filter (\(_, cnt) -> cnt > 1) termsWithCounts'

-          pure $ HashMap.fromList
-               $ [(SimpleNgrams source,   Map.singleton Sources 1)                        ]
-              <> [(SimpleNgrams i',       Map.singleton Institutes 1)  | i' <- institutes ]
-              <> [(SimpleNgrams a',       Map.singleton Authors 1)     | a' <- authors    ]
-              <> [(EnrichedNgrams t',     Map.singleton NgramsTerms 1) | t' <- terms'     ]
+          pure $ HashMap.fromList
+               $ [(SimpleNgrams source,   (Map.singleton Sources 1, 1))                                   ]
+              <> [(SimpleNgrams i',       (Map.singleton Institutes 1, 1))    | i' <- institutes          ]
+              <> [(SimpleNgrams a',       (Map.singleton Authors 1, 1))       | a' <- authors             ]
+              <> [(EnrichedNgrams t',     (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]

 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
   where
-    extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
+    extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h

 instance HasText a => HasText (Node a)
   where
-    hasText (Node _ _ _ _ _ _ _ h) = hasText h
+    hasText (Node { _node_hyperdata = h }) = hasText h
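The NOTE inside mapNodeIdNgrams is worth a concrete illustration: when one ngrams key carries several NgramsType entries, the single per-document count is copied into each of them. A self-contained sketch with made-up values (plain Data.Map stand-ins for the project types):

```haskell
import qualified Data.Map.Strict as Map

-- One document (nId = 42) in which the term occurred cnt = 3 times, with two
-- ngrams types attached to the same key: the count 3 ends up in both entries.
copied :: Map.Map String (Map.Map Int (Int, Int))
copied = fmap (\i -> Map.singleton 42 (i, 3))
       $ Map.fromList [("Authors", 1), ("NgramsTerms", 1)]
-- copied == fromList [ ("Authors",     fromList [(42, (1, 3))])
--                    , ("NgramsTerms", fromList [(42, (1, 3))]) ]

main :: IO ()
main = print copied
```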
src/Gargantext/Database/Action/Flow/Utils.hs

@@ -15,6 +15,7 @@ module Gargantext.Database.Action.Flow.Utils
 import Data.Map (Map)
 import Data.HashMap.Strict (HashMap)
+import Gargantext.Core.Types (TermsCount)
 import Gargantext.Database.Admin.Types.Node
 import Gargantext.Database.Prelude (Cmd)
 import Gargantext.Database.Query.Table.ContextNodeNgrams

@@ -29,24 +30,24 @@ import qualified Data.HashMap.Strict as HashMap
 data DocumentIdWithNgrams a b =
   DocumentIdWithNgrams
   { documentWithId  :: Indexed NodeId a
-  , documentNgrams  :: HashMap b (Map NgramsType Int)
+  , documentNgrams  :: HashMap b (Map NgramsType Int, TermsCount)
   } deriving (Show)

 insertDocNgrams :: ListId
-                -> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId Int))
+                -> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId (Int, TermsCount)))
                 -> Cmd err Int
-insertDocNgrams lId m = insertContextNodeNgrams ns
+insertDocNgrams lId m = do
+  printDebug "[insertDocNgrams] ns" ns
+  insertContextNodeNgrams ns
   where
     ns = [ ContextNodeNgrams docId lId (ng^.index) (ngramsTypeId t) (fromIntegral i) cnt
          | (ng, t2n2i) <- HashMap.toList m
          , (t, n2i)    <- DM.toList t2n2i
-         , (docId, i)        <- DM.toList n2i
+         , (docId, (i, cnt)) <- DM.toList n2i
          ]

 -- [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})]
 -- {Ngrams: {NgramsType: {NodeId: (Int, TermsCount)}}}
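The ns comprehension above flattens the nested per-ngrams maps into one ContextNodeNgrams row per (ngrams, type, document) triple, now carrying the document count as the extra last field. A small stand-alone sketch of that flattening with hypothetical values (plain tuples and lists instead of the project types):

```haskell
-- Mirrors the comprehension in insertDocNgrams: each leaf (docId, (i, cnt))
-- becomes one row (ngrams, ngramsType, docId, weight, doc_count).
rows :: [(String, String, Int, Double, Int)]
rows = [ (ng, t, docId, fromIntegral i, cnt)
       | (ng, t2n2i)       <- [("big data", [("NgramsTerms", [(42, (1 :: Int, 3))])])]
       , (t, n2i)          <- t2n2i
       , (docId, (i, cnt)) <- n2i
       ]
-- rows == [("big data", "NgramsTerms", 42, 1.0, 3)]
```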
src/Gargantext/Database/Query/Table/ContextNodeNgrams.hs

@@ -36,12 +36,13 @@ queryContextNodeNgramsTable = selectTable contextNodeNgramsTable
 -- | Insert utils
 insertContextNodeNgrams :: [ContextNodeNgrams] -> Cmd err Int
 insertContextNodeNgrams = insertContextNodeNgramsW
-                        . map (\(ContextNodeNgrams c n ng nt w) ->
+                        . map (\(ContextNodeNgrams c n ng nt w dc) ->
                             ContextNodeNgrams (pgContextId c) (pgNodeId n) (sqlInt4 ng) (pgNgramsTypeId nt) (sqlDouble w) (sqlInt4 dc))

 insertContextNodeNgramsW :: [ContextNodeNgramsWrite] -> Cmd err Int
src/Gargantext/Database/Query/Table/Node/UpdateOpaleye.hs

@@ -35,8 +35,9 @@ updateHyperdata i h = mkCmd $ \c -> putStrLn "before runUpdate_" >>
 updateHyperdataQuery :: ToJSON a => NodeId -> a -> Update Int64
 updateHyperdataQuery i h = seq h' $ trace "updateHyperdataQuery: encoded JSON" $ Update
   { uTable      = nodeTable
-  , uUpdateWith = updateEasy (\(Node _ni _nh _nt _nu _np _nn _nd _h)
-                              -> trace "updating mate" $ Node _ni _nh _nt _nu _np _nn _nd h'
+  , uUpdateWith = updateEasy (\(Node { .. }) -> Node { _node_hyperdata = h', .. }
+                              -- -> trace "updating mate" $ Node _ni _nh _nt _nu _np _nn _nd h'
                               )
   , uWhere      = (\row -> {-trace "uWhere" $-} _node_id row .== pgNodeId i)
   , uReturning  = rCount

@@ -63,5 +64,3 @@ updateNodesWithType_ :: ( HasNodeError err
 updateNodesWithType_ nt h = do
   ns <- getNodesIdWithType nt
   mapM (\n -> updateHyperdata n h) ns
src/Gargantext/Database/Schema/ContextNodeNgrams.hs

@@ -19,6 +19,7 @@ module Gargantext.Database.Schema.ContextNodeNgrams
   where

 import Prelude
+import Gargantext.Core.Types (TermsCount)
 import Gargantext.Database.Schema.Prelude
 import Gargantext.Database.Schema.Ngrams (NgramsTypeId, NgramsId)
 import Gargantext.Database.Admin.Types.Node

@@ -26,15 +27,16 @@ import Gargantext.Database.Admin.Types.Node
 type ContextNodeNgrams =
-  ContextNodeNgramsPoly ContextId ListId NgramsId NgramsTypeId Double
+  ContextNodeNgramsPoly ContextId ListId NgramsId NgramsTypeId Double TermsCount

-data ContextNodeNgramsPoly c n ngrams_id ngt w
+data ContextNodeNgramsPoly c n ngrams_id ngt w dc
    = ContextNodeNgrams { _cnng_context_id :: !c
                        , _cnng_node_id    :: !n
                        , _cnng_ngrams_id  :: !ngrams_id
                        , _cnng_ngramsType :: !ngt
                        , _cnng_weight     :: !w
+                       , _cnng_doc_count  :: !dc
                        } deriving (Show)

 type ContextNodeNgramsWrite =

@@ -43,6 +45,7 @@ type ContextNodeNgramsWrite =
                         (Column SqlInt4)
                         (Column SqlInt4)
                         (Column SqlFloat8)
+                        (Column SqlInt4)

 type ContextNodeNgramsRead =
   ContextNodeNgramsPoly (Column SqlInt4)

@@ -50,6 +53,7 @@ type ContextNodeNgramsRead =
                         (Column SqlInt4)
                         (Column SqlInt4)
                         (Column SqlFloat8)
+                        (Column SqlInt4)

 type ContextNodeNgramsReadNull =
   ContextNodeNgramsPoly (Column (Nullable SqlInt4))

@@ -57,6 +61,7 @@ type ContextNodeNgramsReadNull =
                         (Column (Nullable SqlInt4))
                         (Column (Nullable SqlInt4))
                         (Column (Nullable SqlFloat8))
+                        (Column (Nullable SqlInt4))

 $(makeAdaptorAndInstance "pContextNodeNgrams" ''ContextNodeNgramsPoly)
 makeLenses ''ContextNodeNgramsPoly

@@ -70,5 +75,6 @@ contextNodeNgramsTable = Table "context_node_ngrams"
                            , _cnng_ngrams_id  = requiredTableField "ngrams_id"
                            , _cnng_ngramsType = requiredTableField "ngrams_type"
                            , _cnng_weight     = requiredTableField "weight"
+                           , _cnng_doc_count  = requiredTableField "doc_count"
                            }
                         )
src/Gargantext/Database/Schema/NodeNgrams.hs

{-|
Module      : Gargantext.Database.Schema.NodeNgrams
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
...

@@ -86,7 +86,7 @@ type NgramsClass = Int
 type NgramsText = Text

 -- Example of list Ngrams
 -- type ListNgrams = NodeNgramsPoly (Maybe Int) ListType Text

 type NodeNgramsW =
   NodeNgramsPoly (Maybe NodeNgramsId) NodeId ListType NgramsText