Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
152
Issues
152
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
cb739183
Commit
cb739183
authored
Jul 13, 2023
by
Alexandre Delanoë
Committed by
Alfredo Di Napoli
Jul 13, 2023
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[SPECS] Main Specifications to add Chinese lang
parent
f0674838
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
34 additions
and
18 deletions
+34
-18
List.hs
src/Gargantext/API/Ngrams/List.hs
+12
-10
New.hs
src/Gargantext/API/Node/Corpus/New.hs
+3
-3
WithList.hs
src/Gargantext/Core/Text/Terms/WithList.hs
+16
-3
Tools.hs
src/Gargantext/Core/Viz/Phylo/API/Tools.hs
+2
-1
LegacyMain.hs
src/Gargantext/Core/Viz/Phylo/Legacy/LegacyMain.hs
+1
-1
No files found.
src/Gargantext/API/Ngrams/List.hs
View file @
cb739183
...
@@ -24,6 +24,7 @@ import Data.Set (Set)
...
@@ -24,6 +24,7 @@ import Data.Set (Set)
import
Data.Text
(
Text
,
concat
,
pack
,
splitOn
)
import
Data.Text
(
Text
,
concat
,
pack
,
splitOn
)
import
Data.Vector
(
Vector
)
import
Data.Vector
(
Vector
)
import
Gargantext.API.Admin.EnvTypes
(
Env
,
GargJob
(
..
))
import
Gargantext.API.Admin.EnvTypes
(
Env
,
GargJob
(
..
))
import
Gargantext.Core
(
Lang
)
import
Gargantext.API.Admin.Orchestrator.Types
import
Gargantext.API.Admin.Orchestrator.Types
import
Gargantext.API.Ngrams
(
setListNgrams
)
import
Gargantext.API.Ngrams
(
setListNgrams
)
import
Gargantext.API.Ngrams.List.Types
import
Gargantext.API.Ngrams.List.Types
...
@@ -34,7 +35,7 @@ import Gargantext.API.Prelude (GargServer, GargM, GargError)
...
@@ -34,7 +35,7 @@ import Gargantext.API.Prelude (GargServer, GargM, GargError)
import
Gargantext.API.Types
import
Gargantext.API.Types
import
Gargantext.Core.NodeStory
import
Gargantext.Core.NodeStory
import
Gargantext.Core.Text.Terms
(
ExtractedNgrams
(
..
))
import
Gargantext.Core.Text.Terms
(
ExtractedNgrams
(
..
))
import
Gargantext.Core.Text.Terms.WithList
(
MatchedText
,
buildPatterns
,
termsInText
)
import
Gargantext.Core.Text.Terms.WithList
(
MatchedText
,
buildPatterns
With
,
termsInText
)
import
Gargantext.Core.Types
(
TermsCount
)
import
Gargantext.Core.Types
(
TermsCount
)
import
Gargantext.Core.Types.Main
(
ListType
(
..
))
import
Gargantext.Core.Types.Main
(
ListType
(
..
))
import
Gargantext.Database.Action.Flow
(
saveDocNgramsWith
)
import
Gargantext.Database.Action.Flow
(
saveDocNgramsWith
)
...
@@ -142,7 +143,7 @@ setList l m = do
...
@@ -142,7 +143,7 @@ setList l m = do
pure
True
pure
True
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Re-index documents of a corpus with n
ew ngrams (called orphans here)
-- | Re-index documents of a corpus with n
grams in the list
reIndexWith
::
(
HasNodeStory
env
err
m
reIndexWith
::
(
HasNodeStory
env
err
m
,
FlowCmdM
env
err
m
,
FlowCmdM
env
err
m
)
)
...
@@ -153,6 +154,7 @@ reIndexWith :: ( HasNodeStory env err m
...
@@ -153,6 +154,7 @@ reIndexWith :: ( HasNodeStory env err m
->
m
()
->
m
()
reIndexWith
cId
lId
nt
lts
=
do
reIndexWith
cId
lId
nt
lts
=
do
-- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
-- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
-- corpus_node <- getNode cId -- (Proxy :: Proxy HyperdataCorpus)
-- Getting [NgramsTerm]
-- Getting [NgramsTerm]
ts
<-
List
.
concat
ts
<-
List
.
concat
...
@@ -167,23 +169,23 @@ reIndexWith cId lId nt lts = do
...
@@ -167,23 +169,23 @@ reIndexWith cId lId nt lts = do
-- fromListWith (<>)
-- fromListWith (<>)
ngramsByDoc
=
map
(
HashMap
.
fromListWith
(
Map
.
unionWith
(
Map
.
unionWith
(
\
(
_a
,
b
)
(
_a'
,
b'
)
->
(
1
,
b
+
b'
)))))
ngramsByDoc
=
map
(
HashMap
.
fromListWith
(
Map
.
unionWith
(
Map
.
unionWith
(
\
(
_a
,
b
)
(
_a'
,
b'
)
->
(
1
,
b
+
b'
)))))
$
map
(
map
(
\
((
k
,
cnt
),
v
)
->
(
SimpleNgrams
(
text2ngrams
k
),
over
(
traverse
.
traverse
)
(
\
p
->
(
p
,
cnt
))
v
)))
$
map
(
map
(
\
((
k
,
cnt
),
v
)
->
(
SimpleNgrams
(
text2ngrams
k
),
over
(
traverse
.
traverse
)
(
\
p
->
(
p
,
cnt
))
v
)))
$
map
(
docNgrams
nt
ts
)
docs
$
map
(
docNgrams
Nothing
{-here lang-}
nt
ts
)
docs
-- printDebug "ngramsByDoc: " ngramsByDoc
-- Saving the indexation in database
-- Saving the indexation in database
_
<-
mapM
(
saveDocNgramsWith
lId
)
ngramsByDoc
_
<-
mapM
(
saveDocNgramsWith
lId
)
ngramsByDoc
-- _ <- refreshNgramsMaterialized
pure
()
pure
()
docNgrams
::
NgramsType
docNgrams
::
Maybe
Lang
->
NgramsType
->
[
NgramsTerm
]
->
[
NgramsTerm
]
->
Gargantext
.
Database
.
Admin
.
Types
.
Node
.
Context
HyperdataDocument
->
Gargantext
.
Database
.
Admin
.
Types
.
Node
.
Context
HyperdataDocument
->
[((
MatchedText
,
TermsCount
),
->
[((
MatchedText
,
TermsCount
),
Map
NgramsType
(
Map
NodeId
Int
))]
Map
NgramsType
(
Map
NodeId
Int
))]
docNgrams
nt
ts
doc
=
docNgrams
lang
nt
ts
doc
=
List
.
zip
List
.
zip
(
termsInText
(
buildPatterns
$
map
(
\
k
->
(
Text
.
splitOn
" "
$
unNgramsTerm
k
,
[]
))
ts
)
(
termsInText
lang
(
buildPatternsWith
lang
ts
)
$
Text
.
unlines
$
catMaybes
$
Text
.
unlines
$
catMaybes
[
doc
^.
context_hyperdata
.
hd_title
[
doc
^.
context_hyperdata
.
hd_title
,
doc
^.
context_hyperdata
.
hd_abstract
,
doc
^.
context_hyperdata
.
hd_abstract
...
...
src/Gargantext/API/Node/Corpus/New.hs
View file @
cb739183
...
@@ -205,7 +205,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
...
@@ -205,7 +205,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
-- printDebug "[addToCorpusWithQuery] datafield" datafield
-- printDebug "[addToCorpusWithQuery] datafield" datafield
-- printDebug "[addToCorpusWithQuery] flowListWith" flw
-- printDebug "[addToCorpusWithQuery] flowListWith" flw
-- TODO
-- TODO
: update Node Corpus with the Lang
-- n <- getNode cid
-- n <- getNode cid
-- let n.wq_lang = l
-- let n.wq_lang = l
-- saveNode n
-- saveNode n
...
@@ -270,7 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
...
@@ -270,7 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
-- printDebug "[addToCorpusWithForm] fileType" ft
-- printDebug "[addToCorpusWithForm] fileType" ft
-- printDebug "[addToCorpusWithForm] fileFormat" ff
-- printDebug "[addToCorpusWithForm] fileFormat" ff
-- TODO
-- TODO
: update Node Corpus with the Lang
-- n <- getNode cid
-- n <- getNode cid
-- let n.wq_lang = l
-- let n.wq_lang = l
-- saveNode n
-- saveNode n
...
@@ -380,7 +380,7 @@ addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
...
@@ -380,7 +380,7 @@ addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
->
m
()
->
m
()
addToCorpusWithFile
user
cid
nwf
@
(
NewWithFile
_d
_l
fName
)
jobHandle
=
do
addToCorpusWithFile
user
cid
nwf
@
(
NewWithFile
_d
_l
fName
)
jobHandle
=
do
-- TODO
-- TODO
: update Node Corpus with the Lang
-- n <- getNode cid
-- n <- getNode cid
-- let n.wq_lang = l
-- let n.wq_lang = l
-- saveNode n
-- saveNode n
...
...
src/Gargantext/Core/Text/Terms/WithList.hs
View file @
cb739183
...
@@ -18,7 +18,9 @@ module Gargantext.Core.Text.Terms.WithList where
...
@@ -18,7 +18,9 @@ module Gargantext.Core.Text.Terms.WithList where
import
Data.List
(
null
)
import
Data.List
(
null
)
import
Data.Ord
import
Data.Ord
import
Data.Text
(
Text
,
concat
,
unwords
)
import
Data.Text
(
Text
,
concat
,
unwords
)
import
Gargantext.API.Ngrams.Types
(
NgramsTerm
(
..
))
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
ZH
))
import
Gargantext.Core.Text.Context
import
Gargantext.Core.Text.Context
import
Gargantext.Core.Text.Terms.Mono
(
monoTextsBySentence
)
import
Gargantext.Core.Text.Terms.Mono
(
monoTextsBySentence
)
import
Gargantext.Core.Types
(
TermsCount
)
import
Gargantext.Core.Types
(
TermsCount
)
...
@@ -27,6 +29,8 @@ import Prelude (error)
...
@@ -27,6 +29,8 @@ import Prelude (error)
import
qualified
Data.Algorithms.KMP
as
KMP
import
qualified
Data.Algorithms.KMP
as
KMP
import
qualified
Data.IntMap.Strict
as
IntMap
import
qualified
Data.IntMap.Strict
as
IntMap
import
qualified
Data.List
as
List
import
qualified
Data.List
as
List
import
qualified
Data.Text
as
Text
------------------------------------------------------------------------
------------------------------------------------------------------------
data
Pattern
=
Pattern
data
Pattern
=
Pattern
...
@@ -63,6 +67,10 @@ replaceTerms rplaceTerms pats terms = go 0
...
@@ -63,6 +67,10 @@ replaceTerms rplaceTerms pats terms = go 0
merge
(
len1
,
lab1
)
(
len2
,
lab2
)
=
merge
(
len1
,
lab1
)
(
len2
,
lab2
)
=
if
len2
<
len1
then
(
len1
,
lab1
)
else
(
len2
,
lab2
)
if
len2
<
len1
then
(
len1
,
lab1
)
else
(
len2
,
lab2
)
buildPatternsWith
::
Maybe
Lang
->
[
NgramsTerm
]
->
Patterns
buildPatternsWith
(
Just
ZH
)
ts
=
buildPatterns
$
map
(
\
k
->
(
Text
.
chunksOf
1
$
unNgramsTerm
k
,
[]
))
ts
buildPatternsWith
_
ts
=
buildPatterns
$
map
(
\
k
->
(
Text
.
splitOn
" "
$
unNgramsTerm
k
,
[]
))
ts
buildPatterns
::
TermList
->
Patterns
buildPatterns
::
TermList
->
Patterns
buildPatterns
=
sortWith
(
Down
.
_pat_length
)
.
concatMap
buildPattern
buildPatterns
=
sortWith
(
Down
.
_pat_length
)
.
concatMap
buildPattern
where
where
...
@@ -78,14 +86,14 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
...
@@ -78,14 +86,14 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
--------------------------------------------------------------------------
--------------------------------------------------------------------------
-- Utils
-- Utils
type
MatchedText
=
Text
type
MatchedText
=
Text
termsInText
::
Patterns
->
Text
->
[(
MatchedText
,
TermsCount
)]
termsInText
::
Maybe
Lang
->
Patterns
->
Text
->
[(
MatchedText
,
TermsCount
)]
termsInText
pats
txt
=
groupWithCounts
termsInText
(
Just
ZH
)
pats
txt
=
termsInText
Nothing
pats
(
addSpaces
txt
)
termsInText
_
pats
txt
=
groupWithCounts
$
List
.
concat
$
List
.
concat
$
map
(
map
unwords
)
$
map
(
map
unwords
)
$
extractTermsWithList
pats
txt
$
extractTermsWithList
pats
txt
--------------------------------------------------------------------------
--------------------------------------------------------------------------
extractTermsWithList
::
Patterns
->
Text
->
Corpus
[
Text
]
extractTermsWithList
::
Patterns
->
Text
->
Corpus
[
Text
]
extractTermsWithList
pats
=
map
(
replaceTerms
KeepAll
pats
)
.
monoTextsBySentence
extractTermsWithList
pats
=
map
(
replaceTerms
KeepAll
pats
)
.
monoTextsBySentence
...
@@ -97,6 +105,11 @@ extractTermsWithList' :: Patterns -> Text -> [Text]
...
@@ -97,6 +105,11 @@ extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList'
pats
=
map
(
concat
.
map
concat
.
replaceTerms
KeepAll
pats
)
extractTermsWithList'
pats
=
map
(
concat
.
map
concat
.
replaceTerms
KeepAll
pats
)
.
monoTextsBySentence
.
monoTextsBySentence
--------------------------------------------------------------------------
addSpaces
::
Text
->
Text
addSpaces
=
(
Text
.
intercalate
" "
)
.
(
Text
.
chunksOf
1
)
--------------------------------------------------------------------------
--------------------------------------------------------------------------
{- | Not used
{- | Not used
...
...
src/Gargantext/Core/Viz/Phylo/API/Tools.hs
View file @
cb739183
...
@@ -107,8 +107,9 @@ corpusIdtoDocuments timeUnit corpusId = do
...
@@ -107,8 +107,9 @@ corpusIdtoDocuments timeUnit corpusId = do
Just
termList'
->
buildPatterns
termList'
Just
termList'
->
buildPatterns
termList'
pure
$
map
(
toPhyloDocs
patterns
timeUnit
)
(
map
_context_hyperdata
docs
)
pure
$
map
(
toPhyloDocs
patterns
timeUnit
)
(
map
_context_hyperdata
docs
)
-- TODO: Add lang to enable Chinese phylomemy
termsInText'
::
Patterns
->
Text
->
[
Text
]
termsInText'
::
Patterns
->
Text
->
[
Text
]
termsInText'
p
t
=
(
map
fst
)
$
termsInText
p
t
termsInText'
p
t
=
(
map
fst
)
$
termsInText
Nothing
p
t
toPhyloDocs
::
Patterns
->
TimeUnit
->
HyperdataDocument
->
Document
toPhyloDocs
::
Patterns
->
TimeUnit
->
HyperdataDocument
->
Document
toPhyloDocs
patterns
time
d
=
toPhyloDocs
patterns
time
d
=
...
...
src/Gargantext/Core/Viz/Phylo/Legacy/LegacyMain.hs
View file @
cb739183
...
@@ -65,7 +65,7 @@ flowPhylo cId = do
...
@@ -65,7 +65,7 @@ flowPhylo cId = do
patterns
=
buildPatterns
termList
patterns
=
buildPatterns
termList
-- | To filter the Ngrams of a document based on the termList
-- | To filter the Ngrams of a document based on the termList
filterTerms
::
Patterns
->
(
Date
,
Text
)
->
(
Date
,
[
Text
])
filterTerms
::
Patterns
->
(
Date
,
Text
)
->
(
Date
,
[
Text
])
filterTerms
patterns'
(
y
,
d
)
=
(
y
,
fst
<$>
termsInText
patterns'
d
)
filterTerms
patterns'
(
y
,
d
)
=
(
y
,
fst
<$>
termsInText
Nothing
patterns'
d
)
docs
=
map
((
\
(
y
,
t
)
->
Document
y
t
)
.
filterTerms
patterns
)
docs'
docs
=
map
((
\
(
y
,
t
)
->
Document
y
t
)
.
filterTerms
patterns
)
docs'
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment