Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
160
Issues
160
List
Board
Labels
Milestones
Merge Requests
14
Merge Requests
14
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
85f1dffe
Verified
Commit
85f1dffe
authored
Dec 30, 2024
by
Przemyslaw Kaminski
1
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[ngrams] refactor opaque Int into TermsWeight newtype
parent
a81ea049
Pipeline
#7176
passed with stages
in 47 minutes and 37 seconds
Changes
6
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
28 additions
and
43 deletions
+28
-43
Terms.hs
src/Gargantext/Core/Text/Terms.hs
+2
-2
Types.hs
src/Gargantext/Core/Types.hs
+4
-1
Flow.hs
src/Gargantext/Database/Action/Flow.hs
+5
-5
Extract.hs
src/Gargantext/Database/Action/Flow/Extract.hs
+4
-4
Types.hs
src/Gargantext/Database/Action/Flow/Types.hs
+2
-2
Utils.hs
src/Gargantext/Database/Action/Flow/Utils.hs
+11
-29
No files found.
src/Gargantext/Core/Text/Terms.hs
View file @
85f1dffe
...
...
@@ -52,7 +52,7 @@ import Gargantext.Core.Text.Terms.Mono (monoTerms)
import
Gargantext.Core.Text.Terms.Mono.Stem
(
stem
,
StemmingAlgorithm
(
..
))
import
Gargantext.Core.Text.Terms.Mono.Token.En
(
tokenize
)
import
Gargantext.Core.Text.Terms.Multi
(
multiterms
)
import
Gargantext.Core.Types
(
TermsCount
,
POS
,
Terms
(
..
),
TermsWithCount
)
import
Gargantext.Core.Types
(
TermsCount
,
TermsWeight
,
POS
,
Terms
(
..
),
TermsWithCount
)
import
Gargantext.Core.Utils
(
groupWithCounts
)
import
Gargantext.Database.Prelude
(
DBCmd
)
import
Gargantext.Database.Query.Table.Ngrams
(
insertNgrams
)
...
...
@@ -122,7 +122,7 @@ class ExtractNgramsT h
=>
NLPServerConfig
->
TermType
Lang
->
h
->
DBCmd
err
(
HashMap
ExtractedNgrams
(
Map
NgramsType
In
t
,
TermsCount
))
->
DBCmd
err
(
HashMap
ExtractedNgrams
(
Map
NgramsType
TermsWeigh
t
,
TermsCount
))
------------------------------------------------------------------------
enrichedTerms
::
Lang
->
PosTagAlgo
->
POS
->
Terms
->
NgramsPostag
enrichedTerms
l
pa
po
(
Terms
{
..
})
=
...
...
src/Gargantext/Core/Types.hs
View file @
85f1dffe
...
...
@@ -20,7 +20,7 @@ commentary with @some markup@.
module
Gargantext.Core.Types
(
module
Gargantext
.
Core
.
Types
.
Main
,
module
Gargantext
.
Database
.
Admin
.
Types
.
Node
,
DebugMode
(
..
),
withDebugMode
,
Term
(
..
),
Terms
(
..
),
TermsCount
,
TermsWithCount
,
Term
(
..
),
Terms
(
..
),
TermsCount
,
TermsW
eight
(
..
),
TermsW
ithCount
,
TokenTag
(
..
),
POS
(
..
),
NER
(
..
)
,
Label
,
Stems
,
HasValidationError
(
..
),
assertValid
...
...
@@ -74,6 +74,9 @@ type TermsCount = Int
type
TermsWithCount
=
(
Terms
,
TermsCount
)
newtype
TermsWeight
=
TermsWeight
{
unTermsWeight
::
Int
}
deriving
newtype
(
Eq
,
Ord
,
Num
,
Show
)
------------------------------------------------------------------------
data
Tag
=
POS
|
NER
deriving
(
Show
,
Eq
)
...
...
src/Gargantext/Database/Action/Flow.hs
View file @
85f1dffe
...
...
@@ -82,7 +82,7 @@ import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import
Gargantext.Core.Text.Ngrams
(
NgramsType
(
NgramsTerms
),
Ngrams
(
_ngramsTerms
))
import
Gargantext.Core.Text.Terms
import
Gargantext.Core.Text.Terms.Mono.Stem
(
stem
,
StemmingAlgorithm
(
..
))
import
Gargantext.Core.Types
(
HasValidationError
,
TermsCount
)
import
Gargantext.Core.Types
(
HasValidationError
,
TermsCount
,
TermsWeight
(
..
)
)
import
Gargantext.Core.Types.Individu
(
User
(
..
))
import
Gargantext.Core.Types.Main
(
ListType
(
MapTerm
)
)
import
Gargantext.Database.Action.Flow.Extract
()
-- ExtractNgramsT instances
...
...
@@ -430,7 +430,7 @@ insertMasterDocs ncs c lang hs = do
-- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs'
::
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
(
Map
NodeId
(
In
t
,
TermsCount
)))
mapNgramsDocs'
::
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
(
Map
NodeId
(
TermsWeigh
t
,
TermsCount
)))
<-
mapNodeIdNgrams
<$>
documentIdWithNgrams
(
extractNgramsT
ncs
$
withLang
lang
documentsWithId
)
...
...
@@ -445,7 +445,7 @@ insertMasterDocs ncs c lang hs = do
saveDocNgramsWith
::
(
IsDBCmd
env
err
m
)
=>
ListId
->
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
(
Map
NodeId
(
In
t
,
TermsCount
)))
->
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
(
Map
NodeId
(
TermsWeigh
t
,
TermsCount
)))
->
m
()
saveDocNgramsWith
lId
mapNgramsDocs'
=
do
--printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
...
...
@@ -453,7 +453,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
-- mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
(
terms2id
::
HashMap
.
HashMap
Text
NgramsId
)
<-
insertExtractedNgrams
$
HashMap
.
keys
mapNgramsDocs'
let
mapNgramsDocs
::
HashMap
.
HashMap
Ngrams
(
Map
NgramsType
(
Map
NodeId
(
In
t
,
TermsCount
)))
let
mapNgramsDocs
::
HashMap
.
HashMap
Ngrams
(
Map
NgramsType
(
Map
NodeId
(
TermsWeigh
t
,
TermsCount
)))
mapNgramsDocs
=
HashMap
.
mapKeys
extracted2ngrams
mapNgramsDocs'
-- new
...
...
@@ -465,7 +465,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
-- insertDocNgrams
let
ngrams2insert
=
catMaybes
[
ContextNodeNgrams2
(
nodeId2ContextId
nId
)
<$>
getCgramsId
mapCgramsId
ngrams_type
(
_ngramsTerms
terms''
)
<*>
Just
(
fromIntegral
w
::
Double
)
<*>
Just
(
fromIntegral
$
unTermsWeight
w
::
Double
)
|
(
terms''
,
mapNgramsTypes
)
<-
HashMap
.
toList
mapNgramsDocs
,
(
ngrams_type
,
mapNodeIdWeight
)
<-
Map
.
toList
mapNgramsTypes
,
(
nId
,
(
w
,
_cnt
))
<-
Map
.
toList
mapNodeIdWeight
...
...
src/Gargantext/Database/Action/Flow/Extract.hs
View file @
85f1dffe
...
...
@@ -25,7 +25,7 @@ import Gargantext.Core.Text (HasText(..))
import
Gargantext.Core.Text.Corpus.Parsers
(
splitOn
)
import
Gargantext.Core.Text.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Core.Text.Terms
(
ExtractNgramsT
,
ExtractedNgrams
(
..
),
TermType
,
cleanExtractedNgrams
,
enrichedTerms
,
extractNgramsT
,
extractTerms
,
tt_lang
)
import
Gargantext.Core.Types
(
POS
(
NP
),
TermsCount
)
import
Gargantext.Core.Types
(
POS
(
NP
),
TermsCount
,
TermsWeight
)
import
Gargantext.Database.Admin.Types.Hyperdata.Contact
(
HyperdataContact
,
cw_lastName
,
hc_who
)
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
,
hd_authors
,
hd_bdd
,
hd_institutes
,
hd_source
)
import
Gargantext.Database.Admin.Types.Node
(
Node
)
...
...
@@ -42,7 +42,7 @@ instance ExtractNgramsT HyperdataContact
extractNgramsT
_ncs
l
hc
=
HashMap
.
mapKeys
(
cleanExtractedNgrams
255
)
<$>
extract
l
hc
where
extract
::
TermType
Lang
->
HyperdataContact
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
In
t
,
TermsCount
))
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
TermsWeigh
t
,
TermsCount
))
extract
_l
hc'
=
do
let
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
\
a
->
[
a
])
...
...
@@ -59,11 +59,11 @@ instance ExtractNgramsT HyperdataDocument
extractNgramsT
::
NLPServerConfig
->
TermType
Lang
->
HyperdataDocument
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
In
t
,
TermsCount
))
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
TermsWeigh
t
,
TermsCount
))
extractNgramsT
ncs
lang
hd
=
HashMap
.
mapKeys
(
cleanExtractedNgrams
255
)
<$>
extractNgramsT'
hd
where
extractNgramsT'
::
HyperdataDocument
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
In
t
,
TermsCount
))
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
TermsWeigh
t
,
TermsCount
))
extractNgramsT'
doc
=
do
let
source
=
text2ngrams
$
maybe
"Nothing"
identity
...
...
src/Gargantext/Database/Action/Flow/Types.hs
View file @
85f1dffe
...
...
@@ -26,7 +26,7 @@ import Gargantext.Core.Text ( HasText )
import
Gargantext.API.Admin.Orchestrator.Types
qualified
as
API
import
Gargantext.Core.Text.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Core.Text.Terms
(
ExtractNgramsT
)
import
Gargantext.Core.Types
(
HasValidationError
,
TermsCount
)
import
Gargantext.Core.Types
(
HasValidationError
,
TermsCount
,
TermsWeight
)
import
Gargantext.Core.Utils.Prefix
(
unPrefix
,
unPrefixSwagger
)
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
)
import
Gargantext.Database.Admin.Types.Node
(
NodeId
)
...
...
@@ -67,7 +67,7 @@ type FlowInsertDB a = ( AddUniqId a
data
DocumentIdWithNgrams
a
b
=
DocumentIdWithNgrams
{
documentWithId
::
Indexed
NodeId
a
,
documentNgrams
::
HashMap
b
(
Map
NgramsType
In
t
,
TermsCount
)
,
documentNgrams
::
HashMap
b
(
Map
NgramsType
TermsWeigh
t
,
TermsCount
)
}
deriving
(
Show
)
...
...
src/Gargantext/Database/Action/Flow/Utils.hs
View file @
85f1dffe
...
...
@@ -28,7 +28,7 @@ import Gargantext.Core.Flow.Types (UniqId, uniqId)
import
Gargantext.Core.Text.Ngrams
(
Ngrams
,
NgramsType
)
import
Gargantext.Core.Text.Terms
(
ExtractedNgrams
(
..
))
import
Gargantext.Core.Text.Terms.WithList
(
MatchedText
,
buildPatternsWith
,
termsInText
)
import
Gargantext.Core.Types
(
TermsCount
)
import
Gargantext.Core.Types
(
TermsCount
,
TermsWeight
(
..
)
)
import
Gargantext.Core.Utils
(
addTuples
)
import
Gargantext.Data.HashMap.Strict.Utils
qualified
as
HashMap
import
Gargantext.Database.Action.Flow.Types
(
DocumentIdWithNgrams
(
..
),
FlowInsertDB
)
...
...
@@ -47,7 +47,7 @@ import Gargantext.Prelude.Crypto.Hash (Hash)
insertDocNgrams
::
ListId
->
HashMap
(
Indexed
NgramsId
Ngrams
)
(
Map
NgramsType
(
Map
DocId
(
In
t
,
TermsCount
)))
->
HashMap
(
Indexed
NgramsId
Ngrams
)
(
Map
NgramsType
(
Map
DocId
(
TermsWeigh
t
,
TermsCount
)))
->
DBCmd
err
Int
insertDocNgrams
lId
m
=
do
-- printDebug "[insertDocNgrams] ns" ns
...
...
@@ -56,11 +56,11 @@ insertDocNgrams lId m = do
ns
=
[
ContextNodeNgrams
(
nodeId2ContextId
docId
)
lId
(
ng
^.
index
)
(
NgramsTypeId
$
toDBid
t
)
(
fromIntegral
i
)
(
fromIntegral
$
unTermsWeight
w
)
cnt
|
(
ng
,
t2n2i
)
<-
HashMap
.
toList
m
,
(
t
,
n2i
)
<-
DM
.
toList
t2n2i
,
(
docId
,
(
i
,
cnt
))
<-
DM
.
toList
n2i
,
(
docId
,
(
w
,
cnt
))
<-
DM
.
toList
n2i
]
-- [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})]
...
...
@@ -88,8 +88,8 @@ docNgrams lang ts doc =
documentIdWithNgrams
::
HasNodeError
err
=>
(
a
->
DBCmd
err
(
HashMap
.
HashMap
b
(
Map
NgramsType
Int
,
TermsCount
))
)
=>
(
a
->
DBCmd
err
(
HashMap
.
HashMap
b
(
Map
NgramsType
TermsWeight
,
TermsCount
))
)
->
[
Indexed
NodeId
a
]
->
DBCmd
err
[
DocumentIdWithNgrams
a
b
]
documentIdWithNgrams
f
=
traverse
toDocumentIdWithNgrams
...
...
@@ -104,7 +104,7 @@ mapNodeIdNgrams :: (Ord b, Hashable b)
=>
[
DocumentIdWithNgrams
a
b
]
->
HashMap
.
HashMap
b
(
Map
NgramsType
(
Map
NodeId
(
In
t
,
TermsCount
))
(
Map
NodeId
(
TermsWeigh
t
,
TermsCount
))
)
mapNodeIdNgrams
=
HashMap
.
unionsWith
(
DM
.
unionWith
(
DM
.
unionWith
addTuples
))
.
fmap
f
where
...
...
@@ -113,8 +113,8 @@ mapNodeIdNgrams = HashMap.unionsWith (DM.unionWith (DM.unionWith addTuples)) . f
-- for it (which is the number of times the terms appears in a
-- document) is copied over to all its types.
f
::
DocumentIdWithNgrams
a
b
->
HashMap
.
HashMap
b
(
Map
NgramsType
(
Map
NodeId
(
In
t
,
TermsCount
)))
f
d
=
fmap
(
\
(
ngramsTypeMap
,
cnt
)
->
fmap
(
\
i
->
DM
.
singleton
nId
(
i
,
cnt
))
ngramsTypeMap
)
$
documentNgrams
d
->
HashMap
.
HashMap
b
(
Map
NgramsType
(
Map
NodeId
(
TermsWeigh
t
,
TermsCount
)))
f
d
=
fmap
(
\
(
ngramsTypeMap
,
cnt
)
->
fmap
(
\
w
->
DM
.
singleton
nId
(
w
,
cnt
))
ngramsTypeMap
)
$
documentNgrams
d
where
nId
=
_index
$
documentWithId
d
...
...
@@ -192,35 +192,17 @@ ngramsByDoc :: Lang
->
NgramsType
->
[
NT
.
NgramsTerm
]
->
[
ContextOnlyId
HyperdataDocument
]
->
[
HashMap
.
HashMap
ExtractedNgrams
(
DM
.
Map
NgramsType
(
Map
NodeId
(
In
t
,
TermsCount
)))]
->
[
HashMap
.
HashMap
ExtractedNgrams
(
DM
.
Map
NgramsType
(
Map
NodeId
(
TermsWeigh
t
,
TermsCount
)))]
ngramsByDoc
l
nt
ts
docs
=
ngramsByDoc'
l
nt
ts
<$>
docs
-- | Given list of terms and a document, produce a map for this doc's
-- terms count and weights. Notice that the weight is always 1 here.
-- ngramsByDoc' :: Lang
-- -> NgramsType
-- -> [NT.NgramsTerm]
-- -> ContextOnlyId HyperdataDocument
-- -> HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (Int, TermsCount)))
-- ngramsByDoc' l nt ts doc =
-- HashMap.fromListWith (DM.unionWith (DM.unionWith (\(_a,b) (_a',b') -> (1,b+b')))) withExtractedNgrams
-- where
-- _docNgrams' :: ([(MatchedText, TermsCount)], NodeId)
-- _docNgrams'@(matched, nId) = (docNgrams l ts doc, doc ^. context_oid_id)
-- withExtractedNgrams :: [(ExtractedNgrams, Map NgramsType (Map NodeId (Int, TermsCount)))]
-- withExtractedNgrams =
-- map (\(matchedText, cnt) ->
-- ( SimpleNgrams (text2ngrams matchedText)
-- , DM.singleton nt $ DM.singleton nId (1, cnt) ) ) matched
ngramsByDoc'
::
Lang
->
NgramsType
->
[
NT
.
NgramsTerm
]
->
ContextOnlyId
HyperdataDocument
->
HashMap
.
HashMap
ExtractedNgrams
(
DM
.
Map
NgramsType
(
Map
NodeId
(
In
t
,
TermsCount
)))
->
HashMap
.
HashMap
ExtractedNgrams
(
DM
.
Map
NgramsType
(
Map
NodeId
(
TermsWeigh
t
,
TermsCount
)))
ngramsByDoc'
l
nt
ts
doc
=
HashMap
.
map
(
\
cnt
->
DM
.
singleton
nt
$
DM
.
singleton
nId
(
1
,
cnt
))
extractedMap
where
...
...
Przemyslaw Kaminski
@cgenie
mentioned in commit
03b33383
·
Jan 30, 2025
mentioned in commit
03b33383
mentioned in commit 03b33383dd67c1821a4edb4628923cf7bd039d90
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment