Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
c9c21779
Verified
Commit
c9c21779
authored
Dec 22, 2023
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[tokenize] working backend version of tokenization/highlighting
parent
e66f7257
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
343 additions
and
15 deletions
+343
-15
gargantext.cabal
gargantext.cabal
+6
-1
Public.hs
src/Gargantext/API/Public.hs
+69
-0
Multi.hs
src/Gargantext/Core/Text/Terms/Multi.hs
+18
-0
CoreNLP.hs
src/Gargantext/Core/Text/Terms/Multi/CoreNLP.hs
+1
-0
Tokenize.hs
src/Gargantext/Core/Text/Terms/Tokenize.hs
+108
-10
Types.hs
src/Gargantext/Core/Text/Terms/Tokenize/Types.hs
+99
-0
Types.hs
src/Gargantext/Core/Types.hs
+17
-4
Array.hs
src/Gargantext/Utils/Array.hs
+22
-0
Main.hs
test/drivers/tasty/Main.hs
+3
-0
No files found.
gargantext.cabal
View file @
c9c21779
...
@@ -106,6 +106,8 @@ library
...
@@ -106,6 +106,8 @@ library
Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr
Gargantext.Core.Text.Terms.Multi.Lang.Fr
Gargantext.Core.Text.Terms.Multi.RAKE
Gargantext.Core.Text.Terms.Multi.RAKE
Gargantext.Core.Text.Terms.Tokenize
Gargantext.Core.Text.Terms.Tokenize.Types
Gargantext.Core.Text.Terms.WithList
Gargantext.Core.Text.Terms.WithList
Gargantext.Core.Types
Gargantext.Core.Types
Gargantext.Core.Types.Individu
Gargantext.Core.Types.Individu
...
@@ -152,6 +154,7 @@ library
...
@@ -152,6 +154,7 @@ library
Gargantext.Database.Schema.User
Gargantext.Database.Schema.User
Gargantext.Defaults
Gargantext.Defaults
Gargantext.System.Logging
Gargantext.System.Logging
Gargantext.Utils.Array
Gargantext.Utils.Dict
Gargantext.Utils.Dict
Gargantext.Utils.Jobs
Gargantext.Utils.Jobs
Gargantext.Utils.Jobs.Internal
Gargantext.Utils.Jobs.Internal
...
@@ -275,7 +278,6 @@ library
...
@@ -275,7 +278,6 @@ library
Gargantext.Core.Text.Terms.Multi.Group
Gargantext.Core.Text.Terms.Multi.Group
Gargantext.Core.Text.Terms.Multi.CoreNLP
Gargantext.Core.Text.Terms.Multi.CoreNLP
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Text.Terms.Tokenize
Gargantext.Core.Text.Upload
Gargantext.Core.Text.Upload
Gargantext.Core.Types.Search
Gargantext.Core.Types.Search
Gargantext.Core.Utils.DateUtils
Gargantext.Core.Utils.DateUtils
...
@@ -436,6 +438,7 @@ library
...
@@ -436,6 +438,7 @@ library
, crawlerPubMed
, crawlerPubMed
, cron ^>= 0.7.0
, cron ^>= 0.7.0
, cryptohash ^>= 0.11.9
, cryptohash ^>= 0.11.9
, data-interval ^>= 2.1.1
, data-time-segment ^>= 0.1.0.0
, data-time-segment ^>= 0.1.0.0
, deepseq ^>= 1.4.4.0
, deepseq ^>= 1.4.4.0
, directory ^>= 1.3.6.0
, directory ^>= 1.3.6.0
...
@@ -444,6 +447,7 @@ library
...
@@ -444,6 +447,7 @@ library
, ekg-json ^>= 0.1.0.7
, ekg-json ^>= 0.1.0.7
, epo-api-client
, epo-api-client
, exceptions ^>= 0.10.4
, exceptions ^>= 0.10.4
, extended-reals ^>= 0.2.4.0
, extra ^>= 1.7.9
, extra ^>= 1.7.9
, fast-logger ^>= 3.0.5
, fast-logger ^>= 3.0.5
, fclabels ^>= 2.0.5
, fclabels ^>= 2.0.5
...
@@ -934,6 +938,7 @@ test-suite garg-test-tasty
...
@@ -934,6 +938,7 @@ test-suite garg-test-tasty
Test.Core.Text.Corpus.Query
Test.Core.Text.Corpus.Query
Test.Core.Text.Examples
Test.Core.Text.Examples
Test.Core.Text.Flow
Test.Core.Text.Flow
Test.Core.Text.Tokenize
Test.Core.Utils
Test.Core.Utils
Test.Database.Operations
Test.Database.Operations
Test.Database.Operations.DocumentSearch
Test.Database.Operations.DocumentSearch
...
...
src/Gargantext/API/Public.hs
View file @
c9c21779
...
@@ -11,18 +11,25 @@ Portability : POSIX
...
@@ -11,18 +11,25 @@ Portability : POSIX
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
module
Gargantext.API.Public
module
Gargantext.API.Public
where
where
import
Control.Lens
((
^?
),
(
^.
),
_Just
)
import
Control.Lens
((
^?
),
(
^.
),
_Just
)
import
Data.Aeson
import
Data.Aeson
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.List
qualified
as
List
import
Data.List
qualified
as
List
import
Data.Map.Strict
qualified
as
Map
import
Data.Map.Strict
qualified
as
Map
import
Data.Set
qualified
as
Set
import
Data.Set
qualified
as
Set
import
Data.Swagger
hiding
(
title
,
url
)
import
Data.Swagger
hiding
(
title
,
url
)
import
Gargantext.API.Ngrams.Types
(
NgramsTerm
(
..
))
import
Gargantext.API.Node.File
import
Gargantext.API.Node.File
import
Gargantext.API.Prelude
import
Gargantext.API.Prelude
import
Gargantext.Core
(
Lang
)
import
Gargantext.Core.Text.Terms.Tokenize
qualified
as
Tokenize
import
Gargantext.Core.Text.Terms.Tokenize.Types
qualified
as
Tokenize
import
Gargantext.Core.Types
(
TokenTag
(
..
))
import
Gargantext.Core.Utils.DateUtils
(
utc2year
)
import
Gargantext.Core.Utils.DateUtils
(
utc2year
)
import
Gargantext.Database.Admin.Types.Hyperdata
import
Gargantext.Database.Admin.Types.Hyperdata
import
Gargantext.Database.Admin.Types.Hyperdata.CorpusField
import
Gargantext.Database.Admin.Types.Hyperdata.CorpusField
...
@@ -32,6 +39,7 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
...
@@ -32,6 +39,7 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import
Gargantext.Database.Query.Table.NodeNode
(
selectPublicNodes
)
import
Gargantext.Database.Query.Table.NodeNode
(
selectPublicNodes
)
import
Gargantext.Database.Schema.Node
-- (NodePoly(..))
import
Gargantext.Database.Schema.Node
-- (NodePoly(..))
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Core.Utils.Prefix
(
unPrefix
,
unPrefixSwagger
)
import
Gargantext.Utils.Aeson
qualified
as
GUA
import
Gargantext.Utils.Aeson
qualified
as
GUA
import
Servant
import
Servant
import
Test.QuickCheck
(
elements
)
import
Test.QuickCheck
(
elements
)
...
@@ -40,10 +48,12 @@ import Test.QuickCheck.Arbitrary
...
@@ -40,10 +48,12 @@ import Test.QuickCheck.Arbitrary
------------------------------------------------------------------------
------------------------------------------------------------------------
type
API
=
API_Home
type
API
=
API_Home
:<|>
API_Node
:<|>
API_Node
:<|>
API_NLP
api
::
Text
->
GargServer
API
api
::
Text
->
GargServer
API
api
baseUrl
=
(
api_home
baseUrl
)
api
baseUrl
=
(
api_home
baseUrl
)
:<|>
api_node
:<|>
api_node
:<|>
api_nlp
-------------------------------------------------------------------------
-------------------------------------------------------------------------
type
API_Home
=
Summary
" Public Home API"
type
API_Home
=
Summary
" Public Home API"
...
@@ -70,6 +80,60 @@ api_node nId = do
...
@@ -70,6 +80,60 @@ api_node nId = do
-------------------------------------------------------------------------
-------------------------------------------------------------------------
type
API_NLP
=
Summary
" NLP"
:>
"nlp"
:>
(
"tokenize"
:>
ReqBody
'[
J
SON
]
TokenizeData
:>
Post
'[
J
SON
]
[
TokenTag
]
:<|>
"highlight"
:>
ReqBody
'[
J
SON
]
HighlightData
:>
Post
'[
J
SON
]
[
Tokenize
.
HighlightedTerm
]
:<|>
"highlight-total"
:>
ReqBody
'[
J
SON
]
HighlightData
:>
Post
'[
J
SON
]
[
Tokenize
.
HighlightResult
]
)
api_nlp
::
GargServer
API_NLP
api_nlp
=
api_tokenize
:<|>
api_highlight
:<|>
api_highlight_total
data
TokenizeData
=
TokenizeData
{
_td_lang
::
Lang
,
_td_text
::
Text
}
deriving
(
Show
,
Eq
,
Generic
)
instance
ToSchema
TokenizeData
where
declareNamedSchema
=
genericDeclareNamedSchema
(
unPrefixSwagger
"_td_"
)
api_tokenize
::
(
GargServerC
env
err
m
)
=>
TokenizeData
->
m
[
TokenTag
]
api_tokenize
(
TokenizeData
{
..
})
=
do
ret
<-
Tokenize
.
tokenize
_td_lang
_td_text
liftBase
$
putText
$
"[tokenize] ret: "
<>
show
ret
pure
ret
data
HighlightData
=
HighlightData
{
_hd_lang
::
Lang
,
_hd_text
::
Text
,
_hd_terms
::
[
Text
]
}
deriving
(
Show
,
Eq
,
Generic
)
instance
ToSchema
HighlightData
where
declareNamedSchema
=
genericDeclareNamedSchema
(
unPrefixSwagger
"_hd_"
)
api_highlight
::
(
GargServerC
env
err
m
)
=>
HighlightData
->
m
[
Tokenize
.
HighlightedTerm
]
api_highlight
(
HighlightData
{
..
})
=
do
Tokenize
.
highlightTerms
(
NgramsTerm
<$>
_hd_terms
)
_hd_lang
_hd_text
-- | Similar to 'api_highlight' but returns an array containing the
-- whole text, i.e. if it's not highlighted, it is still part of the
-- returned array.
api_highlight_total
::
(
GargServerC
env
err
m
)
=>
HighlightData
->
m
[
Tokenize
.
HighlightResult
]
api_highlight_total
hd
@
(
HighlightData
{
..
})
=
do
hts
<-
api_highlight
hd
liftBase
$
putText
$
"[api_highlight_total] _hd_text: "
<>
_hd_text
pure
$
Tokenize
.
fillHighlightGaps
hts
_hd_text
-------------------------------------------------------------------------
selectPublic
::
HasNodeError
err
selectPublic
::
HasNodeError
err
=>
DBCmd
err
[(
Node
HyperdataFolder
,
Maybe
Int
)]
=>
DBCmd
err
[(
Node
HyperdataFolder
,
Maybe
Int
)]
...
@@ -153,3 +217,8 @@ defaultPublicData =
...
@@ -153,3 +217,8 @@ defaultPublicData =
,
date
=
"YY/MM/DD"
,
date
=
"YY/MM/DD"
,
database
=
"database"
,
database
=
"database"
,
author
=
"Author"
}
,
author
=
"Author"
}
$
(
deriveJSON
(
unPrefix
"_td_"
)
''
T
okenizeData
)
$
(
deriveJSON
(
unPrefix
"_hd_"
)
''
H
ighlightData
)
src/Gargantext/Core/Text/Terms/Multi.hs
View file @
c9c21779
...
@@ -17,6 +17,7 @@ module Gargantext.Core.Text.Terms.Multi
...
@@ -17,6 +17,7 @@ module Gargantext.Core.Text.Terms.Multi
(
multiterms
(
multiterms
,
multiterms_rake
,
multiterms_rake
,
tokenTags
,
tokenTags
,
tokenTagsNoGroup
,
cleanTextForNLP
)
,
cleanTextForNLP
)
where
where
...
@@ -76,6 +77,23 @@ tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
...
@@ -76,6 +77,23 @@ tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
tokenTags
_
l
_
=
panicTrace
$
"[G.C.T.T.Multi] Lang NLP API not implemented yet "
<>
(
show
l
)
tokenTags
_
l
_
=
panicTrace
$
"[G.C.T.T.Multi] Lang NLP API not implemented yet "
<>
(
show
l
)
tokenTagsNoGroup
::
NLPServerConfig
->
Lang
->
Text
->
IO
[[
TokenTag
]]
tokenTagsNoGroup
(
NLPServerConfig
{
server
=
CoreNLP
,
url
})
EN
txt
=
do
CoreNLP
.
coreNLPTokenTags
url
EN
txt
tokenTagsNoGroup
(
NLPServerConfig
{
server
=
CoreNLP
,
url
})
FR
txt
=
do
CoreNLP
.
coreNLPTokenTags
url
FR
txt
tokenTagsNoGroup
(
NLPServerConfig
{
server
=
Spacy
,
url
})
_l
txt
=
do
-- printDebug "NLP Debug" txt
SpacyNLP
.
nlpTokenTags
url
txt
-- tokenTagsWith l txt $ SpacyNLP.nlp url
-- tokenTags FR txt = do
-- -- printDebug "[Spacy Debug]" txt
-- if txt == ""
-- then pure [[]]
-- else tokenTagsWith FR txt SpacyNLP.nlp
tokenTagsNoGroup
_
l
_
=
panicTrace
$
"[G.C.T.T.Multi] Lang NLP API not implemented yet "
<>
(
show
l
)
---- | This function analyses and groups (or not) ngrams according to
---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language.
---- specific grammars of each language.
...
...
src/Gargantext/Core/Text/Terms/Multi/CoreNLP.hs
View file @
c9c21779
...
@@ -147,6 +147,7 @@ corenlp uri lang txt = do
...
@@ -147,6 +147,7 @@ corenlp uri lang txt = do
coreNLPTokenTags
::
URI
->
Lang
->
Text
->
IO
[[
TokenTag
]]
coreNLPTokenTags
::
URI
->
Lang
->
Text
->
IO
[[
TokenTag
]]
coreNLPTokenTags
uri
lang
txt
=
do
coreNLPTokenTags
uri
lang
txt
=
do
document
<-
corenlp
uri
lang
txt
document
<-
corenlp
uri
lang
txt
putText
$
"[coreNLPTokenTags] document: "
<>
show
document
pure
$
map
tokens2tokensTags
$
allTokens
document
pure
$
map
tokens2tokensTags
$
allTokens
document
-- | parseWith
-- | parseWith
...
...
src/Gargantext/Core/Text/Terms/Tokenize.hs
View file @
c9c21779
...
@@ -9,21 +9,119 @@ Portability : POSIX
...
@@ -9,21 +9,119 @@ Portability : POSIX
-}
-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE TemplateHaskell #-}
module
Gargantext.Core.Text.Terms.Tokenize
module
Gargantext.Core.Text.Terms.Tokenize
where
where
import
Control.Lens
(
view
)
import
Control.Lens
(
view
)
-- over
import
Control.Monad.Trans.Control
(
MonadBaseControl
)
import
Data.ExtendedReal
(
Extended
(
..
))
import
Gargantext.Core
(
Lang
)
import
Data.Interval
((
<=..<=
))
import
Gargantext.Core.NLP
(
nlpServerGet
,
HasNLPServer
)
import
Data.Interval
qualified
as
I
import
Gargantext.Core.Text.Terms.Multi
(
tokenTags
)
import
Data.IntervalSet
qualified
as
IS
import
Gargantext.Core.Types
(
TokenTag
(
..
))
import
Data.Set
qualified
as
Set
import
Data.Text
qualified
as
T
import
Gargantext.API.Ngrams.Types
(
NgramsTerm
(
..
))
import
Gargantext.Core
(
Lang
,
NLPServerConfig
(
..
))
--, PosTagAlgo(CoreNLP))
import
Gargantext.Core.NLP
(
nlpServerGet
)
import
Gargantext.Core.Text.Terms.Multi
(
tokenTagsNoGroup
)
import
Gargantext.Core.Text.Terms.Tokenize.Types
import
Gargantext.Core.Types
(
TokenTag
(
..
),
POS
(
..
))
--, my_token_offset_end)
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Utils.Array
(
window
)
tokenize
::
(
HasNLPServer
env
-- | Just pick an NLP server and tokenize the given string using given
,
MonadReader
env
m
-- language.
,
MonadBaseControl
IO
m
)
=>
Lang
->
Text
->
m
[
TokenTag
]
tokenize
::
HasTokenizer
env
m
=>
Lang
->
Text
->
m
[
TokenTag
]
tokenize
lang
txt
=
do
tokenize
lang
txt
=
do
nlp
<-
view
(
nlpServerGet
lang
)
nlp
<-
view
(
nlpServerGet
lang
)
liftBase
$
concat
<$>
tokenTags
nlp
lang
txt
ret
<-
liftBase
$
concat
<$>
tokenTagsNoGroup
nlp
lang
txt
let
f
=
case
server
nlp
of
-- CoreNLP -> over my_token_offset_end (\o -> o - 1)
_
->
identity
pure
$
f
<$>
ret
-------
-- | This function, given a list of 'NgramsTerm' and a text,
-- highlights these terms using the 'tokenize' function above.
highlightTerms
::
HasTokenizer
env
m
=>
[
NgramsTerm
]
->
Lang
->
Text
->
m
[
HighlightedTerm
]
highlightTerms
ngramsTerms
lang
txt
=
do
txtTokens'
<-
tokenize
lang
txt
let
txtTokens
=
relevantTokens
txtTokens'
liftBase
$
putText
$
"[highlightTerms] txtTokens: "
<>
show
txtTokens
tokenizedTerms
<-
mapM
tokenizeTerms
ngramsTerms
liftBase
$
putText
$
"[highlightTerms] tokenizedTerms: "
<>
show
tokenizedTerms
-- TODO This isn't the most optimal, of O(n*m) complexity. One can
-- try to compute hashes, incrementally, for the windowed tokens
let
ht
=
highlight
txt
txtTokens
<$>
tokenizedTerms
pure
$
catMaybes
$
concat
ht
where
tokenizeTerms
::
HasTokenizer
env
m
=>
NgramsTerm
->
m
(
NgramsTerm
,
[
TokenTag
])
tokenizeTerms
t
=
do
tt'
<-
tokenize
lang
$
unNgramsTerm
t
let
tt
=
relevantTokens
tt'
pure
(
t
,
tt
)
-- | Fills in all "gaps" created by 'highlightTerms', i.e. inserts
-- text parts where there are no highlights.
fillHighlightGaps
::
[
HighlightedTerm
]
->
Text
->
[
HighlightResult
]
fillHighlightGaps
hts
txt
=
sortBy
compareHR
((
HRHighlighted
<$>
hts
)
<>
gapHt
)
where
txtInt
=
IS
.
singleton
(
Finite
0
<=..<=
(
Finite
$
T
.
length
txt
))
compareHR
hr1
hr2
=
compare
(
I
.
lowerBound
$
hrToInterval
hr1
)
(
I
.
lowerBound
$
hrToInterval
hr2
)
htIntervals
=
IS
.
fromList
(
htToInterval
<$>
hts
)
intDiff
=
IS
.
toList
(
IS
.
difference
txtInt
htIntervals
)
gapHt
=
HRNormal
<$>
intervalToNt
txt
<$>
intDiff
------- UTILITY FUNCTIONS
-- | Keep only relevant tokens for token highlight. This is because
-- things like hyphens etc prevent us from highlighting terms
-- separated e.g. with dashes.
relevantTokens
::
[
TokenTag
]
->
[
TokenTag
]
relevantTokens
=
filter
f
where
f
(
TokenTag
{
..
})
=
case
_my_token_pos
of
Just
(
NotFound
{
})
->
False
_
->
True
highlight
::
Text
->
[
TokenTag
]
->
(
NgramsTerm
,
[
TokenTag
])
->
[
Maybe
HighlightedTerm
]
highlight
txt
txtTokens
(
ngramsTerm
,
tokenizedTerms
)
=
highlightInWindow
(
ngramsTerm
,
tokenizedTerms
)
<$>
(
window
(
length
tokenizedTerms
)
txtTokens
)
where
highlightInWindow
::
(
NgramsTerm
,
[
TokenTag
])
->
[
TokenTag
]
->
Maybe
HighlightedTerm
highlightInWindow
(
nt
,
tt
)
windowTxtTokens
=
case
(
compareSets
(
_my_token_lemma
<$>
tt
)
(
_my_token_lemma
<$>
windowTxtTokens
)
,
head
windowTxtTokens
,
lastMay
windowTxtTokens
)
of
(
True
,
Just
h
,
Just
l
)
->
let
(
lb
,
ub
)
=
(
_my_token_offset_begin
h
,
_my_token_offset_end
l
)
in
Just
(
HighlightedTerm
{
_ht_term
=
unNgramsTerm
nt
,
_ht_original_text
=
T
.
take
(
ub
-
lb
)
$
T
.
drop
lb
txt
,
_ht_start
=
lb
,
_ht_end
=
ub
})
_
->
Nothing
intersects
::
Ord
a
=>
Set
a
->
Set
a
->
Bool
intersects
s1
s2
=
not
$
Set
.
disjoint
s1
s2
-- | We treat lemmas as equal, if sets intersect. This is a comparison
-- function for a list of such sets.
compareSets
::
Ord
a
=>
[
Set
a
]
->
[
Set
a
]
->
Bool
compareSets
ss1
ss2
=
(
length
ss1
==
length
ss2
)
&&
all
(
\
(
s1
,
s2
)
->
intersects
s1
s2
)
(
zip
ss1
ss2
)
src/Gargantext/Core/Text/Terms/Tokenize/Types.hs
0 → 100644
View file @
c9c21779
{-|
Module : Gargantext.Core.Text.Terms.Tokenize.Types
Description : String tokenization
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE TemplateHaskell #-}
module
Gargantext.Core.Text.Terms.Tokenize.Types
where
import
Control.Monad.Trans.Control
(
MonadBaseControl
)
import
Data.Aeson.TH
(
defaultOptions
,
deriveJSON
)
import
Data.ExtendedReal
(
Extended
(
..
))
import
Data.Interval
((
<=..<=
))
import
Data.Interval
qualified
as
I
import
Data.Swagger
(
ToSchema
,
declareNamedSchema
,
defaultSchemaOptions
,
genericDeclareNamedSchema
,
genericDeclareNamedSchemaUnrestricted
)
import
Data.Text
qualified
as
T
import
Gargantext.Core.NLP
(
HasNLPServer
)
import
Gargantext.Core.Utils.Prefix
(
unPrefix
,
unPrefixSwagger
)
import
Gargantext.Prelude
type
HasTokenizer
env
m
=
(
HasNLPServer
env
,
MonadReader
env
m
,
MonadBaseControl
IO
m
)
------------------------------
-- NOTE: To highlight terms, we actually need to know what these terms
-- are. Terms consist of compounds of, possibly, multiple tokens and
-- it's not the same as NLP tokenization.
-- https://en.wikipedia.org/wiki/Terminology_extraction
data
HighlightedTerm
=
HighlightedTerm
{
_ht_term
::
Text
,
_ht_original_text
::
Text
,
_ht_start
::
Int
-- start position of the term
,
_ht_end
::
Int
-- end position of the term
}
deriving
(
Show
,
Eq
,
Generic
)
instance
ToSchema
HighlightedTerm
where
declareNamedSchema
=
genericDeclareNamedSchema
(
unPrefixSwagger
"_ht_"
)
htToInterval
::
HighlightedTerm
->
I
.
Interval
Int
htToInterval
(
HighlightedTerm
{
..
})
=
(
Finite
_ht_start
)
<=..<=
(
Finite
_ht_end
)
------------------------------
data
NormalText
=
NormalText
{
_nt_text
::
Text
,
_nt_start
::
Int
,
_nt_end
::
Int
}
deriving
(
Show
,
Eq
,
Generic
)
instance
ToSchema
NormalText
where
declareNamedSchema
=
genericDeclareNamedSchema
(
unPrefixSwagger
"_nt_"
)
ntToInterval
::
NormalText
->
I
.
Interval
Int
ntToInterval
(
NormalText
{
..
})
=
(
Finite
_nt_start
)
<=..<=
(
Finite
_nt_end
)
intervalToNt
::
Text
->
I
.
Interval
Int
->
NormalText
intervalToNt
txt
int
=
NormalText
{
_nt_text
=
T
.
take
(
ub
-
lb
)
$
T
.
drop
lb
txt
,
_nt_start
=
lb
,
_nt_end
=
ub
}
where
lb'
=
I
.
lowerBound
int
lb
=
case
lb'
of
Finite
l
->
l
_
->
0
ub'
=
I
.
upperBound
int
ub
=
case
ub'
of
Finite
u
->
u
_
->
0
------------------------------
data
HighlightResult
=
HRHighlighted
HighlightedTerm
|
HRNormal
NormalText
deriving
(
Show
,
Eq
,
Generic
)
instance
ToSchema
HighlightResult
where
declareNamedSchema
=
genericDeclareNamedSchemaUnrestricted
defaultSchemaOptions
hrToInterval
::
HighlightResult
->
I
.
Interval
Int
hrToInterval
(
HRHighlighted
ht
)
=
htToInterval
ht
hrToInterval
(
HRNormal
nt
)
=
ntToInterval
nt
$
(
deriveJSON
(
unPrefix
"_ht_"
)
''
H
ighlightedTerm
)
$
(
deriveJSON
(
unPrefix
"_nt_"
)
''
N
ormalText
)
$
(
deriveJSON
defaultOptions
''
H
ighlightResult
)
src/Gargantext/Core/Types.hs
View file @
c9c21779
...
@@ -22,6 +22,12 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
...
@@ -22,6 +22,12 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
,
DebugMode
(
..
),
withDebugMode
,
DebugMode
(
..
),
withDebugMode
,
Term
(
..
),
Terms
(
..
),
TermsCount
,
TermsWithCount
,
Term
(
..
),
Terms
(
..
),
TermsCount
,
TermsWithCount
,
TokenTag
(
..
),
POS
(
..
),
NER
(
..
)
,
TokenTag
(
..
),
POS
(
..
),
NER
(
..
)
,
my_token_offset_begin
,
my_token_offset_end
,
my_token_lemma
,
my_token_ner
,
my_token_pos
,
my_token_word
,
combineTokenTags
,
emptyTokenTag
,
combineTokenTags
,
emptyTokenTag
,
Label
,
Stems
,
Label
,
Stems
,
HasValidationError
(
..
),
assertValid
,
HasValidationError
(
..
),
assertValid
...
@@ -39,13 +45,13 @@ import Data.Maybe
...
@@ -39,13 +45,13 @@ import Data.Maybe
import
Data.Monoid
import
Data.Monoid
import
Data.Set
(
empty
)
import
Data.Set
(
empty
)
import
Data.String
import
Data.String
import
Data.Swagger
(
ToParamSchema
)
import
Data.Swagger
(
ToParamSchema
,
defaultSchemaOptions
,
genericDeclareNamedSchema
,
genericDeclareNamedSchemaUnrestricted
)
import
Data.Swagger
(
ToSchema
(
..
))
import
Data.Swagger
(
ToSchema
(
..
))
import
Data.Text
(
unpack
)
import
Data.Text
(
unpack
)
import
Data.Validity
import
Data.Validity
import
GHC.Generics
import
GHC.Generics
import
Gargantext.Core.Types.Main
import
Gargantext.Core.Types.Main
import
Gargantext.Core.Utils.Prefix
(
unPrefix
,
wellNamedSchema
)
import
Gargantext.Core.Utils.Prefix
(
unPrefix
,
unPrefixSwagger
,
wellNamedSchema
)
import
Gargantext.Database.Admin.Types.Node
import
Gargantext.Database.Admin.Types.Node
import
Gargantext.Prelude
hiding
(
Ordering
,
empty
)
import
Gargantext.Prelude
hiding
(
Ordering
,
empty
)
import
Test.QuickCheck.Arbitrary
(
Arbitrary
,
arbitrary
)
import
Test.QuickCheck.Arbitrary
(
Arbitrary
,
arbitrary
)
...
@@ -133,9 +139,11 @@ instance FromJSON POS where
...
@@ -133,9 +139,11 @@ instance FromJSON POS where
instance
ToJSON
POS
instance
ToJSON
POS
instance
Hashable
POS
instance
Hashable
POS
instance
ToSchema
POS
where
declareNamedSchema
=
genericDeclareNamedSchemaUnrestricted
defaultSchemaOptions
------------------------------------------------------------------------
------------------------------------------------------------------------
data
NER
=
PERSON
|
ORGANIZATION
|
LOCATION
|
NoNER
{
noNer
::
!
Text
}
data
NER
=
PERSON
|
ORGANIZATION
|
LOCATION
|
NoNER
{
noNer
::
!
Text
}
deriving
(
Show
,
Generic
)
deriving
(
Show
,
Eq
,
Generic
)
------------------------------------------------------------------------
------------------------------------------------------------------------
instance
FromJSON
NER
where
instance
FromJSON
NER
where
parseJSON
=
withText
"String"
(
\
x
->
pure
(
ner
$
unpack
x
))
parseJSON
=
withText
"String"
(
\
x
->
pure
(
ner
$
unpack
x
))
...
@@ -149,6 +157,8 @@ instance FromJSON NER where
...
@@ -149,6 +157,8 @@ instance FromJSON NER where
ner
x
=
NoNER
(
cs
x
)
ner
x
=
NoNER
(
cs
x
)
instance
ToJSON
NER
instance
ToJSON
NER
instance
ToSchema
NER
where
declareNamedSchema
=
genericDeclareNamedSchemaUnrestricted
defaultSchemaOptions
data
TokenTag
=
TokenTag
{
_my_token_word
::
[
Text
]
data
TokenTag
=
TokenTag
{
_my_token_word
::
[
Text
]
,
_my_token_lemma
::
Set
Text
,
_my_token_lemma
::
Set
Text
...
@@ -156,8 +166,11 @@ data TokenTag = TokenTag { _my_token_word :: [Text]
...
@@ -156,8 +166,11 @@ data TokenTag = TokenTag { _my_token_word :: [Text]
,
_my_token_ner
::
Maybe
NER
,
_my_token_ner
::
Maybe
NER
,
_my_token_offset_begin
::
Int
,
_my_token_offset_begin
::
Int
,
_my_token_offset_end
::
Int
,
_my_token_offset_end
::
Int
}
deriving
(
Show
)
}
deriving
(
Show
,
Eq
,
Generic
)
instance
ToSchema
TokenTag
where
declareNamedSchema
=
genericDeclareNamedSchema
(
unPrefixSwagger
"_my_token_"
)
$
(
deriveJSON
(
unPrefix
"_my_token_"
)
''
T
okenTag
)
$
(
deriveJSON
(
unPrefix
"_my_token_"
)
''
T
okenTag
)
makeLenses
''
T
okenTag
-- | NOTE: Combining 'TokenTag' doesn't make much sense to me. And
-- | NOTE: Combining 'TokenTag' doesn't make much sense to me. And
-- lemma combining is just wrong. You can't just "cat" <> "woman" to
-- lemma combining is just wrong. You can't just "cat" <> "woman" to
...
...
src/Gargantext/Utils/Array.hs
0 → 100644
View file @
c9c21779
{-|
Module : Gargantext.Utils.Array
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Utilities for handling arrays.
-}
module
Gargantext.Utils.Array
where
import
Protolude
-- | A sliding window of given size for an array.
-- https://stackoverflow.com/questions/27726739/implementing-an-efficient-sliding-window-algorithm-in-haskell
window
::
Int
->
[
a
]
->
[[
a
]]
window
size
=
foldr
(
zipWith
(
:
))
(
repeat
[]
)
.
take
size
.
tails
test/drivers/tasty/Main.hs
View file @
c9c21779
...
@@ -14,6 +14,7 @@ import Gargantext.Prelude
...
@@ -14,6 +14,7 @@ import Gargantext.Prelude
import
qualified
Test.Core.Text.Corpus.Query
as
CorpusQuery
import
qualified
Test.Core.Text.Corpus.Query
as
CorpusQuery
import
qualified
Test.Core.Utils
as
Utils
import
qualified
Test.Core.Utils
as
Utils
import
qualified
Test.Core.Text.Tokenize
as
Tokenize
import
qualified
Test.Graph.Clustering
as
Graph
import
qualified
Test.Graph.Clustering
as
Graph
import
qualified
Test.Ngrams.NLP
as
NLP
import
qualified
Test.Ngrams.NLP
as
NLP
import
qualified
Test.Ngrams.Query
as
NgramsQuery
import
qualified
Test.Ngrams.Query
as
NgramsQuery
...
@@ -35,6 +36,7 @@ main = do
...
@@ -35,6 +36,7 @@ main = do
cryptoSpec
<-
testSpec
"Crypto"
Crypto
.
test
cryptoSpec
<-
testSpec
"Crypto"
Crypto
.
test
nlpSpec
<-
testSpec
"NLP"
NLP
.
test
nlpSpec
<-
testSpec
"NLP"
NLP
.
test
jobsSpec
<-
testSpec
"Jobs"
Jobs
.
test
jobsSpec
<-
testSpec
"Jobs"
Jobs
.
test
tokenizeSpec
<-
testSpec
"Tokenize"
Tokenize
.
test
defaultMain
$
testGroup
"Gargantext"
defaultMain
$
testGroup
"Gargantext"
[
utilSpec
[
utilSpec
...
@@ -44,6 +46,7 @@ main = do
...
@@ -44,6 +46,7 @@ main = do
,
cryptoSpec
,
cryptoSpec
,
nlpSpec
,
nlpSpec
,
jobsSpec
,
jobsSpec
,
tokenizeSpec
,
NgramsQuery
.
tests
,
NgramsQuery
.
tests
,
CorpusQuery
.
tests
,
CorpusQuery
.
tests
,
JSON
.
tests
,
JSON
.
tests
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment