Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
153
Issues
153
List
Board
Labels
Milestones
Merge Requests
7
Merge Requests
7
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
cf12ea50
Verified
Commit
cf12ea50
authored
Jan 15, 2025
by
Przemyslaw Kaminski
1
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[ngrams] declarative ngrams demo
parent
b3910bb4
Pipeline
#7218
passed with stages
in 72 minutes and 35 seconds
Changes
4
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
194 additions
and
28 deletions
+194
-28
gargantext.cabal
gargantext.cabal
+2
-0
DeclarativeNgrams.hs
src/Gargantext/Core/Text/DeclarativeNgrams.hs
+101
-0
DeclarativeNgrams.hs
test/Test/Core/Text/DeclarativeNgrams.hs
+60
-0
Main.hs
test/drivers/tasty/Main.hs
+31
-28
No files found.
gargantext.cabal
View file @
cf12ea50
...
...
@@ -213,6 +213,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.TSV
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.DeclarativeNgrams
Gargantext.Core.Text.List
Gargantext.Core.Text.List.Formats.TSV
Gargantext.Core.Text.List.Group.WithStem
...
...
@@ -793,6 +794,7 @@ test-suite garg-test-tasty
Test.Core.Text
Test.Core.Text.Corpus.Query
Test.Core.Text.Corpus.TSV
Test.Core.Text.DeclarativeNgrams
Test.Core.Text.Examples
Test.Core.Text.Flow
Test.Core.Utils
...
...
src/Gargantext/Core/Text/DeclarativeNgrams.hs
0 → 100644
View file @
cf12ea50
{-|
Module : Gargantext.Core.Text.DeclarativeNgrams
Description : "Declarative" ngrams, i.e. plain text with optional modifiers
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
See https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/386#note_12413
-}
{-# OPTIONS_GHC -Wno-deprecations #-}
-- warning about 'undefined' usage is thrown
module
Gargantext.Core.Text.DeclarativeNgrams
where
import
Data.Text
qualified
as
T
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Prelude
data
Ngrams
=
Ngrams
{
terms
::
Text
,
declaration
::
Declaration
}
deriving
(
Eq
,
Show
)
data
Declaration
=
Exact
|
CaseInsensitive
|
SingularPlural
Lang
-- TODO: SingularPlural Lang
-- TODO: FrenchDeterminers (https://en.wikipedia.org/wiki/French_articles_and_determiners)
-- TODO Regex
-- TODO Combine CaseInsensitive and SingluarPlural
-- TODO Combine CaseInsensitive and FrenchDeterminers
deriving
(
Eq
,
Show
)
data
NgramsWithPos
=
NgramsWithPos
{
ngrams
::
Ngrams
,
match
::
Text
,
pos
::
Int
}
deriving
(
Eq
,
Show
)
-- | TODO Use parsec for this?
-- e.g. matching case-insensitive:
-- https://stackoverflow.com/questions/12937325/whats-the-cleanest-way-to-do-case-insensitive-parsing-with-text-combinators-par
findNgrams
::
[
Ngrams
]
->
Text
->
[
NgramsWithPos
]
findNgrams
[]
_txt
=
[]
findNgrams
(
ng
:
ngs
)
txt
=
findNgrams'
ng
txt
0
<>
(
findNgrams
ngs
txt
)
findNgrams'
::
Ngrams
->
Text
->
Int
->
[
NgramsWithPos
]
findNgrams'
_
""
_
=
[]
findNgrams'
ngrams
@
(
Ngrams
{
terms
,
declaration
})
txt
initialPos
=
foldl
(
<>
)
[]
$
map
handleBreak
(
breakOnDeclaration
declaration
terms
txt
)
where
handleBreak
::
(
Text
,
Text
,
Text
)
->
[
NgramsWithPos
]
handleBreak
(
s
,
matching
,
_e
)
=
let
pos
=
initialPos
+
T
.
length
s
txt'
=
T
.
drop
(
T
.
length
s
)
txt
match
=
T
.
take
(
T
.
length
matching
)
txt'
ngramsWithPos
=
NgramsWithPos
{
ngrams
,
pos
,
match
}
in
(
ngramsWithPos
:
(
findNgrams'
ngrams
(
T
.
drop
(
T
.
length
matching
)
txt'
)
(
pos
+
T
.
length
matching
)))
-- | Split declaration function in spirit of `T.breakOn` + Declaration type
breakOnDeclaration
::
Declaration
->
Text
->
Text
->
[(
Text
,
Text
,
Text
)]
breakOnDeclaration
Exact
terms
txt
=
case
T
.
breakOn
terms
txt
of
(
_
,
""
)
->
[]
(
s
,
e
)
->
[(
s
,
terms
,
T
.
drop
(
T
.
length
terms
)
e
)]
breakOnDeclaration
CaseInsensitive
terms
txt
=
case
T
.
breakOn
(
T
.
toLower
terms
)
(
T
.
toLower
txt
)
of
(
_
,
""
)
->
[]
(
s
,
e
)
->
[(
s
,
T
.
take
(
T
.
length
terms
)
e
,
T
.
drop
(
T
.
length
terms
)
e
)]
breakOnDeclaration
(
SingularPlural
lang
)
terms
txt
=
singularOrPlural
where
pf
=
pluralForm
lang
terms
-- | Singular is a bit tricky, because it's a "subset" of plural,
-- without the "s", so we filter things out.
-- singular = filter (\(_s, m, e) -> T.take (T.length pf) (m <> e) /= pf) $ breakOnDeclaration Exact terms txt
singular
=
breakOnDeclaration
Exact
terms
txt
plural
=
breakOnDeclaration
Exact
pf
txt
singularOrPlural
=
case
(
singular
,
plural
)
of
([
s
],
[
p
])
->
let
(
ss
,
_
,
_
)
=
s
(
sp
,
_
,
_
)
=
p
in
if
T
.
length
ss
<
T
.
length
sp
then
[
s
]
else
[
p
]
([
s
],
_
)
->
[
s
]
(
_
,
p
)
->
p
pluralForm
::
Lang
->
Text
->
Text
pluralForm
EN
"door"
=
"door"
pluralForm
EN
"sheep"
=
"sheep"
pluralForm
EN
term
=
term
<>
"s"
pluralForm
_
_
=
undefined
test/Test/Core/Text/DeclarativeNgrams.hs
0 → 100644
View file @
cf12ea50
{-|
Module : Test.Core.Text.DecarativeNgrams
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module
Test.Core.Text.DeclarativeNgrams
where
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.DeclarativeNgrams
import
Gargantext.Prelude
import
Test.Hspec
-- | Core.Text.DeclarativeNgrams tests
test
::
Spec
test
=
do
describe
"test text matching"
$
do
it
"exact match works"
$
do
let
ngrams
=
Ngrams
{
terms
=
"hello"
,
declaration
=
Exact
}
findNgrams
[
ngrams
]
"hello world, hello"
`
shouldBe
`
[
NgramsWithPos
{
ngrams
,
pos
=
0
,
match
=
"hello"
}
,
NgramsWithPos
{
ngrams
,
pos
=
13
,
match
=
"hello"
}
]
findNgrams
[
ngrams
]
"hello world, Hello"
`
shouldBe
`
[
NgramsWithPos
{
ngrams
,
pos
=
0
,
match
=
"hello"
}
]
findNgrams
[
ngrams
]
"hello world, hello, hello"
`
shouldBe
`
[
NgramsWithPos
{
ngrams
,
pos
=
0
,
match
=
"hello"
}
,
NgramsWithPos
{
ngrams
,
pos
=
13
,
match
=
"hello"
}
,
NgramsWithPos
{
ngrams
,
pos
=
20
,
match
=
"hello"
}
]
it
"case insensitive works"
$
do
let
ngrams
=
Ngrams
{
terms
=
"Hello"
,
declaration
=
CaseInsensitive
}
findNgrams
[
ngrams
]
"hello world, hello"
`
shouldBe
`
[
NgramsWithPos
{
ngrams
,
pos
=
0
,
match
=
"hello"
}
,
NgramsWithPos
{
ngrams
,
pos
=
13
,
match
=
"hello"
}
]
findNgrams
[
ngrams
]
"hello world, Hello"
`
shouldBe
`
[
NgramsWithPos
{
ngrams
,
pos
=
0
,
match
=
"hello"
}
,
NgramsWithPos
{
ngrams
,
pos
=
13
,
match
=
"Hello"
}
]
findNgrams
[
ngrams
]
"hello world, Hello, hEllo"
`
shouldBe
`
[
NgramsWithPos
{
ngrams
,
pos
=
0
,
match
=
"hello"
}
,
NgramsWithPos
{
ngrams
,
pos
=
13
,
match
=
"Hello"
}
,
NgramsWithPos
{
ngrams
,
pos
=
20
,
match
=
"hEllo"
}
]
it
"singular/plural works"
$
do
let
ngrams
=
Ngrams
{
terms
=
"world"
,
declaration
=
SingularPlural
EN
}
findNgrams
[
ngrams
]
"hello world, hello"
`
shouldBe
`
[
NgramsWithPos
{
ngrams
,
pos
=
6
,
match
=
"world"
}
]
findNgrams
[
ngrams
]
"hello worlds, hello"
`
shouldBe
`
[
NgramsWithPos
{
ngrams
,
pos
=
6
,
match
=
"worlds"
}
]
findNgrams
[
ngrams
]
"hello world, Hello worlds"
`
shouldBe
`
[
NgramsWithPos
{
ngrams
,
pos
=
6
,
match
=
"world"
}
,
NgramsWithPos
{
ngrams
,
pos
=
19
,
match
=
"worlds"
}
]
findNgrams
[
ngrams
]
"hello world, Hello, World"
`
shouldBe
`
[
NgramsWithPos
{
ngrams
,
pos
=
6
,
match
=
"world"
}
]
findNgrams
[
ngrams
]
"hello worlds, Hello, world"
`
shouldBe
`
[
NgramsWithPos
{
ngrams
,
pos
=
6
,
match
=
"worlds"
}
,
NgramsWithPos
{
ngrams
,
pos
=
21
,
match
=
"world"
}]
test/drivers/tasty/Main.hs
View file @
cf12ea50
...
...
@@ -12,25 +12,26 @@ module Main where
import
Gargantext.Prelude
import
qualified
Test.Core.Notifications
as
Notifications
import
qualified
Test.Core.Orchestrator
as
Orchestrator
import
qualified
Test.Core.Similarity
as
Similarity
import
qualified
Test.Core.Text.Corpus.Query
as
CorpusQuery
import
qualified
Test.Core.Text.Corpus.TSV
as
TSVParser
import
qualified
Test.Core.Utils
as
Utils
import
qualified
Test.Core.Worker
as
Worker
import
qualified
Test.Graph.Clustering
as
Graph
import
qualified
Test.Ngrams.Lang.Occurrences
as
Occurrences
import
qualified
Test.Ngrams.NLP
as
NLP
import
qualified
Test.Ngrams.Query
as
NgramsQuery
import
qualified
Test.Ngrams.Terms
as
NgramsTerms
import
qualified
Test.Offline.Errors
as
Errors
import
qualified
Test.Offline.JSON
as
JSON
import
qualified
Test.Offline.Phylo
as
Phylo
import
qualified
Test.Offline.Stemming.Lancaster
as
Lancaster
import
qualified
Test.Parsers.Date
as
PD
import
qualified
Test.Utils.Crypto
as
Crypto
import
qualified
Test.Utils.Jobs
as
Jobs
import
qualified
Test.Core.Notifications
as
Notifications
import
qualified
Test.Core.Orchestrator
as
Orchestrator
import
qualified
Test.Core.Similarity
as
Similarity
import
qualified
Test.Core.Text.Corpus.Query
as
CorpusQuery
import
qualified
Test.Core.Text.Corpus.TSV
as
TSVParser
import
qualified
Test.Core.Text.DeclarativeNgrams
as
DNgrams
import
qualified
Test.Core.Utils
as
Utils
import
qualified
Test.Core.Worker
as
Worker
import
qualified
Test.Graph.Clustering
as
Graph
import
qualified
Test.Ngrams.Lang.Occurrences
as
Occurrences
import
qualified
Test.Ngrams.NLP
as
NLP
import
qualified
Test.Ngrams.Query
as
NgramsQuery
import
qualified
Test.Ngrams.Terms
as
NgramsTerms
import
qualified
Test.Offline.Errors
as
Errors
import
qualified
Test.Offline.JSON
as
JSON
import
qualified
Test.Offline.Phylo
as
Phylo
import
qualified
Test.Offline.Stemming.Lancaster
as
Lancaster
import
qualified
Test.Parsers.Date
as
PD
import
qualified
Test.Utils.Crypto
as
Crypto
import
qualified
Test.Utils.Jobs
as
Jobs
import
System.IO
(
hGetBuffering
,
hSetBuffering
)
import
Test.Tasty
...
...
@@ -46,15 +47,16 @@ protectStdoutBuffering action =
main
::
IO
()
main
=
do
utilSpec
<-
testSpec
"Utils"
Utils
.
test
clusteringSpec
<-
testSpec
"Graph Clustering"
Graph
.
test
dateSplitSpec
<-
testSpec
"Date split"
PD
.
testDateSplit
cryptoSpec
<-
testSpec
"Crypto"
Crypto
.
test
nlpSpec
<-
testSpec
"NLP"
NLP
.
test
jobsSpec
<-
testSpec
"Jobs"
Jobs
.
test
similaritySpec
<-
testSpec
"Similarity"
Similarity
.
test
asyncUpdatesSpec
<-
testSpec
"Notifications"
Notifications
.
test
occurrencesSpec
<-
testSpec
"Occurrences"
Occurrences
.
test
utilSpec
<-
testSpec
"Utils"
Utils
.
test
clusteringSpec
<-
testSpec
"Graph Clustering"
Graph
.
test
dateSplitSpec
<-
testSpec
"Date split"
PD
.
testDateSplit
cryptoSpec
<-
testSpec
"Crypto"
Crypto
.
test
nlpSpec
<-
testSpec
"NLP"
NLP
.
test
jobsSpec
<-
testSpec
"Jobs"
Jobs
.
test
similaritySpec
<-
testSpec
"Similarity"
Similarity
.
test
asyncUpdatesSpec
<-
testSpec
"Notifications"
Notifications
.
test
occurrencesSpec
<-
testSpec
"Occurrences"
Occurrences
.
test
declarativeNgramsSpec
<-
testSpec
"Declarative Ngrams"
DNgrams
.
test
protectStdoutBuffering
$
defaultMain
$
testGroup
"Gargantext"
[
utilSpec
...
...
@@ -78,4 +80,5 @@ main = do
,
Notifications
.
qcTests
,
Orchestrator
.
qcTests
,
NgramsTerms
.
tests
,
declarativeNgramsSpec
]
Przemyslaw Kaminski
@cgenie
mentioned in issue
#386 (closed)
·
Jan 15, 2025
mentioned in issue
#386 (closed)
mentioned in issue #386
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment