gargantext / haskell-gargantext

Commit 1049ea25
authored Apr 16, 2018 by Alexandre Delanoë

[FIS][FIX] Frequent Item Set and fix ngrams extraction test.
Parent: 9f29cddb
Showing 8 changed files with 60 additions and 48 deletions.
package.yaml                               +1  -0
src/Gargantext/Ngrams.hs                  +33 -20
src/Gargantext/Ngrams/CoreNLP.hs           +2  -3
src/Gargantext/Ngrams/FrequentItemSet.hs   +2  -4
src/Gargantext/Ngrams/Lang/En.hs          +14 -14
src/Gargantext/Ngrams/Parser.hs            +6  -6
src/Gargantext/Types/Main.hs               +0  -1
stack.yaml                                 +2  -0
package.yaml

@@ -116,6 +116,7 @@ library:
    - servant-swagger-ui
    - servant-static-th
    - split
    - stemmer
    - swagger2
    - tagsoup
    - text-metrics
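The new dependency here appears to be stemmer (a stemmer-0.5.2 pin is also added to stack.yaml below, and Ngrams.hs shows a commented-out NLP.Stemmer import marked "maybe useful later"). A minimal usage sketch, assuming the stemmer package's NLP.Stemmer API with stem :: Stemmer -> String -> String; it is not part of this commit:

import NLP.Stemmer (Stemmer (English), stem)

-- Print the English (Snowball) stem of a few related surface forms;
-- equal stems are the fallback comparison of the new Eq Ngrams instance.
main :: IO ()
main = mapM_ (putStrLn . stem English) ["frequent", "frequency", "frequencies"]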
src/Gargantext/Ngrams.hs

@@ -21,7 +21,7 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
                         , module Gargantext.Ngrams.Occurrences
                         , module Gargantext.Ngrams.TextMining
                         , module Gargantext.Ngrams.Metrics
                         , ngrams, occ, sumOcc, text2fis
                         , Ngrams(..), ngrams, occ, sumOcc, text2fis
                         --, module Gargantext.Ngrams.Words
                         ) where

@@ -40,35 +40,46 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
import Data.Char (Char, isAlpha, isSpace)
import Data.Text (Text, words, filter, toLower)
import Data.Map.Strict (Map, empty, keys
import Data.Map.Strict (Map, empty
                       , insertWith, unionWith, fromList, lookupIndex
                       --, fromList, keys
                       )
import qualified Data.Map.Strict as M (filter)
import Data.Foldable (foldl')
import Gargantext.Prelude hiding (filter)
import qualified Data.List as L (filter)

-- Maybe useful later:
--import NLP.Stemmer (stem, Stemmer(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))

data Ngrams = Ngrams { _ngramsNgrams :: Text, _ngramsStem :: Text } deriving (Show)

instance Eq Ngrams where
  Ngrams n1 s1 == Ngrams n2 s2 = n1 == n2 || s1 == s2

type Occ     = Int
type Index   = Int
type FreqMin = Int
--type Index = Int

ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isGram xs
ngrams xs = monograms $ toLower $ filter isChar xs

monograms :: Text -> [Text]
monograms = words

isGram :: Char -> Bool
isGram '-' = True
isGram '/' = True
isGram  c  = isAlpha c || isSpace c

-- TODO
-- 12-b
isChar :: Char -> Bool
isChar '-' = True
isChar '/' = True
isChar  c  = isAlpha c || isSpace c

-- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ

@@ -78,18 +89,18 @@ occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc xs = foldl' (\x y -> unionWith (+) x y) empty xs

noApax :: Ord a => Map a Occ -> Map a Occ
noApax m = M.filter (>1) m
-- noApax :: Ord a => Map a Occ -> Map a Occ
-- noApax m = M.filter (>1) m

-- | /!\ indexes are not the same:
-- | Index ngrams from Map
indexNgram :: Ord a => Map a Occ -> Map Index a
indexNgram m = fromList (zip [1..] (keys m))
-- indexNgram :: Ord a => Map a Occ -> Map Index a
-- indexNgram m = fromList (zip [1..] (keys m))

-- | Index ngrams from Map
ngramIndex :: Ord a => Map a Occ -> Map a Index
ngramIndex m = fromList (zip (keys m) [1..])
-- ngramIndex :: Ord a => Map a Occ -> Map a Index
-- ngramIndex m = fromList (zip (keys m) [1..])

indexWith :: Ord a => Map a Occ -> [a] -> [Int]
indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs

@@ -101,14 +112,16 @@ indexIt xs = (m, is)
    is = map (indexWith m) xs

list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
list2fis n xs = (m, fs)
list2fis n xs = (m', fs)
  where
    (m, is) = indexIt xs
    m'      = M.filter (>50000) m
    fs      = FIS.all n is

text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fis n xs = list2fis n (map ngrams xs)

text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fisWith = undefined
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined
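The Ngrams record above treats two ngrams as equal when either their surface forms or their stems match; occ and sumOcc build per-document and corpus-wide occurrence maps, and list2fis / text2fis hand indexed transactions to the FIS wrapper. A small, self-contained sketch of the counting step, using only Data.Map.Strict and Data.Text (the names are local to the example, not part of the module):

import           Data.Char       (isAlpha, isSpace)
import           Data.Foldable   (foldl')
import           Data.Map.Strict (Map)
import qualified Data.Map.Strict as M
import           Data.Text       (Text)
import qualified Data.Text       as T

-- Keep letters, spaces, '-' and '/', mirroring the new isChar predicate.
keepChar :: Char -> Bool
keepChar '-' = True
keepChar '/' = True
keepChar c   = isAlpha c || isSpace c

-- Monogram extraction: filter, lower-case, split on whitespace.
tokens :: Text -> [Text]
tokens = T.words . T.toLower . T.filter keepChar

-- Per-document occurrence counts (the role of occ).
countOcc :: Ord a => [a] -> Map a Int
countOcc = foldl' (\m x -> M.insertWith (+) x 1 m) M.empty

-- Merge per-document counts into corpus counts (the role of sumOcc).
sumCounts :: Ord a => [Map a Int] -> Map a Int
sumCounts = foldl' (M.unionWith (+)) M.empty

main :: IO ()
main = print (sumCounts (map (countOcc . tokens) docs))
  where
    docs = map T.pack ["Frequent item sets", "frequent sets of items"]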
src/Gargantext/Ngrams/CoreNLP.hs

@@ -18,7 +18,6 @@ Portability : POSIX
module Gargantext.Ngrams.CoreNLP where

import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import GHC.Generics
import Data.Monoid ((<>))

@@ -65,8 +64,8 @@ $(deriveJSON (unPrefix "_properties") ''Properties)
data Sentences = Sentences { _sentences :: [Sentence]} deriving (Show, Generic)

instance ToJSON Sentences
instance FromJSON Sentences

$(deriveJSON (unPrefix "_") ''Sentences)

-- request =
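The deriveJSON (unPrefix "_") ''Sentences splice generates ToJSON/FromJSON instances for Sentences with the leading underscore stripped from field names. unPrefix is a Gargantext helper not shown in this diff, so the sketch below approximates the same effect with aeson's stock fieldLabelModifier (toy type, not part of the commit):

{-# LANGUAGE TemplateHaskell #-}
import           Data.Aeson       (encode)
import           Data.Aeson.TH    (deriveJSON)
import           Data.Aeson.Types (Options (..), defaultOptions)

newtype Doc = Doc { _docSentences :: [String] }

-- Generate To/FromJSON instances whose JSON keys drop the leading '_',
-- roughly what unPrefix "_" arranges for Sentences in the real module.
$(deriveJSON defaultOptions { fieldLabelModifier = drop 1 } ''Doc)

main :: IO ()
main = print (encode (Doc ["a sentence"]))
-- "{\"docSentences\":[\"a sentence\"]}"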
src/Gargantext/Ngrams/FrequentItemSet.hs

@@ -45,8 +45,8 @@ all f is = fisWith Nothing f is
between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
between (x,y) f is = fisWithSize (Right (x,y)) f is

maximum :: Int -> Frequency -> [[Item]] -> [Fis]
maximum m f is = between (0,m) f is
--maximum :: Int -> Frequency -> [[Item]] -> [Fis]
--maximum m f is = between (0,m) f is

------------------------------------------------------------------------

@@ -67,7 +67,6 @@ items2fis is = case head is of
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
fisWithSize n f is = case n of

@@ -87,4 +86,3 @@ fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
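For intuition about what these wrappers return: a frequent item set is any set of items whose support (the number of transactions containing all of them) reaches the Frequency threshold, optionally bounded in size by between / fisWithSize. The real module delegates to an LCM implementation (runLCMmatrix); the brute-force sketch below only restates the definition and is not part of the commit:

import Data.List (nub, sort, subsequences)

type Item      = Int
type Frequency = Int

-- Brute-force frequent item sets: every non-empty item subset whose
-- support (number of transactions containing it) reaches the threshold.
-- Exponential; only meant to illustrate what the LCM-based code returns.
frequentSets :: Frequency -> [[Item]] -> [[Item]]
frequentSets f txs = [ is | is <- candidates, support is >= f ]
  where
    items      = nub (concat txs)
    candidates = filter (not . null) (subsequences (sort items))
    support is = length [ t | t <- txs, all (`elem` t) is ]

main :: IO ()
main = mapM_ print (frequentSets 2 [[1,2,3],[1,2],[2,3]])
-- [1], [2], [1,2], [3] and [2,3] each reach support 2.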
src/Gargantext/Ngrams/Lang/En.hs

@@ -8,16 +8,16 @@ import Data.Text (Text)
import Data.Monoid ((<>))

selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
selectNgrams xs = filter selectNgrams' xs
selectNgrams xs = filter isNgrams xs
  where
    selectNgrams' (_,"NN"   ,_)        = True
    selectNgrams' (_,"NNS"  ,_)        = True
    selectNgrams' (_,"NNP"  ,_)        = True
    selectNgrams' (_,"NN+CC",_)        = True
    selectNgrams' (_,_,"PERSON")       = True
    selectNgrams' (_,_,"ORGANIZATION") = True
    selectNgrams' (_,_,"LOCATION")     = True
    selectNgrams' (_,_,_)              = False
    isNgrams (_,"NN"   ,_)        = True
    isNgrams (_,"NNS"  ,_)        = True
    isNgrams (_,"NNP"  ,_)        = True
    isNgrams (_,"NN+CC",_)        = True
    isNgrams (_,_,"PERSON")       = True
    isNgrams (_,_,"ORGANIZATION") = True
    isNgrams (_,_,"LOCATION")     = True
    isNgrams (_,_,_)              = False

groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]

@@ -26,7 +26,7 @@ groupNgrams [] = []
groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs)
  where
    jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
    cc  = (c1, "CC", c1')
    jn1 = (j1, "JJ", j1')
    jn2 = jn j2 j3 j2'

@@ -60,14 +60,14 @@ groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN",
-- > should be (antiinflammatory activity) <> (analgesic activity)
groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN" ,yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)

groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN" ,yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)

groupNgrams ((x,_,"PERSON")      :(y,yy,"PERSON")      :xs) = groupNgrams ((x <> " " <> y, yy, "PERSON")      :xs)
groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y, yy, "ORGANIZATION"):xs)
groupNgrams ((x,_,"LOCATION")    :(y,yy,"LOCATION")    :xs) = groupNgrams ((x <> " " <> y, yy, "LOCATION")    :xs)

groupNgrams (x:xs) = (x:(groupNgrams xs))
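To make the pattern style above concrete, here is a reduced, self-contained sketch with just two of the rewrite rules visible in this hunk (noun-preposition-noun merging and consecutive PERSON merging); the real module has more clauses and a different ordering, so outputs of the full function may differ:

{-# LANGUAGE OverloadedStrings #-}
import Data.Monoid ((<>))
import Data.Text   (Text)

-- A token is (surface form, POS tag, NER tag), as in the Gargantext code.
type Token = (Text, Text, Text)

-- Two of the grouping rules shown in the diff, in isolation.
group :: [Token] -> [Token]
group ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs) =
  group ((x <> " " <> o <> " " <> y, "NN", yy) : xs)
group ((x,_,"PERSON"):(y,yy,"PERSON"):xs) =
  group ((x <> " " <> y, yy, "PERSON") : xs)
group (x:xs) = x : group xs
group []     = []

main :: IO ()
main = do
  print (group [("inhibition","NN","O"),("of","IN","O"),("enzyme","NN","O")])
  -- [("inhibition of enzyme","NN","O")]
  print (group [("Marie","NNP","PERSON"),("Curie","NNP","PERSON")])
  -- [("Marie Curie","NNP","PERSON")]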
src/Gargantext/Ngrams/Parser.hs

@@ -8,10 +8,11 @@ import Gargantext.Prelude
import Gargantext.Ngrams.CoreNLP
import Data.Text hiding (map)
import Gargantext.Types.Main (Language(..), Ngrams)
import Gargantext.Types.Main (Language(..))
import qualified Gargantext.Ngrams.Lang.En as En
import qualified Gargantext.Ngrams.Lang.Fr as Fr

type SNgrams = (Text, Text, Text)

-- | Ngrams selection algorithms
-- A form is a list of characters seperated by one or more spaces in a sentence.

@@ -31,11 +32,11 @@ import qualified Gargantext.Ngrams.Lang.Fr as Fr
-- TODO for scientific papers: add maesures
-- TODO add the p score regex
extractNgrams :: Language -> Text -> IO [[Ngrams]]
extractNgrams :: Language -> Text -> IO [[SNgrams]]
extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s

extractNgrams' :: Language -> Text -> IO [[Ngrams]]
extractNgrams' :: Language -> Text -> IO [[SNgrams]]
extractNgrams' lang t = map (map token2text)
                    <$> map _sentenceTokens
                    <$> _sentences

@@ -44,14 +45,13 @@ extractNgrams' lang t = map (map token2text)
-- | This function selects ngrams according to grammars specific
-- of each language.
-- In english, JJ is ADJectiv in french.
selectNgrams :: Language -> [Ngrams] -> [Ngrams]
selectNgrams :: Language -> [SNgrams] -> [SNgrams]
selectNgrams EN = En.selectNgrams
selectNgrams FR = Fr.selectNgrams

-- | This function analyze and groups (or not) ngrams according to
-- grammars specific of each language.
groupNgrams :: Language -> [Ngrams] -> [Ngrams]
groupNgrams :: Language -> [SNgrams] -> [SNgrams]
groupNgrams EN = En.groupNgrams
groupNgrams FR = Fr.groupNgrams
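In extractNgrams', the chain map (map token2text) <$> map _sentenceTokens <$> ... relies on <$> being left-associative: over functions it acts as composition, and the fused function is then mapped over the underlying action. A toy, self-contained illustration of that idiom with stand-in record types (not the real CoreNLP records):

-- Toy stand-ins for the CoreNLP sentence/token records used above.
data Token    = Token    { tokenWord :: String, tokenPos :: String }
data Sentence = Sentence { sentenceTokens :: [Token] }

token2pair :: Token -> (String, String)
token2pair t = (tokenWord t, tokenPos t)

-- Same shape as extractNgrams': <$> over functions is composition,
-- and the fused function is then mapped over the IO action.
extract :: IO [Sentence] -> IO [[(String, String)]]
extract action = map (map token2pair) <$> map sentenceTokens <$> action

main :: IO ()
main = extract (pure [Sentence [Token "frequent" "JJ", Token "sets" "NNS"]]) >>= print
-- [[("frequent","JJ"),("sets","NNS")]]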
src/Gargantext/Types/Main.hs

@@ -163,7 +163,6 @@ nodeTypeId tn = fromMaybe (panic $ pack $ "Typename " <> show tn <> " does not e
-- Temporary types to be removed
type Ngrams = (Text, Text, Text)
type ErrorMessage = Text

-- Queries
stack.yaml

@@ -2,6 +2,7 @@ flags: {}
extra-package-dbs: []
packages:
- .
allow-newer: true
extra-deps:
- git: https://github.com/delanoe/data-time-segment.git

@@ -25,6 +26,7 @@ extra-deps:
- servant-multipart-0.11.1
- servant-server-0.12
- servant-swagger-ui-0.2.3.2.2.8
- stemmer-0.5.2
- text-1.2.3.0
- text-show-3.6.2
resolver: lts-10.6