haskell-gargantext, commit 1049ea25
Authored Apr 16, 2018 by Alexandre Delanoë. Parent: 9f29cddb.

[FIS][FIX] Frequent Item Set and fix ngrams extraction test.

Showing 8 changed files with 60 additions and 48 deletions (+60 -48):
package.yaml                               +1  -0
src/Gargantext/Ngrams.hs                   +33 -20
src/Gargantext/Ngrams/CoreNLP.hs           +2  -3
src/Gargantext/Ngrams/FrequentItemSet.hs   +2  -4
src/Gargantext/Ngrams/Lang/En.hs           +14 -14
src/Gargantext/Ngrams/Parser.hs            +6  -6
src/Gargantext/Types/Main.hs               +0  -1
stack.yaml                                 +2  -0
package.yaml  (+1 -0)

@@ -116,6 +116,7 @@ library:
     - servant-swagger-ui
     - servant-static-th
     - split
+    - stemmer
     - swagger2
     - tagsoup
     - text-metrics
src/Gargantext/Ngrams.hs  (+33 -20)

@@ -21,7 +21,7 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
                          , module Gargantext.Ngrams.Occurrences
                          , module Gargantext.Ngrams.TextMining
                          , module Gargantext.Ngrams.Metrics
-                         , ngrams, occ, sumOcc, text2fis
+                         , Ngrams(..), ngrams, occ, sumOcc, text2fis
                          --, module Gargantext.Ngrams.Words
                          ) where

@@ -40,35 +40,46 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
 import Data.Char (Char, isAlpha, isSpace)
 import Data.Text (Text, words, filter, toLower)
-import Data.Map.Strict  (Map, empty, keys
+import Data.Map.Strict  (Map, empty
                         , insertWith, unionWith, fromList, lookupIndex
                         --, fromList, keys
                         )
 import qualified Data.Map.Strict as M (filter)
 import Data.Foldable (foldl')
 import Gargantext.Prelude hiding (filter)
 import qualified Data.List as L (filter)

 -- Maybe useful later:
 --import NLP.Stemmer (stem, Stemmer(..))
 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
 --import Language.Aspell.Options (ACOption(..))

+data Ngrams = Ngrams { _ngramsNgrams :: Text
+                     , _ngramsStem   :: Text
+                     } deriving (Show)
+
+instance Eq Ngrams where
+  Ngrams n1 s1 == Ngrams n2 s2 = n1 == n2 || s1 == s2

 type Occ     = Int
-type Index   = Int
 type FreqMin = Int
+--type Index = Int

 ngrams :: Text -> [Text]
-ngrams xs = monograms $ toLower $ filter isGram xs
+ngrams xs = monograms $ toLower $ filter isChar xs

 monograms :: Text -> [Text]
 monograms = words

-isGram :: Char -> Bool
-isGram '-' = True
-isGram '/' = True
-isGram  c  = isAlpha c || isSpace c
+-- TODO
+-- 12-b
+isChar :: Char -> Bool
+isChar '-' = True
+isChar '/' = True
+isChar  c  = isAlpha c || isSpace c

 -- | Compute the occurrences (occ)
 occ :: Ord a => [a] -> Map a Occ

@@ -78,18 +89,18 @@ occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
 sumOcc :: Ord a => [Map a Occ] -> Map a Occ
 sumOcc xs = foldl' (\x y -> unionWith (+) x y) empty xs

-noApax :: Ord a => Map a Occ -> Map a Occ
-noApax m = M.filter (>1) m
+--noApax :: Ord a => Map a Occ -> Map a Occ
+--noApax m = M.filter (>1) m

 -- | /!\ indexes are not the same:

 -- | Index ngrams from Map
-indexNgram :: Ord a => Map a Occ -> Map Index a
-indexNgram m = fromList (zip [1..] (keys m))
+--indexNgram :: Ord a => Map a Occ -> Map Index a
+--indexNgram m = fromList (zip [1..] (keys m))

 -- | Index ngrams from Map
-ngramIndex :: Ord a => Map a Occ -> Map a Index
-ngramIndex m = fromList (zip (keys m) [1..])
+--ngramIndex :: Ord a => Map a Occ -> Map a Index
+--ngramIndex m = fromList (zip (keys m) [1..])

 indexWith :: Ord a => Map a Occ -> [a] -> [Int]
 indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs

@@ -101,14 +112,16 @@ indexIt xs = (m, is)
     is = map (indexWith m) xs

 list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
-list2fis n xs = (m, fs)
+list2fis n xs = (m', fs)
   where
     (m, is) = indexIt xs
+    m'      = M.filter (>50000) m
     fs      = FIS.all n is

 text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
 text2fis n xs = list2fis n (map ngrams xs)

-text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
-text2fisWith = undefined
+--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
+--text2fisWith = undefined
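For context, the hunk headers above show occ defined as foldl' (\x y -> insertWith (+) y 1 x) empty xs, with sumOcc merging such maps. Below is a minimal, standalone sketch of what these two helpers compute, using plain Int counts and only the containers package; the example data and the main wrapper are made up for illustration and are not part of the commit.

{- Illustration only: mirrors the occ / sumOcc definitions visible above. -}
import           Data.Foldable   (foldl')
import           Data.Map.Strict (Map, empty, insertWith, unionWith)
import qualified Data.Map.Strict as M

-- Count how often each element occurs, as in the diff's occ.
occ :: Ord a => [a] -> Map a Int
occ = foldl' (\m x -> insertWith (+) x 1 m) empty

-- Merge several occurrence maps by summing counts, as in sumOcc.
sumOcc :: Ord a => [Map a Int] -> Map a Int
sumOcc = foldl' (unionWith (+)) empty

main :: IO ()
main = do
  let d1 = occ ["fis", "ngrams", "fis"]
      d2 = occ ["ngrams", "test"]
  print (sumOcc [d1, d2])                  -- fromList [("fis",2),("ngrams",2),("test",1)]
  print (M.filter (> 1) (sumOcc [d1, d2])) -- hapax dropped, like the commented-out noApax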
src/Gargantext/Ngrams/CoreNLP.hs  (+2 -3)

@@ -18,7 +18,6 @@ Portability : POSIX
 module Gargantext.Ngrams.CoreNLP where

 import Data.Aeson
 import Data.Aeson.TH (deriveJSON)
 import GHC.Generics
 import Data.Monoid ((<>))

@@ -65,8 +64,8 @@ $(deriveJSON (unPrefix "_properties") ''Properties)
 data Sentences = Sentences { _sentences :: [Sentence]}
   deriving (Show, Generic)
 instance ToJSON Sentences
 instance FromJSON Sentences
 $(deriveJSON (unPrefix "_") ''Sentences)

 -- request =
src/Gargantext/Ngrams/FrequentItemSet.hs  (+2 -4)

@@ -45,8 +45,8 @@ all f is = fisWith Nothing f is
 between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
 between (x,y) f is = fisWithSize (Right (x,y)) f is

-maximum :: Int -> Frequency -> [[Item]] -> [Fis]
-maximum m f is = between (0,m) f is
+--maximum :: Int -> Frequency -> [[Item]] -> [Fis]
+--maximum m f is = between (0,m) f is

 ------------------------------------------------------------------------

@@ -67,7 +67,6 @@ items2fis is = case head is of
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
 fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
 fisWithSize n f is = case n of

@@ -87,4 +86,3 @@ fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
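For intuition about what this module computes: between and fisWith ultimately hand the transactions to an LCM solver (runLCMmatrix, visible in the last hunk header). The following deliberately naive, standalone sketch shows the same idea, assuming Item and Frequency are plain Ints; this brute-force version is for illustration only and is not the module's implementation.

import Data.List (nub, sort, subsequences)

type Item      = Int
type Frequency = Int

-- An item set is "frequent" when it is contained in at least f transactions.
frequentItemSets :: Frequency -> [[Item]] -> [[Item]]
frequentItemSets f transactions =
    [ is | is <- candidates, not (null is), support is >= f ]
  where
    candidates = subsequences (sort (nub (concat transactions)))
    support is = length [ t | t <- transactions, all (`elem` t) is ]

main :: IO ()
main = print (frequentItemSets 2 [[1,2,3], [1,3], [2,3]])
-- [[1],[2],[3],[1,3],[2,3]]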
src/Gargantext/Ngrams/Lang/En.hs  (+14 -14)

@@ -8,16 +8,16 @@ import Data.Text (Text)
 import Data.Monoid ((<>))

 selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
-selectNgrams xs = filter selectNgrams' xs
+selectNgrams xs = filter isNgrams xs
     where
-        selectNgrams' (_,"NN",_)           = True
-        selectNgrams' (_,"NNS",_)          = True
-        selectNgrams' (_,"NNP",_)          = True
-        selectNgrams' (_,"NN+CC",_)        = True
-        selectNgrams' (_,_,"PERSON")       = True
-        selectNgrams' (_,_,"ORGANIZATION") = True
-        selectNgrams' (_,_,"LOCATION")     = True
-        selectNgrams' (_,_,_)              = False
+        isNgrams (_,"NN",_)           = True
+        isNgrams (_,"NNS",_)          = True
+        isNgrams (_,"NNP",_)          = True
+        isNgrams (_,"NN+CC",_)        = True
+        isNgrams (_,_,"PERSON")       = True
+        isNgrams (_,_,"ORGANIZATION") = True
+        isNgrams (_,_,"LOCATION")     = True
+        isNgrams (_,_,_)              = False

 groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]

@@ -26,7 +26,7 @@ groupNgrams [] = []
 groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs)
   where
     jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
     cc  = (c1, "CC", c1')
     jn1 = (j1, "JJ", j1')
     jn2 = jn j2 j3 j2'

@@ -60,14 +60,14 @@ groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN",
 -- > should be (antiinflammatory activity) <> (analgesic activity)

 groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs)               = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
 groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs)              = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
 groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs)  = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
 groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)

 groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
 groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
 groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs)         = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)

 groupNgrams (x:xs) = (x:(groupNgrams xs))
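To see how these rules are meant to be used downstream: each token is a (form, part-of-speech tag, named-entity tag) triple, grouping merges adjacent tokens into longer forms, and selection keeps nouns and named entities. The following standalone sketch follows those lines; the sentence, the "O" entity tag, and the keepNgrams/groupNN names are made up for the example, and only one of the NN-IN-NN rewrite rules from the diff is reproduced.

{-# LANGUAGE OverloadedStrings #-}
import           Data.Text (Text)
import qualified Data.Text as T

type Token = (Text, Text, Text)  -- (form, POS tag, NER tag)

-- Keep nouns and named entities, mirroring the isNgrams clauses above.
keepNgrams :: [Token] -> [Token]
keepNgrams = filter ok
  where
    ok (_, "NN",  _)          = True
    ok (_, "NNS", _)          = True
    ok (_, "NNP", _)          = True
    ok (_, _, "PERSON")       = True
    ok (_, _, "ORGANIZATION") = True
    ok (_, _, "LOCATION")     = True
    ok _                      = False

-- Merge one "NN IN NN" pattern into a single noun phrase, as one of the
-- groupNgrams equations above does.
groupNN :: [Token] -> [Token]
groupNN ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs) =
    groupNN ((T.intercalate " " [x, o, y], "NN", yy) : xs)
groupNN (x:xs) = x : groupNN xs
groupNN []     = []

main :: IO ()
main = print (keepNgrams (groupNN
  [ ("extraction", "NN",  "O")
  , ("of",         "IN",  "O")
  , ("ngrams",     "NN",  "O")
  , ("works",      "VBZ", "O")
  ]))
-- [("extraction of ngrams","NN","O")]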
src/Gargantext/Ngrams/Parser.hs  (+6 -6)

@@ -8,10 +8,11 @@ import Gargantext.Prelude
 import Gargantext.Ngrams.CoreNLP
 import Data.Text hiding (map)

-import Gargantext.Types.Main (Language(..), Ngrams)
+import Gargantext.Types.Main (Language(..))
 import qualified Gargantext.Ngrams.Lang.En as En
 import qualified Gargantext.Ngrams.Lang.Fr as Fr

+type SNgrams = (Text, Text, Text)

 -- | Ngrams selection algorithms
 -- A form is a list of characters seperated by one or more spaces in a sentence.

@@ -31,11 +32,11 @@ import qualified Gargantext.Ngrams.Lang.Fr as Fr
 -- TODO for scientific papers: add maesures
 -- TODO add the p score regex

-extractNgrams :: Language -> Text -> IO [[Ngrams]]
+extractNgrams :: Language -> Text -> IO [[SNgrams]]
 extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s

-extractNgrams' :: Language -> Text -> IO [[Ngrams]]
+extractNgrams' :: Language -> Text -> IO [[SNgrams]]
 extractNgrams' lang t = map (map token2text)
                       <$> map _sentenceTokens
                       <$> _sentences

@@ -44,14 +45,13 @@ extractNgrams' lang t = map (map token2text)
 -- | This function selects ngrams according to grammars specific
 -- of each language.
 -- In english, JJ is ADJectiv in french.
-selectNgrams :: Language -> [Ngrams] -> [Ngrams]
+selectNgrams :: Language -> [SNgrams] -> [SNgrams]
 selectNgrams EN = En.selectNgrams
 selectNgrams FR = Fr.selectNgrams

 -- | This function analyze and groups (or not) ngrams according to
 -- grammars specific of each language.
-groupNgrams :: Language -> [Ngrams] -> [Ngrams]
+groupNgrams :: Language -> [SNgrams] -> [SNgrams]
 groupNgrams EN = En.groupNgrams
 groupNgrams FR = Fr.groupNgrams
src/Gargantext/Types/Main.hs  (+0 -1)

@@ -163,7 +163,6 @@ nodeTypeId tn = fromMaybe (panic $ pack $ "Typename " <> show tn <> " does not e
 -- Temporary types to be removed
-type Ngrams = (Text, Text, Text)
 type ErrorMessage = Text

 -- Queries
stack.yaml  (+2 -0)

@@ -2,6 +2,7 @@ flags: {}
 extra-package-dbs: []
 packages:
 - .
 allow-newer: true
 extra-deps:
 - git: https://github.com/delanoe/data-time-segment.git

@@ -25,6 +26,7 @@ extra-deps:
 - servant-multipart-0.11.1
 - servant-server-0.12
 - servant-swagger-ui-0.2.3.2.2.8
+- stemmer-0.5.2
 - text-1.2.3.0
 - text-show-3.6.2
 resolver: lts-10.6