Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Julien Moutinho
haskell-gargantext
Commits
16327c34
Commit
16327c34
authored
May 15, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] CSV ngrams extraction, engine and search.
parent
7997ab36
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
297 additions
and
38 deletions
+297
-38
CleanCsvCorpus.hs
app/CleanCsvCorpus.hs
+61
-0
Main.hs
app/Main.hs
+4
-5
package.yaml
package.yaml
+1
-0
Node.hs
src/Gargantext/API/Node.hs
+2
-4
Utils.hs
src/Gargantext/Database/Utils.hs
+1
-2
Prelude.hs
src/Gargantext/Prelude.hs
+2
-0
Text.hs
src/Gargantext/Text.hs
+7
-1
Ngrams.hs
src/Gargantext/Text/Ngrams.hs
+2
-2
Stem.hs
src/Gargantext/Text/Ngrams/Stem.hs
+1
-3
CSV.hs
src/Gargantext/Text/Parsers/CSV.hs
+120
-21
Search.hs
src/Gargantext/Text/Search.hs
+95
-0
stack.yaml
stack.yaml
+1
-0
No files found.
app/CleanCsvCorpus.hs
0 → 100644
View file @
16327c34
{-|
Module : CleanCsvCorpus.hs
Description : Gargantext starter
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Given a Gargantext CSV File and its Query
This script cleans and compress the contexts around the main terms of the query.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module
CleanCsvCorpus
where
--import GHC.IO (FilePath)
import
Data.SearchEngine
as
S
import
qualified
Data.Set
as
S
import
Data.Vector
(
Vector
)
import
qualified
Data.Vector
as
V
import
Gargantext.Prelude
import
Gargantext.Text.Search
import
Gargantext.Text.Parsers.CSV
------------------------------------------------------------------------
type
Query
=
[
S
.
Term
]
filterDocs
::
[
DocId
]
->
Vector
Doc
->
Vector
Doc
filterDocs
docIds
=
V
.
filter
(
\
doc
->
S
.
member
(
d_docId
doc
)
$
S
.
fromList
docIds
)
main
::
IO
()
main
=
do
let
rPath
=
"/tmp/Gargantext_Corpus.csv"
let
wPath
=
"/tmp/Gargantext_Corpus_bis.csv"
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
let
q
=
[
"gratuit"
,
"gratuité"
,
"culture"
,
"culturel"
]
(
h
,
csvDocs
)
<-
readCsv
rPath
putStrLn
$
"Number of documents before:"
<>
show
(
V
.
length
csvDocs
)
putStrLn
$
"Mean size of docs:"
<>
show
(
docsSize
csvDocs
)
let
docs
=
toDocs
csvDocs
let
engine
=
insertDocs
docs
initialDocSearchEngine
let
docIds
=
S
.
query
engine
q
let
docs'
=
fromDocs
$
filterDocs
docIds
(
V
.
fromList
docs
)
putStrLn
$
"Number of documents after:"
<>
show
(
V
.
length
docs'
)
putStrLn
$
"Mean size of docs:"
<>
show
(
docsSize
docs'
)
writeCsv
wPath
(
h
,
docs'
)
app/Main.hs
View file @
16327c34
...
...
@@ -6,9 +6,10 @@ License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
Script to start gargantext with different modes (Dev, Prod, Mock).
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
...
...
@@ -17,7 +18,6 @@ Portability : POSIX
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
module
Main
where
import
Prelude
(
putStrLn
)
...
...
@@ -47,10 +47,9 @@ deriving instance Show (MyOptions Unwrapped)
main
::
IO
()
main
=
do
MyOptions
myMode
myPort
myIniFile
<-
unwrapRecord
"Gargantext: collaborative platform for text-mining"
let
myPort'
=
case
myPort
of
Just
p
->
p
Nothing
->
8008
...
...
@@ -64,6 +63,6 @@ main = do
Just
i
->
i
Mock
->
startGargantextMock
myPort'
_
->
startGargantextMock
myPort'
putStrLn
$
"Starting Gargantext with mode: "
<>
show
myMode
start
package.yaml
View file @
16327c34
...
...
@@ -58,6 +58,7 @@ library:
-
duckling
-
exceptions
-
filepath
-
fullstop
-
fclabels
-
fast-logger
-
full-text-search
...
...
src/Gargantext/API/Node.hs
View file @
16327c34
...
...
@@ -23,8 +23,6 @@ module Gargantext.API.Node
where
-------------------------------------------------------------------
import
System.IO
(
putStrLn
)
import
Control.Monad.IO.Class
(
liftIO
)
import
Control.Monad
((
>>
))
--import System.IO (putStrLn, readFile)
...
...
@@ -98,13 +96,13 @@ type FacetDocAPI = "table"
-- | Node API functions
roots
::
Connection
->
Server
Roots
roots
conn
=
liftIO
(
putStrLn
"Log Needed"
>>
getNodesWithParentId
conn
0
Nothing
)
roots
conn
=
liftIO
(
putStrLn
(
"Log Needed"
::
Text
)
>>
getNodesWithParentId
conn
0
Nothing
)
:<|>
pure
(
panic
"not implemented yet"
)
:<|>
pure
(
panic
"not implemented yet"
)
:<|>
pure
(
panic
"not implemented yet"
)
nodeAPI
::
Connection
->
NodeId
->
Server
NodeAPI
nodeAPI
conn
id
=
liftIO
(
putStrLn
"getNode"
>>
getNode
conn
id
)
nodeAPI
conn
id
=
liftIO
(
putStrLn
(
"getNode"
::
Text
)
>>
getNode
conn
id
)
:<|>
deleteNode'
conn
id
:<|>
getNodesWith'
conn
id
:<|>
getFacet
conn
id
...
...
src/Gargantext/Database/Utils.hs
View file @
16327c34
...
...
@@ -33,7 +33,6 @@ import Database.PostgreSQL.Simple (Connection, connect)
import
Opaleye
(
Query
,
Unpackspec
,
showSqlForPostgres
)
import
Data.Profunctor.Product.Default
(
Default
)
import
Data.Maybe
(
maybe
)
import
Prelude
(
id
,
putStrLn
)
-- TODO add a reader Monad here
-- read this in the init file
...
...
@@ -61,5 +60,5 @@ connectGargandb fp = do
connect
parameters
printSql
::
Default
Unpackspec
a
a
=>
Query
a
->
IO
()
printSql
=
putStrLn
.
maybe
"Empty query"
id
.
showSqlForPostgres
printSql
=
putStrLn
.
maybe
"Empty query"
id
entity
.
showSqlForPostgres
src/Gargantext/Prelude.hs
View file @
16327c34
...
...
@@ -30,8 +30,10 @@ module Gargantext.Prelude
import
Data.Maybe
(
isJust
,
fromJust
,
maybe
)
import
Protolude
(
Bool
(
True
,
False
),
Int
,
Double
,
Integer
,
Fractional
,
Num
,
Maybe
(
Just
,
Nothing
)
,
Enum
,
Bounded
,
Float
,
Floating
,
Char
,
IO
,
pure
,
(
<*>
),
(
<$>
),
panic
,
putStrLn
,
head
,
flip
,
Ord
,
Integral
,
Foldable
,
RealFrac
,
Monad
,
filter
,
reverse
,
map
,
zip
,
drop
,
take
,
zipWith
...
...
src/Gargantext/Text.hs
View file @
16327c34
...
...
@@ -32,6 +32,7 @@ import Data.Map.Strict (Map
import
Data.Text
(
Text
,
split
)
import
qualified
Data.Map.Strict
as
M
(
filter
)
import
NLP.FullStop
(
segment
)
-----------------------------------------------------------------
import
Gargantext.Text.Ngrams
import
Gargantext.Text.Metrics.Occurrences
...
...
@@ -89,11 +90,16 @@ text2fis n xs = list2fis n (map ngrams xs)
-------------------------------------------------------------------
-- Contexts of text
sentences
::
Text
->
[
Text
]
sentences
txt
=
split
isStop
txt
sentences
txt
=
map
DT
.
pack
$
segment
$
DT
.
unpack
txt
sentences'
::
Text
->
[
Text
]
sentences'
txt
=
split
isStop
txt
isStop
::
Char
->
Bool
isStop
c
=
c
`
elem
`
[
'.'
,
'?'
,
'!'
]
unsentences
::
[
Text
]
->
Text
unsentences
txts
=
DT
.
intercalate
" "
txts
-- | https://en.wikipedia.org/wiki/Text_mining
testText_en
::
Text
...
...
src/Gargantext/Text/Ngrams.hs
View file @
16327c34
...
...
@@ -26,7 +26,7 @@ module Gargantext.Text.Ngrams
where
import
Data.Char
(
Char
,
isAlphaNum
,
isSpace
)
import
Data.Text
(
Text
,
split
,
splitOn
,
pack
)
import
Data.Text
(
Text
,
split
,
splitOn
,
pack
,
toLower
)
import
Data.Set
(
Set
)
import
qualified
Data.Set
as
S
...
...
@@ -64,7 +64,7 @@ equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
--monograms xs = monograms $ toLower $ filter isGram xs
monograms
::
Text
->
[
Text
]
monograms
txt
=
split
isWord
txt
monograms
txt
=
map
toLower
$
split
isWord
txt
where
isWord
c
=
c
`
elem
`
[
' '
,
'
\'
'
,
','
,
';'
]
...
...
src/Gargantext/Text/Ngrams/Stem.hs
View file @
16327c34
...
...
@@ -18,7 +18,7 @@ Source : https://en.wikipedia.org/wiki/Stemming
-}
module
Gargantext.Text.Ngrams.Stem
module
Gargantext.Text.Ngrams.Stem
(
stem
,
Lang
(
..
))
where
import
Data.Text
(
Text
)
...
...
@@ -38,7 +38,6 @@ import Gargantext.Core (Lang(..))
-- A stemmer for English, for example, should identify the string "cats"
-- (and possibly "catlike", "catty" etc.) as based on the root "cat".
-- and
-- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
-- algorithm reduces the words "fishing", "fished", and "fisher" to the
...
...
@@ -48,7 +47,6 @@ import Gargantext.Core (Lang(..))
-- "arguments" reduce to the stem "argument".
stem
::
Lang
->
Text
->
Text
stem
lang
=
DT
.
pack
.
N
.
stem
lang'
.
DT
.
unpack
where
...
...
src/Gargantext/Text/Parsers/CSV.hs
View file @
16327c34
...
...
@@ -7,8 +7,8 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
CSV parser for Gargantext corpus files.
-}
{-# LANGUAGE NoImplicitPrelude #-}
...
...
@@ -17,29 +17,123 @@ commentary with @some markup@.
module
Gargantext.Text.Parsers.CSV
where
import
GHC.
Generics
(
Generic
)
import
GHC.
Real
(
round
)
import
GHC.IO
(
FilePath
)
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Text
(
Text
)
import
Control.Applicative
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Char
(
ord
)
import
Data.Csv
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.List
(
concat
)
import
Data.String
(
IsString
)
import
Data.Text
(
Text
,
pack
,
unpack
,
length
)
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Vector
(
Vector
)
import
qualified
Data.Vector
as
V
import
Data.Text
(
pack
)
import
Safe
(
tailMay
)
import
Text.HTML.TagSoup
import
Data.Char
(
ord
)
import
Gargantext.Prelude
import
Gargantext.Text
import
Gargantext.Prelude
hiding
(
length
)
---------------------------------------------------------------
data
Doc
=
Doc
{
d_docId
::
!
Int
,
d_title
::
!
Text
,
d_source
::
!
Text
,
d_publication_year
::
!
Int
,
d_publication_month
::
!
Int
,
d_publication_day
::
!
Int
,
d_abstract
::
!
Text
,
d_authors
::
!
Text
}
deriving
(
Show
)
---------------------------------------------------------------
toDocs
::
Vector
CsvDoc
->
[
Doc
]
toDocs
v
=
V
.
toList
$
V
.
zipWith
(
\
nId
(
CsvDoc
t
s
py
pm
pd
abst
auth
)
->
Doc
nId
t
s
py
pm
pd
abst
auth
)
(
V
.
enumFromN
1
(
V
.
length
v'
))
v''
where
m
=
docsSize
v
v'
=
V
.
concatMap
(
splitDoc
m
Paragraph
)
v
m'
=
docsSize
v
v''
=
V
.
concatMap
(
splitDoc
m'
Sentences
)
v'
m''
=
docsSize
v'
v'''
=
V
.
concatMap
(
splitDoc
m'
Sentences
)
v''
---------------------------------------------------------------
fromDocs
::
Vector
Doc
->
Vector
CsvDoc
fromDocs
docs
=
V
.
map
fromDocs'
docs
where
fromDocs'
(
Doc
_
t
s
py
pm
pd
abst
auth
)
=
(
CsvDoc
t
s
py
pm
pd
abst
auth
)
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
data
SplitBy
=
Paragraph
|
Sentences
|
Chars
splitDoc
::
Mean
->
SplitBy
->
CsvDoc
->
Vector
CsvDoc
splitDoc
m
splt
doc
=
let
docSize
=
(
length
$
c_abstract
doc
)
in
if
docSize
>
1000
then
if
(
mod
(
round
m
)
docSize
)
>=
10
then
splitDoc'
splt
doc
else
V
.
fromList
[
doc
]
else
V
.
fromList
[
doc
]
splitDoc'
::
SplitBy
->
CsvDoc
->
Vector
CsvDoc
splitDoc'
splt
(
CsvDoc
t
s
py
pm
pd
abst
auth
)
=
V
.
fromList
$
[
firstDoc
]
<>
nextDocs
where
firstDoc
=
CsvDoc
t
s
py
pm
pd
firstAbstract
auth
firstAbstract
=
head'
abstracts
nextDocs
=
map
(
\
txt
->
CsvDoc
(
head'
$
sentences
txt
)
s
py
pm
pd
(
unsentences
$
tail'
$
sentences
txt
)
auth
)
(
tail'
abstracts
)
abstracts
=
(
splitBy
splt
)
abst
head'
x
=
maybe
""
identity
(
head
x
)
tail'
x
=
maybe
[
""
]
identity
(
tailMay
x
)
splitBy
::
SplitBy
->
Text
->
[
Text
]
splitBy
Chars
=
map
pack
.
chunkAlong
1000
1
.
unpack
splitBy
Sentences
=
map
unsentences
.
chunkAlong
20
1
.
sentences
splitBy
Paragraph
=
map
removeTag
.
filter
isTagText
.
parseTags
where
removeTag
::
IsString
p
=>
Tag
p
->
p
removeTag
(
TagText
x
)
=
x
removeTag
(
TagComment
x
)
=
x
removeTag
_
=
""
---------------------------------------------------------------
---------------------------------------------------------------
type
Mean
=
Double
docsSize
::
Vector
CsvDoc
->
Mean
docsSize
csvDoc
=
mean
ls
where
ls
=
V
.
toList
$
V
.
map
(
fromIntegral
.
length
.
c_abstract
)
csvDoc
---------------------------------------------------------------
data
CsvDoc
=
CsvDoc
{
title
::
!
Text
,
source
::
!
Text
,
publication_year
::
!
Int
,
publication_month
::
!
Int
,
publication_day
::
!
Int
,
abstract
::
!
Text
,
authors
::
!
Text
{
c_
title
::
!
Text
,
c_
source
::
!
Text
,
c_
publication_year
::
!
Int
,
c_
publication_month
::
!
Int
,
c_
publication_day
::
!
Int
,
c_
abstract
::
!
Text
,
c_
authors
::
!
Text
}
deriving
(
Show
,
Generic
)
deriving
(
Show
)
instance
FromNamedRecord
CsvDoc
where
parseNamedRecord
r
=
CsvDoc
<$>
r
.:
"title"
...
...
@@ -63,20 +157,25 @@ instance ToNamedRecord CsvDoc where
csvDecodeOptions
::
DecodeOptions
csvDecodeOptions
=
(
defaultDecodeOptions
{
decDelimiter
=
fromIntegral
$
ord
'
\t
'
}
)
csvDecodeOptions
=
(
defaultDecodeOptions
{
decDelimiter
=
fromIntegral
$
ord
'
\t
'
}
)
csvEncodeOptions
::
EncodeOptions
csvEncodeOptions
=
(
defaultEncodeOptions
{
encDelimiter
=
fromIntegral
$
ord
'
\t
'
}
)
csvEncodeOptions
=
(
defaultEncodeOptions
{
encDelimiter
=
fromIntegral
$
ord
'
\t
'
}
)
readCsv
::
FilePath
->
IO
(
Header
,
V
.
V
ector
CsvDoc
)
readCsv
::
FilePath
->
IO
(
Header
,
Vector
CsvDoc
)
readCsv
fp
=
do
csvData
<-
BL
.
readFile
fp
case
decodeByNameWith
csvDecodeOptions
csvData
of
Left
e
->
panic
(
pack
e
)
Right
csvDocs
->
pure
csvDocs
writeCsv
::
FilePath
->
(
Header
,
V
.
Vector
CsvDoc
)
->
IO
()
writeCsv
::
FilePath
->
(
Header
,
Vector
CsvDoc
)
->
IO
()
writeCsv
fp
(
h
,
vs
)
=
BL
.
writeFile
fp
$
encodeByNameWith
csvEncodeOptions
h
(
V
.
toList
vs
)
src/Gargantext/Text/Search.hs
0 → 100644
View file @
16327c34
{-|
Module : Gargantext.Text.Search
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This search Engine is first made to clean CSV file according to a query.
Starting from this model, a specific Gargantext engine will be made
(using more metrics scores/features).
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings, NamedFieldPuns #-}
module
Gargantext.Text.Search
where
import
Data.SearchEngine
import
Data.Ix
-- Usefull to use stopwords
-- import Data.Set (Set)
-- import qualified Data.Set as Set
import
Data.Text
(
Text
)
import
Gargantext.Prelude
import
Gargantext.Text.Ngrams
import
Gargantext.Text.Ngrams.Stem
as
ST
import
Gargantext.Text.Parsers.CSV
type
DocId
=
Int
type
DocSearchEngine
=
SearchEngine
Doc
DocId
DocField
NoFeatures
data
DocField
=
TitleField
|
AbstractField
deriving
(
Eq
,
Ord
,
Enum
,
Bounded
,
Ix
,
Show
)
initialDocSearchEngine
::
DocSearchEngine
initialDocSearchEngine
=
initSearchEngine
docSearchConfig
defaultSearchRankParameters
docSearchConfig
::
SearchConfig
Doc
DocId
DocField
NoFeatures
docSearchConfig
=
SearchConfig
{
documentKey
=
d_docId
,
extractDocumentTerms
=
extractTokens
,
transformQueryTerm
=
normaliseQueryToken
,
documentFeatureValue
=
const
noFeatures
}
where
extractTokens
::
Doc
->
DocField
->
[
Text
]
extractTokens
doc
TitleField
=
monograms
(
d_title
doc
)
extractTokens
doc
AbstractField
=
monograms
(
d_abstract
doc
)
normaliseQueryToken
::
Text
->
DocField
->
Text
normaliseQueryToken
tok
=
let
tokStem
=
ST
.
stem
ST
.
EN
in
\
field
->
case
field
of
TitleField
->
tokStem
tok
AbstractField
->
tokStem
tok
defaultSearchRankParameters
::
SearchRankParameters
DocField
NoFeatures
defaultSearchRankParameters
=
SearchRankParameters
{
paramK1
,
paramB
,
paramFieldWeights
,
paramFeatureWeights
=
noFeatures
,
paramFeatureFunctions
=
noFeatures
,
paramResultsetSoftLimit
=
2000
,
paramResultsetHardLimit
=
4000
,
paramAutosuggestPrefilterLimit
=
500
,
paramAutosuggestPostfilterLimit
=
500
}
where
paramK1
::
Float
paramK1
=
1.5
paramB
::
DocField
->
Float
paramB
TitleField
=
0.9
paramB
AbstractField
=
0.5
paramFieldWeights
::
DocField
->
Float
paramFieldWeights
TitleField
=
20
paramFieldWeights
AbstractField
=
5
stack.yaml
View file @
16327c34
...
...
@@ -17,6 +17,7 @@ extra-deps:
-
duckling-0.1.3.0
-
extra-1.5.3
-
full-text-search-0.2.1.4
-
fullstop-0.1.4
-
haskell-src-exts-1.18.2
-
http-types-0.12.1
-
protolude-0.2
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment