Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
158
Issues
158
List
Board
Labels
Milestones
Merge Requests
11
Merge Requests
11
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
b6df8e42
Commit
b6df8e42
authored
Apr 05, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[TEXT-MINING] adding first functions/datatypes.
parent
f152533b
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
182 additions
and
106 deletions
+182
-106
package.yaml
package.yaml
+3
-2
Node.hs
src/Gargantext/API/Node.hs
+4
-1
Ngram.hs
src/Gargantext/Database/Ngram.hs
+2
-2
Ngrams.hs
src/Gargantext/Ngrams.hs
+57
-8
Analysis.hs
src/Gargantext/Ngrams/Analysis.hs
+1
-1
CoreNLP.hs
src/Gargantext/Ngrams/CoreNLP.hs
+19
-7
Letters.hs
src/Gargantext/Ngrams/Letters.hs
+34
-0
Metrics.hs
src/Gargantext/Ngrams/Metrics.hs
+1
-2
Parser.hs
src/Gargantext/Ngrams/Parser.hs
+1
-1
TFICF.hs
src/Gargantext/Ngrams/TFICF.hs
+42
-0
TFICF_hs
src/Gargantext/Ngrams/TFICF_hs
+0
-16
Words_hs
src/Gargantext/Ngrams/Words_hs
+0
-59
Parsers.hs
src/Gargantext/Parsers.hs
+17
-5
Prelude.hs
src/Gargantext/Prelude.hs
+1
-2
No files found.
package.yaml
View file @
b6df8e42
...
...
@@ -23,7 +23,6 @@ library:
-
-Werror
exposed-modules
:
-
Gargantext
-
Gargantext.Analysis
-
Gargantext.DSL
-
Gargantext.Database
-
Gargantext.Database.Instances
...
...
@@ -37,7 +36,9 @@ library:
-
Gargantext.Database.Utils
-
Gargantext.Database.User
-
Gargantext.Ngrams
-
Gargantext.Ngrams.Count
-
Gargantext.Ngrams.Analysis
-
Gargantext.Ngrams.TFICF
-
Gargantext.Ngrams.Letters
-
Gargantext.Ngrams.CoreNLP
-
Gargantext.Ngrams.Parser
-
Gargantext.Ngrams.Lang.En
...
...
src/Gargantext/API/Node.hs
View file @
b6df8e42
...
...
@@ -62,7 +62,8 @@ type NodeAPI = Get '[JSON] (Node HyperdataDocument)
:>
QueryParam
"offset"
Int
:>
QueryParam
"limit"
Int
:>
Get
'[
J
SON
]
[
Node
HyperdataDocument
]
:<|>
"facet"
:>
"documents"
:>
FacetDocAPI
:<|>
"facet"
:>
Summary
" Facet documents"
:>
"documents"
:>
FacetDocAPI
-- :<|> "facet" :<|> "sources" :<|> FacetSourcesAPI
-- :<|> "facet" :<|> "authors" :<|> FacetAuthorsAPI
-- :<|> "facet" :<|> "terms" :<|> FacetTermsAPI
...
...
@@ -73,11 +74,13 @@ type NodeAPI = Get '[JSON] (Node HyperdataDocument)
type
FacetDocAPI
=
"table"
:>
Summary
" Table data"
:>
QueryParam
"offset"
Int
:>
QueryParam
"limit"
Int
:>
Get
'[
J
SON
]
[
FacetDoc
]
:<|>
"chart"
:>
Summary
" Chart data"
:>
QueryParam
"from"
UTCTime
:>
QueryParam
"to"
UTCTime
:>
Get
'[
J
SON
]
[
FacetChart
]
...
...
src/Gargantext/Database/Ngram.hs
View file @
b6df8e42
...
...
@@ -67,5 +67,5 @@ findWith f t = find (\x -> f x == t)
--userWithId t xs = userWith userUserId t xs
-- | not optimized (get all ngrams without filters)
n
grams
::
PGS
.
Connection
->
IO
[
Ngram
]
n
grams
conn
=
runQuery
conn
queryNgramTable
getN
grams
::
PGS
.
Connection
->
IO
[
Ngram
]
getN
grams
conn
=
runQuery
conn
queryNgramTable
src/Gargantext/Ngrams.hs
View file @
b6df8e42
module
Gargantext.Ngrams
(
module
Gargantext
.
Ngrams
.
Count
{-|
Module : Gargantext.Ngrams
Description : Ngrams tools
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Ngrams extraction.
Definitions of ngrams.
n non negative integer
-}
module
Gargantext.Ngrams
(
module
Gargantext
.
Ngrams
.
Letters
--, module Gargantext.Ngrams.Hetero
,
module
Gargantext
.
Ngrams
.
CoreNLP
,
module
Gargantext
.
Ngrams
.
Parser
,
module
Gargantext
.
Ngrams
.
Occurrences
,
module
Gargantext
.
Ngrams
.
TextMining
,
module
Gargantext
.
Ngrams
.
Metrics
,
module
Gargantext
.
Ngrams
.
CoreNLP
,
module
Gargantext
.
Ngrams
.
Parser
,
module
Gargantext
.
Ngrams
.
Occurrences
,
module
Gargantext
.
Ngrams
.
TextMining
,
module
Gargantext
.
Ngrams
.
Metrics
,
ngrams
,
occurrences
--, module Gargantext.Ngrams.Words
)
where
)
where
import
Gargantext.Ngrams.
Count
import
Gargantext.Ngrams.
Letters
--import Gargantext.Ngrams.Hetero
import
Gargantext.Ngrams.CoreNLP
import
Gargantext.Ngrams.Parser
...
...
@@ -19,3 +36,35 @@ import Gargantext.Ngrams.TextMining
--import Gargantext.Ngrams.Words
import
Gargantext.Ngrams.Metrics
-----------------------------------------------------------------
import
Data.Char
(
Char
,
isAlpha
,
isSpace
)
import
Data.Text
(
Text
,
words
,
filter
,
toLower
)
import
Data.Map.Strict
(
Map
,
empty
,
insertWith
)
import
Data.Foldable
(
foldl'
)
import
Gargantext.Prelude
hiding
(
filter
)
-- Maybe useful later:
--import NLP.Stemmer (stem, Stemmer(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))
-- | Extract the ngrams of a text.
--
-- Characters rejected by 'isGram' are removed first, the remaining text is
-- lower-cased, and the result is split by 'monograms'.
ngrams :: Text -> [Text]
ngrams xs = monograms lowered
  where
    kept    = filter isGram xs
    lowered = toLower kept
-- | Split a text into monograms; this simply delegates to 'words'.
monograms :: Text -> [Text]
monograms txt = words txt
-- | Tell whether a character may be part of a gram: a hyphen, an alphabetic
-- character or a whitespace character.
isGram :: Char -> Bool
isGram c = case c of
  '-' -> True
  _   -> isAlpha c || isSpace c
-- | Count how many times each element occurs in a list.
--
-- The list is folded strictly from the left; every element adds 1 to its
-- entry in the resulting map.
occurrences :: Ord a => [a] -> Map a Int
occurrences xs = foldl' tally empty xs
  where
    tally seen key = insertWith (+) key 1 seen
src/Gargantext/Analysis.hs
→
src/Gargantext/
Ngrams/
Analysis.hs
View file @
b6df8e42
...
...
@@ -11,7 +11,7 @@ Portability : POSIX
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Analysis
module
Gargantext.
Ngrams.
Analysis
where
import
Gargantext.Prelude
(
undefined
,
IO
(),
Int
())
...
...
src/Gargantext/Ngrams/CoreNLP.hs
View file @
b6df8e42
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-|
Module : Gargantext.Ngrams.CoreNLP
Description : CoreNLP module
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module
Gargantext.Ngrams.CoreNLP
where
...
...
@@ -51,7 +63,7 @@ data Properties = Properties { _propertiesAnnotators :: Text
$
(
deriveJSON
(
unPrefix
"_properties"
)
''
P
roperties
)
data
Sentences
=
Sentences
{
sentences
::
[
Sentence
]}
data
Sentences
=
Sentences
{
_
sentences
::
[
Sentence
]}
deriving
(
Show
,
Generic
)
instance
ToJSON
Sentences
instance
FromJSON
Sentences
...
...
@@ -102,7 +114,7 @@ corenlp lang txt = do
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith
::
(
Token
->
t
)
->
Language
->
Text
->
IO
[[(
Text
,
t
)]]
tokenWith
f
lang
s
=
map
(
map
(
\
t
->
(
_tokenWord
t
,
f
t
)))
<$>
map
_sentenceTokens
<$>
sentences
<$>
corenlp
lang
s
tokenWith
f
lang
s
=
map
(
map
(
\
t
->
(
_tokenWord
t
,
f
t
)))
<$>
map
_sentenceTokens
<$>
_
sentences
<$>
corenlp
lang
s
src/Gargantext/Ngrams/
Count
.hs
→
src/Gargantext/Ngrams/
Letters
.hs
View file @
b6df8e42
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Ngrams.Count
where
{-|
Module : Gargantext.Ngrams.Letters
Description : Ngrams.Letters module
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
import
Gargantext.Prelude
Sugar to work on letters with Text.
-}
import
Data.Foldable
as
F
{-# LANGUAGE OverloadedStrings #-}
import
Data.Map.Strict
(
insertWith
)
import
Data.Map
(
Map
)
import
qualified
Data.Map
as
M
module
Gargantext.Ngrams.Letters
where
--import qualified Data.Text.Lazy.IO as DTLIO
import
qualified
Data.Text.Lazy
as
DTL
-- import qualified Data.Text.Lazy.IO as DTLIO
import
Gargantext.Prelude
-- | /O(n)/ Breaks a 'Text' up into a list of single-character 'Text's.
-- from slower to faster:
...
...
@@ -26,23 +32,3 @@ letters'' :: DTL.Text -> [DTL.Text]
-- Right fold over the lazy text: each character becomes a one-character
-- lazy Text that is consed onto the list built from the rest of the input.
letters'' = DTL.foldr prepend []
  where
    prepend ch rest = DTL.singleton ch : rest
-- words
-- lines
-- words between punctuation
-- number of punctuation
-- | Build a map from each distinct element of the list to the number of
-- times it appears, using a strict left fold.
occurrences :: Ord a => [a] -> Map a Int
occurrences = foldl' bump M.empty
  where
    bump counts item = insertWith (+) item 1 counts
-- for optimization :
--occurrences' :: Ord a => [a] -> Map a Integer
--occurrences' xs = DTL.foldl (\x y -> M.insertWith' (+) y 1 x) M.empty xs
--countMain :: IO ()
--countMain = do
-- (fichier:_) <- getArgs
-- c <- DTLIO.readFile fichier
-- --print $ occurrences $ DTL.chunksOf 1 c
-- pure $ occurrences $ letters'' c
-- --print $ occurrences $ DTL.words $ DTL.toLower c
--
src/Gargantext/Ngrams/Metrics.hs
View file @
b6df8e42
...
...
@@ -8,8 +8,7 @@ Maintainer : sample@email.com
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
Mainly re-exports functions from @Data.Text.Metrics@.
-}
module
Gargantext.Ngrams.Metrics
(
levenshtein
...
...
src/Gargantext/Ngrams/Parser.hs
View file @
b6df8e42
...
...
@@ -38,7 +38,7 @@ extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
extractNgrams'
::
Language
->
Text
->
IO
[[
Ngrams
]]
extractNgrams'
lang
t
=
map
(
map
token2text
)
<$>
map
_sentenceTokens
<$>
sentences
<$>
_
sentences
<$>
corenlp
lang
t
-- | This function selects ngrams according to grammars specific
...
...
src/Gargantext/Ngrams/TFICF.hs
0 → 100644
View file @
b6df8e42
{-|
Module : Gargantext.Ngrams.TFICF
Description : TFICF Ngrams tools
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Definition of TFICF
-}
{-# LANGUAGE DeriveGeneric #-}
module
Gargantext.Ngrams.TFICF
where
import
GHC.Generics
(
Generic
)
import
Data.Maybe
(
Maybe
)
import
Data.Text
(
Text
)
import
Text.Show
(
Show
())
-- import Gargantext.Types
import
Gargantext.Prelude
-- | The two kinds of context a 'TFICF' value can refer to.
data Context
  = Corpus
  | Document
  deriving (Show, Generic)
-- | A TFICF record: a term, two contexts and an optional score.
--
-- NOTE(review): presumably the score relates the term's frequency in the
-- first context to the second one; how it is computed is not defined in this
-- module, so confirm against the code that fills the record.
data TFICF = TFICF
  { _tficfTerms    :: Text          -- ^ the term(s) being scored
  , _tficfContext1 :: Context       -- ^ first context
  , _tficfContext2 :: Context       -- ^ second context
  , _tficfScore    :: Maybe Double  -- ^ score, if one is available
  }
  deriving (Show, Generic)
--tfidf :: Text -> TFICF
--tfidf txt = TFICF txt Document Corpus score
-- where
-- score = Nothing
src/Gargantext/Ngrams/TFICF_hs
deleted
100644 → 0
View file @
f152533b
module Data.Gargantext.Ngrams.TFICF where
-- TFICF record: a term, two contexts and an optional score.
-- NOTE(review): `Generics` in the deriving clause does not look like a
-- derivable class; the class exported by GHC.Generics is `Generic`. This
-- stub also has no imports for `Text` or `Context` - confirm before reviving.
data TFICF = TFICF { _tficfTerms :: Text
, _tficfContext1 :: Context
, _tficfContext2 :: Context
, _tficfScore :: Maybe Double
} deriving (Read, Show, Generics)
-- | Wrap a text in a 'TFICF' record with 'Document' as first context,
-- 'Corpus' as second context and no score.
tfidf :: Text -> TFICF
tfidf txt = TFICF txt Document Corpus Nothing
src/Gargantext/Ngrams/Words_hs
deleted
100644 → 0
View file @
f152533b
module Data.Gargantext.Ngrams.Words where
import Data.List (partition)
import Data.Set (fromList, notMember, member)
import Data.Char (isPunctuation, toLower, isAlpha, isSpace)
import NLP.Stemmer (stem, Stemmer(..))
import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
import Language.Aspell.Options (ACOption(..))
--import Data.Either.Utils (fromRight)
import Data.ByteString.Internal (packChars)
-- | Create an Aspell spell checker for the language code @x@.
--
-- The code is packed into a ByteString, wrapped in the 'Lang' option and
-- passed as the only option to 'spellCheckerWithOptions'. The result of that
-- action is returned unchanged, so there is no need to bind it and
-- 'return' it again.
get_lang x = spellCheckerWithOptions [Lang (packChars x)]
-- | 'check' for a 'String' word: the word is packed into a ByteString first.
check' lang = check lang . packChars
-- | 'suggest' for a 'String' word: the word is packed into a ByteString first.
suggest' lang = suggest lang . packChars
--spell_lang <- spellChecker
--lang = fromRight s
--suggest' lang x
-- stem French "naturelles"
-- paragraphes
-- lines
-- sentences
-- Prelude.map (\x -> stem French x) $ cleanText "Les hirondelles s envolent dans les cieux."
-- | Replace an apostrophe or a slash by a space; every other character is
-- returned unchanged.
--
-- Tab and newline are not replaced: the original guards for them are
-- commented out.
repl :: Char -> Char
repl '\'' = ' '
repl '/'  = ' '
-- repl '\t' = ' '
-- repl '\n' = ' '
repl x    = x
-- | Split a text into lower-cased alphabetic words.
--
-- Every character is lower-cased and passed through 'repl'; whatever is then
-- neither alphabetic nor whitespace is dropped before splitting with 'words'.
cleanText :: String -> [String]
cleanText text = words letters
  where
    -- problem with \'
    -- words $ filter (not . isPunctuation) $ Prelude.map toLower text
    normalised = Prelude.map (repl . toLower) text
    letters    = filter keep normalised
    keep ch    = isAlpha ch || isSpace ch
-- | Tell whether a word belongs to the hard-coded set of "miam" words.
isMiamWord :: String -> Bool
isMiamWord word = word `member` miamWords
  where
    miamWords = fromList ["salut", "phrase"]
-- | Tell whether a word belongs to the hard-coded set of stop words.
isStopWord :: String -> Bool
isStopWord word = word `member` stopWords
  where
    stopWords = fromList ["de", "la", "une", "avec"]
-- | Demo entry point: cleans a sample sentence with 'cleanText', then prints
-- first the words partitioned into (non stop words, stop words) and then
-- only the non stop words.
wordsMain :: IO ()
wordsMain = do
  print (partition keep tokens)
  print (filter keep tokens)
  -- print $ filter isStopWord $ words $ filter (not . isPunctuation) text
  where
    text   = "Salut, ceci est une phrase \n\n avec de la ponctuation !"
    tokens = cleanText text
    keep   = not . isStopWord
src/Gargantext/Parsers.hs
View file @
b6df8e42
...
...
@@ -25,11 +25,16 @@ import Gargantext.Prelude
import
System.FilePath
(
takeExtension
,
FilePath
())
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Data.ByteString
as
DB
import
Data.Map
as
DM
import
qualified
Data.ByteString
as
DB
import
qualified
Data.Map
as
DM
import
Data.Either.Extra
(
partitionEithers
)
import
Data.Ord
()
import
Data.Foldable
(
concat
)
import
Data.String
()
import
Data.Either.Extra
(
Either
()
)
import
Data.Text
(
Text
)
import
Data.Text.Encoding
(
decodeUtf8
)
----
--import Control.Monad (join)
import
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
...
...
@@ -57,13 +62,20 @@ data FileFormat = WOS -- Implemented (ISI Format)
-- | XML -- Not Implemented / see :
-- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
parse
::
FileFormat
->
FilePath
->
IO
[
Either
String
[[(
DB
.
ByteString
,
DB
.
ByteString
)]]]
-- TODO: to debug maybe add the filepath in error message
type
ParseError
=
String
parse
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[(
Text
,
Text
)]])
parse
format
path
=
do
files
<-
case
takeExtension
path
of
".zip"
->
openZip
path
_
->
pure
<$>
DB
.
readFile
path
mapConcurrently
(
runParser
format
)
files
(
as
,
bs
)
<-
partitionEithers
<$>
mapConcurrently
(
runParser
format
)
files
pure
(
as
,
map
toText
$
concat
bs
)
where
-- TODO : decode with bayesian inference on encodings
toText
=
map
(
\
(
a
,
b
)
->
(
decodeUtf8
a
,
decodeUtf8
b
))
-- | withParser:
...
...
src/Gargantext/Prelude.hs
View file @
b6df8e42
...
...
@@ -27,13 +27,12 @@ import Protolude ( Bool(True, False), Int, Double, Integer
,
(
+
),
(
*
),
(
/
),
(
-
),
(
.
),
(
>=
),
(
$
),
(
**
),
(
^
),
(
<
),
(
>
)
,
Eq
,
(
==
),
(
<>
)
,
(
&&
),
(
||
),
not
,
toS
,
fst
,
snd
,
toS
)
-- TODO import functions optimized in Utils.Count
-- import Protolude hiding (head, last, all, any, sum, product, length)
-- import Gargantext.Utils.Count
import
qualified
Data.List
as
L
hiding
(
head
,
sum
)
import
qualified
Control.Monad
as
M
import
qualified
Data.Map
as
Map
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment