Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
746cd19d
Commit
746cd19d
authored
Jan 12, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT/TYPES] Parsers main types.
parent
2cd0e36a
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
123 additions
and
149 deletions
+123
-149
moduleDesc.model
moduleDesc.model
+12
-0
Parsers.hs
src/Data/Gargantext/Parsers.hs
+40
-54
Date.hs
src/Data/Gargantext/Parsers/Date.hs
+8
-10
WOS.hs
src/Data/Gargantext/Parsers/WOS.hs
+47
-56
Prelude.hs
src/Data/Gargantext/Prelude.hs
+14
-24
Main.hs
src/Data/Gargantext/Types/Main.hs
+2
-5
No files found.
moduleDesc.model
0 → 100644
View file @
746cd19d
{-|
Module : Data.Gargantext...
Description : Short description
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
src/Data/Gargantext/Parsers.hs
View file @
746cd19d
...
@@ -3,98 +3,84 @@ Module : Data.Gargantext.Parsers
...
@@ -3,98 +3,84 @@ Module : Data.Gargantext.Parsers
Description : All parsers of Gargantext in one file.
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
License : AGPL + CECILL v3
Maintainer :
alexandre.delanoe@iscpif.fr
Maintainer :
team@gargantext.org
Stability : experimental
Stability : experimental
Portability : POSIX
Portability : POSIX
Gargantext enables analyzing semi-structured text that should be parsed
Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed.
in order to be analyzed.
The parsers suppose
,
we know the format of the Text (TextFormat data
The parsers suppose we know the format of the Text (TextFormat data
type) according which the right parser is chosen among the list of
type) according
to
which the right parser is chosen among the list of
available parsers.
available parsers.
This module mainly describe how to add a new parser to Gargantext,
This module mainly describe how to add a new parser to Gargantext,
please follow the types.
please follow the types.
-}
-}
module
Data.Gargantext.Parsers
-- (parse, FileFormat(..))
module
Data.Gargantext.Parsers
(
module
Data
.
Gargantext
.
Parsers
.
WOS
--, module Data.Gargantext.Parsers.XML
--, module Data.Gargantext.Parsers.DOC
--, module Data.Gargantext.Parsers.ODS
)
where
where
import
System.FilePath
(
takeExtension
)
import
Data.Attoparsec.ByteString
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Data.ByteString
(
ByteString
)
import
Data.ByteString
as
DB
import
Data.Map
as
DM
import
Data.Map
as
DM
import
Data.Either.Extra
(
Either
(
..
))
----
import Data.Either.Extra(Either(..))
----
import
Control.Monad
(
join
)
--
import Control.Monad (join)
import
Codec.Archive.Zip
import
Codec.Archive.Zip
import
Path.IO
(
resolveFile'
)
import
Path.IO
(
resolveFile'
)
-- import qualified Data.ByteString.Lazy as B
------ import qualified Data.ByteString.Lazy as B
import
Control.Applicative
(
(
<$>
)
)
--import Control.Applicative ( (<$>) )
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Gargantext.Parsers.WOS
(
wosParser
)
import
Data.Gargantext.Parsers.WOS
(
wosParser
)
-- import Data.Gargantext.Parsers.XML (xmlParser)
--
--
import Data.Gargantext.Parsers.XML (xmlParser)
-- import Data.Gargantext.Parsers.DOC (docParser)
--
--
import Data.Gargantext.Parsers.DOC (docParser)
-- import Data.Gargantext.Parsers.ODS (odsParser)
--
--
import Data.Gargantext.Parsers.ODS (odsParser)
import
Data.Gargantext.Prelude
--import Data.Gargantext.Prelude (pm)
import
Data.Gargantext.Types.Main
(
ErrorMessage
(),
GargParser
(),
Corpus
)
--import Data.Gargantext.Types.Main (ErrorMessage
(), Corpus)
-- | According to the format of Input file,
-- | According to the format of Input file,
-- different parser are available.
-- different parser are available.
data
FileFormat
=
WOS
-- Implemented (ISI Format)
data
FileFormat
=
WOS
-- Implemented (ISI Format)
|
XML
-- Not Implemented / see :
-- | DOC -- Not Implemented / import Pandoc
-- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
-- | ODS -- Not Implemented / import Pandoc
|
DOC
-- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
|
ODS
-- Not Implemented / import Pandoc
-- | XML -- Not Implemented / see :
|
PDF
-- Not Implemented / pdftotext and import Pandoc ?
-- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
---- | withParser:
---- According the format of the text, choosing the right parser.
--withParser :: FileFormat -> ByteString -> IO Corpus
-- | withParser:
withParser
::
FileFormat
->
Parser
[[(
DB
.
ByteString
,
DB
.
ByteString
)]]
-- According the format of the text, choosing the right parser.
withParser
::
FileFormat
->
GargParser
withParser
WOS
=
wosParser
withParser
WOS
=
wosParser
--withParser XML = xmlParser
--withParser DOC = docParser
--withParser DOC = docParser
--withParser ODS = odsParser
--withParser ODS = odsParser
withParser
_
=
error
"[ERROR] Parser not implemented yet"
--withParser XML = xmlParser
--withParser _ = error "[ERROR] Parser not implemented yet"
runParser
::
FileFormat
->
ByteString
->
Either
ErrorMessage
(
IO
(
Maybe
Corpus
))
runParser
format
text
=
parseOnly
(
withParser
format
)
text
parseZip
::
FilePath
->
ByteString
->
IO
Corpus
parseZip
=
undefined
parseFile
::
FileFormat
->
ByteString
->
IO
Corpus
parseFile
p
x
=
case
runParser
p
x
of
Left
_
->
pure
0
Right
r
->
pure
$
length
r
runParser
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
String
[[(
DB
.
ByteString
,
DB
.
ByteString
)]])
runParser
format
text
=
pure
$
parseOnly
(
withParser
format
)
text
openZip
Files
::
FilePath
->
IO
[
ByteString
]
openZip
::
FilePath
->
IO
[
DB
.
ByteString
]
openZip
Files
fp
=
do
openZip
fp
=
do
path
<-
resolveFile'
fp
path
<-
resolveFile'
fp
entries
<-
withArchive
path
(
DM
.
keys
<$>
getEntries
)
entries
<-
withArchive
path
(
DM
.
keys
<$>
getEntries
)
bs
<-
mapConcurrently
(
\
s
->
withArchive
path
(
getEntry
s
))
entries
bs
<-
mapConcurrently
(
\
s
->
withArchive
path
(
getEntry
s
))
entries
pure
bs
pure
bs
parse
::
FileFormat
->
FilePath
wosParserTest
::
FilePath
->
IO
[
Int
]
->
IO
[
Either
String
[[(
DB
.
ByteString
,
DB
.
ByteString
)]]]
wosParserTest
fp
=
join
$
mapConcurrently
(
parseFile
WOS
)
<$>
openZipFiles
fp
parse
format
path
=
do
files
<-
case
takeExtension
path
of
".zip"
->
openZip
path
_
->
pure
<$>
DB
.
readFile
path
mapConcurrently
(
runParser
format
)
files
src/Data/Gargantext/Parsers/Date.hs
View file @
746cd19d
...
@@ -3,7 +3,7 @@ Module : Data.Gargantext.Parsers.Date
...
@@ -3,7 +3,7 @@ Module : Data.Gargantext.Parsers.Date
Description : Some utils to parse dates
Description : Some utils to parse dates
Copyright : (c) CNRS 2017
Copyright : (c) CNRS 2017
License : AGPL + CECILL v3
License : AGPL + CECILL v3
Maintainer :
alexandre.delanoe@iscpif.fr
Maintainer :
dev@gargantext.org
Stability : experimental
Stability : experimental
Portability : POSIX
Portability : POSIX
...
@@ -12,13 +12,14 @@ According to the language of the text, parseDate1 returns date as Text:
...
@@ -12,13 +12,14 @@ According to the language of the text, parseDate1 returns date as Text:
TODO : Add some tests
TODO : Add some tests
import Data.Gargantext.Parsers as DGP
import Data.Gargantext.Parsers as DGP
DGP.parseDate1 DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
DGP.parseDate1 DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
-}
-}
{-# LANGUAGE OverloadedStrings #-}
module
Data.Gargantext.Parsers.Date
(
parseDate1
,
Lang
(
FR
,
EN
))
where
module
Data.Gargantext.Parsers.Date
(
parseDate1
,
Lang
(
FR
,
EN
))
where
import
Data.Gargantext.Prelude
import
Data.Gargantext.Prelude
import
qualified
Data.Gargantext.Types.Main
as
G
--import
Data.Gargantext.Types.Main as G
import
Data.Time.Clock
(
UTCTime
,
getCurrentTime
)
import
Data.Time.Clock
(
UTCTime
,
getCurrentTime
)
import
Data.Time.LocalTime.TimeZone.Series
(
zonedTimeToZoneSeriesTime
)
import
Data.Time.LocalTime.TimeZone.Series
(
zonedTimeToZoneSeriesTime
)
...
@@ -40,19 +41,16 @@ import Data.Text (Text)
...
@@ -40,19 +41,16 @@ import Data.Text (Text)
-- import Duckling.Rules (rulesFor)
-- import Duckling.Rules (rulesFor)
-- import Duckling.Debug as DB
-- import Duckling.Debug as DB
import
Safe
(
headMay
)
import
Duckling.Types
(
ResolvedToken
)
import
Duckling.Types
(
ResolvedToken
)
import
Safe
(
headMay
)
-- TODO add Paris at Duckling.Locale Region datatype
-- TODO add Paris at Duckling.Locale Region datatype
-- | To get Homogeinity of the languages
-- | To get Homogeinity of the languages
-- TODO : put this in a more generic place in the source code
-- TODO : put this in a more generic place in the source code
parserLang
::
G
.
Language
->
Lang
--parserLang :: G.Language -> Lang
parserLang
G
.
FR
=
FR
--parserLang G.FR = FR
parserLang
G
.
EN
=
EN
--parserLang G.EN = EN
-- | Final Date parser API
-- | Final Date parser API
...
...
src/Data/Gargantext/Parsers/WOS.hs
View file @
746cd19d
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedStrings #-}
module
Data.Gargantext.Parsers.WOS
where
module
Data.Gargantext.Parsers.WOS
(
wosParser
)
where
-- TOFIX : Should import Data.Gargantext.Prelude here
-- TOFIX : Should import Data.Gargantext.Prelude here
import
Prelude
hiding
(
takeWhile
,
take
,
concat
,
readFile
)
import
Prelude
hiding
(
takeWhile
,
take
,
concat
,
readFile
,
lines
,
concat
)
import
qualified
Data.List
as
DL
import
qualified
Data.List
as
DL
import
Data.Attoparsec.ByteString
import
Data.Monoid
((
<>
))
import
Data.Attoparsec.ByteString
(
Parser
,
try
,
string
,
takeTill
,
take
,
manyTill
,
many1
)
import
Data.Attoparsec.ByteString.Char8
(
anyChar
,
isEndOfLine
)
import
Data.Attoparsec.ByteString.Char8
(
anyChar
,
isEndOfLine
)
import
Data.ByteString
(
ByteString
)
import
Data.ByteString
(
ByteString
,
concat
)
import
Data.ByteString.Char8
(
pack
)
import
Data.ByteString.Char8
(
pack
)
import
Control.Applicative
import
Control.Applicative
--import Data.Gargantext.Types
import
Data.Gargantext.Types
-- | wosParser parses ISI format from
-- | wosParser parses ISI format from
-- Web Of Science Database
-- Web Of Science Database
wosParser
::
ByteString
->
IO
Corpus
wosParser
::
Parser
[[(
ByteString
,
ByteString
)]]
wosParser
=
undefined
wosParser
=
do
wosParser'
::
Parser
[
Maybe
[
ByteString
]]
wosParser'
=
do
-- TODO Warning if version /= 1.0
-- TODO Warning if version /= 1.0
-- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
-- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
_
<-
manyTill
anyChar
(
string
$
pack
"
\n
VR 1.0"
)
_
<-
manyTill
anyChar
(
string
$
pack
"
\n
VR 1.0"
)
ns
<-
many1
wosNotice
<*
(
string
$
pack
"
\n
EF"
)
ns
<-
many1
notice
<*
(
string
$
pack
"
\n
EF"
)
return
ns
pure
ns
wosNotice
::
Parser
(
Maybe
[
ByteString
])
notice
::
Parser
[(
ByteString
,
ByteString
)]
wosNotice
=
startNotice
*>
wosFields
<*
endNotice
notice
=
start
*>
fields
<*
end
where
where
endNotice
::
Parser
[
Char
]
start
::
Parser
ByteString
endNotice
=
manyTill
anyChar
(
string
$
pack
"
\n
ER
\n
"
)
start
=
"
\n
PT "
*>
takeTill
isEndOfLine
startNotice
::
Parser
ByteString
end
::
Parser
[
Char
]
startNotice
=
"
\n
PT "
*>
takeTill
isEndOfLine
end
=
manyTill
anyChar
(
string
$
pack
"
\n
ER
\n
"
)
field'
::
Parser
(
ByteString
,
[
ByteString
])
field'
=
do
f
<-
"
\n
"
*>
take
2
<*
" "
a
<-
takeTill
isEndOfLine
as
<-
try
wosLines
let
as'
=
case
DL
.
length
as
>
0
of
True
->
as
False
->
[]
return
(
f
,
[
a
]
++
as'
)
wosFields'
::
Parser
[(
ByteString
,
[
ByteString
])]
fields
::
Parser
[(
ByteString
,
ByteString
)]
wosFields'
=
many
field'
fields
=
many
field
where
wosFields
::
Parser
(
Maybe
[
ByteString
])
field
::
Parser
(
ByteString
,
ByteString
)
wosFields
=
do
field
=
do
-- a <- field "AU"
name
<-
"
\n
"
*>
take
2
<*
" "
-- t <- field "TI"
txt
<-
takeTill
isEndOfLine
-- s <- field "SO"
txts
<-
try
lines
-- d <- field "DI" -- DOI
let
txts'
=
case
DL
.
length
txts
>
0
of
-- p <- field "PD"
True
->
txts
-- b <- field "AB"
False
->
[]
-- u <- field "UT"
pure
(
translate
name
,
concat
([
txt
]
<>
txts'
))
ws
<-
many
field'
return
$
DL
.
lookup
"UT"
ws
-- return $ HyperdataDocument
lines
::
Parser
[
ByteString
]
-- Just "WOS"
lines
=
many
line
-- DL.lookup "DI" ws
-- DL.lookup "URL" ws
-- DL.lookup "PA" ws
-- DL.lookup "TI" ws
wosLines
::
Parser
[
ByteString
]
wosLines
=
many
line
where
where
line
::
Parser
ByteString
line
::
Parser
ByteString
line
=
"
\n
"
*>
takeTill
isEndOfLine
line
=
"
\n
"
*>
takeTill
isEndOfLine
translate
::
ByteString
->
ByteString
translate
champs
|
champs
==
"AU"
=
"author"
|
champs
==
"TI"
=
"title"
|
champs
==
"SO"
=
"source"
|
champs
==
"DI"
=
"doi"
|
champs
==
"PD"
=
"publication_date"
|
champs
==
"AB"
=
"abstract"
|
otherwise
=
champs
src/Data/Gargantext/Prelude.hs
View file @
746cd19d
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# OPTIONS_GHC -fno-warn-type-defaults #-}
{-# OPTIONS_GHC -fno-warn-type-defaults #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-
TODO: import head impossible from Protolude: why ?
-}
module
Data.Gargantext.Prelude
where
module
Data.Gargantext.Prelude
where
import
Protolude
(
Bool
(
True
,
False
),
Int
,
Double
,
Integer
,
Fractional
,
Num
,
Maybe
,
Floating
,
Char
,
Ord
,
Integral
,
Foldable
,
RealFrac
,
Monad
,
filter
,
import
Protolude
(
Bool
(
True
,
False
),
Int
,
Double
,
Integer
reverse
,
Fractional
,
Num
,
Maybe
,
Floating
,
Char
,
map
,
Ord
,
Integral
,
Foldable
,
RealFrac
,
Monad
,
filter
,
zip
,
reverse
,
map
,
zip
,
drop
,
take
,
zipWith
,
drop
,
sum
,
fromIntegral
,
length
,
fmap
,
take
,
takeWhile
,
sqrt
,
undefined
,
identity
,
zipWith
,
abs
,
maximum
,
minimum
,
return
,
snd
,
truncate
,
sum
,
fromIntegral
,
length
,
fmap
,
takeWhile
,
sqrt
,
undefined
,
identity
,
abs
,
maximum
,
minimum
,
return
,
snd
,
truncate
,
(
+
),
(
*
),
(
/
),
(
-
),
(
.
),
(
>=
),
(
$
),
(
**
),
(
^
)
,
(
+
),
(
*
),
(
/
),
(
-
),
(
.
),
(
>=
),
(
$
),
(
**
),
(
^
)
)
)
...
@@ -36,7 +26,7 @@ import qualified Data.List as L hiding (head, sum)
...
@@ -36,7 +26,7 @@ import qualified Data.List as L hiding (head, sum)
import
qualified
Control.Monad
as
M
import
qualified
Control.Monad
as
M
import
qualified
Data.Map
as
Map
import
qualified
Data.Map
as
Map
import
qualified
Data.Vector
as
V
import
qualified
Data.Vector
as
V
import
Safe
(
headMay
)
--
import Safe (headMay)
pf
::
(
a
->
Bool
)
->
[
a
]
->
[
a
]
pf
::
(
a
->
Bool
)
->
[
a
]
->
[
a
]
...
...
src/Data/Gargantext/Types/Main.hs
View file @
746cd19d
...
@@ -6,7 +6,7 @@ module Data.Gargantext.Types.Main where
...
@@ -6,7 +6,7 @@ module Data.Gargantext.Types.Main where
import
Protolude
(
fromMaybe
)
import
Protolude
(
fromMaybe
)
import
Data.ByteString
(
ByteString
())
--
import Data.ByteString (ByteString())
import
Data.Text
(
Text
)
import
Data.Text
(
Text
)
import
Data.Time
(
UTCTime
)
import
Data.Time
(
UTCTime
)
import
Data.Gargantext.Types.Node
(
NodePoly
import
Data.Gargantext.Types.Node
(
NodePoly
...
@@ -20,7 +20,7 @@ import Data.Gargantext.Types.Node ( NodePoly
...
@@ -20,7 +20,7 @@ import Data.Gargantext.Types.Node ( NodePoly
-- | Language of a Text
-- | Language of a Text
-- For simplicity, we suppose
Text
as an homogenous language
-- For simplicity, we suppose
text h
as an homogenous language
data
Language
=
EN
|
FR
-- | DE | IT | SP
data
Language
=
EN
|
FR
-- | DE | IT | SP
-- > EN == english
-- > EN == english
-- > FR == french
-- > FR == french
...
@@ -33,9 +33,6 @@ type Ngrams = (Text, Text, Text)
...
@@ -33,9 +33,6 @@ type Ngrams = (Text, Text, Text)
type
ErrorMessage
=
String
type
ErrorMessage
=
String
-- Parse Texts
type
GargParser
=
ByteString
->
Either
ErrorMessage
Corpus
-- | TODO add Symbolic Node / Document
-- | TODO add Symbolic Node / Document
-- TODO make instances of Nodes
-- TODO make instances of Nodes
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment