Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
157
Issues
157
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
746cd19d
Commit
746cd19d
authored
Jan 12, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT/TYPES] Parsers main types.
parent
2cd0e36a
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
123 additions
and
149 deletions
+123
-149
moduleDesc.model
moduleDesc.model
+12
-0
Parsers.hs
src/Data/Gargantext/Parsers.hs
+40
-54
Date.hs
src/Data/Gargantext/Parsers/Date.hs
+8
-10
WOS.hs
src/Data/Gargantext/Parsers/WOS.hs
+47
-56
Prelude.hs
src/Data/Gargantext/Prelude.hs
+14
-24
Main.hs
src/Data/Gargantext/Types/Main.hs
+2
-5
No files found.
moduleDesc.model
0 → 100644
View file @
746cd19d
{-|
Module : Data.Gargantext...
Description : Short description
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
src/Data/Gargantext/Parsers.hs
View file @
746cd19d
...
...
@@ -3,98 +3,84 @@ Module : Data.Gargantext.Parsers
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer :
alexandre.delanoe@iscpif.fr
Maintainer :
team@gargantext.org
Stability : experimental
Portability : POSIX
Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed.
The parsers suppose
,
we know the format of the Text (TextFormat data
type) according which the right parser is chosen among the list of
The parsers suppose we know the format of the Text (TextFormat data
type) according
to
which the right parser is chosen among the list of
available parsers.
This module mainly describe how to add a new parser to Gargantext,
please follow the types.
-}
module
Data.Gargantext.Parsers
(
module
Data
.
Gargantext
.
Parsers
.
WOS
--, module Data.Gargantext.Parsers.XML
--, module Data.Gargantext.Parsers.DOC
--, module Data.Gargantext.Parsers.ODS
)
module
Data.Gargantext.Parsers
-- (parse, FileFormat(..))
where
import
Data.Attoparsec.ByteString
import
Data.ByteString
(
ByteString
)
import
System.FilePath
(
takeExtension
)
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Data.ByteString
as
DB
import
Data.Map
as
DM
import
Data.Either.Extra
(
Either
(
..
))
import
Control.Monad
(
join
)
----
import Data.Either.Extra(Either(..))
----
--
import Control.Monad (join)
import
Codec.Archive.Zip
import
Path.IO
(
resolveFile'
)
-- import qualified Data.ByteString.Lazy as B
import
Control.Applicative
(
(
<$>
)
)
------ import qualified Data.ByteString.Lazy as B
--import Control.Applicative ( (<$>) )
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Gargantext.Parsers.WOS
(
wosParser
)
-- import Data.Gargantext.Parsers.XML (xmlParser)
-- import Data.Gargantext.Parsers.DOC (docParser)
-- import Data.Gargantext.Parsers.ODS (odsParser)
--
--
import Data.Gargantext.Parsers.XML (xmlParser)
--
--
import Data.Gargantext.Parsers.DOC (docParser)
--
--
import Data.Gargantext.Parsers.ODS (odsParser)
import
Data.Gargantext.Prelude
import
Data.Gargantext.Types.Main
(
ErrorMessage
(),
GargParser
(),
Corpus
)
--import Data.Gargantext.Prelude (pm)
--import Data.Gargantext.Types.Main (ErrorMessage
(), Corpus)
-- | According to the format of Input file,
-- different parser are available.
data
FileFormat
=
WOS
-- Implemented (ISI Format)
|
XML
-- Not Implemented / see :
-- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
|
DOC
-- Not Implemented / import Pandoc
|
ODS
-- Not Implemented / import Pandoc
|
PDF
-- Not Implemented / pdftotext and import Pandoc ?
-- | DOC -- Not Implemented / import Pandoc
-- | ODS -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
-- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
---- | withParser:
---- According the format of the text, choosing the right parser.
-- | withParser:
-- According the format of the text, choosing the right parser.
withParser
::
FileFormat
->
GargParser
--withParser :: FileFormat -> ByteString -> IO Corpus
withParser
::
FileFormat
->
Parser
[[(
DB
.
ByteString
,
DB
.
ByteString
)]]
withParser
WOS
=
wosParser
--withParser XML = xmlParser
--withParser DOC = docParser
--withParser ODS = odsParser
withParser
_
=
error
"[ERROR] Parser not implemented yet"
runParser
::
FileFormat
->
ByteString
->
Either
ErrorMessage
(
IO
(
Maybe
Corpus
))
runParser
format
text
=
parseOnly
(
withParser
format
)
text
parseZip
::
FilePath
->
ByteString
->
IO
Corpus
parseZip
=
undefined
parseFile
::
FileFormat
->
ByteString
->
IO
Corpus
parseFile
p
x
=
case
runParser
p
x
of
Left
_
->
pure
0
Right
r
->
pure
$
length
r
--withParser XML = xmlParser
--withParser _ = error "[ERROR] Parser not implemented yet"
runParser
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
String
[[(
DB
.
ByteString
,
DB
.
ByteString
)]])
runParser
format
text
=
pure
$
parseOnly
(
withParser
format
)
text
openZip
Files
::
FilePath
->
IO
[
ByteString
]
openZip
Files
fp
=
do
openZip
::
FilePath
->
IO
[
DB
.
ByteString
]
openZip
fp
=
do
path
<-
resolveFile'
fp
entries
<-
withArchive
path
(
DM
.
keys
<$>
getEntries
)
bs
<-
mapConcurrently
(
\
s
->
withArchive
path
(
getEntry
s
))
entries
pure
bs
wosParserTest
::
FilePath
->
IO
[
Int
]
wosParserTest
fp
=
join
$
mapConcurrently
(
parseFile
WOS
)
<$>
openZipFiles
fp
parse
::
FileFormat
->
FilePath
->
IO
[
Either
String
[[(
DB
.
ByteString
,
DB
.
ByteString
)]]]
parse
format
path
=
do
files
<-
case
takeExtension
path
of
".zip"
->
openZip
path
_
->
pure
<$>
DB
.
readFile
path
mapConcurrently
(
runParser
format
)
files
src/Data/Gargantext/Parsers/Date.hs
View file @
746cd19d
...
...
@@ -3,7 +3,7 @@ Module : Data.Gargantext.Parsers.Date
Description : Some utils to parse dates
Copyright : (c) CNRS 2017
License : AGPL + CECILL v3
Maintainer :
alexandre.delanoe@iscpif.fr
Maintainer :
dev@gargantext.org
Stability : experimental
Portability : POSIX
...
...
@@ -12,13 +12,14 @@ According to the language of the text, parseDate1 returns date as Text:
TODO : Add some tests
import Data.Gargantext.Parsers as DGP
DGP.parseDate1 DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
-}
{-# LANGUAGE OverloadedStrings #-}
module
Data.Gargantext.Parsers.Date
(
parseDate1
,
Lang
(
FR
,
EN
))
where
import
Data.Gargantext.Prelude
import
qualified
Data.Gargantext.Types.Main
as
G
--import
Data.Gargantext.Types.Main as G
import
Data.Time.Clock
(
UTCTime
,
getCurrentTime
)
import
Data.Time.LocalTime.TimeZone.Series
(
zonedTimeToZoneSeriesTime
)
...
...
@@ -40,19 +41,16 @@ import Data.Text (Text)
-- import Duckling.Rules (rulesFor)
-- import Duckling.Debug as DB
import
Safe
(
headMay
)
import
Duckling.Types
(
ResolvedToken
)
import
Safe
(
headMay
)
-- TODO add Paris at Duckling.Locale Region datatype
-- | To get Homogeinity of the languages
-- TODO : put this in a more generic place in the source code
parserLang
::
G
.
Language
->
Lang
parserLang
G
.
FR
=
FR
parserLang
G
.
EN
=
EN
--parserLang :: G.Language -> Lang
--parserLang G.FR = FR
--parserLang G.EN = EN
-- | Final Date parser API
...
...
src/Data/Gargantext/Parsers/WOS.hs
View file @
746cd19d
{-# LANGUAGE OverloadedStrings #-}
module
Data.Gargantext.Parsers.WOS
where
module
Data.Gargantext.Parsers.WOS
(
wosParser
)
where
-- TOFIX : Should import Data.Gargantext.Prelude here
import
Prelude
hiding
(
takeWhile
,
take
,
concat
,
readFile
)
import
Prelude
hiding
(
takeWhile
,
take
,
concat
,
readFile
,
lines
,
concat
)
import
qualified
Data.List
as
DL
import
Data.Attoparsec.ByteString
import
Data.Monoid
((
<>
))
import
Data.Attoparsec.ByteString
(
Parser
,
try
,
string
,
takeTill
,
take
,
manyTill
,
many1
)
import
Data.Attoparsec.ByteString.Char8
(
anyChar
,
isEndOfLine
)
import
Data.ByteString
(
ByteString
)
import
Data.ByteString
(
ByteString
,
concat
)
import
Data.ByteString.Char8
(
pack
)
import
Control.Applicative
import
Data.Gargantext.Types
--import Data.Gargantext.Types
-- | wosParser parses ISI format from
-- Web Of Science Database
wosParser
::
ByteString
->
IO
Corpus
wosParser
=
undefined
wosParser'
::
Parser
[
Maybe
[
ByteString
]]
wosParser'
=
do
wosParser
::
Parser
[[(
ByteString
,
ByteString
)]]
wosParser
=
do
-- TODO Warning if version /= 1.0
-- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
_
<-
manyTill
anyChar
(
string
$
pack
"
\n
VR 1.0"
)
ns
<-
many1
wosNotice
<*
(
string
$
pack
"
\n
EF"
)
return
ns
_
<-
manyTill
anyChar
(
string
$
pack
"
\n
VR 1.0"
)
ns
<-
many1
notice
<*
(
string
$
pack
"
\n
EF"
)
pure
ns
wosNotice
::
Parser
(
Maybe
[
ByteString
])
wosNotice
=
startNotice
*>
wosFields
<*
endNotice
notice
::
Parser
[(
ByteString
,
ByteString
)]
notice
=
start
*>
fields
<*
end
where
endNotice
::
Parser
[
Char
]
endNotice
=
manyTill
anyChar
(
string
$
pack
"
\n
ER
\n
"
)
startNotice
::
Parser
ByteString
startNotice
=
"
\n
PT "
*>
takeTill
isEndOfLine
start
::
Parser
ByteString
start
=
"
\n
PT "
*>
takeTill
isEndOfLine
end
::
Parser
[
Char
]
end
=
manyTill
anyChar
(
string
$
pack
"
\n
ER
\n
"
)
field'
::
Parser
(
ByteString
,
[
ByteString
])
field'
=
do
f
<-
"
\n
"
*>
take
2
<*
" "
a
<-
takeTill
isEndOfLine
as
<-
try
wosLines
let
as'
=
case
DL
.
length
as
>
0
of
True
->
as
False
->
[]
return
(
f
,
[
a
]
++
as'
)
wosFields'
::
Parser
[(
ByteString
,
[
ByteString
])]
wosFields'
=
many
field'
wosFields
::
Parser
(
Maybe
[
ByteString
])
wosFields
=
do
-- a <- field "AU"
-- t <- field "TI"
-- s <- field "SO"
-- d <- field "DI" -- DOI
-- p <- field "PD"
-- b <- field "AB"
-- u <- field "UT"
ws
<-
many
field'
return
$
DL
.
lookup
"UT"
ws
-- return $ HyperdataDocument
-- Just "WOS"
-- DL.lookup "DI" ws
-- DL.lookup "URL" ws
-- DL.lookup "PA" ws
-- DL.lookup "TI" ws
wosLines
::
Parser
[
ByteString
]
wosLines
=
many
line
fields
::
Parser
[(
ByteString
,
ByteString
)]
fields
=
many
field
where
field
::
Parser
(
ByteString
,
ByteString
)
field
=
do
name
<-
"
\n
"
*>
take
2
<*
" "
txt
<-
takeTill
isEndOfLine
txts
<-
try
lines
let
txts'
=
case
DL
.
length
txts
>
0
of
True
->
txts
False
->
[]
pure
(
translate
name
,
concat
([
txt
]
<>
txts'
))
lines
::
Parser
[
ByteString
]
lines
=
many
line
where
line
::
Parser
ByteString
line
=
"
\n
"
*>
takeTill
isEndOfLine
translate
::
ByteString
->
ByteString
translate
champs
|
champs
==
"AU"
=
"author"
|
champs
==
"TI"
=
"title"
|
champs
==
"SO"
=
"source"
|
champs
==
"DI"
=
"doi"
|
champs
==
"PD"
=
"publication_date"
|
champs
==
"AB"
=
"abstract"
|
otherwise
=
champs
src/Data/Gargantext/Prelude.hs
View file @
746cd19d
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# OPTIONS_GHC -fno-warn-type-defaults #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# OPTIONS_GHC -fno-warn-type-defaults #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-
TODO: import head impossible from Protolude: why ?
-}
module
Data.Gargantext.Prelude
where
import
Protolude
(
Bool
(
True
,
False
),
Int
,
Double
,
Integer
,
Fractional
,
Num
,
Maybe
,
Floating
,
Char
,
Ord
,
Integral
,
Foldable
,
RealFrac
,
Monad
,
filter
,
reverse
,
map
,
zip
,
drop
,
take
,
zipWith
,
sum
,
fromIntegral
,
length
,
fmap
,
takeWhile
,
sqrt
,
undefined
,
identity
,
abs
,
maximum
,
minimum
,
return
,
snd
,
truncate
import
Protolude
(
Bool
(
True
,
False
),
Int
,
Double
,
Integer
,
Fractional
,
Num
,
Maybe
,
Floating
,
Char
,
Ord
,
Integral
,
Foldable
,
RealFrac
,
Monad
,
filter
,
reverse
,
map
,
zip
,
drop
,
take
,
zipWith
,
sum
,
fromIntegral
,
length
,
fmap
,
takeWhile
,
sqrt
,
undefined
,
identity
,
abs
,
maximum
,
minimum
,
return
,
snd
,
truncate
,
(
+
),
(
*
),
(
/
),
(
-
),
(
.
),
(
>=
),
(
$
),
(
**
),
(
^
)
)
...
...
@@ -36,7 +26,7 @@ import qualified Data.List as L hiding (head, sum)
import
qualified
Control.Monad
as
M
import
qualified
Data.Map
as
Map
import
qualified
Data.Vector
as
V
import
Safe
(
headMay
)
--
import Safe (headMay)
pf
::
(
a
->
Bool
)
->
[
a
]
->
[
a
]
...
...
src/Data/Gargantext/Types/Main.hs
View file @
746cd19d
...
...
@@ -6,7 +6,7 @@ module Data.Gargantext.Types.Main where
import
Protolude
(
fromMaybe
)
import
Data.ByteString
(
ByteString
())
--
import Data.ByteString (ByteString())
import
Data.Text
(
Text
)
import
Data.Time
(
UTCTime
)
import
Data.Gargantext.Types.Node
(
NodePoly
...
...
@@ -20,7 +20,7 @@ import Data.Gargantext.Types.Node ( NodePoly
-- | Language of a Text
-- For simplicity, we suppose
Text
as an homogenous language
-- For simplicity, we suppose
text h
as an homogenous language
data
Language
=
EN
|
FR
-- | DE | IT | SP
-- > EN == english
-- > FR == french
...
...
@@ -33,9 +33,6 @@ type Ngrams = (Text, Text, Text)
type
ErrorMessage
=
String
-- Parse Texts
type
GargParser
=
ByteString
->
Either
ErrorMessage
Corpus
-- | TODO add Symbolic Node / Document
-- TODO make instances of Nodes
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment