Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
33f1f510
Commit
33f1f510
authored
May 03, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] Parsers RIS && Presse.
parent
dd670d8e
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
177 additions
and
112 deletions
+177
-112
Parsers.hs
src/Gargantext/Text/Parsers.hs
+37
-27
CSV.hs
src/Gargantext/Text/Parsers/CSV.hs
+22
-3
Date.hs
src/Gargantext/Text/Parsers/Date.hs
+77
-73
RIS.hs
src/Gargantext/Text/Parsers/RIS.hs
+41
-9
No files found.
src/Gargantext/Text/Parsers.hs
View file @
33f1f510
...
...
@@ -22,42 +22,38 @@ please follow the types.
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Parsers
(
parse
,
FileFormat
(
..
),
clean
,
parseDocs
)
module
Gargantext.Text.Parsers
(
parse
,
FileFormat
(
..
),
clean
,
parseDocs
,
risPress2csv
)
where
import
System.FilePath
(
FilePath
(),
takeExtension
)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Control.Monad
(
join
)
import
qualified
Data.Time
as
DT
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Data.Either
(
Either
(
..
))
import
Data.Either.Extra
(
partitionEithers
)
import
Data.Time
(
UTCTime
(
..
))
import
Data.List
(
concat
)
import
qualified
Data.Map
as
DM
import
qualified
Data.ByteString
as
DB
import
Data.List
(
lookup
)
import
Data.Ord
()
import
Data.String
(
String
())
import
Data.String
()
import
Data.Either
(
Either
(
..
))
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Data.Text
(
Text
)
import
qualified
Data.Text
as
DT
-- Activate Async for to parse in parallel
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Text.Encoding
(
decodeUtf8
)
import
Data.String
(
String
())
import
Data.List
(
lookup
)
import
Data.Time
(
UTCTime
(
..
))
import
Data.Tuple.Extra
(
both
,
second
)
import
System.FilePath
(
FilePath
(),
takeExtension
)
import
qualified
Data.ByteString
as
DB
import
qualified
Data.Map
as
DM
import
qualified
Data.Text
as
DT
import
qualified
Data.Time
as
DT
------------------------------------------------------------------------
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Prelude
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Text.Parsers.WOS
(
wosParser
)
import
Gargantext.Text.Parsers.RIS
(
risParser
)
import
Gargantext.Text.Parsers.RIS
(
risParser
,
presseParser
)
import
Gargantext.Text.Parsers.Date
(
parseDate
)
import
Gargantext.Text.Parsers.CSV
(
parseHal
)
import
Gargantext.Text.Parsers.CSV
(
parseHal
,
writeDocs2Csv
)
import
Gargantext.Text.Terms.Stop
(
detectLang
)
------------------------------------------------------------------------
...
...
@@ -72,7 +68,7 @@ type ParseError = String
-- | According to the format of Input file,
-- different parser are available.
data
FileFormat
=
WOS
|
RIS
|
CsvHalFormat
-- | CsvGargV3
data
FileFormat
=
WOS
|
RIS
|
CsvHalFormat
|
RisPresse
-- | CsvGargV3
deriving
(
Show
)
-- Implemented (ISI Format)
...
...
@@ -88,8 +84,9 @@ data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3
-- | Parse file into documents
-- TODO manage errors here
parseDocs
::
FileFormat
->
FilePath
->
IO
[
HyperdataDocument
]
parseDocs
ff
path
=
join
$
mapM
(
toDoc
ff
)
<$>
snd
<$>
parse
ff
path
parseDocs
CsvHalFormat
p
=
parseHal
p
parseDocs
RisPresse
p
=
join
$
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
presseParser
<$>
parse'
RIS
p
parseDocs
ff
path
=
join
$
mapM
(
toDoc
ff
)
<$>
snd
<$>
parse
ff
path
type
Year
=
Int
type
Month
=
Int
...
...
@@ -138,15 +135,23 @@ toDoc ff d = do
toDoc
_
_
=
undefined
parse
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[(
Text
,
Text
)]])
parse
format
path
=
do
parse
ff
fp
=
enrichWith
identity
<$>
parse'
ff
fp
enrichWith
::
([(
DB
.
ByteString
,
DB
.
ByteString
)]
->
[(
DB
.
ByteString
,
DB
.
ByteString
)])
->
(
a
,
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
->
(
a
,
[[(
Text
,
Text
)]])
enrichWith
f
=
second
(
map
both'
.
map
f
.
concat
)
where
both'
=
map
(
both
decodeUtf8
)
parse'
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
parse'
format
path
=
do
files
<-
case
takeExtension
path
of
".zip"
->
openZip
path
_
->
pure
<$>
DB
.
readFile
path
(
as
,
bs
)
<-
partitionEithers
<$>
mapConcurrently
(
runParser
format
)
files
pure
(
as
,
map
toText
$
concat
bs
)
where
-- TODO : decode with bayesian inference on encodings
toText
=
map
(
\
(
a
,
b
)
->
(
decodeUtf8
a
,
decodeUtf8
b
))
partitionEithers
<$>
mapConcurrently
(
runParser
format
)
files
-- | withParser:
...
...
@@ -175,3 +180,8 @@ clean txt = DT.map clean' txt
clean'
'’'
=
'
\'
'
clean'
c
=
c
risPress2csv
f
=
parseDocs
RisPresse
(
f
<>
".ris"
)
>>=
\
hs
->
writeDocs2Csv
(
f
<>
".csv"
)
hs
src/Gargantext/Text/Parsers/CSV.hs
View file @
33f1f510
...
...
@@ -25,14 +25,14 @@ import Control.Applicative
import
Data.Char
(
ord
)
import
Data.Csv
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
)
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
,
unpack
)
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Time.Segment
(
jour
)
import
Data.Vector
(
Vector
)
import
qualified
Data.Vector
as
V
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Types.Node
--
(HyperdataDocument(..))
import
Gargantext.Text
import
Gargantext.Text.Context
import
Gargantext.Prelude
hiding
(
length
)
...
...
@@ -83,6 +83,10 @@ doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
Nothing
Nothing
Nothing
---------------------------------------------------------------
-- | Types Conversions
toDocs
::
Vector
CsvDoc
->
[
Doc
]
...
...
@@ -174,6 +178,19 @@ instance ToNamedRecord CsvDoc where
,
"authors"
.=
aut
]
hyperdataDocument2csvDoc
::
HyperdataDocument
->
CsvDoc
hyperdataDocument2csvDoc
h
=
CsvDoc
(
m
$
_hyperdataDocument_title
h
)
(
m
$
_hyperdataDocument_source
h
)
(
mI
$
_hyperdataDocument_publication_year
h
)
(
mI
$
_hyperdataDocument_publication_month
h
)
(
mI
$
_hyperdataDocument_publication_day
h
)
(
m
$
_hyperdataDocument_abstract
h
)
(
m
$
_hyperdataDocument_authors
h
)
where
m
=
maybe
""
identity
mI
=
maybe
0
identity
csvDecodeOptions
::
DecodeOptions
csvDecodeOptions
=
(
defaultDecodeOptions
...
...
@@ -212,7 +229,9 @@ writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv
fp
(
h
,
vs
)
=
BL
.
writeFile
fp
$
encodeByNameWith
csvEncodeOptions
h
(
V
.
toList
vs
)
writeDocs2Csv
::
FilePath
->
[
HyperdataDocument
]
->
IO
()
writeDocs2Csv
fp
hs
=
BL
.
writeFile
fp
$
encodeByNameWith
csvEncodeOptions
headerCsvGargV3
(
map
hyperdataDocument2csvDoc
hs
)
------------------------------------------------------------------------
-- Hal Format
data
CsvHal
=
CsvHal
...
...
src/Gargantext/Text/Parsers/Date.hs
View file @
33f1f510
...
...
@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Parsers.Date
(
parseDate
,
parseDateRaw
)
where
module
Gargantext.Text.Parsers.Date
(
parseDate
,
parseDateRaw
,
parseGregorian
,
wrapDST
)
where
import
Data.HashMap.Strict
as
HM
hiding
(
map
)
import
Data.Text
(
Text
,
unpack
,
splitOn
,
pack
)
...
...
@@ -38,23 +38,23 @@ import qualified Data.HashSet as HashSet
import
qualified
Duckling.Core
as
DC
-- | Unused import (to parse Date Format, keeping it for maybe next steps)
--
import Control.Monad ((=<<))
--
import Data.Either (Either)
--
import Data.Fixed (Fixed (MkFixed))
--
import Data.Foldable (length)
--
import Data.String (String)
--
import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
--
import Data.Time.Calendar (Day, fromGregorian)
--
import Duckling.Debug as DB
--
import Duckling.Engine (parseAndResolve)
--
import Duckling.Rules (rulesFor)
--
import Prelude (toInteger, div, otherwise, (++))
--
import Text.Parsec.Error (ParseError)
--
import Text.Parsec.Prim (Stream, ParsecT)
--
import Text.Parsec.String (Parser)
--
import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
--
import Text.XML.HXT.DOM.Util (decimalStringToInt)
--
import qualified Text.ParserCombinators.Parsec (parse)
import
Control.Monad
((
=<<
))
import
Data.Either
(
Either
)
import
Data.Fixed
(
Fixed
(
MkFixed
))
import
Data.Foldable
(
length
)
import
Data.String
(
String
)
import
Data.Time
(
ZonedTime
(
..
),
LocalTime
(
..
),
TimeZone
(
..
),
TimeOfDay
(
..
))
import
Data.Time.Calendar
(
Day
,
fromGregorian
)
import
Duckling.Debug
as
DB
import
Duckling.Engine
(
parseAndResolve
)
import
Duckling.Rules
(
rulesFor
)
import
Prelude
(
toInteger
,
div
,
otherwise
,
(
++
))
import
Text.Parsec.Error
(
ParseError
)
import
Text.Parsec.Prim
(
Stream
,
ParsecT
)
import
Text.Parsec.String
(
Parser
)
import
Text.ParserCombinators.Parsec
(
many1
,
noneOf
,
anyChar
,
char
,
oneOf
)
import
Text.XML.HXT.DOM.Util
(
decimalStringToInt
)
import
qualified
Text.ParserCombinators.Parsec
(
parse
)
------------------------------------------------------------------------
-- | Date Parser
...
...
@@ -64,13 +64,17 @@ import qualified Duckling.Core as DC
-- >>> parseDate EN (pack "April 10 1979")
-- 1979-04-10 00:00:00 UTC
parseDate
::
Lang
->
Text
->
IO
UTCTime
parseDate
lang
s
=
do
parseDate
lang
s
=
parseDate'
"%Y-%m-%dT%T"
"0-0-0T0:0:0"
lang
s
type
DateFormat
=
Text
type
DateNull
=
Text
parseDate'
::
DateFormat
->
DateNull
->
Lang
->
Text
->
IO
UTCTime
parseDate'
format
def
lang
s
=
do
dateStr'
<-
parseDateRaw
lang
s
let
format
=
"%Y-%m-%dT%T"
let
dateStr
=
unpack
$
maybe
"0-0-0T0:0:0"
identity
let
dateStr
=
unpack
$
maybe
def
identity
$
head
$
splitOn
"."
dateStr'
pure
$
parseTimeOrError
True
defaultTimeLocale
format
dateStr
pure
$
parseTimeOrError
True
defaultTimeLocale
(
unpack
format
)
dateStr
-- TODO add Paris at Duckling.Locale Region datatype
...
...
@@ -117,63 +121,63 @@ parseDateWithDuckling lang input = do
pure
$
analyze
input
contxt
$
HashSet
.
fromList
[(
This
Time
)]
-- | Permit to transform a String to an Int in a monadic context
--
wrapDST :: Monad m => String -> m Int
--wrapDST = (return . decimalStringToInt)
wrapDST
::
Monad
m
=>
String
->
m
Int
wrapDST
=
return
.
decimalStringToInt
-- | Generic parser which take at least one element not given in argument
--
many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
--
many1NoneOf = (many1 . noneOf)
many1NoneOf
::
Stream
s
m
Char
=>
[
Char
]
->
ParsecT
s
u
m
[
Char
]
many1NoneOf
=
(
many1
.
noneOf
)
--getMultiplicator :: Int -> Int
--
getMultiplicator a
--
| 0 >= a = 1
--
| otherwise = 10 * (getMultiplicator $ div a 10)
getMultiplicator
a
|
0
>=
a
=
1
|
otherwise
=
10
*
(
getMultiplicator
$
div
a
10
)
-- | Parser for date format y-m-d
--
parseGregorian :: Parser Day
--
parseGregorian = do
--
y <- wrapDST =<< many1NoneOf ['-']
--
_ <- char '-'
--
m <- wrapDST =<< many1NoneOf ['-']
--
_ <- char '-'
--
d <- wrapDST =<< many1NoneOf ['T']
--
_ <- char 'T'
--
return $ fromGregorian (toInteger y) m d
--
parseGregorian
::
Parser
Day
parseGregorian
=
do
y
<-
wrapDST
=<<
many1NoneOf
[
'-'
]
_
<-
char
'-'
m
<-
wrapDST
=<<
many1NoneOf
[
'-'
]
_
<-
char
'-'
d
<-
wrapDST
=<<
many1NoneOf
[
'T'
]
_
<-
char
'T'
return
$
fromGregorian
(
toInteger
y
)
m
d
---- | Parser for time format h:m:s
--
parseTimeOfDay :: Parser TimeOfDay
--
parseTimeOfDay = do
--
h <- wrapDST =<< many1NoneOf [':']
--
_ <- char ':'
--
m <- wrapDST =<< many1NoneOf [':']
--
_ <- char ':'
--
r <- many1NoneOf ['.']
--
_ <- char '.'
--
dec <- many1NoneOf ['+', '-']
--
let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
--
seconds = nb * 10^(12-l)
--
return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
--
--
parseTimeOfDay
::
Parser
TimeOfDay
parseTimeOfDay
=
do
h
<-
wrapDST
=<<
many1NoneOf
[
':'
]
_
<-
char
':'
m
<-
wrapDST
=<<
many1NoneOf
[
':'
]
_
<-
char
':'
r
<-
many1NoneOf
[
'.'
]
_
<-
char
'.'
dec
<-
many1NoneOf
[
'+'
,
'-'
]
let
(
nb
,
l
)
=
(
decimalStringToInt
$
r
++
dec
,
length
dec
)
seconds
=
nb
*
10
^
(
12
-
l
)
return
$
TimeOfDay
h
m
(
MkFixed
.
toInteger
$
seconds
)
-- | Parser for timezone format +hh:mm
--
parseTimeZone :: Parser TimeZone
--
parseTimeZone = do
--
sign <- oneOf ['+', '-']
--
h <- wrapDST =<< many1NoneOf [':']
--
_ <- char ':'
--
m <- wrapDST =<< (many1 $ anyChar)
--
let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
--
in return $ TimeZone timeInMinute False "CET"
--
parseTimeZone
::
Parser
TimeZone
parseTimeZone
=
do
sign
<-
oneOf
[
'+'
,
'-'
]
h
<-
wrapDST
=<<
many1NoneOf
[
':'
]
_
<-
char
':'
m
<-
wrapDST
=<<
(
many1
$
anyChar
)
let
timeInMinute
=
if
sign
==
'+'
then
h
*
60
+
m
else
-
h
*
60
-
m
in
return
$
TimeZone
timeInMinute
False
"CET"
---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
--
parseZonedTime :: Parser ZonedTime
--
parseZonedTime= do
--
d <- parseGregorian
--
tod <- parseTimeOfDay
--
tz <- parseTimeZone
--
return $ ZonedTime (LocalTime d (tod)) tz
--
parseZonedTime
::
Parser
ZonedTime
parseZonedTime
=
do
d
<-
parseGregorian
tod
<-
parseTimeOfDay
tz
<-
parseTimeZone
return
$
ZonedTime
(
LocalTime
d
(
tod
))
tz
---- | Opposite of toRFC3339
--
fromRFC3339 :: Text -> Either ParseError ZonedTime
--
fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
--
where input = unpack t
fromRFC3339
::
Text
->
Either
ParseError
ZonedTime
fromRFC3339
t
=
Text
.
ParserCombinators
.
Parsec
.
parse
parseZonedTime
"ERROR: Couldn't parse zoned time."
input
where
input
=
unpack
t
src/Gargantext/Text/Parsers/RIS.hs
View file @
33f1f510
...
...
@@ -10,28 +10,30 @@ Portability : POSIX
RIS is a standardized tag format developed by Research Information
Systems, Incorporated (the format name refers to the company) to enable
citation programs to exchange data.[More](https://en.wikipedia.org/wiki/RIS_(file_format))
citation programs to exchange data.
[More](https://en.wikipedia.org/wiki/RIS_(file_format))
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Parsers.RIS
(
risParser
)
where
module
Gargantext.Text.Parsers.RIS
(
risParser
,
risDate
,
toDate
,
presseParser
)
where
import
Data.Either
(
either
)
import
Data.List
(
lookup
)
import
Data.Tuple.Extra
(
first
)
import
Control.Applicative
import
Data.Attoparsec.ByteString
(
Parser
,
try
,
string
,
takeTill
,
take
,
manyTill
,
many1
,
endOfInput
)
import
Data.Attoparsec.ByteString
(
Parser
,
try
,
string
,
takeTill
,
take
,
manyTill
,
many1
,
endOfInput
,
parseOnly
)
import
Data.Attoparsec.ByteString.Char8
(
anyChar
,
isEndOfLine
)
import
Data.ByteString
(
ByteString
,
concat
)
import
Data.ByteString
(
ByteString
,
concat
,
length
)
import
Data.ByteString.Char8
(
pack
)
import
Data.Monoid
((
<>
))
import
Gargantext.Prelude
hiding
(
takeWhile
,
take
,
concat
,
readFile
,
lines
,
concat
)
import
qualified
Data.List
as
DL
-------------------------------------------------------------
data
Lines
=
OneLine
|
MultiLine
risParser
::
Parser
[[(
ByteString
,
ByteString
)]]
risParser
=
do
n
<-
notice
"TY -"
...
...
@@ -57,7 +59,6 @@ field = do
False
->
[]
pure
(
translate
name
,
concat
([
txt
]
<>
txts'
))
lines
::
Parser
[
ByteString
]
lines
=
many
line
where
...
...
@@ -72,7 +73,38 @@ translate champs
|
champs
==
"LA"
=
"language"
|
champs
==
"DI"
=
"doi"
|
champs
==
"UR"
=
"url"
|
champs
==
"DA"
=
"publication_date"
|
champs
==
"N2"
=
"abstract"
|
otherwise
=
champs
-------------------------------------------------------------
presseParser
::
[(
ByteString
,
ByteString
)]
->
[(
ByteString
,
ByteString
)]
presseParser
=
(
toDate
"DA"
(
\
x
->
either
(
const
[]
)
identity
$
parseOnly
risDate
x
))
.
(
toDate
"LA"
presseLang
)
risDate
::
Parser
[(
ByteString
,
ByteString
)]
risDate
=
do
day
<-
take
2
<*
"/"
mon
<-
take
2
<*
"/"
yea
<-
take
4
pure
$
map
(
first
(
\
x
->
"publication_"
<>
x
))
[
(
"day"
,
day
)
,
(
"month"
,
mon
)
,
(
"year"
,
yea
)
,
(
"date"
,
yea
<>
"-"
<>
mon
<>
"-"
<>
day
<>
"T0:0:0"
)
]
toDate
::
ByteString
->
(
ByteString
->
[(
ByteString
,
ByteString
)])
->
[(
ByteString
,
ByteString
)]
->
[(
ByteString
,
ByteString
)]
toDate
k
f
m
=
m
<>
(
maybe
[]
f
(
lookup
k
m
)
)
presseLang
::
ByteString
->
[(
ByteString
,
ByteString
)]
presseLang
"Français"
=
[(
"language"
,
"FR"
)]
presseLang
"English"
=
[(
"langauge"
,
"EN"
)]
presseLang
_
=
undefined
{-
fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixTitle ns = ns <> [ti, ab]
where
ti = case
-}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment