Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
33f1f510
Commit
33f1f510
authored
May 03, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] Parsers RIS && Presse.
parent
dd670d8e
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
177 additions
and
112 deletions
+177
-112
Parsers.hs
src/Gargantext/Text/Parsers.hs
+37
-27
CSV.hs
src/Gargantext/Text/Parsers/CSV.hs
+22
-3
Date.hs
src/Gargantext/Text/Parsers/Date.hs
+77
-73
RIS.hs
src/Gargantext/Text/Parsers/RIS.hs
+41
-9
No files found.
src/Gargantext/Text/Parsers.hs
View file @
33f1f510
...
@@ -22,42 +22,38 @@ please follow the types.
...
@@ -22,42 +22,38 @@ please follow the types.
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Parsers
(
parse
,
FileFormat
(
..
),
clean
,
parseDocs
)
module
Gargantext.Text.Parsers
(
parse
,
FileFormat
(
..
),
clean
,
parseDocs
,
risPress2csv
)
where
where
import
System.FilePath
(
FilePath
(),
takeExtension
)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Control.Monad
(
join
)
import
Control.Monad
(
join
)
import
qualified
Data.Time
as
DT
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Data.Either
(
Either
(
..
))
import
Data.Either.Extra
(
partitionEithers
)
import
Data.Either.Extra
(
partitionEithers
)
import
Data.Time
(
UTCTime
(
..
))
import
Data.List
(
concat
)
import
Data.List
(
concat
)
import
qualified
Data.Map
as
DM
import
Data.List
(
lookup
)
import
qualified
Data.ByteString
as
DB
import
Data.Ord
()
import
Data.Ord
()
import
Data.String
(
String
())
import
Data.String
()
import
Data.String
()
import
Data.Either
(
Either
(
..
))
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Data.Text
(
Text
)
import
Data.Text
(
Text
)
import
qualified
Data.Text
as
DT
-- Activate Async for to parse in parallel
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Text.Encoding
(
decodeUtf8
)
import
Data.Text.Encoding
(
decodeUtf8
)
import
Data.String
(
String
())
import
Data.Time
(
UTCTime
(
..
))
import
Data.List
(
lookup
)
import
Data.Tuple.Extra
(
both
,
second
)
import
System.FilePath
(
FilePath
(),
takeExtension
)
import
qualified
Data.ByteString
as
DB
import
qualified
Data.Map
as
DM
import
qualified
Data.Text
as
DT
import
qualified
Data.Time
as
DT
------------------------------------------------------------------------
------------------------------------------------------------------------
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Text.Parsers.WOS
(
wosParser
)
import
Gargantext.Text.Parsers.WOS
(
wosParser
)
import
Gargantext.Text.Parsers.RIS
(
risParser
)
import
Gargantext.Text.Parsers.RIS
(
risParser
,
presseParser
)
import
Gargantext.Text.Parsers.Date
(
parseDate
)
import
Gargantext.Text.Parsers.Date
(
parseDate
)
import
Gargantext.Text.Parsers.CSV
(
parseHal
)
import
Gargantext.Text.Parsers.CSV
(
parseHal
,
writeDocs2Csv
)
import
Gargantext.Text.Terms.Stop
(
detectLang
)
import
Gargantext.Text.Terms.Stop
(
detectLang
)
------------------------------------------------------------------------
------------------------------------------------------------------------
...
@@ -72,7 +68,7 @@ type ParseError = String
...
@@ -72,7 +68,7 @@ type ParseError = String
-- | According to the format of Input file,
-- | According to the format of Input file,
-- different parser are available.
-- different parser are available.
data
FileFormat
=
WOS
|
RIS
|
CsvHalFormat
-- | CsvGargV3
data
FileFormat
=
WOS
|
RIS
|
CsvHalFormat
|
RisPresse
-- | CsvGargV3
deriving
(
Show
)
deriving
(
Show
)
-- Implemented (ISI Format)
-- Implemented (ISI Format)
...
@@ -88,8 +84,9 @@ data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3
...
@@ -88,8 +84,9 @@ data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3
-- | Parse file into documents
-- | Parse file into documents
-- TODO manage errors here
-- TODO manage errors here
parseDocs
::
FileFormat
->
FilePath
->
IO
[
HyperdataDocument
]
parseDocs
::
FileFormat
->
FilePath
->
IO
[
HyperdataDocument
]
parseDocs
ff
path
=
join
$
mapM
(
toDoc
ff
)
<$>
snd
<$>
parse
ff
path
parseDocs
CsvHalFormat
p
=
parseHal
p
parseDocs
CsvHalFormat
p
=
parseHal
p
parseDocs
RisPresse
p
=
join
$
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
presseParser
<$>
parse'
RIS
p
parseDocs
ff
path
=
join
$
mapM
(
toDoc
ff
)
<$>
snd
<$>
parse
ff
path
type
Year
=
Int
type
Year
=
Int
type
Month
=
Int
type
Month
=
Int
...
@@ -138,15 +135,23 @@ toDoc ff d = do
...
@@ -138,15 +135,23 @@ toDoc ff d = do
toDoc
_
_
=
undefined
toDoc
_
_
=
undefined
parse
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[(
Text
,
Text
)]])
parse
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[(
Text
,
Text
)]])
parse
format
path
=
do
parse
ff
fp
=
enrichWith
identity
<$>
parse'
ff
fp
enrichWith
::
([(
DB
.
ByteString
,
DB
.
ByteString
)]
->
[(
DB
.
ByteString
,
DB
.
ByteString
)])
->
(
a
,
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
->
(
a
,
[[(
Text
,
Text
)]])
enrichWith
f
=
second
(
map
both'
.
map
f
.
concat
)
where
both'
=
map
(
both
decodeUtf8
)
parse'
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
parse'
format
path
=
do
files
<-
case
takeExtension
path
of
files
<-
case
takeExtension
path
of
".zip"
->
openZip
path
".zip"
->
openZip
path
_
->
pure
<$>
DB
.
readFile
path
_
->
pure
<$>
DB
.
readFile
path
(
as
,
bs
)
<-
partitionEithers
<$>
mapConcurrently
(
runParser
format
)
files
partitionEithers
<$>
mapConcurrently
(
runParser
format
)
files
pure
(
as
,
map
toText
$
concat
bs
)
where
-- TODO : decode with bayesian inference on encodings
toText
=
map
(
\
(
a
,
b
)
->
(
decodeUtf8
a
,
decodeUtf8
b
))
-- | withParser:
-- | withParser:
...
@@ -175,3 +180,8 @@ clean txt = DT.map clean' txt
...
@@ -175,3 +180,8 @@ clean txt = DT.map clean' txt
clean'
'’'
=
'
\'
'
clean'
'’'
=
'
\'
'
clean'
c
=
c
clean'
c
=
c
risPress2csv
f
=
parseDocs
RisPresse
(
f
<>
".ris"
)
>>=
\
hs
->
writeDocs2Csv
(
f
<>
".csv"
)
hs
src/Gargantext/Text/Parsers/CSV.hs
View file @
33f1f510
...
@@ -25,14 +25,14 @@ import Control.Applicative
...
@@ -25,14 +25,14 @@ import Control.Applicative
import
Data.Char
(
ord
)
import
Data.Char
(
ord
)
import
Data.Csv
import
Data.Csv
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
)
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
,
unpack
)
import
qualified
Data.ByteString.Lazy
as
BL
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Time.Segment
(
jour
)
import
Data.Time.Segment
(
jour
)
import
Data.Vector
(
Vector
)
import
Data.Vector
(
Vector
)
import
qualified
Data.Vector
as
V
import
qualified
Data.Vector
as
V
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Types.Node
--
(HyperdataDocument(..))
import
Gargantext.Text
import
Gargantext.Text
import
Gargantext.Text.Context
import
Gargantext.Text.Context
import
Gargantext.Prelude
hiding
(
length
)
import
Gargantext.Prelude
hiding
(
length
)
...
@@ -83,6 +83,10 @@ doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
...
@@ -83,6 +83,10 @@ doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
Nothing
Nothing
Nothing
Nothing
Nothing
Nothing
---------------------------------------------------------------
---------------------------------------------------------------
-- | Types Conversions
-- | Types Conversions
toDocs
::
Vector
CsvDoc
->
[
Doc
]
toDocs
::
Vector
CsvDoc
->
[
Doc
]
...
@@ -174,6 +178,19 @@ instance ToNamedRecord CsvDoc where
...
@@ -174,6 +178,19 @@ instance ToNamedRecord CsvDoc where
,
"authors"
.=
aut
,
"authors"
.=
aut
]
]
hyperdataDocument2csvDoc
::
HyperdataDocument
->
CsvDoc
hyperdataDocument2csvDoc
h
=
CsvDoc
(
m
$
_hyperdataDocument_title
h
)
(
m
$
_hyperdataDocument_source
h
)
(
mI
$
_hyperdataDocument_publication_year
h
)
(
mI
$
_hyperdataDocument_publication_month
h
)
(
mI
$
_hyperdataDocument_publication_day
h
)
(
m
$
_hyperdataDocument_abstract
h
)
(
m
$
_hyperdataDocument_authors
h
)
where
m
=
maybe
""
identity
mI
=
maybe
0
identity
csvDecodeOptions
::
DecodeOptions
csvDecodeOptions
::
DecodeOptions
csvDecodeOptions
=
(
defaultDecodeOptions
csvDecodeOptions
=
(
defaultDecodeOptions
...
@@ -212,7 +229,9 @@ writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
...
@@ -212,7 +229,9 @@ writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv
fp
(
h
,
vs
)
=
BL
.
writeFile
fp
$
writeCsv
fp
(
h
,
vs
)
=
BL
.
writeFile
fp
$
encodeByNameWith
csvEncodeOptions
h
(
V
.
toList
vs
)
encodeByNameWith
csvEncodeOptions
h
(
V
.
toList
vs
)
writeDocs2Csv
::
FilePath
->
[
HyperdataDocument
]
->
IO
()
writeDocs2Csv
fp
hs
=
BL
.
writeFile
fp
$
encodeByNameWith
csvEncodeOptions
headerCsvGargV3
(
map
hyperdataDocument2csvDoc
hs
)
------------------------------------------------------------------------
------------------------------------------------------------------------
-- Hal Format
-- Hal Format
data
CsvHal
=
CsvHal
data
CsvHal
=
CsvHal
...
...
src/Gargantext/Text/Parsers/Date.hs
View file @
33f1f510
...
@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
...
@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Parsers.Date
(
parseDate
,
parseDateRaw
)
where
module
Gargantext.Text.Parsers.Date
(
parseDate
,
parseDateRaw
,
parseGregorian
,
wrapDST
)
where
import
Data.HashMap.Strict
as
HM
hiding
(
map
)
import
Data.HashMap.Strict
as
HM
hiding
(
map
)
import
Data.Text
(
Text
,
unpack
,
splitOn
,
pack
)
import
Data.Text
(
Text
,
unpack
,
splitOn
,
pack
)
...
@@ -38,23 +38,23 @@ import qualified Data.HashSet as HashSet
...
@@ -38,23 +38,23 @@ import qualified Data.HashSet as HashSet
import
qualified
Duckling.Core
as
DC
import
qualified
Duckling.Core
as
DC
-- | Unused import (to parse Date Format, keeping it for maybe next steps)
-- | Unused import (to parse Date Format, keeping it for maybe next steps)
--
import Control.Monad ((=<<))
import
Control.Monad
((
=<<
))
--
import Data.Either (Either)
import
Data.Either
(
Either
)
--
import Data.Fixed (Fixed (MkFixed))
import
Data.Fixed
(
Fixed
(
MkFixed
))
--
import Data.Foldable (length)
import
Data.Foldable
(
length
)
--
import Data.String (String)
import
Data.String
(
String
)
--
import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
import
Data.Time
(
ZonedTime
(
..
),
LocalTime
(
..
),
TimeZone
(
..
),
TimeOfDay
(
..
))
--
import Data.Time.Calendar (Day, fromGregorian)
import
Data.Time.Calendar
(
Day
,
fromGregorian
)
--
import Duckling.Debug as DB
import
Duckling.Debug
as
DB
--
import Duckling.Engine (parseAndResolve)
import
Duckling.Engine
(
parseAndResolve
)
--
import Duckling.Rules (rulesFor)
import
Duckling.Rules
(
rulesFor
)
--
import Prelude (toInteger, div, otherwise, (++))
import
Prelude
(
toInteger
,
div
,
otherwise
,
(
++
))
--
import Text.Parsec.Error (ParseError)
import
Text.Parsec.Error
(
ParseError
)
--
import Text.Parsec.Prim (Stream, ParsecT)
import
Text.Parsec.Prim
(
Stream
,
ParsecT
)
--
import Text.Parsec.String (Parser)
import
Text.Parsec.String
(
Parser
)
--
import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
import
Text.ParserCombinators.Parsec
(
many1
,
noneOf
,
anyChar
,
char
,
oneOf
)
--
import Text.XML.HXT.DOM.Util (decimalStringToInt)
import
Text.XML.HXT.DOM.Util
(
decimalStringToInt
)
--
import qualified Text.ParserCombinators.Parsec (parse)
import
qualified
Text.ParserCombinators.Parsec
(
parse
)
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Date Parser
-- | Date Parser
...
@@ -64,13 +64,17 @@ import qualified Duckling.Core as DC
...
@@ -64,13 +64,17 @@ import qualified Duckling.Core as DC
-- >>> parseDate EN (pack "April 10 1979")
-- >>> parseDate EN (pack "April 10 1979")
-- 1979-04-10 00:00:00 UTC
-- 1979-04-10 00:00:00 UTC
parseDate
::
Lang
->
Text
->
IO
UTCTime
parseDate
::
Lang
->
Text
->
IO
UTCTime
parseDate
lang
s
=
do
parseDate
lang
s
=
parseDate'
"%Y-%m-%dT%T"
"0-0-0T0:0:0"
lang
s
type
DateFormat
=
Text
type
DateNull
=
Text
parseDate'
::
DateFormat
->
DateNull
->
Lang
->
Text
->
IO
UTCTime
parseDate'
format
def
lang
s
=
do
dateStr'
<-
parseDateRaw
lang
s
dateStr'
<-
parseDateRaw
lang
s
let
format
=
"%Y-%m-%dT%T"
let
dateStr
=
unpack
$
maybe
def
identity
let
dateStr
=
unpack
$
maybe
"0-0-0T0:0:0"
identity
$
head
$
splitOn
"."
dateStr'
$
head
$
splitOn
"."
dateStr'
pure
$
parseTimeOrError
True
defaultTimeLocale
format
dateStr
pure
$
parseTimeOrError
True
defaultTimeLocale
(
unpack
format
)
dateStr
-- TODO add Paris at Duckling.Locale Region datatype
-- TODO add Paris at Duckling.Locale Region datatype
...
@@ -117,63 +121,63 @@ parseDateWithDuckling lang input = do
...
@@ -117,63 +121,63 @@ parseDateWithDuckling lang input = do
pure
$
analyze
input
contxt
$
HashSet
.
fromList
[(
This
Time
)]
pure
$
analyze
input
contxt
$
HashSet
.
fromList
[(
This
Time
)]
-- | Permit to transform a String to an Int in a monadic context
-- | Permit to transform a String to an Int in a monadic context
--
wrapDST :: Monad m => String -> m Int
wrapDST
::
Monad
m
=>
String
->
m
Int
--wrapDST = (return . decimalStringToInt)
wrapDST
=
return
.
decimalStringToInt
-- | Generic parser which take at least one element not given in argument
-- | Generic parser which take at least one element not given in argument
--
many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
many1NoneOf
::
Stream
s
m
Char
=>
[
Char
]
->
ParsecT
s
u
m
[
Char
]
--
many1NoneOf = (many1 . noneOf)
many1NoneOf
=
(
many1
.
noneOf
)
--getMultiplicator :: Int -> Int
--getMultiplicator :: Int -> Int
--
getMultiplicator a
getMultiplicator
a
--
| 0 >= a = 1
|
0
>=
a
=
1
--
| otherwise = 10 * (getMultiplicator $ div a 10)
|
otherwise
=
10
*
(
getMultiplicator
$
div
a
10
)
-- | Parser for date format y-m-d
-- | Parser for date format y-m-d
--
parseGregorian :: Parser Day
parseGregorian
::
Parser
Day
--
parseGregorian = do
parseGregorian
=
do
--
y <- wrapDST =<< many1NoneOf ['-']
y
<-
wrapDST
=<<
many1NoneOf
[
'-'
]
--
_ <- char '-'
_
<-
char
'-'
--
m <- wrapDST =<< many1NoneOf ['-']
m
<-
wrapDST
=<<
many1NoneOf
[
'-'
]
--
_ <- char '-'
_
<-
char
'-'
--
d <- wrapDST =<< many1NoneOf ['T']
d
<-
wrapDST
=<<
many1NoneOf
[
'T'
]
--
_ <- char 'T'
_
<-
char
'T'
--
return $ fromGregorian (toInteger y) m d
return
$
fromGregorian
(
toInteger
y
)
m
d
--
---- | Parser for time format h:m:s
---- | Parser for time format h:m:s
--
parseTimeOfDay :: Parser TimeOfDay
parseTimeOfDay
::
Parser
TimeOfDay
--
parseTimeOfDay = do
parseTimeOfDay
=
do
--
h <- wrapDST =<< many1NoneOf [':']
h
<-
wrapDST
=<<
many1NoneOf
[
':'
]
--
_ <- char ':'
_
<-
char
':'
--
m <- wrapDST =<< many1NoneOf [':']
m
<-
wrapDST
=<<
many1NoneOf
[
':'
]
--
_ <- char ':'
_
<-
char
':'
--
r <- many1NoneOf ['.']
r
<-
many1NoneOf
[
'.'
]
--
_ <- char '.'
_
<-
char
'.'
--
dec <- many1NoneOf ['+', '-']
dec
<-
many1NoneOf
[
'+'
,
'-'
]
--
let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
let
(
nb
,
l
)
=
(
decimalStringToInt
$
r
++
dec
,
length
dec
)
--
seconds = nb * 10^(12-l)
seconds
=
nb
*
10
^
(
12
-
l
)
--
return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
return
$
TimeOfDay
h
m
(
MkFixed
.
toInteger
$
seconds
)
--
--
-- | Parser for timezone format +hh:mm
-- | Parser for timezone format +hh:mm
--
parseTimeZone :: Parser TimeZone
parseTimeZone
::
Parser
TimeZone
--
parseTimeZone = do
parseTimeZone
=
do
--
sign <- oneOf ['+', '-']
sign
<-
oneOf
[
'+'
,
'-'
]
--
h <- wrapDST =<< many1NoneOf [':']
h
<-
wrapDST
=<<
many1NoneOf
[
':'
]
--
_ <- char ':'
_
<-
char
':'
--
m <- wrapDST =<< (many1 $ anyChar)
m
<-
wrapDST
=<<
(
many1
$
anyChar
)
--
let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
let
timeInMinute
=
if
sign
==
'+'
then
h
*
60
+
m
else
-
h
*
60
-
m
--
in return $ TimeZone timeInMinute False "CET"
in
return
$
TimeZone
timeInMinute
False
"CET"
--
---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
--
parseZonedTime :: Parser ZonedTime
parseZonedTime
::
Parser
ZonedTime
--
parseZonedTime= do
parseZonedTime
=
do
--
d <- parseGregorian
d
<-
parseGregorian
--
tod <- parseTimeOfDay
tod
<-
parseTimeOfDay
--
tz <- parseTimeZone
tz
<-
parseTimeZone
--
return $ ZonedTime (LocalTime d (tod)) tz
return
$
ZonedTime
(
LocalTime
d
(
tod
))
tz
--
---- | Opposite of toRFC3339
---- | Opposite of toRFC3339
--
fromRFC3339 :: Text -> Either ParseError ZonedTime
fromRFC3339
::
Text
->
Either
ParseError
ZonedTime
--
fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
fromRFC3339
t
=
Text
.
ParserCombinators
.
Parsec
.
parse
parseZonedTime
"ERROR: Couldn't parse zoned time."
input
--
where input = unpack t
where
input
=
unpack
t
src/Gargantext/Text/Parsers/RIS.hs
View file @
33f1f510
...
@@ -10,28 +10,30 @@ Portability : POSIX
...
@@ -10,28 +10,30 @@ Portability : POSIX
RIS is a standardized tag format developed by Research Information
RIS is a standardized tag format developed by Research Information
Systems, Incorporated (the format name refers to the company) to enable
Systems, Incorporated (the format name refers to the company) to enable
citation programs to exchange data.[More](https://en.wikipedia.org/wiki/RIS_(file_format))
citation programs to exchange data.
[More](https://en.wikipedia.org/wiki/RIS_(file_format))
-}
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Parsers.RIS
(
risParser
)
where
module
Gargantext.Text.Parsers.RIS
(
risParser
,
risDate
,
toDate
,
presseParser
)
where
import
Data.Either
(
either
)
import
Data.List
(
lookup
)
import
Data.Tuple.Extra
(
first
)
import
Control.Applicative
import
Control.Applicative
import
Data.Attoparsec.ByteString
(
Parser
,
try
,
string
,
takeTill
,
take
,
manyTill
,
many1
,
endOfInput
)
import
Data.Attoparsec.ByteString
(
Parser
,
try
,
string
,
takeTill
,
take
,
manyTill
,
many1
,
endOfInput
,
parseOnly
)
import
Data.Attoparsec.ByteString.Char8
(
anyChar
,
isEndOfLine
)
import
Data.Attoparsec.ByteString.Char8
(
anyChar
,
isEndOfLine
)
import
Data.ByteString
(
ByteString
,
concat
)
import
Data.ByteString
(
ByteString
,
concat
,
length
)
import
Data.ByteString.Char8
(
pack
)
import
Data.ByteString.Char8
(
pack
)
import
Data.Monoid
((
<>
))
import
Data.Monoid
((
<>
))
import
Gargantext.Prelude
hiding
(
takeWhile
,
take
,
concat
,
readFile
,
lines
,
concat
)
import
Gargantext.Prelude
hiding
(
takeWhile
,
take
,
concat
,
readFile
,
lines
,
concat
)
import
qualified
Data.List
as
DL
import
qualified
Data.List
as
DL
-------------------------------------------------------------
-------------------------------------------------------------
data
Lines
=
OneLine
|
MultiLine
risParser
::
Parser
[[(
ByteString
,
ByteString
)]]
risParser
::
Parser
[[(
ByteString
,
ByteString
)]]
risParser
=
do
risParser
=
do
n
<-
notice
"TY -"
n
<-
notice
"TY -"
...
@@ -57,7 +59,6 @@ field = do
...
@@ -57,7 +59,6 @@ field = do
False
->
[]
False
->
[]
pure
(
translate
name
,
concat
([
txt
]
<>
txts'
))
pure
(
translate
name
,
concat
([
txt
]
<>
txts'
))
lines
::
Parser
[
ByteString
]
lines
::
Parser
[
ByteString
]
lines
=
many
line
lines
=
many
line
where
where
...
@@ -72,7 +73,38 @@ translate champs
...
@@ -72,7 +73,38 @@ translate champs
|
champs
==
"LA"
=
"language"
|
champs
==
"LA"
=
"language"
|
champs
==
"DI"
=
"doi"
|
champs
==
"DI"
=
"doi"
|
champs
==
"UR"
=
"url"
|
champs
==
"UR"
=
"url"
|
champs
==
"DA"
=
"publication_date"
|
champs
==
"N2"
=
"abstract"
|
champs
==
"N2"
=
"abstract"
|
otherwise
=
champs
|
otherwise
=
champs
-------------------------------------------------------------
-------------------------------------------------------------
presseParser
::
[(
ByteString
,
ByteString
)]
->
[(
ByteString
,
ByteString
)]
presseParser
=
(
toDate
"DA"
(
\
x
->
either
(
const
[]
)
identity
$
parseOnly
risDate
x
))
.
(
toDate
"LA"
presseLang
)
risDate
::
Parser
[(
ByteString
,
ByteString
)]
risDate
=
do
day
<-
take
2
<*
"/"
mon
<-
take
2
<*
"/"
yea
<-
take
4
pure
$
map
(
first
(
\
x
->
"publication_"
<>
x
))
[
(
"day"
,
day
)
,
(
"month"
,
mon
)
,
(
"year"
,
yea
)
,
(
"date"
,
yea
<>
"-"
<>
mon
<>
"-"
<>
day
<>
"T0:0:0"
)
]
toDate
::
ByteString
->
(
ByteString
->
[(
ByteString
,
ByteString
)])
->
[(
ByteString
,
ByteString
)]
->
[(
ByteString
,
ByteString
)]
toDate
k
f
m
=
m
<>
(
maybe
[]
f
(
lookup
k
m
)
)
presseLang
::
ByteString
->
[(
ByteString
,
ByteString
)]
presseLang
"Français"
=
[(
"language"
,
"FR"
)]
presseLang
"English"
=
[(
"langauge"
,
"EN"
)]
presseLang
_
=
undefined
{-
fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixTitle ns = ns <> [ti, ab]
where
ti = case
-}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment