Commit d388d621 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[PARSERS] refactor, split, organize (TODO: tests parsers).

parent 64e0bc84
......@@ -40,22 +40,20 @@ import Data.String()
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import Data.Time (UTCTime(..))
import Data.Tuple.Extra (both, second)
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Time as DT
------------------------------------------------------------------------
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Parsers.WOS (wosParser)
import Gargantext.Text.Parsers.RIS (risParser)
import qualified Gargantext.Text.Parsers.WOS as WOS
import qualified Gargantext.Text.Parsers.RIS as RIS
import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Text.Parsers.Date (parseDate)
import Gargantext.Text.Parsers.CSV (parseHal, writeDocs2Csv)
import Gargantext.Text.Parsers.CSV (parseHal)
import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------
......@@ -70,7 +68,8 @@ type ParseError = String
-- | According to the format of Input file,
-- different parser are available.
data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3
data FileFormat = WOS | RIS | RisPresse
| CsvGargV3 | CsvHalFormat
deriving (Show)
-- Implemented (ISI Format)
......@@ -87,8 +86,9 @@ data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3
-- TODO manage errors here
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs CsvHalFormat p = parseHal p
parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich <$> parse' RIS p
parseDocs ff path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich <$> parse' RIS p
parseDocs WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith (map (first WOS.keys)) <$> parse' WOS p
parseDocs ff p = join $ mapM (toDoc ff) <$> snd <$> parse ff p
type Year = Int
type Month = Int
......@@ -159,8 +159,8 @@ parse' format path = do
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = wosParser
withParser RIS = risParser
withParser WOS = WOS.parser
withParser RIS = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _ = panic "[ERROR] Parser not implemented yet"
......
......@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw, parseGregorian, wrapDST) where
module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where
import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn, pack)
......@@ -37,25 +37,6 @@ import qualified Data.Aeson as Json
import qualified Data.HashSet as HashSet
import qualified Duckling.Core as DC
-- | Unused import (to parse Date Format, keeping it for maybe next steps)
import Control.Monad ((=<<))
import Data.Either (Either)
import Data.Fixed (Fixed (MkFixed))
import Data.Foldable (length)
import Data.String (String)
import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
import Data.Time.Calendar (Day, fromGregorian)
import Duckling.Debug as DB
import Duckling.Engine (parseAndResolve)
import Duckling.Rules (rulesFor)
import Prelude (toInteger, div, otherwise, (++))
import Text.Parsec.Error (ParseError)
import Text.Parsec.Prim (Stream, ParsecT)
import Text.Parsec.String (Parser)
import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
import Text.XML.HXT.DOM.Util (decimalStringToInt)
import qualified Text.ParserCombinators.Parsec (parse)
------------------------------------------------------------------------
-- | Date Parser
-- Parses dates mentions in full text given the language.
......@@ -120,64 +101,3 @@ parseDateWithDuckling lang input = do
--pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
pure $ analyze input contxt $ HashSet.fromList [(This Time)]
-- | Permit to transform a String to an Int in a monadic context
wrapDST :: Monad m => String -> m Int
wrapDST = return . decimalStringToInt
-- | Generic parser which take at least one element not given in argument
many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
many1NoneOf = (many1 . noneOf)
--getMultiplicator :: Int -> Int
getMultiplicator a
| 0 >= a = 1
| otherwise = 10 * (getMultiplicator $ div a 10)
-- | Parser for date format y-m-d
parseGregorian :: Parser Day
parseGregorian = do
y <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
m <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
d <- wrapDST =<< many1NoneOf ['T']
_ <- char 'T'
return $ fromGregorian (toInteger y) m d
---- | Parser for time format h:m:s
parseTimeOfDay :: Parser TimeOfDay
parseTimeOfDay = do
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
r <- many1NoneOf ['.']
_ <- char '.'
dec <- many1NoneOf ['+', '-']
let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
seconds = nb * 10^(12-l)
return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
-- | Parser for timezone format +hh:mm
parseTimeZone :: Parser TimeZone
parseTimeZone = do
sign <- oneOf ['+', '-']
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< (many1 $ anyChar)
let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
in return $ TimeZone timeInMinute False "CET"
---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
parseZonedTime :: Parser ZonedTime
parseZonedTime= do
d <- parseGregorian
tod <- parseTimeOfDay
tz <- parseTimeZone
return $ ZonedTime (LocalTime d (tod)) tz
---- | Opposite of toRFC3339
fromRFC3339 :: Text -> Either ParseError ZonedTime
fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
where input = unpack t
{-|
Module : Gargantext.Text.Parsers.Date.Attoparsec
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date.Attoparsec
where
import Control.Applicative ((<*))
import Data.Attoparsec.ByteString (Parser, take)
import Data.ByteString (ByteString)
import Data.Monoid ((<>))
import Data.Tuple.Extra (first)
import Gargantext.Prelude hiding (takeWhile, take)
-------------------------------------------------------------
parserWith :: Parser ByteString -> Parser [(ByteString, ByteString)]
parserWith sep = do
day <- take 2 <* sep
mon <- take 2 <* sep
yea <- take 4
pure $ map (first (\x -> "publication_" <> x))
[ ("day",day)
, ("month", mon)
, ("year", yea)
, ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
]
{-|
Module : Gargantext.Text.Parsers.Date
Description : Some utils to parse dates
Copyright : (c) CNRS 2017-present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date.Parsec
where
import Control.Monad ((=<<))
import Data.Either (Either)
import Data.Fixed (Fixed (MkFixed))
import Data.Foldable (length)
import Data.String (String)
import Data.Text (Text, unpack)
import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
import Data.Time.Calendar (Day, fromGregorian)
import Gargantext.Prelude
import Prelude (toInteger, div, otherwise, (++))
import Text.Parsec.Error (ParseError)
import Text.Parsec.Prim (Stream, ParsecT)
import Text.Parsec.String (Parser)
import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
import Text.XML.HXT.DOM.Util (decimalStringToInt)
import qualified Text.ParserCombinators.Parsec (parse)
-- | Permit to transform a String to an Int in a monadic context
wrapDST :: Monad m => String -> m Int
wrapDST = return . decimalStringToInt
-- | Generic parser which take at least one element not given in argument
many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
many1NoneOf = (many1 . noneOf)
getMultiplicator :: Int -> Int
getMultiplicator a
| 0 >= a = 1
| otherwise = 10 * (getMultiplicator $ div a 10)
-- | Parser for date format y-m-d
parseGregorian :: Parser Day
parseGregorian = do
y <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
m <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
d <- wrapDST =<< many1NoneOf ['T']
_ <- char 'T'
return $ fromGregorian (toInteger y) m d
---- | Parser for time format h:m:s
parseTimeOfDay :: Parser TimeOfDay
parseTimeOfDay = do
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
r <- many1NoneOf ['.']
_ <- char '.'
dec <- many1NoneOf ['+', '-']
let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
seconds = nb * 10^(12-l)
return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
-- | Parser for timezone format +hh:mm
parseTimeZone :: Parser TimeZone
parseTimeZone = do
sign <- oneOf ['+', '-']
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< (many1 $ anyChar)
let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
in return $ TimeZone timeInMinute False "CET"
---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
parseZonedTime :: Parser ZonedTime
parseZonedTime= do
d <- parseGregorian
tod <- parseTimeOfDay
tz <- parseTimeZone
return $ ZonedTime (LocalTime d (tod)) tz
---- | Opposite of toRFC3339
fromRFC3339 :: Text -> Either ParseError ZonedTime
fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
where input = unpack t
......@@ -19,23 +19,20 @@ citation programs to exchange data.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.RIS (risParser, withField, fieldWith, lines) where
module Gargantext.Text.Parsers.RIS (parser, withField, fieldWith, lines) where
import Data.Either (either)
import Data.List (lookup)
import Data.Tuple.Extra (first)
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput, parseOnly)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat, length)
import Data.ByteString.Char8 (pack)
import Data.Attoparsec.ByteString (Parser, try, takeTill, take, many1)
import Data.Attoparsec.ByteString.Char8 (isEndOfLine)
import Data.ByteString (ByteString, concat)
import Data.Monoid ((<>))
import Gargantext.Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import Gargantext.Prelude hiding (takeWhile, take)
import qualified Data.List as DL
-------------------------------------------------------------
risParser :: Parser [[(ByteString, ByteString)]]
risParser = do
parser :: Parser [[(ByteString, ByteString)]]
parser = do
n <- notice "TY -"
ns <- many1 (notice "\nTY -")
pure $ [n] <> ns
......
......@@ -17,19 +17,13 @@ Presse RIS format parser en enricher.
module Gargantext.Text.Parsers.RIS.Presse (presseEnrich) where
import Data.Either (either)
import Data.List (lookup)
import Data.Tuple.Extra (first)
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput, parseOnly)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat, length)
import Data.ByteString.Char8 (pack)
import Data.Monoid ((<>))
import Gargantext.Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import Data.Attoparsec.ByteString (parseOnly)
import Data.ByteString (ByteString)
import Gargantext.Prelude hiding (takeWhile, take)
import Gargantext.Text.Parsers.RIS (withField)
import qualified Data.List as DL
import qualified Gargantext.Text.Parsers.Date.Attoparsec as Date
-------------------------------------------------------------
-------------------------------------------------------------
presseEnrich :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseEnrich = (withField "DA" presseDate)
......@@ -37,26 +31,13 @@ presseEnrich = (withField "DA" presseDate)
. (map (first presseFields))
presseDate :: ByteString -> [(ByteString, ByteString)]
presseDate str = either (const []) identity $ parseOnly parseDate str
parseDate :: Parser [(ByteString, ByteString)]
parseDate = do
day <- take 2 <* "/"
mon <- take 2 <* "/"
yea <- take 4
pure $ map (first (\x -> "publication_" <> x))
[ ("day",day)
, ("month", mon)
, ("year", yea)
, ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
]
presseDate str = either (const []) identity $ parseOnly (Date.parserWith "/") str
presseLang :: ByteString -> [(ByteString, ByteString)]
presseLang "Français" = [("language", "FR")]
presseLang "English" = [("language", "EN")]
presseLang x = [("language", x)]
presseFields :: ByteString -> ByteString
presseFields champs
| champs == "AU" = "authors"
......@@ -67,7 +48,6 @@ presseFields champs
| champs == "N2" = "abstract"
| otherwise = champs
{-
fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixTitle ns = ns <> [ti, ab]
......
......@@ -14,28 +14,21 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.WOS (wosParser) where
module Gargantext.Text.Parsers.WOS (parser, keys) where
-- TOFIX : Should import Gargantext.Prelude here
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import qualified Data.List as DL
import Data.Monoid ((<>))
import Data.Attoparsec.ByteString (Parser, try, string
, takeTill, take
, manyTill, many1)
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, string, takeTill, take, manyTill, many1)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat)
import Data.ByteString (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Applicative
import Gargantext.Text.Parsers.RIS (fieldWith, lines)
import Gargantext.Text.Parsers.RIS (fieldWith)
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
-------------------------------------------------------------
-- | wosParser parses ISI format from
-- Web Of Science Database
wosParser :: Parser [[(ByteString, ByteString)]]
wosParser = do
parser :: Parser [[(ByteString, ByteString)]]
parser = do
-- TODO Warning if version /= 1.0
-- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
_ <- manyTill anyChar (string $ pack "\nVR 1.0")
......@@ -55,8 +48,8 @@ notice = start *> many (fieldWith field) <* end
end = manyTill anyChar (string $ pack "\nER\n")
translate :: ByteString -> ByteString
translate champs
keys :: ByteString -> ByteString
keys champs
| champs == "AF" = "authors"
| champs == "TI" = "title"
| champs == "SO" = "source"
......@@ -64,5 +57,3 @@ translate champs
| champs == "PD" = "publication_date"
| champs == "AB" = "abstract"
| otherwise = champs
-------------------------------------------------------------
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment