Commit d388d621 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[PARSERS] refactor, split, organize (TODO: tests parsers).

parent 64e0bc84
...@@ -40,22 +40,20 @@ import Data.String() ...@@ -40,22 +40,20 @@ import Data.String()
import Data.Text (Text) import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8) import Data.Text.Encoding (decodeUtf8)
import Data.Time (UTCTime(..)) import Data.Time (UTCTime(..))
import Data.Tuple.Extra (both, second) import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension) import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB import qualified Data.ByteString as DB
import qualified Data.Map as DM import qualified Data.Map as DM
import qualified Data.Text as DT import qualified Data.Text as DT
import qualified Data.Time as DT import qualified Data.Time as DT
------------------------------------------------------------------------
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..)) import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Parsers.WOS (wosParser) import qualified Gargantext.Text.Parsers.WOS as WOS
import Gargantext.Text.Parsers.RIS (risParser) import qualified Gargantext.Text.Parsers.RIS as RIS
import Gargantext.Text.Parsers.RIS.Presse (presseEnrich) import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Text.Parsers.Date (parseDate) import Gargantext.Text.Parsers.Date (parseDate)
import Gargantext.Text.Parsers.CSV (parseHal, writeDocs2Csv) import Gargantext.Text.Parsers.CSV (parseHal)
import Gargantext.Text.Terms.Stop (detectLang) import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -70,7 +68,8 @@ type ParseError = String ...@@ -70,7 +68,8 @@ type ParseError = String
-- | According to the format of Input file, -- | According to the format of Input file,
-- different parser are available. -- different parser are available.
data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3 data FileFormat = WOS | RIS | RisPresse
| CsvGargV3 | CsvHalFormat
deriving (Show) deriving (Show)
-- Implemented (ISI Format) -- Implemented (ISI Format)
...@@ -88,7 +87,8 @@ data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3 ...@@ -88,7 +87,8 @@ data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument] parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs CsvHalFormat p = parseHal p parseDocs CsvHalFormat p = parseHal p
parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich <$> parse' RIS p parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich <$> parse' RIS p
parseDocs ff path = join $ mapM (toDoc ff) <$> snd <$> parse ff path parseDocs WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith (map (first WOS.keys)) <$> parse' WOS p
parseDocs ff p = join $ mapM (toDoc ff) <$> snd <$> parse ff p
type Year = Int type Year = Int
type Month = Int type Month = Int
...@@ -159,8 +159,8 @@ parse' format path = do ...@@ -159,8 +159,8 @@ parse' format path = do
-- According to the format of the text, choose the right parser. -- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document] -- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]] withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = wosParser withParser WOS = WOS.parser
withParser RIS = risParser withParser RIS = RIS.parser
--withParser ODT = odtParser --withParser ODT = odtParser
--withParser XML = xmlParser --withParser XML = xmlParser
withParser _ = panic "[ERROR] Parser not implemented yet" withParser _ = panic "[ERROR] Parser not implemented yet"
......
...@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00" ...@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw, parseGregorian, wrapDST) where module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where
import Data.HashMap.Strict as HM hiding (map) import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn, pack) import Data.Text (Text, unpack, splitOn, pack)
...@@ -37,25 +37,6 @@ import qualified Data.Aeson as Json ...@@ -37,25 +37,6 @@ import qualified Data.Aeson as Json
import qualified Data.HashSet as HashSet import qualified Data.HashSet as HashSet
import qualified Duckling.Core as DC import qualified Duckling.Core as DC
-- | Unused import (to parse Date Format, keeping it for maybe next steps)
import Control.Monad ((=<<))
import Data.Either (Either)
import Data.Fixed (Fixed (MkFixed))
import Data.Foldable (length)
import Data.String (String)
import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
import Data.Time.Calendar (Day, fromGregorian)
import Duckling.Debug as DB
import Duckling.Engine (parseAndResolve)
import Duckling.Rules (rulesFor)
import Prelude (toInteger, div, otherwise, (++))
import Text.Parsec.Error (ParseError)
import Text.Parsec.Prim (Stream, ParsecT)
import Text.Parsec.String (Parser)
import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
import Text.XML.HXT.DOM.Util (decimalStringToInt)
import qualified Text.ParserCombinators.Parsec (parse)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Date Parser -- | Date Parser
-- Parses dates mentions in full text given the language. -- Parses dates mentions in full text given the language.
...@@ -120,64 +101,3 @@ parseDateWithDuckling lang input = do ...@@ -120,64 +101,3 @@ parseDateWithDuckling lang input = do
--pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
pure $ analyze input contxt $ HashSet.fromList [(This Time)] pure $ analyze input contxt $ HashSet.fromList [(This Time)]
-- | Permit to transform a String to an Int in a monadic context
wrapDST :: Monad m => String -> m Int
wrapDST = return . decimalStringToInt
-- | Generic parser which take at least one element not given in argument
many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
many1NoneOf = (many1 . noneOf)
--getMultiplicator :: Int -> Int
getMultiplicator a
| 0 >= a = 1
| otherwise = 10 * (getMultiplicator $ div a 10)
-- | Parser for date format y-m-d
parseGregorian :: Parser Day
parseGregorian = do
y <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
m <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
d <- wrapDST =<< many1NoneOf ['T']
_ <- char 'T'
return $ fromGregorian (toInteger y) m d
---- | Parser for time format h:m:s
parseTimeOfDay :: Parser TimeOfDay
parseTimeOfDay = do
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
r <- many1NoneOf ['.']
_ <- char '.'
dec <- many1NoneOf ['+', '-']
let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
seconds = nb * 10^(12-l)
return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
-- | Parser for timezone format +hh:mm
parseTimeZone :: Parser TimeZone
parseTimeZone = do
sign <- oneOf ['+', '-']
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< (many1 $ anyChar)
let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
in return $ TimeZone timeInMinute False "CET"
---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
parseZonedTime :: Parser ZonedTime
parseZonedTime= do
d <- parseGregorian
tod <- parseTimeOfDay
tz <- parseTimeZone
return $ ZonedTime (LocalTime d (tod)) tz
---- | Opposite of toRFC3339
fromRFC3339 :: Text -> Either ParseError ZonedTime
fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
where input = unpack t
{-|
Module : Gargantext.Text.Parsers.Date.Attoparsec
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date.Attoparsec
where
import Control.Applicative ((<*))
import Data.Attoparsec.ByteString (Parser, take)
import Data.ByteString (ByteString)
import Data.Monoid ((<>))
import Data.Tuple.Extra (first)
import Gargantext.Prelude hiding (takeWhile, take)
-------------------------------------------------------------
parserWith :: Parser ByteString -> Parser [(ByteString, ByteString)]
parserWith sep = do
day <- take 2 <* sep
mon <- take 2 <* sep
yea <- take 4
pure $ map (first (\x -> "publication_" <> x))
[ ("day",day)
, ("month", mon)
, ("year", yea)
, ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
]
{-|
Module : Gargantext.Text.Parsers.Date
Description : Some utils to parse dates
Copyright : (c) CNRS 2017-present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date.Parsec
where
import Control.Monad ((=<<))
import Data.Either (Either)
import Data.Fixed (Fixed (MkFixed))
import Data.Foldable (length)
import Data.String (String)
import Data.Text (Text, unpack)
import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
import Data.Time.Calendar (Day, fromGregorian)
import Gargantext.Prelude
import Prelude (toInteger, div, otherwise, (++))
import Text.Parsec.Error (ParseError)
import Text.Parsec.Prim (Stream, ParsecT)
import Text.Parsec.String (Parser)
import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
import Text.XML.HXT.DOM.Util (decimalStringToInt)
import qualified Text.ParserCombinators.Parsec (parse)
-- | Permit to transform a String to an Int in a monadic context
wrapDST :: Monad m => String -> m Int
wrapDST = return . decimalStringToInt
-- | Generic parser which take at least one element not given in argument
many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
many1NoneOf = (many1 . noneOf)
getMultiplicator :: Int -> Int
getMultiplicator a
| 0 >= a = 1
| otherwise = 10 * (getMultiplicator $ div a 10)
-- | Parser for date format y-m-d
parseGregorian :: Parser Day
parseGregorian = do
y <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
m <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
d <- wrapDST =<< many1NoneOf ['T']
_ <- char 'T'
return $ fromGregorian (toInteger y) m d
---- | Parser for time format h:m:s
parseTimeOfDay :: Parser TimeOfDay
parseTimeOfDay = do
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
r <- many1NoneOf ['.']
_ <- char '.'
dec <- many1NoneOf ['+', '-']
let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
seconds = nb * 10^(12-l)
return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
-- | Parser for timezone format +hh:mm
parseTimeZone :: Parser TimeZone
parseTimeZone = do
sign <- oneOf ['+', '-']
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< (many1 $ anyChar)
let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
in return $ TimeZone timeInMinute False "CET"
---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
parseZonedTime :: Parser ZonedTime
parseZonedTime= do
d <- parseGregorian
tod <- parseTimeOfDay
tz <- parseTimeZone
return $ ZonedTime (LocalTime d (tod)) tz
---- | Opposite of toRFC3339
fromRFC3339 :: Text -> Either ParseError ZonedTime
fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
where input = unpack t
...@@ -19,23 +19,20 @@ citation programs to exchange data. ...@@ -19,23 +19,20 @@ citation programs to exchange data.
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.RIS (risParser, withField, fieldWith, lines) where module Gargantext.Text.Parsers.RIS (parser, withField, fieldWith, lines) where
import Data.Either (either)
import Data.List (lookup) import Data.List (lookup)
import Data.Tuple.Extra (first)
import Control.Applicative import Control.Applicative
import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput, parseOnly) import Data.Attoparsec.ByteString (Parser, try, takeTill, take, many1)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine) import Data.Attoparsec.ByteString.Char8 (isEndOfLine)
import Data.ByteString (ByteString, concat, length) import Data.ByteString (ByteString, concat)
import Data.ByteString.Char8 (pack)
import Data.Monoid ((<>)) import Data.Monoid ((<>))
import Gargantext.Prelude hiding (takeWhile, take, concat, readFile, lines, concat) import Gargantext.Prelude hiding (takeWhile, take)
import qualified Data.List as DL import qualified Data.List as DL
------------------------------------------------------------- -------------------------------------------------------------
risParser :: Parser [[(ByteString, ByteString)]] parser :: Parser [[(ByteString, ByteString)]]
risParser = do parser = do
n <- notice "TY -" n <- notice "TY -"
ns <- many1 (notice "\nTY -") ns <- many1 (notice "\nTY -")
pure $ [n] <> ns pure $ [n] <> ns
......
...@@ -17,19 +17,13 @@ Presse RIS format parser en enricher. ...@@ -17,19 +17,13 @@ Presse RIS format parser en enricher.
module Gargantext.Text.Parsers.RIS.Presse (presseEnrich) where module Gargantext.Text.Parsers.RIS.Presse (presseEnrich) where
import Data.Either (either) import Data.Either (either)
import Data.List (lookup)
import Data.Tuple.Extra (first) import Data.Tuple.Extra (first)
import Control.Applicative import Data.Attoparsec.ByteString (parseOnly)
import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput, parseOnly) import Data.ByteString (ByteString)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine) import Gargantext.Prelude hiding (takeWhile, take)
import Data.ByteString (ByteString, concat, length)
import Data.ByteString.Char8 (pack)
import Data.Monoid ((<>))
import Gargantext.Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import Gargantext.Text.Parsers.RIS (withField) import Gargantext.Text.Parsers.RIS (withField)
import qualified Data.List as DL import qualified Gargantext.Text.Parsers.Date.Attoparsec as Date
------------------------------------------------------------- -------------------------------------------------------------
------------------------------------------------------------- -------------------------------------------------------------
presseEnrich :: [(ByteString, ByteString)] -> [(ByteString, ByteString)] presseEnrich :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseEnrich = (withField "DA" presseDate) presseEnrich = (withField "DA" presseDate)
...@@ -37,26 +31,13 @@ presseEnrich = (withField "DA" presseDate) ...@@ -37,26 +31,13 @@ presseEnrich = (withField "DA" presseDate)
. (map (first presseFields)) . (map (first presseFields))
presseDate :: ByteString -> [(ByteString, ByteString)] presseDate :: ByteString -> [(ByteString, ByteString)]
presseDate str = either (const []) identity $ parseOnly parseDate str presseDate str = either (const []) identity $ parseOnly (Date.parserWith "/") str
parseDate :: Parser [(ByteString, ByteString)]
parseDate = do
day <- take 2 <* "/"
mon <- take 2 <* "/"
yea <- take 4
pure $ map (first (\x -> "publication_" <> x))
[ ("day",day)
, ("month", mon)
, ("year", yea)
, ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
]
presseLang :: ByteString -> [(ByteString, ByteString)] presseLang :: ByteString -> [(ByteString, ByteString)]
presseLang "Français" = [("language", "FR")] presseLang "Français" = [("language", "FR")]
presseLang "English" = [("language", "EN")] presseLang "English" = [("language", "EN")]
presseLang x = [("language", x)] presseLang x = [("language", x)]
presseFields :: ByteString -> ByteString presseFields :: ByteString -> ByteString
presseFields champs presseFields champs
| champs == "AU" = "authors" | champs == "AU" = "authors"
...@@ -67,7 +48,6 @@ presseFields champs ...@@ -67,7 +48,6 @@ presseFields champs
| champs == "N2" = "abstract" | champs == "N2" = "abstract"
| otherwise = champs | otherwise = champs
{- {-
fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)] fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixTitle ns = ns <> [ti, ab] fixTitle ns = ns <> [ti, ab]
......
...@@ -14,28 +14,21 @@ commentary with @some markup@. ...@@ -14,28 +14,21 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.WOS (wosParser) where module Gargantext.Text.Parsers.WOS (parser, keys) where
-- TOFIX : Should import Gargantext.Prelude here import Control.Applicative
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat) import Data.Attoparsec.ByteString (Parser, string, takeTill, take, manyTill, many1)
import qualified Data.List as DL
import Data.Monoid ((<>))
import Data.Attoparsec.ByteString (Parser, try, string
, takeTill, take
, manyTill, many1)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine) import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat) import Data.ByteString (ByteString)
import Data.ByteString.Char8 (pack) import Data.ByteString.Char8 (pack)
import Control.Applicative import Gargantext.Text.Parsers.RIS (fieldWith)
import Gargantext.Text.Parsers.RIS (fieldWith, lines) import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
------------------------------------------------------------- -------------------------------------------------------------
-- | wosParser parses ISI format from -- | wosParser parses ISI format from
-- Web Of Science Database -- Web Of Science Database
wosParser :: Parser [[(ByteString, ByteString)]] parser :: Parser [[(ByteString, ByteString)]]
wosParser = do parser = do
-- TODO Warning if version /= 1.0 -- TODO Warning if version /= 1.0
-- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ? -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
_ <- manyTill anyChar (string $ pack "\nVR 1.0") _ <- manyTill anyChar (string $ pack "\nVR 1.0")
...@@ -55,8 +48,8 @@ notice = start *> many (fieldWith field) <* end ...@@ -55,8 +48,8 @@ notice = start *> many (fieldWith field) <* end
end = manyTill anyChar (string $ pack "\nER\n") end = manyTill anyChar (string $ pack "\nER\n")
translate :: ByteString -> ByteString keys :: ByteString -> ByteString
translate champs keys champs
| champs == "AF" = "authors" | champs == "AF" = "authors"
| champs == "TI" = "title" | champs == "TI" = "title"
| champs == "SO" = "source" | champs == "SO" = "source"
...@@ -64,5 +57,3 @@ translate champs ...@@ -64,5 +57,3 @@ translate champs
| champs == "PD" = "publication_date" | champs == "PD" = "publication_date"
| champs == "AB" = "abstract" | champs == "AB" = "abstract"
| otherwise = champs | otherwise = champs
-------------------------------------------------------------
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment