Commit 33f1f510 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Parsers RIS && Presse.

parent dd670d8e
......@@ -22,42 +22,38 @@ please follow the types.
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs, risPress2csv)
where
import System.FilePath (FilePath(), takeExtension)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join)
import qualified Data.Time as DT
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
import Data.Time (UTCTime(..))
import Data.List (concat)
import qualified Data.Map as DM
import qualified Data.ByteString as DB
import Data.List (lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Either(Either(..))
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Text (Text)
import qualified Data.Text as DT
-- Activate Async for to parse in parallel
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Text.Encoding (decodeUtf8)
import Data.String (String())
import Data.List (lookup)
import Data.Time (UTCTime(..))
import Data.Tuple.Extra (both, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Time as DT
------------------------------------------------------------------------
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Parsers.WOS (wosParser)
import Gargantext.Text.Parsers.RIS (risParser)
import Gargantext.Text.Parsers.RIS (risParser, presseParser)
import Gargantext.Text.Parsers.Date (parseDate)
import Gargantext.Text.Parsers.CSV (parseHal)
import Gargantext.Text.Parsers.CSV (parseHal, writeDocs2Csv)
import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------
......@@ -72,7 +68,7 @@ type ParseError = String
-- | According to the format of Input file,
-- different parser are available.
data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3
data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3
deriving (Show)
-- Implemented (ISI Format)
......@@ -88,8 +84,9 @@ data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3
-- | Parse file into documents
-- TODO manage errors here
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs ff path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
parseDocs CsvHalFormat p = parseHal p
parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseParser <$> parse' RIS p
parseDocs ff path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
type Year = Int
type Month = Int
......@@ -138,15 +135,23 @@ toDoc ff d = do
toDoc _ _ = undefined
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
parse ff fp = enrichWith identity <$> parse' ff fp
enrichWith ::
([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith f = second (map both' . map f . concat)
where
both' = map (both decodeUtf8)
parse' :: FileFormat -> FilePath
-> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
parse' format path = do
files <- case takeExtension path of
".zip" -> openZip path
_ -> pure <$> DB.readFile path
(as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
pure (as, map toText $ concat bs)
where
-- TODO : decode with bayesian inference on encodings
toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
partitionEithers <$> mapConcurrently (runParser format) files
-- | withParser:
......@@ -175,3 +180,8 @@ clean txt = DT.map clean' txt
clean' '’' = '\''
clean' c = c
risPress2csv f = parseDocs RisPresse (f <> ".ris") >>= \hs -> writeDocs2Csv (f <> ".csv") hs
......@@ -25,14 +25,14 @@ import Control.Applicative
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate)
import Data.Text (Text, pack, length, intercalate, unpack)
import qualified Data.ByteString.Lazy as BL
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
import Gargantext.Text
import Gargantext.Text.Context
import Gargantext.Prelude hiding (length)
......@@ -83,6 +83,10 @@ doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
Nothing
Nothing
Nothing
---------------------------------------------------------------
-- | Types Conversions
toDocs :: Vector CsvDoc -> [Doc]
......@@ -174,6 +178,19 @@ instance ToNamedRecord CsvDoc where
, "authors" .= aut
]
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
(m $ _hyperdataDocument_source h)
(mI $ _hyperdataDocument_publication_year h)
(mI $ _hyperdataDocument_publication_month h)
(mI $ _hyperdataDocument_publication_day h)
(m $ _hyperdataDocument_abstract h)
(m $ _hyperdataDocument_authors h)
where
m = maybe "" identity
mI = maybe 0 identity
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = (defaultDecodeOptions
......@@ -212,7 +229,9 @@ writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp $
encodeByNameWith csvEncodeOptions h (V.toList vs)
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp hs = BL.writeFile fp $
encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
------------------------------------------------------------------------
-- Hal Format
data CsvHal = CsvHal
......
......@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where
module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw, parseGregorian, wrapDST) where
import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn, pack)
......@@ -38,23 +38,23 @@ import qualified Data.HashSet as HashSet
import qualified Duckling.Core as DC
-- | Unused import (to parse Date Format, keeping it for maybe next steps)
-- import Control.Monad ((=<<))
-- import Data.Either (Either)
-- import Data.Fixed (Fixed (MkFixed))
-- import Data.Foldable (length)
-- import Data.String (String)
-- import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
-- import Data.Time.Calendar (Day, fromGregorian)
-- import Duckling.Debug as DB
-- import Duckling.Engine (parseAndResolve)
-- import Duckling.Rules (rulesFor)
-- import Prelude (toInteger, div, otherwise, (++))
-- import Text.Parsec.Error (ParseError)
-- import Text.Parsec.Prim (Stream, ParsecT)
-- import Text.Parsec.String (Parser)
-- import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
-- import Text.XML.HXT.DOM.Util (decimalStringToInt)
-- import qualified Text.ParserCombinators.Parsec (parse)
import Control.Monad ((=<<))
import Data.Either (Either)
import Data.Fixed (Fixed (MkFixed))
import Data.Foldable (length)
import Data.String (String)
import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
import Data.Time.Calendar (Day, fromGregorian)
import Duckling.Debug as DB
import Duckling.Engine (parseAndResolve)
import Duckling.Rules (rulesFor)
import Prelude (toInteger, div, otherwise, (++))
import Text.Parsec.Error (ParseError)
import Text.Parsec.Prim (Stream, ParsecT)
import Text.Parsec.String (Parser)
import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
import Text.XML.HXT.DOM.Util (decimalStringToInt)
import qualified Text.ParserCombinators.Parsec (parse)
------------------------------------------------------------------------
-- | Date Parser
......@@ -64,13 +64,17 @@ import qualified Duckling.Core as DC
-- >>> parseDate EN (pack "April 10 1979")
-- 1979-04-10 00:00:00 UTC
parseDate :: Lang -> Text -> IO UTCTime
parseDate lang s = do
parseDate lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
type DateFormat = Text
type DateNull = Text
parseDate' :: DateFormat -> DateNull -> Lang -> Text -> IO UTCTime
parseDate' format def lang s = do
dateStr' <- parseDateRaw lang s
let format = "%Y-%m-%dT%T"
let dateStr = unpack $ maybe "0-0-0T0:0:0" identity
let dateStr = unpack $ maybe def identity
$ head $ splitOn "." dateStr'
pure $ parseTimeOrError True defaultTimeLocale format dateStr
pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
-- TODO add Paris at Duckling.Locale Region datatype
......@@ -117,63 +121,63 @@ parseDateWithDuckling lang input = do
pure $ analyze input contxt $ HashSet.fromList [(This Time)]
-- | Permit to transform a String to an Int in a monadic context
--wrapDST :: Monad m => String -> m Int
--wrapDST = (return . decimalStringToInt)
wrapDST :: Monad m => String -> m Int
wrapDST = return . decimalStringToInt
-- | Generic parser which take at least one element not given in argument
--many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
--many1NoneOf = (many1 . noneOf)
many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
many1NoneOf = (many1 . noneOf)
--getMultiplicator :: Int -> Int
--getMultiplicator a
-- | 0 >= a = 1
-- | otherwise = 10 * (getMultiplicator $ div a 10)
getMultiplicator a
| 0 >= a = 1
| otherwise = 10 * (getMultiplicator $ div a 10)
-- | Parser for date format y-m-d
--parseGregorian :: Parser Day
--parseGregorian = do
-- y <- wrapDST =<< many1NoneOf ['-']
-- _ <- char '-'
-- m <- wrapDST =<< many1NoneOf ['-']
-- _ <- char '-'
-- d <- wrapDST =<< many1NoneOf ['T']
-- _ <- char 'T'
-- return $ fromGregorian (toInteger y) m d
--
parseGregorian :: Parser Day
parseGregorian = do
y <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
m <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
d <- wrapDST =<< many1NoneOf ['T']
_ <- char 'T'
return $ fromGregorian (toInteger y) m d
---- | Parser for time format h:m:s
--parseTimeOfDay :: Parser TimeOfDay
--parseTimeOfDay = do
-- h <- wrapDST =<< many1NoneOf [':']
-- _ <- char ':'
-- m <- wrapDST =<< many1NoneOf [':']
-- _ <- char ':'
-- r <- many1NoneOf ['.']
-- _ <- char '.'
-- dec <- many1NoneOf ['+', '-']
-- let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
-- seconds = nb * 10^(12-l)
-- return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
--
--
parseTimeOfDay :: Parser TimeOfDay
parseTimeOfDay = do
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
r <- many1NoneOf ['.']
_ <- char '.'
dec <- many1NoneOf ['+', '-']
let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
seconds = nb * 10^(12-l)
return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
-- | Parser for timezone format +hh:mm
--parseTimeZone :: Parser TimeZone
--parseTimeZone = do
-- sign <- oneOf ['+', '-']
-- h <- wrapDST =<< many1NoneOf [':']
-- _ <- char ':'
-- m <- wrapDST =<< (many1 $ anyChar)
-- let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
-- in return $ TimeZone timeInMinute False "CET"
--
parseTimeZone :: Parser TimeZone
parseTimeZone = do
sign <- oneOf ['+', '-']
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< (many1 $ anyChar)
let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
in return $ TimeZone timeInMinute False "CET"
---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
--parseZonedTime :: Parser ZonedTime
--parseZonedTime= do
-- d <- parseGregorian
-- tod <- parseTimeOfDay
-- tz <- parseTimeZone
-- return $ ZonedTime (LocalTime d (tod)) tz
--
parseZonedTime :: Parser ZonedTime
parseZonedTime= do
d <- parseGregorian
tod <- parseTimeOfDay
tz <- parseTimeZone
return $ ZonedTime (LocalTime d (tod)) tz
---- | Opposite of toRFC3339
--fromRFC3339 :: Text -> Either ParseError ZonedTime
--fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
-- where input = unpack t
fromRFC3339 :: Text -> Either ParseError ZonedTime
fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
where input = unpack t
......@@ -10,28 +10,30 @@ Portability : POSIX
RIS is a standardized tag format developed by Research Information
Systems, Incorporated (the format name refers to the company) to enable
citation programs to exchange data.[More](https://en.wikipedia.org/wiki/RIS_(file_format))
citation programs to exchange data.
[More](https://en.wikipedia.org/wiki/RIS_(file_format))
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.RIS (risParser) where
module Gargantext.Text.Parsers.RIS (risParser, risDate, toDate, presseParser) where
import Data.Either (either)
import Data.List (lookup)
import Data.Tuple.Extra (first)
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput)
import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput, parseOnly)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat)
import Data.ByteString (ByteString, concat, length)
import Data.ByteString.Char8 (pack)
import Data.Monoid ((<>))
import Gargantext.Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import qualified Data.List as DL
-------------------------------------------------------------
data Lines = OneLine | MultiLine
risParser :: Parser [[(ByteString, ByteString)]]
risParser = do
n <- notice "TY -"
......@@ -57,7 +59,6 @@ field = do
False -> []
pure (translate name, concat ([txt] <> txts'))
lines :: Parser [ByteString]
lines = many line
where
......@@ -72,7 +73,38 @@ translate champs
| champs == "LA" = "language"
| champs == "DI" = "doi"
| champs == "UR" = "url"
| champs == "DA" = "publication_date"
| champs == "N2" = "abstract"
| otherwise = champs
-------------------------------------------------------------
presseParser :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseParser = (toDate "DA" (\x -> either (const []) identity $ parseOnly risDate x))
. (toDate "LA" presseLang)
risDate :: Parser [(ByteString, ByteString)]
risDate = do
day <- take 2 <* "/"
mon <- take 2 <* "/"
yea <- take 4
pure $ map (first (\x -> "publication_" <> x))
[ ("day",day)
, ("month", mon)
, ("year", yea)
, ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
]
toDate :: ByteString -> (ByteString -> [(ByteString, ByteString)])
-> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
toDate k f m = m <> ( maybe [] f (lookup k m) )
presseLang :: ByteString -> [(ByteString, ByteString)]
presseLang "Français" = [("language", "FR")]
presseLang "English" = [("langauge", "EN")]
presseLang _ = undefined
{-
fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixTitle ns = ns <> [ti, ab]
where
ti = case
-}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment