Commit 33f1f510 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Parsers RIS && Presse.

parent dd670d8e
...@@ -22,42 +22,38 @@ please follow the types. ...@@ -22,42 +22,38 @@ please follow the types.
{-# LANGUAGE PackageImports #-} {-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs) module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs, risPress2csv)
where where
import System.FilePath (FilePath(), takeExtension)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries) import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join) import Control.Monad (join)
import qualified Data.Time as DT import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers) import Data.Either.Extra (partitionEithers)
import Data.Time (UTCTime(..))
import Data.List (concat) import Data.List (concat)
import qualified Data.Map as DM import Data.List (lookup)
import qualified Data.ByteString as DB
import Data.Ord() import Data.Ord()
import Data.String (String())
import Data.String() import Data.String()
import Data.Either(Either(..))
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as DT
-- Activate Async for to parse in parallel
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Text.Encoding (decodeUtf8) import Data.Text.Encoding (decodeUtf8)
import Data.String (String()) import Data.Time (UTCTime(..))
import Data.List (lookup) import Data.Tuple.Extra (both, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Time as DT
------------------------------------------------------------------------ ------------------------------------------------------------------------
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..)) import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Parsers.WOS (wosParser) import Gargantext.Text.Parsers.WOS (wosParser)
import Gargantext.Text.Parsers.RIS (risParser) import Gargantext.Text.Parsers.RIS (risParser, presseParser)
import Gargantext.Text.Parsers.Date (parseDate) import Gargantext.Text.Parsers.Date (parseDate)
import Gargantext.Text.Parsers.CSV (parseHal) import Gargantext.Text.Parsers.CSV (parseHal, writeDocs2Csv)
import Gargantext.Text.Terms.Stop (detectLang) import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -72,7 +68,7 @@ type ParseError = String ...@@ -72,7 +68,7 @@ type ParseError = String
-- | According to the format of Input file, -- | According to the format of Input file,
-- different parser are available. -- different parser are available.
data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3 data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3
deriving (Show) deriving (Show)
-- Implemented (ISI Format) -- Implemented (ISI Format)
...@@ -88,8 +84,9 @@ data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3 ...@@ -88,8 +84,9 @@ data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3
-- | Parse file into documents -- | Parse file into documents
-- TODO manage errors here -- TODO manage errors here
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument] parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs ff path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
parseDocs CsvHalFormat p = parseHal p parseDocs CsvHalFormat p = parseHal p
parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseParser <$> parse' RIS p
parseDocs ff path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
type Year = Int type Year = Int
type Month = Int type Month = Int
...@@ -138,15 +135,23 @@ toDoc ff d = do ...@@ -138,15 +135,23 @@ toDoc ff d = do
toDoc _ _ = undefined toDoc _ _ = undefined
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]]) parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do parse ff fp = enrichWith identity <$> parse' ff fp
enrichWith ::
([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith f = second (map both' . map f . concat)
where
both' = map (both decodeUtf8)
parse' :: FileFormat -> FilePath
-> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
parse' format path = do
files <- case takeExtension path of files <- case takeExtension path of
".zip" -> openZip path ".zip" -> openZip path
_ -> pure <$> DB.readFile path _ -> pure <$> DB.readFile path
(as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files partitionEithers <$> mapConcurrently (runParser format) files
pure (as, map toText $ concat bs)
where
-- TODO : decode with bayesian inference on encodings
toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
-- | withParser: -- | withParser:
...@@ -175,3 +180,8 @@ clean txt = DT.map clean' txt ...@@ -175,3 +180,8 @@ clean txt = DT.map clean' txt
clean' '’' = '\'' clean' '’' = '\''
clean' c = c clean' c = c
risPress2csv f = parseDocs RisPresse (f <> ".ris") >>= \hs -> writeDocs2Csv (f <> ".csv") hs
...@@ -25,14 +25,14 @@ import Control.Applicative ...@@ -25,14 +25,14 @@ import Control.Applicative
import Data.Char (ord) import Data.Char (ord)
import Data.Csv import Data.Csv
import Data.Either (Either(Left, Right)) import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate) import Data.Text (Text, pack, length, intercalate, unpack)
import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy as BL
import Data.Time.Segment (jour) import Data.Time.Segment (jour)
import Data.Vector (Vector) import Data.Vector (Vector)
import qualified Data.Vector as V import qualified Data.Vector as V
import Gargantext.Database.Types.Node (HyperdataDocument(..)) import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
import Gargantext.Text import Gargantext.Text
import Gargantext.Text.Context import Gargantext.Text.Context
import Gargantext.Prelude hiding (length) import Gargantext.Prelude hiding (length)
...@@ -83,6 +83,10 @@ doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) = ...@@ -83,6 +83,10 @@ doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
Nothing Nothing
Nothing Nothing
Nothing Nothing
--------------------------------------------------------------- ---------------------------------------------------------------
-- | Types Conversions -- | Types Conversions
toDocs :: Vector CsvDoc -> [Doc] toDocs :: Vector CsvDoc -> [Doc]
...@@ -174,6 +178,19 @@ instance ToNamedRecord CsvDoc where ...@@ -174,6 +178,19 @@ instance ToNamedRecord CsvDoc where
, "authors" .= aut , "authors" .= aut
] ]
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
(m $ _hyperdataDocument_source h)
(mI $ _hyperdataDocument_publication_year h)
(mI $ _hyperdataDocument_publication_month h)
(mI $ _hyperdataDocument_publication_day h)
(m $ _hyperdataDocument_abstract h)
(m $ _hyperdataDocument_authors h)
where
m = maybe "" identity
mI = maybe 0 identity
csvDecodeOptions :: DecodeOptions csvDecodeOptions :: DecodeOptions
csvDecodeOptions = (defaultDecodeOptions csvDecodeOptions = (defaultDecodeOptions
...@@ -212,7 +229,9 @@ writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO () ...@@ -212,7 +229,9 @@ writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp $ writeCsv fp (h, vs) = BL.writeFile fp $
encodeByNameWith csvEncodeOptions h (V.toList vs) encodeByNameWith csvEncodeOptions h (V.toList vs)
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp hs = BL.writeFile fp $
encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Hal Format -- Hal Format
data CsvHal = CsvHal data CsvHal = CsvHal
......
...@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00" ...@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw, parseGregorian, wrapDST) where
import Data.HashMap.Strict as HM hiding (map) import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn, pack) import Data.Text (Text, unpack, splitOn, pack)
...@@ -38,23 +38,23 @@ import qualified Data.HashSet as HashSet ...@@ -38,23 +38,23 @@ import qualified Data.HashSet as HashSet
import qualified Duckling.Core as DC import qualified Duckling.Core as DC
-- | Unused import (to parse Date Format, keeping it for maybe next steps) -- | Unused import (to parse Date Format, keeping it for maybe next steps)
-- import Control.Monad ((=<<)) import Control.Monad ((=<<))
-- import Data.Either (Either) import Data.Either (Either)
-- import Data.Fixed (Fixed (MkFixed)) import Data.Fixed (Fixed (MkFixed))
-- import Data.Foldable (length) import Data.Foldable (length)
-- import Data.String (String) import Data.String (String)
-- import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..)) import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
-- import Data.Time.Calendar (Day, fromGregorian) import Data.Time.Calendar (Day, fromGregorian)
-- import Duckling.Debug as DB import Duckling.Debug as DB
-- import Duckling.Engine (parseAndResolve) import Duckling.Engine (parseAndResolve)
-- import Duckling.Rules (rulesFor) import Duckling.Rules (rulesFor)
-- import Prelude (toInteger, div, otherwise, (++)) import Prelude (toInteger, div, otherwise, (++))
-- import Text.Parsec.Error (ParseError) import Text.Parsec.Error (ParseError)
-- import Text.Parsec.Prim (Stream, ParsecT) import Text.Parsec.Prim (Stream, ParsecT)
-- import Text.Parsec.String (Parser) import Text.Parsec.String (Parser)
-- import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf) import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
-- import Text.XML.HXT.DOM.Util (decimalStringToInt) import Text.XML.HXT.DOM.Util (decimalStringToInt)
-- import qualified Text.ParserCombinators.Parsec (parse) import qualified Text.ParserCombinators.Parsec (parse)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Date Parser -- | Date Parser
...@@ -64,13 +64,17 @@ import qualified Duckling.Core as DC ...@@ -64,13 +64,17 @@ import qualified Duckling.Core as DC
-- >>> parseDate EN (pack "April 10 1979") -- >>> parseDate EN (pack "April 10 1979")
-- 1979-04-10 00:00:00 UTC -- 1979-04-10 00:00:00 UTC
parseDate :: Lang -> Text -> IO UTCTime parseDate :: Lang -> Text -> IO UTCTime
parseDate lang s = do parseDate lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
type DateFormat = Text
type DateNull = Text
parseDate' :: DateFormat -> DateNull -> Lang -> Text -> IO UTCTime
parseDate' format def lang s = do
dateStr' <- parseDateRaw lang s dateStr' <- parseDateRaw lang s
let format = "%Y-%m-%dT%T" let dateStr = unpack $ maybe def identity
let dateStr = unpack $ maybe "0-0-0T0:0:0" identity
$ head $ splitOn "." dateStr' $ head $ splitOn "." dateStr'
pure $ parseTimeOrError True defaultTimeLocale format dateStr pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
-- TODO add Paris at Duckling.Locale Region datatype -- TODO add Paris at Duckling.Locale Region datatype
...@@ -117,63 +121,63 @@ parseDateWithDuckling lang input = do ...@@ -117,63 +121,63 @@ parseDateWithDuckling lang input = do
pure $ analyze input contxt $ HashSet.fromList [(This Time)] pure $ analyze input contxt $ HashSet.fromList [(This Time)]
-- | Permit to transform a String to an Int in a monadic context -- | Permit to transform a String to an Int in a monadic context
--wrapDST :: Monad m => String -> m Int wrapDST :: Monad m => String -> m Int
--wrapDST = (return . decimalStringToInt) wrapDST = return . decimalStringToInt
-- | Generic parser which take at least one element not given in argument -- | Generic parser which take at least one element not given in argument
--many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char] many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
--many1NoneOf = (many1 . noneOf) many1NoneOf = (many1 . noneOf)
--getMultiplicator :: Int -> Int --getMultiplicator :: Int -> Int
--getMultiplicator a getMultiplicator a
-- | 0 >= a = 1 | 0 >= a = 1
-- | otherwise = 10 * (getMultiplicator $ div a 10) | otherwise = 10 * (getMultiplicator $ div a 10)
-- | Parser for date format y-m-d -- | Parser for date format y-m-d
--parseGregorian :: Parser Day parseGregorian :: Parser Day
--parseGregorian = do parseGregorian = do
-- y <- wrapDST =<< many1NoneOf ['-'] y <- wrapDST =<< many1NoneOf ['-']
-- _ <- char '-' _ <- char '-'
-- m <- wrapDST =<< many1NoneOf ['-'] m <- wrapDST =<< many1NoneOf ['-']
-- _ <- char '-' _ <- char '-'
-- d <- wrapDST =<< many1NoneOf ['T'] d <- wrapDST =<< many1NoneOf ['T']
-- _ <- char 'T' _ <- char 'T'
-- return $ fromGregorian (toInteger y) m d return $ fromGregorian (toInteger y) m d
--
---- | Parser for time format h:m:s ---- | Parser for time format h:m:s
--parseTimeOfDay :: Parser TimeOfDay parseTimeOfDay :: Parser TimeOfDay
--parseTimeOfDay = do parseTimeOfDay = do
-- h <- wrapDST =<< many1NoneOf [':'] h <- wrapDST =<< many1NoneOf [':']
-- _ <- char ':' _ <- char ':'
-- m <- wrapDST =<< many1NoneOf [':'] m <- wrapDST =<< many1NoneOf [':']
-- _ <- char ':' _ <- char ':'
-- r <- many1NoneOf ['.'] r <- many1NoneOf ['.']
-- _ <- char '.' _ <- char '.'
-- dec <- many1NoneOf ['+', '-'] dec <- many1NoneOf ['+', '-']
-- let (nb, l) = (decimalStringToInt $ r ++ dec, length dec) let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
-- seconds = nb * 10^(12-l) seconds = nb * 10^(12-l)
-- return $ TimeOfDay h m (MkFixed . toInteger $ seconds) return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
--
--
-- | Parser for timezone format +hh:mm -- | Parser for timezone format +hh:mm
--parseTimeZone :: Parser TimeZone parseTimeZone :: Parser TimeZone
--parseTimeZone = do parseTimeZone = do
-- sign <- oneOf ['+', '-'] sign <- oneOf ['+', '-']
-- h <- wrapDST =<< many1NoneOf [':'] h <- wrapDST =<< many1NoneOf [':']
-- _ <- char ':' _ <- char ':'
-- m <- wrapDST =<< (many1 $ anyChar) m <- wrapDST =<< (many1 $ anyChar)
-- let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
-- in return $ TimeZone timeInMinute False "CET" in return $ TimeZone timeInMinute False "CET"
--
---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime ---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
--parseZonedTime :: Parser ZonedTime parseZonedTime :: Parser ZonedTime
--parseZonedTime= do parseZonedTime= do
-- d <- parseGregorian d <- parseGregorian
-- tod <- parseTimeOfDay tod <- parseTimeOfDay
-- tz <- parseTimeZone tz <- parseTimeZone
-- return $ ZonedTime (LocalTime d (tod)) tz return $ ZonedTime (LocalTime d (tod)) tz
--
---- | Opposite of toRFC3339 ---- | Opposite of toRFC3339
--fromRFC3339 :: Text -> Either ParseError ZonedTime fromRFC3339 :: Text -> Either ParseError ZonedTime
--fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
-- where input = unpack t where input = unpack t
...@@ -10,28 +10,30 @@ Portability : POSIX ...@@ -10,28 +10,30 @@ Portability : POSIX
RIS is a standardized tag format developed by Research Information RIS is a standardized tag format developed by Research Information
Systems, Incorporated (the format name refers to the company) to enable Systems, Incorporated (the format name refers to the company) to enable
citation programs to exchange data.[More](https://en.wikipedia.org/wiki/RIS_(file_format)) citation programs to exchange data.
[More](https://en.wikipedia.org/wiki/RIS_(file_format))
-} -}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.RIS (risParser) where module Gargantext.Text.Parsers.RIS (risParser, risDate, toDate, presseParser) where
import Data.Either (either)
import Data.List (lookup)
import Data.Tuple.Extra (first)
import Control.Applicative import Control.Applicative
import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput) import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput, parseOnly)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine) import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat) import Data.ByteString (ByteString, concat, length)
import Data.ByteString.Char8 (pack) import Data.ByteString.Char8 (pack)
import Data.Monoid ((<>)) import Data.Monoid ((<>))
import Gargantext.Prelude hiding (takeWhile, take, concat, readFile, lines, concat) import Gargantext.Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import qualified Data.List as DL import qualified Data.List as DL
------------------------------------------------------------- -------------------------------------------------------------
data Lines = OneLine | MultiLine
risParser :: Parser [[(ByteString, ByteString)]] risParser :: Parser [[(ByteString, ByteString)]]
risParser = do risParser = do
n <- notice "TY -" n <- notice "TY -"
...@@ -57,7 +59,6 @@ field = do ...@@ -57,7 +59,6 @@ field = do
False -> [] False -> []
pure (translate name, concat ([txt] <> txts')) pure (translate name, concat ([txt] <> txts'))
lines :: Parser [ByteString] lines :: Parser [ByteString]
lines = many line lines = many line
where where
...@@ -72,7 +73,38 @@ translate champs ...@@ -72,7 +73,38 @@ translate champs
| champs == "LA" = "language" | champs == "LA" = "language"
| champs == "DI" = "doi" | champs == "DI" = "doi"
| champs == "UR" = "url" | champs == "UR" = "url"
| champs == "DA" = "publication_date"
| champs == "N2" = "abstract" | champs == "N2" = "abstract"
| otherwise = champs | otherwise = champs
------------------------------------------------------------- -------------------------------------------------------------
presseParser :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseParser = (toDate "DA" (\x -> either (const []) identity $ parseOnly risDate x))
. (toDate "LA" presseLang)
risDate :: Parser [(ByteString, ByteString)]
risDate = do
day <- take 2 <* "/"
mon <- take 2 <* "/"
yea <- take 4
pure $ map (first (\x -> "publication_" <> x))
[ ("day",day)
, ("month", mon)
, ("year", yea)
, ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
]
toDate :: ByteString -> (ByteString -> [(ByteString, ByteString)])
-> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
toDate k f m = m <> ( maybe [] f (lookup k m) )
presseLang :: ByteString -> [(ByteString, ByteString)]
presseLang "Français" = [("language", "FR")]
presseLang "English" = [("langauge", "EN")]
presseLang _ = undefined
{-
fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixTitle ns = ns <> [ti, ab]
where
ti = case
-}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment