[FEAT] Parsers RIS && Presse.

33f1f510 · Alexandre Delanoë · dd670d8e · 33f1f510 · 33f1f510 · 33f1f510
Commit 33f1f510 authored May 03, 2019 by Alexandre Delanoë
Showing with 177 additions and 112 deletions

Parsers.hs src/Gargantext/Text/Parsers.hs +37 -27

CSV.hs src/Gargantext/Text/Parsers/CSV.hs +22 -3

Date.hs src/Gargantext/Text/Parsers/Date.hs +77 -73

RIS.hs src/Gargantext/Text/Parsers/RIS.hs +41 -9

No files found.
--- a/src/Gargantext/Text/Parsers.hs
+++ b/src/Gargantext/Text/Parsers.hs
@@ -22,42 +22,38 @@ please follow the types.
 {-# LANGUAGE PackageImports    #-}
 {-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
+module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs, risPress2csv)
    where
-import System.FilePath (FilePath(), takeExtension)
 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
+import Control.Concurrent.Async as CCA (mapConcurrently)
 import Control.Monad (join)
-import qualified Data.Time as DT
+import Data.Attoparsec.ByteString (parseOnly, Parser)
+import Data.Either(Either(..))
 import Data.Either.Extra (partitionEithers)
-import Data.Time (UTCTime(..))
 import Data.List (concat)
-import qualified Data.Map        as DM
+import Data.List (lookup)
-import qualified Data.ByteString as DB
 import Data.Ord()
+import Data.String (String())
 import Data.String()
-import Data.Either(Either(..))
-import Data.Attoparsec.ByteString (parseOnly, Parser)
 import Data.Text (Text)
-import qualified Data.Text as DT
-- Activate Async for to parse in parallel
-import Control.Concurrent.Async as CCA (mapConcurrently)
 import Data.Text.Encoding (decodeUtf8)
-import Data.String (String())
+import Data.Time (UTCTime(..))
-import Data.List (lookup)
+import Data.Tuple.Extra (both, second)
+import System.FilePath (FilePath(), takeExtension)
+import qualified Data.ByteString as DB
+import qualified Data.Map        as DM
+import qualified Data.Text as DT
+import qualified Data.Time as DT
 ------------------------------------------------------------------------
 import Gargantext.Core (Lang(..))
 import Gargantext.Prelude
 import Gargantext.Database.Types.Node (HyperdataDocument(..))
 import Gargantext.Text.Parsers.WOS (wosParser)
-import Gargantext.Text.Parsers.RIS (risParser)
+import Gargantext.Text.Parsers.RIS (risParser, presseParser)
 import Gargantext.Text.Parsers.Date (parseDate)
-import Gargantext.Text.Parsers.CSV (parseHal)
+import Gargantext.Text.Parsers.CSV (parseHal, writeDocs2Csv)
 import Gargantext.Text.Terms.Stop (detectLang)
 ------------------------------------------------------------------------
@@ -72,7 +68,7 @@ type ParseError = String
 -- | According to the format of Input file,
 -- different parser are available.
-data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3
+data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3
  deriving (Show)
 -- Implemented (ISI Format)
@@ -88,8 +84,9 @@ data FileFormat = WOS | RIS | CsvHalFormat -- | CsvGargV3
 -- | Parse file into documents
 -- TODO manage errors here
 parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
-parseDocs ff    path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
 parseDocs CsvHalFormat p = parseHal p
+parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseParser <$>  parse' RIS p
+parseDocs ff    path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
 type Year  = Int
 type Month = Int
@@ -138,15 +135,23 @@ toDoc ff d = do
 toDoc _ _ = undefined
 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
-parse format path = do
+parse ff fp = enrichWith identity <$> parse' ff fp
+enrichWith ::
+  ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
+  ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
+enrichWith f = second (map both' . map f . concat)
+  where
+    both'   = map (both decodeUtf8)
+parse' :: FileFormat -> FilePath
+       -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
+parse' format path = do
    files <- case takeExtension path of
              ".zip" -> openZip              path
              _      -> pure <$> DB.readFile path
-    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
+    partitionEithers <$> mapConcurrently (runParser format) files
-    pure (as, map toText $ concat bs)
-      where
-        -- TODO : decode with bayesian inference on encodings
-        toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
 -- | withParser:
@@ -175,3 +180,8 @@ clean txt = DT.map clean' txt
    clean' '’' = '\''
    clean' c  = c
+risPress2csv f = parseDocs RisPresse (f <> ".ris") >>= \hs -> writeDocs2Csv (f <> ".csv") hs
--- a/src/Gargantext/Text/Parsers/CSV.hs
+++ b/src/Gargantext/Text/Parsers/CSV.hs
@@ -25,14 +25,14 @@ import Control.Applicative
 import Data.Char (ord)
 import Data.Csv
 import Data.Either (Either(Left, Right))
-import Data.Text (Text, pack, length, intercalate)
+import Data.Text (Text, pack, length, intercalate, unpack)
 import qualified Data.ByteString.Lazy as BL
 import Data.Time.Segment (jour)
 import Data.Vector (Vector)
 import qualified Data.Vector as V
-import Gargantext.Database.Types.Node (HyperdataDocument(..))
+import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
 import Gargantext.Text
 import Gargantext.Text.Context
 import Gargantext.Prelude hiding (length)
@@ -83,6 +83,10 @@ doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
                    Nothing
                    Nothing
                    Nothing
 ---------------------------------------------------------------
 -- | Types Conversions
 toDocs :: Vector CsvDoc -> [Doc]
@@ -174,6 +178,19 @@ instance ToNamedRecord CsvDoc where
                , "authors"           .= aut
               ]
+hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
+hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
+                                    (m $ _hyperdataDocument_source h)
+                                    (mI $ _hyperdataDocument_publication_year h)
+                                    (mI $ _hyperdataDocument_publication_month h)
+                                    (mI $ _hyperdataDocument_publication_day   h)
+                                    (m $ _hyperdataDocument_abstract h)
+                                    (m $ _hyperdataDocument_authors h)
+  where
+    m = maybe "" identity
+    mI = maybe 0 identity
 csvDecodeOptions :: DecodeOptions
 csvDecodeOptions = (defaultDecodeOptions
@@ -212,7 +229,9 @@ writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 writeCsv fp (h, vs) = BL.writeFile fp $
                      encodeByNameWith csvEncodeOptions h (V.toList vs)
+writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
+writeDocs2Csv fp hs = BL.writeFile fp $
+                    encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
 ------------------------------------------------------------------------
 -- Hal Format
 data CsvHal = CsvHal

--- a/src/Gargantext/Text/Parsers/Date.hs
+++ b/src/Gargantext/Text/Parsers/Date.hs
@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where
+module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw, parseGregorian, wrapDST) where
 import Data.HashMap.Strict as HM hiding (map)
 import Data.Text (Text, unpack, splitOn, pack)
@@ -38,23 +38,23 @@ import qualified Data.HashSet as HashSet
 import qualified Duckling.Core as DC
 -- | Unused import (to parse Date Format, keeping it for maybe next steps)
-- import Control.Monad ((=<<))
+import Control.Monad ((=<<))
-- import Data.Either (Either)
+import Data.Either (Either)
-- import Data.Fixed (Fixed (MkFixed))
+import Data.Fixed (Fixed (MkFixed))
-- import Data.Foldable (length)
+import Data.Foldable (length)
-- import Data.String (String)
+import Data.String (String)
-- import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
+import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
-- import Data.Time.Calendar (Day, fromGregorian)
+import Data.Time.Calendar (Day, fromGregorian)
-- import Duckling.Debug as DB
+import Duckling.Debug as DB
-- import Duckling.Engine (parseAndResolve)
+import Duckling.Engine (parseAndResolve)
-- import Duckling.Rules (rulesFor)
+import Duckling.Rules (rulesFor)
-- import Prelude (toInteger, div, otherwise, (++))
+import Prelude (toInteger, div, otherwise, (++))
-- import Text.Parsec.Error (ParseError)
+import Text.Parsec.Error (ParseError)
-- import Text.Parsec.Prim (Stream, ParsecT)
+import Text.Parsec.Prim (Stream, ParsecT)
-- import Text.Parsec.String (Parser)
+import Text.Parsec.String (Parser)
-- import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
+import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
-- import Text.XML.HXT.DOM.Util (decimalStringToInt)
+import Text.XML.HXT.DOM.Util (decimalStringToInt)
-- import qualified Text.ParserCombinators.Parsec (parse)
+import qualified Text.ParserCombinators.Parsec (parse)
 ------------------------------------------------------------------------
 -- | Date Parser
@@ -64,13 +64,17 @@ import qualified Duckling.Core as DC
 -- >>> parseDate EN (pack "April 10 1979")
 -- 1979-04-10 00:00:00 UTC
 parseDate :: Lang -> Text -> IO UTCTime
-parseDate lang s = do
+parseDate lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
+type DateFormat = Text
+type DateNull   = Text
+parseDate' :: DateFormat -> DateNull -> Lang -> Text -> IO UTCTime
+parseDate' format def lang s = do
  dateStr' <- parseDateRaw lang s
-  let format  = "%Y-%m-%dT%T"
+  let dateStr = unpack $ maybe def identity
-  let dateStr = unpack $ maybe "0-0-0T0:0:0" identity
                       $ head $ splitOn "." dateStr'
-  pure $ parseTimeOrError True defaultTimeLocale format dateStr
+  pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
 -- TODO add Paris at Duckling.Locale Region datatype
@@ -117,63 +121,63 @@ parseDateWithDuckling lang input = do
    pure $ analyze input contxt $ HashSet.fromList [(This Time)]
 -- | Permit to transform a String to an Int in a monadic context
--wrapDST :: Monad m => String -> m Int
+wrapDST :: Monad m => String -> m Int
--wrapDST = (return . decimalStringToInt)
+wrapDST = return . decimalStringToInt
 -- | Generic parser which take at least one element not given in argument
--many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
+many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
--many1NoneOf = (many1 . noneOf)
+many1NoneOf = (many1 . noneOf)
 --getMultiplicator :: Int -> Int
--getMultiplicator a
+getMultiplicator a
--  | 0 >= a = 1
+  | 0 >= a = 1
--  | otherwise = 10 * (getMultiplicator $ div a 10)
+  | otherwise = 10 * (getMultiplicator $ div a 10)
 -- | Parser for date format y-m-d
--parseGregorian :: Parser Day
+parseGregorian :: Parser Day
--parseGregorian  = do
+parseGregorian  = do
--        y <- wrapDST =<< many1NoneOf ['-']
+        y <- wrapDST =<< many1NoneOf ['-']
--        _ <- char '-'
+        _ <- char '-'
--        m <- wrapDST =<< many1NoneOf ['-']
+        m <- wrapDST =<< many1NoneOf ['-']
--        _ <- char '-'
+        _ <- char '-'
--        d <- wrapDST =<< many1NoneOf ['T']
+        d <- wrapDST =<< many1NoneOf ['T']
--        _ <- char 'T'
+        _ <- char 'T'
--        return $ fromGregorian (toInteger y) m d
+        return $ fromGregorian (toInteger y) m d
--
 ---- | Parser for time format h:m:s
--parseTimeOfDay :: Parser TimeOfDay
+parseTimeOfDay :: Parser TimeOfDay
--parseTimeOfDay = do
+parseTimeOfDay = do
--        h <- wrapDST =<< many1NoneOf [':']
+        h <- wrapDST =<< many1NoneOf [':']
--        _ <- char ':'
+        _ <- char ':'
--        m <- wrapDST =<< many1NoneOf [':']
+        m <- wrapDST =<< many1NoneOf [':']
--        _ <- char ':'
+        _ <- char ':'
--        r <- many1NoneOf ['.']
+        r <- many1NoneOf ['.']
--        _ <- char '.'
+        _ <- char '.'
--        dec <- many1NoneOf ['+', '-']
+        dec <- many1NoneOf ['+', '-']
--        let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
+        let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
--            seconds = nb * 10^(12-l)
+            seconds = nb * 10^(12-l)
--        return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
+        return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
--
--
 -- | Parser for timezone format +hh:mm
--parseTimeZone :: Parser TimeZone
+parseTimeZone :: Parser TimeZone
--parseTimeZone = do
+parseTimeZone = do
--        sign <- oneOf ['+', '-']
+        sign <- oneOf ['+', '-']
--        h <- wrapDST =<< many1NoneOf [':']
+        h <- wrapDST =<< many1NoneOf [':']
--        _ <- char ':'
+        _ <- char ':'
--        m <- wrapDST =<< (many1 $ anyChar)
+        m <- wrapDST =<< (many1 $ anyChar)
--        let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
+        let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
--         in return $ TimeZone timeInMinute False "CET"
+         in return $ TimeZone timeInMinute False "CET"
--
 ---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
--parseZonedTime :: Parser ZonedTime
+parseZonedTime :: Parser ZonedTime
--parseZonedTime= do
+parseZonedTime= do
--        d <- parseGregorian
+        d <- parseGregorian
--        tod <- parseTimeOfDay
+        tod <- parseTimeOfDay
--        tz <- parseTimeZone
+        tz <- parseTimeZone
--        return $ ZonedTime (LocalTime d (tod)) tz
+        return $ ZonedTime (LocalTime d (tod)) tz
--
 ---- | Opposite of toRFC3339
--fromRFC3339 :: Text -> Either ParseError ZonedTime
+fromRFC3339 :: Text -> Either ParseError ZonedTime
--fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
+fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
--        where input = unpack t
+        where input = unpack t
--- a/src/Gargantext/Text/Parsers/RIS.hs
+++ b/src/Gargantext/Text/Parsers/RIS.hs
@@ -10,28 +10,30 @@ Portability : POSIX
 RIS is a standardized tag format developed by Research Information
 Systems, Incorporated (the format name refers to the company) to enable
-citation programs to exchange data.[More](https://en.wikipedia.org/wiki/RIS_(file_format))
+citation programs to exchange data.
+[More](https://en.wikipedia.org/wiki/RIS_(file_format))
 -}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Parsers.RIS (risParser) where
+module Gargantext.Text.Parsers.RIS (risParser, risDate, toDate, presseParser) where
+import Data.Either (either)
+import Data.List (lookup)
+import Data.Tuple.Extra (first)
 import Control.Applicative
-import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput)
+import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput, parseOnly)
 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
-import Data.ByteString (ByteString, concat)
+import Data.ByteString (ByteString, concat, length)
 import Data.ByteString.Char8 (pack)
 import Data.Monoid ((<>))
 import Gargantext.Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
 import qualified Data.List as DL
 -------------------------------------------------------------
-data Lines = OneLine | MultiLine
 risParser :: Parser [[(ByteString, ByteString)]]
 risParser = do
    n  <- notice "TY  -"
@@ -57,7 +59,6 @@ field = do
            False -> []
    pure (translate name, concat ([txt] <> txts'))
 lines :: Parser [ByteString]
 lines = many line
    where
@@ -72,7 +73,38 @@ translate champs
            | champs == "LA" = "language"
            | champs == "DI" = "doi"
            | champs == "UR" = "url"
-            | champs == "DA" = "publication_date"
            | champs == "N2" = "abstract"
            | otherwise  = champs
 -------------------------------------------------------------
+presseParser :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
+presseParser = (toDate "DA" (\x -> either (const []) identity $ parseOnly risDate x))
+             . (toDate "LA" presseLang)
+risDate :: Parser [(ByteString, ByteString)]
+risDate = do
+  day <- take 2 <* "/"
+  mon <- take 2 <* "/"
+  yea <- take 4
+  pure $ map (first (\x -> "publication_" <> x))
+       [ ("day",day)
+       , ("month", mon)
+       , ("year", yea)
+       , ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
+       ]
+toDate :: ByteString -> (ByteString -> [(ByteString, ByteString)])
+       -> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
+toDate k f m = m <> ( maybe [] f (lookup k m) )
+presseLang :: ByteString -> [(ByteString, ByteString)]
+presseLang "Français" = [("language", "FR")]
+presseLang "English"  = [("langauge", "EN")]
+presseLang _ = undefined
+{-
+fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
+fixTitle ns = ns <> [ti, ab]
+  where
+    ti = case 
+-}