Commit c45062cb authored by Alexandre Delanoë's avatar Alexandre Delanoë

[CONVERT] convert file between formats (RIS -> CSV).

parent 33f1f510
{-|
Module : Gargantext.Text.Convert
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Format Converter.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Convert (risPress2csv)
where
import System.FilePath (FilePath()) -- , takeExtension)
import Gargantext.Prelude
import Gargantext.Text.Parsers.CSV (writeDocs2Csv)
import Gargantext.Text.Parsers (parseDocs, FileFormat(..))
risPress2csv :: FilePath -> IO ()
risPress2csv f = parseDocs RisPresse (f <> ".ris")
>>= \hs -> writeDocs2Csv (f <> ".csv") hs
......@@ -51,7 +51,8 @@ import Gargantext.Core (Lang(..))
import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Parsers.WOS (wosParser)
import Gargantext.Text.Parsers.RIS (risParser, presseParser)
import Gargantext.Text.Parsers.RIS (risParser)
import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Text.Parsers.Date (parseDate)
import Gargantext.Text.Parsers.CSV (parseHal, writeDocs2Csv)
import Gargantext.Text.Terms.Stop (detectLang)
......@@ -85,7 +86,7 @@ data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3
-- TODO manage errors here
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs CsvHalFormat p = parseHal p
parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseParser <$> parse' RIS p
parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich <$> parse' RIS p
parseDocs ff path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
type Year = Int
......@@ -132,7 +133,6 @@ toDoc ff d = do
Nothing
Nothing
(Just $ (DT.pack . show) lang)
toDoc _ _ = undefined
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse ff fp = enrichWith identity <$> parse' ff fp
......@@ -184,4 +184,3 @@ clean txt = DT.map clean' txt
risPress2csv f = parseDocs RisPresse (f <> ".ris") >>= \hs -> writeDocs2Csv (f <> ".csv") hs
......@@ -25,7 +25,7 @@ import Control.Applicative
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate, unpack)
import Data.Text (Text, pack, length, intercalate)
import qualified Data.ByteString.Lazy as BL
import Data.Time.Segment (jour)
......
......@@ -19,7 +19,7 @@ citation programs to exchange data.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.RIS (risParser, risDate, toDate, presseParser) where
module Gargantext.Text.Parsers.RIS (risParser, withField) where
import Data.Either (either)
import Data.List (lookup)
......@@ -57,7 +57,7 @@ field = do
let txts' = case DL.length txts > 0 of
True -> txts
False -> []
pure (translate name, concat ([txt] <> txts'))
pure (name, concat ([txt] <> txts'))
lines :: Parser [ByteString]
lines = many line
......@@ -65,46 +65,8 @@ lines = many line
line :: Parser ByteString
line = "\n\n" *> takeTill isEndOfLine
translate :: ByteString -> ByteString
translate champs
| champs == "AU" = "authors"
| champs == "TI" = "title"
| champs == "JF" = "source"
| champs == "LA" = "language"
| champs == "DI" = "doi"
| champs == "UR" = "url"
| champs == "N2" = "abstract"
| otherwise = champs
-------------------------------------------------------------
presseParser :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseParser = (toDate "DA" (\x -> either (const []) identity $ parseOnly risDate x))
. (toDate "LA" presseLang)
risDate :: Parser [(ByteString, ByteString)]
risDate = do
day <- take 2 <* "/"
mon <- take 2 <* "/"
yea <- take 4
pure $ map (first (\x -> "publication_" <> x))
[ ("day",day)
, ("month", mon)
, ("year", yea)
, ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
]
toDate :: ByteString -> (ByteString -> [(ByteString, ByteString)])
withField :: ByteString -> (ByteString -> [(ByteString, ByteString)])
-> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
toDate k f m = m <> ( maybe [] f (lookup k m) )
withField k f m = m <> ( maybe [] f (lookup k m) )
presseLang :: ByteString -> [(ByteString, ByteString)]
presseLang "Français" = [("language", "FR")]
presseLang "English" = [("langauge", "EN")]
presseLang _ = undefined
{-
fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixTitle ns = ns <> [ti, ab]
where
ti = case
-}
{-|
Module : Gargantext.Text.Parsers.RIS.Presse
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Presse RIS format parser en enricher.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.RIS.Presse (presseEnrich) where
import Data.Either (either)
import Data.List (lookup)
import Data.Tuple.Extra (first)
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput, parseOnly)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat, length)
import Data.ByteString.Char8 (pack)
import Data.Monoid ((<>))
import Gargantext.Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import Gargantext.Text.Parsers.RIS (withField)
import qualified Data.List as DL
-------------------------------------------------------------
-------------------------------------------------------------
presseEnrich :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseEnrich = (withField "DA" presseDate)
. (withField "LA" presseLang)
. (map (first presseFields))
presseDate :: ByteString -> [(ByteString, ByteString)]
presseDate str = either (const []) identity $ parseOnly parseDate str
parseDate :: Parser [(ByteString, ByteString)]
parseDate = do
day <- take 2 <* "/"
mon <- take 2 <* "/"
yea <- take 4
pure $ map (first (\x -> "publication_" <> x))
[ ("day",day)
, ("month", mon)
, ("year", yea)
, ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
]
presseLang :: ByteString -> [(ByteString, ByteString)]
presseLang "Français" = [("language", "FR")]
presseLang "English" = [("language", "EN")]
presseLang _ = undefined
presseFields :: ByteString -> ByteString
presseFields champs
| champs == "AU" = "authors"
| champs == "TI" = "title"
| champs == "JF" = "source"
| champs == "DI" = "doi"
| champs == "UR" = "url"
| champs == "N2" = "abstract"
| otherwise = champs
{-
fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixTitle ns = ns <> [ti, ab]
where
ti = case
-}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment