Commit c45062cb authored by Alexandre Delanoë's avatar Alexandre Delanoë

[CONVERT] convert file between formats (RIS -> CSV).

parent 33f1f510
{-|
Module : Gargantext.Text.Convert
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Format Converter.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Convert (risPress2csv)
where
import System.FilePath (FilePath()) -- , takeExtension)
import Gargantext.Prelude
import Gargantext.Text.Parsers.CSV (writeDocs2Csv)
import Gargantext.Text.Parsers (parseDocs, FileFormat(..))
risPress2csv :: FilePath -> IO ()
risPress2csv f = parseDocs RisPresse (f <> ".ris")
>>= \hs -> writeDocs2Csv (f <> ".csv") hs
...@@ -51,7 +51,8 @@ import Gargantext.Core (Lang(..)) ...@@ -51,7 +51,8 @@ import Gargantext.Core (Lang(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..)) import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Parsers.WOS (wosParser) import Gargantext.Text.Parsers.WOS (wosParser)
import Gargantext.Text.Parsers.RIS (risParser, presseParser) import Gargantext.Text.Parsers.RIS (risParser)
import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Text.Parsers.Date (parseDate) import Gargantext.Text.Parsers.Date (parseDate)
import Gargantext.Text.Parsers.CSV (parseHal, writeDocs2Csv) import Gargantext.Text.Parsers.CSV (parseHal, writeDocs2Csv)
import Gargantext.Text.Terms.Stop (detectLang) import Gargantext.Text.Terms.Stop (detectLang)
...@@ -85,7 +86,7 @@ data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3 ...@@ -85,7 +86,7 @@ data FileFormat = WOS | RIS | CsvHalFormat | RisPresse -- | CsvGargV3
-- TODO manage errors here -- TODO manage errors here
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument] parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs CsvHalFormat p = parseHal p parseDocs CsvHalFormat p = parseHal p
parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseParser <$> parse' RIS p parseDocs RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich <$> parse' RIS p
parseDocs ff path = join $ mapM (toDoc ff) <$> snd <$> parse ff path parseDocs ff path = join $ mapM (toDoc ff) <$> snd <$> parse ff path
type Year = Int type Year = Int
...@@ -132,7 +133,6 @@ toDoc ff d = do ...@@ -132,7 +133,6 @@ toDoc ff d = do
Nothing Nothing
Nothing Nothing
(Just $ (DT.pack . show) lang) (Just $ (DT.pack . show) lang)
toDoc _ _ = undefined
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]]) parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse ff fp = enrichWith identity <$> parse' ff fp parse ff fp = enrichWith identity <$> parse' ff fp
...@@ -184,4 +184,3 @@ clean txt = DT.map clean' txt ...@@ -184,4 +184,3 @@ clean txt = DT.map clean' txt
risPress2csv f = parseDocs RisPresse (f <> ".ris") >>= \hs -> writeDocs2Csv (f <> ".csv") hs risPress2csv f = parseDocs RisPresse (f <> ".ris") >>= \hs -> writeDocs2Csv (f <> ".csv") hs
...@@ -25,7 +25,7 @@ import Control.Applicative ...@@ -25,7 +25,7 @@ import Control.Applicative
import Data.Char (ord) import Data.Char (ord)
import Data.Csv import Data.Csv
import Data.Either (Either(Left, Right)) import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate, unpack) import Data.Text (Text, pack, length, intercalate)
import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy as BL
import Data.Time.Segment (jour) import Data.Time.Segment (jour)
......
...@@ -19,7 +19,7 @@ citation programs to exchange data. ...@@ -19,7 +19,7 @@ citation programs to exchange data.
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.RIS (risParser, risDate, toDate, presseParser) where module Gargantext.Text.Parsers.RIS (risParser, withField) where
import Data.Either (either) import Data.Either (either)
import Data.List (lookup) import Data.List (lookup)
...@@ -57,7 +57,7 @@ field = do ...@@ -57,7 +57,7 @@ field = do
let txts' = case DL.length txts > 0 of let txts' = case DL.length txts > 0 of
True -> txts True -> txts
False -> [] False -> []
pure (translate name, concat ([txt] <> txts')) pure (name, concat ([txt] <> txts'))
lines :: Parser [ByteString] lines :: Parser [ByteString]
lines = many line lines = many line
...@@ -65,46 +65,8 @@ lines = many line ...@@ -65,46 +65,8 @@ lines = many line
line :: Parser ByteString line :: Parser ByteString
line = "\n\n" *> takeTill isEndOfLine line = "\n\n" *> takeTill isEndOfLine
translate :: ByteString -> ByteString
translate champs
| champs == "AU" = "authors"
| champs == "TI" = "title"
| champs == "JF" = "source"
| champs == "LA" = "language"
| champs == "DI" = "doi"
| champs == "UR" = "url"
| champs == "N2" = "abstract"
| otherwise = champs
------------------------------------------------------------- -------------------------------------------------------------
withField :: ByteString -> (ByteString -> [(ByteString, ByteString)])
presseParser :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseParser = (toDate "DA" (\x -> either (const []) identity $ parseOnly risDate x))
. (toDate "LA" presseLang)
risDate :: Parser [(ByteString, ByteString)]
risDate = do
day <- take 2 <* "/"
mon <- take 2 <* "/"
yea <- take 4
pure $ map (first (\x -> "publication_" <> x))
[ ("day",day)
, ("month", mon)
, ("year", yea)
, ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
]
toDate :: ByteString -> (ByteString -> [(ByteString, ByteString)])
-> [(ByteString, ByteString)] -> [(ByteString, ByteString)] -> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
toDate k f m = m <> ( maybe [] f (lookup k m) ) withField k f m = m <> ( maybe [] f (lookup k m) )
presseLang :: ByteString -> [(ByteString, ByteString)]
presseLang "Français" = [("language", "FR")]
presseLang "English" = [("langauge", "EN")]
presseLang _ = undefined
{-
fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixTitle ns = ns <> [ti, ab]
where
ti = case
-}
{-|
Module : Gargantext.Text.Parsers.RIS.Presse
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Presse RIS format parser en enricher.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.RIS.Presse (presseEnrich) where
import Data.Either (either)
import Data.List (lookup)
import Data.Tuple.Extra (first)
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, try, string, takeTill, take, manyTill, many1, endOfInput, parseOnly)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat, length)
import Data.ByteString.Char8 (pack)
import Data.Monoid ((<>))
import Gargantext.Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import Gargantext.Text.Parsers.RIS (withField)
import qualified Data.List as DL
-------------------------------------------------------------
-------------------------------------------------------------
presseEnrich :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseEnrich = (withField "DA" presseDate)
. (withField "LA" presseLang)
. (map (first presseFields))
presseDate :: ByteString -> [(ByteString, ByteString)]
presseDate str = either (const []) identity $ parseOnly parseDate str
parseDate :: Parser [(ByteString, ByteString)]
parseDate = do
day <- take 2 <* "/"
mon <- take 2 <* "/"
yea <- take 4
pure $ map (first (\x -> "publication_" <> x))
[ ("day",day)
, ("month", mon)
, ("year", yea)
, ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
]
presseLang :: ByteString -> [(ByteString, ByteString)]
presseLang "Français" = [("language", "FR")]
presseLang "English" = [("language", "EN")]
presseLang _ = undefined
presseFields :: ByteString -> ByteString
presseFields champs
| champs == "AU" = "authors"
| champs == "TI" = "title"
| champs == "JF" = "source"
| champs == "DI" = "doi"
| champs == "UR" = "url"
| champs == "N2" = "abstract"
| otherwise = champs
{-
fixTitle :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixTitle ns = ns <> [ti, ab]
where
ti = case
-}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment