Presse.hs 2.22 KB
{-|
Module      : Gargantext.Text.Parsers.RIS.Presse
Description : RIS parser for the Europresse database
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Presse RIS format parser for Europresse Database.

-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}

module Gargantext.Text.Parsers.RIS.Presse (presseEnrich) where

import Data.List (lookup)
import Data.Either (either)
import Data.Tuple.Extra (first, both, uncurry)
import Data.Attoparsec.ByteString (parseOnly)
import Data.ByteString (ByteString, length)
import Gargantext.Prelude hiding (takeWhile, take, length)
import Gargantext.Text.Parsers.RIS (onField)
import Gargantext.Core (Lang(..))
import qualified Gargantext.Text.Parsers.Date.Attoparsec as Date



-- | Enrich a list of @(tag, value)@ pairs read from a Europresse RIS export.
--
-- The steps run in this order:
--
--   1. 'fixFields' renames the raw RIS tags,
--   2. 'parseLang' is handed to 'onField' for the @LA@ tag,
--   3. 'parseDate' is handed to 'onField' for the @DA@ tag.
presseEnrich :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseEnrich fields = onField "DA" parseDate
                    $ onField "LA" parseLang
                    $ fixFields fields
             

-- | Run 'Date.parserWith' (given @"/"@, presumably the date separator —
-- confirm against "Gargantext.Text.Parsers.Date.Attoparsec") over the raw
-- field value.
--
-- A parse failure is not reported to the caller: the error is discarded and
-- an empty list is returned instead.
parseDate :: ByteString -> [(ByteString, ByteString)]
parseDate = either (const []) identity . parseOnly (Date.parserWith "/")

-- | Normalise the language name found in a record.
--
-- @Français@ and @English@ are replaced by the 'show' rendering of 'FR' and
-- 'EN' respectively; any other value is kept unchanged.  In every case the
-- result is a single pair stored under 'langField'.
--
-- The French name is spelled out as raw bytes on purpose.  With
-- @OverloadedStrings@ a 'ByteString' literal is built by truncating each
-- 'Char' to one byte, so the literal @"Français"@ only denotes the Latin-1
-- encoding (@ç@ = byte 231) and never matches UTF-8 input, where @ç@ is the
-- two bytes 195 167.  Both encodings are accepted here.
parseLang :: ByteString -> [(ByteString, ByteString)]
parseLang "Fran\231ais"     = [(langField, cs $ show FR)]  -- Latin-1 bytes
parseLang "Fran\195\167ais" = [(langField, cs $ show FR)]  -- UTF-8 bytes
parseLang "English"         = [(langField, cs $ show EN)]
parseLang x                 = [(langField, x)]

-- | Key under which 'parseLang' stores the language value.
langField :: ByteString
langField = "language"


-- | Rename raw RIS tags to descriptive field names; values are left untouched.
--
-- @AU@, @JF@, @DI@ and @UR@ always become @authors@, @source@, @doi@ and
-- @url@.  The names given to @TI@ and @N2@ are decided once for the whole
-- list: when the first @TI@ value is strictly longer (in bytes) than the
-- first @N2@ value, @TI@ becomes @abstract@ and @N2@ becomes @title@;
-- otherwise — including when either tag is missing — @TI@ becomes @title@
-- and @N2@ becomes @abstract@.  Any other tag is kept as is.
fixFields :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixFields ns = map (first rename) ns
  where
    -- Tag-renaming function selected for this list.
    -- The title is sometimes longer than the abstract, in which case the
    -- two names are swapped.
    rename :: ByteString -> ByteString
    rename
      | tiIsLonger == Just True = renameWith "abstract" "title"
      | otherwise               = renameWith "title"    "abstract"

    -- Nothing when TI or N2 is absent from the list.
    tiIsLonger :: Maybe Bool
    tiIsLonger = uncurry (>) . both length <$> tiAndN2

    -- First TI value paired with the first N2 value.
    tiAndN2 :: Maybe (ByteString, ByteString)
    tiAndN2 = (,) <$> lookup "TI" ns <*> lookup "N2" ns

    -- Rename one tag, given the names to use for TI and for N2.
    renameWith :: ByteString -> ByteString
               -> ByteString -> ByteString
    renameWith tiName n2Name tag = case tag of
      "AU" -> "authors"
      "TI" -> tiName
      "JF" -> "source"
      "DI" -> "doi"
      "UR" -> "url"
      "N2" -> n2Name
      _    -> tag