WOS.hs 2.43 KB
Newer Older
1
{-|
2
Module      : Gargantext.Text.Parsers.WOS
3 4 5 6 7 8 9 10 11 12 13 14
Description : 
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Here is a longer description of this module, containing some
commentary with @some markup@.
-}

{-# LANGUAGE NoImplicitPrelude #-}
15 16
{-# LANGUAGE OverloadedStrings #-}

17
module Gargantext.Text.Parsers.WOS (wosParser) where
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85

-- TOFIX : Should import Gargantext.Prelude here
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)

import qualified Data.List as DL

import Data.Monoid ((<>))
import Data.Attoparsec.ByteString (Parser, try, string
                                  , takeTill, take
                                  , manyTill, many1)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat)
import Data.ByteString.Char8 (pack)

import Control.Applicative

--import Gargantext.Types

-- | wosParser parses ISI format from
-- Web Of Science Database
wosParser :: Parser [[(ByteString, ByteString)]]
wosParser = do
    -- TODO Warning if version /= 1.0
    -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
    _  <- manyTill anyChar (string $ pack "\nVR 1.0")
    ns <- many1 notice <*  (string $ pack "\nEF"    )
    pure ns

notice :: Parser [(ByteString, ByteString)]
notice = start *> fields <* end
    where
      start :: Parser ByteString
      start = "\nPT " *> takeTill isEndOfLine
      
      end :: Parser [Char]
      end = manyTill anyChar (string $ pack "\nER\n")


fields :: Parser [(ByteString, ByteString)]
fields = many field
    where
        field :: Parser (ByteString, ByteString)
        field = do
            name  <- "\n" *> take 2 <* " "
            txt   <- takeTill isEndOfLine
            txts  <- try lines
            let txts' = case DL.length txts > 0 of
                    True  -> txts
                    False -> []
            pure (translate name, concat ([txt] <> txts'))


lines :: Parser [ByteString]
lines = many line
    where
        line :: Parser ByteString
        line = "\n  " *> takeTill isEndOfLine

translate :: ByteString -> ByteString
translate champs
            | champs == "AU" = "author"
            | champs == "TI" = "title"
            | champs == "SO" = "source"
            | champs == "DI" = "doi"
            | champs == "PD" = "publication_date"
            | champs == "AB" = "abstract"
            | otherwise  = champs