WOS.hs 2.52 KB
Newer Older
1
{-|
2
Module      : Gargantext.Text.Parsers.WOS
3 4 5 6 7 8 9 10 11 12 13 14
Description : 
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Here is a longer description of this module, containing some
commentary with @some markup@.
-}

{-# LANGUAGE NoImplicitPrelude #-}
15 16
{-# LANGUAGE OverloadedStrings #-}

17
module Gargantext.Text.Parsers.WOS (wosParser) where
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32

-- TOFIX : Should import Gargantext.Prelude here
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)

import qualified Data.List as DL

import Data.Monoid ((<>))
import Data.Attoparsec.ByteString (Parser, try, string
                                  , takeTill, take
                                  , manyTill, many1)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat)
import Data.ByteString.Char8 (pack)
import Control.Applicative

33 34 35 36
-------------------------------------------------------------



37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52

-- | wosParser parses ISI format from
-- Web Of Science Database
wosParser :: Parser [[(ByteString, ByteString)]]
wosParser = do
    -- TODO Warning if version /= 1.0
    -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
    _  <- manyTill anyChar (string $ pack "\nVR 1.0")
    ns <- many1 notice <*  (string $ pack "\nEF"    )
    pure ns

notice :: Parser [(ByteString, ByteString)]
notice = start *> fields <* end
    where
      start :: Parser ByteString
      start = "\nPT " *> takeTill isEndOfLine
53

54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
      end :: Parser [Char]
      end = manyTill anyChar (string $ pack "\nER\n")


fields :: Parser [(ByteString, ByteString)]
fields = many field
    where
        field :: Parser (ByteString, ByteString)
        field = do
            name  <- "\n" *> take 2 <* " "
            txt   <- takeTill isEndOfLine
            txts  <- try lines
            let txts' = case DL.length txts > 0 of
                    True  -> txts
                    False -> []
            pure (translate name, concat ([txt] <> txts'))


lines :: Parser [ByteString]
lines = many line
    where
        line :: Parser ByteString
        line = "\n  " *> takeTill isEndOfLine

translate :: ByteString -> ByteString
translate champs
80
            | champs == "AF" = "authors"
81 82 83 84 85 86
            | champs == "TI" = "title"
            | champs == "SO" = "source"
            | champs == "DI" = "doi"
            | champs == "PD" = "publication_date"
            | champs == "AB" = "abstract"
            | otherwise  = champs
87
-------------------------------------------------------------
88