WOS.hs 1.85 KB
Newer Older
1
{-|
2
Module      : Gargantext.Text.Corpus.Parsers.WOS
3 4 5 6 7 8 9 10 11 12 13 14
Description : 
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Here is a longer description of this module, containing some
commentary with @some markup@.
-}

{-# LANGUAGE NoImplicitPrelude #-}
15 16
{-# LANGUAGE OverloadedStrings #-}

17
module Gargantext.Text.Corpus.Parsers.WOS (parser, keys) where
18

19 20
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, string, takeTill, take, manyTill, many1)
21
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
22
import Data.ByteString (ByteString)
23
import Data.ByteString.Char8 (pack)
24
import Gargantext.Text.Corpus.Parsers.RIS (fieldWith)
25
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
26

27
-------------------------------------------------------------
28 29
-- | wosParser parses ISI format from
-- Web Of Science Database
30 31
parser :: Parser [[(ByteString, ByteString)]]
parser = do
32 33 34 35 36 37 38
    -- TODO Warning if version /= 1.0
    -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
    _  <- manyTill anyChar (string $ pack "\nVR 1.0")
    ns <- many1 notice <*  (string $ pack "\nEF"    )
    pure ns

notice :: Parser [(ByteString, ByteString)]
39
notice = start *> many (fieldWith field) <* end
40
    where
41 42 43
      field :: Parser ByteString
      field = "\n" *> take 2 <* " "

44 45
      start :: Parser ByteString
      start = "\nPT " *> takeTill isEndOfLine
46

47 48 49 50
      end :: Parser [Char]
      end = manyTill anyChar (string $ pack "\nER\n")


51 52
keys :: ByteString -> ByteString
keys champs
53
            | champs == "AF" = "authors"
54 55 56 57 58 59
            | champs == "TI" = "title"
            | champs == "SO" = "source"
            | champs == "DI" = "doi"
            | champs == "PD" = "publication_date"
            | champs == "AB" = "abstract"
            | otherwise  = champs