Parsers.hs 6.42 KB
Newer Older
1
{-|
2
Module      : Gargantext.Text.Parsers
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed.

The parsers suppose we know the format of the Text (TextFormat data
type) according to which the right parser is chosen among the list of
available parsers.

This module mainly describe how to add a new parser to Gargantext,
please follow the types.
-}

21
{-# LANGUAGE NoImplicitPrelude #-}
22
{-# LANGUAGE PackageImports    #-}
23
{-# LANGUAGE OverloadedStrings #-}
24

25
module Gargantext.Text.Parsers (FileFormat(..), clean, parseFile)
26 27
    where

28
--import Data.ByteString (ByteString)
29
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
30
import Control.Concurrent.Async as CCA (mapConcurrently)
31
import Control.Monad (join)
32
import qualified Data.ByteString.Char8 as DBC
33 34
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
35 36
import Data.Either.Extra (partitionEithers)
import Data.List (concat)
37
import Data.List (lookup)
38
import Data.Ord()
39
import Data.String (String())
40
import Data.String()
41
import Data.Text (Text)
42
import Data.Text.Encoding (decodeUtf8)
43
import Data.Tuple.Extra (both, first, second)
44 45 46 47
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.Map        as DM
import qualified Data.Text as DT
48
import Gargantext.Core (Lang(..))
49
import Gargantext.Prelude
50
import Gargantext.Database.Types.Node (HyperdataDocument(..))
51 52
import qualified Gargantext.Text.Parsers.WOS as WOS
import qualified Gargantext.Text.Parsers.RIS as RIS
53
import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
54
import qualified Gargantext.Text.Parsers.Date as Date
55
import Gargantext.Text.Parsers.CSV (parseHal)
56
import Gargantext.Text.Terms.Stop (detectLang)
57 58
------------------------------------------------------------------------

59
type ParseError = String
60 61 62 63 64 65
--type Field      = Text
--type Document   = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
--                              , _fileParsed_result :: [Document]
--                              } deriving (Show)
66

67 68 69

-- | According to the format of Input file,
-- different parser are available.
70 71
data FileFormat = WOS | RIS | RisPresse
                | CsvGargV3 | CsvHalFormat
72 73 74
  deriving (Show)

-- Implemented (ISI Format)
75 76 77 78 79
--                | DOC        -- Not Implemented / import Pandoc
--                | ODT        -- Not Implemented / import Pandoc
--                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
--                | XML        -- Not Implemented / see :

80

81 82 83 84
{-
parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
parseFormat = undefined
-}
85

86 87
-- | Parse file into documents
-- TODO manage errors here
88
-- TODO: to debug maybe add the filepath in error message
Alexandre Delanoë's avatar
Alexandre Delanoë committed
89 90
parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseFile CsvHalFormat p = parseHal p
91 92 93
parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
parseFile WOS       p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
parseFile ff        p = join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff p
94 95

toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
96 97
-- TODO use language for RIS
toDoc ff d = do
98 99 100 101 102
      let abstract = lookup "abstract" d
      let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
      
      let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
      
103
      (utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang  dateToParse
104

105
      pure $ HyperdataDocument (Just $ DT.pack $ show ff)
106 107 108 109
                               (lookup "doi" d)
                               (lookup "URL" d)
                                Nothing
                                Nothing
110
                                Nothing
111
                               (lookup "title" d)
112
                                Nothing
113 114 115 116 117 118 119 120 121 122 123 124
                               (lookup "authors" d)
                               (lookup "source" d)
                               (lookup "abstract" d)
                               (fmap (DT.pack . show) utcTime)
                               (pub_year)
                               (pub_month)
                               (pub_day)
                               Nothing
                               Nothing
                               Nothing
                               (Just $ (DT.pack . show) lang)

125 126 127 128 129 130
enrichWith :: FileFormat
           ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS       = enrichWith' (map (first WOS.keys))
enrichWith _         = enrichWith' identity

131

132 133 134
enrichWith' ::       ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
           ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
135 136 137
  where
    both'   = map (both decodeUtf8)

138
readFileWith :: FileFormat -> FilePath
139
       -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
140
readFileWith format path = do
141 142
    files <- case takeExtension path of
              ".zip" -> openZip              path
143
              _      -> pure <$> clean <$> DB.readFile path
144 145
    partitionEithers <$> mapConcurrently (runParser format) files

146 147

-- | withParser:
148
-- According to the format of the text, choose the right parser.
149 150
-- TODO  withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
151 152
withParser WOS = WOS.parser
withParser RIS = RIS.parser
153 154
--withParser ODT = odtParser
--withParser XML = xmlParser
Alexandre Delanoë's avatar
Alexandre Delanoë committed
155
withParser _   = panic "[ERROR] Parser not implemented yet"
156

157
runParser :: FileFormat -> DB.ByteString
158 159 160 161 162 163 164 165
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text

openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
    entries <- withArchive fp (DM.keys <$> getEntries)
    bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
    pure bs
166

167 168
clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' txt
169 170
  where
    clean' '’' = '\''
171
    clean' '\r' = ' '
172
    clean' c  = c