Parsers.hs 6.13 KB
Newer Older
1
{-|
2
Module      : Gargantext.Text.Parsers
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed.

The parsers suppose we know the format of the Text (TextFormat data
type) according to which the right parser is chosen among the list of
available parsers.

This module mainly describe how to add a new parser to Gargantext,
please follow the types.
-}

21
{-# LANGUAGE NoImplicitPrelude #-}
22
{-# LANGUAGE PackageImports    #-}
23
{-# LANGUAGE OverloadedStrings #-}
24

25
module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
26 27
    where

28
import System.FilePath (FilePath(), takeExtension)
29
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
30

31 32
import Control.Monad (join)
import qualified Data.Time as DT
33
import Data.Either.Extra (partitionEithers)
34
import Data.Time (UTCTime(..))
35
import Data.List (concat)
36
import qualified Data.Map        as DM
37
import qualified Data.ByteString as DB
38 39
import Data.Ord()
import Data.String()
40 41
import Data.Either(Either(..))
import Data.Attoparsec.ByteString (parseOnly, Parser)
42 43

import Data.Text (Text)
44
import qualified Data.Text as DT
45 46

-- Activate Async for to parse in parallel
47
import Control.Concurrent.Async as CCA (mapConcurrently)
48

49
import Data.Text.Encoding (decodeUtf8)
50
import Data.String (String())
51
import Data.List (lookup)
52

53
------------------------------------------------------------------------
54
import Gargantext.Core (Lang(..))
55
import Gargantext.Prelude
56
import Gargantext.Database.Types.Node (HyperdataDocument(..))
57
import Gargantext.Text.Parsers.WOS (wosParser)
58
import Gargantext.Text.Parsers.Date (parseDate)
59
import Gargantext.Text.Parsers.CSV (parseHal)
60
import Gargantext.Text.Terms.Stop (detectLang)
61 62
------------------------------------------------------------------------

63
type ParseError = String
64 65 66 67 68 69
--type Field      = Text
--type Document   = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
--                              , _fileParsed_result :: [Document]
--                              } deriving (Show)
70

71 72 73

-- | According to the format of Input file,
-- different parser are available.
74
data FileFormat = WOS | CsvHalFormat -- | CsvGargV3
75 76 77
  deriving (Show)

-- Implemented (ISI Format)
78 79 80 81 82 83
--                | DOC        -- Not Implemented / import Pandoc
--                | ODT        -- Not Implemented / import Pandoc
--                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
--                | XML        -- Not Implemented / see :
--                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml

84 85 86
-- TODO: to debug maybe add the filepath in error message


87 88 89
-- | Parse file into documents
-- TODO manage errors here
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
90 91
parseDocs WOS    path = join $ mapM (toDoc WOS) <$> snd <$> parse WOS path
parseDocs CsvHalFormat p = parseHal p
92

93
type Year  = Int
94 95 96 97 98 99 100 101 102 103 104
type Month = Int
type Day   = Int

-- | Parse date to Ints
-- TODO add hours, minutes and seconds
parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
parseDate' _ Nothing    = pure (Nothing, (Nothing, Nothing, Nothing))
parseDate' l (Just txt) = do
  utcTime <- parseDate l txt
  let (UTCTime day _) = utcTime
  let (y,m,d) = DT.toGregorian day
105
  pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
106 107 108


toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
109
toDoc WOS d = do
110 111 112 113 114 115 116
      let abstract = lookup "abstract" d
      let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
      
      let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
      
      (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang  dateToParse

117
      pure $ HyperdataDocument (Just $ DT.pack $ show WOS)
118 119 120 121
                               (lookup "doi" d)
                               (lookup "URL" d)
                                Nothing
                                Nothing
122
                                Nothing
123
                               (lookup "title" d)
124
                                Nothing
125 126 127 128 129 130 131 132 133 134 135
                               (lookup "authors" d)
                               (lookup "source" d)
                               (lookup "abstract" d)
                               (fmap (DT.pack . show) utcTime)
                               (pub_year)
                               (pub_month)
                               (pub_day)
                               Nothing
                               Nothing
                               Nothing
                               (Just $ (DT.pack . show) lang)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
136
toDoc _ _ = undefined
137

138 139 140 141 142 143 144 145 146 147 148 149 150
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
    files <- case takeExtension path of
              ".zip" -> openZip              path
              _      -> pure <$> DB.readFile path
    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
    pure (as, map toText $ concat bs)
      where
        -- TODO : decode with bayesian inference on encodings
        toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))


-- | withParser:
151
-- According to the format of the text, choose the right parser.
152 153 154 155 156 157
-- TODO  withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = wosParser
--withParser DOC = docParser
--withParser ODT = odtParser
--withParser XML = xmlParser
Alexandre Delanoë's avatar
Alexandre Delanoë committed
158
withParser _   = panic "[ERROR] Parser not implemented yet"
159

160
runParser :: FileFormat -> DB.ByteString
161 162 163 164 165 166 167 168
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text

openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
    entries <- withArchive fp (DM.keys <$> getEntries)
    bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
    pure bs
169

170 171 172 173 174 175
clean :: Text -> Text
clean txt = DT.map clean' txt
  where
    clean' '’' = '\''
    clean' c  = c