Parsers.hs 3.67 KB
Newer Older
1
{-|
2
Module      : Gargantext.Text.Parsers
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed.

The parsers suppose we know the format of the Text (TextFormat data
type) according to which the right parser is chosen among the list of
available parsers.

This module mainly describe how to add a new parser to Gargantext,
please follow the types.
-}

21 22
{-# LANGUAGE NoImplicitPrelude #-}

23
module Gargantext.Text.Parsers -- (parse, FileFormat(..))
24 25
    where

26 27
import System.FilePath (FilePath(), takeExtension)
import Codec.Archive.Zip (withArchive, getEntry, getEntries)
28

29 30
import Data.Either.Extra (partitionEithers)
import Data.List (concat)
31
import qualified Data.Map        as DM
32
import qualified Data.ByteString as DB
33 34
import Data.Ord()
import Data.String()
35 36
import Data.Either(Either(..))
import Data.Attoparsec.ByteString (parseOnly, Parser)
37 38

import Data.Text (Text)
39
import qualified Data.Text as DT
40 41

-- Activate Async for to parse in parallel
42
import Control.Concurrent.Async as CCA (mapConcurrently)
43

44
import Data.Text.Encoding (decodeUtf8)
45
import Data.String (String())
46

47 48 49 50 51
------------------------------------------------------------------------
import Gargantext.Prelude
import Gargantext.Text.Parsers.WOS (wosParser)
------------------------------------------------------------------------

52

53
type ParseError = String
54 55 56 57 58 59 60
type Field      = Text
type Document   = DM.Map Field Text

type FilesParsed = DM.Map FilePath FileParsed
data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
                              , _fileParsed_result :: [Document]
                              } deriving (Show)
61

62 63 64 65 66 67 68 69 70 71

-- | According to the format of Input file,
-- different parser are available.
data FileFormat = WOS        -- Implemented (ISI Format)
--                | DOC        -- Not Implemented / import Pandoc
--                | ODT        -- Not Implemented / import Pandoc
--                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
--                | XML        -- Not Implemented / see :
--                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml

72 73 74
-- TODO: to debug maybe add the filepath in error message


75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
    files <- case takeExtension path of
              ".zip" -> openZip              path
              _      -> pure <$> DB.readFile path
    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
    pure (as, map toText $ concat bs)
      where
        -- TODO : decode with bayesian inference on encodings
        toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))


-- | withParser:
-- According the format of the text, choosing the right parser.
-- TODO  withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = wosParser
--withParser DOC = docParser
--withParser ODT = odtParser
--withParser XML = xmlParser
--withParser _   = error "[ERROR] Parser not implemented yet"

runParser :: FileFormat -> DB.ByteString 
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text

openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
    entries <- withArchive fp (DM.keys <$> getEntries)
    bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
    pure bs
106

107 108 109 110 111 112
clean :: Text -> Text
clean txt = DT.map clean' txt
  where
    clean' '’' = '\''
    clean' c  = c

113