Commit 2cd0e36a authored by Alexandre Delanoë

[DRAFT] Parser main functions, for meeting.

parent 5c1b33ff
{-|
Module : Data.Gargantext.Parsers
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : alexandre.delanoe@iscpif.fr
Stability : experimental
Portability : POSIX
Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed.
The parsers assume that the format of the text is known (see the
FileFormat data type); the right parser is chosen among the list of
available parsers according to that format.
This module mainly describes how to add a new parser to Gargantext;
please follow the types.
-}
module Data.Gargantext.Parsers ( module Data.Gargantext.Parsers.WOS
, module Data.Gargantext.Parsers.Date
--, module Data.Gargantext.Parsers.XML
--, module Data.Gargantext.Parsers.DOC
--, module Data.Gargantext.Parsers.ODS
)
where
import Data.Gargantext.Parsers.WOS
import Data.Gargantext.Parsers.Date
import Data.Attoparsec.ByteString
import Data.ByteString (ByteString)
import Data.Map as DM
import Data.Either.Extra(Either(..))
import Control.Monad (join)
import Codec.Archive.Zip
import Path.IO (resolveFile')
-- import qualified Data.ByteString.Lazy as B
import Control.Applicative ( (<$>) )
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Gargantext.Parsers.WOS (wosParser)
-- import Data.Gargantext.Parsers.XML (xmlParser)
-- import Data.Gargantext.Parsers.DOC (docParser)
-- import Data.Gargantext.Parsers.ODS (odsParser)
import Data.Gargantext.Prelude
import Data.Gargantext.Types.Main (ErrorMessage(), GargParser(), Corpus)
-- | Input file formats for which a parser exists or is planned.
-- Only 'WOS' is mapped to a parser by 'withParser' so far.
data FileFormat = WOS -- ^ Implemented (ISI format).
| XML -- ^ Not implemented; see:
-- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
| DOC -- ^ Not implemented (idea: import through Pandoc).
| ODS -- ^ Not implemented (idea: import through Pandoc).
| PDF -- ^ Not implemented (idea: pdftotext, then import through Pandoc?).
-- | Select the parser that matches a 'FileFormat'.
--
-- Only 'WOS' is wired to a parser; every other format aborts with 'error'.
withParser :: FileFormat -> GargParser
withParser format = case format of
  WOS -> wosParser
  -- XML -> xmlParser
  -- DOC -> docParser
  -- ODS -> odsParser
  _   -> error "[ERROR] Parser not implemented yet"
-- | Run the parser selected for the given 'FileFormat' on the raw input.
--
-- NOTE(review): 'parseOnly' expects an attoparsec @Parser@, whereas
-- 'withParser' returns a 'GargParser', which "Data.Gargantext.Types.Main"
-- declares as the function type @ByteString -> Either ErrorMessage Corpus@.
-- The announced result @IO (Maybe Corpus)@ also differs from that alias.
-- This is draft code: confirm the intended types before relying on it.
runParser :: FileFormat -> ByteString -> Either ErrorMessage (IO (Maybe Corpus))
runParser format text = parseOnly (withParser format) text
-- | Placeholder: the body is 'undefined', so evaluating it fails at runtime.
--
-- TODO(review): presumably meant to parse the content of a zip archive into
-- a 'Corpus' — confirm the role of both arguments once implemented.
parseZip :: FilePath -> ByteString -> IO Corpus
parseZip = undefined
-- | Parse the content of one file with the parser for the given format.
--
-- NOTE(review): both branches produce numeric values (@0@ when parsing
-- fails, @length r@ otherwise) although the signature announces @IO Corpus@,
-- and 'wosParserTest' collects the results as @[Int]@. Confirm the intended
-- result type.
parseFile :: FileFormat -> ByteString -> IO Corpus
parseFile p x = case runParser p x of
-- A parse failure is reported as 0; the error message is discarded.
Left _ -> pure 0
Right r -> pure $ length r
-- | Return the content of every entry of the zip archive found at the
-- given path. The archive is listed once, then each entry is fetched
-- through 'mapConcurrently', re-opening the archive for each of them.
openZipFiles :: FilePath -> IO [ByteString]
openZipFiles fp = do
  archivePath <- resolveFile' fp
  selectors <- withArchive archivePath (fmap DM.keys getEntries)
  mapConcurrently (withArchive archivePath . getEntry) selectors
-- | Open the zip archive at the given path and run 'parseFile' with the
-- 'WOS' format on every entry concurrently, yielding one result per entry.
wosParserTest :: FilePath -> IO [Int]
wosParserTest fp = openZipFiles fp >>= mapConcurrently (parseFile WOS)
......@@ -2,53 +2,43 @@
module Data.Gargantext.Parsers.WOS where
-- TOFIX : Should import Data.Gargantext.Prelude here
import Prelude hiding (takeWhile, take, concat, readFile)
import qualified Data.List as DL
import Data.Map as DM
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString)
import Data.ByteString.Char8 (pack)
import Data.Either.Extra(Either(..))
import Control.Applicative
import Control.Monad (join)
-- To be removed just for Tests
--
-- import Codec.Archive.LibZip (withArchive, fileNames, sourceFile, addFile)
--import Codec.Archive.LibZip.Types (ZipSource, OpenFlag (CreateFlag))
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Gargantext.Types
import Codec.Archive.Zip
import Path.IO (resolveFile')
-- import qualified Data.ByteString.Lazy as B
import Control.Applicative ( (<$>) )
-- type Parser a = a -> Text -> [Document]
-- | Formats selectable through 'runParser'; only 'WOS' is mapped to a
-- parser there, any other constructor ends in 'error'.
data ParserType = WOS | CSV
-- | Intended to parse the ISI format exported by the Web of Science
-- database. Currently a placeholder: the body is 'undefined', so
-- evaluating it fails at runtime.
wosParser :: ByteString -> IO Corpus
wosParser = undefined
type WosDoc = ByteString
wosParser :: Parser [Maybe [WosDoc]]
wosParser = do
-- | Parse a whole WOS export: skip everything up to the version marker,
-- read one or more notices, then require the end-of-file marker.
wosParser' :: Parser [Maybe [ByteString]]
wosParser' =
  -- TODO Warning if version /= 1.0
  -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
  manyTill anyChar (string (pack "\nVR 1.0"))
    *> many1 wosNotice
    <* string (pack "\nEF")
-- | Parse a single notice: its opening line, then its fields (through
-- 'wosFields'), then everything up to the notice terminator.
--
-- The result is whatever 'wosFields' returns; the opening line and the
-- trailing characters are consumed and discarded.
wosNotice :: Parser (Maybe [ByteString])
wosNotice = startNotice *> wosFields <* endNotice
  where
    -- Consume any remaining characters up to and including "\nER\n".
    endNotice :: Parser [Char]
    endNotice = manyTill anyChar (string $ pack "\nER\n")

    -- A notice opens with "\nPT " followed by the rest of that line.
    startNotice :: Parser ByteString
    startNotice = "\nPT " *> takeTill isEndOfLine
field' :: Parser (ByteString, [ByteString])
field' = do
......@@ -80,7 +70,7 @@ wosFields = do
-- DL.lookup "URL" ws
-- DL.lookup "PA" ws
-- DL.lookup "TI" ws
--
-- | Parse zero or more consecutive lines, each recognised by 'line'.
wosLines :: Parser [ByteString]
wosLines = many line
......@@ -88,32 +78,3 @@ wosLines = many line
-- | Parse one line: match the literal prefix below, then return the bytes
-- that follow, up to (but not including) the next end-of-line character.
line :: Parser ByteString
line = "\n " *> takeTill isEndOfLine
-- | Run the parser associated with the given 'ParserType' over the whole
-- input. Only 'WOS' is supported; any other type ends in 'error'.
runParser :: ParserType -> ByteString -> Either String [Maybe [WosDoc]]
runParser p x = parseOnly (selectParser p) x
  where
    selectParser WOS = wosParser
    selectParser _ = error "Not implemented yet"
-- isTokenChar :: Word8 -> Bool
-- isTokenChar = inClass "!#$%&'()*+./0-9:<=>?@a-zA-Z[]^_`{|}~-\n"
-- | Return the content of every entry of the zip archive found at the
-- given path. The archive is listed once, then each entry is fetched
-- through 'mapConcurrently', re-opening the archive for each of them.
zipFiles :: FilePath -> IO [ByteString]
zipFiles fp = do
  archivePath <- resolveFile' fp
  selectors <- withArchive archivePath (fmap DM.keys getEntries)
  mapConcurrently (withArchive archivePath . getEntry) selectors
-- | Parse one file's content and report how many top-level results the
-- parser produced; a parse failure is reported as 0 and its message is
-- discarded.
parseFile :: ParserType -> ByteString -> IO Int
parseFile p x = either (const (pure 0)) (pure . length) (runParser p x)
-- | Open the zip archive at the given path and run 'parseFile' with the
-- 'WOS' parser on every entry concurrently, yielding one count per entry.
parseWos :: FilePath -> IO [Int]
parseWos fp = zipFiles fp >>= mapConcurrently (parseFile WOS)
......@@ -5,6 +5,8 @@
module Data.Gargantext.Types.Main where
import Protolude (fromMaybe)
import Data.ByteString (ByteString())
import Data.Text (Text)
import Data.Time (UTCTime)
import Data.Gargantext.Types.Node ( NodePoly
......@@ -17,7 +19,8 @@ import Data.Gargantext.Types.Node ( NodePoly
)
-- | Language of a text.
-- For simplicity, a text is assumed to be written in a single language.
data Language = EN | FR -- TODO: DE, IT, SP
-- > EN == English
-- > FR == French
......@@ -28,6 +31,12 @@ data Language = EN | FR -- | DE | IT | SP
-- | A triple of 'Text' values.
-- NOTE(review): the meaning of each component is not stated here — confirm.
type Ngrams = (Text, Text, Text)
-- | Error description, carried as a plain 'String'.
type ErrorMessage = String
-- | Parser of texts: takes the raw bytes and yields either an
-- 'ErrorMessage' or a 'Corpus'.
type GargParser = ByteString -> Either ErrorMessage Corpus
-- | TODO add Symbolic Node / Document
-- TODO make instances of Nodes
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment