[DRAFT] Parser main functions, for meeting.

2cd0e36a · Alexandre Delanoë · 5c1b33ff · 2cd0e36a · 2cd0e36a · 2cd0e36a
Commit 2cd0e36a authored Dec 12, 2017 by Alexandre Delanoë
Hide whitespace changes
Inline Side-by-side

Showing with 123 additions and 60 deletions

Parsers.hs src/Data/Gargantext/Parsers.hs +97 -4

WOS.hs src/Data/Gargantext/Parsers/WOS.hs +16 -55

Main.hs src/Data/Gargantext/Types/Main.hs +10 -1

No files found.
--- a/src/Data/Gargantext/Parsers.hs
+++ b/src/Data/Gargantext/Parsers.hs
+{-|
+Module      : Data.Gargantext.Parsers
+Description : All parsers of Gargantext in one file.
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : alexandre.delanoe@iscpif.fr
+Stability   : experimental
+Portability : POSIX
+
+Gargantext enables analyzing semi-structured text that should be parsed
+in order to be analyzed.
+
+The parsers assume that the format of the text is known (FileFormat data
+type); the right parser is chosen accordingly among the list of
+available parsers.
+
+This module mainly describes how to add a new parser to Gargantext;
+please follow the types.
+-}
+
+
 module Data.Gargantext.Parsers ( module Data.Gargantext.Parsers.WOS
-                               , module Data.Gargantext.Parsers.Date
+                               --, module Data.Gargantext.Parsers.XML
+                               --, module Data.Gargantext.Parsers.DOC
+                               --, module Data.Gargantext.Parsers.ODS
                               )
-                                
    where
-import Data.Gargantext.Parsers.WOS
-import Data.Gargantext.Parsers.Date
+
+
+import Data.Attoparsec.ByteString
+import Data.ByteString (ByteString)
+import Data.Map                    as DM
+import Data.Either.Extra(Either(..))
+
+import Control.Monad (join)
+import Codec.Archive.Zip
+import Path.IO (resolveFile')
+-- import qualified Data.ByteString.Lazy as B
+import Control.Applicative ( (<$>) )
+
+
+
+import Control.Concurrent.Async as CCA (mapConcurrently)
+
+
+import Data.Gargantext.Parsers.WOS (wosParser)
+-- import Data.Gargantext.Parsers.XML (xmlParser)
+-- import Data.Gargantext.Parsers.DOC (docParser)
+-- import Data.Gargantext.Parsers.ODS (odsParser)
+
+import Data.Gargantext.Prelude
+import Data.Gargantext.Types.Main (ErrorMessage(), GargParser(), Corpus)
+
+
+-- | Supported formats of the input file.
+-- The format of the input file determines which parser is used
+-- (see 'withParser').
+data FileFormat = WOS        -- Implemented (ISI Format)
+                | XML        -- Not Implemented / see :
+                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
+                | DOC        -- Not Implemented / import Pandoc
+                | ODS        -- Not Implemented / import Pandoc
+                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
+
+
+
+-- | withParser:
+-- According to the format of the text, choose the right parser.
+--
+-- Only 'WOS' is implemented. Every other format yields a parser that
+-- returns 'Left' with an error message instead of calling 'error', so an
+-- unsupported format is reported through the 'Either' result of
+-- 'GargParser' rather than as a runtime exception.
+withParser :: FileFormat -> GargParser
+withParser WOS = wosParser
+--withParser XML = xmlParser
+--withParser DOC = docParser
+--withParser ODS = odsParser
+withParser _   = \_ -> Left "[ERROR] Parser not implemented yet"
+
+
+-- | Run the parser selected for the given format on the raw input.
+-- NOTE(review): 'parseOnly' is applied to the result of 'withParser';
+-- presumably 'GargParser' is meant to be an attoparsec parser here.
+-- Confirm against the definition of 'GargParser'.
+runParser :: FileFormat -> ByteString -> Either ErrorMessage (IO (Maybe Corpus))
+runParser format = parseOnly (withParser format)
+
+
+-- | Placeholder: not implemented yet, currently defined as 'undefined'.
+parseZip :: FilePath -> ByteString -> IO Corpus
+parseZip = undefined
+
+-- | Run the parser for format @p@ on @x@ and count what it produced:
+-- 0 when parsing fails, otherwise the 'length' of the parse result.
+-- NOTE(review): the body yields a count while the signature says
+-- 'Corpus'; confirm which one is intended.
+parseFile :: FileFormat -> ByteString -> IO Corpus
+parseFile p x = count (runParser p x)
+    where
+        -- A failed parse counts as zero.
+        count (Left  _) = pure 0
+        count (Right r) = pure $ length r
+
+
+-- | Read the content of every entry of the zip archive located at @fp@.
+-- The entry names are listed first; each entry is then fetched
+-- concurrently, reopening the archive for each of them.
+openZipFiles :: FilePath -> IO [ByteString]
+openZipFiles fp = do
+    archive <- resolveFile' fp
+    names   <- withArchive archive (DM.keys <$> getEntries)
+    mapConcurrently (\name -> withArchive archive (getEntry name)) names
+
+
+-- | Open the zip archive at @fp@ and run 'parseFile' with the 'WOS'
+-- format on each of its entries concurrently.
+wosParserTest :: FilePath -> IO [Int]
+wosParserTest fp = do
+    files <- openZipFiles fp
+    mapConcurrently (parseFile WOS) files
+
+
--- a/src/Data/Gargantext/Parsers/WOS.hs
+++ b/src/Data/Gargantext/Parsers/WOS.hs
@@ -2,53 +2,43 @@

 module Data.Gargantext.Parsers.WOS where

+-- TOFIX : Should import Data.Gargantext.Prelude here
 import Prelude hiding (takeWhile, take, concat, readFile)
+
 import qualified Data.List as DL
-import Data.Map as DM
 import Data.Attoparsec.ByteString
 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
 import Data.ByteString (ByteString)
 import Data.ByteString.Char8 (pack)

-import Data.Either.Extra(Either(..))
 import Control.Applicative

-import Control.Monad (join)
-
-- To be removed just for Tests
--
-- import Codec.Archive.LibZip (withArchive, fileNames, sourceFile, addFile)
--import Codec.Archive.LibZip.Types (ZipSource, OpenFlag (CreateFlag))

-import Control.Concurrent.Async as CCA (mapConcurrently)
+import Data.Gargantext.Types

-import Codec.Archive.Zip
-import Path.IO (resolveFile')
-- import qualified Data.ByteString.Lazy as B
-import Control.Applicative ( (<$>) )

-- type Parser a = a -> Text -> [Document]
-data ParserType = WOS | CSV
+-- | wosParser parses the ISI format of the
+-- Web Of Science database.
+-- Not implemented yet: currently defined as 'undefined'.
+wosParser :: ByteString -> IO Corpus
+wosParser = undefined

-type WosDoc = ByteString

-
-wosParser :: Parser [Maybe [WosDoc]]
-wosParser = do
+wosParser' :: Parser [Maybe [ByteString]]
+wosParser' = do
    -- TODO Warning if version /= 1.0
    -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
    _ <- manyTill anyChar (string $ pack "\nVR 1.0")
    ns <- many1 wosNotice <* (string $ pack "\nEF")
    return ns

-wosNotice :: Parser (Maybe [WosDoc])
+wosNotice :: Parser (Maybe [ByteString])
 wosNotice = startNotice *> wosFields <* endNotice
+    where
+      endNotice :: Parser [Char]
+      endNotice = manyTill anyChar (string $ pack "\nER\n")

-endNotice :: Parser [Char]
-endNotice = manyTill anyChar (string $ pack "\nER\n")
-
-startNotice :: Parser ByteString
-startNotice = "\nPT " *> takeTill isEndOfLine
+      startNotice :: Parser ByteString
+      startNotice = "\nPT " *> takeTill isEndOfLine

 field' :: Parser (ByteString, [ByteString])
 field' = do
@@ -80,7 +70,7 @@ wosFields = do
 --                    DL.lookup "URL" ws
 --                    DL.lookup "PA" ws
 --                    DL.lookup "TI" ws
--
+

 wosLines :: Parser [ByteString]
 wosLines = many line
@@ -88,32 +78,3 @@ wosLines = many line
        line :: Parser ByteString
        line = "\n  " *> takeTill isEndOfLine

-runParser :: ParserType -> ByteString -> Either String [Maybe [WosDoc]]
-runParser p x = parseOnly parser x
-    where
-        parser = case p of 
-                  WOS -> wosParser
-                  _   -> error "Not implemented yet"
-
-- isTokenChar :: Word8 -> Bool
-- isTokenChar = inClass "!#$%&'()*+./0-9:<=>?@a-zA-Z[]^_`{|}~-\n"
-
-
-zipFiles :: FilePath -> IO [ByteString]
-zipFiles fp = do
-    path    <- resolveFile' fp
-    entries <- withArchive path (DM.keys <$> getEntries)
-    bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
-    pure bs
-
-
-parseFile :: ParserType -> ByteString -> IO Int
-parseFile p x = case runParser p x of
-        Left  _ -> pure 0
-        Right r -> pure $ length r
-
-parseWos :: FilePath -> IO [Int]
-parseWos fp = join $ mapConcurrently (parseFile WOS) <$> zipFiles fp
-
-
-
--- a/src/Data/Gargantext/Types/Main.hs
+++ b/src/Data/Gargantext/Types/Main.hs
@@ -5,6 +5,8 @@
 module Data.Gargantext.Types.Main where

 import Protolude (fromMaybe)
+
+import Data.ByteString (ByteString())
 import Data.Text (Text)
 import Data.Time (UTCTime)
 import Data.Gargantext.Types.Node ( NodePoly
@@ -17,7 +19,8 @@ import Data.Gargantext.Types.Node ( NodePoly
                               )


-
+-- | Language of a Text
+-- For simplicity, we assume that a Text is written in a single (homogeneous) language
 data Language = EN | FR -- | DE | IT | SP
    -- > EN == english
    -- > FR == french
@@ -28,6 +31,12 @@ data Language = EN | FR -- | DE | IT | SP

 type Ngrams = (Text, Text, Text)

+type ErrorMessage = String
+
+-- | Type of a parser: turns raw text into a 'Corpus' or an 'ErrorMessage'
+type GargParser = ByteString -> Either ErrorMessage Corpus
+
+
 -- | TODO add Symbolic Node / Document
 --   TODO make instances of Nodes