[FEAT/TYPES] Parsers main types.

746cd19d · Alexandre Delanoë · 2cd0e36a · 746cd19d · 746cd19d · 746cd19d
Commit 746cd19d authored Jan 12, 2018 by Alexandre Delanoë
6 changed files
--- a/moduleDesc.model
+++ b/moduleDesc.model
+{-|
+Module      : Data.Gargantext...
+Description : Short description
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+Here is a longer description of this module, containing some
+commentary with @some markup@.
+-}
--- a/src/Data/Gargantext/Parsers.hs
+++ b/src/Data/Gargantext/Parsers.hs
@@ -3,98 +3,84 @@ Module      : Data.Gargantext.Parsers
 Description : All parsers of Gargantext in one file.
 Copyright   : (c) CNRS, 2017
 License     : AGPL + CECILL v3
-Maintainer  : alexandre.delanoe@iscpif.fr
+Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX
 Gargantext enables analyzing semi-structured text that should be parsed
 in order to be analyzed.
-The parsers suppose, we know the format of the Text (TextFormat data
+The parsers suppose we know the format of the Text (TextFormat data
-type) according which the right parser is chosen among the list of
+type) according to which the right parser is chosen among the list of
 available parsers.
 This module mainly describe how to add a new parser to Gargantext,
 please follow the types.
 -}
+module Data.Gargantext.Parsers -- (parse, FileFormat(..))
-module Data.Gargantext.Parsers ( module Data.Gargantext.Parsers.WOS
-                               --, module Data.Gargantext.Parsers.XML
-                               --, module Data.Gargantext.Parsers.DOC
-                               --, module Data.Gargantext.Parsers.ODS
-                               )
    where
+import System.FilePath (takeExtension)
-import Data.Attoparsec.ByteString
+import Data.Attoparsec.ByteString (parseOnly, Parser)
-import Data.ByteString (ByteString)
+import Data.ByteString as DB
 import Data.Map                    as DM
-import Data.Either.Extra(Either(..))
+----import Data.Either.Extra(Either(..))
+----
-import Control.Monad (join)
+--import Control.Monad (join)
 import Codec.Archive.Zip
 import Path.IO (resolveFile')
-- import qualified Data.ByteString.Lazy as B
+------ import qualified Data.ByteString.Lazy as B
-import Control.Applicative ( (<$>) )
+--import Control.Applicative ( (<$>) )
 import Control.Concurrent.Async as CCA (mapConcurrently)
 import Data.Gargantext.Parsers.WOS (wosParser)
-- import Data.Gargantext.Parsers.XML (xmlParser)
+---- import Data.Gargantext.Parsers.XML (xmlParser)
-- import Data.Gargantext.Parsers.DOC (docParser)
+---- import Data.Gargantext.Parsers.DOC (docParser)
-- import Data.Gargantext.Parsers.ODS (odsParser)
+---- import Data.Gargantext.Parsers.ODS (odsParser)
-import Data.Gargantext.Prelude
+--import Data.Gargantext.Prelude (pm)
-import Data.Gargantext.Types.Main (ErrorMessage(), GargParser(), Corpus)
+--import Data.Gargantext.Types.Main (ErrorMessage(), Corpus)
 -- | According to the format of Input file,
 -- different parser are available.
 data FileFormat = WOS        -- Implemented (ISI Format)
-                | XML        -- Not Implemented / see :
+--                | DOC        -- Not Implemented / import Pandoc
-                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
+--                | ODS        -- Not Implemented / import Pandoc
-                | DOC        -- Not Implemented / import Pandoc
+--                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
-                | ODS        -- Not Implemented / import Pandoc
+--                | XML        -- Not Implemented / see :
-                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
+--                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
+---- | withParser:
+---- According the format of the text, choosing the right parser.
+--withParser :: FileFormat -> ByteString -> IO Corpus
-- | withParser:
+withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
-- According the format of the text, choosing the right parser.
-withParser :: FileFormat -> GargParser
 withParser WOS = wosParser
--withParser XML = xmlParser
 --withParser DOC = docParser
 --withParser ODS = odsParser
-withParser _   = error "[ERROR] Parser not implemented yet"
+--withParser XML = xmlParser
+--withParser _   = error "[ERROR] Parser not implemented yet"
-runParser :: FileFormat -> ByteString -> Either ErrorMessage (IO (Maybe Corpus))
-runParser format text = parseOnly (withParser format) text
-parseZip :: FilePath -> ByteString -> IO Corpus
-parseZip = undefined
-parseFile :: FileFormat -> ByteString -> IO Corpus
-parseFile p x = case runParser p x of
-        Left  _ -> pure 0
-        Right r -> pure $ length r
+runParser :: FileFormat -> DB.ByteString 
+          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
+runParser format text = pure $ parseOnly (withParser format) text
-openZipFiles :: FilePath -> IO [ByteString]
+openZip :: FilePath -> IO [DB.ByteString]
-openZipFiles fp = do
+openZip fp = do
    path    <- resolveFile' fp
    entries <- withArchive path (DM.keys <$> getEntries)
    bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
    pure bs
+parse :: FileFormat -> FilePath 
-wosParserTest :: FilePath -> IO [Int]
+      -> IO [Either String [[(DB.ByteString, DB.ByteString)]]]
-wosParserTest fp = join $ mapConcurrently (parseFile WOS) <$> openZipFiles fp
+parse format path = do
+    files <- case takeExtension path of
+              ".zip" -> openZip              path
+              _      -> pure <$> DB.readFile path
+    mapConcurrently (runParser format) files
--- a/src/Data/Gargantext/Parsers/Date.hs
+++ b/src/Data/Gargantext/Parsers/Date.hs
@@ -3,7 +3,7 @@ Module      : Data.Gargantext.Parsers.Date
 Description : Some utils to parse dates
 Copyright   : (c) CNRS 2017
 License     : AGPL + CECILL v3
-Maintainer  : alexandre.delanoe@iscpif.fr
+Maintainer  : dev@gargantext.org
 Stability   : experimental
 Portability : POSIX
@@ -12,13 +12,14 @@ According to the language of the text, parseDate1 returns date as Text:
 TODO : Add some tests
 import Data.Gargantext.Parsers as DGP
 DGP.parseDate1 DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
 -}
+{-# LANGUAGE OverloadedStrings #-}
 module Data.Gargantext.Parsers.Date (parseDate1, Lang(FR, EN)) where
 import Data.Gargantext.Prelude
-import qualified Data.Gargantext.Types.Main as G
+--import Data.Gargantext.Types.Main as G
 import Data.Time.Clock (UTCTime, getCurrentTime)
 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
@@ -40,19 +41,16 @@ import Data.Text (Text)
 -- import Duckling.Rules (rulesFor)
 -- import Duckling.Debug as DB
-import Safe (headMay)
 import Duckling.Types (ResolvedToken)
+import Safe (headMay)
 -- TODO add Paris at Duckling.Locale Region datatype
 -- | To get Homogeinity of the languages
 --   TODO : put this in a more generic place in the source code
-parserLang :: G.Language -> Lang
+--parserLang :: G.Language -> Lang
-parserLang G.FR = FR
+--parserLang G.FR = FR
-parserLang G.EN = EN
+--parserLang G.EN = EN
 -- | Final Date parser API

--- a/src/Data/Gargantext/Parsers/WOS.hs
+++ b/src/Data/Gargantext/Parsers/WOS.hs
 {-# LANGUAGE OverloadedStrings #-}
-module Data.Gargantext.Parsers.WOS where
+module Data.Gargantext.Parsers.WOS (wosParser) where
 -- TOFIX : Should import Data.Gargantext.Prelude here
-import Prelude hiding (takeWhile, take, concat, readFile)
+import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
 import qualified Data.List as DL
-import Data.Attoparsec.ByteString
+import Data.Monoid ((<>))
+import Data.Attoparsec.ByteString (Parser, try, string
+                                  , takeTill, take
+                                  , manyTill, many1)
 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
-import Data.ByteString (ByteString)
+import Data.ByteString (ByteString, concat)
 import Data.ByteString.Char8 (pack)
 import Control.Applicative
+--import Data.Gargantext.Types
-import Data.Gargantext.Types
 -- | wosParser parses ISI format from
 -- Web Of Science Database
-wosParser :: ByteString -> IO Corpus
+wosParser :: Parser [[(ByteString, ByteString)]]
-wosParser = undefined
+wosParser = do
-wosParser' :: Parser [Maybe [ByteString]]
-wosParser' = do
    -- TODO Warning if version /= 1.0
    -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
-    _ <- manyTill anyChar (string $ pack "\nVR 1.0")
+    _  <- manyTill anyChar (string $ pack "\nVR 1.0")
-    ns <- many1 wosNotice <* (string $ pack "\nEF")
+    ns <- many1 notice <*  (string $ pack "\nEF"    )
-    return ns
+    pure ns
-wosNotice :: Parser (Maybe [ByteString])
+notice :: Parser [(ByteString, ByteString)]
-wosNotice = startNotice *> wosFields <* endNotice
+notice = start *> fields <* end
    where
-      endNotice :: Parser [Char]
+      start :: Parser ByteString
-      endNotice = manyTill anyChar (string $ pack "\nER\n")
+      start = "\nPT " *> takeTill isEndOfLine
-      startNotice :: Parser ByteString
+      end :: Parser [Char]
-      startNotice = "\nPT " *> takeTill isEndOfLine
+      end = manyTill anyChar (string $ pack "\nER\n")
-field' :: Parser (ByteString, [ByteString])
-field' = do
-    f  <- "\n" *> take 2 <* " "
-    a  <- takeTill isEndOfLine
-    as <- try wosLines
-    let as' = case DL.length as > 0 of
-            True  -> as
-            False -> []
-    return (f, [a] ++ as')
-wosFields' :: Parser [(ByteString, [ByteString])]
+fields :: Parser [(ByteString, ByteString)]
-wosFields' = many field'
+fields = many field
+    where
-wosFields :: Parser (Maybe [ByteString])
+        field :: Parser (ByteString, ByteString)
-wosFields = do
+        field = do
--    a <- field "AU"
+            name  <- "\n" *> take 2 <* " "
--    t <- field "TI"
+            txt   <- takeTill isEndOfLine
--    s <- field "SO"
+            txts  <- try lines
--    d <- field "DI" -- DOI
+            let txts' = case DL.length txts > 0 of
--    p <- field "PD"
+                    True  -> txts
--    b <- field "AB"
+                    False -> []
--    u <- field "UT"
+            pure (translate name, concat ([txt] <> txts'))
-    ws <- many field'
-    return $ DL.lookup "UT" ws
--    return $ HyperdataDocument 
+lines :: Parser [ByteString]
--                    Just "WOS"
+lines = many line
--                    DL.lookup "DI" ws
--                    DL.lookup "URL" ws
--                    DL.lookup "PA" ws
--                    DL.lookup "TI" ws
-wosLines :: Parser [ByteString]
-wosLines = many line
    where
        line :: Parser ByteString
        line = "\n  " *> takeTill isEndOfLine
+translate :: ByteString -> ByteString
+translate champs
+            | champs == "AU" = "author"
+            | champs == "TI" = "title"
+            | champs == "SO" = "source"
+            | champs == "DI" = "doi"
+            | champs == "PD" = "publication_date"
+            | champs == "AB" = "abstract"
+            | otherwise  = champs
--- a/src/Data/Gargantext/Prelude.hs
+++ b/src/Data/Gargantext/Prelude.hs
 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
-{-# OPTIONS_GHC -fno-warn-type-defaults #-}
+{-# OPTIONS_GHC -fno-warn-type-defaults  #-}
-{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE     NoImplicitPrelude       #-}
+{-
+TODO: import head impossible from Protolude: why ?
+-}
 module Data.Gargantext.Prelude where
-import Protolude (Bool(True, False), Int, Double, Integer, Fractional, Num, Maybe, Floating, Char, Ord, Integral, Foldable, RealFrac, Monad, filter,
+import Protolude ( Bool(True, False), Int, Double, Integer
-                 reverse
+                 , Fractional, Num, Maybe, Floating, Char
-                 , map
+                 , Ord, Integral, Foldable, RealFrac, Monad, filter
-                 , zip
+                 , reverse, map, zip, drop, take, zipWith
-                 , drop
+                 , sum, fromIntegral, length, fmap
-                 , take
+                 , takeWhile, sqrt, undefined, identity
-                 , zipWith
+                 , abs, maximum, minimum, return, snd, truncate
-                 , sum
-                 , fromIntegral
-                 , length
-                 , fmap
-                 , takeWhile
-                 , sqrt
-                 , undefined
-                 , identity
-                 , abs
-                 , maximum
-                 , minimum
-                 , return
-                 , snd
-                 , truncate
                 , (+), (*), (/), (-), (.), (>=), ($), (**), (^)
                 )
@@ -36,7 +26,7 @@ import qualified Data.List     as L hiding (head, sum)
 import qualified Control.Monad as M
 import qualified Data.Map as Map
 import qualified Data.Vector as V
-import Safe (headMay)
+-- import Safe (headMay)
 pf :: (a -> Bool) -> [a] -> [a]

--- a/src/Data/Gargantext/Types/Main.hs
+++ b/src/Data/Gargantext/Types/Main.hs
@@ -6,7 +6,7 @@ module Data.Gargantext.Types.Main where
 import Protolude (fromMaybe)
-import Data.ByteString (ByteString())
+--import Data.ByteString (ByteString())
 import Data.Text (Text)
 import Data.Time (UTCTime)
 import Data.Gargantext.Types.Node ( NodePoly
@@ -20,7 +20,7 @@ import Data.Gargantext.Types.Node ( NodePoly
 -- | Language of a Text
-- For simplicity, we suppose Text as an homogenous language
+-- For simplicity, we suppose text has an homogenous language
 data Language = EN | FR -- | DE | IT | SP
    -- > EN == english
    -- > FR == french
@@ -33,9 +33,6 @@ type Ngrams = (Text, Text, Text)
 type ErrorMessage = String
-- Parse Texts
-type GargParser = ByteString -> Either ErrorMessage Corpus
 -- | TODO add Symbolic Node / Document
 --   TODO make instances of Nodes