Commit 746cd19d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT/TYPES] Parsers main types.

parent 2cd0e36a
{-|
Module : Data.Gargantext...
Description : Short description
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
......@@ -3,98 +3,84 @@ Module : Data.Gargantext.Parsers
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : alexandre.delanoe@iscpif.fr
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed.
The parsers suppose, we know the format of the Text (TextFormat data
type) according which the right parser is chosen among the list of
The parsers suppose we know the format of the Text (TextFormat data
type) according to which the right parser is chosen among the list of
available parsers.
This module mainly describe how to add a new parser to Gargantext,
please follow the types.
-}
module Data.Gargantext.Parsers ( module Data.Gargantext.Parsers.WOS
--, module Data.Gargantext.Parsers.XML
--, module Data.Gargantext.Parsers.DOC
--, module Data.Gargantext.Parsers.ODS
)
module Data.Gargantext.Parsers -- (parse, FileFormat(..))
where
import Data.Attoparsec.ByteString
import Data.ByteString (ByteString)
import System.FilePath (takeExtension)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.ByteString as DB
import Data.Map as DM
import Data.Either.Extra(Either(..))
import Control.Monad (join)
----import Data.Either.Extra(Either(..))
----
--import Control.Monad (join)
import Codec.Archive.Zip
import Path.IO (resolveFile')
-- import qualified Data.ByteString.Lazy as B
import Control.Applicative ( (<$>) )
------ import qualified Data.ByteString.Lazy as B
--import Control.Applicative ( (<$>) )
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Gargantext.Parsers.WOS (wosParser)
-- import Data.Gargantext.Parsers.XML (xmlParser)
-- import Data.Gargantext.Parsers.DOC (docParser)
-- import Data.Gargantext.Parsers.ODS (odsParser)
---- import Data.Gargantext.Parsers.XML (xmlParser)
---- import Data.Gargantext.Parsers.DOC (docParser)
---- import Data.Gargantext.Parsers.ODS (odsParser)
import Data.Gargantext.Prelude
import Data.Gargantext.Types.Main (ErrorMessage(), GargParser(), Corpus)
--import Data.Gargantext.Prelude (pm)
--import Data.Gargantext.Types.Main (ErrorMessage(), Corpus)
-- | According to the format of Input file,
-- different parser are available.
data FileFormat = WOS -- Implemented (ISI Format)
| XML -- Not Implemented / see :
-- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
| DOC -- Not Implemented / import Pandoc
| ODS -- Not Implemented / import Pandoc
| PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | DOC -- Not Implemented / import Pandoc
-- | ODS -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
-- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
---- | withParser:
---- According the format of the text, choosing the right parser.
-- | withParser:
-- According the format of the text, choosing the right parser.
withParser :: FileFormat -> GargParser
--withParser :: FileFormat -> ByteString -> IO Corpus
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = wosParser
--withParser XML = xmlParser
--withParser DOC = docParser
--withParser ODS = odsParser
withParser _ = error "[ERROR] Parser not implemented yet"
runParser :: FileFormat -> ByteString -> Either ErrorMessage (IO (Maybe Corpus))
runParser format text = parseOnly (withParser format) text
parseZip :: FilePath -> ByteString -> IO Corpus
parseZip = undefined
parseFile :: FileFormat -> ByteString -> IO Corpus
parseFile p x = case runParser p x of
Left _ -> pure 0
Right r -> pure $ length r
--withParser XML = xmlParser
--withParser _ = error "[ERROR] Parser not implemented yet"
runParser :: FileFormat -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text
openZipFiles :: FilePath -> IO [ByteString]
openZipFiles fp = do
openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
path <- resolveFile' fp
entries <- withArchive path (DM.keys <$> getEntries)
bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
pure bs
wosParserTest :: FilePath -> IO [Int]
wosParserTest fp = join $ mapConcurrently (parseFile WOS) <$> openZipFiles fp
parse :: FileFormat -> FilePath
-> IO [Either String [[(DB.ByteString, DB.ByteString)]]]
parse format path = do
files <- case takeExtension path of
".zip" -> openZip path
_ -> pure <$> DB.readFile path
mapConcurrently (runParser format) files
......@@ -3,7 +3,7 @@ Module : Data.Gargantext.Parsers.Date
Description : Some utils to parse dates
Copyright : (c) CNRS 2017
License : AGPL + CECILL v3
Maintainer : alexandre.delanoe@iscpif.fr
Maintainer : dev@gargantext.org
Stability : experimental
Portability : POSIX
......@@ -12,13 +12,14 @@ According to the language of the text, parseDate1 returns date as Text:
TODO : Add some tests
import Data.Gargantext.Parsers as DGP
DGP.parseDate1 DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
-}
{-# LANGUAGE OverloadedStrings #-}
module Data.Gargantext.Parsers.Date (parseDate1, Lang(FR, EN)) where
import Data.Gargantext.Prelude
import qualified Data.Gargantext.Types.Main as G
--import Data.Gargantext.Types.Main as G
import Data.Time.Clock (UTCTime, getCurrentTime)
import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
......@@ -40,19 +41,16 @@ import Data.Text (Text)
-- import Duckling.Rules (rulesFor)
-- import Duckling.Debug as DB
import Safe (headMay)
import Duckling.Types (ResolvedToken)
import Safe (headMay)
-- TODO add Paris at Duckling.Locale Region datatype
-- | To get Homogeinity of the languages
-- TODO : put this in a more generic place in the source code
parserLang :: G.Language -> Lang
parserLang G.FR = FR
parserLang G.EN = EN
--parserLang :: G.Language -> Lang
--parserLang G.FR = FR
--parserLang G.EN = EN
-- | Final Date parser API
......
{-# LANGUAGE OverloadedStrings #-}
module Data.Gargantext.Parsers.WOS where
module Data.Gargantext.Parsers.WOS (wosParser) where
-- TOFIX : Should import Data.Gargantext.Prelude here
import Prelude hiding (takeWhile, take, concat, readFile)
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import qualified Data.List as DL
import Data.Attoparsec.ByteString
import Data.Monoid ((<>))
import Data.Attoparsec.ByteString (Parser, try, string
, takeTill, take
, manyTill, many1)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString)
import Data.ByteString (ByteString, concat)
import Data.ByteString.Char8 (pack)
import Control.Applicative
import Data.Gargantext.Types
--import Data.Gargantext.Types
-- | wosParser parses ISI format from
-- Web Of Science Database
wosParser :: ByteString -> IO Corpus
wosParser = undefined
wosParser' :: Parser [Maybe [ByteString]]
wosParser' = do
wosParser :: Parser [[(ByteString, ByteString)]]
wosParser = do
-- TODO Warning if version /= 1.0
-- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
_ <- manyTill anyChar (string $ pack "\nVR 1.0")
ns <- many1 wosNotice <* (string $ pack "\nEF")
return ns
_ <- manyTill anyChar (string $ pack "\nVR 1.0")
ns <- many1 notice <* (string $ pack "\nEF" )
pure ns
wosNotice :: Parser (Maybe [ByteString])
wosNotice = startNotice *> wosFields <* endNotice
notice :: Parser [(ByteString, ByteString)]
notice = start *> fields <* end
where
endNotice :: Parser [Char]
endNotice = manyTill anyChar (string $ pack "\nER\n")
startNotice :: Parser ByteString
startNotice = "\nPT " *> takeTill isEndOfLine
start :: Parser ByteString
start = "\nPT " *> takeTill isEndOfLine
end :: Parser [Char]
end = manyTill anyChar (string $ pack "\nER\n")
field' :: Parser (ByteString, [ByteString])
field' = do
f <- "\n" *> take 2 <* " "
a <- takeTill isEndOfLine
as <- try wosLines
let as' = case DL.length as > 0 of
True -> as
False -> []
return (f, [a] ++ as')
wosFields' :: Parser [(ByteString, [ByteString])]
wosFields' = many field'
wosFields :: Parser (Maybe [ByteString])
wosFields = do
-- a <- field "AU"
-- t <- field "TI"
-- s <- field "SO"
-- d <- field "DI" -- DOI
-- p <- field "PD"
-- b <- field "AB"
-- u <- field "UT"
ws <- many field'
return $ DL.lookup "UT" ws
-- return $ HyperdataDocument
-- Just "WOS"
-- DL.lookup "DI" ws
-- DL.lookup "URL" ws
-- DL.lookup "PA" ws
-- DL.lookup "TI" ws
wosLines :: Parser [ByteString]
wosLines = many line
fields :: Parser [(ByteString, ByteString)]
fields = many field
where
field :: Parser (ByteString, ByteString)
field = do
name <- "\n" *> take 2 <* " "
txt <- takeTill isEndOfLine
txts <- try lines
let txts' = case DL.length txts > 0 of
True -> txts
False -> []
pure (translate name, concat ([txt] <> txts'))
lines :: Parser [ByteString]
lines = many line
where
line :: Parser ByteString
line = "\n " *> takeTill isEndOfLine
translate :: ByteString -> ByteString
translate champs
| champs == "AU" = "author"
| champs == "TI" = "title"
| champs == "SO" = "source"
| champs == "DI" = "doi"
| champs == "PD" = "publication_date"
| champs == "AB" = "abstract"
| otherwise = champs
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# OPTIONS_GHC -fno-warn-type-defaults #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# OPTIONS_GHC -fno-warn-type-defaults #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-
TODO: import head impossible from Protolude: why ?
-}
module Data.Gargantext.Prelude where
import Protolude (Bool(True, False), Int, Double, Integer, Fractional, Num, Maybe, Floating, Char, Ord, Integral, Foldable, RealFrac, Monad, filter,
reverse
, map
, zip
, drop
, take
, zipWith
, sum
, fromIntegral
, length
, fmap
, takeWhile
, sqrt
, undefined
, identity
, abs
, maximum
, minimum
, return
, snd
, truncate
import Protolude ( Bool(True, False), Int, Double, Integer
, Fractional, Num, Maybe, Floating, Char
, Ord, Integral, Foldable, RealFrac, Monad, filter
, reverse, map, zip, drop, take, zipWith
, sum, fromIntegral, length, fmap
, takeWhile, sqrt, undefined, identity
, abs, maximum, minimum, return, snd, truncate
, (+), (*), (/), (-), (.), (>=), ($), (**), (^)
)
......@@ -36,7 +26,7 @@ import qualified Data.List as L hiding (head, sum)
import qualified Control.Monad as M
import qualified Data.Map as Map
import qualified Data.Vector as V
import Safe (headMay)
-- import Safe (headMay)
pf :: (a -> Bool) -> [a] -> [a]
......
......@@ -6,7 +6,7 @@ module Data.Gargantext.Types.Main where
import Protolude (fromMaybe)
import Data.ByteString (ByteString())
--import Data.ByteString (ByteString())
import Data.Text (Text)
import Data.Time (UTCTime)
import Data.Gargantext.Types.Node ( NodePoly
......@@ -20,7 +20,7 @@ import Data.Gargantext.Types.Node ( NodePoly
-- | Language of a Text
-- For simplicity, we suppose Text as an homogenous language
-- For simplicity, we suppose text has an homogenous language
data Language = EN | FR -- | DE | IT | SP
-- > EN == english
-- > FR == french
......@@ -33,9 +33,6 @@ type Ngrams = (Text, Text, Text)
type ErrorMessage = String
-- Parse Texts
type GargParser = ByteString -> Either ErrorMessage Corpus
-- | TODO add Symbolic Node / Document
-- TODO make instances of Nodes
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment