Commit 746cd19d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT/TYPES] Parsers main types.

parent 2cd0e36a
{-|
Module : Data.Gargantext...
Description : Short description
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
...@@ -3,98 +3,84 @@ Module : Data.Gargantext.Parsers ...@@ -3,98 +3,84 @@ Module : Data.Gargantext.Parsers
Description : All parsers of Gargantext in one file. Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017 Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : alexandre.delanoe@iscpif.fr Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
Gargantext enables analyzing semi-structured text that should be parsed Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed. in order to be analyzed.
The parsers suppose, we know the format of the Text (TextFormat data The parsers suppose we know the format of the Text (TextFormat data
type) according which the right parser is chosen among the list of type) according to which the right parser is chosen among the list of
available parsers. available parsers.
This module mainly describe how to add a new parser to Gargantext, This module mainly describe how to add a new parser to Gargantext,
please follow the types. please follow the types.
-} -}
module Data.Gargantext.Parsers -- (parse, FileFormat(..))
module Data.Gargantext.Parsers ( module Data.Gargantext.Parsers.WOS
--, module Data.Gargantext.Parsers.XML
--, module Data.Gargantext.Parsers.DOC
--, module Data.Gargantext.Parsers.ODS
)
where where
import System.FilePath (takeExtension)
import Data.Attoparsec.ByteString import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.ByteString (ByteString) import Data.ByteString as DB
import Data.Map as DM import Data.Map as DM
import Data.Either.Extra(Either(..)) ----import Data.Either.Extra(Either(..))
----
import Control.Monad (join) --import Control.Monad (join)
import Codec.Archive.Zip import Codec.Archive.Zip
import Path.IO (resolveFile') import Path.IO (resolveFile')
-- import qualified Data.ByteString.Lazy as B ------ import qualified Data.ByteString.Lazy as B
import Control.Applicative ( (<$>) ) --import Control.Applicative ( (<$>) )
import Control.Concurrent.Async as CCA (mapConcurrently) import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Gargantext.Parsers.WOS (wosParser) import Data.Gargantext.Parsers.WOS (wosParser)
-- import Data.Gargantext.Parsers.XML (xmlParser) ---- import Data.Gargantext.Parsers.XML (xmlParser)
-- import Data.Gargantext.Parsers.DOC (docParser) ---- import Data.Gargantext.Parsers.DOC (docParser)
-- import Data.Gargantext.Parsers.ODS (odsParser) ---- import Data.Gargantext.Parsers.ODS (odsParser)
import Data.Gargantext.Prelude --import Data.Gargantext.Prelude (pm)
import Data.Gargantext.Types.Main (ErrorMessage(), GargParser(), Corpus) --import Data.Gargantext.Types.Main (ErrorMessage(), Corpus)
-- | According to the format of Input file, -- | According to the format of Input file,
-- different parser are available. -- different parser are available.
data FileFormat = WOS -- Implemented (ISI Format) data FileFormat = WOS -- Implemented (ISI Format)
| XML -- Not Implemented / see : -- | DOC -- Not Implemented / import Pandoc
-- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml -- | ODS -- Not Implemented / import Pandoc
| DOC -- Not Implemented / import Pandoc -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
| ODS -- Not Implemented / import Pandoc -- | XML -- Not Implemented / see :
| PDF -- Not Implemented / pdftotext and import Pandoc ? -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
---- | withParser:
---- According the format of the text, choosing the right parser.
--withParser :: FileFormat -> ByteString -> IO Corpus
-- | withParser: withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
-- According the format of the text, choosing the right parser.
withParser :: FileFormat -> GargParser
withParser WOS = wosParser withParser WOS = wosParser
--withParser XML = xmlParser
--withParser DOC = docParser --withParser DOC = docParser
--withParser ODS = odsParser --withParser ODS = odsParser
withParser _ = error "[ERROR] Parser not implemented yet" --withParser XML = xmlParser
--withParser _ = error "[ERROR] Parser not implemented yet"
runParser :: FileFormat -> ByteString -> Either ErrorMessage (IO (Maybe Corpus))
runParser format text = parseOnly (withParser format) text
parseZip :: FilePath -> ByteString -> IO Corpus
parseZip = undefined
parseFile :: FileFormat -> ByteString -> IO Corpus
parseFile p x = case runParser p x of
Left _ -> pure 0
Right r -> pure $ length r
runParser :: FileFormat -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text
openZipFiles :: FilePath -> IO [ByteString] openZip :: FilePath -> IO [DB.ByteString]
openZipFiles fp = do openZip fp = do
path <- resolveFile' fp path <- resolveFile' fp
entries <- withArchive path (DM.keys <$> getEntries) entries <- withArchive path (DM.keys <$> getEntries)
bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
pure bs pure bs
parse :: FileFormat -> FilePath
wosParserTest :: FilePath -> IO [Int] -> IO [Either String [[(DB.ByteString, DB.ByteString)]]]
wosParserTest fp = join $ mapConcurrently (parseFile WOS) <$> openZipFiles fp parse format path = do
files <- case takeExtension path of
".zip" -> openZip path
_ -> pure <$> DB.readFile path
mapConcurrently (runParser format) files
...@@ -3,7 +3,7 @@ Module : Data.Gargantext.Parsers.Date ...@@ -3,7 +3,7 @@ Module : Data.Gargantext.Parsers.Date
Description : Some utils to parse dates Description : Some utils to parse dates
Copyright : (c) CNRS 2017 Copyright : (c) CNRS 2017
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : alexandre.delanoe@iscpif.fr Maintainer : dev@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
...@@ -12,13 +12,14 @@ According to the language of the text, parseDate1 returns date as Text: ...@@ -12,13 +12,14 @@ According to the language of the text, parseDate1 returns date as Text:
TODO : Add some tests TODO : Add some tests
import Data.Gargantext.Parsers as DGP import Data.Gargantext.Parsers as DGP
DGP.parseDate1 DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00" DGP.parseDate1 DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
-} -}
{-# LANGUAGE OverloadedStrings #-}
module Data.Gargantext.Parsers.Date (parseDate1, Lang(FR, EN)) where module Data.Gargantext.Parsers.Date (parseDate1, Lang(FR, EN)) where
import Data.Gargantext.Prelude import Data.Gargantext.Prelude
import qualified Data.Gargantext.Types.Main as G --import Data.Gargantext.Types.Main as G
import Data.Time.Clock (UTCTime, getCurrentTime) import Data.Time.Clock (UTCTime, getCurrentTime)
import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime) import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
...@@ -40,19 +41,16 @@ import Data.Text (Text) ...@@ -40,19 +41,16 @@ import Data.Text (Text)
-- import Duckling.Rules (rulesFor) -- import Duckling.Rules (rulesFor)
-- import Duckling.Debug as DB -- import Duckling.Debug as DB
import Safe (headMay)
import Duckling.Types (ResolvedToken) import Duckling.Types (ResolvedToken)
import Safe (headMay)
-- TODO add Paris at Duckling.Locale Region datatype -- TODO add Paris at Duckling.Locale Region datatype
-- | To get Homogeinity of the languages -- | To get Homogeinity of the languages
-- TODO : put this in a more generic place in the source code -- TODO : put this in a more generic place in the source code
parserLang :: G.Language -> Lang --parserLang :: G.Language -> Lang
parserLang G.FR = FR --parserLang G.FR = FR
parserLang G.EN = EN --parserLang G.EN = EN
-- | Final Date parser API -- | Final Date parser API
......
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Data.Gargantext.Parsers.WOS where module Data.Gargantext.Parsers.WOS (wosParser) where
-- TOFIX : Should import Data.Gargantext.Prelude here -- TOFIX : Should import Data.Gargantext.Prelude here
import Prelude hiding (takeWhile, take, concat, readFile) import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import qualified Data.List as DL import qualified Data.List as DL
import Data.Attoparsec.ByteString
import Data.Monoid ((<>))
import Data.Attoparsec.ByteString (Parser, try, string
, takeTill, take
, manyTill, many1)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine) import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString) import Data.ByteString (ByteString, concat)
import Data.ByteString.Char8 (pack) import Data.ByteString.Char8 (pack)
import Control.Applicative import Control.Applicative
--import Data.Gargantext.Types
import Data.Gargantext.Types
-- | wosParser parses ISI format from -- | wosParser parses ISI format from
-- Web Of Science Database -- Web Of Science Database
wosParser :: ByteString -> IO Corpus wosParser :: Parser [[(ByteString, ByteString)]]
wosParser = undefined wosParser = do
wosParser' :: Parser [Maybe [ByteString]]
wosParser' = do
-- TODO Warning if version /= 1.0 -- TODO Warning if version /= 1.0
-- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ? -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
_ <- manyTill anyChar (string $ pack "\nVR 1.0") _ <- manyTill anyChar (string $ pack "\nVR 1.0")
ns <- many1 wosNotice <* (string $ pack "\nEF") ns <- many1 notice <* (string $ pack "\nEF" )
return ns pure ns
wosNotice :: Parser (Maybe [ByteString]) notice :: Parser [(ByteString, ByteString)]
wosNotice = startNotice *> wosFields <* endNotice notice = start *> fields <* end
where where
endNotice :: Parser [Char] start :: Parser ByteString
endNotice = manyTill anyChar (string $ pack "\nER\n") start = "\nPT " *> takeTill isEndOfLine
startNotice :: Parser ByteString end :: Parser [Char]
startNotice = "\nPT " *> takeTill isEndOfLine end = manyTill anyChar (string $ pack "\nER\n")
field' :: Parser (ByteString, [ByteString])
field' = do
f <- "\n" *> take 2 <* " "
a <- takeTill isEndOfLine
as <- try wosLines
let as' = case DL.length as > 0 of
True -> as
False -> []
return (f, [a] ++ as')
wosFields' :: Parser [(ByteString, [ByteString])] fields :: Parser [(ByteString, ByteString)]
wosFields' = many field' fields = many field
where
wosFields :: Parser (Maybe [ByteString]) field :: Parser (ByteString, ByteString)
wosFields = do field = do
-- a <- field "AU" name <- "\n" *> take 2 <* " "
-- t <- field "TI" txt <- takeTill isEndOfLine
-- s <- field "SO" txts <- try lines
-- d <- field "DI" -- DOI let txts' = case DL.length txts > 0 of
-- p <- field "PD" True -> txts
-- b <- field "AB" False -> []
-- u <- field "UT" pure (translate name, concat ([txt] <> txts'))
ws <- many field'
return $ DL.lookup "UT" ws
-- return $ HyperdataDocument lines :: Parser [ByteString]
-- Just "WOS" lines = many line
-- DL.lookup "DI" ws
-- DL.lookup "URL" ws
-- DL.lookup "PA" ws
-- DL.lookup "TI" ws
wosLines :: Parser [ByteString]
wosLines = many line
where where
line :: Parser ByteString line :: Parser ByteString
line = "\n " *> takeTill isEndOfLine line = "\n " *> takeTill isEndOfLine
translate :: ByteString -> ByteString
translate champs
| champs == "AU" = "author"
| champs == "TI" = "title"
| champs == "SO" = "source"
| champs == "DI" = "doi"
| champs == "PD" = "publication_date"
| champs == "AB" = "abstract"
| otherwise = champs
{-# OPTIONS_GHC -fno-warn-name-shadowing #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# OPTIONS_GHC -fno-warn-type-defaults #-} {-# OPTIONS_GHC -fno-warn-type-defaults #-}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-
TODO: import head impossible from Protolude: why ?
-}
module Data.Gargantext.Prelude where module Data.Gargantext.Prelude where
import Protolude (Bool(True, False), Int, Double, Integer, Fractional, Num, Maybe, Floating, Char, Ord, Integral, Foldable, RealFrac, Monad, filter, import Protolude ( Bool(True, False), Int, Double, Integer
reverse , Fractional, Num, Maybe, Floating, Char
, map , Ord, Integral, Foldable, RealFrac, Monad, filter
, zip , reverse, map, zip, drop, take, zipWith
, drop , sum, fromIntegral, length, fmap
, take , takeWhile, sqrt, undefined, identity
, zipWith , abs, maximum, minimum, return, snd, truncate
, sum
, fromIntegral
, length
, fmap
, takeWhile
, sqrt
, undefined
, identity
, abs
, maximum
, minimum
, return
, snd
, truncate
, (+), (*), (/), (-), (.), (>=), ($), (**), (^) , (+), (*), (/), (-), (.), (>=), ($), (**), (^)
) )
...@@ -36,7 +26,7 @@ import qualified Data.List as L hiding (head, sum) ...@@ -36,7 +26,7 @@ import qualified Data.List as L hiding (head, sum)
import qualified Control.Monad as M import qualified Control.Monad as M
import qualified Data.Map as Map import qualified Data.Map as Map
import qualified Data.Vector as V import qualified Data.Vector as V
import Safe (headMay) -- import Safe (headMay)
pf :: (a -> Bool) -> [a] -> [a] pf :: (a -> Bool) -> [a] -> [a]
......
...@@ -6,7 +6,7 @@ module Data.Gargantext.Types.Main where ...@@ -6,7 +6,7 @@ module Data.Gargantext.Types.Main where
import Protolude (fromMaybe) import Protolude (fromMaybe)
import Data.ByteString (ByteString()) --import Data.ByteString (ByteString())
import Data.Text (Text) import Data.Text (Text)
import Data.Time (UTCTime) import Data.Time (UTCTime)
import Data.Gargantext.Types.Node ( NodePoly import Data.Gargantext.Types.Node ( NodePoly
...@@ -20,7 +20,7 @@ import Data.Gargantext.Types.Node ( NodePoly ...@@ -20,7 +20,7 @@ import Data.Gargantext.Types.Node ( NodePoly
-- | Language of a Text -- | Language of a Text
-- For simplicity, we suppose Text as an homogenous language -- For simplicity, we suppose text has an homogenous language
data Language = EN | FR -- | DE | IT | SP data Language = EN | FR -- | DE | IT | SP
-- > EN == english -- > EN == english
-- > FR == french -- > FR == french
...@@ -33,9 +33,6 @@ type Ngrams = (Text, Text, Text) ...@@ -33,9 +33,6 @@ type Ngrams = (Text, Text, Text)
type ErrorMessage = String type ErrorMessage = String
-- Parse Texts
type GargParser = ByteString -> Either ErrorMessage Corpus
-- | TODO add Symbolic Node / Document -- | TODO add Symbolic Node / Document
-- TODO make instances of Nodes -- TODO make instances of Nodes
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment