[PARSERS] refactor.

68365f40 · Alexandre Delanoë · d2029ea7 · 68365f40 · 68365f40 · 68365f40
Commit 68365f40 authored May 16, 2019 by Alexandre Delanoë
7 changed files
--- a/bin/gargantext-cli/CleanCsvCorpus.hs
+++ b/bin/gargantext-cli/CleanCsvCorpus.hs
@@ -24,14 +24,13 @@ import qualified Data.Vector as V
 import Gargantext.Prelude
 import Gargantext.Text.Search
-import Gargantext.Text.Parsers.CSV
+import qualified Gargantext.Text.Parsers.CSV as CSV
 ------------------------------------------------------------------------
 type Query = [S.Term]
-filterDocs :: [DocId] -> Vector Doc -> Vector Doc
+filterDocs :: [DocId] -> Vector CSV.Doc -> Vector CSV.Doc
-filterDocs docIds = V.filter (\doc -> S.member (d_docId doc) $ S.fromList docIds )
+filterDocs docIds = V.filter (\doc -> S.member (CSV.d_docId doc) $ S.fromList docIds )
 main :: IO ()
@@ -41,17 +40,17 @@ main = do
  --let q = ["water", "scarcity", "morocco", "shortage","flood"]
  let q = ["gratuit", "gratuité", "culture", "culturel"]
-  (h,csvDocs) <- readCsv rPath
+  (h,csvDocs) <- CSV.readFile rPath
  putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
-  putStrLn $ "Mean size of docs:" <> show ( docsSize csvDocs)
+  putStrLn $ "Mean size of docs:" <> show ( CSV.docsSize csvDocs)
-  let docs   = toDocs csvDocs
+  let docs   = CSV.toDocs csvDocs
  let engine = insertDocs docs initialDocSearchEngine
  let docIds = S.query engine (map pack q)
-  let docs'  = fromDocs $ filterDocs docIds (V.fromList docs)
+  let docs'  = CSV.fromDocs $ filterDocs docIds (V.fromList docs)
  putStrLn $ "Number of documents after:" <> show (V.length docs')
-  putStrLn $ "Mean size of docs:" <> show (docsSize docs')
+  putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')
-  writeCsv wPath (h, docs')
+  CSV.writeFile wPath (h, docs')
--- a/src/Gargantext/API/Upload.hs
+++ b/src/Gargantext/API/Upload.hs
@@ -38,7 +38,7 @@ import Servant.Swagger (HasSwagger(toSwagger))
 import Control.Monad
 import Control.Monad.IO.Class
 import Gargantext.API.Types
-import Servant.CSV.Cassava (CSV'(..))
+--import Servant.CSV.Cassava (CSV'(..))
 --import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
 --import Data.Swagger
 --import Gargantext.API.Ngrams (TODO)
@@ -102,3 +102,4 @@ upload multipartData = do
  pure $ Text.concat $ map cs is
 -------------------------------------------------------------------------------
--- a/src/Gargantext/Ext/IMT.hs
+++ b/src/Gargantext/Ext/IMT.hs
@@ -101,7 +101,7 @@ mapIdSchool :: Map Text Text
 mapIdSchool = M.fromList $ Gargantext.Prelude.map (\(School n _ i) -> (i,n)) schools
 hal_data :: IO (DV.Vector CsvHal)
-hal_data = snd <$> CSV.readHal "doc/corpus_imt/Gargantext_Corpus.csv"
+hal_data = snd <$> CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
 names :: S.Set Text
 names = S.fromList $ Gargantext.Prelude.map (\s -> school_id s) schools

--- a/src/Gargantext/Text/Parsers.hs
+++ b/src/Gargantext/Text/Parsers.hs
@@ -22,9 +22,10 @@ please follow the types.
 {-# LANGUAGE PackageImports    #-}
 {-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseFile)
+module Gargantext.Text.Parsers (FileFormat(..), clean, parseFile)
    where
+--import Data.ByteString (ByteString)
 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
 import Control.Concurrent.Async as CCA (mapConcurrently)
 import Control.Monad (join)
@@ -39,20 +40,18 @@ import Data.String (String())
 import Data.String()
 import Data.Text (Text)
 import Data.Text.Encoding (decodeUtf8)
-import Data.Time (UTCTime(..))
 import Data.Tuple.Extra (both, first, second)
 import System.FilePath (FilePath(), takeExtension)
 import qualified Data.ByteString as DB
 import qualified Data.Map        as DM
 import qualified Data.Text as DT
-import qualified Data.Time as DT
 import Gargantext.Core (Lang(..))
 import Gargantext.Prelude
 import Gargantext.Database.Types.Node (HyperdataDocument(..))
 import qualified Gargantext.Text.Parsers.WOS as WOS
 import qualified Gargantext.Text.Parsers.RIS as RIS
 import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
-import Gargantext.Text.Parsers.Date (parseDate)
+import qualified Gargantext.Text.Parsers.Date as Date
 import Gargantext.Text.Parsers.CSV (parseHal)
 import Gargantext.Text.Terms.Stop (detectLang)
 ------------------------------------------------------------------------
@@ -77,33 +76,21 @@ data FileFormat = WOS | RIS | RisPresse
 --                | ODT        -- Not Implemented / import Pandoc
 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
 --                | XML        -- Not Implemented / see :
--                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
-- TODO: to debug maybe add the filepath in error message
+{-
+parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
+parseFormat = undefined
+-}
 -- | Parse file into documents
 -- TODO manage errors here
+-- TODO: to debug maybe add the filepath in error message
 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
 parseFile CsvHalFormat p = parseHal p
-parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich           <$> parse' RIS p
+parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
-parseFile WOS       p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith (map (first WOS.keys)) <$> parse' WOS p
+parseFile WOS       p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
-parseFile ff        p = join $ mapM (toDoc ff)  <$> snd <$> parse ff p
+parseFile ff        p = join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff p
-type Year  = Int
-type Month = Int
-type Day   = Int
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
-parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
-parseDate' _ Nothing    = pure (Nothing, (Nothing, Nothing, Nothing))
-parseDate' l (Just txt) = do
-  utcTime <- parseDate l txt
-  let (UTCTime day _) = utcTime
-  let (y,m,d) = DT.toGregorian day
-  pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 -- TODO use language for RIS
@@ -113,7 +100,7 @@ toDoc ff d = do
      let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
-      (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang  dateToParse
+      (utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang  dateToParse
      pure $ HyperdataDocument (Just $ DT.pack $ show ff)
                               (lookup "doi" d)
@@ -135,26 +122,28 @@ toDoc ff d = do
                               Nothing
                               (Just $ (DT.pack . show) lang)
-parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
+enrichWith :: FileFormat
-parse ff fp = enrichWith identity <$> parse' ff fp
+           ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
+enrichWith RisPresse = enrichWith' presseEnrich
+enrichWith WOS       = enrichWith' (map (first WOS.keys))
+enrichWith _         = enrichWith' identity
-enrichWith ::
+enrichWith' ::       ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
-  ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
+           ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
-  ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
+enrichWith' f = second (map both' . map f . concat)
-enrichWith f = second (map both' . map f . concat)
  where
    both'   = map (both decodeUtf8)
-parse' :: FileFormat -> FilePath
+readFileWith :: FileFormat -> FilePath
       -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
-parse' format path = do
+readFileWith format path = do
    files <- case takeExtension path of
              ".zip" -> openZip              path
              _      -> pure <$> clean <$> DB.readFile path
    partitionEithers <$> mapConcurrently (runParser format) files
 -- | withParser:
 -- According to the format of the text, choose the right parser.
 -- TODO  withParser :: FileFormat -> Parser [Document]
@@ -181,4 +170,3 @@ clean txt = DBC.map clean' txt
    clean' '’' = '\''
    clean' '\r' = ' '
    clean' c  = c
--- a/src/Gargantext/Text/Parsers/CSV.hs
+++ b/src/Gargantext/Text/Parsers/CSV.hs
@@ -17,24 +17,23 @@ CSV parser for Gargantext corpus files.
 module Gargantext.Text.Parsers.CSV where
-import GHC.Real (round)
-import GHC.IO (FilePath)
 import Control.Applicative
 import Data.Char (ord)
 import Data.Csv
 import Data.Either (Either(Left, Right))
 import Data.Text (Text, pack, length, intercalate)
-import qualified Data.ByteString.Lazy as BL
 import Data.Time.Segment (jour)
 import Data.Vector (Vector)
-import qualified Data.Vector as V
+import GHC.IO (FilePath)
+import GHC.Real (round)
+import GHC.Word (Word8)
 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
+import Gargantext.Prelude hiding (length)
 import Gargantext.Text
 import Gargantext.Text.Context
-import Gargantext.Prelude hiding (length)
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.ByteString as  BS
+import qualified Data.Vector as V
 ---------------------------------------------------------------
 headerCsvGargV3 :: Header
@@ -192,40 +191,65 @@ hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
 csvDecodeOptions :: DecodeOptions
-csvDecodeOptions = (defaultDecodeOptions
+csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
-                      {decDelimiter = fromIntegral $ ord '\t'}
-                    )
 csvEncodeOptions :: EncodeOptions
-csvEncodeOptions = ( defaultEncodeOptions 
+csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
-                      {encDelimiter = fromIntegral $ ord '\t'}
-                    )
+delimiter :: Word8
+delimiter = fromIntegral $ ord '\t'
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
                      <$> snd
-                      <$> readCsv fp
+                      <$> readFile fp
 ------------------------------------------------------------------------
-readCsv :: FilePath -> IO (Header, Vector CsvDoc)
-readCsv fp = do
+readFileLazy :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
-    csvData <- BL.readFile fp
+readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
-    case decodeByNameWith csvDecodeOptions csvData of
+readFileStrict :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
+readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
+readByteStringLazy :: (FromNamedRecord a) => a -> BL.ByteString -> (Header, Vector a)
+readByteStringLazy f bs = case decodeByNameWith csvDecodeOptions bs of
+      Left e        -> panic (pack e)
+      Right csvDocs -> csvDocs
+readByteStringStrict :: (FromNamedRecord a) => a -> BS.ByteString -> (Header, Vector a)
+readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
+------------------------------------------------------------------------
+-- | TODO use readFileLazy
+readFile :: FilePath -> IO (Header, Vector CsvDoc)
+readFile = fmap readCsvLazyBS . BL.readFile
+-- | TODO use readByteStringLazy
+readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
+readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
      Left e        -> panic (pack e)
-      Right csvDocs -> pure csvDocs
+      Right csvDocs -> csvDocs
+------------------------------------------------------------------------
+-- | TODO use readFileLazy
+readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
+readCsvHal = fmap readCsvHalLazyBS . BL.readFile
-readHal :: FilePath -> IO (Header, Vector CsvHal)
+-- | TODO use readByteStringLazy
-readHal fp = do
+readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
-    csvData <- BL.readFile fp
+readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
-    case decodeByNameWith csvDecodeOptions csvData of
      Left e        -> panic (pack e)
-      Right csvDocs -> pure csvDocs
+      Right csvDocs -> csvDocs
+readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
+readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
 ------------------------------------------------------------------------
-writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
+writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
-writeCsv fp (h, vs) = BL.writeFile fp $
+writeFile fp (h, vs) = BL.writeFile fp $
                      encodeByNameWith csvEncodeOptions h (V.toList vs)
 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
@@ -342,6 +366,6 @@ csvHal2doc (CsvHal title source
 ------------------------------------------------------------------------
 parseHal :: FilePath -> IO [HyperdataDocument]
-parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
+parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readCsvHal fp
 ------------------------------------------------------------------------
--- a/src/Gargantext/Text/Parsers/Date.hs
+++ b/src/Gargantext/Text/Parsers/Date.hs
@@ -18,12 +18,12 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where
+module Gargantext.Text.Parsers.Date (parse, parseRaw, split) where
 import Data.HashMap.Strict as HM hiding (map)
 import Data.Text (Text, unpack, splitOn, pack)
-import Data.Time (parseTimeOrError, defaultTimeLocale)
+import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
-import Data.Time.Clock (UTCTime, getCurrentTime)
+import Data.Time.Clock (UTCTime(..), getCurrentTime)
 import Data.Time.LocalTime (utc)
 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
 import Duckling.Api (analyze)
@@ -38,21 +38,41 @@ import qualified Data.HashSet as HashSet
 import qualified Duckling.Core as DC
 ------------------------------------------------------------------------
+-- | Parse date to Ints
+-- TODO add hours, minutes and seconds
+split :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
+split _ Nothing    = pure (Nothing, (Nothing, Nothing, Nothing))
+split l (Just txt) = do
+  utcTime <- parse l txt
+  let (y, m, d) = split' utcTime
+  pure (Just utcTime, (Just y, Just m,Just d))
+split' :: UTCTime -> (Year, Month, Day)
+split' utcTime = (fromIntegral y, m, d)
+  where
+    (UTCTime day _) = utcTime
+    (y,m,d)         = toGregorian day
+type Year  = Int
+type Month = Int
+type Day   = Int
+------------------------------------------------------------------------
 -- | Date Parser
 -- Parses dates mentions in full text given the language.
 -- >>> parseDate FR (pack "10 avril 1979 à 19H")
 -- 1979-04-10 19:00:00 UTC
 -- >>> parseDate EN (pack "April 10 1979")
 -- 1979-04-10 00:00:00 UTC
-parseDate :: Lang -> Text -> IO UTCTime
+parse :: Lang -> Text -> IO UTCTime
-parseDate lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
+parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
 type DateFormat  = Text
 type DateDefault = Text
 parseDate' :: DateFormat -> DateDefault -> Lang -> Text -> IO UTCTime
 parseDate' format def lang s = do
-  dateStr' <- parseDateRaw lang s
+  dateStr' <- parseRaw lang s
  let dateStr = unpack $ maybe def identity
                       $ head $ splitOn "." dateStr'
  pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
@@ -70,19 +90,19 @@ parserLang EN = DC.EN
 -- IO can be avoided here:
 -- currentContext :: Lang -> IO Context
 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
-- parseDateRaw :: Context -> Text -> SomeErrorHandling Text
+-- parseRaw :: Context -> Text -> SomeErrorHandling Text
 -- TODO error handling
-parseDateRaw :: Lang -> Text -> IO (Text)
+parseRaw :: Lang -> Text -> IO (Text)
-parseDateRaw lang text = do
+parseRaw lang text = do
    maybeJson <- map jsonValue <$> parseDateWithDuckling lang text
    case headMay maybeJson of
      Just (Json.Object object) -> case HM.lookup "value" object of
                                     Just (Json.String date) -> pure date
-                                     Just _                  -> panic "ParseDateRaw ERROR: should be a json String"
+                                     Just _                  -> panic "ParseRaw ERROR: should be a json String"
-                                     Nothing                 -> panic $ "ParseDateRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
+                                     Nothing                 -> panic $ "ParseRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
-      _                         -> panic $ "ParseDateRaw ERROR: type error" <> (pack . show) lang <> " " <> text
+      _                         -> panic $ "ParseRaw ERROR: type error" <> (pack . show) lang <> " " <> text
 -- | Current Time in DucklingTime format

--- a/src/Gargantext/Text/Parsers/Json2Csv.hs
+++ b/src/Gargantext/Text/Parsers/Json2Csv.hs
@@ -28,7 +28,7 @@ import Data.Text (Text, unpack)
 import Gargantext.Core.Utils.Prefix (unPrefix)
 import Gargantext.Prelude
 import System.IO (FilePath)
-import Gargantext.Text.Parsers.CSV (CsvDoc(..), writeCsv, headerCsvGargV3)
+import Gargantext.Text.Parsers.CSV (CsvDoc(..), writeFile, headerCsvGargV3)
 import Data.Vector (fromList)
 data Patent = Patent { _patent_title :: Text
@@ -48,7 +48,7 @@ type FilePathOut = FilePath
 json2csv :: FilePathIn -> FilePathOut -> IO ()
 json2csv fin fout = do
  patents <- maybe (panic "json2csv error") identity <$> readPatents fin
-  writeCsv fout (headerCsvGargV3, fromList $ map patent2csvDoc patents)
+  writeFile fout (headerCsvGargV3, fromList $ map patent2csvDoc patents)
 patent2csvDoc :: Patent -> CsvDoc
 patent2csvDoc (Patent title abstract year _) =