Commit 68365f40 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[PARSERS] refactor.

parent d2029ea7
......@@ -24,14 +24,13 @@ import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Text.Search
import Gargantext.Text.Parsers.CSV
import qualified Gargantext.Text.Parsers.CSV as CSV
------------------------------------------------------------------------
type Query = [S.Term]
filterDocs :: [DocId] -> Vector Doc -> Vector Doc
filterDocs docIds = V.filter (\doc -> S.member (d_docId doc) $ S.fromList docIds )
filterDocs :: [DocId] -> Vector CSV.Doc -> Vector CSV.Doc
filterDocs docIds = V.filter (\doc -> S.member (CSV.d_docId doc) $ S.fromList docIds )
main :: IO ()
......@@ -41,17 +40,17 @@ main = do
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
let q = ["gratuit", "gratuité", "culture", "culturel"]
(h,csvDocs) <- readCsv rPath
(h,csvDocs) <- CSV.readFile rPath
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( docsSize csvDocs)
putStrLn $ "Mean size of docs:" <> show ( CSV.docsSize csvDocs)
let docs = toDocs csvDocs
let docs = CSV.toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine (map pack q)
let docs' = fromDocs $ filterDocs docIds (V.fromList docs)
let docs' = CSV.fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs')
putStrLn $ "Mean size of docs:" <> show (docsSize docs')
putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')
writeCsv wPath (h, docs')
CSV.writeFile wPath (h, docs')
......@@ -38,7 +38,7 @@ import Servant.Swagger (HasSwagger(toSwagger))
import Control.Monad
import Control.Monad.IO.Class
import Gargantext.API.Types
import Servant.CSV.Cassava (CSV'(..))
--import Servant.CSV.Cassava (CSV'(..))
--import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
--import Data.Swagger
--import Gargantext.API.Ngrams (TODO)
......@@ -102,3 +102,4 @@ upload multipartData = do
pure $ Text.concat $ map cs is
-------------------------------------------------------------------------------
......@@ -101,7 +101,7 @@ mapIdSchool :: Map Text Text
mapIdSchool = M.fromList $ Gargantext.Prelude.map (\(School n _ i) -> (i,n)) schools
hal_data :: IO (DV.Vector CsvHal)
hal_data = snd <$> CSV.readHal "doc/corpus_imt/Gargantext_Corpus.csv"
hal_data = snd <$> CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
names :: S.Set Text
names = S.fromList $ Gargantext.Prelude.map (\s -> school_id s) schools
......
......@@ -22,9 +22,10 @@ please follow the types.
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseFile)
module Gargantext.Text.Parsers (FileFormat(..), clean, parseFile)
where
--import Data.ByteString (ByteString)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join)
......@@ -39,20 +40,18 @@ import Data.String (String())
import Data.String()
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import Data.Time (UTCTime(..))
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Time as DT
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import qualified Gargantext.Text.Parsers.WOS as WOS
import qualified Gargantext.Text.Parsers.RIS as RIS
import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Text.Parsers.Date (parseDate)
import qualified Gargantext.Text.Parsers.Date as Date
import Gargantext.Text.Parsers.CSV (parseHal)
import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------
......@@ -77,33 +76,21 @@ data FileFormat = WOS | RIS | RisPresse
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
-- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
-- TODO: to debug maybe add the filepath in error message
{-
parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
parseFormat = undefined
-}
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseFile CsvHalFormat p = parseHal p
parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich <$> parse' RIS p
parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith (map (first WOS.keys)) <$> parse' WOS p
parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> parse ff p
type Year = Int
type Month = Int
type Day = Int
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
parseDate' _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
parseDate' l (Just txt) = do
utcTime <- parseDate l txt
let (UTCTime day _) = utcTime
let (y,m,d) = DT.toGregorian day
pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
......@@ -113,7 +100,7 @@ toDoc ff d = do
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
(utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang dateToParse
pure $ HyperdataDocument (Just $ DT.pack $ show ff)
(lookup "doi" d)
......@@ -135,26 +122,28 @@ toDoc ff d = do
Nothing
(Just $ (DT.pack . show) lang)
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse ff fp = enrichWith identity <$> parse' ff fp
enrichWith :: FileFormat
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS = enrichWith' (map (first WOS.keys))
enrichWith _ = enrichWith' identity
enrichWith ::
([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith f = second (map both' . map f . concat)
enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
where
both' = map (both decodeUtf8)
parse' :: FileFormat -> FilePath
readFileWith :: FileFormat -> FilePath
-> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
parse' format path = do
readFileWith format path = do
files <- case takeExtension path of
".zip" -> openZip path
_ -> pure <$> clean <$> DB.readFile path
partitionEithers <$> mapConcurrently (runParser format) files
-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
......@@ -181,4 +170,3 @@ clean txt = DBC.map clean' txt
clean' '’' = '\''
clean' '\r' = ' '
clean' c = c
......@@ -17,24 +17,23 @@ CSV parser for Gargantext corpus files.
module Gargantext.Text.Parsers.CSV where
import GHC.Real (round)
import GHC.IO (FilePath)
import Control.Applicative
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate)
import qualified Data.ByteString.Lazy as BL
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import qualified Data.Vector as V
import GHC.IO (FilePath)
import GHC.Real (round)
import GHC.Word (Word8)
import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
import Gargantext.Prelude hiding (length)
import Gargantext.Text
import Gargantext.Text.Context
import Gargantext.Prelude hiding (length)
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString as BS
import qualified Data.Vector as V
---------------------------------------------------------------
headerCsvGargV3 :: Header
......@@ -192,40 +191,65 @@ hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = (defaultDecodeOptions
{decDelimiter = fromIntegral $ ord '\t'}
)
csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = ( defaultEncodeOptions
{encDelimiter = fromIntegral $ ord '\t'}
)
csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
delimiter :: Word8
delimiter = fromIntegral $ ord '\t'
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
<$> snd
<$> readCsv fp
<$> readFile fp
------------------------------------------------------------------------
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
csvData <- BL.readFile fp
case decodeByNameWith csvDecodeOptions csvData of
readFileLazy :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
readFileStrict :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a) => a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy f bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> csvDocs
readByteStringStrict :: (FromNamedRecord a) => a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
------------------------------------------------------------------------
-- | TODO use readFileLazy
readFile :: FilePath -> IO (Header, Vector CsvDoc)
readFile = fmap readCsvLazyBS . BL.readFile
-- | TODO use readByteStringLazy
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> pure csvDocs
Right csvDocs -> csvDocs
------------------------------------------------------------------------
-- | TODO use readFileLazy
readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
readCsvHal = fmap readCsvHalLazyBS . BL.readFile
readHal :: FilePath -> IO (Header, Vector CsvHal)
readHal fp = do
csvData <- BL.readFile fp
case decodeByNameWith csvDecodeOptions csvData of
-- | TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> pure csvDocs
Right csvDocs -> csvDocs
readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
------------------------------------------------------------------------
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp $
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $
encodeByNameWith csvEncodeOptions h (V.toList vs)
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
......@@ -342,6 +366,6 @@ csvHal2doc (CsvHal title source
------------------------------------------------------------------------
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readCsvHal fp
------------------------------------------------------------------------
......@@ -18,12 +18,12 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where
module Gargantext.Text.Parsers.Date (parse, parseRaw, split) where
import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn, pack)
import Data.Time (parseTimeOrError, defaultTimeLocale)
import Data.Time.Clock (UTCTime, getCurrentTime)
import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
import Data.Time.Clock (UTCTime(..), getCurrentTime)
import Data.Time.LocalTime (utc)
import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
import Duckling.Api (analyze)
......@@ -38,21 +38,41 @@ import qualified Data.HashSet as HashSet
import qualified Duckling.Core as DC
------------------------------------------------------------------------
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
split :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
split _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
split l (Just txt) = do
utcTime <- parse l txt
let (y, m, d) = split' utcTime
pure (Just utcTime, (Just y, Just m,Just d))
split' :: UTCTime -> (Year, Month, Day)
split' utcTime = (fromIntegral y, m, d)
where
(UTCTime day _) = utcTime
(y,m,d) = toGregorian day
type Year = Int
type Month = Int
type Day = Int
------------------------------------------------------------------------
-- | Date Parser
-- Parses dates mentions in full text given the language.
-- >>> parseDate FR (pack "10 avril 1979 à 19H")
-- 1979-04-10 19:00:00 UTC
-- >>> parseDate EN (pack "April 10 1979")
-- 1979-04-10 00:00:00 UTC
parseDate :: Lang -> Text -> IO UTCTime
parseDate lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
parse :: Lang -> Text -> IO UTCTime
parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
type DateFormat = Text
type DateDefault = Text
parseDate' :: DateFormat -> DateDefault -> Lang -> Text -> IO UTCTime
parseDate' format def lang s = do
dateStr' <- parseDateRaw lang s
dateStr' <- parseRaw lang s
let dateStr = unpack $ maybe def identity
$ head $ splitOn "." dateStr'
pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
......@@ -70,19 +90,19 @@ parserLang EN = DC.EN
-- IO can be avoided here:
-- currentContext :: Lang -> IO Context
-- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
-- parseDateRaw :: Context -> Text -> SomeErrorHandling Text
-- parseRaw :: Context -> Text -> SomeErrorHandling Text
-- TODO error handling
parseDateRaw :: Lang -> Text -> IO (Text)
parseDateRaw lang text = do
parseRaw :: Lang -> Text -> IO (Text)
parseRaw lang text = do
maybeJson <- map jsonValue <$> parseDateWithDuckling lang text
case headMay maybeJson of
Just (Json.Object object) -> case HM.lookup "value" object of
Just (Json.String date) -> pure date
Just _ -> panic "ParseDateRaw ERROR: should be a json String"
Nothing -> panic $ "ParseDateRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
Just _ -> panic "ParseRaw ERROR: should be a json String"
Nothing -> panic $ "ParseRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
_ -> panic $ "ParseDateRaw ERROR: type error" <> (pack . show) lang <> " " <> text
_ -> panic $ "ParseRaw ERROR: type error" <> (pack . show) lang <> " " <> text
-- | Current Time in DucklingTime format
......
......@@ -28,7 +28,7 @@ import Data.Text (Text, unpack)
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
import System.IO (FilePath)
import Gargantext.Text.Parsers.CSV (CsvDoc(..), writeCsv, headerCsvGargV3)
import Gargantext.Text.Parsers.CSV (CsvDoc(..), writeFile, headerCsvGargV3)
import Data.Vector (fromList)
data Patent = Patent { _patent_title :: Text
......@@ -48,7 +48,7 @@ type FilePathOut = FilePath
json2csv :: FilePathIn -> FilePathOut -> IO ()
json2csv fin fout = do
patents <- maybe (panic "json2csv error") identity <$> readPatents fin
writeCsv fout (headerCsvGargV3, fromList $ map patent2csvDoc patents)
writeFile fout (headerCsvGargV3, fromList $ map patent2csvDoc patents)
patent2csvDoc :: Patent -> CsvDoc
patent2csvDoc (Patent title abstract year _) =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment