Commit f22f6115 authored by Alexandre Delanoë

[PARSING] CSV GARG V3 (is back).

parent e8b08aeb
Pipeline #515 failed with stage
......@@ -52,7 +52,7 @@ import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import qualified Gargantext.Text.Corpus.Parsers.Date as Date
import Gargantext.Text.Corpus.Parsers.CSV (parseHal)
import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseCsv)
import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------
......@@ -88,6 +88,7 @@ parseFormat = undefined
-- TODO: to debug maybe add the filepath in error message
-- | Parse the file at the given path according to its declared 'FileFormat'.
--
-- The two CSV formats are handed to their dedicated parsers ('parseHal' and
-- 'parseCsv'). Every other format is read with 'readFileWith', passed through
-- 'enrichWith', and each entry of the second component of the enriched result
-- is converted with 'toDoc'. 'RisPresse' is read and converted as plain 'RIS';
-- only its enrichment step is specific to it.
parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseFile ff p = case ff of
  CsvHalFormat -> parseHal p
  CsvGargV3    -> parseCsv p
  RisPresse    -> parseWith RIS RisPresse
  _            -> parseWith ff ff
  where
    -- Read and convert the file as @docFormat@, enriching it as @enrichFormat@.
    parseWith docFormat enrichFormat =
      readFileWith docFormat p
        >>= mapM (toDoc docFormat) . snd . enrichWith enrichFormat
......
......@@ -101,7 +101,6 @@ fromDocs docs = V.map fromDocs' docs
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
if docSize > 1000
......@@ -113,22 +112,21 @@ splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
V.fromList [doc]
else
V.fromList [doc]
-- | Cut one 'CsvDoc' into several documents by splitting its abstract with
-- 'splitBy' and the given context size.
--
-- The first resulting document keeps the original title and receives the
-- first chunk as its abstract. Every following chunk becomes a document whose
-- title is the chunk's first sentence and whose abstract is made of the
-- remaining sentences. Source, publication date and authors are copied
-- unchanged into every document.
--
-- NOTE(review): 'head'' and 'tail'' are given a label; presumably they fail
-- with that label when the list is empty -- confirm against their definition.
splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) =
  V.fromList (leadingDoc : followingDocs)
  where
    chunks        = splitBy contextSize abst
    leadingDoc    = CsvDoc t s py pm pd (head' "splitDoc'1" chunks) auth
    followingDocs = map chunkToDoc (tail' "splitDoc'2" chunks)

    -- Promote the first sentence of a chunk to title, keep the rest as abstract.
    chunkToDoc chunk =
      let chunkSentences = sentences chunk
      in  CsvDoc (head' "splitDoc'2" chunkSentences)
                 s py pm pd
                 (unsentences $ tail' "splitDoc'1" chunkSentences)
                 auth
where
-- Local helper: cut one CsvDoc into several documents by splitting its
-- abstract with splitBy and the given context size.
splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
where
-- The first document keeps the original title; its abstract is the first chunk.
firstDoc = CsvDoc t s py pm pd firstAbstract auth
firstAbstract = head' "splitDoc'1" abstracts
-- Each remaining chunk becomes a document: its first sentence is used as the
-- title and the remaining sentences as the abstract. Source, date fields and
-- authors are copied from the original document.
nextDocs = map (\txt -> CsvDoc
(head' "splitDoc'2" $ sentences txt)
s py pm pd
(unsentences $ tail' "splitDoc'1" $ sentences txt)
auth
) (tail' "splitDoc'2" abstracts)
-- Chunks of the original abstract.
-- NOTE(review): head'/tail' presumably fail with their label on an empty
-- list, so an empty result from splitBy would abort here -- confirm.
abstracts = (splitBy $ contextSize) abst
---------------------------------------------------------------
---------------------------------------------------------------
......@@ -196,8 +194,8 @@ delimiter :: Word8
delimiter = fromIntegral $ ord '\t'
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = V.toList
-- | Read a CSV file and return one 'Text' per document, obtained by applying
-- every given field accessor to the document and joining the results with a
-- single space.
--
-- The header returned by 'readFile' is discarded; only the documents are used.
readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn' fields fp = fmap (V.toList . V.map joinFields . snd) (readFile fp)
  where
    -- Concatenate the selected fields of one document, space-separated.
    joinFields doc = intercalate (pack " ") [field doc | field <- fields]
......@@ -231,6 +229,7 @@ readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
Right csvDocs -> csvDocs
------------------------------------------------------------------------
-- | Read the file at the given path with @BL.readFile@ and decode its content
-- with 'readCsvHalLazyBS', yielding the CSV header and the decoded rows.
--
-- TODO use readFileLazy
readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
readCsvHal fp = readCsvHalLazyBS <$> BL.readFile fp
......@@ -361,8 +360,35 @@ csvHal2doc (CsvHal title source
Nothing
Nothing
-- | Convert one 'CsvDoc' row into a 'HyperdataDocument'.
--
-- 'HyperdataDocument' is built positionally, so the order of the arguments
-- below is significant and must not be changed. Title, authors, source and
-- abstract are copied as they are; the publication date is rendered as text
-- from 'jour' applied to the year, month and day, which are also stored
-- individually. All remaining fields are left as 'Nothing'.
--
-- NOTE(review): the first field is set to "CsvHal", the same tag the HAL
-- converter appears to use. It looks copied from there -- confirm whether
-- documents coming from this CSV format should carry their own tag.
csv2doc :: CsvDoc -> HyperdataDocument
csv2doc (CsvDoc docTitle docSource year month day docAbstract docAuthors) =
  HyperdataDocument
    (Just "CsvHal")
    Nothing
    Nothing
    Nothing
    Nothing
    Nothing
    (Just docTitle)
    (Just docAuthors)
    Nothing
    (Just docSource)
    (Just docAbstract)
    (Just publicationDate)
    (Just (fromIntegral year))
    (Just month)
    (Just day)
    Nothing
    Nothing
    Nothing
    Nothing
  where
    -- Textual rendering of the calendar day built from the three date fields.
    publicationDate = pack (show (jour (fromIntegral year) month day))
------------------------------------------------------------------------
-- | Read a HAL CSV file and convert every row into a 'HyperdataDocument'.
--
-- The header returned by 'readCsvHal' is discarded; each decoded row is
-- converted with 'csvHal2doc' and the rows are returned as a list in the
-- order they appear in the vector.
--
-- A single equation is kept: the previous text carried two equations with the
-- same pattern, so the second one could never be selected.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = fmap (V.toList . V.map csvHal2doc . snd) (readCsvHal fp)
------------------------------------------------------------------------
-- | Read a CSV file and convert every row into a 'HyperdataDocument' with
-- 'csv2doc'.
--
-- The result of 'readFile' is used as a pair whose second component is a
-- vector of rows, so 'readFile' here is this module's CSV reader rather than
-- a plain text reader; its first component (the header) is discarded.
parseCsv :: FilePath -> IO [HyperdataDocument]
parseCsv fp = fmap (V.toList . V.map csv2doc . snd) (readFile fp)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment