Commit f22f6115 authored by Alexandre Delanoë

[PARSING] CSV GARG V3 (is back).

parent e8b08aeb
...@@ -52,7 +52,7 @@ import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS ...@@ -52,7 +52,7 @@ import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich) import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import qualified Gargantext.Text.Corpus.Parsers.Date as Date import qualified Gargantext.Text.Corpus.Parsers.Date as Date
import Gargantext.Text.Corpus.Parsers.CSV (parseHal) import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseCsv)
import Gargantext.Text.Terms.Stop (detectLang) import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -88,6 +88,7 @@ parseFormat = undefined ...@@ -88,6 +88,7 @@ parseFormat = undefined
-- TODO: to debug maybe add the filepath in error message -- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument] parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseFile CsvHalFormat p = parseHal p parseFile CsvHalFormat p = parseHal p
parseFile CsvGargV3 p = parseCsv p
parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
......
...@@ -101,7 +101,6 @@ fromDocs docs = V.map fromDocs' docs ...@@ -101,7 +101,6 @@ fromDocs docs = V.map fromDocs' docs
--------------------------------------------------------------- ---------------------------------------------------------------
-- | Split a document in its context -- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average -- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
if docSize > 1000 if docSize > 1000
...@@ -113,22 +112,21 @@ splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in ...@@ -113,22 +112,21 @@ splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
V.fromList [doc] V.fromList [doc]
else else
V.fromList [doc] V.fromList [doc]
where
splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs where
where firstDoc = CsvDoc t s py pm pd firstAbstract auth
firstDoc = CsvDoc t s py pm pd firstAbstract auth firstAbstract = head' "splitDoc'1" abstracts
firstAbstract = head' "splitDoc'1" abstracts
nextDocs = map (\txt -> CsvDoc
nextDocs = map (\txt -> CsvDoc (head' "splitDoc'2" $ sentences txt)
(head' "splitDoc'2" $ sentences txt) s py pm pd
s py pm pd (unsentences $ tail' "splitDoc'1" $ sentences txt)
(unsentences $ tail' "splitDoc'1" $ sentences txt) auth
auth ) (tail' "splitDoc'2" abstracts)
) (tail' "splitDoc'2" abstracts)
abstracts = (splitBy $ contextSize) abst
abstracts = (splitBy $ contextSize) abst
--------------------------------------------------------------- ---------------------------------------------------------------
--------------------------------------------------------------- ---------------------------------------------------------------
...@@ -196,8 +194,8 @@ delimiter :: Word8 ...@@ -196,8 +194,8 @@ delimiter :: Word8
delimiter = fromIntegral $ ord '\t' delimiter = fromIntegral $ ord '\t'
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text] readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = V.toList readCsvOn' fields fp = V.toList
<$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields) <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
<$> snd <$> snd
<$> readFile fp <$> readFile fp
...@@ -231,6 +229,7 @@ readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of ...@@ -231,6 +229,7 @@ readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
Right csvDocs -> csvDocs Right csvDocs -> csvDocs
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO use readFileLazy -- | TODO use readFileLazy
readCsvHal :: FilePath -> IO (Header, Vector CsvHal) readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
readCsvHal = fmap readCsvHalLazyBS . BL.readFile readCsvHal = fmap readCsvHalLazyBS . BL.readFile
...@@ -361,8 +360,35 @@ csvHal2doc (CsvHal title source ...@@ -361,8 +360,35 @@ csvHal2doc (CsvHal title source
Nothing Nothing
Nothing Nothing
-- | Build a 'HyperdataDocument' from one 'CsvDoc' row.
--
-- Title, authors, source, abstract and the three publication-date
-- components are copied over, and a textual date is rendered with 'show'
-- from the value returned by 'jour'.  Every remaining positional field of
-- the constructor is set to 'Nothing'.
--
-- NOTE(review): the first field is the literal "CsvHal" although this
-- converter handles 'CsvDoc' rows — confirm the tag is intended.
csv2doc :: CsvDoc -> HyperdataDocument
csv2doc (CsvDoc docTitle docSource year month day docAbstract docAuthors) =
  HyperdataDocument
    (Just "CsvHal")
    Nothing
    Nothing
    Nothing
    Nothing
    Nothing
    (Just docTitle)
    (Just docAuthors)
    Nothing
    (Just docSource)
    (Just docAbstract)
    (Just dateText)
    (Just (fromIntegral year))
    (Just month)
    (Just day)
    Nothing
    Nothing
    Nothing
    Nothing
  where
    -- Textual form of the publication date, as produced by 'show'.
    dateText = pack (show (jour (fromIntegral year) month day))
------------------------------------------------------------------------ ------------------------------------------------------------------------
parseHal :: FilePath -> IO [HyperdataDocument] parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readCsvHal fp parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Load the file at @fp@ with the 'readFile' in scope, keep the second
-- component of the pair it returns (a vector of 'CsvDoc' rows), convert each
-- row with 'csv2doc', and return the converted documents as a list.
parseCsv :: FilePath -> IO [HyperdataDocument]
parseCsv fp = V.toList . V.map csv2doc . snd <$> readFile fp
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment