[PARSING] CSV GARG V3 (is back).

f22f6115 · Alexandre Delanoë · e8b08aeb · f22f6115 · f22f6115
Commit f22f6115 authored Jul 09, 2019 by Alexandre Delanoë
Hide whitespace changes
Inline Side-by-side

Showing with 48 additions and 21 deletions

Parsers.hs src/Gargantext/Text/Corpus/Parsers.hs +2 -1

CSV.hs src/Gargantext/Text/Corpus/Parsers/CSV.hs +46 -20

No files found.
--- a/src/Gargantext/Text/Corpus/Parsers.hs
+++ b/src/Gargantext/Text/Corpus/Parsers.hs
@@ -52,7 +52,7 @@ import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
 import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
 import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
 import qualified Gargantext.Text.Corpus.Parsers.Date as Date
-import Gargantext.Text.Corpus.Parsers.CSV (parseHal)
+import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseCsv)
 import Gargantext.Text.Terms.Stop (detectLang)
 ------------------------------------------------------------------------
@@ -88,6 +88,7 @@ parseFormat = undefined
 -- | Parse the file at the given path into a list of documents, dispatching
 -- on the declared 'FileFormat'.
 --
 -- The two CSV formats are delegated to 'parseHal' and 'parseCsv' from the
 -- CSV parser module; every other format goes through the generic
 -- @readFileWith@ \/ @enrichWith@ \/ @toDoc@ pipeline.
 --
 -- TODO: to debug maybe add the filepath in error message
 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
 parseFile CsvHalFormat p = parseHal p
+parseFile CsvGargV3 p = parseCsv p
 -- RisPresse files are read and converted as plain RIS; only the enrichment
 -- step is called with RisPresse.
 parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 -- NOTE(review): this equation is textually the catch-all below with
 -- ff = WOS, so it looks redundant — confirm before removing.
 parseFile WOS       p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 parseFile ff        p = join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff p

--- a/src/Gargantext/Text/Corpus/Parsers/CSV.hs
+++ b/src/Gargantext/Text/Corpus/Parsers/CSV.hs
@@ -101,7 +101,6 @@ fromDocs docs = V.map fromDocs' docs
 ---------------------------------------------------------------
 -- | Split a document in its context
 -- TODO adapt the size of the paragraph according to the corpus average
 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
                          if docSize > 1000
@@ -113,22 +112,21 @@ splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
                                  V.fromList [doc]
                            else
                              V.fromList [doc]
+  where
+    splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
-splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
+    splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
-splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
+        where
-    where
+          firstDoc = CsvDoc t s py pm pd firstAbstract auth
-      firstDoc = CsvDoc t s py pm pd firstAbstract auth
+          firstAbstract = head' "splitDoc'1" abstracts
-      firstAbstract = head' "splitDoc'1" abstracts
+          nextDocs = map (\txt -> CsvDoc
-      nextDocs = map (\txt -> CsvDoc
+                                    (head' "splitDoc'2" $ sentences txt)
-                                (head' "splitDoc'2" $ sentences txt)
+                                    s py pm pd 
-                                s py pm pd 
+                                    (unsentences $ tail' "splitDoc'1" $ sentences txt)
-                                (unsentences $ tail' "splitDoc'1" $ sentences txt)
+                                    auth
-                                auth
+                          ) (tail' "splitDoc'2" abstracts)
-                      ) (tail' "splitDoc'2" abstracts)
+          abstracts    = (splitBy $ contextSize) abst
-      abstracts    = (splitBy $ contextSize) abst
 ---------------------------------------------------------------
 ---------------------------------------------------------------
@@ -196,8 +194,8 @@ delimiter :: Word8
 delimiter = fromIntegral $ ord '\t'
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
 -- | Read the CSV file at @fp@ and build one 'Text' per row.
 --
 -- Each accessor in @fields@ is applied to the row, in the order given, and
 -- the results are joined with a single space. Only the second component of
 -- the pair returned by @readFile@ is used; the first one (presumably the
 -- CSV header — confirm against @readFile@) is dropped.
-readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
+readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
-readCsvOn fields fp = V.toList
+readCsvOn' fields fp = V.toList
                   <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
                   <$> snd
                   <$> readFile fp
@@ -231,6 +229,7 @@ readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
      Right csvDocs -> csvDocs
 ------------------------------------------------------------------------
 -- | Load the file at the given path with @BL.readFile@ and decode its
 -- contents through 'readCsvHalLazyBS', yielding the header together with
 -- the decoded 'CsvHal' rows.
 -- TODO use readFileLazy
 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
 readCsvHal path = readCsvHalLazyBS <$> BL.readFile path
@@ -361,8 +360,35 @@ csvHal2doc (CsvHal title source
                               Nothing
                               Nothing
+-- | Convert one 'CsvDoc' row (Gargantext V3 CSV) into a 'HyperdataDocument'.
+--
+-- Title, authors, source and abstract are copied from the row. The date
+-- text is the 'show'n result of @jour@ applied to year, month and day, and
+-- the three date components are also stored individually. Every remaining
+-- field is left as 'Nothing'.
+csv2doc :: CsvDoc -> HyperdataDocument
+csv2doc (CsvDoc title source
+       pub_year pub_month pub_day
+       abstract authors ) = HyperdataDocument
+                               -- Origin tag. It previously read "CsvHal",
+                               -- which mislabels rows produced by the
+                               -- CsvGargV3 parser.
+                               -- NOTE(review): assumes this first field is
+                               -- the source tag — confirm against the
+                               -- 'HyperdataDocument' declaration.
+                               (Just "CsvGargV3")
+                               Nothing
+                               Nothing
+                               Nothing
+                               Nothing
+                               Nothing
+                               (Just title)
+                               (Just authors)
+                               Nothing
+                               (Just source)
+                               (Just abstract)
+                               -- Full date rendered as text via 'show'.
+                               (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
+                               (Just $ fromIntegral pub_year)
+                               (Just pub_month)
+                               (Just pub_day)
+                               Nothing
+                               Nothing
+                               Nothing
+                               Nothing
 ------------------------------------------------------------------------
 -- | Read the file at @fp@ with 'readCsvHal', drop the header, and convert
 -- every row with 'csvHal2doc'. The conversion runs on the 'Vector' before
 -- it is turned into a list.
 parseHal :: FilePath -> IO [HyperdataDocument]
-parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readCsvHal fp
+parseHal fp = V.toList . V.map csvHal2doc . snd <$> readCsvHal fp
 ------------------------------------------------------------------------
+-- | Read the file at @fp@, keep the second component of the result, and
+-- convert every row with 'csv2doc' before turning the 'Vector' into a list.
+--
+-- NOTE(review): @readFile@ is whatever this module has in scope; since its
+-- result is a pair it is presumably the module's own CSV reader rather than
+-- the Prelude one — confirm.
+parseCsv :: FilePath -> IO [HyperdataDocument]
+parseCsv fp = V.toList . V.map csv2doc . snd <$> readFile fp