[TEXTLINE] adding CSV format parser.

f939fb19 · Alexandre Delanoë · 8a838c7f · f939fb19 · f939fb19 · f939fb19
Commit f939fb19 authored Jun 13, 2018 by Alexandre Delanoë
Show whitespace changes
Inline Side-by-side

Showing with 50 additions and 35 deletions

Pipeline.hs src/Gargantext/Pipeline.hs +27 -21

Metrics.hs src/Gargantext/Text/Metrics.hs +4 -1

CSV.hs src/Gargantext/Text/Parsers/CSV.hs +19 -13

No files found.
--- a/src/Gargantext/Pipeline.hs
+++ b/src/Gargantext/Pipeline.hs
@@ -38,6 +38,8 @@ import Gargantext.Text.Metrics
 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
+import Gargantext.Text.Parsers.CSV
 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
@@ -51,17 +53,22 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
 -}
-workflow lang path = do
+data WorkType = CSV | FullText
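+-- workflow :: Lang (EN|FR) -> WorkType (CSV|FullText) -> FilePath -> Graph
+-- NOTE(review): in the hunks shown here termsLang is never used; extractTerms
+-- is called with (Mono FR) -- confirm whether (Mono termsLang) was intended.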
+workflow termsLang workType path = do
  -- Text  <- IO Text <- FilePath
-  text     <- readFile path
+  contexts <- case workType of
+                FullText -> splitBy (Sentences 5) <$> readFile path
+                CSV      -> readCsvOn [csv_title, csv_abstract] path
-  let contexts = splitBy (Sentences 5) text
  -- Context :: Text -> [Text]
  -- Contexts = Paragraphs n | Sentences n | Chars n
-  myterms <- extractTerms (Mono lang) contexts
+  myterms <- extractTerms (Mono FR) contexts
+  -- TermType = Mono | Multi | MonoMulti
  -- myterms # filter (\t -> not . elem t stopList)
-  --         # groupBy (Stem|GroupList)
+  --         # groupBy (Stem|GroupList|Ontology)
  printDebug "myterms" (sum $ map length myterms)
  -- Building the map list
@@ -75,11 +82,11 @@ workflow lang path = do
  printDebug "myCooc2" (M.size myCooc2)
  -- Filtering terms with Inclusion/Exclusion and Specificity/Genericity scores
-  let myCooc3 = filterCooc ( FilterConfig (MapListSize     20 )
+  let myCooc3 = filterCooc ( FilterConfig (MapListSize   1000 )
-                                          (InclusionSize 1000 )
+                                          (InclusionSize 4000 )
                                          (SampleBins      10 )
                                          (Clusters         3 )
-                                          (DefaultValue   (-1))
+                                          (DefaultValue     0 )
                           ) myCooc2
  printDebug "myCooc3" $ M.size myCooc3
@@ -90,26 +97,25 @@ workflow lang path = do
  let myCooc4 = toIndex ti myCooc3
  printDebug "myCooc4" $ M.size myCooc4
-  let matCooc = map2mat (-2) (M.size ti) myCooc4
+  let matCooc = map2mat (0) (M.size ti) myCooc4
-  printDebug "matCooc" matCooc
+  --printDebug "matCooc" matCooc
-  pure matCooc
  -- Matrix -> Clustering
-  --let distanceMat = conditional matCooc
+  let distanceMat = conditional matCooc
 --  let distanceMat = distributional matCooc
--  printDebug "distanceMat" $ A.arrayShape distanceMat
+  printDebug "distanceMat" $ A.arrayShape distanceMat
--  printDebug "distanceMat" distanceMat
+  --printDebug "distanceMat" distanceMat
 -- 
--  let distanceMap = mat2map distanceMat
+  let distanceMap = mat2map distanceMat
--  printDebug "distanceMap" $ M.size distanceMap
+  printDebug "distanceMap" $ M.size distanceMap
 --{-
 --  let distance = fromIndex fi distanceMap
 --  printDebug "distance" $ M.size distance
 ---}
--  partitions <- cLouvain distanceMap
+  partitions <- cLouvain distanceMap
 ------ | Building : -> Graph -> JSON
--  printDebug "partitions" $ length partitions
+  printDebug "partitions" $ length partitions
--  pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
+  --printDebug "partitions" partitions
+  pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
 -----------------------------------------------------------

--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -82,7 +82,10 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore
  foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
    M.empty selection
  where
-    selection  = [(x,y) | x <- ts, y <- ts, x > y]
+    selection  = [(x,y) | x <- ts
+                        , y <- ts
+                       -- , x >= y
+                        ]
 -- | Map list creation

--- a/src/Gargantext/Text/Parsers/CSV.hs
+++ b/src/Gargantext/Text/Parsers/CSV.hs
@@ -12,7 +12,6 @@ CSV parser for Gargantext corpus files.
 -}
 {-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE DeriveGeneric     #-}
 module Gargantext.Text.Parsers.CSV where
@@ -25,7 +24,7 @@ import Control.Applicative
 import Data.Char (ord)
 import Data.Csv
 import Data.Either (Either(Left, Right))
-import Data.Text (Text, pack, length)
+import Data.Text (Text, pack, length, intercalate)
 import qualified Data.ByteString.Lazy as BL
 import Data.Vector (Vector)
@@ -68,9 +67,8 @@ fromDocs docs = V.map fromDocs' docs
 -- | Split a document in its context
 -- TODO adapt the size of the paragraph according to the corpus average
 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
-splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
+splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
                          if docSize > 1000
                            then
                              if (mod (round m) docSize) >= 10
@@ -101,18 +99,18 @@ type Mean = Double
 docsSize :: Vector CsvDoc -> Mean
 docsSize csvDoc = mean ls
  where
-    ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
+    ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 ---------------------------------------------------------------
 data CsvDoc = CsvDoc
-    { c_title  :: !Text
+    { csv_title  :: !Text
-    , c_source :: !Text
+    , csv_source :: !Text
-    , c_publication_year  :: !Int
+    , csv_publication_year  :: !Int
-    , c_publication_month :: !Int
+    , csv_publication_month :: !Int
-    , c_publication_day   :: !Int
+    , csv_publication_day   :: !Int
-    , c_abstract          :: !Text
+    , csv_abstract          :: !Text
-    , c_authors           :: !Text
+    , csv_authors           :: !Text
    }
    deriving (Show)
@@ -148,6 +146,14 @@ csvEncodeOptions = ( defaultEncodeOptions
                    )
+------------------------------------------------------------------------
+------------------------------------------------------------------------
+-- | Read the CSV file at the given path and produce one 'Text' per document:
+-- every selector in @fields@ is applied to the document and the results are
+-- joined with a single space, in the order the selectors are listed.
+readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
+readCsvOn fields fp = toTexts <$> readCsv fp
+  where
+    -- Keep only the documents (second component of the pair returned by
+    -- readCsv) and render each of them as a single Text.
+    toTexts parsed = V.toList $ V.map joinFields $ snd parsed
+    -- Apply each selector to the document, separating results by one space.
+    joinFields doc = intercalate (pack " ") $ map (\select -> select doc) fields
+------------------------------------------------------------------------
 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 readCsv fp = do
    csvData <- BL.readFile fp