[TEXTLINE] adding CSV format parser.

f939fb19 · Alexandre Delanoë · 8a838c7f · f939fb19 · f939fb19 · f939fb19
Commit f939fb19 authored Jun 13, 2018 by Alexandre Delanoë
Hide whitespace changes
Inline Side-by-side

Showing with 50 additions and 35 deletions

Pipeline.hs src/Gargantext/Pipeline.hs +27 -21

Metrics.hs src/Gargantext/Text/Metrics.hs +4 -1

CSV.hs src/Gargantext/Text/Parsers/CSV.hs +19 -13

No files found.
--- a/src/Gargantext/Pipeline.hs
+++ b/src/Gargantext/Pipeline.hs
@@ -38,6 +38,8 @@ import Gargantext.Text.Metrics
 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))

+import Gargantext.Text.Parsers.CSV
+
 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))


@@ -51,17 +53,22 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))

 -}

-workflow lang path = do
+data WorkType = CSV | FullText
+
+-- workflow :: Lang (EN|FR) -> WorkType (CSV|FullText) -> FilePath -> Graph
+workflow termsLang workType path = do
  -- Text  <- IO Text <- FilePath
-  text     <- readFile path
+  contexts <- case workType of
+                FullText -> splitBy (Sentences 5) <$> readFile path
+                CSV      -> readCsvOn [csv_title, csv_abstract] path

-  let contexts = splitBy (Sentences 5) text
  -- Context :: Text -> [Text]
  -- Contexts = Paragraphs n | Sentences n | Chars n

-  myterms <- extractTerms (Mono lang) contexts
+  myterms <- extractTerms (Mono FR) contexts
+  -- TermsType = Mono | Multi | MonoMulti
  -- myterms # filter (\t -> not . elem t stopList)
-  --         # groupBy (Stem|GroupList)
+  --         # groupBy (Stem|GroupList|Ontology)
  printDebug "myterms" (sum $ map length myterms)

  -- Building the map list
@@ -73,13 +80,13 @@ workflow lang path = do
  -- Remove hapax: terms appearing only once => lightening the matrix
  let myCooc2 = M.filter (>1) myCooc1
  printDebug "myCooc2" (M.size myCooc2)
-  
+
  -- Filtering terms with Inclusion/Exclusion and Specificity/Genericity scores
-  let myCooc3 = filterCooc ( FilterConfig (MapListSize     20 )
-                                          (InclusionSize 1000 )
+  let myCooc3 = filterCooc ( FilterConfig (MapListSize   1000 )
+                                          (InclusionSize 4000 )
                                          (SampleBins      10 )
                                          (Clusters         3 )
-                                          (DefaultValue   (-1))
+                                          (DefaultValue     0 )
                           ) myCooc2
  printDebug "myCooc3" $ M.size myCooc3

@@ -90,26 +97,25 @@ workflow lang path = do
  let myCooc4 = toIndex ti myCooc3
  printDebug "myCooc4" $ M.size myCooc4

-  let matCooc = map2mat (-2) (M.size ti) myCooc4
-  printDebug "matCooc" matCooc
-  pure matCooc
+  let matCooc = map2mat (0) (M.size ti) myCooc4
+  --printDebug "matCooc" matCooc
  -- Matrix -> Clustering
-  --let distanceMat = conditional matCooc
+  let distanceMat = conditional matCooc
 --  let distanceMat = distributional matCooc
--  printDebug "distanceMat" $ A.arrayShape distanceMat
--  printDebug "distanceMat" distanceMat
+  printDebug "distanceMat" $ A.arrayShape distanceMat
+  --printDebug "distanceMat" distanceMat
 -- 
--  let distanceMap = mat2map distanceMat
--  printDebug "distanceMap" $ M.size distanceMap
+  let distanceMap = mat2map distanceMat
+  printDebug "distanceMap" $ M.size distanceMap
 --{-
 --  let distance = fromIndex fi distanceMap
 --  printDebug "distance" $ M.size distance
 ---}
--  partitions <- cLouvain distanceMap
+  partitions <- cLouvain distanceMap
 ------ | Building : -> Graph -> JSON
--  printDebug "partitions" $ length partitions
--  pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
-
+  printDebug "partitions" $ length partitions
+  --printDebug "partitions" partitions
+  pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions


 -----------------------------------------------------------

--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -82,7 +82,10 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore
  foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
    M.empty selection
  where
-    selection  = [(x,y) | x <- ts, y <- ts, x > y]
+    selection  = [(x,y) | x <- ts
+                        , y <- ts
+                       -- , x >= y
+                        ]


 -- | Map list creation

--- a/src/Gargantext/Text/Parsers/CSV.hs
+++ b/src/Gargantext/Text/Parsers/CSV.hs
@@ -12,7 +12,6 @@ CSV parser for Gargantext corpus files.
 -}

 {-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE DeriveGeneric     #-}

 module Gargantext.Text.Parsers.CSV where
@@ -25,7 +24,7 @@ import Control.Applicative
 import Data.Char (ord)
 import Data.Csv
 import Data.Either (Either(Left, Right))
-import Data.Text (Text, pack, length)
+import Data.Text (Text, pack, length, intercalate)
 import qualified Data.ByteString.Lazy as BL

 import Data.Vector (Vector)
@@ -68,9 +67,8 @@ fromDocs docs = V.map fromDocs' docs
 -- | Split a document in its context
 -- TODO adapt the size of the paragraph according to the corpus average

-
 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
-splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
+splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
                          if docSize > 1000
                            then
                              if (mod (round m) docSize) >= 10
@@ -101,18 +99,18 @@ type Mean = Double
 docsSize :: Vector CsvDoc -> Mean
 docsSize csvDoc = mean ls
  where
-    ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
+    ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc


 ---------------------------------------------------------------
 data CsvDoc = CsvDoc
-    { c_title  :: !Text
-    , c_source :: !Text
-    , c_publication_year  :: !Int
-    , c_publication_month :: !Int
-    , c_publication_day   :: !Int
-    , c_abstract          :: !Text
-    , c_authors           :: !Text
+    { csv_title  :: !Text
+    , csv_source :: !Text
+    , csv_publication_year  :: !Int
+    , csv_publication_month :: !Int
+    , csv_publication_day   :: !Int
+    , csv_abstract          :: !Text
+    , csv_authors           :: !Text
    }
    deriving (Show)

@@ -148,11 +146,19 @@ csvEncodeOptions = ( defaultEncodeOptions
                    )


+------------------------------------------------------------------------
+------------------------------------------------------------------------
+readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
+readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
+                      <$> snd
+                      <$> readCsv fp
+
+------------------------------------------------------------------------
 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 readCsv fp = do
    csvData <- BL.readFile fp
    case decodeByNameWith csvDecodeOptions csvData of
-      Left e    -> panic (pack e)
+      Left e        -> panic (pack e)
      Right csvDocs -> pure csvDocs