Commit f939fb19 authored by Alexandre Delanoë

[TEXTLINE] adding CSV format parser.

parent 8a838c7f
...@@ -38,6 +38,8 @@ import Gargantext.Text.Metrics ...@@ -38,6 +38,8 @@ import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms) import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Text.Parsers.CSV
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..)) import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
...@@ -51,17 +53,22 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..)) ...@@ -51,17 +53,22 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
-} -}
workflow lang path = do data WorkType = CSV | FullText
-- workflow :: Lang (EN|FR) -> WorkType (CSV|FullText) -> FilePath -> Graph
workflow termsLang workType path = do
-- Text <- IO Text <- FilePath -- Text <- IO Text <- FilePath
text <- readFile path contexts <- case workType of
FullText -> splitBy (Sentences 5) <$> readFile path
CSV -> readCsvOn [csv_title, csv_abstract] path
let contexts = splitBy (Sentences 5) text
-- Context :: Text -> [Text] -- Context :: Text -> [Text]
-- Contexts = Paragraphs n | Sentences n | Chars n -- Contexts = Paragraphs n | Sentences n | Chars n
myterms <- extractTerms (Mono lang) contexts myterms <- extractTerms (Mono FR) contexts
-- TermsType = Mono | Multi | MonoMulti
-- myterms # filter (\t -> not . elem t stopList) -- myterms # filter (\t -> not . elem t stopList)
-- # groupBy (Stem|GroupList) -- # groupBy (Stem|GroupList|Ontology)
printDebug "myterms" (sum $ map length myterms) printDebug "myterms" (sum $ map length myterms)
-- Building the map list -- Building the map list
...@@ -75,11 +82,11 @@ workflow lang path = do ...@@ -75,11 +82,11 @@ workflow lang path = do
printDebug "myCooc2" (M.size myCooc2) printDebug "myCooc2" (M.size myCooc2)
-- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 20 ) let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
(InclusionSize 1000 ) (InclusionSize 4000 )
(SampleBins 10 ) (SampleBins 10 )
(Clusters 3 ) (Clusters 3 )
(DefaultValue (-1)) (DefaultValue 0 )
) myCooc2 ) myCooc2
printDebug "myCooc3" $ M.size myCooc3 printDebug "myCooc3" $ M.size myCooc3
...@@ -90,26 +97,25 @@ workflow lang path = do ...@@ -90,26 +97,25 @@ workflow lang path = do
let myCooc4 = toIndex ti myCooc3 let myCooc4 = toIndex ti myCooc3
printDebug "myCooc4" $ M.size myCooc4 printDebug "myCooc4" $ M.size myCooc4
let matCooc = map2mat (-2) (M.size ti) myCooc4 let matCooc = map2mat (0) (M.size ti) myCooc4
printDebug "matCooc" matCooc --printDebug "matCooc" matCooc
pure matCooc
-- Matrix -> Clustering -- Matrix -> Clustering
--let distanceMat = conditional matCooc let distanceMat = conditional matCooc
-- let distanceMat = distributional matCooc -- let distanceMat = distributional matCooc
-- printDebug "distanceMat" $ A.arrayShape distanceMat printDebug "distanceMat" $ A.arrayShape distanceMat
-- printDebug "distanceMat" distanceMat --printDebug "distanceMat" distanceMat
-- --
-- let distanceMap = mat2map distanceMat let distanceMap = mat2map distanceMat
-- printDebug "distanceMap" $ M.size distanceMap printDebug "distanceMap" $ M.size distanceMap
--{- --{-
-- let distance = fromIndex fi distanceMap -- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance -- printDebug "distance" $ M.size distance
---} ---}
-- partitions <- cLouvain distanceMap partitions <- cLouvain distanceMap
------ | Building : -> Graph -> JSON ------ | Building : -> Graph -> JSON
-- printDebug "partitions" $ length partitions printDebug "partitions" $ length partitions
-- pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions --printDebug "partitions" partitions
pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
----------------------------------------------------------- -----------------------------------------------------------
......
...@@ -82,7 +82,10 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore ...@@ -82,7 +82,10 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m') foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty selection M.empty selection
where where
selection = [(x,y) | x <- ts, y <- ts, x > y] selection = [(x,y) | x <- ts
, y <- ts
-- , x >= y
]
-- | Map list creation -- | Map list creation
......
...@@ -12,7 +12,6 @@ CSV parser for Gargantext corpus files. ...@@ -12,7 +12,6 @@ CSV parser for Gargantext corpus files.
-} -}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveGeneric #-}
module Gargantext.Text.Parsers.CSV where module Gargantext.Text.Parsers.CSV where
...@@ -25,7 +24,7 @@ import Control.Applicative ...@@ -25,7 +24,7 @@ import Control.Applicative
import Data.Char (ord) import Data.Char (ord)
import Data.Csv import Data.Csv
import Data.Either (Either(Left, Right)) import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length) import Data.Text (Text, pack, length, intercalate)
import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy as BL
import Data.Vector (Vector) import Data.Vector (Vector)
...@@ -68,9 +67,8 @@ fromDocs docs = V.map fromDocs' docs ...@@ -68,9 +67,8 @@ fromDocs docs = V.map fromDocs' docs
-- | Split a document in its context -- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average -- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc = let docSize = (length $ c_abstract doc) in splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
if docSize > 1000 if docSize > 1000
then then
if (mod (round m) docSize) >= 10 if (mod (round m) docSize) >= 10
...@@ -101,18 +99,18 @@ type Mean = Double ...@@ -101,18 +99,18 @@ type Mean = Double
docsSize :: Vector CsvDoc -> Mean docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls docsSize csvDoc = mean ls
where where
ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
--------------------------------------------------------------- ---------------------------------------------------------------
data CsvDoc = CsvDoc data CsvDoc = CsvDoc
{ c_title :: !Text { csv_title :: !Text
, c_source :: !Text , csv_source :: !Text
, c_publication_year :: !Int , csv_publication_year :: !Int
, c_publication_month :: !Int , csv_publication_month :: !Int
, c_publication_day :: !Int , csv_publication_day :: !Int
, c_abstract :: !Text , csv_abstract :: !Text
, c_authors :: !Text , csv_authors :: !Text
} }
deriving (Show) deriving (Show)
...@@ -148,6 +146,14 @@ csvEncodeOptions = ( defaultEncodeOptions ...@@ -148,6 +146,14 @@ csvEncodeOptions = ( defaultEncodeOptions
) )
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Read the CSV file at @fp@ and produce one 'Text' per document.
--
-- Each accessor in @fields@ is applied to the document, and the resulting
-- values are joined with a single space, in the order the accessors are given.
-- The header returned by 'readCsv' is discarded.
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = do
  parsed <- readCsv fp
  pure (V.toList (V.map joinFields (snd parsed)))
  where
    -- Concatenate the selected fields of one document, space-separated.
    joinFields doc = intercalate (pack " ") (map ($ doc) fields)
------------------------------------------------------------------------
readCsv :: FilePath -> IO (Header, Vector CsvDoc) readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do readCsv fp = do
csvData <- BL.readFile fp csvData <- BL.readFile fp
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment