Commit f939fb19 authored by Alexandre Delanoë

[TEXTLINE] adding CSV format parser.

parent 8a838c7f
......@@ -38,6 +38,8 @@ import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Text.Parsers.CSV
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
......@@ -51,17 +53,22 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
-}
workflow lang path = do
data WorkType = CSV | FullText
-- workflow :: Lang (EN|FR) -> FilePath -> Graph
workflow termsLang workType path = do
-- Text <- IO Text <- FilePath
text <- readFile path
contexts <- case workType of
FullText -> splitBy (Sentences 5) <$> readFile path
CSV -> readCsvOn [csv_title, csv_abstract] path
let contexts = splitBy (Sentences 5) text
-- Context :: Text -> [Text]
-- Contexts = Paragraphs n | Sentences n | Chars n
myterms <- extractTerms (Mono lang) contexts
myterms <- extractTerms (Mono FR) contexts
-- TermsType = Mono | Multi | MonoMulti
-- myterms # filter (\t -> not . elem t stopList)
-- # groupBy (Stem|GroupList)
-- # groupBy (Stem|GroupList|Ontology)
printDebug "myterms" (sum $ map length myterms)
-- Building the map list
......@@ -73,13 +80,13 @@ workflow lang path = do
-- Remove hapax: entries that appear only once => lightening the matrix
let myCooc2 = M.filter (>1) myCooc1
printDebug "myCooc2" (M.size myCooc2)
-- Filtering terms with Inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 20 )
(InclusionSize 1000 )
let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
(InclusionSize 4000 )
(SampleBins 10 )
(Clusters 3 )
(DefaultValue (-1))
(DefaultValue 0 )
) myCooc2
printDebug "myCooc3" $ M.size myCooc3
......@@ -90,26 +97,25 @@ workflow lang path = do
let myCooc4 = toIndex ti myCooc3
printDebug "myCooc4" $ M.size myCooc4
let matCooc = map2mat (-2) (M.size ti) myCooc4
printDebug "matCooc" matCooc
pure matCooc
let matCooc = map2mat (0) (M.size ti) myCooc4
--printDebug "matCooc" matCooc
-- Matrix -> Clustering
--let distanceMat = conditional matCooc
let distanceMat = conditional matCooc
-- let distanceMat = distributional matCooc
-- printDebug "distanceMat" $ A.arrayShape distanceMat
-- printDebug "distanceMat" distanceMat
printDebug "distanceMat" $ A.arrayShape distanceMat
--printDebug "distanceMat" distanceMat
--
-- let distanceMap = mat2map distanceMat
-- printDebug "distanceMap" $ M.size distanceMap
let distanceMap = mat2map distanceMat
printDebug "distanceMap" $ M.size distanceMap
--{-
-- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance
---}
-- partitions <- cLouvain distanceMap
partitions <- cLouvain distanceMap
------ | Building : -> Graph -> JSON
-- printDebug "partitions" $ length partitions
-- pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
printDebug "partitions" $ length partitions
--printDebug "partitions" partitions
pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
-----------------------------------------------------------
......
......@@ -82,7 +82,10 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty selection
where
selection = [(x,y) | x <- ts, y <- ts, x > y]
selection = [(x,y) | x <- ts
, y <- ts
-- , x >= y
]
-- | Map list creation
......
......@@ -12,7 +12,6 @@ CSV parser for Gargantext corpus files.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module Gargantext.Text.Parsers.CSV where
......@@ -25,7 +24,7 @@ import Control.Applicative
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length)
import Data.Text (Text, pack, length, intercalate)
import qualified Data.ByteString.Lazy as BL
import Data.Vector (Vector)
......@@ -68,9 +67,8 @@ fromDocs docs = V.map fromDocs' docs
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
if docSize > 1000
then
if (mod (round m) docSize) >= 10
......@@ -101,18 +99,18 @@ type Mean = Double
-- | Mean 'Text' length of the abstracts ('csv_abstract') of the given
-- documents, as computed by 'mean'.
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls
  where
    -- One abstract length per document, converted for 'mean'.
    ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
---------------------------------------------------------------
-- | One document (row) of a CSV corpus file. Every field is strict.
-- NOTE(review): both the @c_*@ and the @csv_*@ field sets are visible in this
-- view; this looks like old and new diff lines interleaved — confirm which
-- set is current before relying on either prefix.
data CsvDoc = CsvDoc
{ c_title :: !Text
, c_source :: !Text
, c_publication_year :: !Int
, c_publication_month :: !Int
, c_publication_day :: !Int
, c_abstract :: !Text
, c_authors :: !Text
{ csv_title :: !Text
, csv_source :: !Text
, csv_publication_year :: !Int
, csv_publication_month :: !Int
, csv_publication_day :: !Int
, csv_abstract :: !Text
, csv_authors :: !Text
}
deriving (Show)
......@@ -148,11 +146,19 @@ csvEncodeOptions = ( defaultEncodeOptions
)
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Read the CSV file at @fp@ with 'readCsv' and build one 'Text' per
-- document: each accessor in @fields@ is applied to the document, in order,
-- and the results are joined with a single space. The header returned by
-- 'readCsv' is discarded.
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = V.toList . V.map joinFields . snd <$> readCsv fp
  where
    -- Project the selected fields out of one document and glue them together.
    joinFields doc = intercalate (pack " ") [field doc | field <- fields]
------------------------------------------------------------------------
-- | Read the file at @fp@ as a lazy 'BL.ByteString' and decode it by column
-- name using 'csvDecodeOptions'.
--
-- Returns the CSV header together with the decoded documents. A decoding
-- failure aborts through 'panic', carrying the decoder's error message.
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
  csvData <- BL.readFile fp
  case decodeByNameWith csvDecodeOptions csvData of
    Left e        -> panic (pack e)
    Right csvDocs -> pure csvDocs
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment