[Pipeline] ok until clustering.

0be01d72 · Alexandre Delanoë · 4b81f9d1 · 0be01d72 · 0be01d72 · 0be01d72
Commit 0be01d72 authored May 30, 2018 by Alexandre Delanoë
6 changed files
--- a/src/Gargantext/Pipeline.hs
+++ b/src/Gargantext/Pipeline.hs
@@ -15,35 +15,29 @@ Portability : POSIX
 module Gargantext.Pipeline
  where

-import Data.Text (unpack)
-import qualified Data.Text as DT
-
 import Data.Text.IO (readFile)

 ----------------------------------------------
 ----------------------------------------------
-
 import Gargantext.Core
-import Gargantext.Core.Types
 import Gargantext.Prelude

-import Gargantext.Viz.Graph.Index (map', createIndexes)
-import Gargantext.Viz.Graph.Distances.Matrice (distributional, int2double)
+import Gargantext.Viz.Graph.Index (score)
+import Gargantext.Viz.Graph.Distances.Matrice (distributional)
 import Gargantext.Text.Metrics.Occurrences
 import Gargantext.Text.Terms
 import Gargantext.Text.Context

-import Data.Array.Accelerate as A

-pipeline pth = do
-  text     <- readFile pth
-  let contexts = splitBy Sentences 4 text
-  myterms <- mapM (terms Multi FR) contexts
-  -- todo filter stop words
+pipeline path = do
+  -- Text  <- IO Text <- FilePath
+  text     <- readFile path  
+  let contexts = splitBy (Sentences 3) text
+  myterms <- extractTerms Multi FR contexts
+  -- TODO    filter (\t -> not . elem t stopList) myterms
+  -- TODO    groupBy (Stem | GroupList)
  let myCooc = removeApax $ cooc myterms
-  --pure myCooc
-  -- Cooc map -> Matrix
-  --pure $ createIndexes myCooc
-  pure $ map' int2double myCooc
-  -- Matrix -> Graph
+  -- Cooc -> Matrix
+  pure $ score distributional myCooc
+  -- Matrix -> Clustering -> Graph -> JSON

--- a/src/Gargantext/Text/Context.hs
+++ b/src/Gargantext/Text/Context.hs
@@ -14,26 +14,34 @@ Context of text management tool
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Text.Context where
+module Gargantext.Text.Context
+  where

 import Data.Text (Text, pack, unpack, length)
 import Data.String (IsString)

-import Text.HTML.TagSoup
+import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
 import Gargantext.Text
 import Gargantext.Prelude hiding (length)

-data SplitBy = Paragraph | Sentences | Chars
-

-splitBy :: SplitBy -> Int -> Text -> [Text]
-splitBy Chars     n = map pack . chunkAlong n n . unpack
-splitBy Sentences n = map unsentences . chunkAlong n n  . sentences
-splitBy Paragraph _ = map removeTag   . filter isTagText . parseTags
+data SplitContext = Chars Int | Sentences Int | Paragraphs Int
+
+tag = parseTags
+-- | splitBy contexts of Chars or Sentences or Paragraphs
+-- >> splitBy (Chars 0) "abcde"
+-- ["a","b","c","d","e"]
+-- >> splitBy (Chars 1) "abcde"
+-- ["ab","bc","cd","de"]
+-- >> splitBy (Chars 2) "abcde"
+-- ["abc","bcd","cde"]
+splitBy :: SplitContext -> Text -> [Text]
+splitBy (Chars     n)  = map pack        . chunkAlong (n+1) 1 . unpack
+splitBy (Sentences n)  = map unsentences . chunkAlong (n+1) 1 . sentences
+splitBy (Paragraphs _) = map unTag       . filter isTagText   . tag
  where
-    removeTag :: IsString p => Tag p -> p
-    removeTag (TagText x) = x
-    removeTag (TagComment x) = x
-    removeTag _          = ""
+    unTag :: IsString p => Tag p -> p
+    unTag (TagText x) = x
+    unTag _           = ""


--- a/src/Gargantext/Text/Parsers/CSV.hs
+++ b/src/Gargantext/Text/Parsers/CSV.hs
@@ -50,13 +50,13 @@ data Doc = Doc
    deriving (Show)
 ---------------------------------------------------------------
 toDocs :: Vector CsvDoc -> [Doc]
-toDocs v = V.toList 
+toDocs v = V.toList
         $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
                       -> Doc nId t s py pm pd abst auth )
                       (V.enumFromN 1 (V.length v'')) v''
          where
            v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
-            seps= (V.fromList [Paragraph, Sentences, Chars])
+            seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])

 ---------------------------------------------------------------
 fromDocs :: Vector Doc -> Vector CsvDoc
@@ -69,7 +69,7 @@ fromDocs docs = V.map fromDocs' docs
 -- TODO adapt the size of the paragraph according to the corpus average


-splitDoc :: Mean -> SplitBy -> CsvDoc -> Vector CsvDoc
+splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
                          if docSize > 1000
                            then
@@ -82,15 +82,15 @@ splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
                              V.fromList [doc]


-splitDoc' :: SplitBy -> CsvDoc -> Vector CsvDoc
-splitDoc' splt (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
+splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
+splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
    where
      firstDoc = CsvDoc t s py pm pd firstAbstract auth
      firstAbstract = head' abstracts
      
      nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
      
-      abstracts    = (splitBy splt 20) abst
+      abstracts    = (splitBy $ contextSize) abst
      head' x = maybe ""   identity (head x)
      tail' x = maybe [""] identity (tailMay x)


--- a/src/Gargantext/Text/Terms.hs
+++ b/src/Gargantext/Text/Terms.hs
@@ -34,6 +34,7 @@ module Gargantext.Text.Terms
  where

 import Data.Text (Text)
+import Data.Traversable

 import Gargantext.Prelude
 import Gargantext.Core
@@ -46,6 +47,9 @@ data TermType = Mono | Multi
 -- remove Stop Words
 -- map (filter (\t -> not . elem t)) $ 
 ------------------------------------------------------------------------
+extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms])
+extractTerms termType lang = mapM (terms termType lang)
+------------------------------------------------------------------------
 terms :: TermType -> Lang -> Text -> IO [Terms]
 terms Mono  lang txt = pure $ monoterms' lang txt
 terms Multi lang txt = multiterms lang txt

--- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs
+++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs
@@ -36,10 +36,8 @@ Implementation use Accelerate library :
 module Gargantext.Viz.Graph.Distances.Matrice
  where

--import Data.Array.Accelerate.Data.Bits
-import Data.Array.Accelerate.Interpreter (run)
-
 import Data.Array.Accelerate
+import Data.Array.Accelerate.Interpreter (run)
 import Data.Array.Accelerate.Smart
 import Data.Array.Accelerate.Type
 import Data.Array.Accelerate.Array.Sugar (fromArr, Array, Z)
@@ -94,14 +92,7 @@ type SpecificityGenericity = Double
 conditional :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 conditional m = (run $ ie (use m), run $ sg (use m))
  where
-    r :: Rank
-    r = rank' m

-    xs :: Matrix' Double -> Matrix' Double
-    xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
-    ys :: Acc (Matrix Double) -> Acc (Matrix Double)
-    ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)
-    
    ie :: Matrix' Double -> Matrix' Double
    ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
    sg :: Acc (Matrix Double) -> Acc (Matrix Double)
@@ -109,7 +100,14 @@ conditional m = (run $ ie (use m), run $ sg (use m))

    n :: Exp Double
    n = P.fromIntegral r
-    
+
+    r :: Rank
+    r = rank' m
+
+    xs :: Matrix' Double -> Matrix' Double
+    xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
+    ys :: Acc (Matrix Double) -> Acc (Matrix Double)
+    ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)

 -- filter with threshold
 -----------------------------------------------------------------------
@@ -121,7 +119,9 @@ distributional m = run $ filter $ ri (map fromIntegral $ use m)
  where
    n    = rank' m
    
-    miniMax m = map (\x -> ifThenElse (x > (the $ minimum $ maximum m)) x 0) m
+    miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
+      where
+        miniMax' = (the $ minimum $ maximum m)
    
    filter  m = zipWith (\a b -> max a b) m (transpose m)
    

--- a/src/Gargantext/Viz/Graph/Index.hs
+++ b/src/Gargantext/Viz/Graph/Index.hs
@@ -43,28 +43,12 @@ import Gargantext.Prelude

 type Index    = Int

-
 -------------------------------------------------------------------------------
-{-
-map'' :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
-                -> Map (t, t) Int
-                -> Map (t, t) Double
-map'' f m = back . f' . from m
-  where
-    from (fs, m') = unzip $ M.toAscList m
-    f'  = f $ A.fromList shape m'
-    shape = (A.Z A.:. n A.:. n)
-    back = M.fromAscList . zip fs . A.toList
-}
 -------------------------------------------------------------------------------
-map' :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
+score :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
                -> Map (t, t) Int
                -> Map (t, t) Double
-map' f m = fromIndex fromI . mat2cooc . f $ cooc2mat toI m
-  where
-    (toI, fromI) = createIndexes m
-
-map'' m = cooc2mat toI m
+score f m = fromIndex fromI . mat2map . f $ cooc2mat toI m
  where
    (toI, fromI) = createIndexes m

@@ -81,10 +65,9 @@ map2mat def n m = A.fromFunction shape (\(Z :. x :. y) -> fromMaybe def $ M.look
  where
    shape = (Z :. n :. n)

-- TODO rename mat2map
-mat2cooc :: (Elt a, Shape (Z :. Index)) =>
+mat2map :: (Elt a, Shape (Z :. Index)) =>
            A.Array (Z :. Index :. Index) a -> Map (Index, Index) a
-mat2cooc m = M.fromList . map f . A.toList . A.run . A.indexed $ A.use m
+mat2map m = M.fromList . map f . A.toList . A.run . A.indexed $ A.use m
  where
    Z :. _ :. n = A.arrayShape m
    f ((Z :. i :. j), x) = ((i, j), x)