Commit 0be01d72 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[Pipeline] ok until clustering.

parent 4b81f9d1
...@@ -15,35 +15,29 @@ Portability : POSIX ...@@ -15,35 +15,29 @@ Portability : POSIX
module Gargantext.Pipeline module Gargantext.Pipeline
where where
import Data.Text (unpack)
import qualified Data.Text as DT
import Data.Text.IO (readFile) import Data.Text.IO (readFile)
---------------------------------------------- ----------------------------------------------
---------------------------------------------- ----------------------------------------------
import Gargantext.Core import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (map', createIndexes) import Gargantext.Viz.Graph.Index (score)
import Gargantext.Viz.Graph.Distances.Matrice (distributional, int2double) import Gargantext.Viz.Graph.Distances.Matrice (distributional)
import Gargantext.Text.Metrics.Occurrences import Gargantext.Text.Metrics.Occurrences
import Gargantext.Text.Terms import Gargantext.Text.Terms
import Gargantext.Text.Context import Gargantext.Text.Context
import Data.Array.Accelerate as A
pipeline pth = do pipeline path = do
text <- readFile pth -- Text <- IO Text <- FilePath
let contexts = splitBy Sentences 4 text text <- readFile path
myterms <- mapM (terms Multi FR) contexts let contexts = splitBy (Sentences 3) text
-- todo filter stop words myterms <- extractTerms Multi FR contexts
-- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList)
let myCooc = removeApax $ cooc myterms let myCooc = removeApax $ cooc myterms
--pure myCooc -- Cooc -> Matrix
-- Cooc map -> Matrix pure $ score distributional myCooc
--pure $ createIndexes myCooc -- Matrix -> Clustering -> Graph -> JSON
pure $ map' int2double myCooc
-- Matrix -> Graph
...@@ -14,26 +14,34 @@ Context of text management tool ...@@ -14,26 +14,34 @@ Context of text management tool
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Context where module Gargantext.Text.Context
where
import Data.Text (Text, pack, unpack, length) import Data.Text (Text, pack, unpack, length)
import Data.String (IsString) import Data.String (IsString)
import Text.HTML.TagSoup import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
import Gargantext.Text import Gargantext.Text
import Gargantext.Prelude hiding (length) import Gargantext.Prelude hiding (length)
data SplitBy = Paragraph | Sentences | Chars
splitBy :: SplitBy -> Int -> Text -> [Text] data SplitContext = Chars Int | Sentences Int | Paragraphs Int
splitBy Chars n = map pack . chunkAlong n n . unpack
splitBy Sentences n = map unsentences . chunkAlong n n . sentences tag = parseTags
splitBy Paragraph _ = map removeTag . filter isTagText . parseTags -- | splitBy contexts of Chars or Sentences or Paragraphs
-- >> splitBy (Chars 0) "abcde"
-- ["a","b","c","d","e"]
-- >> splitBy (Chars 1) "abcde"
-- ["ab","bc","cd","de"]
-- >> splitBy (Chars 2) "abcde"
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy (Chars n) = map pack . chunkAlong (n+1) 1 . unpack
splitBy (Sentences n) = map unsentences . chunkAlong (n+1) 1 . sentences
splitBy (Paragraphs _) = map unTag . filter isTagText . tag
where where
removeTag :: IsString p => Tag p -> p unTag :: IsString p => Tag p -> p
removeTag (TagText x) = x unTag (TagText x) = x
removeTag (TagComment x) = x unTag _ = ""
removeTag _ = ""
...@@ -50,13 +50,13 @@ data Doc = Doc ...@@ -50,13 +50,13 @@ data Doc = Doc
deriving (Show) deriving (Show)
--------------------------------------------------------------- ---------------------------------------------------------------
toDocs :: Vector CsvDoc -> [Doc] toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList toDocs v = V.toList
$ V.zipWith (\nId (CsvDoc t s py pm pd abst auth) $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
-> Doc nId t s py pm pd abst auth ) -> Doc nId t s py pm pd abst auth )
(V.enumFromN 1 (V.length v'')) v'' (V.enumFromN 1 (V.length v'')) v''
where where
v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
seps= (V.fromList [Paragraph, Sentences, Chars]) seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
--------------------------------------------------------------- ---------------------------------------------------------------
fromDocs :: Vector Doc -> Vector CsvDoc fromDocs :: Vector Doc -> Vector CsvDoc
...@@ -69,7 +69,7 @@ fromDocs docs = V.map fromDocs' docs ...@@ -69,7 +69,7 @@ fromDocs docs = V.map fromDocs' docs
-- TODO adapt the size of the paragraph according to the corpus average -- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitBy -> CsvDoc -> Vector CsvDoc splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc = let docSize = (length $ c_abstract doc) in splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
if docSize > 1000 if docSize > 1000
then then
...@@ -82,15 +82,15 @@ splitDoc m splt doc = let docSize = (length $ c_abstract doc) in ...@@ -82,15 +82,15 @@ splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
V.fromList [doc] V.fromList [doc]
splitDoc' :: SplitBy -> CsvDoc -> Vector CsvDoc splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' splt (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
where where
firstDoc = CsvDoc t s py pm pd firstAbstract auth firstDoc = CsvDoc t s py pm pd firstAbstract auth
firstAbstract = head' abstracts firstAbstract = head' abstracts
nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts) nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
abstracts = (splitBy splt 20) abst abstracts = (splitBy $ contextSize) abst
head' x = maybe "" identity (head x) head' x = maybe "" identity (head x)
tail' x = maybe [""] identity (tailMay x) tail' x = maybe [""] identity (tailMay x)
......
...@@ -34,6 +34,7 @@ module Gargantext.Text.Terms ...@@ -34,6 +34,7 @@ module Gargantext.Text.Terms
where where
import Data.Text (Text) import Data.Text (Text)
import Data.Traversable
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core import Gargantext.Core
...@@ -46,6 +47,9 @@ data TermType = Mono | Multi ...@@ -46,6 +47,9 @@ data TermType = Mono | Multi
-- remove Stop Words -- remove Stop Words
-- map (filter (\t -> not . elem t)) $ -- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------ ------------------------------------------------------------------------
extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms])
extractTerms termType lang = mapM (terms termType lang)
------------------------------------------------------------------------
terms :: TermType -> Lang -> Text -> IO [Terms] terms :: TermType -> Lang -> Text -> IO [Terms]
terms Mono lang txt = pure $ monoterms' lang txt terms Mono lang txt = pure $ monoterms' lang txt
terms Multi lang txt = multiterms lang txt terms Multi lang txt = multiterms lang txt
......
...@@ -36,10 +36,8 @@ Implementation use Accelerate library : ...@@ -36,10 +36,8 @@ Implementation use Accelerate library :
module Gargantext.Viz.Graph.Distances.Matrice module Gargantext.Viz.Graph.Distances.Matrice
where where
--import Data.Array.Accelerate.Data.Bits
import Data.Array.Accelerate.Interpreter (run)
import Data.Array.Accelerate import Data.Array.Accelerate
import Data.Array.Accelerate.Interpreter (run)
import Data.Array.Accelerate.Smart import Data.Array.Accelerate.Smart
import Data.Array.Accelerate.Type import Data.Array.Accelerate.Type
import Data.Array.Accelerate.Array.Sugar (fromArr, Array, Z) import Data.Array.Accelerate.Array.Sugar (fromArr, Array, Z)
...@@ -94,14 +92,7 @@ type SpecificityGenericity = Double ...@@ -94,14 +92,7 @@ type SpecificityGenericity = Double
conditional :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity) conditional :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
conditional m = (run $ ie (use m), run $ sg (use m)) conditional m = (run $ ie (use m), run $ sg (use m))
where where
r :: Rank
r = rank' m
xs :: Matrix' Double -> Matrix' Double
xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)
ie :: Matrix' Double -> Matrix' Double ie :: Matrix' Double -> Matrix' Double
ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat) ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
sg :: Acc (Matrix Double) -> Acc (Matrix Double) sg :: Acc (Matrix Double) -> Acc (Matrix Double)
...@@ -109,7 +100,14 @@ conditional m = (run $ ie (use m), run $ sg (use m)) ...@@ -109,7 +100,14 @@ conditional m = (run $ ie (use m), run $ sg (use m))
n :: Exp Double n :: Exp Double
n = P.fromIntegral r n = P.fromIntegral r
r :: Rank
r = rank' m
xs :: Matrix' Double -> Matrix' Double
xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)
-- filter with threshold -- filter with threshold
----------------------------------------------------------------------- -----------------------------------------------------------------------
...@@ -121,7 +119,9 @@ distributional m = run $ filter $ ri (map fromIntegral $ use m) ...@@ -121,7 +119,9 @@ distributional m = run $ filter $ ri (map fromIntegral $ use m)
where where
n = rank' m n = rank' m
miniMax m = map (\x -> ifThenElse (x > (the $ minimum $ maximum m)) x 0) m miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
where
miniMax' = (the $ minimum $ maximum m)
filter m = zipWith (\a b -> max a b) m (transpose m) filter m = zipWith (\a b -> max a b) m (transpose m)
......
...@@ -43,28 +43,12 @@ import Gargantext.Prelude ...@@ -43,28 +43,12 @@ import Gargantext.Prelude
type Index = Int type Index = Int
------------------------------------------------------------------------------- -------------------------------------------------------------------------------
{-
map'' :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
-> Map (t, t) Int
-> Map (t, t) Double
map'' f m = back . f' . from m
where
from (fs, m') = unzip $ M.toAscList m
f' = f $ A.fromList shape m'
shape = (A.Z A.:. n A.:. n)
back = M.fromAscList . zip fs . A.toList
-}
------------------------------------------------------------------------------- -------------------------------------------------------------------------------
map' :: (Ord t) => (A.Matrix Int -> A.Matrix Double) score :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
-> Map (t, t) Int -> Map (t, t) Int
-> Map (t, t) Double -> Map (t, t) Double
map' f m = fromIndex fromI . mat2cooc . f $ cooc2mat toI m score f m = fromIndex fromI . mat2map . f $ cooc2mat toI m
where
(toI, fromI) = createIndexes m
map'' m = cooc2mat toI m
where where
(toI, fromI) = createIndexes m (toI, fromI) = createIndexes m
...@@ -81,10 +65,9 @@ map2mat def n m = A.fromFunction shape (\(Z :. x :. y) -> fromMaybe def $ M.look ...@@ -81,10 +65,9 @@ map2mat def n m = A.fromFunction shape (\(Z :. x :. y) -> fromMaybe def $ M.look
where where
shape = (Z :. n :. n) shape = (Z :. n :. n)
-- TODO rename mat2map mat2map :: (Elt a, Shape (Z :. Index)) =>
mat2cooc :: (Elt a, Shape (Z :. Index)) =>
A.Array (Z :. Index :. Index) a -> Map (Index, Index) a A.Array (Z :. Index :. Index) a -> Map (Index, Index) a
mat2cooc m = M.fromList . map f . A.toList . A.run . A.indexed $ A.use m mat2map m = M.fromList . map f . A.toList . A.run . A.indexed $ A.use m
where where
Z :. _ :. n = A.arrayShape m Z :. _ :. n = A.arrayShape m
f ((Z :. i :. j), x) = ((i, j), x) f ((Z :. i :. j), x) = ((i, j), x)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment