Commit 0be01d72 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[Pipeline] ok until clustering.

parent 4b81f9d1
......@@ -15,35 +15,29 @@ Portability : POSIX
module Gargantext.Pipeline
where
import Data.Text (unpack)
import qualified Data.Text as DT
import Data.Text.IO (readFile)
----------------------------------------------
----------------------------------------------
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (map', createIndexes)
import Gargantext.Viz.Graph.Distances.Matrice (distributional, int2double)
import Gargantext.Viz.Graph.Index (score)
import Gargantext.Viz.Graph.Distances.Matrice (distributional)
import Gargantext.Text.Metrics.Occurrences
import Gargantext.Text.Terms
import Gargantext.Text.Context
import Data.Array.Accelerate as A
pipeline pth = do
text <- readFile pth
let contexts = splitBy Sentences 4 text
myterms <- mapM (terms Multi FR) contexts
-- todo filter stop words
pipeline path = do
-- Text <- IO Text <- FilePath
text <- readFile path
let contexts = splitBy (Sentences 3) text
myterms <- extractTerms Multi FR contexts
-- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList)
let myCooc = removeApax $ cooc myterms
--pure myCooc
-- Cooc map -> Matrix
--pure $ createIndexes myCooc
pure $ map' int2double myCooc
-- Matrix -> Graph
-- Cooc -> Matrix
pure $ score distributional myCooc
-- Matrix -> Clustering -> Graph -> JSON
......@@ -14,26 +14,34 @@ Context of text management tool
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Context where
module Gargantext.Text.Context
where
import Data.Text (Text, pack, unpack, length)
import Data.String (IsString)
import Text.HTML.TagSoup
import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
import Gargantext.Text
import Gargantext.Prelude hiding (length)
data SplitBy = Paragraph | Sentences | Chars
splitBy :: SplitBy -> Int -> Text -> [Text]
splitBy Chars n = map pack . chunkAlong n n . unpack
splitBy Sentences n = map unsentences . chunkAlong n n . sentences
splitBy Paragraph _ = map removeTag . filter isTagText . parseTags
data SplitContext = Chars Int | Sentences Int | Paragraphs Int
tag = parseTags
-- | splitBy contexts of Chars or Sentences or Paragraphs
-- >> splitBy (Chars 0) "abcde"
-- ["a","b","c","d","e"]
-- >> splitBy (Chars 1) "abcde"
-- ["ab","bc","cd","de"]
-- >> splitBy (Chars 2) "abcde"
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy (Chars n) = map pack . chunkAlong (n+1) 1 . unpack
splitBy (Sentences n) = map unsentences . chunkAlong (n+1) 1 . sentences
splitBy (Paragraphs _) = map unTag . filter isTagText . tag
where
removeTag :: IsString p => Tag p -> p
removeTag (TagText x) = x
removeTag (TagComment x) = x
removeTag _ = ""
unTag :: IsString p => Tag p -> p
unTag (TagText x) = x
unTag _ = ""
......@@ -50,13 +50,13 @@ data Doc = Doc
deriving (Show)
---------------------------------------------------------------
toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList
toDocs v = V.toList
$ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
-> Doc nId t s py pm pd abst auth )
(V.enumFromN 1 (V.length v'')) v''
where
v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
seps= (V.fromList [Paragraph, Sentences, Chars])
seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
---------------------------------------------------------------
fromDocs :: Vector Doc -> Vector CsvDoc
......@@ -69,7 +69,7 @@ fromDocs docs = V.map fromDocs' docs
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitBy -> CsvDoc -> Vector CsvDoc
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
if docSize > 1000
then
......@@ -82,15 +82,15 @@ splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
V.fromList [doc]
splitDoc' :: SplitBy -> CsvDoc -> Vector CsvDoc
splitDoc' splt (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
where
firstDoc = CsvDoc t s py pm pd firstAbstract auth
firstAbstract = head' abstracts
nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
abstracts = (splitBy splt 20) abst
abstracts = (splitBy $ contextSize) abst
head' x = maybe "" identity (head x)
tail' x = maybe [""] identity (tailMay x)
......
......@@ -34,6 +34,7 @@ module Gargantext.Text.Terms
where
import Data.Text (Text)
import Data.Traversable
import Gargantext.Prelude
import Gargantext.Core
......@@ -46,6 +47,9 @@ data TermType = Mono | Multi
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------
extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms])
extractTerms termType lang = mapM (terms termType lang)
------------------------------------------------------------------------
terms :: TermType -> Lang -> Text -> IO [Terms]
terms Mono lang txt = pure $ monoterms' lang txt
terms Multi lang txt = multiterms lang txt
......
......@@ -36,10 +36,8 @@ Implementation use Accelerate library :
module Gargantext.Viz.Graph.Distances.Matrice
where
--import Data.Array.Accelerate.Data.Bits
import Data.Array.Accelerate.Interpreter (run)
import Data.Array.Accelerate
import Data.Array.Accelerate.Interpreter (run)
import Data.Array.Accelerate.Smart
import Data.Array.Accelerate.Type
import Data.Array.Accelerate.Array.Sugar (fromArr, Array, Z)
......@@ -94,14 +92,7 @@ type SpecificityGenericity = Double
conditional :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
conditional m = (run $ ie (use m), run $ sg (use m))
where
r :: Rank
r = rank' m
xs :: Matrix' Double -> Matrix' Double
xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)
ie :: Matrix' Double -> Matrix' Double
ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
sg :: Acc (Matrix Double) -> Acc (Matrix Double)
......@@ -109,7 +100,14 @@ conditional m = (run $ ie (use m), run $ sg (use m))
n :: Exp Double
n = P.fromIntegral r
r :: Rank
r = rank' m
xs :: Matrix' Double -> Matrix' Double
xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)
-- filter with threshold
-----------------------------------------------------------------------
......@@ -121,7 +119,9 @@ distributional m = run $ filter $ ri (map fromIntegral $ use m)
where
n = rank' m
miniMax m = map (\x -> ifThenElse (x > (the $ minimum $ maximum m)) x 0) m
miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
where
miniMax' = (the $ minimum $ maximum m)
filter m = zipWith (\a b -> max a b) m (transpose m)
......
......@@ -43,28 +43,12 @@ import Gargantext.Prelude
type Index = Int
-------------------------------------------------------------------------------
{-
map'' :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
-> Map (t, t) Int
-> Map (t, t) Double
map'' f m = back . f' . from m
where
from (fs, m') = unzip $ M.toAscList m
f' = f $ A.fromList shape m'
shape = (A.Z A.:. n A.:. n)
back = M.fromAscList . zip fs . A.toList
-}
-------------------------------------------------------------------------------
map' :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
score :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
-> Map (t, t) Int
-> Map (t, t) Double
map' f m = fromIndex fromI . mat2cooc . f $ cooc2mat toI m
where
(toI, fromI) = createIndexes m
map'' m = cooc2mat toI m
score f m = fromIndex fromI . mat2map . f $ cooc2mat toI m
where
(toI, fromI) = createIndexes m
......@@ -81,10 +65,9 @@ map2mat def n m = A.fromFunction shape (\(Z :. x :. y) -> fromMaybe def $ M.look
where
shape = (Z :. n :. n)
-- TODO rename mat2map
mat2cooc :: (Elt a, Shape (Z :. Index)) =>
mat2map :: (Elt a, Shape (Z :. Index)) =>
A.Array (Z :. Index :. Index) a -> Map (Index, Index) a
mat2cooc m = M.fromList . map f . A.toList . A.run . A.indexed $ A.use m
mat2map m = M.fromList . map f . A.toList . A.run . A.indexed $ A.use m
where
Z :. _ :. n = A.arrayShape m
f ((Z :. i :. j), x) = ((i, j), x)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment