Commit d710b723 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'pipeline'

parents 7e930b57 b5124f1e
...@@ -2,3 +2,8 @@ ...@@ -2,3 +2,8 @@
*.swp *.swp
*.cabal *.cabal
*purescript-gargantext *purescript-gargantext
doc
bin
clustering-louvain
profiling
servant-job
...@@ -35,7 +35,6 @@ library: ...@@ -35,7 +35,6 @@ library:
dependencies: dependencies:
- QuickCheck - QuickCheck
- accelerate - accelerate
- accelerate-io
- aeson - aeson
- aeson-lens - aeson-lens
- aeson-pretty - aeson-pretty
...@@ -46,6 +45,7 @@ library: ...@@ -46,6 +45,7 @@ library:
- bytestring - bytestring
- case-insensitive - case-insensitive
- cassava - cassava
- clustering-louvain
- conduit - conduit
- conduit-extra - conduit-extra
- containers - containers
......
...@@ -46,15 +46,10 @@ data Terms = Terms { _terms_label :: Label ...@@ -46,15 +46,10 @@ data Terms = Terms { _terms_label :: Label
instance Show Terms where instance Show Terms where
show (Terms l s) = show l show (Terms l s) = show l
-- class Inclusion where include
--instance Eq Terms where
-- (==) (Terms _ s1) (Terms _ s2) = s1 `S.isSubsetOf` s2
-- || s2 `S.isSubsetOf` s1
instance Eq Terms where instance Eq Terms where
(==) (Terms _ s1) (Terms _ s2) = s1 == s2 (==) (Terms _ s1) (Terms _ s2) = s1 == s2
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Tag = POS | NER data Tag = POS | NER
deriving (Show, Eq) deriving (Show, Eq)
......
...@@ -6,7 +6,6 @@ License : AGPL + CECILL v3 ...@@ -6,7 +6,6 @@ License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
-} -}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
...@@ -17,27 +16,57 @@ module Gargantext.Pipeline ...@@ -17,27 +16,57 @@ module Gargantext.Pipeline
import Data.Text.IO (readFile) import Data.Text.IO (readFile)
import Control.Arrow ((***))
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M
import qualified Data.List as L
import Data.Tuple.Extra (both)
---------------------------------------------- ----------------------------------------------
---------------------------------------------- import Gargantext.Core (Lang(FR))
import Gargantext.Core
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (score) import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (distributional) import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional)
import Gargantext.Text.Metrics.Occurrences import Gargantext.Viz.Graph.Index (Index)
import Gargantext.Text.Terms import Gargantext.Text.Metrics.Count (cooc, removeApax)
import Gargantext.Text.Context import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
{-
____ _ _
/ ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
| | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
| |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
\____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
|___/
-}
pipeline path = do pipeline path = do
-- Text <- IO Text <- FilePath -- Text <- IO Text <- FilePath
text <- readFile path text <- readFile path
let contexts = splitBy (Sentences 3) text let contexts = splitBy (Sentences 5) text
myterms <- extractTerms Multi FR contexts myterms <- extractTerms Multi FR contexts
-- TODO filter (\t -> not . elem t stopList) myterms -- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList) -- TODO groupBy (Stem | GroupList)
let myCooc = removeApax $ cooc myterms let myCooc = removeApax $ cooc myterms
--let (ti, fi) = createIndices myCooc
pure True
--pure $ incExcSpeGen myCooc
-- Cooc -> Matrix -- Cooc -> Matrix
pure $ score distributional myCooc
-- Matrix -> Clustering -> Graph -> JSON -- -- filter by spec/gen (dynmaic programming)
-- let theScores = M.filter (>0) $ score conditional myCoocFiltered
----
------ -- Matrix -> Clustering
------ pure $ bestpartition False $ map2graph $ toIndex ti theScores
-- partitions <- cLouvain theScores
-- pure partitions
---- | Building : -> Graph -> JSON
...@@ -237,5 +237,3 @@ unMaybe = map fromJust . L.filter isJust ...@@ -237,5 +237,3 @@ unMaybe = map fromJust . L.filter isJust
-- maximumWith -- maximumWith
maximumWith f = L.maximumBy (\x y -> compare (f x) (f y)) maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
...@@ -27,7 +27,7 @@ import NLP.FullStop (segment) ...@@ -27,7 +27,7 @@ import NLP.FullStop (segment)
----------------------------------------------------------------- -----------------------------------------------------------------
import Gargantext.Core import Gargantext.Core
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Text.Metrics.Occurrences (Occ, occurrences, cooc) import Gargantext.Text.Metrics.Count (Occ, occurrences, cooc)
import Gargantext.Prelude hiding (filter) import Gargantext.Prelude hiding (filter)
----------------------------------------------------------------- -----------------------------------------------------------------
......
...@@ -8,18 +8,132 @@ Stability : experimental ...@@ -8,18 +8,132 @@ Stability : experimental
Portability : POSIX Portability : POSIX
Mainly reexport functions in @Data.Text.Metrics@ Mainly reexport functions in @Data.Text.Metrics@
TODO
noApax :: Ord a => Map a Occ -> Map a Occ
noApax m = M.filter (>1) m
-} -}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Metrics where module Gargantext.Text.Metrics
where
--import Data.Text (Text) import Data.Text (Text, pack)
import Data.Map (Map)
import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import Data.Tuple.Extra (both)
--import GHC.Real (Ratio) --import GHC.Real (Ratio)
--import qualified Data.Text.Metrics as DTM --import qualified Data.Text.Metrics as DTM
-- import Data.Array.Accelerate (toList)
--import Gargantext.Prelude
--
--noApax :: Ord a => Map a Occ -> Map a Occ import Gargantext.Prelude
--noApax m = M.filter (>1) m
import Gargantext.Text.Metrics.Count (occurrences, cooc)
import Gargantext.Text.Terms (TermType(MonoMulti), terms)
import Gargantext.Core (Lang(EN))
import Gargantext.Core.Types (Terms(..))
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Viz.Graph.Distances.Matrice
import Gargantext.Viz.Graph.Index
-- ord relevance: top n plus inclus
-- échantillonnage de généricity
--
--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m =
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
----(ti, fi) = createIndices m
-- . fromIndex fi $ filterMat $ cooc2mat ti m
import Data.Array.Accelerate (Matrix)
filterMat :: Matrix Int -> [(Index, Index)]
filterMat m = S.toList $ S.take n $ S.fromList $ (L.take nIe incExc') <> (L.take nSg speGen')
where
(incExc', speGen') = both ( map fst . L.sortOn snd . M.toList . mat2map) (conditional' m)
n = nIe + nSg
nIe = 30
nSg = 70
incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
where
(ti,fi) = createIndices m
ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
metrics_text :: Text
metrics_text = T.intercalate " " metrics_sentences
metrics_sentences' :: [Text]
metrics_sentences' = splitBy (Sentences 0) metrics_text
-- | Sentences
metrics_sentences :: [Text]
metrics_sentences = [ "There is a table with a glass of wine and a spoon."
, "I can see the glass on the table."
, "There was only a spoon on that table."
, "The glass just fall from the table, pouring wine everywhere."
, "I wish the glass did not contain wine."
]
metrics_sentences_Test = metrics_sentences == metrics_sentences'
-- | Terms reordered to visually check occurrences
-- >>>
{- [ [["table"],["glass"],["wine"],["spoon"]]
, [["glass"],["table"]]
, [["spoon"],["table"]]
, [["glass"],["table"],["wine"]]
, [["glass"],["wine"]]
]
-}
metrics_terms :: IO [[Terms]]
metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
-- | Occurrences
{-
fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
, (fromList ["object"],fromList [(["object"], 3 )])
, (fromList ["glas"] ,fromList [(["glas"] , 2 )])
, (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
-}
metrics_occ = occurrences <$> L.concat <$> metrics_terms
{-
-- fromList [((["glas"],["object"]),6)
,((["glas"],["spoon"]),4)
,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
-}
metrics_cooc = cooc <$> metrics_terms
metrics_cooc_mat = do
m <- metrics_cooc
let (ti,_) = createIndices m
let mat_cooc = cooc2mat ti m
pure ( ti
, mat_cooc
, incExcSpeGen_proba mat_cooc
, incExcSpeGen mat_cooc
)
metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
{-| {-|
Module : Gargantext.Text.Metrics.Occurrences Module : Gargantext.Text.Metrics.Count
Description : Description :
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
...@@ -25,7 +25,7 @@ Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrence ...@@ -25,7 +25,7 @@ Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrence
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Metrics.Occurrences module Gargantext.Text.Metrics.Count
where where
...@@ -71,7 +71,6 @@ type Grouped = Stems ...@@ -71,7 +71,6 @@ type Grouped = Stems
--λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"] --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)] --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
---- ----
-} -}
type Occs = Int type Occs = Int
...@@ -81,10 +80,16 @@ removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int ...@@ -81,10 +80,16 @@ removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
removeApax = DMS.filter (> 1) removeApax = DMS.filter (> 1)
cooc :: [[Terms]] -> Map (Label, Label) Int cooc :: [[Terms]] -> Map (Label, Label) Int
cooc tss = cooc tss = coocOnWithLabel _terms_stem (labelPolicy terms_occs) tss
mapKeys (delta $ labelPolicy terms_occs) $ cooc' (map (Set.fromList . map _terms_stem) tss) where
terms_occs = occurrencesOn _terms_stem (List.concat tss)
coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
-> [[a]] -> Map (label, label) Coocs
coocOnWithLabel on policy tss =
mapKeys (delta policy) $ coocOn on tss
where where
terms_occs = occurrences (List.concat tss)
delta f = f *** f delta f = f *** f
...@@ -93,26 +98,29 @@ labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList ...@@ -93,26 +98,29 @@ labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList
Just label -> label Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g) Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
cooc' :: Ord b => [Set b] -> Map (b, b) Coocs coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
cooc' tss = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as
where where
xs = [ ((x, y), 1) coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Coocs
| xs <- tss coocOn' f ts = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
, ys <- tss where
, x <- Set.toList xs ts' = List.nub $ map f ts
, y <- Set.toList ys xs = [ ((x, y), 1)
, x < y | x <- ts'
] , y <- ts'
-- , x /= y
]
-- | Compute the grouped occurrences (occ) -- | Compute the grouped occurrences (occ)
occurrences :: [Terms] -> Map Grouped (Map Terms Int) occurrences :: [Terms] -> Map Grouped (Map Terms Int)
occurrences = occurrences' _terms_stem occurrences = occurrencesOn _terms_stem
occurrences' :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int) occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
occurrences' f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
-- TODO add groups and filter stops -- TODO add groups and filter stops
sumOcc :: Ord a => [Occ a] -> Occ a sumOcc :: Ord a => [Occ a] -> Occ a
sumOcc xs = foldl' (unionWith (+)) empty xs sumOcc xs = foldl' (unionWith (+)) empty xs
......
...@@ -14,42 +14,52 @@ Domain Specific Language to manage Frequent Item Set (FIS) ...@@ -14,42 +14,52 @@ Domain Specific Language to manage Frequent Item Set (FIS)
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Metrics.FrequentItemSet module Gargantext.Text.Metrics.FrequentItemSet
( Fis, Size ( Fis, Size(..)
, occ_hlcm, cooc_hlcm , occ_hlcm, cooc_hlcm
, all, between , all, between
, fisWithSize
, fisWith
, fisWithSizePoly
, fisWithSizePoly2
, module HLCM , module HLCM
) )
where where
import Data.List (tail, filter) import Prelude (Functor(..)) -- TODO
import Data.Either import qualified Data.Map.Strict as Map
import Data.Map.Strict (Map)
import qualified Data.Set as Set
import Data.Set (Set)
import qualified Data.Vector as V
import Data.Vector (Vector)
import Data.List (filter, concat)
import Data.Maybe (catMaybes)
import HLCM import HLCM
import Gargantext.Prelude import Gargantext.Prelude
type Size = Either Int (Int, Int) data Size = Point Int | Segment Int Int
--data Size = Point | Segment
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Occurrence is Frequent Item Set of size 1 -- | Occurrence is Frequent Item Set of size 1
occ_hlcm :: Frequency -> [[Item]] -> [Fis] occ_hlcm :: Frequency -> [[Item]] -> [Fis]
occ_hlcm f is = fisWithSize (Left 1) f is occ_hlcm = fisWithSize (Point 1)
-- | Cooccurrence is Frequent Item Set of size 2 -- | Cooccurrence is Frequent Item Set of size 2
cooc_hlcm :: Frequency -> [[Item]] -> [Fis] cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
cooc_hlcm f is = fisWithSize (Left 2) f is cooc_hlcm = fisWithSize (Point 2)
all :: Frequency -> [[Item]] -> [Fis] all :: Frequency -> [[Item]] -> [Fis]
all f is = fisWith Nothing f is all = fisWith Nothing
------------------------------------------------------------------------ ------------------------------------------------------------------------
between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis] between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
between (x,y) f is = fisWithSize (Right (x,y)) f is between (x,y) = fisWithSize (Segment x y)
--maximum :: Int -> Frequency -> [[Item]] -> [Fis] --maximum :: Int -> Frequency -> [[Item]] -> [Fis]
--maximum m f is = between (0,m) f is --maximum m = between (0,m)
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -62,31 +72,48 @@ data Fis' a = Fis' { _fisCount :: Int ...@@ -62,31 +72,48 @@ data Fis' a = Fis' { _fisCount :: Int
, _fisItemSet :: [a] , _fisItemSet :: [a]
} deriving (Show) } deriving (Show)
instance Functor Fis' where
fmap f (Fis' c is) = Fis' c (fmap f is)
-- | Sugar from items to FIS -- | Sugar from items to FIS
items2fis :: [Item] -> Maybe Fis items2fis :: [Item] -> Maybe Fis
items2fis is = case head is of items2fis [] = Nothing
Nothing -> Nothing items2fis (i:is) = Just $ Fis' i is
Just h -> Just (Fis' h (tail is))
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis] fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
fisWithSize n f is = case n of fisWithSize n f is = case n of
Left n' -> fisWith (Just (\x -> length x == (n'+1) )) f is Point n' -> fisWith (Just (\x -> length x == (n'+1) )) f is
Right (a,b) -> fisWith (Just (\x -> cond1 a x && cond2 b x)) f is Segment a b -> fisWith (Just (\x -> cond a (length x) b)) f is
where where
cond1 a' x = length x >= a' cond a' x b' = a' <= x && x <= b'
cond2 b' x = length x <= b'
fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis] fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f fisWith s f is = catMaybes $ map items2fis $ filter' $ runLCMmatrix is f
where where
filter' = case s of filter' = case s of
Nothing -> identity Nothing -> identity
Just fun -> filter fun Just fun -> filter fun
-- Here the sole purpose to take the keys as a Set is tell we do not want
-- duplicates.
fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a]
fisWithSizePoly n f ks = map (fmap fromItem) . fisWithSize n f . map (map toItem)
where
ksv = V.fromList $ Set.toList ks
ksm = Map.fromList . flip zip [0..] $ V.toList ksv
toItem = (ksm Map.!)
fromItem = (ksv V.!)
fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a]
fisWithSizePoly2 n f is = fisWithSizePoly n f ks is
where
ks = Set.fromList $ concat is
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
......
...@@ -42,16 +42,23 @@ import Gargantext.Core.Types ...@@ -42,16 +42,23 @@ import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms) import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoterms') import Gargantext.Text.Terms.Mono (monoterms')
data TermType = Mono | Multi data TermType = Mono | Multi | MonoMulti
-- remove Stop Words -- remove Stop Words
-- map (filter (\t -> not . elem t)) $ -- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Sugar to extract terms from text (hiddeng mapM from end user).
extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms]) extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms])
extractTerms termType lang = mapM (terms termType lang) extractTerms termType lang = mapM (terms termType lang)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
-- Multi : multi terms
-- MonoMulti : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType -> Lang -> Text -> IO [Terms] terms :: TermType -> Lang -> Text -> IO [Terms]
terms Mono lang txt = pure $ monoterms' lang txt terms Mono lang txt = pure $ monoterms' lang txt
terms Multi lang txt = multiterms lang txt terms Multi lang txt = multiterms lang txt
terms MonoMulti lang txt = terms Multi lang txt
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -18,23 +18,27 @@ module Gargantext.Text.Terms.Multi (multiterms) ...@@ -18,23 +18,27 @@ module Gargantext.Text.Terms.Multi (multiterms)
import Data.Text hiding (map, group, filter, concat) import Data.Text hiding (map, group, filter, concat)
import Data.List (concat) import Data.List (concat)
import qualified Data.Set as S
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi.PosTagging import Gargantext.Text.Terms.Multi.PosTagging
import Gargantext.Text.Terms.Mono.Stem (stem)
import qualified Gargantext.Text.Terms.Multi.Lang.En as En import qualified Gargantext.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
multiterms :: Lang -> Text -> IO [Terms] multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = concat multiterms lang txt = concat
<$> map (map tokenTag2terms) <$> map (map (tokenTag2terms lang))
<$> map (filter (\t -> _my_token_pos t == Just NP)) <$> map (filter (\t -> _my_token_pos t == Just NP))
<$> tokenTags lang txt <$> tokenTags lang txt
tokenTag2terms :: TokenTag -> Terms tokenTag2terms :: Lang -> TokenTag -> Terms
tokenTag2terms (TokenTag w t _ _) = Terms w t tokenTag2terms lang (TokenTag w t _ _) = Terms w t'
where
t' = S.fromList $ map (stem lang) $ S.toList t
tokenTags :: Lang -> Text -> IO [[TokenTag]] tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = map (group lang) <$> tokenTags' lang s tokenTags lang s = map (group lang) <$> tokenTags' lang s
......
...@@ -28,9 +28,9 @@ group :: [TokenTag] -> [TokenTag] ...@@ -28,9 +28,9 @@ group :: [TokenTag] -> [TokenTag]
group [] = [] group [] = []
group ntags = group2 NP NP group ntags = group2 NP NP
$ group2 NP VB $ group2 NP VB
$ group2 NP IN -- $ group2 NP IN
$ group2 IN DT $ group2 IN DT
$ group2 VB NP -- $ group2 VB NP
$ group2 JJ NP $ group2 JJ NP
$ group2 JJ JJ $ group2 JJ JJ
$ group2 JJ CC $ group2 JJ CC
......
...@@ -27,8 +27,8 @@ group :: [TokenTag] -> [TokenTag] ...@@ -27,8 +27,8 @@ group :: [TokenTag] -> [TokenTag]
group [] = [] group [] = []
group ntags = group2 NP NP group ntags = group2 NP NP
$ group2 NP VB $ group2 NP VB
$ group2 NP IN -- $ group2 NP IN
$ group2 IN DT -- $ group2 IN DT
$ group2 VB NP $ group2 VB NP
$ group2 JJ NP $ group2 JJ NP
$ group2 NP JJ $ group2 NP JJ
......
...@@ -96,10 +96,10 @@ data Properties = Properties { _propertiesAnnotators :: Text ...@@ -96,10 +96,10 @@ data Properties = Properties { _propertiesAnnotators :: Text
$(deriveJSON (unPrefix "_properties") ''Properties) $(deriveJSON (unPrefix "_properties") ''Properties)
data Sentences = Sentences { _sentences :: [Sentence]} data PosSentences = PosSentences { _sentences :: [Sentence]}
deriving (Show, Generic) deriving (Show, Generic)
$(deriveJSON (unPrefix "_") ''Sentences) $(deriveJSON (unPrefix "_") ''PosSentences)
-- request = -- request =
...@@ -134,7 +134,7 @@ corenlpRaw lang txt = do ...@@ -134,7 +134,7 @@ corenlpRaw lang txt = do
pure (getResponseBody response) pure (getResponseBody response)
corenlp :: Lang -> Text -> IO Sentences corenlp :: Lang -> Text -> IO PosSentences
corenlp lang txt = do corenlp lang txt = do
response <- corenlp' lang txt response <- corenlp' lang txt
pure (getResponseBody response) pure (getResponseBody response)
......
...@@ -60,8 +60,3 @@ $(deriveJSON (unPrefix "g_") ''Graph) ...@@ -60,8 +60,3 @@ $(deriveJSON (unPrefix "g_") ''Graph)
...@@ -28,10 +28,12 @@ Implementation use Accelerate library : ...@@ -28,10 +28,12 @@ Implementation use Accelerate library :
-} -}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE TypeFamilies #-} {-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-} {-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Viz.Graph.Distances.Matrice module Gargantext.Viz.Graph.Distances.Matrice
where where
...@@ -46,7 +48,7 @@ import Data.Maybe (Maybe(Just)) ...@@ -46,7 +48,7 @@ import Data.Maybe (Maybe(Just))
import qualified Gargantext.Prelude as P import qualified Gargantext.Prelude as P
import qualified Data.Array.Accelerate.Array.Representation as Repr import qualified Data.Array.Accelerate.Array.Representation as Repr
import Gargantext.Text.Metrics.Occurrences import Gargantext.Text.Metrics.Count
----------------------------------------------------------------------- -----------------------------------------------------------------------
...@@ -67,33 +69,48 @@ myMat n = matrix n [1..] ...@@ -67,33 +69,48 @@ myMat n = matrix n [1..]
rank :: (Matrix a) -> Int rank :: (Matrix a) -> Int
rank m = arrayRank $ arrayShape m rank m = arrayRank $ arrayShape m
rank' :: (Matrix a) -> Int -----------------------------------------------------------------------
rank' m = n -- | Dimension of a square Matrix
-- How to force use with SquareMatrix ?
type Dim = Int
dim :: (Matrix a) -> Dim
dim m = n
where where
Z :. _ :. n = arrayShape m Z :. _ :. n = arrayShape m
-- == indexTail (arrayShape m)
----------------------------------------------------------------------- -----------------------------------------------------------------------
-- | Conditional Distance proba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
proba r mat = zipWith (/) mat (mkSum r mat)
type Rank = Int mkSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
mkSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum mat
proba :: Rank -> Acc (Matrix Double) -> Acc (Matrix Double) -- divByDiag
proba r mat = zipWith (/) mat (mkSum r mat) divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
where
diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
-----------------------------------------------------------------------
mkSum :: Rank -> Acc (Matrix Double) -> Acc (Matrix Double) miniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
mkSum r mat = replicate (constant (Z :. (r :: Int) :. All)) miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
$ fold (+) 0 mat where
miniMax' = (the $ minimum $ maximum m)
-- | Conditional distance (basic version)
conditional :: Matrix Int -> Matrix Double
conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)
type Matrix' a = Acc (Matrix a)
type InclusionExclusion = Double
type SpecificityGenericity = Double
conditional :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity) -- | Conditional distance (advanced version)
conditional m = (run $ ie (use m), run $ sg (use m)) conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
where where
ie :: Matrix' Double -> Matrix' Double ie :: Acc (Matrix Double) -> Acc (Matrix Double)
ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat) ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
sg :: Acc (Matrix Double) -> Acc (Matrix Double) sg :: Acc (Matrix Double) -> Acc (Matrix Double)
sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat) sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
...@@ -101,27 +118,21 @@ conditional m = (run $ ie (use m), run $ sg (use m)) ...@@ -101,27 +118,21 @@ conditional m = (run $ ie (use m), run $ sg (use m))
n :: Exp Double n :: Exp Double
n = P.fromIntegral r n = P.fromIntegral r
r :: Rank r :: Dim
r = rank' m r = dim m
xs :: Matrix' Double -> Matrix' Double xs :: Acc (Matrix Double) -> Acc (Matrix Double)
xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat) xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
ys :: Acc (Matrix Double) -> Acc (Matrix Double) ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat) ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)
-- filter with threshold
----------------------------------------------------------------------- -----------------------------------------------------------------------
-- | Distributional Distance -- | Distributional Distance
distributional :: Matrix Int -> Matrix Double distributional :: Matrix Int -> Matrix Double
distributional m = run $ filter $ ri (map fromIntegral $ use m) distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
where where
n = rank' m n = dim m
miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
where
miniMax' = (the $ minimum $ maximum m)
filter m = zipWith (\a b -> max a b) m (transpose m) filter m = zipWith (\a b -> max a b) m (transpose m)
...@@ -139,6 +150,98 @@ distributional m = run $ filter $ ri (map fromIntegral $ use m) ...@@ -139,6 +150,98 @@ distributional m = run $ filter $ ri (map fromIntegral $ use m)
cross mat = zipWith (-) (mkSum n mat) (mat) cross mat = zipWith (-) (mkSum n mat) (mat)
int2double :: Matrix Int -> Matrix Double
int2double m = run (map fromIntegral $ use m) -----------------------------------------------------------------------
-----------------------------------------------------------------------
{-
Metric Specificity and genericity: select terms
let N termes
Ni : occ de i
Nij : cooc i et j
Probability to get i given j : P(i|j)=Nij/Nj
Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
Inclusion (i) = Gen(i)+Spec(i)
Genericity score = Gen(i)- Spec(i)
References:
* Science mapping with asymmetrical paradigmatic proximity Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276, arXiv:0803.2315 [cs.OH]
-}
type InclusionExclusion = Double
type SpecificityGenericity = Double
data SquareMatrix = SymetricMatrix | NonSymetricMatrix
type SymetricMatrix = Matrix
type NonSymetricMatrix = Matrix
incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
where
run' fun mat = run $ fun $ map fromIntegral $ use mat
-- | Inclusion (i) = Gen(i)+Spec(i)
inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
inclusionExclusion mat = zipWith (+) (pV mat) (pH mat)
--
-- | Genericity score = Gen(i)- Spec(i)
specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
specificityGenericity mat = zipWith (-) (pV mat) (pH mat)
-- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
pV :: Acc (Matrix Double) -> Acc (Vector Double)
pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
-- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
pH :: Acc (Matrix Double) -> Acc (Vector Double)
pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
cardN :: Exp Double
cardN = constant (P.fromIntegral (dim m) :: Double)
-- | P(i|j) = Nij /N(jj) Probability to get i given j
p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
p_ij m = zipWith (/) m (n_jj m)
where
n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
n_jj m = backpermute (shape m)
(lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
-> (Z :. j :. j)
)
) m
-- | P(j|i) = Nij /N(ii) Probability to get i given j
-- to test
p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
p_ji = transpose . p_ij
-- | Step to ckeck the result in visual/qualitative tests
incExcSpeGen_proba :: Matrix Int -> Matrix Double
incExcSpeGen_proba m = run' pro m
where
run' fun mat = run $ fun $ map fromIntegral $ use mat
pro mat = p_ji mat
{-
-- | Hypothesis to test maybe later (or not)
-- TODO ask accelerate for instances to ease such writtings:
p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
p_ m = zipWith (/) m (n_ m)
where
n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
n_ m = backpermute (shape m)
(lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
-> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
)
) m
-}
...@@ -39,6 +39,8 @@ import qualified Data.Set as S ...@@ -39,6 +39,8 @@ import qualified Data.Set as S
import Data.Map (Map) import Data.Map (Map)
import qualified Data.Map.Strict as M import qualified Data.Map.Strict as M
import Data.Vector (Vector)
import Gargantext.Prelude import Gargantext.Prelude
type Index = Int type Index = Int
...@@ -50,7 +52,7 @@ score :: (Ord t) => (A.Matrix Int -> A.Matrix Double) ...@@ -50,7 +52,7 @@ score :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
-> Map (t, t) Double -> Map (t, t) Double
score f m = fromIndex fromI . mat2map . f $ cooc2mat toI m score f m = fromIndex fromI . mat2map . f $ cooc2mat toI m
where where
(toI, fromI) = createIndexes m (toI, fromI) = createIndices m
------------------------------------------------------------------------------- -------------------------------------------------------------------------------
------------------------------------------------------------------------------- -------------------------------------------------------------------------------
...@@ -79,21 +81,30 @@ toIndex ni ns = indexConversion ni ns ...@@ -79,21 +81,30 @@ toIndex ni ns = indexConversion ni ns
fromIndex :: Ord t => Map Index t -> Map (Index, Index) a -> Map (t,t) a fromIndex :: Ord t => Map Index t -> Map (Index, Index) a -> Map (t,t) a
fromIndex ni ns = indexConversion ni ns fromIndex ni ns = indexConversion ni ns
---------------------------------------------------------------------------------
indexConversion :: (Ord b, Ord k) => Map k b -> Map (k,k) a -> Map (b, b) a indexConversion :: (Ord b, Ord k) => Map k b -> Map (k,k) a -> Map (b, b) a
indexConversion index ms = M.fromList $ map (\((k1,k2),c) -> ( ((M.!) index k1, (M.!) index k2), c)) (M.toList ms) indexConversion index ms = M.fromList $ map (\((k1,k2),c) -> ( ((M.!) index k1, (M.!) index k2), c)) (M.toList ms)
---------------------------------------------------------------------------------
------------------------------------------------------------------------------- -------------------------------------------------------------------------------
------------------------------------------------------------------------------- -- TODO
createIndexes :: Ord t => Map (t, t) b -> (Map t Index, Map Index t) fromIndex' :: Ord t => Vector t -> Map (Index, Index) a -> Map (t,t) a
createIndexes = set2indexes . cooc2set fromIndex' vi ns = undefined
-- TODO
createIndices' :: Ord t => Map (t, t) b -> (Map t Index, Vector t)
createIndices' = undefined
createIndices :: Ord t => Map (t, t) b -> (Map t Index, Map Index t)
createIndices = set2indices . map2set
where where
cooc2set :: Ord t => Map (t, t) a -> Set t map2set :: Ord t => Map (t, t) a -> Set t
cooc2set cs' = foldl' (\s ((t1,t2),_) -> insert [t1,t2] s ) S.empty (M.toList cs') map2set cs' = foldl' (\s ((t1,t2),_) -> insert [t1,t2] s ) S.empty (M.toList cs')
where where
insert as s = foldl' (\s' t -> S.insert t s') s as insert as s = foldl' (\s' t -> S.insert t s') s as
set2indexes :: Ord t => Set t -> (Map t Index, Map Index t) set2indices :: Ord t => Set t -> (Map t Index, Map Index t)
set2indexes s = (M.fromList toIndex', M.fromList fromIndex') set2indices s = (M.fromList toIndex', M.fromList fromIndex')
where where
fromIndex' = zip [0..] xs fromIndex' = zip [0..] xs
toIndex' = zip xs [0..] toIndex' = zip xs [0..]
......
...@@ -14,7 +14,7 @@ extra-deps: ...@@ -14,7 +14,7 @@ extra-deps:
- git: https://github.com/delanoe/servant-static-th.git - git: https://github.com/delanoe/servant-static-th.git
commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434 commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434
- accelerate-1.2.0.0 - accelerate-1.2.0.0
- accelerate-io-1.2.0.0 - hashtables-1.2.3.0 # needed by accelerate-1.2.0.0
- aeson-1.2.4.0 - aeson-1.2.4.0
- aeson-lens-0.5.0.0 - aeson-lens-0.5.0.0
- duckling-0.1.3.0 - duckling-0.1.3.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment