Metrics.hs 4.87 KB
Newer Older
1
{-|
2 3 4 5 6
Module      : Gargantext.Text.Metrics
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
7 8 9
Stability   : experimental
Portability : POSIX

10
Mainly reexport functions in @Data.Text.Metrics@
11 12 13 14 15 16


TODO
noApax :: Ord a => Map a Occ -> Map a Occ
noApax m = M.filter (>1) m

17 18
-}

19
{-# LANGUAGE NoImplicitPrelude #-}
20
{-# LANGUAGE OverloadedStrings #-}
21

22 23
module Gargantext.Text.Metrics 
  where
24

25
import Data.Text (Text, pack)
26
import Data.Map (Map)
27

28 29
import qualified Data.List as L
import qualified Data.Map  as M
30
import qualified Data.Set  as S
31 32
import qualified Data.Text as T
import Data.Tuple.Extra (both)
33 34
--import GHC.Real (Ratio)
--import qualified Data.Text.Metrics as DTM
35 36
import Data.Array.Accelerate (toList)

37 38 39

import Gargantext.Prelude

40
import Gargantext.Text.Metrics.Count (occurrences, cooc)
41
import Gargantext.Text.Terms (TermType(MonoMulti), terms)
42 43 44 45
import Gargantext.Core (Lang(EN))
import Gargantext.Core.Types (Terms(..))
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))

46 47 48
import Gargantext.Viz.Graph.Distances.Matrice
import Gargantext.Viz.Graph.Index

49 50 51 52
import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.Array.Accelerate as DAA

import GHC.Real (round)
53 54 55 56 57 58 59 60

--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m = 
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
----(ti, fi)  = createIndices m
-- . fromIndex fi $ filterMat $ cooc2mat ti m


61 62
type ListSize  = Int
type BinSize = Double
63

64 65 66 67 68
takeSome :: Ord t => ListSize -> BinSize -> [Scored t] -> [Scored t]
takeSome l s scores = L.take l
                    $ takeSample n m
                    $ takeKmeans l'
                    $ L.reverse $ L.sortOn _scored_incExc scores
69
  where
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
    -- TODO : KMEAN split into 2 main clusters 
    -- (advice: use accelerate-example kmeans version 
    --  and maybe benchmark it to be sure)
    takeKmeans = L.take
    l' = 4000
    n = round ((fromIntegral l)/s)
    m = round $ (fromIntegral $ length scores) / (s)
    takeSample n m xs = L.concat $ map (L.take n)
                                 $ L.reverse $ map (L.sortOn _scored_incExc)
                                 $ splitEvery m
                                 $ L.reverse $ L.sortOn _scored_speGen xs


data Scored t = Scored { _scored_terms  :: t
                       , _scored_incExc :: InclusionExclusion
                       , _scored_speGen :: SpecificityGenericity
                     } deriving (Show)

incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t]
incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
  where
    (ti,fi) = createIndices m
    (is, ss) = incExcSpeGen $ cooc2mat ti m
    scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
94 95 96 97 98 99 100 101 102


incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
  where
    (ti,fi) = createIndices m
    ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)


103

104 105

metrics_text :: Text
106
metrics_text = T.intercalate " " metrics_sentences
107 108 109

metrics_sentences' :: [Text]
metrics_sentences' = splitBy (Sentences 0) metrics_text
110 111 112

-- | Sentences 
metrics_sentences :: [Text]
113 114
metrics_sentences = [ "There is a table with a glass of wine and a spoon."
                    , "I can see the glass on the table."
115
                    , "There was only a spoon on that table."
116
                    , "The glass just fall from the table, pouring wine everywhere."
117 118
                    , "I wish the glass did not contain wine."
                    ]
119

120
metrics_sentences_Test = metrics_sentences == metrics_sentences'
121 122

-- | Terms reordered to visually check occurrences
123 124 125 126 127 128 129 130 131
-- >>> 
{- [ [["table"],["glass"],["wine"],["spoon"]]
   , [["glass"],["table"]]
   , [["spoon"],["table"]]
   , [["glass"],["table"],["wine"]]
   , [["glass"],["wine"]]
   ]
-}

132 133
metrics_terms :: IO [[Terms]]
metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
134 135 136 137 138 139 140 141

-- | Occurrences
{-
fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
         , (fromList ["object"],fromList [(["object"], 3 )])
         , (fromList ["glas"]  ,fromList [(["glas"]  , 2 )])
         , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
-}
142
metrics_occ = occurrences <$> L.concat <$> metrics_terms
143 144 145 146 147 148 149

{- 
-- fromList [((["glas"],["object"]),6)
            ,((["glas"],["spoon"]),4)
            ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]

-}
150
metrics_cooc = cooc <$> metrics_terms
151

152 153 154
metrics_cooc_mat = do
  m <- metrics_cooc
  let (ti,_) = createIndices m
155
  let mat_cooc = cooc2mat ti m
156
  pure ( ti
157
       , mat_cooc
158 159
       , incExcSpeGen_proba  mat_cooc
       , incExcSpeGen        mat_cooc
160 161
       )

162
metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
163