[FEAT] cooc added for pipeline

7e4ad917 · Alexandre Delanoë · 7665bcb6 · 7e4ad917 · 7e4ad917 · 7e4ad917
Commit 7e4ad917 authored May 25, 2018 by Alexandre Delanoë
10 changed files
--- a/src/Gargantext/Core/Types.hs
+++ b/src/Gargantext/Core/Types.hs
@@ -19,13 +19,14 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
                             , module Gargantext.Core.Types.Node
                             , Term, Terms(..)
                             , TokenTag(..), POS(..), NER(..)
+                             , Label, Stems
                             ) where

 import GHC.Generics
 import Data.Aeson
 import Data.Monoid
-import qualified Data.Set as S
 import Data.Set (Set, empty)
+--import qualified Data.Set as S

 import Data.Text (Text, unpack)

@@ -36,14 +37,20 @@ import Gargantext.Prelude
 ------------------------------------------------------------------------

 type Term  = Text
+type Stems = Set Text
+type Label = [Text]

-data Terms = Terms { _terms_label :: [Text]
-                   , _terms_stem  :: Set Text
-                   } deriving (Show)
+data Terms = Terms { _terms_label :: Label
+                   , _terms_stem  :: Stems
+                   } deriving (Show, Ord)
+
+-- class Inclusion where include
+--instance Eq Terms where
+--  (==) (Terms _ s1) (Terms _ s2) = s1 `S.isSubsetOf` s2
+--                                || s2 `S.isSubsetOf` s1

 instance Eq Terms where
-  (==) (Terms _ s1) (Terms _ s2) = s1 `S.isSubsetOf` s2
-                           || s2 `S.isSubsetOf` s1
+  (==) (Terms _ s1) (Terms _ s2) = s1 == s2


 ------------------------------------------------------------------------

--- a/src/Gargantext/Database/Cooc.hs
+++ b/src/Gargantext/Database/Cooc.hs
@@ -11,16 +11,18 @@ Here is a longer description of this module, containing some
 commentary with @some markup@.
 -}

--{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE NoImplicitPrelude #-}
 --{-# LANGUAGE OverloadedStrings #-}

 {-# LANGUAGE QuasiQuotes #-}

 module Gargantext.Database.Cooc where

+import Control.Monad ((>>=))
 import Database.PostgreSQL.Simple
 import Database.PostgreSQL.Simple.SqlQQ

+import Gargantext.Prelude
 import Gargantext (connectGargandb)

 type CorpusId    = Int
@@ -29,10 +31,10 @@ type GroupListId = Int

 coocTest :: IO [(Int, Int, Int)]
 coocTest = connectGargandb "gargantext.ini"
-  >>= \conn -> cooc conn 421968 446602 446599 
+  >>= \conn -> dBcooc conn 421968 446602 446599 

-cooc :: Connection -> CorpusId -> MainListId -> GroupListId -> IO [(Int, Int, Int)]
-cooc conn corpus mainList groupList = query conn [sql|
+dBcooc :: Connection -> CorpusId -> MainListId -> GroupListId -> IO [(Int, Int, Int)]
+dBcooc conn corpus mainList groupList = query conn [sql|
  set work_mem='1GB';

  --EXPLAIN ANALYZE

--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -42,12 +42,13 @@ import Protolude ( Bool(True, False), Int, Double, Integer
                 , abs, min, max, maximum, minimum, return, snd, truncate
                 , (+), (*), (/), (-), (.), ($), (**), (^), (<), (>), log
                 , Eq, (==), (>=), (<=), (<>), (/=)
-                 , (&&), (||), not
+                 , (&&), (||), not, any
                 , fst, snd, toS
                 , elem, die, mod, div, const, either
                 , curry, uncurry
                 , otherwise, when
                 , undefined
+                 , IO()
                 )

 -- TODO import functions optimized in Utils.Count

--- a/src/Gargantext/Text.hs
+++ b/src/Gargantext/Text.hs
@@ -17,24 +17,31 @@ Text gathers terms in unit of contexts.
 module Gargantext.Text
  where

+import Data.Maybe
 import qualified Data.Text as DT
--import Data.Text.IO (readFile)
-

+import qualified Data.Set as S
 import Data.Text (Text, split)

 import NLP.FullStop (segment)
 -----------------------------------------------------------------
-
+import Gargantext.Core
 import Gargantext.Core.Types
+import Gargantext.Text.Metrics.Occurrences (Occ, occurrences, cooc)
 import Gargantext.Prelude hiding (filter)
 -----------------------------------------------------------------

-data Group = Group { _group_label  :: Terms
-                   , _group_terms :: Terms
-                   } deriving (Show)
+type Config  = Lang -> Context
+
+type Context = Text -> [Text]
+
+data Viz = Graph | Phylo | Chart

+pipeline :: Config -> Text -> Viz
+pipeline = undefined

+
+-----------------------------------------------------------------
 -------------------------------------------------------------------
 -- Contexts of text
 sentences :: Text -> [Text]
@@ -57,6 +64,10 @@ testText_en = DT.pack "Text mining, also referred to as text data mining, roughl
 testText_fr :: Text
 testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances » dans les textes est une spécialisation de la fouille de données et fait partie du domaine de l'intelligence artificielle. Cette technique est souvent désignée sous l'anglicisme text mining. Elle désigne un ensemble de traitements informatiques consistant à extraire des connaissances selon un critère de nouveauté ou de similarité dans des textes produits par des humains pour des humains. Dans la pratique, cela revient à mettre en algorithme un modèle simplifié des théories linguistiques dans des systèmes informatiques d'apprentissage et de statistiques. Les disciplines impliquées sont donc la linguistique calculatoire, l'ingénierie des langues, l'apprentissage artificiel, les statistiques et l'informatique."

+termTests :: Text
+termTests = "It is hard to detect important articles in a specific context. Information retrieval techniques based on full text search can be inaccurate to identify main topics and they are not able to provide an indication about the importance of the article. Generating a citation network is a good way to find most popular articles but this approach is not context aware. The text around a citation mark is generally a good summary of the referred article. So citation context analysis presents an opportunity to use the wisdom of crowd for detecting important articles in a context sensitive way. In this work, we analyze citation contexts to rank articles properly for a given topic. The model proposed uses citation contexts in order to create a directed and edge-labeled citation network based on the target topic. Then we apply common ranking algorithms in order to find important articles in this newly created network. We showed that this method successfully detects a good subset of most prominent articles in a given topic. The biggest contribution of this approach is that we are able to identify important articles for a given search term even though these articles do not contain this search term. This technique can be used in other linked documents including web pages, legal documents, and patents as well as scientific papers."
+
+
 -- | Ngrams Test
 -- >>> ngramsTest testText
 -- 248

--- a/src/Gargantext/Text/Metrics/FrequentItemSet.hs
+++ b/src/Gargantext/Text/Metrics/FrequentItemSet.hs
@@ -15,7 +15,7 @@ Domain Specific Language to manage Frequent Item Set (FIS)

 module Gargantext.Text.Metrics.FrequentItemSet
  ( Fis, Size
-  , occ, cooc
+  , occ_hlcm, cooc_hlcm
  , all, between
  , module HLCM
  )
@@ -34,12 +34,12 @@ type Size = Either Int (Int, Int)

 ------------------------------------------------------------------------
 -- | Occurrence is Frequent Item Set of size 1
-occ  :: Frequency -> [[Item]] -> [Fis]
-occ f is  = fisWithSize (Left 1) f is
+occ_hlcm  :: Frequency -> [[Item]] -> [Fis]
+occ_hlcm f is  = fisWithSize (Left 1) f is

 -- | Cooccurrence is Frequent Item Set of size 2
-cooc :: Frequency -> [[Item]] -> [Fis]
-cooc f is = fisWithSize (Left 2) f is
+cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
+cooc_hlcm f is = fisWithSize (Left 2) f is

 all :: Frequency -> [[Item]] -> [Fis]
 all f is  = fisWith Nothing f is

--- a/src/Gargantext/Text/Metrics/Occurrences.hs
+++ b/src/Gargantext/Text/Metrics/Occurrences.hs
@@ -28,43 +28,76 @@ Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrence
 module Gargantext.Text.Metrics.Occurrences
  where

-import Gargantext.Prelude

 import Data.Map.Strict  (Map
                        , empty
-                        , insertWith, unionWith
+                        , insertWith, insertWithKey, unionWith
+                        , toList
                        )

+import qualified Data.Map.Strict as DMS
 import Control.Monad ((>>),(>>=))
 import Data.String (String())
 import Data.Attoparsec.Text
-import Data.Text (Text)

-import Data.Either.Extra(Either(..))
-import qualified Data.Text as T
-import Control.Applicative hiding (empty)
-----------------------------------------------------------
+------------------------------------------------------------------------
+import Gargantext.Prelude
+import Gargantext.Core.Types

-type Occ = Int
+------------------------------------------------------------------------

-- | Compute the occurrences (occ)
-occ :: Ord a => [a] -> Map a Occ
-occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
+type Occ  a = Map      a  Int
+type Cooc a = Map (a,  a) Int
+type FIS  a = Map (Set a) Int

-- TODO add groups and filter stops
-sumOcc :: Ord a => [Map a Occ] -> Map a Occ
-sumOcc xs = foldl' (unionWith (+)) empty xs
+data Group = ByStem | ByOntology
+
+type Grouped = Stems
+
+
+-- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
+-- >> map occurrences <$> Prelude.mapM (terms Mono EN) 
+-- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
+--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
+--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
+--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
+--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
+--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
+--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
+--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
+--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
+---- 
+
+
+cooc :: (Ord b, Num a) => [Map b a] -> Map (b, b) a
+cooc ts = cooc' $ map cooc'' ts

+cooc' :: (Ord b, Num a) => [Map (b, b) a] -> Map (b,b) a
+cooc' = foldl' (\x y -> unionWith (+) x y) empty

-occurrenceParser :: Text -> Parser Bool
-occurrenceParser txt = manyTill anyChar (string txt) >> pure True
+cooc'' :: (Ord b, Num a) => Map b a -> Map (b, b) a
+cooc'' m = foldl' (\x (y,c) -> insertWith (+) y c x) empty xs
+  where
+      xs =[ ((x'',y''), c') | x' <- toList m
+                            , y' <- toList m
+                            , let x'' = fst x'
+                            , let y'' = fst y'
+                            , x'' < y''
+                            , let c' = 1
+                            --, let c' = snd x' + snd y'
+                            ]
+
+
+-- | Compute the grouped occurrences (occ)
+occurrences :: [Terms] -> Map Grouped Int
+occurrences = occurrences' _terms_stem
+
+occurrences' :: Ord b => (a -> b) -> [a] -> Occ b
+occurrences' f xs = foldl' (\x y -> insertWith (+) (f y) 1 x) empty xs
+
+
+-- TODO add groups and filter stops
+sumOcc :: Ord a => [Occ a] -> Occ a
+sumOcc xs = foldl' (unionWith (+)) empty xs

-occurrencesParser :: Text -> Parser Int
-occurrencesParser txt = case txt of
-                    "" -> pure 0
-                    _  -> many (occurrenceParser txt') >>= \matches -> pure (length matches)
-    where
-        txt' = T.toLower txt

-parseOccurrences :: Text -> Text -> Either String Int
-parseOccurrences x = parseOnly (occurrencesParser x)
--- a/src/Gargantext/Text/Terms.hs
+++ b/src/Gargantext/Text/Terms.hs
@@ -44,16 +44,8 @@ import Gargantext.Text.Terms.Mono  (monoterms')
 data TermType = Mono | Multi

 ------------------------------------------------------------------------
-terms :: TermType -> Maybe Lang -> Text -> IO [Terms]
-terms Mono  (Just lang)  txt = pure $ monoterms' lang txt
-terms Multi (Just lang ) txt = multiterms lang txt
-terms _      Nothing _ = panic "Lang needed"
+terms :: TermType -> Lang -> Text -> IO [Terms]
+terms Mono  lang txt = pure $ monoterms' lang txt
+terms Multi lang txt = multiterms lang txt
 ------------------------------------------------------------------------

-termTests :: Text
-termTests = "It is hard to detect important articles in a specific context. Information retrieval techniques based on full text search can be inaccurate to identify main topics and they are not able to provide an indication about the importance of the article. Generating a citation network is a good way to find most popular articles but this approach is not context aware. The text around a citation mark is generally a good summary of the referred article. So citation context analysis presents an opportunity to use the wisdom of crowd for detecting important articles in a context sensitive way. In this work, we analyze citation contexts to rank articles properly for a given topic. The model proposed uses citation contexts in order to create a directed and edge-labeled citation network based on the target topic. Then we apply common ranking algorithms in order to find important articles in this newly created network. We showed that this method successfully detects a good subset of most prominent articles in a given topic. The biggest contribution of this approach is that we are able to identify important articles for a given search term even though these articles do not contain this search term. This technique can be used in other linked documents including web pages, legal documents, and patents as well as scientific papers."
-
-
-
-
-
--- a/src/Gargantext/Text/Terms/Mono/Stem.hs
+++ b/src/Gargantext/Text/Terms/Mono/Stem.hs
@@ -17,6 +17,7 @@ Source : https://en.wikipedia.org/wiki/Stemming

 -}

+{-# LANGUAGE NoImplicitPrelude #-}

 module Gargantext.Text.Terms.Mono.Stem (stem, Lang(..))
  where
@@ -25,6 +26,7 @@ import Data.Text (Text)
 import qualified Data.Text   as DT
 import qualified NLP.Stemmer as N

+import Gargantext.Prelude
 import Gargantext.Core (Lang(..))

 -- (stem, Stemmer(..))

--- a/src/Gargantext/Text/Terms/Mono/Stem/En.hs
+++ b/src/Gargantext/Text/Terms/Mono/Stem/En.hs
@@ -17,7 +17,7 @@ Adapted from:

 {-# LANGUAGE NoImplicitPrelude #-}

-module Gargantext.Text.Terms.Mono.Stem.En (stem)
+module Gargantext.Text.Terms.Mono.Stem.En (stemIt)
  where

 import Control.Monad
@@ -196,8 +196,8 @@ step5 = step5b . step5a
 allSteps :: [Char] -> [Char]
 allSteps = step5 . step4 . step3 . step2 . step1

-stem :: Text -> Text
-stem s = pack (stem' $ unpack s)
+stemIt :: Text -> Text
+stemIt s = pack (stem' $ unpack s)

 stem' :: [Char] -> [Char]
 stem' s | length s < 3 = s

--- a/src/Gargantext/Text/Terms/Mono/Token/En.hs
+++ b/src/Gargantext/Text/Terms/Mono/Token/En.hs
@@ -10,6 +10,7 @@ Portability : POSIX
 First inspired from https://bitbucket.org/gchrupala/lingo/overview
 -}

+{-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}

 module Gargantext.Text.Terms.Mono.Token.En
@@ -29,13 +30,18 @@ module Gargantext.Text.Terms.Mono.Token.En
    )
  where

+import Data.Foldable (concatMap)
 import qualified Data.Char as Char
 import Data.Maybe
 import Control.Monad
+import Control.Applicative (Applicative)

 import Data.Text (Text)
 import qualified Data.Text as T

+import Data.Either
+import Gargantext.Prelude
+
 -- | A Tokenizer is function which takes a list and returns a list of Eithers
 --  (wrapped in a newtype). Right Texts will be passed on for processing
 --  to tokenizers down