Commit 7e4ad917 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] cooc added for pipeline

parent 7665bcb6
......@@ -19,13 +19,14 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Core.Types.Node
, Term, Terms(..)
, TokenTag(..), POS(..), NER(..)
, Label, Stems
) where
import GHC.Generics
import Data.Aeson
import Data.Monoid
import qualified Data.Set as S
import Data.Set (Set, empty)
--import qualified Data.Set as S
import Data.Text (Text, unpack)
......@@ -36,14 +37,20 @@ import Gargantext.Prelude
------------------------------------------------------------------------
type Term = Text
type Stems = Set Text
type Label = [Text]
data Terms = Terms { _terms_label :: [Text]
, _terms_stem :: Set Text
} deriving (Show)
data Terms = Terms { _terms_label :: Label
, _terms_stem :: Stems
} deriving (Show, Ord)
-- class Inclusion where include
--instance Eq Terms where
-- (==) (Terms _ s1) (Terms _ s2) = s1 `S.isSubsetOf` s2
-- || s2 `S.isSubsetOf` s1
instance Eq Terms where
(==) (Terms _ s1) (Terms _ s2) = s1 `S.isSubsetOf` s2
|| s2 `S.isSubsetOf` s1
(==) (Terms _ s1) (Terms _ s2) = s1 == s2
------------------------------------------------------------------------
......
......@@ -11,16 +11,18 @@ Here is a longer description of this module, containing some
commentary with @some markup@.
-}
--{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE NoImplicitPrelude #-}
--{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
module Gargantext.Database.Cooc where
import Control.Monad ((>>=))
import Database.PostgreSQL.Simple
import Database.PostgreSQL.Simple.SqlQQ
import Gargantext.Prelude
import Gargantext (connectGargandb)
type CorpusId = Int
......@@ -29,10 +31,10 @@ type GroupListId = Int
coocTest :: IO [(Int, Int, Int)]
coocTest = connectGargandb "gargantext.ini"
>>= \conn -> cooc conn 421968 446602 446599
>>= \conn -> dBcooc conn 421968 446602 446599
cooc :: Connection -> CorpusId -> MainListId -> GroupListId -> IO [(Int, Int, Int)]
cooc conn corpus mainList groupList = query conn [sql|
dBcooc :: Connection -> CorpusId -> MainListId -> GroupListId -> IO [(Int, Int, Int)]
dBcooc conn corpus mainList groupList = query conn [sql|
set work_mem='1GB';
--EXPLAIN ANALYZE
......
......@@ -42,12 +42,13 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, abs, min, max, maximum, minimum, return, snd, truncate
, (+), (*), (/), (-), (.), ($), (**), (^), (<), (>), log
, Eq, (==), (>=), (<=), (<>), (/=)
, (&&), (||), not
, (&&), (||), not, any
, fst, snd, toS
, elem, die, mod, div, const, either
, curry, uncurry
, otherwise, when
, undefined
, IO()
)
-- TODO import functions optimized in Utils.Count
......
......@@ -17,24 +17,31 @@ Text gathers terms in unit of contexts.
module Gargantext.Text
where
import Data.Maybe
import qualified Data.Text as DT
--import Data.Text.IO (readFile)
import qualified Data.Set as S
import Data.Text (Text, split)
import NLP.FullStop (segment)
-----------------------------------------------------------------
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Text.Metrics.Occurrences (Occ, occurrences, cooc)
import Gargantext.Prelude hiding (filter)
-----------------------------------------------------------------
data Group = Group { _group_label :: Terms
, _group_terms :: Terms
} deriving (Show)
type Config = Lang -> Context
type Context = Text -> [Text]
data Viz = Graph | Phylo | Chart
pipeline :: Config -> Text -> Viz
pipeline = undefined
-----------------------------------------------------------------
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
......@@ -57,6 +64,10 @@ testText_en = DT.pack "Text mining, also referred to as text data mining, roughl
testText_fr :: Text
testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances » dans les textes est une spécialisation de la fouille de données et fait partie du domaine de l'intelligence artificielle. Cette technique est souvent désignée sous l'anglicisme text mining. Elle désigne un ensemble de traitements informatiques consistant à extraire des connaissances selon un critère de nouveauté ou de similarité dans des textes produits par des humains pour des humains. Dans la pratique, cela revient à mettre en algorithme un modèle simplifié des théories linguistiques dans des systèmes informatiques d'apprentissage et de statistiques. Les disciplines impliquées sont donc la linguistique calculatoire, l'ingénierie des langues, l'apprentissage artificiel, les statistiques et l'informatique."
termTests :: Text
termTests = "It is hard to detect important articles in a specific context. Information retrieval techniques based on full text search can be inaccurate to identify main topics and they are not able to provide an indication about the importance of the article. Generating a citation network is a good way to find most popular articles but this approach is not context aware. The text around a citation mark is generally a good summary of the referred article. So citation context analysis presents an opportunity to use the wisdom of crowd for detecting important articles in a context sensitive way. In this work, we analyze citation contexts to rank articles properly for a given topic. The model proposed uses citation contexts in order to create a directed and edge-labeled citation network based on the target topic. Then we apply common ranking algorithms in order to find important articles in this newly created network. We showed that this method successfully detects a good subset of most prominent articles in a given topic. The biggest contribution of this approach is that we are able to identify important articles for a given search term even though these articles do not contain this search term. This technique can be used in other linked documents including web pages, legal documents, and patents as well as scientific papers."
-- | Ngrams Test
-- >>> ngramsTest testText
-- 248
......
......@@ -15,7 +15,7 @@ Domain Specific Language to manage Frequent Item Set (FIS)
module Gargantext.Text.Metrics.FrequentItemSet
( Fis, Size
, occ, cooc
, occ_hlcm, cooc_hlcm
, all, between
, module HLCM
)
......@@ -34,12 +34,12 @@ type Size = Either Int (Int, Int)
------------------------------------------------------------------------
-- | Occurrence is Frequent Item Set of size 1
occ :: Frequency -> [[Item]] -> [Fis]
occ f is = fisWithSize (Left 1) f is
occ_hlcm :: Frequency -> [[Item]] -> [Fis]
occ_hlcm f is = fisWithSize (Left 1) f is
-- | Cooccurrence is Frequent Item Set of size 2
cooc :: Frequency -> [[Item]] -> [Fis]
cooc f is = fisWithSize (Left 2) f is
cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
cooc_hlcm f is = fisWithSize (Left 2) f is
all :: Frequency -> [[Item]] -> [Fis]
all f is = fisWith Nothing f is
......
......@@ -28,43 +28,76 @@ Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrence
module Gargantext.Text.Metrics.Occurrences
where
import Gargantext.Prelude
import Data.Map.Strict (Map
, empty
, insertWith, unionWith
, insertWith, insertWithKey, unionWith
, toList
)
import qualified Data.Map.Strict as DMS
import Control.Monad ((>>),(>>=))
import Data.String (String())
import Data.Attoparsec.Text
import Data.Text (Text)
import Data.Either.Extra(Either(..))
import qualified Data.Text as T
import Control.Applicative hiding (empty)
-----------------------------------------------------------
------------------------------------------------------------------------
import Gargantext.Prelude
import Gargantext.Core.Types
type Occ = Int
------------------------------------------------------------------------
-- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ
occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
type Occ a = Map a Int
type Cooc a = Map (a, a) Int
type FIS a = Map (Set a) Int
-- TODO add groups and filter stops
sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc xs = foldl' (unionWith (+)) empty xs
data Group = ByStem | ByOntology
type Grouped = Stems
-- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
-- >> map occurrences <$> Prelude.mapM (terms Mono EN)
-- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
--λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
--λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
--λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
--λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
----
cooc :: (Ord b, Num a) => [Map b a] -> Map (b, b) a
cooc ts = cooc' $ map cooc'' ts
cooc' :: (Ord b, Num a) => [Map (b, b) a] -> Map (b,b) a
cooc' = foldl' (\x y -> unionWith (+) x y) empty
occurrenceParser :: Text -> Parser Bool
occurrenceParser txt = manyTill anyChar (string txt) >> pure True
cooc'' :: (Ord b, Num a) => Map b a -> Map (b, b) a
cooc'' m = foldl' (\x (y,c) -> insertWith (+) y c x) empty xs
where
xs =[ ((x'',y''), c') | x' <- toList m
, y' <- toList m
, let x'' = fst x'
, let y'' = fst y'
, x'' < y''
, let c' = 1
--, let c' = snd x' + snd y'
]
-- | Compute the grouped occurrences (occ)
occurrences :: [Terms] -> Map Grouped Int
occurrences = occurrences' _terms_stem
occurrences' :: Ord b => (a -> b) -> [a] -> Occ b
occurrences' f xs = foldl' (\x y -> insertWith (+) (f y) 1 x) empty xs
-- TODO add groups and filter stops
sumOcc :: Ord a => [Occ a] -> Occ a
sumOcc xs = foldl' (unionWith (+)) empty xs
occurrencesParser :: Text -> Parser Int
occurrencesParser txt = case txt of
"" -> pure 0
_ -> many (occurrenceParser txt') >>= \matches -> pure (length matches)
where
txt' = T.toLower txt
parseOccurrences :: Text -> Text -> Either String Int
parseOccurrences x = parseOnly (occurrencesParser x)
......@@ -44,16 +44,8 @@ import Gargantext.Text.Terms.Mono (monoterms')
data TermType = Mono | Multi
------------------------------------------------------------------------
terms :: TermType -> Maybe Lang -> Text -> IO [Terms]
terms Mono (Just lang) txt = pure $ monoterms' lang txt
terms Multi (Just lang ) txt = multiterms lang txt
terms _ Nothing _ = panic "Lang needed"
terms :: TermType -> Lang -> Text -> IO [Terms]
terms Mono lang txt = pure $ monoterms' lang txt
terms Multi lang txt = multiterms lang txt
------------------------------------------------------------------------
termTests :: Text
termTests = "It is hard to detect important articles in a specific context. Information retrieval techniques based on full text search can be inaccurate to identify main topics and they are not able to provide an indication about the importance of the article. Generating a citation network is a good way to find most popular articles but this approach is not context aware. The text around a citation mark is generally a good summary of the referred article. So citation context analysis presents an opportunity to use the wisdom of crowd for detecting important articles in a context sensitive way. In this work, we analyze citation contexts to rank articles properly for a given topic. The model proposed uses citation contexts in order to create a directed and edge-labeled citation network based on the target topic. Then we apply common ranking algorithms in order to find important articles in this newly created network. We showed that this method successfully detects a good subset of most prominent articles in a given topic. The biggest contribution of this approach is that we are able to identify important articles for a given search term even though these articles do not contain this search term. This technique can be used in other linked documents including web pages, legal documents, and patents as well as scientific papers."
......@@ -17,6 +17,7 @@ Source : https://en.wikipedia.org/wiki/Stemming
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Mono.Stem (stem, Lang(..))
where
......@@ -25,6 +26,7 @@ import Data.Text (Text)
import qualified Data.Text as DT
import qualified NLP.Stemmer as N
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
-- (stem, Stemmer(..))
......
......@@ -17,7 +17,7 @@ Adapted from:
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Mono.Stem.En (stem)
module Gargantext.Text.Terms.Mono.Stem.En (stemIt)
where
import Control.Monad
......@@ -196,8 +196,8 @@ step5 = step5b . step5a
allSteps :: [Char] -> [Char]
allSteps = step5 . step4 . step3 . step2 . step1
stem :: Text -> Text
stem s = pack (stem' $ unpack s)
stemIt :: Text -> Text
stemIt s = pack (stem' $ unpack s)
stem' :: [Char] -> [Char]
stem' s | length s < 3 = s
......
......@@ -10,6 +10,7 @@ Portability : POSIX
First inspired from https://bitbucket.org/gchrupala/lingo/overview
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Terms.Mono.Token.En
......@@ -29,13 +30,18 @@ module Gargantext.Text.Terms.Mono.Token.En
)
where
import Data.Foldable (concatMap)
import qualified Data.Char as Char
import Data.Maybe
import Control.Monad
import Control.Applicative (Applicative)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Either
import Gargantext.Prelude
-- | A Tokenizer is function which takes a list and returns a list of Eithers
-- (wrapped in a newtype). Right Texts will be passed on for processing
-- to tokenizers down
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment