Commit 7e930b57 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'pipeline'

parents 6b1bba16 0be01d72
......@@ -48,6 +48,10 @@ sudo apt-get install libbz2-dev lipq-dev
git clone https://github.com/np/servant-job.git
### Get the clustering louvain library
git clone https://gitlab.iscpif.fr/gargantext/clustering-louvain.git
## Building and installing
stack install
......
{-|
Module : CleanCsvCorpus.hs
Description : Gargantext starter
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Given a Gargantext CSV File and its Query This script cleans and
compress the contexts around the main terms of the query.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module CleanCsvCorpus where
--import GHC.IO (FilePath)
import Data.SearchEngine as S
import qualified Data.Set as S
import Data.Text (pack)
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Text.Search
import Gargantext.Text.Parsers.CSV
------------------------------------------------------------------------
type Query = [S.Term]
filterDocs :: [DocId] -> Vector Doc -> Vector Doc
filterDocs docIds = V.filter (\doc -> S.member (d_docId doc) $ S.fromList docIds )
main :: IO ()
main = do
let rPath = "/tmp/Gargantext_Corpus.csv"
let wPath = "/tmp/Gargantext_Corpus_bis.csv"
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
let q = ["gratuit", "gratuité", "culture", "culturel"]
(h,csvDocs) <- readCsv rPath
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( docsSize csvDocs)
let docs = toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine (map pack q)
let docs' = fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs')
putStrLn $ "Mean size of docs:" <> show (docsSize docs')
writeCsv wPath (h, docs')
......@@ -6,27 +6,35 @@ License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
Script to start gargantext with different modes (Dev, Prod, Mock).
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module Main where
import Prelude (putStrLn)
import Options.Generic
import Data.Text (unpack)
import Gargantext.Prelude
import Gargantext.API (startGargantext, startGargantextMock)
--------------------------------------------------------
-- Graph Tests
--import qualified Gargantext.Graph.Utils as U
--import qualified Gargantext.Graph.Distances.Conditional as C
--import qualified Gargantext.Graph.Distances.Distributional as D
--import qualified Gargantext.Graph.Distances.Matrice as M
--------------------------------------------------------
data Mode = Dev | Mock | Prod
......@@ -45,12 +53,13 @@ data MyOptions w = MyOptions { run :: w ::: Mode <?> "Possible modes: De
instance ParseRecord (MyOptions Wrapped)
deriving instance Show (MyOptions Unwrapped)
main :: IO ()
main = do
MyOptions myMode myPort myIniFile <- unwrapRecord
"Gargantext: collaborative platform for text-mining"
let myPort' = case myPort of
Just p -> p
Nothing -> 8008
......@@ -64,6 +73,11 @@ main = do
Just i -> i
Mock -> startGargantextMock myPort'
_ -> startGargantextMock myPort'
putStrLn $ "Starting Gargantext with mode: " <> show myMode
start
-- main' :: IO ()
--main' = putStrLn $ show $ M.conditional $ M.myMat 10
......@@ -18,26 +18,24 @@ library:
ghc-options:
- -Wincomplete-uni-patterns
- -Wincomplete-record-updates
- -Werror
- -Wmissing-signatures
- -Wunused-binds
- -Wunused-imports
# - -Werror
exposed-modules:
- Gargantext
- Gargantext.Prelude
- Gargantext.Core
- Gargantext.Core.Utils
- Gargantext.Core.Types
- Gargantext.Core.Types.Node
- Gargantext.Text
- Gargantext.Text.Metrics
- Gargantext.Text.Metrics.Occurrences
- Gargantext.Text.Metrics.FrequentItemSet
- Gargantext.Text.Ngrams.PosTagging.CoreNLP
- Gargantext.Text.Ngrams.PosTagging.Parser
- Gargantext.Text.Ngrams.Token.Text
- Gargantext.Text.Parsers.Date
- Gargantext.Database
- Gargantext.Text.Search
- Gargantext.Text.Parsers.CSV
- Gargantext.API
- Gargantext.Viz.Graph.Distances.Matrice
dependencies:
- QuickCheck
- accelerate
- accelerate-io
- aeson
- aeson-lens
- aeson-pretty
......@@ -47,6 +45,7 @@ library:
- base16-bytestring
- bytestring
- case-insensitive
- cassava
- conduit
- conduit-extra
- containers
......@@ -56,6 +55,7 @@ library:
- duckling
- exceptions
- filepath
- fullstop
- fclabels
- fast-logger
- full-text-search
......@@ -70,6 +70,7 @@ library:
- jose-jwt
- lens
- logging-effect
- matrix
- monad-logger
- mtl
- natural-transformation
......@@ -122,14 +123,23 @@ library:
executable:
main: Main.hs
source-dirs: app
ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2
source-dirs: app
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
- base
- containers
- gargantext
- vector
- cassava
- ini
- optparse-generic
- unordered-containers
- full-text-search
tests:
garg-test:
......@@ -157,6 +167,7 @@ tests:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -Wmissing-signatures
dependencies:
- doctest
- Glob
......
......@@ -11,7 +11,7 @@ Portability : POSIX
-}
module Gargantext (
module Gargantext.Database,
module Gargantext.Database
-- module Gargantext.Ngrams,
-- module Gargantext.Utils,
) where
......@@ -19,3 +19,9 @@ module Gargantext (
import Gargantext.Database
-- import Gargantext.Ngrams
-- import Gargantext.Utils
......@@ -6,18 +6,17 @@ License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
Loads all static file for the front-end.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TemplateHaskell #-}
---------------------------------------------------------------------
module Gargantext.API.FrontEnd
where
module Gargantext.API.FrontEnd where
import Servant.Static.TH (createApiAndServerDecs)
......
......@@ -23,8 +23,6 @@ module Gargantext.API.Node
where
-------------------------------------------------------------------
import System.IO (putStrLn)
import Control.Monad.IO.Class (liftIO)
import Control.Monad ((>>))
--import System.IO (putStrLn, readFile)
......@@ -98,13 +96,13 @@ type FacetDocAPI = "table"
-- | Node API functions
roots :: Connection -> Server Roots
roots conn = liftIO (putStrLn "Log Needed" >> getNodesWithParentId conn 0 Nothing)
roots conn = liftIO (putStrLn ( "Log Needed" :: Text) >> getNodesWithParentId conn 0 Nothing)
:<|> pure (panic "not implemented yet")
:<|> pure (panic "not implemented yet")
:<|> pure (panic "not implemented yet")
nodeAPI :: Connection -> NodeId -> Server NodeAPI
nodeAPI conn id = liftIO (putStrLn "getNode" >> getNode conn id )
nodeAPI conn id = liftIO (putStrLn ("getNode" :: Text) >> getNode conn id )
:<|> deleteNode' conn id
:<|> getNodesWith' conn id
:<|> getFacet conn id
......
......@@ -11,11 +11,113 @@ Here is a longer description of this module, containing some
commentary with @some markup@.
-}
------------------------------------------------------------------------
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE DeriveGeneric #-}
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Core.Types.Node
) where
, module Gargantext.Core.Types.Node
, Term, Terms(..)
, TokenTag(..), POS(..), NER(..)
, Label, Stems
) where
import GHC.Generics
import Data.Aeson
import Data.Monoid
import Data.Set (Set, empty)
--import qualified Data.Set as S
import Data.Text (Text, unpack)
import Gargantext.Core.Types.Main
import Gargantext.Core.Types.Node
import Gargantext.Prelude
------------------------------------------------------------------------
type Term = Text
type Stems = Set Text
type Label = [Text]
data Terms = Terms { _terms_label :: Label
, _terms_stem :: Stems
} deriving (Ord)
instance Show Terms where
show (Terms l s) = show l
-- class Inclusion where include
--instance Eq Terms where
-- (==) (Terms _ s1) (Terms _ s2) = s1 `S.isSubsetOf` s2
-- || s2 `S.isSubsetOf` s1
instance Eq Terms where
(==) (Terms _ s1) (Terms _ s2) = s1 == s2
------------------------------------------------------------------------
data Tag = POS | NER
deriving (Show, Eq)
------------------------------------------------------------------------
data POS = NP
| JJ | VB
| CC | IN | DT
| NoPos
deriving (Show, Generic, Eq)
------------------------------------------------------------------------
instance FromJSON POS where
parseJSON = withText "String" (\x -> pure (pos $ unpack x))
where
pos :: [Char] -> POS
pos "NP" = NP
pos "NN" = NP
pos "NC" = NP
pos "NNS" = NP
pos "NNP" = NP
pos "JJ" = JJ
pos "ADJ" = JJ
pos "VB" = VB
pos "VBN" = VB
pos "VBG" = VB
pos "CC" = CC
pos "IN" = IN
pos "DT" = DT
-- French specific
pos "P" = IN
pos _ = NoPos
instance ToJSON POS
------------------------------------------------------------------------
data NER = PERSON | ORGANIZATION | LOCATION | NoNER
deriving (Show, Generic)
------------------------------------------------------------------------
instance FromJSON NER where
parseJSON = withText "String" (\x -> pure (ner $ unpack x))
where
ner :: [Char] -> NER
ner "PERSON" = PERSON
ner "ORGANIZATION" = ORGANIZATION
ner "LOCATION" = LOCATION
ner _ = NoNER
instance ToJSON NER
data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_stem :: Set Text
, _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER
} deriving (Show)
instance Monoid TokenTag where
mempty = TokenTag [] empty Nothing Nothing
mappend (TokenTag w1 s1 p1 n1) (TokenTag w2 s2 p2 _)
= TokenTag (w1 <> w2) (s1 <> s2) p3 n1
where
p3 = case (p1,p2) of
(Just JJ, Just NP) -> Just NP
(Just VB, Just NP) -> Just NP
_ -> p1
mconcat = foldl mappend mempty
......@@ -34,7 +34,7 @@ import Gargantext.Prelude
------------------------------------------------------------------------
-- All the Database is structred like a hierarchical Tree
data Tree a = NodeT a [Tree a]
deriving (Show, Read, Eq)
deriving (Show, Read, Eq)
-- data Tree a = NodeT a [Tree a]
-- same as Data.Tree
......
......@@ -40,9 +40,9 @@ import Gargantext.Core.Utils.Prefix (unPrefix)
-- Duration : time Segment of the whole phylomemy in UTCTime format (start,end)
-- Ngrams : list of all (possible) terms contained in the phylomemy (with their id)
-- Steps : list of all steps to build the phylomemy
data Phylo = Phylo { _phyloDuration :: (Start, End)
, _phyloNgrams :: [Ngram]
, _phyloPeriods :: [PhyloPeriod]
data Phylo = Phylo { _phylo_Duration :: (Start, End)
, _phylo_Ngrams :: [Ngram]
, _phylo_Periods :: [PhyloPeriod]
} deriving (Generic)
-- | UTCTime in seconds since UNIX epoch
......@@ -55,20 +55,20 @@ type NgramId = Int
-- | PhyloStep : steps of phylomemy on temporal axis
-- Period: tuple (start date, end date) of the step of the phylomemy
-- Levels: levels of granularity
data PhyloPeriod = PhyloPeriod { _phyloPeriodId :: PhyloPeriodId
, _phyloPeriodLevels :: [PhyloLevel]
data PhyloPeriod = PhyloPeriod { _phylo_PeriodId :: PhyloPeriodId
, _phylo_PeriodLevels :: [PhyloLevel]
} deriving (Generic)
type PhyloPeriodId = (Start, End)
-- | PhyloLevel : levels of phylomemy on level axis
-- Levels description:
-- Level -1: Ngram equals itself (by identity) == _phyloNgrams
-- Level -1: Ngram equals itself (by identity) == _phylo_Ngrams
-- Level 0: Group of synonyms (by stems + by qualitative expert meaning)
-- Level 1: First level of clustering
-- Level N: Nth level of clustering
data PhyloLevel = PhyloLevel { _phyloLevelId :: PhyloLevelId
, _phyloLevelGroups :: [PhyloGroup]
data PhyloLevel = PhyloLevel { _phylo_LevelId :: PhyloLevelId
, _phylo_LevelGroups :: [PhyloGroup]
} deriving (Generic)
type PhyloLevelId = (PhyloPeriodId, Int)
......@@ -78,15 +78,15 @@ type PhyloLevelId = (PhyloPeriodId, Int)
-- Ngrams: set of terms that build the group
-- Period Parents|Childs: weighted link to Parents|Childs (Temporal Period axis)
-- Level Parents|Childs: weighted link to Parents|Childs (Level Granularity axis)
data PhyloGroup = PhyloGroup { _phyloGroupId :: PhyloGroupId
, _phyloGroupLabel :: Maybe Text
, _phyloGroupNgrams :: [NgramId]
data PhyloGroup = PhyloGroup { _phylo_GroupId :: PhyloGroupId
, _phylo_GroupLabel :: Maybe Text
, _phylo_GroupNgrams :: [NgramId]
, _phyloGroupPeriodParents :: [Edge]
, _phyloGroupPeriodChilds :: [Edge]
, _phylo_GroupPeriodParents :: [Edge]
, _phylo_GroupPeriodChilds :: [Edge]
, _phyloGroupLevelParents :: [Edge]
, _phyloGroupLevelChilds :: [Edge]
, _phylo_GroupLevelParents :: [Edge]
, _phylo_GroupLevelChilds :: [Edge]
} deriving (Generic)
type PhyloGroupId = (PhyloLevelId, Int)
......@@ -94,7 +94,7 @@ type Edge = (PhyloGroupId, Weight)
type Weight = Double
-- | JSON instances
$(deriveJSON (unPrefix "_phylo" ) ''Phylo )
$(deriveJSON (unPrefix "_phyloPeriod" ) ''PhyloPeriod )
$(deriveJSON (unPrefix "_phyloLevel" ) ''PhyloLevel )
$(deriveJSON (unPrefix "_phyloGroup" ) ''PhyloGroup )
$(deriveJSON (unPrefix "_phylo_" ) ''Phylo )
$(deriveJSON (unPrefix "_phylo_Period" ) ''PhyloPeriod )
$(deriveJSON (unPrefix "_phylo_Level" ) ''PhyloLevel )
$(deriveJSON (unPrefix "_phylo_Group" ) ''PhyloGroup )
......@@ -19,7 +19,7 @@ module Gargantext.Database (
, module Gargantext.Database.User
, module Gargantext.Database.Node
, module Gargantext.Database.NodeNode
, module Gargantext.Database.Ngram
-- , module Gargantext.Database.Ngram
, module Gargantext.Database.NodeNgram
, module Gargantext.Database.NodeNodeNgram
, module Gargantext.Database.NodeNgramNgram
......@@ -34,7 +34,7 @@ import Gargantext.Database.Utils
import Gargantext.Database.User
import Gargantext.Database.Node
import Gargantext.Database.NodeNode
import Gargantext.Database.Ngram
--import Gargantext.Database.Ngram
import Gargantext.Database.NodeNgram
import Gargantext.Database.NodeNodeNgram
import Gargantext.Database.NodeNgramNgram
......
......@@ -11,16 +11,18 @@ Here is a longer description of this module, containing some
commentary with @some markup@.
-}
--{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE NoImplicitPrelude #-}
--{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
module Gargantext.Database.Cooc where
import Control.Monad ((>>=))
import Database.PostgreSQL.Simple
import Database.PostgreSQL.Simple.SqlQQ
import Gargantext.Prelude
import Gargantext (connectGargandb)
type CorpusId = Int
......@@ -29,10 +31,10 @@ type GroupListId = Int
coocTest :: IO [(Int, Int, Int)]
coocTest = connectGargandb "gargantext.ini"
>>= \conn -> cooc conn 421968 446602 446599
>>= \conn -> dBcooc conn 421968 446602 446599
cooc :: Connection -> CorpusId -> MainListId -> GroupListId -> IO [(Int, Int, Int)]
cooc conn corpus mainList groupList = query conn [sql|
dBcooc :: Connection -> CorpusId -> MainListId -> GroupListId -> IO [(Int, Int, Int)]
dBcooc conn corpus mainList groupList = query conn [sql|
set work_mem='1GB';
--EXPLAIN ANALYZE
......
......@@ -81,5 +81,5 @@ findWith f t = find (\x -> f x == t)
--userWithId t xs = userWith userUserId t xs
-- | not optimized (get all ngrams without filters)
getNgrams :: PGS.Connection -> IO [Ngram]
getNgrams conn = runQuery conn queryNgramTable
dbGetNgrams :: PGS.Connection -> IO [Ngram]
dbGetNgrams conn = runQuery conn queryNgramTable
......@@ -45,7 +45,7 @@ import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
import Data.Typeable (Typeable)
import qualified Data.ByteString.Internal as DBI
import Database.PostgreSQL.Simple (Connection)
import Opaleye
import Opaleye hiding (FromField)
-- | Types for Node Database Management
data PGTSVector
......
......@@ -33,7 +33,6 @@ import Database.PostgreSQL.Simple (Connection, connect)
import Opaleye (Query, Unpackspec, showSqlForPostgres)
import Data.Profunctor.Product.Default (Default)
import Data.Maybe (maybe)
import Prelude (id, putStrLn)
-- TODO add a reader Monad here
-- read this in the init file
......@@ -61,5 +60,5 @@ connectGargandb fp = do
connect parameters
printSql :: Default Unpackspec a a => Query a -> IO ()
printSql = putStrLn . maybe "Empty query" id . showSqlForPostgres
printSql = putStrLn . maybe "Empty query" identity . showSqlForPostgres
{-|
Module : Gargantext.Pipeline
Description : Server API
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Pipeline
where
import Data.Text.IO (readFile)
----------------------------------------------
----------------------------------------------
import Gargantext.Core
import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (score)
import Gargantext.Viz.Graph.Distances.Matrice (distributional)
import Gargantext.Text.Metrics.Occurrences
import Gargantext.Text.Terms
import Gargantext.Text.Context
pipeline path = do
-- Text <- IO Text <- FilePath
text <- readFile path
let contexts = splitBy (Sentences 3) text
myterms <- extractTerms Multi FR contexts
-- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList)
let myCooc = removeApax $ cooc myterms
-- Cooc -> Matrix
pure $ score distributional myCooc
-- Matrix -> Clustering -> Graph -> JSON
......@@ -30,21 +30,26 @@ module Gargantext.Prelude
import Data.Maybe (isJust, fromJust, maybe)
import Protolude ( Bool(True, False), Int, Double, Integer
, Fractional, Num, Maybe(Just,Nothing)
, Enum, Bounded, Float
, Floating, Char, IO
, pure, (<*>), (<$>), panic
, putStrLn
, head, flip
, Ord, Integral, Foldable, RealFrac, Monad, filter
, reverse, map, zip, drop, take, zipWith
, reverse, map, mapM, zip, drop, take, zipWith
, sum, fromIntegral, length, fmap, foldl, foldl'
, takeWhile, sqrt, undefined, identity
, abs, min, max, maximum, minimum, return, snd, truncate
, (+), (*), (/), (-), (.), ($), (**), (^), (<), (>), log
, (+), (*), (/), (-), (.), ($), (&), (**), (^), (<), (>), log
, Eq, (==), (>=), (<=), (<>), (/=)
, (&&), (||), not
, (&&), (||), not, any
, fst, snd, toS
, elem, die, mod, div, const, either
, curry, uncurry
, curry, uncurry, repeat
, otherwise, when
, undefined
, IO()
, compare
)
-- TODO import functions optimized in Utils.Count
......@@ -75,18 +80,6 @@ pr = reverse
map2 :: (t -> b) -> [[t]] -> [[b]]
map2 fun = map (map fun)
pz :: [a] -> [b] -> [(a, b)]
pz = zip
pd :: Int -> [a] -> [a]
pd = drop
ptk :: Int -> [a] -> [a]
ptk = take
pzw :: (a -> b -> c) -> [a] -> [b] -> [c]
pzw = zipWith
-- Exponential Average
eavg :: [Double] -> Double
eavg (x:xs) = a*x + (1-a)*(eavg xs)
......@@ -114,6 +107,12 @@ movingAverage steps xs = map mean $ chunkAlong steps 1 xs
ma :: [Double] -> [Double]
ma = movingAverage 3
-- | splitEvery n == chunkAlong n n
splitEvery :: Int -> [a] -> [[a]]
splitEvery _ [] = L.cycle [[]]
splitEvery n xs =
let (h,t) = L.splitAt n xs
in h : splitEvery n t
-- | Function to split a range into chunks
chunkAlong :: Int -> Int -> [a] -> [[a]]
......@@ -235,3 +234,8 @@ zipSnd f xs = zip xs (f xs)
unMaybe :: [Maybe a] -> [a]
unMaybe = map fromJust . L.filter isJust
-- maximumWith
maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
......@@ -7,10 +7,7 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Ngrams exctration.
Definitions of ngrams.
n non negative integer
Text gathers terms in unit of contexts.
-}
......@@ -20,80 +17,44 @@ n non negative integer
module Gargantext.Text
where
import Data.Maybe
import qualified Data.Text as DT
--import Data.Text.IO (readFile)
import Data.Map.Strict (Map
, lookupIndex
--, fromList, keys
)
import qualified Data.Set as S
import Data.Text (Text, split)
import qualified Data.Map.Strict as M (filter)
import NLP.FullStop (segment)
-----------------------------------------------------------------
import Gargantext.Text.Ngrams
import Gargantext.Text.Metrics.Occurrences
import qualified Gargantext.Text.Metrics.FrequentItemSet as FIS
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Text.Metrics.Occurrences (Occ, occurrences, cooc)
import Gargantext.Prelude hiding (filter)
-----------------------------------------------------------------
data Group = Group { _group_label :: Ngrams
, _group_ngrams :: [Ngrams]
} deriving (Show)
type Config = Lang -> Context
clean :: Text -> Text
clean txt = DT.map clean' txt
where
clean' '’' = '\''
clean' c = c
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
-- | /!\ indexes are not the same:
type Context = Text -> [Text]
-- | Index ngrams from Map
--indexNgram :: Ord a => Map a Occ -> Map Index a
--indexNgram m = fromList (zip [1..] (keys m))
data Viz = Graph | Phylo | Chart
-- | Index ngrams from Map
--ngramIndex :: Ord a => Map a Occ -> Map a Index
--ngramIndex m = fromList (zip (keys m) [1..])
pipeline :: Config -> Text -> Viz
pipeline = undefined
indexWith :: Ord a => Map a Occ -> [a] -> [Int]
indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
indexIt xs = (m, is)
where
m = sumOcc (map occ xs)
is = map (indexWith m) xs
list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
list2fis n xs = (m', fs)
where
(m, is) = indexIt xs
m' = M.filter (>50000) m
fs = FIS.all n is
text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fis n xs = list2fis n (map ngrams xs)
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined
-----------------------------------------------------------------
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
sentences txt = split isStop txt
sentences txt = map DT.pack $ segment $ DT.unpack txt
sentences' :: Text -> [Text]
sentences' txt = split isStop txt
isStop :: Char -> Bool
isStop c = c `elem` ['.','?','!']
unsentences :: [Text] -> Text
unsentences txts = DT.intercalate " " txts
-- | https://en.wikipedia.org/wiki/Text_mining
testText_en :: Text
......@@ -103,19 +64,23 @@ testText_en = DT.pack "Text mining, also referred to as text data mining, roughl
testText_fr :: Text
testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances » dans les textes est une spécialisation de la fouille de données et fait partie du domaine de l'intelligence artificielle. Cette technique est souvent désignée sous l'anglicisme text mining. Elle désigne un ensemble de traitements informatiques consistant à extraire des connaissances selon un critère de nouveauté ou de similarité dans des textes produits par des humains pour des humains. Dans la pratique, cela revient à mettre en algorithme un modèle simplifié des théories linguistiques dans des systèmes informatiques d'apprentissage et de statistiques. Les disciplines impliquées sont donc la linguistique calculatoire, l'ingénierie des langues, l'apprentissage artificiel, les statistiques et l'informatique."
termTests :: Text
termTests = "It is hard to detect important articles in a specific context. Information retrieval techniques based on full text search can be inaccurate to identify main topics and they are not able to provide an indication about the importance of the article. Generating a citation network is a good way to find most popular articles but this approach is not context aware. The text around a citation mark is generally a good summary of the referred article. So citation context analysis presents an opportunity to use the wisdom of crowd for detecting important articles in a context sensitive way. In this work, we analyze citation contexts to rank articles properly for a given topic. The model proposed uses citation contexts in order to create a directed and edge-labeled citation network based on the target topic. Then we apply common ranking algorithms in order to find important articles in this newly created network. We showed that this method successfully detects a good subset of most prominent articles in a given topic. The biggest contribution of this approach is that we are able to identify important articles for a given search term even though these articles do not contain this search term. This technique can be used in other linked documents including web pages, legal documents, and patents as well as scientific papers."
-- | Ngrams Test
-- >>> ngramsTest testText
-- 248
ngramsTest :: Text -> Int
ngramsTest x= length ws
where
--txt = concat <$> lines <$> clean <$> readFile filePath
txt = clean x
-- | Number of sentences
--ls = sentences $ txt
-- | Number of monograms used in the full text
ws = ngrams $ txt
-- | stem ngrams
--ngramsTest :: Text -> Int
--ngramsTest x = length ws
-- where
-- --txt = concat <$> lines <$> clean <$> readFile filePath
-- txt = clean x
-- -- | Number of sentences
-- --ls = sentences $ txt
-- -- | Number of monograms used in the full text
-- ws = ngrams $ txt
-- -- | stem ngrams
-- TODO
-- group ngrams
--ocs = occ $ ws
......
{-|
Module : Gargantext.Text.Context
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Context of text management tool
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Context
where
import Data.Text (Text, pack, unpack, length)
import Data.String (IsString)
import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
import Gargantext.Text
import Gargantext.Prelude hiding (length)
data SplitContext = Chars Int | Sentences Int | Paragraphs Int
tag = parseTags
-- | splitBy contexts of Chars or Sentences or Paragraphs
-- >> splitBy (Chars 0) "abcde"
-- ["a","b","c","d","e"]
-- >> splitBy (Chars 1) "abcde"
-- ["ab","bc","cd","de"]
-- >> splitBy (Chars 2) "abcde"
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy (Chars n) = map pack . chunkAlong (n+1) 1 . unpack
splitBy (Sentences n) = map unsentences . chunkAlong (n+1) 1 . sentences
splitBy (Paragraphs _) = map unTag . filter isTagText . tag
where
unTag :: IsString p => Tag p -> p
unTag (TagText x) = x
unTag _ = ""
......@@ -12,84 +12,14 @@ Mainly reexport functions in @Data.Text.Metrics@
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Metrics where
module Gargantext.Text.Metrics (levenshtein
, levenshteinNorm
, damerauLevenshtein
, damerauLevenshteinNorm
, overlap
, jaccard
, hamming
) where
import Data.Text (Text)
import GHC.Real (Ratio)
import qualified Data.Text.Metrics as DTM
import Gargantext.Prelude
{- * Example de titre
-}
-- | This module provide metrics to compare Text
-- starting as an API rexporting main functions of the great lib
-- text-metrics of Mark Karpov
-- | Levenshtein Distance
-- In information theory, Linguistics and computer science,
-- the Levenshtein distance is a string metric for measuring
-- the difference between two sequences.
-- See: https://en.wikipedia.org/wiki/Levenshtein_distance
--import Data.Text (Text)
--import GHC.Real (Ratio)
--import qualified Data.Text.Metrics as DTM
--
levenshtein :: Text -> Text -> Int
levenshtein = DTM.levenshtein
-- | Return normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
--import Gargantext.Prelude
--
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm = DTM.levenshteinNorm
-- | Return Damerau-Levenshtein distance between two 'Text' values. The
-- function works like 'levenshtein', but the collection of allowed
-- operations also includes transposition of two /adjacent/ characters.
-- See also:
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
--
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein = DTM.damerauLevenshtein
-- damerau-Levenshtein distance normalized
--
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
-- Treating inputs like sets
-- | Return overlap coefficient for two 'Text' values. Returned value
-- is in the range from 0 (no similarity) to 1 (exact match). Return 1
-- if both 'Text' values are empty.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
overlap :: Text -> Text -> Ratio Int
overlap = DTM.overlap
-- | Jaccard distance
-- measures dissimilarity between sample sets
jaccard :: Text -> Text -> Ratio Int
jaccard = DTM.jaccard
-- | Hamming Distance
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols are different. In other words, it measures the minimum number of
-- substitutions required to change one string into the other
-- See: https://en.wikipedia.org/wiki/Hamming_distance
hamming :: Text -> Text -> Maybe Int
hamming = DTM.hamming
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
{-|
Module : Gargantext.Text.Metrics.CharByChar
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mainly reexport functions in @Data.Text.Metrics@
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Metrics.CharByChar (levenshtein
, levenshteinNorm
, damerauLevenshtein
, damerauLevenshteinNorm
, overlap
, jaccard
, hamming
) where
import Data.Text (Text)
import GHC.Real (Ratio)
import qualified Data.Text.Metrics as DTM
import Gargantext.Prelude
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
{- * Example de titre
-}
-- | This module provide metrics to compare Text
-- starting as an API rexporting main functions of the great lib
-- text-metrics of Mark Karpov
-- | Levenshtein Distance
-- In information theory, Linguistics and computer science,
-- the Levenshtein distance is a string metric for measuring
-- the difference between two sequences.
-- See: https://en.wikipedia.org/wiki/Levenshtein_distance
--
levenshtein :: Text -> Text -> Int
levenshtein = DTM.levenshtein
-- | Return normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
--
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm = DTM.levenshteinNorm
-- | Return Damerau-Levenshtein distance between two 'Text' values. The
-- function works like 'levenshtein', but the collection of allowed
-- operations also includes transposition of two /adjacent/ characters.
-- See also:
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
--
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein = DTM.damerauLevenshtein
-- damerau-Levenshtein distance normalized
--
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
-- Treating inputs like sets
-- | Return overlap coefficient for two 'Text' values. Returned value
-- is in the range from 0 (no similarity) to 1 (exact match). Return 1
-- if both 'Text' values are empty.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
overlap :: Text -> Text -> Ratio Int
overlap = DTM.overlap
-- | Jaccard distance
-- measures dissimilarity between sample sets
jaccard :: Text -> Text -> Ratio Int
jaccard = DTM.jaccard
-- | Hamming Distance
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols are different. In other words, it measures the minimum number of
-- substitutions required to change one string into the other
-- See: https://en.wikipedia.org/wiki/Hamming_distance
hamming :: Text -> Text -> Maybe Int
hamming = DTM.hamming
......@@ -15,7 +15,7 @@ Domain Specific Language to manage Frequent Item Set (FIS)
module Gargantext.Text.Metrics.FrequentItemSet
( Fis, Size
, occ, cooc
, occ_hlcm, cooc_hlcm
, all, between
, module HLCM
)
......@@ -34,12 +34,12 @@ type Size = Either Int (Int, Int)
------------------------------------------------------------------------
-- | Occurrence is Frequent Item Set of size 1
occ :: Frequency -> [[Item]] -> [Fis]
occ f is = fisWithSize (Left 1) f is
occ_hlcm :: Frequency -> [[Item]] -> [Fis]
occ_hlcm f is = fisWithSize (Left 1) f is
-- | Cooccurrence is Frequent Item Set of size 2
cooc :: Frequency -> [[Item]] -> [Fis]
cooc f is = fisWithSize (Left 2) f is
cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
cooc_hlcm f is = fisWithSize (Left 2) f is
all :: Frequency -> [[Item]] -> [Fis]
all f is = fisWith Nothing f is
......@@ -89,3 +89,39 @@ fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f
------------------------------------------------------------------------
------------------------------------------------------------------------
--
---- | /!\ indexes are not the same:
--
---- | Index ngrams from Map
----indexNgram :: Ord a => Map a Occ -> Map Index a
----indexNgram m = fromList (zip [1..] (keys m))
--
---- | Index ngrams from Map
----ngramIndex :: Ord a => Map a Occ -> Map a Index
----ngramIndex m = fromList (zip (keys m) [1..])
--
--indexWith :: Ord a => Map a Occ -> [a] -> [Int]
--indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
--
--indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
--indexIt xs = (m, is)
-- where
-- m = sumOcc (map occ xs)
-- is = map (indexWith m) xs
--
--list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
--list2fis n xs = (m', fs)
-- where
-- (m, is) = indexIt xs
-- m' = M.filter (>50000) m
-- fs = FIS.all n is
--
--text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fis n xs = list2fis n (map terms xs)
--
----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
----text2fisWith = undefined
--
......@@ -28,43 +28,92 @@ Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrence
module Gargantext.Text.Metrics.Occurrences
where
import Gargantext.Prelude
import Control.Arrow ((***))
import qualified Data.List as List
import Data.Map.Strict (Map
, empty
, insertWith, unionWith
, empty, singleton
, insertWith, insertWithKey, unionWith
, toList, lookup, mapKeys
)
import Data.Set (Set)
import qualified Data.Set as Set
import Data.Text (pack)
import qualified Data.Map.Strict as DMS
import Control.Monad ((>>),(>>=))
import Data.String (String())
import Data.Attoparsec.Text
import Data.Text (Text)
import Data.Either.Extra(Either(..))
import qualified Data.Text as T
import Control.Applicative hiding (empty)
-----------------------------------------------------------
------------------------------------------------------------------------
import Gargantext.Prelude
import Gargantext.Core.Types
------------------------------------------------------------------------
type Occ a = Map a Int
type Cooc a = Map (a, a) Int
type FIS a = Map (Set a) Int
data Group = ByStem | ByOntology
type Grouped = Stems
{-
-- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
-- >> map occurrences <$> Prelude.mapM (terms Mono EN)
-- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
--λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
--λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
--λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
--λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
----
-}
type Occs = Int
type Coocs = Int
removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
removeApax = DMS.filter (> 1)
cooc :: [[Terms]] -> Map (Label, Label) Int
cooc tss =
mapKeys (delta $ labelPolicy terms_occs) $ cooc' (map (Set.fromList . map _terms_stem) tss)
where
terms_occs = occurrences (List.concat tss)
delta f = f *** f
type Occ = Int
-- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ
occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
-- TODO add groups and filter stops
sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc xs = foldl' (unionWith (+)) empty xs
cooc' :: Ord b => [Set b] -> Map (b, b) Coocs
cooc' tss = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
where
xs = [ ((x, y), 1)
| xs <- tss
, ys <- tss
, x <- Set.toList xs
, y <- Set.toList ys
, x < y
]
-- | Compute the grouped occurrences (occ)
occurrences :: [Terms] -> Map Grouped (Map Terms Int)
occurrences = occurrences' _terms_stem
occurrences' :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
occurrences' f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
occurrenceParser :: Text -> Parser Bool
occurrenceParser txt = manyTill anyChar (string txt) >> pure True
-- TODO add groups and filter stops
sumOcc :: Ord a => [Occ a] -> Occ a
sumOcc xs = foldl' (unionWith (+)) empty xs
occurrencesParser :: Text -> Parser Int
occurrencesParser txt = case txt of
"" -> pure 0
_ -> many (occurrenceParser txt') >>= \matches -> pure (length matches)
where
txt' = T.toLower txt
parseOccurrences :: Text -> Text -> Either String Int
parseOccurrences x = parseOnly (occurrencesParser x)
{-|
Module : Gargantext.Text.Ngrams.PosTagging.Lang.En
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Ngrams.PosTagging.Lang.En (group)
where
--import Data.Text (Text)
import Data.Maybe (Maybe(Just))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import Gargantext.Prelude
group :: [NgramsTag] -> [NgramsTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
-- group3 :: POS -> POS -> POS -> [NgramsTag] -> [NgramsTag]
-- group xs = group3 NN IN DT xs
-- TO BE REMOVED old code
--groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs)
-- where
-- jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
-- cc = (c1, "CC", c1')
-- jn1 = (j1, "JJ", j1')
-- jn2 = jn j2 j3 j2'
--
--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs) = groupNgrams (jn1:jn2:xs)
-- where
-- jn j m mm p = (j <> " " <> m, p, mm)
-- jn1 = jn j1 n nn ("NN+CC" :: Text)
-- jn2 = jn j2 n nn ("NN+CC" :: Text)
--
--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs) = groupNgrams (jn1:jn2:xs)
-- where
-- jn j m mm p = (j <> " " <> m, p, mm)
-- jn1 = jn j1 n nn ("NN+CC" :: Text)
-- jn2 = jn j2 n nn ("NN+CC" :: Text)
--
--groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs) = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
--groupNgrams ((x,"JJ",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NNP",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NP",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--
---- extractNgrams "Test the antiinflammatory or analgesic activity?"
---- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
---- > should be (antiinflammatory activity) <> (analgesic activity)
--
--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
--
--groupNgrams (x:xs) = (x:(groupNgrams xs))
--textTest :: [Text]
--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
-- , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
-- , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
-- , "In both models, the standard drug used was aspirin 100 mg/kg. "
-- , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
-- , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
-- , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
-- , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
-- , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
{-|
Module : Gargantext.Text.Ngrams.PosTagging.Lang.Fr
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This @group@ function groups horizontally ngrams in their context of
sentence according to grammars specific of each language. In english, JJ
is ADJectiv in french. -
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (group)
where
import Data.Maybe (Maybe(Just))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import Gargantext.Prelude
group :: [NgramsTag] -> [NgramsTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 NP JJ
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
------------------------------------------------------------------------
--module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (selectNgrams, groupNgrams, textTest)
-- where
--
--import Gargantext.Prelude
--import Data.Text (Text)
--import Data.Monoid ((<>))
--
--selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
--selectNgrams xs = filter selectNgrams' xs
-- where
-- selectNgrams' (_,"N" ,_ ) = True
-- selectNgrams' (_,"NC" ,_ ) = True
-- selectNgrams' (_,"NN+CC",_ ) = True
-- selectNgrams' (_,_ ,"PERSON" ) = True
-- selectNgrams' (_,_ ,"ORGANIZATION") = True
-- selectNgrams' (_,_ ,"LOCATION" ) = True
-- selectNgrams' (_,_ ,_ ) = False
--
--
--groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
--groupNgrams [] = []
--
----groupNgrams ((_,"DET",_):xs) = groupNgrams xs
--
---- "Groupe : nom commun et adjectifs avec conjonction"
--groupNgrams ((n,"NC",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
-- where
-- n1 = (n <> " " <> j1, "NC", n')
-- n2 = (n <> " " <> j2, "NC", n')
--
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((n,"N",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
-- where
-- n1 = (n <> " " <> j1, "N", n')
-- n2 = (n <> " " <> j2, "N", n')
--
---- Groupe : Adjectif + Conjonction de coordination + Adjectif
---- groupNgrams ((j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",j2'):xs) = groupNgrams ((j1 <> " " <> j2, "ADJ", j2'):xs)
--
---- Groupe : Nom commun + préposition + Nom commun
--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
--groupNgrams ((n1,"NC",_):(prep,"P",_):(det,"DET",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> prep <> " " <> det <> " " <> n2, "NC", n2'):xs)
--
---- Groupe : Plusieurs adjectifs successifs
--groupNgrams ((x,"ADJ",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "ADJ", yy):xs)
--
---- Groupe : nom commun et adjectif
--groupNgrams ((x,"NC",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((x,"N",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
--
---- Groupe : adjectif et nom commun
--groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((x,"ADJ",_):(y,"N",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
--
--
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--
--
---- Si aucune des règles précédentes n'est remplie
--groupNgrams (x:xs) = (x:(groupNgrams xs))
--
--
--textTest :: [Text]
--textTest = [ "L'heure d'arrivée des coureurs dépend de la météo du jour."]
--
{-|
Module : Gargantext.Text.Ngrams.PosTagging.Parser
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
Ngrams selection algorithms
A form is a list of characters seperated by one or more spaces in a sentence.
A word is a form.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Text.Ngrams.PosTagging.Parser
where
import Data.Text hiding (map, group)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import qualified Gargantext.Text.Ngrams.PosTagging.Lang.En as En
import qualified Gargantext.Text.Ngrams.PosTagging.Lang.Fr as Fr
extractNgrams :: Lang -> Text -> IO [[NgramsTag]]
extractNgrams lang s = map (group lang) <$> extractNgrams' lang s
extractNgrams' :: Lang -> Text -> IO [[NgramsTag]]
extractNgrams' lang t = map tokens2ngramsTags
<$> map _sentenceTokens
<$> _sentences
<$> corenlp lang t
---- | This function analyse and groups (or not) ngrams according to
---- grammars specific of each language.
group :: Lang -> [NgramsTag] -> [NgramsTag]
group EN = En.group
group FR = Fr.group
......@@ -37,6 +37,7 @@ import Data.Either.Extra(Either())
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import qualified Data.Text as DT
----
--import Control.Monad (join)
import Codec.Archive.Zip (withArchive, getEntry, getEntries)
......@@ -51,16 +52,16 @@ import Gargantext.Text.Parsers.WOS (wosParser)
---- import Gargantext.Parsers.DOC (docParser)
---- import Gargantext.Parsers.ODT (odtParser)
--import Gargantext.Prelude (pm)
--import Gargantext.Types.Main (ErrorMessage(), Corpus)
-- FIXME
--type Field = Text
type ParseError = String
--
--data Corpus = Corpus { _corpusErrors :: [ParseError]
-- , _corpusMap :: Map FilePath (Map Field Text)
-- }
type Field = Text
type Document = DM.Map Field Text
type FilesParsed = DM.Map FilePath FileParsed
data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
, _fileParsed_result :: [Document]
} deriving (Show)
-- | According to the format of Input file,
......@@ -75,37 +76,43 @@ data FileFormat = WOS -- Implemented (ISI Format)
-- TODO: to debug maybe add the filepath in error message
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
files <- case takeExtension path of
".zip" -> openZip path
_ -> pure <$> DB.readFile path
(as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
pure (as, map toText $ concat bs)
where
-- TODO : decode with bayesian inference on encodings
toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
-- | withParser:
-- According the format of the text, choosing the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = wosParser
--withParser DOC = docParser
--withParser ODT = odtParser
--withParser XML = xmlParser
--withParser _ = error "[ERROR] Parser not implemented yet"
runParser :: FileFormat -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text
openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
path <- resolveFile' fp
entries <- withArchive path (DM.keys <$> getEntries)
bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
pure bs
--parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
--parse format path = do
-- files <- case takeExtension path of
-- ".zip" -> openZip path
-- _ -> pure <$> DB.readFile path
-- (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
-- pure (as, map toText $ concat bs)
-- where
-- -- TODO : decode with bayesian inference on encodings
-- toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
--
--
---- | withParser:
---- According the format of the text, choosing the right parser.
---- TODO withParser :: FileFormat -> Parser [Document]
--withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
--withParser WOS = wosParser
----withParser DOC = docParser
----withParser ODT = odtParser
----withParser XML = xmlParser
----withParser _ = error "[ERROR] Parser not implemented yet"
--
--runParser :: FileFormat -> DB.ByteString
-- -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
--runParser format text = pure $ parseOnly (withParser format) text
--
--openZip :: FilePath -> IO [DB.ByteString]
--openZip fp = do
-- path <- resolveFile' fp
-- entries <- withArchive path (DM.keys <$> getEntries)
-- bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
-- pure bs
clean :: Text -> Text
clean txt = DT.map clean' txt
where
clean' '’' = '\''
clean' c = c
{-|
Module : Gargantext.Text.Parsers.CSV
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
CSV parser for Gargantext corpus files.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module Gargantext.Text.Parsers.CSV where
import GHC.Real (round)
import GHC.IO (FilePath)
import Control.Applicative
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length)
import qualified Data.ByteString.Lazy as BL
import Data.Vector (Vector)
import qualified Data.Vector as V
import Safe (tailMay)
import Gargantext.Text
import Gargantext.Text.Context
import Gargantext.Prelude hiding (length)
---------------------------------------------------------------
data Doc = Doc
{ d_docId :: !Int
, d_title :: !Text
, d_source :: !Text
, d_publication_year :: !Int
, d_publication_month :: !Int
, d_publication_day :: !Int
, d_abstract :: !Text
, d_authors :: !Text
}
deriving (Show)
---------------------------------------------------------------
toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList
$ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
-> Doc nId t s py pm pd abst auth )
(V.enumFromN 1 (V.length v'')) v''
where
v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
---------------------------------------------------------------
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs
where
fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
if docSize > 1000
then
if (mod (round m) docSize) >= 10
then
splitDoc' splt doc
else
V.fromList [doc]
else
V.fromList [doc]
splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
where
firstDoc = CsvDoc t s py pm pd firstAbstract auth
firstAbstract = head' abstracts
nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
abstracts = (splitBy $ contextSize) abst
head' x = maybe "" identity (head x)
tail' x = maybe [""] identity (tailMay x)
---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls
where
ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
---------------------------------------------------------------
data CsvDoc = CsvDoc
{ c_title :: !Text
, c_source :: !Text
, c_publication_year :: !Int
, c_publication_month :: !Int
, c_publication_day :: !Int
, c_abstract :: !Text
, c_authors :: !Text
}
deriving (Show)
instance FromNamedRecord CsvDoc where
parseNamedRecord r = CsvDoc <$> r .: "title"
<*> r .: "source"
<*> r .: "publication_year"
<*> r .: "publication_month"
<*> r .: "publication_day"
<*> r .: "abstract"
<*> r .: "authors"
instance ToNamedRecord CsvDoc where
toNamedRecord (CsvDoc t s py pm pd abst aut) =
namedRecord [ "title" .= t
, "source" .= s
, "publication_year" .= py
, "publication_month" .= pm
, "publication_day" .= pd
, "abstract" .= abst
, "authors" .= aut
]
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = (defaultDecodeOptions
{decDelimiter = fromIntegral $ ord '\t'}
)
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = ( defaultEncodeOptions
{encDelimiter = fromIntegral $ ord '\t'}
)
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
csvData <- BL.readFile fp
case decodeByNameWith csvDecodeOptions csvData of
Left e -> panic (pack e)
Right csvDocs -> pure csvDocs
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp $
encodeByNameWith csvEncodeOptions h (V.toList vs)
......@@ -18,8 +18,9 @@ DGP.parseDate1 DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date (parseDate1, Lang(FR, EN), parseDate, fromRFC3339, parseTimeOfDay, getMultiplicator) where
module Gargantext.Text.Parsers.Date (parseDate1, parseDate, fromRFC3339, parseTimeOfDay, getMultiplicator) where
import Gargantext.Core (Lang(FR,EN))
import Gargantext.Prelude
import Prelude (toInteger, div, otherwise, (++))
--import Gargantext.Types.Main as G
......@@ -30,7 +31,8 @@ import Data.Time.LocalTime (utc)
import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale)
, DucklingTime(DucklingTime)
)
import Duckling.Core (makeLocale, Lang(FR,EN), Some(This), Dimension(Time))
import Duckling.Core (makeLocale, Some(This), Dimension(Time))
import qualified Duckling.Core as DC
import Duckling.Types (jsonValue, Entity)
import Duckling.Api (analyze, parse)
......@@ -63,9 +65,9 @@ import Text.XML.HXT.DOM.Util (decimalStringToInt)
-- TODO add Paris at Duckling.Locale Region datatype
-- | To get Homogeinity of the languages
-- TODO : put this in a more generic place in the source code
--parserLang :: G.Language -> Lang
--parserLang G.FR = FR
--parserLang G.EN = EN
parserLang :: Lang -> DC.Lang
parserLang FR = DC.FR
parserLang EN = DC.EN
-- | Final Date parser API
......@@ -93,7 +95,7 @@ utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time
-- | Local Context which depends on Lang and Time
localContext :: Lang -> DucklingTime -> Context
localContext lang dt = Context {referenceTime = dt, locale = makeLocale lang Nothing}
localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}
-- | Date parser with Duckling
parseDateWithDuckling :: Lang -> Text -> IO [ResolvedToken]
......
{-|
Module : Gargantext.Text.Search
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This search Engine is first made to clean CSV file according to a query.
Starting from this model, a specific Gargantext engine will be made
(using more metrics scores/features).
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings, NamedFieldPuns #-}
module Gargantext.Text.Search where
import Data.SearchEngine
import Data.Ix
-- Usefull to use stopwords
-- import Data.Set (Set)
-- import qualified Data.Set as Set
import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Text.Terms.Mono (monoterms)
import Gargantext.Text.Terms.Mono.Stem as ST
import Gargantext.Text.Parsers.CSV
type DocId = Int
type DocSearchEngine = SearchEngine
Doc
DocId
DocField
NoFeatures
data DocField = TitleField
| AbstractField
deriving (Eq, Ord, Enum, Bounded, Ix, Show)
initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine =
initSearchEngine docSearchConfig defaultSearchRankParameters
docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures
docSearchConfig =
SearchConfig {
documentKey = d_docId,
extractDocumentTerms = extractTerms,
transformQueryTerm = normaliseQueryToken,
documentFeatureValue = const noFeatures
}
where
extractTerms :: Doc -> DocField -> [Text]
extractTerms doc TitleField = monoterms (d_title doc)
extractTerms doc AbstractField = monoterms (d_abstract doc)
normaliseQueryToken :: Text -> DocField -> Text
normaliseQueryToken tok =
let tokStem = ST.stem ST.EN
in \field -> case field of
TitleField -> tokStem tok
AbstractField -> tokStem tok
defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
SearchRankParameters {
paramK1,
paramB,
paramFieldWeights,
paramFeatureWeights = noFeatures,
paramFeatureFunctions = noFeatures,
paramResultsetSoftLimit = 2000,
paramResultsetHardLimit = 4000,
paramAutosuggestPrefilterLimit = 500,
paramAutosuggestPostfilterLimit = 500
}
where
paramK1 :: Float
paramK1 = 1.5
paramB :: DocField -> Float
paramB TitleField = 0.9
paramB AbstractField = 0.5
paramFieldWeights :: DocField -> Float
paramFieldWeights TitleField = 20
paramFieldWeights AbstractField = 5
......@@ -18,56 +18,40 @@ Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
Source: https://en.wikipedia.org/wiki/Ngrams
TODO
group Ngrams -> Tree
compute occ by node of Tree
group occs according groups
compute cooccurrences
compute graph
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams
module Gargantext.Text.Terms
where
import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, split, splitOn, pack)
import Data.Set (Set)
import qualified Data.Set as S
import Data.Text (Text)
import Data.Traversable
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Text.Ngrams.Stem (stem)
data Ngrams = Ngrams { _ngrams_label :: [Text]
, _ngrams_stem :: Set Text
} deriving (Show)
data Terms = MonoGrams | MultiGrams
type MonoGrams = Text
type MultiGrams = [Text]
ngrams :: Text -> [Text]
ngrams = monograms
text2ngrams :: Lang -> Text -> Ngrams
text2ngrams lang txt = Ngrams txt' (S.fromList $ map (stem lang) txt')
where
txt' = splitOn (pack " ") txt
equivNgrams :: Ngrams -> Ngrams -> Bool
equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
|| s2 `S.isSubsetOf` s1
--monograms :: Text -> [Text]
--monograms xs = monograms $ toLower $ filter isGram xs
monograms :: Text -> [Text]
monograms txt = split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoterms')
data TermType = Mono | Multi
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------
extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms])
extractTerms termType lang = mapM (terms termType lang)
------------------------------------------------------------------------
terms :: TermType -> Lang -> Text -> IO [Terms]
terms Mono lang txt = pure $ monoterms' lang txt
terms Multi lang txt = multiterms lang txt
------------------------------------------------------------------------
......@@ -13,7 +13,7 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.Lists
module Gargantext.Text.Terms.Lists
where
--import Data.Maybe
......@@ -35,4 +35,3 @@ data ListName = Stop | Candidate | Graph
--stop :: [Ngrams] -> [Ngrams]
--stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
{-|
Module : Gargantext.Text.Terms.Mono
Description : Mono Terms module
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mono-terms are Nterms where n == 1.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Mono (monoterms, monoterms')
where
import Data.Text (Text, toLower, split, splitOn, pack)
import qualified Data.Set as S
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Text.Terms.Mono.Stem (stem)
import Gargantext.Prelude
import Data.Char (isAlphaNum, isSpace)
monoterms' :: Lang -> Text -> [Terms]
monoterms' l txt = map (text2terms l) $ monoterms txt
monoterms :: Text -> [Text]
monoterms txt = map toLower $ split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
text2terms :: Lang -> Text -> Terms
text2terms lang txt = Terms label stems
where
label = splitOn (pack " ") txt
stems = S.fromList $ map (stem lang) label
--monograms :: Text -> [Text]
--monograms xs = monograms $ toLower $ filter isGram xs
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
......@@ -17,14 +17,16 @@ Source : https://en.wikipedia.org/wiki/Stemming
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.Stem
module Gargantext.Text.Terms.Mono.Stem (stem, Lang(..))
where
import Data.Text (Text)
import qualified Data.Text as DT
import qualified NLP.Stemmer as N
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
-- (stem, Stemmer(..))
......@@ -38,7 +40,6 @@ import Gargantext.Core (Lang(..))
-- A stemmer for English, for example, should identify the string "cats"
-- (and possibly "catlike", "catty" etc.) as based on the root "cat".
-- and
-- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
-- algorithm reduces the words "fishing", "fished", and "fisher" to the
......@@ -48,7 +49,6 @@ import Gargantext.Core (Lang(..))
-- "arguments" reduce to the stem "argument".
stem :: Lang -> Text -> Text
stem lang = DT.pack . N.stem lang' . DT.unpack
where
......
......@@ -17,7 +17,7 @@ Adapted from:
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.Stem.En (stem)
module Gargantext.Text.Terms.Mono.Stem.En (stemIt)
where
import Control.Monad
......@@ -196,8 +196,8 @@ step5 = step5b . step5a
allSteps :: [Char] -> [Char]
allSteps = step5 . step4 . step3 . step2 . step1
stem :: Text -> Text
stem s = pack (stem' $ unpack s)
stemIt :: Text -> Text
stemIt s = pack (stem' $ unpack s)
stem' :: [Char] -> [Char]
stem' s | length s < 3 = s
......
......@@ -17,17 +17,26 @@ Source: https://en.wikipedia.org/wiki/Tokenize
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.Token (tokenize)
module Gargantext.Text.Terms.Mono.Token (tokenize)
where
import Data.Text (Text)
import qualified Gargantext.Text.Ngrams.Token.Text as En
import qualified Gargantext.Text.Terms.Mono.Token.En as En
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
type Token = Text
-- >>> tokenize "A rose is a rose is a rose."
-- ["A","rose","is","a","rose","is","a","rose", "."]
--
data Context = Letter | Word | Sentence | Line | Paragraph
tokenize :: Text -> [Token]
tokenize = En.tokenize
tokenize' :: Lang -> Context -> [Token]
tokenize' = undefined
......@@ -7,13 +7,13 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Inspired from https://bitbucket.org/gchrupala/lingo/overview
First inspired from https://bitbucket.org/gchrupala/lingo/overview
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Ngrams.Token.Text
module Gargantext.Text.Terms.Mono.Token.En
( EitherList(..)
, Tokenizer
, tokenize
......@@ -30,13 +30,18 @@ module Gargantext.Text.Ngrams.Token.Text
)
where
import Data.Foldable (concatMap)
import qualified Data.Char as Char
import Data.Maybe
import Control.Monad
import Control.Applicative (Applicative)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Either
import Gargantext.Prelude
-- | A Tokenizer is function which takes a list and returns a list of Eithers
-- (wrapped in a newtype). Right Texts will be passed on for processing
-- to tokenizers down
......
{-|
Module : Gargantext.Text.Terms.Multi
Description : Multi Terms module
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Multi-terms are ngrams where n > 1.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Multi (multiterms)
where
import Data.Text hiding (map, group, filter, concat)
import Data.List (concat)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi.PosTagging
import qualified Gargantext.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = concat
<$> map (map tokenTag2terms)
<$> map (filter (\t -> _my_token_pos t == Just NP))
<$> tokenTags lang txt
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms (TokenTag w t _ _) = Terms w t
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = map (group lang) <$> tokenTags' lang s
tokenTags' :: Lang -> Text -> IO [[TokenTag]]
tokenTags' lang t = map tokens2tokensTags
<$> map _sentenceTokens
<$> _sentences
<$> corenlp lang t
---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language.
group :: Lang -> [TokenTag] -> [TokenTag]
group EN = En.group
group FR = Fr.group
{-|
Module : Gargantext.Text.Terms.Multi.Group
Description : English Grammar rules to group postag tokens.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Rule-based grammars are computed in this english module in order to
group the tokens into extracted terms.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Terms.Multi.Group (group2)
where
import Data.Maybe (Maybe(Just))
import Gargantext.Core.Types
import Gargantext.Prelude
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [TokenTag] -> [TokenTag]
group2 p1 p2 (x@(TokenTag _ _ (Just p1') _):y@(TokenTag _ _ (Just p2') _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
{-|
Module : Gargantext.Text.Terms.Multi.Lang.En
Description : English Grammar rules to group postag tokens.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Rule-based grammars are computed in this english module in order to group
the tokens into extracted terms.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Terms.Multi.Lang.En (group)
where
import Gargantext.Prelude
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi.Group
------------------------------------------------------------------------
-- | Rule grammar to group tokens
group :: [TokenTag] -> [TokenTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
--
--groupNgrams (x:xs) = (x:(groupNgrams xs))
{-|
Module : Gargantext.Text.Terms.Multi.Lang.Fr
Description : French Grammar rules to group postag tokens.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This @group@ function groups horizontally ngrams in their context of
sentence according to grammars specific of each language. In english, JJ
is ADJectiv in french.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Terms.Multi.Lang.Fr (group)
where
import Gargantext.Prelude
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi.Group (group2)
group :: [TokenTag] -> [TokenTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 NP JJ
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
-- TODO
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
{-|
Module : Gargantext.Text.Ngrams.PosTagging.CoreNLP
Description : CoreNLP module
Module : Gargantext.Text.Terms.Multi.PosTagging
Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
In corpus linguistics, part-of-speech tagging (POS tagging or PoS
tagging or POST), also called grammatical tagging or word-category
disambiguation, is the process of marking up a word in a text (corpus)
as corresponding to a particular part of speech,[1] based on both its
definition and its context—i.e., its relationship with adjacent and
related words in a phrase, sentence, or paragraph. A simplified form of
this is commonly taught to school-age children, in the identification of
words as nouns, verbs, adjectives, adverbs, etc.
Source: https://en.wikipedia.org/wiki/Part-of-speech_tagging
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE FlexibleContexts #-}
module Gargantext.Text.Ngrams.PosTagging.CoreNLP
module Gargantext.Text.Terms.Multi.PosTagging
where
import GHC.Generics
......@@ -26,14 +35,14 @@ import GHC.Show (Show(..))
import Data.ByteString.Lazy.Internal (ByteString)
import Data.Aeson.TH (deriveJSON)
import Data.Aeson
import Data.Monoid
import Data.Maybe (isJust)
import Data.Set (Set, fromList, empty)
import Data.Set (fromList)
import Data.Text (Text, splitOn, pack, toLower, unpack)
import Data.Text (Text, splitOn, pack, toLower)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
......@@ -44,53 +53,6 @@ import Control.Monad.IO.Class (MonadIO)
import Data.String.Conversions (ConvertibleStrings)
------------------------------------------------------------------------
data Tag = POS | NER
deriving (Show, Eq)
------------------------------------------------------------------------
------------------------------------------------------------------------
data POS = NP
| JJ | VB
| CC | IN | DT
| NoPos
deriving (Show, Generic, Eq)
------------------------------------------------------------------------
instance FromJSON POS where
parseJSON = withText "String" (\x -> pure (pos $ unpack x))
where
pos :: [Char] -> POS
pos "NP" = NP
pos "NN" = NP
pos "NC" = NP
pos "NNS" = NP
pos "NNP" = NP
pos "JJ" = JJ
pos "ADJ" = JJ
pos "VB" = VB
pos "VBN" = VB
pos "VBG" = VB
pos "CC" = CC
pos "IN" = IN
pos "DT" = DT
-- French specific
pos "P" = IN
pos _ = NoPos
instance ToJSON POS
------------------------------------------------------------------------
data NER = PERSON | ORGANIZATION | LOCATION | NoNER
deriving (Show, Generic)
------------------------------------------------------------------------
instance FromJSON NER where
parseJSON = withText "String" (\x -> pure (ner $ unpack x))
where
ner :: [Char] -> NER
ner "PERSON" = PERSON
ner "ORGANIZATION" = ORGANIZATION
ner "LOCATION" = LOCATION
ner _ = NoNER
instance ToJSON NER
------------------------------------------------------------------------
data Token = Token { _tokenIndex :: Int
, _tokenWord :: Text
......@@ -105,39 +67,21 @@ data Token = Token { _tokenIndex :: Int
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)
------------------------------------------------------------------------
data NgramsTag = NgramsTag { _my_token_word :: [Text]
, _my_token_stem :: Set Text
, _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER
} deriving (Show)
------------------------------------------------------------------------
tokens2ngramsTags :: [Token] -> [NgramsTag]
tokens2ngramsTags ts = select $ map ngramsTag ts
tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------
ngramsTag :: Token -> NgramsTag
ngramsTag (Token _ _ w s _ _ p n _ _) = NgramsTag w' s' p n
tokenTag :: Token -> TokenTag
tokenTag (Token _ _ w s _ _ p n _ _) = TokenTag w' s' p n
where
w' = split w
s' = fromList (split s)
split = splitOn (pack " ") . toLower
select :: [NgramsTag] -> [NgramsTag]
select xs = filter isNgrams xs
filter' :: [TokenTag] -> [TokenTag]
filter' xs = filter isNgrams xs
where
isNgrams (NgramsTag _ _ p n) = isJust p || isJust n
instance Monoid NgramsTag where
mempty = NgramsTag [] empty Nothing Nothing
mappend (NgramsTag w1 s1 p1 n1) (NgramsTag w2 s2 p2 _)
= NgramsTag (w1 <> w2) (s1 <> s2) p3 n1
where
p3 = case (p1,p2) of
(Just JJ, Just NP) -> Just NP
(Just VB, Just NP) -> Just NP
_ -> p1
mconcat = foldl mappend mempty
isNgrams (TokenTag _ _ p n) = isJust p || isJust n
------------------------------------------------------------------------
data Sentence = Sentence { _sentenceIndex :: Int
......
{-|
Module : Gargantext.Viz.Graph
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE DeriveGeneric #-}
module Gargantext.Viz.Graph
where
import GHC.Generics (Generic)
import Data.Aeson.TH (deriveJSON)
import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Core.Utils.Prefix (unPrefix)
-----------------------------------------------------------
data TypeNode = Terms | Unknown
deriving (Show, Generic)
$(deriveJSON (unPrefix "") ''TypeNode)
data Attributes = Attributes { clust_default :: Int }
deriving (Show, Generic)
$(deriveJSON (unPrefix "") ''Attributes)
data Node = Node { n_size :: Int
, n_type :: TypeNode
, n_id :: Text
, n_label :: Text
, n_attributes :: Attributes
}
deriving (Show, Generic)
$(deriveJSON (unPrefix "n_") ''Node)
data Edge = Edge { e_source :: Int
, e_target :: Int
, e_weight :: Double
, e_id :: Int
}
deriving (Show, Generic)
$(deriveJSON (unPrefix "e_") ''Edge)
data Graph = Graph { g_nodes :: [Node]
, g_edges :: [Edge]
}
deriving (Show, Generic)
$(deriveJSON (unPrefix "g_") ''Graph)
-----------------------------------------------------------
{-|
Module : Gargantext.Text.Ngrams.PosTagging
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017 - present
Module : Gargantext.Graph.Distances
Description : Distance management tools
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mainly reexport functions in @Data.Text.Metrics@
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Text.Ngrams.PosTagging
module Gargantext.Viz.Graph.Distances
where
{-|
Module : Gargantext.Graph.Distances.Conditional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Motivation and definition of the @Conditional@ distance.
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Viz.Graph.Distances.Conditional
where
import Data.Matrix hiding (identity)
import Data.String.Conversions (ConvertibleStrings(..))
import Data.List (concat, sortOn)
import qualified Data.List as L
import Data.Map (Map)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Viz.Graph.Utils
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Optimisation issue
toBeOptimized :: (Num a, Fractional a, Ord a) => Matrix a -> Matrix a
toBeOptimized m = proba Col m
------------------------------------------------------------------------
-- | Main Functions
-- Compute the probability from axis
-- x' = x / (sum Col x)
proba :: (Num a, Fractional a) => Axis -> Matrix a -> Matrix a
proba a m = mapOn a (\c x -> x / V.sum (axis a c m)) m
mapOn :: Axis -> (AxisId -> a -> a) -> Matrix a -> Matrix a
mapOn a f m = V.foldl' f' m (V.enumFromTo 1 (nOf a m))
where
f' m' c = mapOnly a f c m'
mapOnly :: Axis -> (AxisId -> a -> a) -> AxisId -> Matrix a -> Matrix a
mapOnly Col = mapCol
mapOnly Row = mapRow
mapAll :: (a -> a) -> Matrix a -> Matrix a
mapAll f m = mapOn Col (\_ -> f) m
---------------------------------------------------------------
-- | Compute a distance from axis
-- xs = (sum Col x') - x'
distFromSum :: (Num a, Fractional a)
=> Axis -> Matrix a -> Matrix a
distFromSum a m = mapOn a (\c x -> V.sum (axis a c m) - x) m
---------------------------------------------------------------
---------------------------------------------------------------
-- | To compute included/excluded or specific/generic scores
opWith :: (Fractional a1, Num a1)
=> (Matrix a2 -> t -> Matrix a1) -> Matrix a2 -> t -> Matrix a1
opWith op xs ys = mapAll (\x -> x / (2*n -1)) (xs `op` ys)
where
n = fromIntegral $ nOf Col xs
---------------------------------------------------------------
-------------------------------------------------------
conditional :: (Num a, Fractional a, Ord a) => Matrix a -> Matrix a
conditional m = filter (threshold m') m'
where
------------------------------------------------------------------------
-- | Main Operations
-- x' = x / (sum Col x)
x' = proba Col m
------------------------------------------------------------------------
-- xs = (sum Col x') - x'
xs = distFromSum Col x'
-- ys = (sum Row x') - x'
ys = distFromSum Row x'
------------------------------------------------------------------------
-- | Top included or excluded
ie = opWith (+) xs ys
-- ie = ( xs + ys) / (2 * (x.shape[0] - 1))
-- | Top specific or generic
sg = opWith (-) xs ys
-- sg = ( xs - ys) / (2 * (x.shape[0] - 1))
nodes_kept :: [Int]
nodes_kept = take k' $ S.toList
$ foldl' (\s (n1,n2) -> insert [n1,n2] s) S.empty
$ map fst
$ nodes_included k <> nodes_specific k
nodes_included n = take n $ sortOn snd $ toListsWithIndex ie
nodes_specific m = take m $ sortOn snd $ toListsWithIndex sg
insert as s = foldl' (\s' a -> S.insert a s') s as
k' = 2*k
k = 10
dico_nodes :: Map Int Int
dico_nodes = M.fromList $ zip [1..] nodes_kept
dico_nodes_rev = M.fromList $ zip nodes_kept [1..]
m' = matrix (length nodes_kept)
(length nodes_kept)
(\(i,j) -> getElem ((M.!) dico_nodes i) ((M.!) dico_nodes j) x')
threshold m = V.minimum $ V.map (\cId -> V.maximum $ getCol cId m) (V.enumFromTo 1 (nOf Col m))
filter t m = mapAll (\x -> filter' t x) m
where
filter' t x = case (x >= t) of
True -> x
False -> 0
------------------------------------------------------------------------
{-|
Module : Gargantext.Graph.Distances.Distributional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Motivation and definition of the @Distributional@ distance.
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Viz.Graph.Distances.Distributional
where
import Data.Matrix hiding (identity)
import Data.String.Conversions (ConvertibleStrings(..))
import Data.Map (Map)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Viz.Graph.Utils
distributional :: (Floating a, Ord a) => Matrix a -> [((Int, Int), a)]
distributional m = filter (\((x,y), d) -> foldl' (&&) True (conditions x y d) ) distriList
where
conditions x y d = [ (x /= y)
, (d > miniMax')
, ((M.lookup (x,y) distriMap) > (M.lookup (y,x) distriMap))
]
distriList = toListsWithIndex distriMatrix
distriMatrix = ri (mi m)
distriMap = M.fromList $ distriList
miniMax' = miniMax distriMatrix
ri :: (Ord a, Fractional a) => Matrix a -> Matrix a
ri m = matrix c r doRi
where
doRi (x,y) = doRi' x y m
doRi' x y mi'' = sumMin x y mi'' / (V.sum $ ax Col x y mi'')
sumMin x y mi' = V.sum $ V.map (\(a,b) -> min a b )
$ V.zip (ax Col x y mi') (ax Row x y mi')
(c,r) = (nOf Col m, nOf Row m)
mi :: (Ord a, Floating a) => Matrix a -> Matrix a
mi m = matrix c r createMat
where
(c,r) = (nOf Col m, nOf Row m)
createMat (x,y) = doMi x y m
doMi x y m = if x == y then 0 else (max (log (doMi' x y m)) 0 )
doMi' x y m = (getElem x y m) / ( cross x y m / total m )
cross x y m = (V.sum $ ax Col x y m) * (V.sum $ ax Row x y m)
ax :: Axis -> Int -> Int -> Matrix a -> Vector a
ax a i j m = dropAt j' $ axis a i' m
where
i' = div i c + 1
j' = mod r j + 1
(c,r) = (nOf Col m, nOf Row m)
miniMax :: (Ord a) => Matrix a -> a
miniMax m = V.minimum $ V.map (\c -> V.maximum $ getCol c m) (V.enumFromTo 1 (nOf Col m))
{-|
Module : Gargantext.Graph.Distances.Matrix
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Motivation and definition of the @Conditional@ distance.
Implementation use Accelerate library :
* Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
[Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
* Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
[Optimising Purely Functional GPU Programs][MCKL13].
In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
* Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
[Embedding Foreign Code][CMCK14].
In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
* Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
[Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Viz.Graph.Distances.Matrice
where
import Data.Array.Accelerate
import Data.Array.Accelerate.Interpreter (run)
import Data.Array.Accelerate.Smart
import Data.Array.Accelerate.Type
import Data.Array.Accelerate.Array.Sugar (fromArr, Array, Z)
import Data.Maybe (Maybe(Just))
import qualified Gargantext.Prelude as P
import qualified Data.Array.Accelerate.Array.Representation as Repr
import Gargantext.Text.Metrics.Occurrences
-----------------------------------------------------------------------
-- Test perf.
distriTest = distributional $ myMat 100
-----------------------------------------------------------------------
vector :: Int -> (Array (Z :. Int) Int)
vector n = fromList (Z :. n) [0..n]
matrix :: Elt c => Int -> [c] -> Matrix c
matrix n l = fromList (Z :. n :. n) l
myMat :: Int -> Matrix Int
myMat n = matrix n [1..]
-- | Two ways to get the rank (as documentation)
rank :: (Matrix a) -> Int
rank m = arrayRank $ arrayShape m
rank' :: (Matrix a) -> Int
rank' m = n
where
Z :. _ :. n = arrayShape m
-----------------------------------------------------------------------
-- | Conditional Distance
type Rank = Int
proba :: Rank -> Acc (Matrix Double) -> Acc (Matrix Double)
proba r mat = zipWith (/) mat (mkSum r mat)
mkSum :: Rank -> Acc (Matrix Double) -> Acc (Matrix Double)
mkSum r mat = replicate (constant (Z :. (r :: Int) :. All))
$ fold (+) 0 mat
type Matrix' a = Acc (Matrix a)
type InclusionExclusion = Double
type SpecificityGenericity = Double
conditional :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
conditional m = (run $ ie (use m), run $ sg (use m))
where
ie :: Matrix' Double -> Matrix' Double
ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
sg :: Acc (Matrix Double) -> Acc (Matrix Double)
sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
n :: Exp Double
n = P.fromIntegral r
r :: Rank
r = rank' m
xs :: Matrix' Double -> Matrix' Double
xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)
-- filter with threshold
-----------------------------------------------------------------------
-- | Distributional Distance
distributional :: Matrix Int -> Matrix Double
distributional m = run $ filter $ ri (map fromIntegral $ use m)
where
n = rank' m
miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
where
miniMax' = (the $ minimum $ maximum m)
filter m = zipWith (\a b -> max a b) m (transpose m)
ri mat = zipWith (/) mat1 mat2
where
mat1 = mkSum n $ zipWith min (mi mat) (mi $ transpose mat)
mat2 = mkSum n mat
mi m' = zipWith (\a b -> max (log $ a/b) 0) m'
$ zipWith (/) (crossProduct m') (total m')
total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
crossProduct m = zipWith (*) (cross m ) (cross (transpose m))
cross mat = zipWith (-) (mkSum n mat) (mat)
int2double :: Matrix Int -> Matrix Double
int2double m = run (map fromIntegral $ use m)
{-|
Module : Gargantext.Graph.Distances.Utils
Description : Tools to compute distances from Cooccurrences
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Basically @compute@ takes an accelerate function as first input, a Map
of coccurrences as second input and outputs a Map automatically using
indexes.
TODO:
--cooc2fgl :: Ord t, Integral n => Map (t, t) n -> Graph
--fgl2json
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Viz.Graph.Index
where
import qualified Data.Array.Accelerate as A
import qualified Data.Array.Accelerate.Interpreter as A
import Data.Array.Accelerate (Matrix, Elt, Shape, (:.)(..), Z(..))
import qualified Data.Vector.Unboxed as DVU
import Data.Maybe (fromMaybe)
import Data.Set (Set)
import qualified Data.Set as S
import Data.Map (Map)
import qualified Data.Map.Strict as M
import Gargantext.Prelude
type Index = Int
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
score :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
-> Map (t, t) Int
-> Map (t, t) Double
score f m = fromIndex fromI . mat2map . f $ cooc2mat toI m
where
(toI, fromI) = createIndexes m
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
cooc2mat :: Ord t => Map t Index -> Map (t, t) Int -> Matrix Int
cooc2mat ti m = map2mat 0 n idx
where
n = M.size ti
idx = toIndex ti m -- it is important to make sure that toIndex is ran only once.
map2mat :: Elt a => a -> Int -> Map (Index, Index) a -> Matrix a
map2mat def n m = A.fromFunction shape (\(Z :. x :. y) -> fromMaybe def $ M.lookup (x, y) m)
where
shape = (Z :. n :. n)
mat2map :: (Elt a, Shape (Z :. Index)) =>
A.Array (Z :. Index :. Index) a -> Map (Index, Index) a
mat2map m = M.fromList . map f . A.toList . A.run . A.indexed $ A.use m
where
Z :. _ :. n = A.arrayShape m
f ((Z :. i :. j), x) = ((i, j), x)
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
toIndex :: Ord t => Map t Index -> Map (t,t) a -> Map (Index,Index) a
toIndex ni ns = indexConversion ni ns
fromIndex :: Ord t => Map Index t -> Map (Index, Index) a -> Map (t,t) a
fromIndex ni ns = indexConversion ni ns
---------------------------------------------------------------------------------
indexConversion :: (Ord b, Ord k) => Map k b -> Map (k,k) a -> Map (b, b) a
indexConversion index ms = M.fromList $ map (\((k1,k2),c) -> ( ((M.!) index k1, (M.!) index k2), c)) (M.toList ms)
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
createIndexes :: Ord t => Map (t, t) b -> (Map t Index, Map Index t)
createIndexes = set2indexes . cooc2set
where
cooc2set :: Ord t => Map (t, t) a -> Set t
cooc2set cs' = foldl' (\s ((t1,t2),_) -> insert [t1,t2] s ) S.empty (M.toList cs')
where
insert as s = foldl' (\s' t -> S.insert t s') s as
set2indexes :: Ord t => Set t -> (Map t Index, Map Index t)
set2indexes s = (M.fromList toIndex', M.fromList fromIndex')
where
fromIndex' = zip [0..] xs
toIndex' = zip xs [0..]
xs = S.toList s
{-|
Module : Gargantext.Graph.Distances.Utils
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
These functions are used for Vector.Matrix only.
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Viz.Graph.Utils
where
import Data.Matrix hiding (identity)
import Data.Map (Map)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import qualified Data.List as L
import Gargantext.Prelude
------------------------------------------------------------------------
-- | Some utils to build the matrix from cooccurrence results
-- | For tests only, to be removed
-- m1 :: Matrix Double
-- m1 = fromList 300 300 [1..]
------------------------------------------------------------------------
------------------------------------------------------------------------
data Axis = Col | Row
------------------------------------------------------------------------
-- | Matrix functions
type AxisId = Int
-- Data.Vector.Additions
dropAt :: Int -> Vector a -> Vector a
dropAt n v = debut <> (V.tail fin)
where
debut = V.take n v
fin = V.drop n v
total :: Num a => Matrix a -> a
total m = V.sum $ V.map (\c -> V.sum (getCol c m)) (V.enumFromTo 1 (nOf Col m))
nOf :: Axis -> Matrix a -> Int
nOf Row = nrows
nOf Col = ncols
axis :: Axis -> AxisId -> Matrix a -> Vector a
axis Col = getCol
axis Row = getRow
toListsWithIndex :: Matrix a -> [((Int, Int), a)]
toListsWithIndex m = concat' $ zip [1..] $ map (\c -> zip [1..] c) $ toLists m
where
concat' :: [(Int, [(Int, a)])] -> [((Int, Int), a)]
concat' xs = L.concat $ map (\(x, ys) -> map (\(y, a) -> ((x,y), a)) ys ) xs
......@@ -3,6 +3,7 @@ extra-package-dbs: []
packages:
- .
- servant-job
- clustering-louvain
allow-newer: true
extra-deps:
......@@ -12,11 +13,14 @@ extra-deps:
commit: 6f0595d2421005837d59151a8b26eee83ebb67b5
- git: https://github.com/delanoe/servant-static-th.git
commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434
- accelerate-1.2.0.0
- accelerate-io-1.2.0.0
- aeson-1.2.4.0
- aeson-lens-0.5.0.0
- duckling-0.1.3.0
- extra-1.5.3
- full-text-search-0.2.1.4
- fullstop-0.1.4
- haskell-src-exts-1.18.2
- http-types-0.12.1
- protolude-0.2
......@@ -32,4 +36,4 @@ extra-deps:
- text-1.2.3.0
- text-show-3.6.2
- servant-flatten-0.2
resolver: lts-10.6
resolver: lts-11.10
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment