Commit 40da8153 authored by qlobbe's avatar qlobbe

working on the adaptative matching

parent f9d09a80
Pipeline #540 failed with stage
{-|
Module : Main.hs
Description : Gargantext starter binary with Adaptative Phylo
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adaptative Phylo binaries
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Strict #-}
module Main where
import Data.Aeson
import Data.ByteString.Lazy (ByteString)
import Data.Maybe ()
import Data.List (concat, nub)
import Data.String (String)
import Data.Text (Text, unwords)
import Gargantext.Prelude
import Gargantext.Text.Context (TermList)
import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Viz.AdaptativePhylo
import GHC.IO (FilePath)
import Prelude (Either(..))
import System.Environment
import System.Exit (exitFailure)
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
---------------
-- | Tools | --
---------------
-- | Print an important message: a blank line, a dashed rule, then the
--   message itself prefixed with "-- | " and followed by a blank line.
printIOMsg :: String -> IO ()
printIOMsg msg = putStrLn banner
  where
    banner :: String
    banner = "\n------------\n-- | " <> msg <> "\n"
-- | Print a comment with a newline before and after it.
printIOComment :: String -> IO ()
printIOComment cmt = putStrLn padded
  where
    padded :: String
    padded = "\n" <> cmt <> "\n"
-- | Read the raw content of a Json file as a lazy ByteString.
--   No decoding happens here: the caller decodes the returned bytes.
readJson :: FilePath -> IO ByteString
readJson = Lazy.readFile
-- | Replace the text of a (key, text) pair by the distinct terms of the
--   given patterns that were extracted from it; the key is passed through
--   unchanged. Every extracted term is rebuilt with 'unwords' and the
--   per-sentence results are flattened before duplicates are removed.
filterTerms :: Patterns -> (a, Text) -> (a, [Text])
filterTerms patterns (key, body) =
  (key, (nub . concat . map (map unwords) . extractTermsWithList patterns) body)
-- | Transform a Csv file into a readable corpus.
--   Each row becomes (publication year, title <> " " <> abstract) and at
--   most @limit@ rows are kept, in file order.
csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
csvToCorpus limit path = do
  parsed <- Csv.readFile path
  -- Csv.readFile yields a pair; only its second component (the rows) is used.
  pure (Vector.toList (Vector.take limit (Vector.map toEntry (snd parsed))))
  where
    toEntry row = (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row))
-- | Dispatch to the corpus reader matching the requested CorpusParser.
--   Only Csv is implemented; Wos still evaluates to 'undefined'
--   (to do: port the Wos reader from the legacy Main.hs).
fileToCorpus :: CorpusParser -> Int -> FilePath -> IO ([(Int,Text)])
fileToCorpus Wos _     _    = undefined
fileToCorpus Csv limit path = csvToCorpus limit path
-- | Parse a corpus file into a list of Document.
--   The corpus is read with the requested parser, then each text is reduced
--   to the terms matched by the patterns built from the TermList.
fileToDocs :: CorpusParser -> Int -> FilePath -> TermList -> IO [Document]
fileToDocs parser limit path lst = do
  corpus <- fileToCorpus parser limit path
  let patterns = buildPatterns lst
      toDocument entry = case filterTerms patterns entry of
        (year, terms) -> Document year terms
  pure (map toDocument corpus)
--------------
-- | Main | --
--------------
-- | Entry point: expects exactly one command-line argument, the path of the
--   Json configuration file. The file is decoded into a Config, the term
--   list and the corpus named in it are loaded, and the number of parsed
--   documents is printed.
--
--   With any other number of arguments a usage line is printed and the
--   process exits with a failure status (instead of dying on an opaque
--   pattern-match failure). A configuration that cannot be decoded is
--   reported by printing the decoder's error message.
main :: IO ()
main = do
  printIOMsg "Starting the reconstruction"
  printIOMsg "Read the configuration file"
  cliArgs <- getArgs
  case cliArgs of
    [configPath] -> do
      jsonArgs <- (eitherDecode <$> readJson configPath) :: IO (Either String Config)
      case jsonArgs of
        Left err -> putStrLn err
        Right config -> do
          printIOMsg "Parse the corpus"
          mapList <- csvGraphTermList (listPath config)
          corpus <- fileToDocs (corpusParser config) (corpusLimit config) (corpusPath config) mapList
          printIOComment (show (length corpus) <> " parsed docs from the corpus")
    _ -> do
      -- Wrong number of arguments: explain the expected call and fail.
      putStrLn ("Usage: gargantext-adaptative-phylo <config.json>" :: String)
      exitFailure
\ No newline at end of file
...@@ -68,6 +68,7 @@ library: ...@@ -68,6 +68,7 @@ library:
- Gargantext.Viz.Graph.Distances.Matrice - Gargantext.Viz.Graph.Distances.Matrice
- Gargantext.Viz.Graph.Index - Gargantext.Viz.Graph.Index
- Gargantext.Viz.Phylo - Gargantext.Viz.Phylo
- Gargantext.Viz.AdaptativePhylo
- Gargantext.Viz.Phylo.Tools - Gargantext.Viz.Phylo.Tools
- Gargantext.Viz.Phylo.Example - Gargantext.Viz.Phylo.Example
- Gargantext.Viz.Phylo.LevelMaker - Gargantext.Viz.Phylo.LevelMaker
...@@ -276,6 +277,31 @@ executables: ...@@ -276,6 +277,31 @@ executables:
- split - split
- unordered-containers - unordered-containers
# Executable for the Adaptative Phylo binary; its entry point is Main.hs
# under bin/gargantext-adaptative-phylo.
gargantext-adaptative-phylo:
main: Main.hs
source-dirs: bin/gargantext-adaptative-phylo
# Threaded runtime, RTS options enabled, -N passed to the RTS by default,
# -O2 optimisation, and a warning for top-level bindings without a signature.
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
# NOTE(review): Main.hs visibly imports only aeson, bytestring, vector, base
# and gargantext modules; confirm whether the other entries are needed.
dependencies:
- aeson
- async
- base
- bytestring
- containers
- directory
- gargantext
- vector
- parallel
- cassava
- ini
- optparse-generic
- split
- unordered-containers
gargantext-import: gargantext-import:
main: Main.hs main: Main.hs
source-dirs: bin/gargantext-import source-dirs: bin/gargantext-import
......
{-|
Module : Gargantext.Viz.AdaptativePhylo
Description : Phylomemy definitions and types.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Specifications of Phylomemy export format.
Phylomemy can be described as a Temporal Graph with different scale of
granularity of group of ngrams (terms and multi-terms).
The main type is Phylo which is synonym of Phylomemy (only difference is
the number of chars).
References:
Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
in science evolution — the rise and fall of scientific fields. PloS
one 8, e54847.
-}
{-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module Gargantext.Viz.AdaptativePhylo where
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Text (Text)
import Data.Vector (Vector)
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
import Gargantext.Text.Context (TermList)
import GHC.Generics
import GHC.IO (FilePath)
import Control.DeepSeq (NFData)
import Control.Lens (makeLenses)
----------------
-- | Config | --
----------------
-- | The corpus file formats a Config can select.
data CorpusParser
  = Wos
  | Csv
  deriving (Show, Generic)
-- | Parameters of a reconstruction; it has Json instances so it can be
--   decoded from a configuration file.
data Config = Config
  { corpusPath   :: FilePath      -- ^ path of the corpus file
  , listPath     :: FilePath      -- ^ path of the term-list file
  , outputPath   :: FilePath      -- ^ presumably where the result is written — confirm
  , corpusParser :: CorpusParser  -- ^ format of the corpus file
  , corpusLimit  :: Int           -- ^ maximum number of corpus entries to keep
  , phyloName    :: Text          -- ^ name of the phylomemy
  , phyloLevel   :: Int           -- ^ presumably the highest level to build — confirm
  , timePeriod   :: Int           -- ^ presumably the size of a period — confirm
  , timeStep     :: Int           -- ^ presumably the step between periods — confirm
  , fisSupport   :: Int           -- ^ presumably the minimal support of frequent item sets — confirm
  , fisSize      :: Int           -- ^ presumably the minimal size of frequent item sets — confirm
  , branchSize   :: Int           -- ^ presumably the minimal size of a branch — confirm
  , safeParall   :: Bool          -- ^ presumably toggles a safe parallel mode — confirm
  } deriving (Show, Generic)
-- Json instances, grouped by type. Every instance body is empty, so the
-- classes' default method implementations are used.
instance FromJSON CorpusParser
instance ToJSON   CorpusParser

instance FromJSON Config
instance ToJSON   Config
------------------
-- | Document | --
------------------
-- | Date : a simple Int (the example corpus and the Csv reader use a year)
type Date = Int
-- | Ngrams : a contiguous sequence of n terms, stored as a single Text
type Ngrams = Text
-- | Document : the Ngrams of a piece of text, linked to a Date
data Document = Document
  { date :: Date      -- ^ date attached to the document
  , text :: [Ngrams]  -- ^ Ngrams making up the document
  } deriving (Show, Generic, NFData)
--------------------
-- | Foundation | --
--------------------
-- | The Foundations of a Phylo, created from a given TermList:
--   the root Ngrams (strict field) together with the TermList itself.
data PhyloFoundations = PhyloFoundations
  { _foundations_roots   :: !(Vector Ngrams)  -- ^ the root Ngrams
  , _foundations_mapList :: TermList          -- ^ the TermList the foundations come from
  } deriving (Generic, Show, Eq)
----------------
-- | Lenses | --
----------------
-- Generate the lenses of PhyloFoundations from its underscore-prefixed
-- fields (PhyloExample uses the resulting foundations_roots).
makeLenses ''PhyloFoundations
------------------------
-- | JSON instances | --
------------------------
-- Derive the Json instances of PhyloFoundations with Template Haskell.
-- NOTE(review): unPrefix presumably drops the "_foundations_" prefix from
-- the Json keys — confirm against Gargantext.Core.Utils.Prefix.
$(deriveJSON (unPrefix "_foundations_" ) ''PhyloFoundations)
\ No newline at end of file
{-|
Module : Gargantext.Viz.Phylo.PhyloExample
Description : Phylomemy example based on history of Cleopatre.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-- | Caesar and Cleopatra
-- | Example of a phylomemy
-- | French without accents
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module Gargantext.Viz.Phylo.PhyloExample where
import Data.List (sortOn)
import Data.Text (Text, toLower)
import Gargantext.Prelude
import Gargantext.Text.Context (TermList)
import Gargantext.Text.Terms.Mono (monoTexts)
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Control.Lens
import qualified Data.Vector as Vector
--------------------------------------------
-- | STEP 1 | -- Init the Base of the Phylo
--------------------------------------------
-- Next is to build the config and the phyloLevel 0
-- | The example corpus as Documents: each text is split with 'monoTexts'
--   and only the Ngrams that are roots of the foundations are kept.
docs :: [Document]
docs = map toDocument corpus
  where
    toDocument (day, body) = Document day (filter isKnownRoot (monoTexts body))
    isKnownRoot ngram = isRoots ngram (foundations ^. foundations_roots)
-- | Foundations of the example: the lower-cased actants as roots,
--   paired with the example mapList.
foundations :: PhyloFoundations
foundations = PhyloFoundations roots mapList
  where
    roots = Vector.fromList (map toLower actants)
--------------------------------------------
-- | STEP 0 | -- Let's start with an example
--------------------------------------------
-- | TermList of the example: one entry per actant, made of the lower-cased
--   name alone and an empty second component.
mapList :: TermList
mapList = [ ([toLower actant], []) | actant <- actants ]
-- | Names of the example's protagonists and places; mapList and foundations
--   lower-case them before use.
actants :: [Ngrams]
actants = [ "Cleopatre" , "Ptolemee", "Ptolemee-XIII", "Ptolemee-XIV"
, "Marc-Antoine", "Cesar" , "Antoine" , "Octave" , "Rome"
, "Alexandrie" , "Auguste" , "Pompee" , "Cassius" , "Brutus"]
-- | The example corpus: (date, text) pairs, sorted by date with 'sortOn fst'.
corpus :: [(Date, Text)]
corpus = sortOn fst [ (-51,"Cleopatre règne sur l’egypte entre 51 et 30 av. J.-C. avec ses frères-epoux Ptolemee-XIII et Ptolemee-XIV, puis aux côtes du general romain Marc-Antoine. Elle est celèbre pour avoir ete la compagne de Jules Cesar puis d'Antoine, avec lesquels elle a eu plusieurs enfants. Partie prenante dans la guerre civile opposant Antoine à Octave, elle est vaincue à la bataille d'Actium en 31 av. J.-C. Sa defaite va permettre aux Romains de mener à bien la conquête de l’egypte, evenement qui marquera la fin de l'epoque hellenistique."), (-40,"Il existe relativement peu d'informations sur son sejour à Rome, au lendemain de l'assassinat de Cesar, ou sur la periode passee à Alexandrie durant l'absence d'Antoine, entre -40 et -37."), (-48,"L'historiographie antique lui est globalement defavorable car inspiree par son vainqueur, l'empereur Auguste, et par son entourage, dont l'interêt est de la noircir, afin d'en faire l'adversaire malfaisant de Rome et le mauvais genie d'Antoine. On observe par ailleurs que Cesar ne fait aucune mention de sa liaison avec elle dans les Commentaires sur la Guerre civile"), (-69,"Cleopatre est nee au cours de l'hiver -69/-686 probablement à Alexandrie."), (-48,"Pompee a en effet ete le protecteur de Ptolemee XII, le père de Cleopatre et de Ptolemee-XIII dont il se considère comme le tuteur."), (-48,"Ptolemee-XIII et Cleopatre auraient d'ailleurs aide Pompee par l'envoi d'une flotte de soixante navires."), (-48,"Mais le jeune roi Ptolemee-XIII et ses conseillers jugent sa cause perdue et pensent s'attirer les bonnes graces du vainqueur en le faisant assassiner à peine a-t-il pose le pied sur le sol egyptien, près de Peluse, le 30 juillet 48 av. J.-C., sous les yeux de son entourage."), (-48,"Cesar fait enterrer la tête de Pompee dans le bosquet de Nemesis en bordure du mur est de l'enceinte d'Alexandrie. 
Pour autant la mort de Pompee est une aubaine pour Cesar qui tente par ailleurs de profiter des querelles dynastiques pour annexer l’egypte."), (-48,"Il est difficile de se prononcer clairement sur les raisons qui ont pousse Cesar à s'attarder à Alexandrie. Il y a des raisons politiques, mais aussi des raisons plus sentimentales (Cleopatre ?). Il tente d'abord d'obtenir le remboursement de dettes que Ptolemee XII"), (-46,"Les deux souverains sont convoques par Cesar au palais royal d'Alexandrie. Ptolemee-XIII s'y rend après diverses tergiversations ainsi que Cleopatre."), (-47,"A Rome, Cleopatre epouse alors un autre de ses frères cadets, à Alexandrie, Ptolemee-XIV, sur l'injonction de Jules Cesar"), (-46,"Cesar a-t-il comme objectif de montrer ce qu'il en coûte de se revolter contre Rome en faisant figurer dans son triomphe la sœur de Cleopatre et de Ptolemee-XIV, Arsinoe, qui s'est fait reconnaître reine par les troupes de Ptolemee-XIII ?"), (-44,"Au debut de l'annee -44, Cesar est assassine par Brutus. Profitant de la situation confuse qui s'ensuit, Cleopatre quitte alors Rome à la mi-avril, faisant escale en Grèce. Elle parvient à Alexandrie en juillet -44."), (-44,"La guerre que se livrent les assassins de Cesar, Cassius et Brutus et ses heritiers, Octave et Marc-Antoine, oblige Cleopatre à des contorsions diplomatiques."), (-41,"Nous ignorons depuis quand Cleopatre, agee de 29 ans en -41, et Marc-Antoine, qui a une quarantaine d'annees, se connaissent. Marc-Antoine est l'un des officiers qui ont participe au retablissement de Ptolemee XII. Il est plus vraisemblable qu'ils se soient frequentes lors du sejour à Rome de Cleopatre."), (-42,"Brutus tient la Grèce tandis que Cassius s'installe en Syrie. Le gouverneur de Cleopatre à Chypre, Serapion, vient en aide à Cassius."), (-42,"Cassius aurait envisage de s'emparer d'Alexandrie quand le 'debarquement' en Grèce d'Antoine et d'Octave l'oblige à renoncer à ses projets")]
\ No newline at end of file
{-|
Module : Gargantext.Viz.Phylo.PhyloExport
Description : Exportation module of a Phylo
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module Gargantext.Viz.Phylo.PhyloExport where
\ No newline at end of file
{-|
Module : Gargantext.Viz.Phylo.PhyloMaker
Description : Maker engine for rebuilding a Phylo
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module Gargantext.Viz.Phylo.PhyloMaker where
\ No newline at end of file
{-|
Module : Gargantext.Viz.Phylo.PhyloTools
Description : Module dedicated to all the tools needed for making a Phylo
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.Viz.Phylo.PhyloTools where
import Data.Maybe (Maybe, fromMaybe)
import Data.Text (Text)
import Data.Vector (Vector)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import GHC.IO (FilePath)
import qualified Data.Vector as Vector
----------------
-- | Config | --
----------------
-- | Unwrap an optional value, falling back on the given default when it is Nothing
def :: a -> Maybe a -> a
def fallback candidate = fromMaybe fallback candidate
-- | Build a Config from optional settings; every argument left to Nothing
--   falls back on the default written next to its field below.
initConfig :: Maybe FilePath -> Maybe FilePath -> Maybe FilePath -> Maybe CorpusParser -> Maybe Int -> Maybe Text
           -> Maybe Int -> Maybe Int -> Maybe Int -> Maybe Int -> Maybe Int -> Maybe Int -> Maybe Bool -> Config
initConfig mCorpus mMapList mOutput mParser mLimit mName mLevel mPeriod mStep mSupport mClique mMinBranchSize mSafe =
  Config
    { corpusPath   = def "" mCorpus
    , listPath     = def "" mMapList
    , outputPath   = def "" mOutput
    , corpusParser = def Csv mParser
    , corpusLimit  = def 10000 mLimit
    , phyloName    = def "A phylomemy" mName
    , phyloLevel   = def 2 mLevel
    , timePeriod   = def 3 mPeriod
    , timeStep     = def 1 mStep
    , fisSupport   = def 3 mSupport
    , fisSize      = def 4 mClique
    , branchSize   = def 3 mMinBranchSize
    , safeParall   = def True mSafe
    }
---------------------
-- | Foundations | --
---------------------
-- | Tell whether the given Ngrams is an element of the given vector of roots
isRoots :: Ngrams -> Vector Ngrams -> Bool
isRoots n ns = n `Vector.elem` ns
\ No newline at end of file
{-|
Module : Gargantext.Viz.Phylo.SynchronicClustering
Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module Gargantext.Viz.Phylo.SynchronicClustering where
\ No newline at end of file
{-|
Module : Gargantext.Viz.Phylo.TemporalMatching
Description : Module dedicated to the adaptative temporal matching of a Phylo.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module Gargantext.Viz.Phylo.TemporalMatching where
\ No newline at end of file
...@@ -125,7 +125,9 @@ setPeakDotNode :: PhyloBranch -> Dot DotId ...@@ -125,7 +125,9 @@ setPeakDotNode :: PhyloBranch -> Dot DotId
setPeakDotNode pb = node (toBranchDotId $ pb ^. pb_id) setPeakDotNode pb = node (toBranchDotId $ pb ^. pb_id)
([FillColor [toWColor CornSilk], FontName "Arial", FontSize 40, Shape Egg, Style [SItem Bold []], Label (toDotLabel $ pb ^. pb_peak)] ([FillColor [toWColor CornSilk], FontName "Arial", FontSize 40, Shape Egg, Style [SItem Bold []], Label (toDotLabel $ pb ^. pb_peak)]
<> (setAttrFromMetrics $ pb ^. pb_metrics) <> (setAttrFromMetrics $ pb ^. pb_metrics)
<> [setAttr "nodeType" "peak"]) <> [ setAttr "nodeType" "peak"
, setAttr "branchId" ((pack $ show (fst $ getBranchId pb)) <> (pack $ show (snd $ getBranchId pb)))
])
-- | To set a Peak Edge -- | To set a Peak Edge
...@@ -188,14 +190,18 @@ setHtmlTable pn = H.Table H.HTable ...@@ -188,14 +190,18 @@ setHtmlTable pn = H.Table H.HTable
-- | To set a Node: a square Dot node labelled with the node's Html table and
--   tagged with its type, the bounds of its period and its branch id.
--   "from" is the first component of the node's period and "to" the second
--   one (it used to repeat the first component).
setDotNode :: PhyloNode -> Dot DotId
setDotNode pn = node (toNodeDotId $ pn ^. pn_id)
                     ([FontName "Arial", Shape Square, toLabel (setHtmlTable pn)]
                      <> [ setAttr "nodeType" "group"
                         , setAttr "from" (pack $ show (fst $ getNodePeriod pn))
                         , setAttr "to"   (pack $ show (snd $ getNodePeriod pn))
                         , setAttr "branchId" ((pack $ show (fst $ getNodeBranchId pn)) <> (pack $ show (snd $ getNodeBranchId pn)))
                         ])
-- | To set an Edge -- | To set an Edge
setDotEdge :: PhyloEdge -> Dot DotId setDotEdge :: PhyloEdge -> Dot DotId
setDotEdge pe setDotEdge pe
| pe ^. pe_weight == 100 = edge (toNodeDotId $ pe ^. pe_source) (toNodeDotId $ pe ^. pe_target) [Width 2, Color [toWColor Red]] | pe ^. pe_weight == 100 = edge (toNodeDotId $ pe ^. pe_source) (toNodeDotId $ pe ^. pe_target) [Width 2, Color [toWColor Red]]
| otherwise = edge (toNodeDotId $ pe ^. pe_source) (toNodeDotId $ pe ^. pe_target) [Width 2, Color [toWColor Black]] | otherwise = edge (toNodeDotId $ pe ^. pe_source) (toNodeDotId $ pe ^. pe_target) [Width 2, Color [toWColor Black], Constraint True]
-- | To set a Period Edge -- | To set a Period Edge
...@@ -215,8 +221,9 @@ viewToDot pv = digraph ((Str . fromStrict) $ pv ^. pv_title) ...@@ -215,8 +221,9 @@ viewToDot pv = digraph ((Str . fromStrict) $ pv ^. pv_title)
<> [setAttr "description" $ fromStrict $ pv ^. pv_description] <> [setAttr "description" $ fromStrict $ pv ^. pv_description]
<> [setAttr "filiation" $ (pack . show) $ pv ^. pv_filiation] <> [setAttr "filiation" $ (pack . show) $ pv ^. pv_filiation]
<> (setAttrFromMetrics $ pv ^. pv_metrics) <> (setAttrFromMetrics $ pv ^. pv_metrics)
<> [FontSize 30, LabelLoc VTop, Splines SplineEdges, Overlap ScaleOverlaps, <> [FontSize 30, LabelLoc VTop, NodeSep 1, RankSep [1], Rank SameRank, Splines SplineEdges, Overlap ScaleOverlaps
Ratio AutoRatio, Style [SItem Filled []],Color [toWColor White]]) , Ratio FillRatio
, Style [SItem Filled []],Color [toWColor White]])
-- set the peaks -- set the peaks
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment