Commit 74d2038a authored by qlobbe's avatar qlobbe Committed by Alexandre Delanoë

add a list parser param

parent 878907d0
......@@ -16,43 +16,40 @@ Adaptative Phylo binaries
module Main where
-- import Debug.Trace (trace)
import Control.Concurrent.Async (mapConcurrently)
import Crypto.Hash.SHA256 (hash)
import Data.Aeson
import Data.Either (Either(..))
import Data.Either (Either(..), fromRight)
import Data.List (concat, nub, isSuffixOf)
import Data.List.Split
import Data.Maybe (fromMaybe)
import Data.String (String)
import GHC.IO (FilePath)
import qualified Prelude as Prelude
import System.Environment
import System.Directory (listDirectory,doesFileExist)
import Data.Text (Text, unwords, unpack, replace, pack)
import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Data.Text as T
import Gargantext.Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import GHC.IO (FilePath)
import Gargantext.API.Ngrams.Prelude (toTermList)
import Gargantext.API.Ngrams.Types
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.API
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloStep)
import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
-- import Gargantext.API.Ngrams.Prelude (toTermList)
import Gargantext.Core.Viz.Phylo.API (toPhyloDate, toPhyloDate')
-- import Debug.Trace (trace)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude
import System.Directory (listDirectory,doesFileExist)
import System.Environment
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Text as T
import qualified Data.Vector as Vector
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
......@@ -60,24 +57,13 @@ data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
-- | Tools | --
---------------
-- | To get all the files in a directory or just a file
getFilesFromPath :: FilePath -> IO([FilePath])
getFilesFromPath :: FilePath -> IO [FilePath]
getFilesFromPath path = do
if (isSuffixOf "/" path)
then (listDirectory path)
else return [path]
--------------
-- | Json | --
--------------
-- | To read and decode a Json file
readJson :: FilePath -> IO Lazy.ByteString
readJson path = Lazy.readFile path
----------------
-- | Parser | --
----------------
......@@ -90,31 +76,26 @@ termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList p
-- | To transform a Wos file (or [file]) into a list of Docs
wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
wosToDocs limit patterns time path = do
files <- getFilesFromPath path
let parseFile' file = do
eParsed <- parseFile WOS (path <> file)
case eParsed of
Right ps -> pure ps
Left e -> panic $ "Error: " <> (pack e)
take limit
<$> map (\d -> let title = fromJust $ _hd_title d
abstr = if (isJust $ _hd_abstract d)
then fromJust $ _hd_abstract d
else ""
in Document (toPhyloDate
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d) time)
(toPhyloDate'
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d))
(termsInText patterns $ title <> " " <> abstr) Nothing [])
<$> concat
<$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d)
&& (isJust $ _hd_title d))
<$> parseFile' file) files
files <- getFilesFromPath path
take limit
<$> map (\d -> let title = fromJust $ _hd_title d
abstr = if (isJust $ _hd_abstract d)
then fromJust $ _hd_abstract d
else ""
in Document (toPhyloDate
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d) time)
(toPhyloDate'
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d) time)
(termsInText patterns $ title <> " " <> abstr) Nothing [])
<$> concat
<$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d)
&& (isJust $ _hd_title d))
<$> fromRight [] <$> parseFile WOS (path <> file) ) files
-- To transform a Csv file into a list of Document
......@@ -122,31 +103,21 @@ csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
csvToDocs parser patterns time path =
case parser of
Wos _ -> undefined
Csv limit -> do
eR <- Csv.readFile path
case eR of
Right r ->
pure $ Vector.toList
$ Vector.take limit
$ Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
(fromMaybe Csv.defaultMonth $ csv_publication_month row)
(fromMaybe Csv.defaultDay $ csv_publication_day row)
time)
(toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
(fromMaybe Csv.defaultMonth $ csv_publication_month row)
(fromMaybe Csv.defaultDay $ csv_publication_day row))
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing
[]
) $ snd r
Left e -> panic $ "Error: " <> (pack e)
Csv limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
(toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing
[]
) <$> snd <$> either (\err -> panic $ cs $ "CSV error" <> (show err)) identity <$> Csv.readFile path
Csv' limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
(toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row))
(toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
(termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
(Just $ csv'_weight row)
[csv'_source row]
(map (T.strip . pack) $ splitOn ";" (unpack $ (csv'_source row)))
) <$> snd <$> Csv.readWeightedCsv path
......@@ -168,10 +139,11 @@ fileToDocs' parser path time lst = do
-- Config time parameters to label
timeToLabel :: Config -> [Char]
timeToLabel config = case (timeUnit config) of
Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Month p s f -> ("time_months"<> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Epoch p s f -> ("time_epochs" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Month p s f -> ("time_months" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
seaToLabel :: Config -> [Char]
......@@ -182,7 +154,7 @@ seaToLabel config = case (seaElevation config) of
sensToLabel :: Config -> [Char]
sensToLabel config = case (phyloProximity config) of
Hamming -> undefined
Hamming _ -> undefined
WeightedLogJaccard s -> ("WeightedLogJaccard_" <> show s)
WeightedLogSim s -> ( "WeightedLogSim-sens_" <> show s)
......@@ -240,18 +212,23 @@ configToSha stage config = unpack
<> (show (phyloLevel config))
writePhylo :: [Char] -> Phylo -> IO ()
writePhylo path phylo = Lazy.writeFile path $ encode phylo
readPhylo :: [Char] -> IO Phylo
readPhylo path = do
phyloJson <- (eitherDecode <$> readJson path) :: IO (Either String Phylo)
case phyloJson of
readListV4 :: [Char] -> IO NgramsList
readListV4 path = do
listJson <- (eitherDecode <$> readJson path) :: IO (Either String NgramsList)
case listJson of
Left err -> do
putStrLn err
undefined
Right phylo -> pure phylo
Right listV4 -> pure listV4
fileToList :: ListParser -> FilePath -> IO TermList
fileToList parser path =
case parser of
V3 -> csvMapTermList path
V4 -> fromJust
<$> toTermList MapTerm NgramsTerms
<$> readListV4 path
--------------
......@@ -273,7 +250,7 @@ main = do
Right config -> do
printIOMsg "Parse the corpus"
mapList <- csvMapTermList (listPath config)
mapList <- fileToList (listParser config) (listPath config)
corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
printIOComment (show (length corpus) <> " parsed docs from the corpus")
......@@ -297,7 +274,6 @@ main = do
-- let phylo = toPhylo (setConfig config phyloStep)
-- QL: 2 files read from disk
phyloWithLinks <- if phyloWithLinksExists
then do
printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
......
......@@ -54,6 +54,7 @@ library:
- Gargantext.API.Ngrams
- Gargantext.API.Ngrams.Tools
- Gargantext.API.Ngrams.Types
- Gargantext.API.Ngrams.Prelude
- Gargantext.API.Admin.Settings
- Gargantext.API.Admin.EnvTypes
- Gargantext.API.Admin.Types
......@@ -73,6 +74,7 @@ library:
- Gargantext.Database.Query.Table.Node
- Gargantext.Database.Query.Table.Node.UpdateOpaleye
- Gargantext.Database.Query.Table.NgramsPostag
- Gargantext.Database.Schema.Ngrams
- Gargantext.Database.Prelude
- Gargantext.Database.Admin.Trigger.Init
- Gargantext.Database.Admin.Config
......
{-|
{-
Module : Gargantext.Core.Viz.AdaptativePhylo
Description : Phylomemy definitions and types.
Copyright : (c) CNRS, 2017-Present
......@@ -26,6 +26,8 @@ one 8, e54847.
module Gargantext.Core.Viz.Phylo where
import Data.Swagger
import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
import Control.DeepSeq (NFData)
import Control.Lens (makeLenses)
import Data.Aeson
......@@ -50,6 +52,14 @@ data CorpusParser =
| Csv' {_csv'_limit :: Int}
deriving (Show,Generic,Eq)
instance ToSchema CorpusParser where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
data ListParser = V3 | V4 deriving (Show,Generic,Eq)
instance ToSchema ListParser
data SeaElevation =
Constante
{ _cons_start :: Double
......@@ -58,6 +68,8 @@ data SeaElevation =
{ _adap_granularity :: Double }
deriving (Show,Generic,Eq)
instance ToSchema SeaElevation
data Proximity =
WeightedLogJaccard
{ _wlj_sensibility :: Double
......@@ -77,13 +89,23 @@ data Proximity =
-- , _wlj_elevation :: Double
-}
}
| Hamming
| Hamming { _wlj_sensibility :: Double }
deriving (Show,Generic,Eq)
instance ToSchema Proximity where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
data SynchronyScope = SingleBranch | SiblingBranches | AllBranches
deriving (Show,Generic,Eq, ToSchema)
data SynchronyScope = SingleBranch | SiblingBranches | AllBranches deriving (Show,Generic,Eq)
data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
deriving (Show,Generic,Eq)
instance ToSchema SynchronyStrategy where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
data SynchronyStrategy = MergeRegularGroups | MergeAllGroups deriving (Show,Generic,Eq)
data Synchrony =
ByProximityThreshold
......@@ -96,9 +118,17 @@ data Synchrony =
, _bpd_strategy :: SynchronyStrategy }
deriving (Show,Generic,Eq)
instance ToSchema Synchrony where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
data TimeUnit =
Year
Epoch
{ _epoch_period :: Int
, _epoch_step :: Int
, _epoch_matchingFrame :: Int }
| Year
{ _year_period :: Int
, _year_step :: Int
, _year_matchingFrame :: Int }
......@@ -116,8 +146,17 @@ data TimeUnit =
, _day_matchingFrame :: Int }
deriving (Show,Generic,Eq)
instance ToSchema TimeUnit where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
data CliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
instance ToSchema CliqueFilter where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
data Clique =
Fis
{ _fis_support :: Int
......@@ -128,18 +167,26 @@ data Clique =
, _mcl_filter :: CliqueFilter }
deriving (Show,Generic,Eq)
instance ToSchema Clique where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
data Quality =
Quality { _qua_granularity :: Double
, _qua_minBranch :: Int }
deriving (Show,Generic,Eq)
instance ToSchema Quality where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
data Config =
Config { corpusPath :: FilePath
, listPath :: FilePath
, outputPath :: FilePath
, corpusParser :: CorpusParser
, listParser :: ListParser
, phyloName :: Text
, phyloLevel :: Int
, phyloProximity :: Proximity
......@@ -154,25 +201,28 @@ data Config =
, exportFilter :: [Filter]
} deriving (Show,Generic,Eq)
instance ToSchema Config
defaultConfig :: Config
defaultConfig =
Config { corpusPath = ""
, listPath = ""
, outputPath = ""
, corpusParser = Csv 1000
, phyloName = pack "Default Phylo"
Config { corpusPath = "corpus.csv" -- useful for commandline only
, listPath = "list.csv" -- useful for commandline only
, outputPath = "data/"
, corpusParser = Csv 100000
, listParser = V4
, phyloName = pack "Phylo Name"
, phyloLevel = 2
, phyloProximity = WeightedLogJaccard 10
, phyloProximity = WeightedLogJaccard 0.5
, seaElevation = Constante 0.1 0.1
, findAncestors = True
, phyloSynchrony = ByProximityThreshold 0.1 10 SiblingBranches MergeAllGroups
, phyloQuality = Quality 0 1
, findAncestors = False
, phyloSynchrony = ByProximityThreshold 0.5 0 AllBranches MergeAllGroups
, phyloQuality = Quality 0.5 1
, timeUnit = Year 3 1 5
, clique = MaxClique 0 3 ByNeighbours
, clique = MaxClique 5 0.0001 ByThreshold
, exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
, exportSort = ByHierarchy
, exportFilter = [ByBranchSize 2]
, exportSort = ByHierarchy Desc
, exportFilter = [ByBranchSize 3]
}
instance FromJSON Config
......@@ -181,6 +231,9 @@ instance ToJSON Config
instance FromJSON CorpusParser
instance ToJSON CorpusParser
instance FromJSON ListParser
instance ToJSON ListParser
instance FromJSON Proximity
instance ToJSON Proximity
......@@ -230,6 +283,11 @@ data Software =
, _software_version :: Text
} deriving (Generic, Show, Eq)
instance ToSchema Software where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
defaultSoftware :: Software
defaultSoftware =
Software { _software_name = pack "Gargantext"
......@@ -243,6 +301,11 @@ data PhyloParam =
, _phyloParam_config :: Config
} deriving (Generic, Show, Eq)
instance ToSchema PhyloParam where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
defaultPhyloParam :: PhyloParam
defaultPhyloParam =
PhyloParam { _phyloParam_version = pack "v2.adaptative"
......@@ -283,10 +346,16 @@ data PhyloFoundations = PhyloFoundations
, _foundations_mapList :: TermList
} deriving (Generic, Show, Eq)
instance ToSchema PhyloFoundations where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
data PhyloSources = PhyloSources
{ _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
instance ToSchema PhyloSources where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
---------------------------
-- | Coocurency Matrix | --
......@@ -322,6 +391,9 @@ data Phylo =
}
deriving (Generic, Show, Eq)
instance ToSchema Phylo where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
-- | PhyloPeriodId : the id of a given period
type PhyloPeriodId = (Date,Date)
......@@ -335,6 +407,10 @@ data PhyloPeriod =
, _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
} deriving (Generic, Show, Eq)
instance ToSchema PhyloPeriod where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
-- | Level : a level of clustering
type Level = Int
......@@ -355,6 +431,9 @@ data PhyloLevel =
}
deriving (Generic, Show, Eq)
instance ToSchema PhyloLevel where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
type PhyloGroupId = (PhyloLevelId, Int)
......@@ -381,16 +460,25 @@ data PhyloGroup =
, _phylo_groupPeriodParents :: [Pointer]
, _phylo_groupPeriodChilds :: [Pointer]
, _phylo_groupAncestors :: [Pointer]
, _phylo_groupPeriodMemoryParents :: [Pointer']
, _phylo_groupPeriodMemoryChilds :: [Pointer']
}
deriving (Generic, Show, Eq, NFData)
instance ToSchema PhyloGroup where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
-- | Weight : A generic mesure that can be associated with an Id
type Weight = Double
type Thr = Double
-- | Pointer : A weighted pointer to a given PhyloGroup
type Pointer = (PhyloGroupId, Weight)
-- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
type Pointer' = (PhyloGroupId, (Thr,Weight))
data Filiation = ToParents | ToChilds deriving (Generic, Show)
data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
......@@ -415,15 +503,24 @@ data PhyloClique = PhyloClique
type DotId = TextLazy.Text
data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
data EdgeType = GroupToGroup | GroupToGroupMemory | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
instance ToSchema Filter where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
data Order = Asc | Desc deriving (Show,Generic,Eq, ToSchema)
data Order = Asc | Desc deriving (Show,Generic,Eq)
data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy {_sort_order :: Order } deriving (Show,Generic,Eq)
instance ToSchema Sort where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_sort_")
data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
instance ToSchema Tagger where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
data PhyloLabel =
BranchLabel
......@@ -434,6 +531,10 @@ data PhyloLabel =
, _group_labelSize :: Int }
deriving (Show,Generic,Eq)
instance ToSchema PhyloLabel where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
data PhyloBranch =
PhyloBranch
{ _branch_id :: PhyloBranchId
......@@ -447,11 +548,17 @@ data PhyloBranch =
, _branch_meta :: Map Text [Double]
} deriving (Generic, Show, Eq)
instance ToSchema PhyloBranch where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_branch_")
data PhyloExport =
PhyloExport
{ _export_groups :: [PhyloGroup]
, _export_branches :: [PhyloBranch]
} deriving (Generic, Show)
instance ToSchema PhyloExport where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_export_")
----------------
-- | Lenses | --
......
......@@ -13,7 +13,7 @@ Portability : POSIX
module Gargantext.Core.Viz.Phylo.PhyloTools where
import Control.Lens hiding (Level)
import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, group)
import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, group, notElem)
import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
import Data.Set (Set, disjoint)
import Data.String (String)
......@@ -36,9 +36,9 @@ import qualified Data.Vector as Vector
-- | To print an important message as an IO()
printIOMsg :: String -> IO ()
printIOMsg msg =
printIOMsg msg =
putStrLn ( "\n"
<> "------------"
<> "------------"
<> "\n"
<> "-- | " <> msg <> "\n" )
......@@ -59,13 +59,13 @@ printIOComment cmt =
truncate' :: Double -> Int -> Double
truncate' x n = (fromIntegral $ (floor (x * t) :: Int)) / t
where
where
--------------
t :: Double
t = 10 ^n
getInMap :: Int -> Map Int Double -> Double
getInMap k m =
getInMap k m =
if (member k m)
then m ! k
else 0
......@@ -140,15 +140,15 @@ periodsToYears periods = (Set.fromList . sort . concat)
findBounds :: [Date] -> (Date,Date)
findBounds dates =
findBounds dates =
let dates' = sort dates
in (head' "findBounds" dates', last' "findBounds" dates')
toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
toPeriods dates p s =
toPeriods dates p s =
let (start,end) = findBounds dates
in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
$ chunkAlong p s [start .. end]
......@@ -156,8 +156,8 @@ toFstDate :: [Text] -> Text
toFstDate ds = snd
$ head' "firstDate"
$ sortOn fst
$ map (\d ->
let d' = read (filter (\c -> c /= '-') $ unpack d)::Int
$ map (\d ->
let d' = read (filter (\c -> notElem c ['U','T','C',' ',':','-']) $ unpack d)::Int
in (d',d)) ds
toLstDate :: [Text] -> Text
......@@ -165,46 +165,50 @@ toLstDate ds = snd
$ head' "firstDate"
$ reverse
$ sortOn fst
$ map (\d ->
let d' = read (filter (\c -> c /= '-') $ unpack d)::Int
in (d',d)) ds
$ map (\d ->
let d' = read (filter (\c -> notElem c ['U','T','C',' ',':','-']) $ unpack d)::Int
in (d',d)) ds
getTimeScale :: Phylo -> [Char]
getTimeScale p = case (timeUnit $ getConfig p) of
Epoch _ _ _ -> "epoch"
Year _ _ _ -> "year"
Month _ _ _ -> "month"
Week _ _ _ -> "week"
Day _ _ _ -> "day"
Week _ _ _ -> "week"
Day _ _ _ -> "day"
-- | Get a regular & ascendante timeScale from a given list of dates
toTimeScale :: [Date] -> Int -> [Date]
toTimeScale dates step =
toTimeScale dates step =
let (start,end) = findBounds dates
in [start, (start + step) .. end]
getTimeStep :: TimeUnit -> Int
getTimeStep time = case time of
getTimeStep time = case time of
Epoch _ s _ -> s
Year _ s _ -> s
Month _ s _ -> s
Week _ s _ -> s
Day _ s _ -> s
Month _ s _ -> s
Week _ s _ -> s
Day _ s _ -> s
getTimePeriod :: TimeUnit -> Int
getTimePeriod time = case time of
getTimePeriod time = case time of
Epoch p _ _ -> p
Year p _ _ -> p
Month p _ _ -> p
Week p _ _ -> p
Day p _ _ -> p
Month p _ _ -> p
Week p _ _ -> p
Day p _ _ -> p
getTimeFrame :: TimeUnit -> Int
getTimeFrame time = case time of
getTimeFrame time = case time of
Epoch _ _ f -> f
Year _ _ f -> f
Month _ _ f -> f
Week _ _ f -> f
Day _ _ f -> f
Day _ _ f -> f
-------------
-- | Fis | --
......@@ -217,7 +221,7 @@ isNested l l'
| null l' = True
| length l' > length l = False
| (union l l') == l = True
| otherwise = False
| otherwise = False
-- | To filter Fis with small Support but by keeping non empty Periods
......@@ -233,7 +237,7 @@ traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>
--------------------------------------
cliques :: [Double]
cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
--------------------------------------
--------------------------------------
traceSupport :: Map (Date, Date) [PhyloClique] -> String
......@@ -242,7 +246,7 @@ traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> "
--------------------------------------
supports :: [Double]
supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
--------------------------------------
--------------------------------------
traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
......@@ -257,12 +261,12 @@ traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map l
getCliqueSupport :: Clique -> Int
getCliqueSupport unit = case unit of
getCliqueSupport unit = case unit of
Fis s _ -> s
MaxClique _ _ _ -> 0
getCliqueSize :: Clique -> Int
getCliqueSize unit = case unit of
getCliqueSize unit = case unit of
Fis _ s -> s
MaxClique s _ _ -> s
......@@ -292,7 +296,7 @@ listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y
sumCooc :: Cooc -> Cooc -> Cooc
sumCooc cooc cooc' = unionWith (+) cooc cooc'
getTrace :: Cooc -> Double
getTrace :: Cooc -> Double
getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
coocToDiago :: Cooc -> Cooc
......@@ -310,7 +314,7 @@ ngramsToCooc ngrams coocs =
-- | PhyloGroup | --
--------------------
getGroupId :: PhyloGroup -> PhyloGroupId
getGroupId :: PhyloGroup -> PhyloGroupId
getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
idToPrd :: PhyloGroupId -> PhyloPeriodId
......@@ -320,38 +324,58 @@ groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup
groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
getPeriodPointers fil g =
case fil of
getPeriodPointers fil g =
case fil of
ToChilds -> g ^. phylo_groupPeriodChilds
ToParents -> g ^. phylo_groupPeriodParents
ToChildsMemory -> undefined
ToParentsMemory -> undefined
filterProximity :: Proximity -> Double -> Double -> Bool
filterProximity proximity thr local =
filterProximity proximity thr local =
case proximity of
WeightedLogJaccard _ -> local >= thr
WeightedLogSim _ -> local >= thr
Hamming -> undefined
WeightedLogSim _ -> local >= thr
Hamming _ -> undefined
getProximityName :: Proximity -> String
getProximityName proximity =
case proximity of
WeightedLogJaccard _ -> "WLJaccard"
WeightedLogSim _ -> "WeightedLogSim"
Hamming -> "Hamming"
WeightedLogSim _ -> "WeightedLogSim"
Hamming _ -> "Hamming"
---------------
-- | Phylo | --
---------------
addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
addPointers fil pty pointers g =
case pty of
TemporalPointer -> case fil of
addPointers fil pty pointers g =
case pty of
TemporalPointer -> case fil of
ToChilds -> g & phylo_groupPeriodChilds .~ pointers
ToParents -> g & phylo_groupPeriodParents .~ pointers
LevelPointer -> case fil of
ToChildsMemory -> undefined
ToParentsMemory -> undefined
LevelPointer -> case fil of
ToChilds -> g & phylo_groupLevelChilds .~ pointers
ToParents -> g & phylo_groupLevelParents .~ pointers
ToChildsMemory -> undefined
ToParentsMemory -> undefined
toPointer' :: Double -> Pointer -> Pointer'
toPointer' thr pt = (fst pt,(thr,snd pt))
addMemoryPointers :: Filiation -> PointerType -> Double -> [Pointer] -> PhyloGroup -> PhyloGroup
addMemoryPointers fil pty thr pointers g =
case pty of
TemporalPointer -> case fil of
ToChilds -> undefined
ToParents -> undefined
ToChildsMemory -> g & phylo_groupPeriodMemoryChilds .~ (concat [(g ^. phylo_groupPeriodMemoryChilds),(map (\pt -> toPointer' thr pt) pointers)])
ToParentsMemory -> g & phylo_groupPeriodMemoryParents .~ (concat [(g ^. phylo_groupPeriodMemoryParents),(map (\pt -> toPointer' thr pt) pointers)])
LevelPointer -> undefined
getPeriodIds :: Phylo -> [(Date,Date)]
......@@ -359,14 +383,14 @@ getPeriodIds phylo = sortOn fst
$ keys
$ phylo ^. phylo_periods
getLevelParentId :: PhyloGroup -> PhyloGroupId
getLevelParentId :: PhyloGroup -> PhyloGroupId
getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
getLastLevel :: Phylo -> Level
getLastLevel phylo = last' "lastLevel" $ getLevels phylo
getLevels :: Phylo -> [Level]
getLevels phylo = nub
getLevels phylo = nub
$ map snd
$ keys $ view ( phylo_periods
. traverse
......@@ -381,10 +405,10 @@ getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
setConfig :: Config -> Phylo -> Phylo
setConfig config phylo = phylo
& phylo_param .~ (PhyloParam
((phylo ^. phylo_param) ^. phyloParam_version)
((phylo ^. phylo_param) ^. phyloParam_software)
setConfig config phylo = phylo
& phylo_param .~ (PhyloParam
((phylo ^. phylo_param) ^. phyloParam_version)
((phylo ^. phylo_param) ^. phyloParam_software)
config)
-- & phylo_param & phyloParam_config & phyloParam_config .~ config
......@@ -397,13 +421,13 @@ getSources :: Phylo -> Vector Text
getSources phylo = _sources (phylo ^. phylo_sources)
phyloToLastBranches :: Phylo -> [[PhyloGroup]]
phyloToLastBranches phylo = elems
phyloToLastBranches phylo = elems
$ fromListWith (++)
$ map (\g -> (g ^. phylo_groupBranchId, [g]))
$ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
getGroupsFromLevel lvl phylo =
getGroupsFromLevel lvl phylo =
elems $ view ( phylo_periods
. traverse
. phylo_periodLevels
......@@ -413,18 +437,18 @@ getGroupsFromLevel lvl phylo =
getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
getGroupsFromLevelPeriods lvl periods phylo =
getGroupsFromLevelPeriods lvl periods phylo =
elems $ view ( phylo_periods
. traverse
. filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
. phylo_periodLevels
. traverse
. filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
. phylo_levelGroups ) phylo
. phylo_levelGroups ) phylo
getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
getGroupsFromPeriods lvl periods =
getGroupsFromPeriods lvl periods =
elems $ view ( traverse
. phylo_periodLevels
. traverse
......@@ -433,25 +457,25 @@ getGroupsFromPeriods lvl periods =
updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
updatePhyloGroups lvl m phylo =
updatePhyloGroups lvl m phylo =
over ( phylo_periods
. traverse
. phylo_periodLevels
. traverse
. filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
. phylo_levelGroups
. traverse
) (\g ->
. traverse
) (\g ->
let id = getGroupId g
in
if member id m
in
if member id m
then m ! id
else g ) phylo
updatePeriods :: Map (Date,Date) (Text,Text) -> Phylo -> Phylo
updatePeriods periods' phylo =
over (phylo_periods . traverse)
(\prd ->
updatePeriods periods' phylo =
over (phylo_periods . traverse)
(\prd ->
let prd' = periods' ! (prd ^. phylo_periodPeriod)
lvls = map (\lvl -> lvl & phylo_levelPeriod' .~ prd') $ prd ^. phylo_periodLevels
in prd & phylo_periodPeriod' .~ prd'
......@@ -460,10 +484,10 @@ updatePeriods periods' phylo =
traceToPhylo :: Level -> Phylo -> Phylo
traceToPhylo lvl phylo =
traceToPhylo lvl phylo =
trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
<> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
<> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
<> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
--------------------
-- | Clustering | --
......@@ -474,28 +498,28 @@ mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
where
-- | 2) find the most Up Left ids in the hierarchy of similarity
-- mostUpLeft :: [[Int]] -> [[Int]]
-- mostUpLeft ids' =
-- mostUpLeft ids' =
-- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
-- inf = (fst . minimum) groupIds
-- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
-- | 1) find the most frequent ids
mostFreq' :: [[Int]] -> [[Int]]
mostFreq' ids' =
mostFreq' ids' =
let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
sup = (fst . maximum) groupIds
in map snd $ filter (\gIds -> fst gIds == sup) groupIds
mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
mergeMeta bId groups =
let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
mergeMeta bId groups =
let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
groupsToBranches groups =
{- run the related component algorithm -}
let egos = map (\g -> [getGroupId g]
let egos = map (\g -> [getGroupId g]
++ (map fst $ g ^. phylo_groupPeriodParents)
++ (map fst $ g ^. phylo_groupPeriodChilds)
++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
......@@ -510,30 +534,30 @@ relatedComponents :: Ord a => [[a]] -> [[a]]
relatedComponents graph = foldl' (\acc groups ->
if (null acc)
then acc ++ [groups]
else
else
let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
toRelatedComponents nodes edges =
toRelatedComponents nodes edges =
let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
traceSynchronyEnd :: Phylo -> Phylo
traceSynchronyEnd phylo =
trace ( "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
traceSynchronyEnd phylo =
trace ( "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
<> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
<> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
<> "\n" ) phylo
traceSynchronyStart :: Phylo -> Phylo
traceSynchronyStart phylo =
trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
traceSynchronyStart phylo =
trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
<> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
<> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
<> "\n" ) phylo
<> "\n" ) phylo
-------------------
......@@ -541,10 +565,10 @@ traceSynchronyStart phylo =
-------------------
getSensibility :: Proximity -> Double
getSensibility proxi = case proxi of
getSensibility proxi = case proxi of
WeightedLogJaccard s -> s
WeightedLogSim s -> s
Hamming -> undefined
WeightedLogSim s -> s
Hamming _ -> undefined
----------------
-- | Branch | --
......@@ -599,7 +623,7 @@ traceMatchLimit branches =
<> ",(1.." <> show (length branches) <> ")]"
<> " | " <> show (length $ concat branches) <> " groups" <> "\n"
<> " - unable to increase the threshold above 1" <> "\n"
) branches
) branches
traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
......@@ -609,10 +633,10 @@ traceMatchEnd groups =
traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
traceTemporalMatching groups =
traceTemporalMatching groups =
trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
traceGroupsProxi m =
traceGroupsProxi m =
trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment