Commit 6bddaf45 authored by qlobbe's avatar qlobbe

add source and fixe date

parent d41e40d9
Pipeline #1445 failed with stage
......@@ -19,13 +19,14 @@ module Main where
import Data.Aeson
import Data.List (concat, nub, isSuffixOf)
import Data.String (String)
import Data.Text (Text, unwords, unpack, replace)
import Data.Text (Text, unwords, unpack, replace, pack)
import Crypto.Hash.SHA256 (hash)
import Gargantext.Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_w_title, csv_w_abstract, csv_w_publication_year, csv_w_weight)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
......@@ -36,11 +37,13 @@ import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
-- import Gargantext.API.Ngrams.Prelude (toTermList)
import GHC.IO (FilePath)
import Prelude (Either(Left, Right))
import Prelude (Either(Left, Right),toInteger)
import System.Environment
import System.Directory (listDirectory,doesFileExist)
import Control.Concurrent.Async (mapConcurrently)
import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
......@@ -60,9 +63,36 @@ data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
getFilesFromPath :: FilePath -> IO([FilePath])
getFilesFromPath path = do
if (isSuffixOf "/" path)
then (listDirectory path)
then (listDirectory path)
else return [path]
---------------
-- | Dates | --
---------------
toMonths :: Integer -> Int -> Int -> Date
toMonths y m d = fromIntegral $ cdMonths
$ diffGregorianDurationClip (fromGregorian y m d) (fromGregorian 0000 0 0)
toDays :: Integer -> Int -> Int -> Date
toDays y m d = fromIntegral
$ diffDays (fromGregorian y m d) (fromGregorian 0000 0 0)
toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
toPhyloDate y m d tu = case tu of
Year _ _ _ -> y
Month _ _ _ -> toMonths (toInteger y) m d
Week _ _ _ -> div (toDays (toInteger y) m d) 7
Day _ _ _ -> toDays (toInteger y) m d
toPhyloDate' :: Int -> Int -> Int -> Text
toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (toInteger y) m d
--------------
-- | Json | --
--------------
......@@ -83,16 +113,23 @@ termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList p
-- | To transform a Wos file (or [file]) into a list of Docs
wosToDocs :: Int -> Patterns -> FilePath -> IO ([Document])
wosToDocs limit patterns path = do
wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO ([Document])
wosToDocs limit patterns time path = do
files <- getFilesFromPath path
take limit
<$> map (\d -> let date' = fromJust $ _hd_publication_year d
title = fromJust $ _hd_title d
<$> map (\d -> let title = fromJust $ _hd_title d
abstr = if (isJust $ _hd_abstract d)
then fromJust $ _hd_abstract d
else ""
in Document date' (termsInText patterns $ title <> " " <> abstr) Nothing)
in Document (toPhyloDate
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d) time)
(toPhyloDate'
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d))
(termsInText patterns $ title <> " " <> abstr) Nothing [])
<$> concat
<$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d)
......@@ -101,32 +138,36 @@ wosToDocs limit patterns path = do
-- To transform a Csv file into a list of Document
csvToDocs :: CorpusParser -> Patterns -> FilePath -> IO ([Document])
csvToDocs parser patterns path =
csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO ([Document])
csvToDocs parser patterns time path =
case parser of
Wos _ -> undefined
Csv limit -> Vector.toList
Wos _ -> undefined
Csv limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (csv_publication_year row)
<$> Vector.map (\row -> Document (toPhyloDate (csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
(toPhyloDate' (csv_publication_year row) (csv_publication_month row) (csv_publication_day row))
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing
[]
) <$> snd <$> Csv.readFile path
CsvWeighted limit -> Vector.toList
Csv' limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (csv_w_publication_year row)
(termsInText patterns $ (csv_w_title row) <> " " <> (csv_w_abstract row))
(Just $ csv_w_weight row)
<$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
(toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row))
(termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
(Just $ csv'_weight row)
[csv'_source row]
) <$> snd <$> Csv.readWeightedCsv path
-- To parse a file into a list of Document
fileToDocs' :: CorpusParser -> FilePath -> TermList -> IO [Document]
fileToDocs' parser path lst = do
fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
fileToDocs' parser path time lst = do
let patterns = buildPatterns lst
case parser of
Wos limit -> wosToDocs limit patterns path
Csv _ -> csvToDocs parser patterns path
CsvWeighted _ -> csvToDocs parser patterns path
Wos limit -> wosToDocs limit patterns time path
Csv _ -> csvToDocs parser patterns time path
Csv' _ -> csvToDocs parser patterns time path
---------------
......@@ -137,7 +178,10 @@ fileToDocs' parser path lst = do
-- Config time parameters to label
timeToLabel :: Config -> [Char]
timeToLabel config = case (timeUnit config) of
Year p s f -> ("time"<> "_"<> (show p) <> "_" <> (show s) <> "_" <> (show f))
Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Month p s f -> ("time_months"<> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
seaToLabel :: Config -> [Char]
......@@ -240,7 +284,7 @@ main = do
printIOMsg "Parse the corpus"
mapList <- csvMapTermList (listPath config)
corpus <- fileToDocs' (corpusParser config) (corpusPath config) mapList
corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOMsg "Reconstruct the phylo"
......
......@@ -330,7 +330,8 @@ executables:
- optparse-generic
- split
- unordered-containers
- cryptohash
- cryptohash
- time
gargantext-import:
main: Main.hs
......
......@@ -398,28 +398,28 @@ parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
data WeightedCsv = WeightedCsv
{ csv_w_title :: !Text
, csv_w_source :: !Text
, csv_w_publication_year :: !Int
, csv_w_publication_month :: !Int
, csv_w_publication_day :: !Int
, csv_w_abstract :: !Text
, csv_w_authors :: !Text
, csv_w_weight :: !Double } deriving (Show)
instance FromNamedRecord WeightedCsv where
parseNamedRecord r = WeightedCsv <$> r .: "title"
<*> r .: "source"
<*> r .: "publication_year"
<*> r .: "publication_month"
<*> r .: "publication_day"
<*> r .: "abstract"
<*> r .: "authors"
<*> r .: "weight"
readWeightedCsv :: FilePath -> IO (Header, Vector WeightedCsv)
data Csv' = Csv'
{ csv'_title :: !Text
, csv'_source :: !Text
, csv'_publication_year :: !Int
, csv'_publication_month :: !Int
, csv'_publication_day :: !Int
, csv'_abstract :: !Text
, csv'_authors :: !Text
, csv'_weight :: !Double } deriving (Show)
instance FromNamedRecord Csv' where
parseNamedRecord r = Csv' <$> r .: "title"
<*> r .: "source"
<*> r .: "publication_year"
<*> r .: "publication_month"
<*> r .: "publication_day"
<*> r .: "abstract"
<*> r .: "authors"
<*> r .: "weight"
readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
readWeightedCsv fp =
fmap (\bs ->
case decodeByNameWith csvDecodeOptions bs of
......
......@@ -128,17 +128,17 @@ fisWithSizePolyMap n f is =
isSublistOf :: Ord a => [a] -> [a] -> Bool
isSublistOf sub lst = all (\i -> elem i lst) sub
reIndexFis :: Ord a => [([a],b)] -> [Fis' a] -> [(Fis' a,[b])]
reIndexFis :: Ord a => [([a],(b,c))] -> [Fis' a] -> [(Fis' a,([b],[c]))]
reIndexFis items fis = map (\f ->
let docs = filter (\(lst,_) -> isSublistOf (_fisItemSet f) lst) items
in (f, map snd docs)) fis
in (f, (map (fst . snd) docs,map (snd . snd) docs))) fis
wsum :: [Maybe Double] -> Maybe Double
wsum lst = fmap sum $ sequence lst
fisWithSizePolyMap' :: Ord a => Size -> Frequency -> [([a],Maybe Double)] -> Map (Set a) (Int, Maybe Double)
fisWithSizePolyMap' :: Ord a => Size -> Frequency -> [([a], (Maybe Double,[Int]))] -> Map (Set a) (Int, (Maybe Double,[Int]))
fisWithSizePolyMap' n f is = Map.fromList
$ map (\(fis,ws) -> (Set.fromList (_fisItemSet fis),(_fisCount fis,(wsum ws))))
$ map (\(fis,(ws,sources)) -> (Set.fromList (_fisItemSet fis),(_fisCount fis,(wsum ws,concat sources))))
$ reIndexFis is
$ fisWithSizePoly2 n f (map fst is)
......
......@@ -50,9 +50,9 @@ import qualified Data.Text.Lazy as TextLazy
data CorpusParser =
Wos {_wos_limit :: Int}
| Csv {_csv_limit :: Int}
| CsvWeighted {_csvw_limit :: Int}
Wos {_wos_limit :: Int}
| Csv {_csv_limit :: Int}
| Csv' {_csv'_limit :: Int}
deriving (Show,Generic,Eq)
data SeaElevation =
......@@ -107,6 +107,18 @@ data TimeUnit =
{ _year_period :: Int
, _year_step :: Int
, _year_matchingFrame :: Int }
| Month
{ _month_period :: Int
, _month_step :: Int
, _month_matchingFrame :: Int }
| Week
{ _week_period :: Int
, _week_step :: Int
, _week_matchingFrame :: Int }
| Day
{ _day_period :: Int
, _day_step :: Int
, _day_matchingFrame :: Int }
deriving (Show,Generic,Eq)
data CliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
......@@ -238,11 +250,14 @@ type Date = Int
-- | Ngrams : a contiguous sequence of n terms
type Ngrams = Text
-- | Document : a piece of Text linked to a Date
-- Document : a piece of Text linked to a Date
-- date = computational date; date' = original string date yyyy-mm-dd
data Document = Document
{ date :: Date
, text :: [Ngrams]
, weight :: Maybe Double
{ date :: Date
, date' :: Text
, text :: [Ngrams]
, weight :: Maybe Double
, sources :: [Text]
} deriving (Eq,Show,Generic,NFData)
......@@ -258,6 +273,10 @@ data PhyloFoundations = PhyloFoundations
} deriving (Generic, Show, Eq)
data PhyloSources = PhyloSources
{ _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
---------------------------
-- | Coocurency Matrix | --
---------------------------
......@@ -280,6 +299,7 @@ type Cooc = Map (Int,Int) Double
-- periods : the temporal steps of a phylomemy
data Phylo =
Phylo { _phylo_foundations :: PhyloFoundations
, _phylo_sources :: PhyloSources
, _phylo_timeCooc :: !(Map Date Cooc)
, _phylo_timeDocs :: !(Map Date Double)
, _phylo_termFreq :: !(Map Int Double)
......@@ -299,8 +319,9 @@ type PhyloPeriodId = (Date,Date)
-- id: tuple (start date, end date) of the temporal step of the phylomemy
-- levels: levels of granularity
data PhyloPeriod =
PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
, _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
PhyloPeriod { _phylo_periodPeriod :: (Date,Date)
, _phylo_periodPeriod' :: (Text,Text)
, _phylo_periodLevels :: Map PhyloLevelId PhyloLevel
} deriving (Generic, Show, Eq)
......@@ -316,9 +337,10 @@ type PhyloLevelId = (PhyloPeriodId,Level)
-- Level 1: First level of clustering (the Fis)
-- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
data PhyloLevel =
PhyloLevel { _phylo_levelPeriod :: (Date,Date)
, _phylo_levelLevel :: Level
, _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
PhyloLevel { _phylo_levelPeriod :: (Date,Date)
, _phylo_levelPeriod' :: (Text,Text)
, _phylo_levelLevel :: Level
, _phylo_levelGroups :: Map PhyloGroupId PhyloGroup
}
deriving (Generic, Show, Eq)
......@@ -332,11 +354,13 @@ type PhyloBranchId = (Level, [Int])
-- | PhyloGroup : group of ngrams at each level and period
data PhyloGroup =
PhyloGroup { _phylo_groupPeriod :: (Date,Date)
, _phylo_groupPeriod' :: (Text,Text)
, _phylo_groupLevel :: Level
, _phylo_groupIndex :: Int
, _phylo_groupIndex :: Int
, _phylo_groupLabel :: Text
, _phylo_groupSupport :: Support
, _phylo_groupWeight :: Maybe Double
, _phylo_groupSources :: [Int]
, _phylo_groupNgrams :: [Int]
, _phylo_groupCooc :: !(Cooc)
, _phylo_groupBranchId :: PhyloBranchId
......@@ -371,6 +395,7 @@ data PhyloClique = PhyloClique
, _phyloClique_support :: Support
, _phyloClique_period :: (Date,Date)
, _phyloClique_weight :: Maybe Double
, _phyloClique_sources :: [Int]
} deriving (Generic,NFData,Show,Eq)
----------------
......@@ -444,6 +469,8 @@ makeLenses ''PhyloBranch
instance FromJSON Phylo
instance ToJSON Phylo
instance FromJSON PhyloSources
instance ToJSON PhyloSources
instance FromJSON PhyloParam
instance ToJSON PhyloParam
instance FromJSON PhyloPeriod
......
......@@ -109,9 +109,11 @@ config =
docs :: [Document]
docs = map (\(d,t)
-> Document d
-> Document d
""
(filter (\n -> isRoots n (foundations ^. foundations_roots)) $ monoTexts t)
Nothing
[]
) corpus
......
......@@ -120,11 +120,13 @@ branchToDotNode b bId =
, toAttr "label" (pack $ show $ b ^. branch_label)
])
periodToDotNode :: (Date,Date) -> Dot DotId
periodToDotNode prd =
periodToDotNode :: (Date,Date) -> (Text.Text,Text.Text) -> Dot DotId
periodToDotNode prd prd' =
node (periodIdToDotId prd)
([Shape BoxShape, FontSize 50, Label (toDotLabel $ Text.pack (show (fst prd) <> " " <> show (snd prd)))]
<> [ toAttr "nodeType" "period"
<> [ toAttr "nodeType" "period"
, toAttr "strFrom" (fromStrict $ Text.pack $ (show $ fst prd'))
, toAttr "strTo" (fromStrict $ Text.pack $ (show $ snd prd'))
, toAttr "from" (fromStrict $ Text.pack $ (show $ fst prd))
, toAttr "to" (fromStrict $ Text.pack $ (show $ snd prd))])
......@@ -136,10 +138,13 @@ groupToDotNode fdt g bId =
<> [ toAttr "nodeType" "group"
, toAttr "from" (pack $ show (fst $ g ^. phylo_groupPeriod))
, toAttr "to" (pack $ show (snd $ g ^. phylo_groupPeriod))
, toAttr "strFrom" (pack $ show (fst $ g ^. phylo_groupPeriod'))
, toAttr "strTo" (pack $ show (snd $ g ^. phylo_groupPeriod'))
, toAttr "branchId" (pack $ unwords (init $ map show $ snd $ g ^. phylo_groupBranchId))
, toAttr "bId" (pack $ show bId)
, toAttr "support" (pack $ show (g ^. phylo_groupSupport))
, toAttr "weight" (pack $ show (g ^. phylo_groupWeight))
, toAttr "source" (pack $ show (nub $ g ^. phylo_groupSources))
, toAttr "lbl" (pack $ show (ngramsToLabel fdt (g ^. phylo_groupNgrams)))
, toAttr "foundation" (pack $ show (idxToLabel (g ^. phylo_groupNgrams)))
, toAttr "role" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "dynamics")))
......@@ -194,6 +199,8 @@ exportToDot phylo export =
,(toAttr (fromStrict "phyloPeriods") $ pack $ show (length $ elems $ phylo ^. phylo_periods))
,(toAttr (fromStrict "phyloBranches") $ pack $ show (length $ export ^. export_branches))
,(toAttr (fromStrict "phyloGroups") $ pack $ show (length $ export ^. export_groups))
,(toAttr (fromStrict "phyloSources") $ pack $ show (Vector.toList $ getSources phylo))
,(toAttr (fromStrict "phyloTimeScale") $ pack $ getTimeScale phylo)
-- ,(toAttr (fromStrict "phyloTermsFreq") $ pack $ show (toList $ _phylo_lastTermFreq phylo))
])
......@@ -218,13 +225,13 @@ exportToDot phylo export =
{-- 5) create a layer for each period -}
_ <- mapM (\period ->
subgraph ((Str . fromStrict . Text.pack) $ ("Period" <> show (fst period) <> show (snd period))) $ do
subgraph ((Str . fromStrict . Text.pack) $ ("Period" <> show (fst $ _phylo_periodPeriod period) <> show (snd $ _phylo_periodPeriod period))) $ do
graphAttrs [Rank SameRank]
periodToDotNode period
periodToDotNode (period ^. phylo_periodPeriod) (period ^. phylo_periodPeriod')
{-- 6) create a node for each group -}
mapM (\g -> groupToDotNode (getRoots phylo) g (toBid g (export ^. export_branches))) (filter (\g -> g ^. phylo_groupPeriod == period) $ export ^. export_groups)
) $ getPeriodIds phylo
mapM (\g -> groupToDotNode (getRoots phylo) g (toBid g (export ^. export_branches))) (filter (\g -> g ^. phylo_groupPeriod == (period ^. phylo_periodPeriod)) $ export ^. export_groups)
) $ phylo ^. phylo_periods
{-- 7) create the edges between a branch and its first groups -}
_ <- mapM (\(bId,groups) ->
......
......@@ -14,6 +14,7 @@ module Gargantext.Core.Viz.Phylo.PhyloMaker where
import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
import Data.Vector (Vector)
import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Core.Viz.AdaptativePhylo
......@@ -104,7 +105,7 @@ toGroupsProxi lvl phylo =
in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
appendGroups :: (a -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
$ over ( phylo_periods
. traverse
......@@ -112,12 +113,13 @@ appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ co
. traverse)
(\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
then
let pId = phyloLvl ^. phylo_levelPeriod
let pId = phyloLvl ^. phylo_levelPeriod
pId' = phyloLvl ^. phylo_levelPeriod'
phyloCUnit = m ! pId
in phyloLvl
& phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
groups ++ [ (((pId,lvl),length groups)
, f obj pId lvl (length groups)
, f obj pId pId' lvl (length groups)
(elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
] ) [] phyloCUnit)
else
......@@ -125,10 +127,11 @@ appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ co
phylo
cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
cliqueToGroup :: PhyloClique -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup
cliqueToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
(fis ^. phyloClique_support)
(fis ^. phyloClique_weight)
(fis ^. phyloClique_sources)
(fis ^. phyloClique_nodes)
(ngramsToCooc (fis ^. phyloClique_nodes) coocs)
(1,[0]) -- branchid (lvl,[path in the branching tree])
......@@ -143,14 +146,27 @@ toPhylo1 phyloStep = case (getSeaElevation phyloStep) of
-----------------------
-- | To Phylo Step | --
-----------------------
-----------------------
indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
indexDates' m = map (\docs ->
let ds = map (\d -> date' d) docs
f = if (null ds)
then ""
else toFstDate ds
l = if (null ds)
then ""
else toLstDate ds
in (f,l)) m
-- To build the first phylo step from docs and terms
toPhyloStep :: [Document] -> TermList -> Config -> Phylo
toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of
Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique phyloBase
Adaptative _ -> toGroupsProxi 1 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
Adaptative _ -> toGroupsProxi 1
$ appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
where
--------------------------------------
phyloClique :: Map (Date,Date) [PhyloClique]
......@@ -219,12 +235,12 @@ toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
Fis _ _ ->
let fis = map (\(prd,docs) ->
case (corpusParser $ getConfig phylo) of
CsvWeighted _ -> let lst = toList
$ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), weight d)) docs)
in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((snd . snd) f)) lst)
Csv' _ -> let lst = toList
$ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
_ -> let lst = toList
$ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd Nothing) lst)
in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd Nothing []) lst)
)
$ toList phyloDocs
fis' = fis `using` parList rdeepseq
......@@ -235,7 +251,7 @@ toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
$ foldl sumCooc empty
$ map listToMatrix
$ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
in (prd, map (\cl -> PhyloClique cl 0 prd Nothing) $ getMaxCliques filterType Conditional thr cooc))
in (prd, map (\cl -> PhyloClique cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
$ toList phyloDocs
mcl' = mcl `using` parList rdeepseq
in fromList mcl'
......@@ -342,17 +358,20 @@ docsToTimeScaleNb docs =
initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
initPhyloLevels lvlMax pId =
fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId ("","") lvl empty)) [1..lvlMax]
-- To init the basic elements of a Phylo
toPhyloBase :: [Document] -> TermList -> Config -> Phylo
toPhyloBase docs lst conf =
let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
params = defaultPhyloParam { _phyloParam_config = conf }
periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
$ Phylo foundations
docsSources
(docsToTimeScaleCooc docs (foundations ^. foundations_roots))
(docsToTimeScaleNb docs)
(docsToTermFreq docs (foundations ^. foundations_roots))
......@@ -360,4 +379,4 @@ toPhyloBase docs lst conf =
empty
empty
params
(fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)
(fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloLevels 1 prd))) periods)
......@@ -17,9 +17,9 @@ import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tai
import Data.Set (Set, disjoint)
import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
import Data.String (String)
import Data.Text (Text)
import Data.Text (Text,unpack)
import Prelude (floor)
import Prelude (floor,read)
import Gargantext.Prelude
import Gargantext.Core.Viz.AdaptativePhylo
......@@ -115,6 +115,10 @@ isRoots n ns = Vector.elem n ns
ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
-- | To transform a list of sources into a list of sources' index
sourcesToIdx :: [Text] -> Vector Text -> [Int]
sourcesToIdx ss ps = nub $ map (\s -> fromJust $ elemIndex s ps) ss
-- | To transform a list of Ngrams Indexes into a Label
ngramsToLabel :: Vector Ngrams -> [Int] -> Text
ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
......@@ -153,6 +157,32 @@ toPeriods dates p s =
$ chunkAlong p s [start .. end]
toFstDate :: [Text] -> Text
toFstDate ds = snd
$ head' "firstDate"
$ sortOn fst
$ map (\d ->
let d' = read (filter (\c -> c /= '-') $ unpack d)::Int
in (d',d)) ds
toLstDate :: [Text] -> Text
toLstDate ds = snd
$ head' "firstDate"
$ reverse
$ sortOn fst
$ map (\d ->
let d' = read (filter (\c -> c /= '-') $ unpack d)::Int
in (d',d)) ds
getTimeScale :: Phylo -> [Char]
getTimeScale p = case (timeUnit $ getConfig p) of
Year _ _ _ -> "year"
Month _ _ _ -> "month"
Week _ _ _ -> "week"
Day _ _ _ -> "day"
-- | Get a regular & ascendante timeScale from a given list of dates
toTimeScale :: [Date] -> Int -> [Date]
toTimeScale dates step =
......@@ -162,15 +192,24 @@ toTimeScale dates step =
getTimeStep :: TimeUnit -> Int
getTimeStep time = case time of
Year _ s _ -> s
Year _ s _ -> s
Month _ s _ -> s
Week _ s _ -> s
Day _ s _ -> s
getTimePeriod :: TimeUnit -> Int
getTimePeriod time = case time of
Year p _ _ -> p
Year p _ _ -> p
Month p _ _ -> p
Week p _ _ -> p
Day p _ _ -> p
getTimeFrame :: TimeUnit -> Int
getTimeFrame time = case time of
Year _ _ f -> f
Year _ _ f -> f
Month _ _ f -> f
Week _ _ f -> f
Day _ _ f -> f
-------------
-- | Fis | --
......@@ -359,6 +398,9 @@ setConfig config phylo = phylo
getRoots :: Phylo -> Vector Ngrams
getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
getSources :: Phylo -> Vector Text
getSources phylo = _sources (phylo ^. phylo_sources)
phyloToLastBranches :: Phylo -> [[PhyloGroup]]
phyloToLastBranches phylo = elems
$ fromListWith (++)
......@@ -411,6 +453,16 @@ updatePhyloGroups lvl m phylo =
then m ! id
else g ) phylo
updatePeriods :: Map (Date,Date) (Text,Text) -> Phylo -> Phylo
updatePeriods periods' phylo =
over (phylo_periods . traverse)
(\prd ->
let prd' = periods' ! (prd ^. phylo_periodPeriod)
lvls = map (\lvl -> lvl & phylo_levelPeriod' .~ prd') $ prd ^. phylo_periodLevels
in prd & phylo_periodPeriod' .~ prd'
& phylo_periodLevels .~ lvls
) phylo
traceToPhylo :: Level -> Phylo -> Phylo
traceToPhylo lvl phylo =
......
......@@ -36,10 +36,12 @@ import qualified Data.Map as Map
mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
mergeGroups coocs id mapIds childs =
let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id) ""
in PhyloGroup (fst $ fst id) (_phylo_groupPeriod' $ head' "mergeGroups" childs)
(snd $ fst id) (snd id) ""
(sum $ map _phylo_groupSupport childs)
(fmap sum $ sequence
$ map _phylo_groupWeight childs)
$ map _phylo_groupWeight childs)
(concat $ map _phylo_groupSources childs)
ngrams
(ngramsToCooc ngrams coocs)
((snd $ fst id),bId)
......@@ -58,12 +60,12 @@ mergeGroups coocs id mapIds childs =
mergeAncestors :: [Pointer] -> [Pointer]
mergeAncestors pointers = Map.toList $ fromListWith max pointers
addPhyloLevel :: Level -> Phylo -> Phylo
addPhyloLevel lvl phylo =
over ( phylo_periods . traverse )
(\phyloPrd -> phyloPrd & phylo_periodLevels
%~ (insert (phyloPrd ^. phylo_periodPeriod, lvl) (PhyloLevel (phyloPrd ^. phylo_periodPeriod) lvl empty))) phylo
%~ (insert (phyloPrd ^. phylo_periodPeriod, lvl)
(PhyloLevel (phyloPrd ^. phylo_periodPeriod) (phyloPrd ^. phylo_periodPeriod') lvl empty))) phylo
toNextLevel' :: Phylo -> [PhyloGroup] -> Phylo
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment