[phylo] work on adding day/hour/minute/second

Too many refactorings along the way unfortunately
parent cd0aa9ca
Pipeline #7809 failed with stages
in 18 minutes and 20 seconds
......@@ -168,6 +168,7 @@ source-repository-package
allow-newer: MissingH:base
, *:base
, *:unordered-containers
, crawlerHAL:*
, epo-api-client:http-client-tls
, openalex:http-client-tls
......
......@@ -646,7 +646,7 @@ library
, tree-diff
, tuple ^>= 0.3.0.2
, unicode-collation >= 0.1.3.5
, unordered-containers ^>= 0.2.16.0
, unordered-containers ^>= 0.2.20
-- needed for Worker / System.Posix.Signals
, unix >= 2.7.3 && < 2.9
, uri-encode ^>= 1.5.0.7
......@@ -798,7 +798,7 @@ common commonTestDependencies
, unicode-collation >= 0.1.3.5
, unix >= 2.7.3 && < 2.9
, unliftio
, unordered-containers ^>= 0.2.16.0
, unordered-containers ^>= 0.2.20
, utf8-string ^>= 1.0.2
, validity ^>= 0.12.0.2
, vector >= 0.12.3.0 && <= 0.13.1.0
......
......@@ -15,7 +15,7 @@ import Data.Text qualified as T
import Data.Time (UTCTime(..), fromGregorian, toGregorian, utctDay)
import Data.Time.Format (parseTimeM, defaultTimeLocale)
import Data.Time.Format.ISO8601 (iso8601ParseM)
import Data.Time.LocalTime (TimeOfDay(..), timeOfDayToTime)
import Data.Time.LocalTime (TimeOfDay(..), timeOfDayToTime, timeToTimeOfDay)
import Gargantext.Defaults qualified as Def
import Gargantext.Prelude
import Text.Printf (printf)
......@@ -45,18 +45,45 @@ averageLength l = fromIntegral (sum (map length l)) / fromIntegral (length l)
-- print c -- $ toYear $ toGregorian $ utctDay c
makeUTCTime :: Int -> Int -> Int -> Int -> Int -> Int -> UTCTime
makeUTCTime year month day hour minute second' =
UTCTime dayPart timePart
-- | A simplified UTCTime record for our purposes
data UTCTimeR =
UTCTimeR { year :: Int
, month :: Int
, day :: Int
, hour :: Int
, minute :: Int
, sec :: Int }
deriving (Show, Eq, Generic)
defUTCTimeR :: UTCTimeR
defUTCTimeR = UTCTimeR { year = fromIntegral Def.year
, month = Def.month
, day = Def.day
, hour = Def.hour
, minute = Def.minute
, sec = Def.second }
toUTCTime :: UTCTimeR -> UTCTime
toUTCTime (UTCTimeR { .. }) = UTCTime dayPart timePart
where
dayPart = fromGregorian (fromIntegral year) month day
timePart = timeOfDayToTime (TimeOfDay hour minute (fromIntegral second'))
timePart = timeOfDayToTime (TimeOfDay hour minute (fromIntegral sec))
toUTCTimeR :: UTCTime -> UTCTimeR
toUTCTimeR (UTCTime { .. }) = UTCTimeR { year = fromIntegral year
, sec = round (realToFrac sec :: Float)
, .. }
where
(year, month, day) = toGregorian utctDay
TimeOfDay hour minute sec = timeToTimeOfDay utctDayTime
-- | Parse with multiple format attempts
parseFlexibleTime :: Text -> Maybe UTCTime
parseFlexibleTime t = msum
[ iso8601ParseM s
, parseTimeM True defaultTimeLocale "%Y-%m-%dT%H:%M:%S" s
, parseTimeM True defaultTimeLocale "%Y-%m-%d %H:%M:%S" s
, parseTimeM True defaultTimeLocale "%Y-%m-%d %H:%M:%S UTC" s
, parseTimeM True defaultTimeLocale "%Y-%m-%d" s
]
......@@ -68,6 +95,6 @@ dateParts :: Text -> [Text]
dateParts t = [T.pack $ printf "%04d" y, T.pack $ printf "%02d" m, T.pack $ printf "%02d" d]
where
mParsed = parseFlexibleTime t
parsed = fromMaybe (makeUTCTime (fromIntegral Def.year) Def.month Def.day
Def.hour Def.minute Def.second) mParsed
parsed = fromMaybe (toUTCTime defUTCTimeR) mParsed
(y, m, d) = toGregorian $ utctDay parsed
......@@ -31,6 +31,8 @@ module Gargantext.Core.Viz.Phylo where
import Control.Lens (over)
import Data.Aeson.Types qualified as JS
import Data.Discrimination qualified as D
import Data.HashMap.Strict qualified as HM
import Data.List.NonEmpty qualified as NE
import Data.Swagger (ToSchema(..), genericDeclareNamedSchema)
import Data.Text (pack)
......@@ -354,7 +356,10 @@ defaultPhyloParam =
------------------
-- | Date : a simple Integer (maxInt on 64 bit is good enough for storing number of seconds since year 0)
type Date = Int
newtype Date = Date { unDate :: Int }
deriving (Generic)
deriving newtype (NFData, Show, Eq, ToExpr, Ord, ToSchema, ToJSON, JS.ToJSONKey, FromJSON, JS.FromJSONKey, Hashable, Num, Enum, Integral, Real)
deriving anyclass (D.Grouping, D.Sorting)
-- | DateStr : the string version of a Date
type DateStr = Text
......@@ -388,7 +393,8 @@ data PhyloFoundations = PhyloFoundations
data PhyloCounts = PhyloCounts
{ coocByDate :: !(Map Date Cooc)
, docsByDate :: !(Map Date Double)
-- | For many docs, HashMap seems more performant than Map
, docsByDate :: !(HM.HashMap Date Double)
, rootsCountByDate :: !(Map Date (Map Int Double))
, rootsCount :: !(Map Int Double)
, rootsFreq :: !(Map Int Double)
......
{-|
Module : Gargantext.Core.Viz.Phylo.API
Module : Gargantext.Core.Viz.Phylo.API.Tools
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -24,7 +24,7 @@ import Data.ByteString.Lazy qualified as Lazy
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Data.Text (pack)
import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, {-diffDays,-} showGregorian, Day)
import Data.Time.Clock (UTCTime(..))
import Data.Time.Clock.POSIX (posixSecondsToUTCTime, utcTimeToPOSIXSeconds)
import Data.Time.Format (defaultTimeLocale, formatTime)
......@@ -35,7 +35,7 @@ import Gargantext.Core.NodeStory.Types (HasNodeStory, NodeStoryEnv, HasNodeStory
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, termsInText)
import Gargantext.Core.Types.Main (ListType(MapTerm))
import Gargantext.Core.Utils.DateUtils (makeUTCTime)
import Gargantext.Core.Utils.DateUtils (UTCTimeR(..), toUTCTime)
import Gargantext.Core.Viz.Phylo (TimeUnit(..), Date, Document(..), PhyloConfig(..), Phylo (_phylo_computeTime), trackComputeTime, ComputeTimeHistory)
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
......@@ -53,10 +53,15 @@ import Gargantext.Database.Schema.Node ( NodePoly(_node_hyperdata), node_hyperda
import Gargantext.Prelude hiding (to)
import Gargantext.System.Logging ( MonadLogger, LogLevel(DEBUG), logLocM )
import Gargantext.Utils.UTCTime (timeMeasured, timeMeasured'')
import System.Directory (copyFile)
import System.FilePath ((</>))
import System.IO.Temp (withTempDirectory)
import System.Process qualified as Shell
year0 :: Day
year0 = fromGregorian 0000 1 1
--------------------------------------------------------------------
getPhyloData :: HasNodeError err
=> PhyloId -> DBQuery err x (Maybe Phylo)
......@@ -107,6 +112,8 @@ phylo2dot phylo = do
value <- readFile fileFrom
copyFile fileFrom ("/home/przemek/phylo/phyloFrom.dot")
case value of
"" -> panic "[G.C.V.Phylo.API.phylo2dot] Error no file"
_ -> pure value
......@@ -193,6 +200,7 @@ context2phyloDocument timeUnit context (ngs_terms, ngs_sources) = do
-- TODO better default date and log the errors to improve data quality
-- TODO Context already has a date field
context2date :: Context HyperdataDocument -> TimeUnit -> Maybe (Date, Text)
context2date context timeUnit = do
let hyperdata = _context_hyperdata context
......@@ -213,39 +221,44 @@ context2date context timeUnit = do
toMonths :: Int -> Int -> Int -> Date
toMonths y m d = fromIntegral $ cdMonths
$ diffGregorianDurationClip (fromGregorian (fromIntegral y) m d)
(fromGregorian 0000 0 0)
year0
toDays :: Int -> Int -> Int -> Date
toDays y m d = fromIntegral
$ diffDays (fromGregorian (fromIntegral y) m d) (fromGregorian 0000 0 0)
toDays y m d = toSeconds y m d 0 0 0 `div` (24*60*60)
-- toDays y m d = fromIntegral
-- $ diffDays (fromGregorian (fromIntegral y) m d) year0
-- | Convert UTCTime to seconds since year 0
-- utcTimeToSecondsFromYear0 :: UTCTime -> Integer
-- utcTimeToSecondsFromYear0 utcTime =
-- let posixSeconds = utcTimeToPOSIXSeconds utcTime
-- -- POSIX epoch is 1970-01-01, need to add seconds from year 0 to 1970
-- year0To1970Seconds = 1970 * 365.25 * 24 * 3600 -- Approximate
-- in floor (posixSeconds + year0To1970Seconds)
-- Convert UTCTime to seconds since year 0
utcTimeToSecondsFromYear0 :: UTCTime -> Integer
utcTimeToSecondsFromYear0 utcTime =
let posixSeconds = utcTimeToPOSIXSeconds utcTime
-- POSIX epoch is 1970-01-01, need to add seconds from year 0 to 1970
year0To1970Seconds = 1970 * 365.25 * 24 * 3600 -- Approximate
in floor (posixSeconds + year0To1970Seconds)
-- | Convert UTCTime to seconds since year 1970
utcTimeToSecondsFromYear1970 :: UTCTime -> Integer
utcTimeToSecondsFromYear1970 utcTime = floor $ utcTimeToPOSIXSeconds utcTime
toHours :: Int -> Int -> Int -> Int -> Date
toHours y m d hh = fromIntegral $
(utcTimeToSecondsFromYear0 $ makeUTCTime y m d hh 0 0) `div` (60*60)
toHours y m d hh = toSeconds y m d hh 0 0 `div` (60*60)
toMinutes :: Int -> Int -> Int -> Int -> Int -> Date
toMinutes y m d hh mm = fromIntegral $
(utcTimeToSecondsFromYear0 $ makeUTCTime y m d hh mm 0) `div` 60
toMinutes y m d hh mm = toSeconds y m d hh mm 0 `div` 60
toSeconds :: Int -> Int -> Int -> Int -> Int -> Int -> Date
toSeconds y m d hh mm ss = fromIntegral $
utcTimeToSecondsFromYear0 $ makeUTCTime y m d hh mm ss
utcTimeToSecondsFromYear1970 $ toUTCTime utcTimeR
where
utcTimeR = UTCTimeR { year = y, month = m, day = d, hour = hh, minute = mm, sec = ss }
-- | This is kinda like a hashing function that assigns different
-- 'Date' (in fact an 'Int') to different period, depending on the
-- 'TimeUnit'
toPhyloDate :: Int -> Int -> Int -> Int -> Int -> Int -> TimeUnit -> Date
toPhyloDate y m d hh mm ss tu = case tu of
Year {} -> y
Year {} -> fromIntegral y
Month {} -> toMonths y m d
Week {} -> div (toDays y m d) 7
Day {} -> toDays y m d
......@@ -261,11 +274,17 @@ toPhyloDate' y m d _hh _mm _ss (Month {}) = pack $ showGregorian $ fromGregor
toPhyloDate' y m d _hh _mm _ss (Week {}) = pack $ showGregorian $ fromGregorian (toInteger y) m d
toPhyloDate' y m d _hh _mm _ss (Day {}) = pack $ showGregorian $ fromGregorian (toInteger y) m d
toPhyloDate' y m d hh _mm _ss (Hour {}) =
pack $ formatTime defaultTimeLocale "%Y-%m-%d %H" $ makeUTCTime y m d hh 0 0
pack $ formatTime defaultTimeLocale "%Y-%m-%d %H" $ toUTCTime utcTimeR
where
utcTimeR = UTCTimeR { year = y, month = m, day = d, hour = hh, minute = 0, sec = 0 }
toPhyloDate' y m d hh mm _ss (Minute {}) =
pack $ formatTime defaultTimeLocale "%Y-%m-%d %H:%M" $ makeUTCTime y m d hh mm 0
pack $ formatTime defaultTimeLocale "%Y-%m-%d %H:%M" $ toUTCTime utcTimeR
where
utcTimeR = UTCTimeR { year = y, month = m, day = d, hour = hh, minute = mm, sec = 0 }
toPhyloDate' y m d hh mm ss (Second {}) =
pack $ formatTime defaultTimeLocale "%Y-%m-%d %H:%M:%S" $ makeUTCTime y m d hh mm ss
pack $ formatTime defaultTimeLocale "%Y-%m-%d %H:%M:%S" $ toUTCTime utcTimeR
where
utcTimeR = UTCTimeR { year = y, month = m, day = d, hour = hh, minute = mm, sec = ss }
-- Utils
......
......@@ -17,6 +17,7 @@ module Gargantext.Core.Viz.Phylo.Example where
import Control.Lens
import Data.GraphViz.Types.Generalised (DotGraph)
import Data.HashMap.Strict qualified as HM
import Data.List (nub)
import Data.Map qualified as Map
import Data.Text (toLower)
......@@ -90,7 +91,7 @@ periods = toPeriods (sort $ nub $ map date docs)
(getTimeStep $ timeUnit config)
nbDocsByYear :: Map Date Double
nbDocsByYear :: HM.HashMap Date Double
nbDocsByYear = docsToTimeScaleNb docs
......
......@@ -16,6 +16,7 @@ module Gargantext.Core.Viz.Phylo.PhyloExport where
import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using)
import Data.HashMap.Strict qualified as HM
import Data.GraphViz hiding (DotGraph, Order)
import Data.GraphViz.Attributes.Complete hiding (EdgeType, Order)
import Data.GraphViz.Attributes.HTML qualified as H
......@@ -195,7 +196,7 @@ exportToDot phylo export =
{-- home made attributes -}
<> [ toAttr (fromStrict "phyloFoundations") $ show (length $ Vector.toList $ getRoots phylo)
, toAttr (fromStrict "phyloTerms") $ show (length $ nub $ concat $ map (^. phylo_groupNgrams) $ export ^. export_groups)
, toAttr (fromStrict "phyloDocs") $ show (sum $ elems $ getDocsByDate phylo)
, toAttr (fromStrict "phyloDocs") $ show (sum $ HM.elems $ getDocsByDate phylo)
, toAttr (fromStrict "phyloPeriods") $ show (length $ elems $ phylo ^. phylo_periods)
, toAttr (fromStrict "phyloBranches") $ show (length $ export ^. export_branches)
, toAttr (fromStrict "phyloGroups") $ show (length $ export ^. export_groups)
......@@ -639,7 +640,7 @@ toHorizon phylo =
heads = filter (\g -> (not . null) $ (g ^. phylo_groupPeriodChilds))
$ filter (\g -> null (g ^. phylo_groupPeriodParents) && (notElem (getGroupId g) childs)) groups
noHeads = groups \\ heads
nbDocs = sum $ elems $ filterDocs (getDocsByDate phylo) [prd]
nbDocs = sum $ HM.elems $ filterDocs (getDocsByDate phylo) [prd]
diago = reduceDiagos $ filterDiago (getCoocByDate phylo) [prd]
sim = (similarity $ getConfig phylo)
step = case getSeaElevation phylo of
......
......@@ -19,6 +19,7 @@ import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parMap, rpar)
import Data.Containers.ListUtils (nubOrd)
import Data.Discrimination qualified as D
import Data.HashMap.Strict qualified as HM
import Data.List (partition, intersect, tail)
import Data.List qualified as List
import Data.Map (fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, insert)
......@@ -196,7 +197,7 @@ findSeaLadder phylo = case getSeaElevation phylo of
let candidates = filter (\target -> (> 2) $ length
$ intersect (getGroupNgrams source) (getGroupNgrams target)) targets
in map (\target ->
let nbDocs = (sum . elems)
let nbDocs = (sum . HM.elems)
$ filterDocs docs ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
diago = reduceDiagos
$ filterDiago diagos ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
......@@ -208,10 +209,12 @@ findSeaLadder phylo = case getSeaElevation phylo of
appendGroups :: (a -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> Map Int Double -> PhyloGroup) -> Scale -> Map (Date,Date) [a] -> Phylo -> Phylo
appendGroups f lvl m phylo =
tracePhylo ("\n" <> "-- | Append "
<> show (length $ concat $ elems m)
<> " groups to scale "
<> show (lvl) <> "\n" :: Text)
tracePhylo (mconcat [ "\n" :: Text
, "-- | Append "
, show $ length $ concat $ elems m
, " groups to scale "
, show lvl
, "\n" ])
$ over ( phylo_periods
. traverse
. phylo_periodScales
......@@ -397,9 +400,11 @@ docsToTimeScaleCooc docs fdt =
mCooc' = fromList
$ map (\t -> (t,empty))
$ toTimeScale (map date docs) 1
in tracePhylo ("\n" <> "-- | Build the coocurency matrix for "
<> show (length $ keys mCooc')
<> " unit of time" <> "\n" :: Text)
in tracePhylo (mconcat [ "\n" :: Text
, "-- | Build the coocurency matrix for "
, show $ length $ keys mCooc'
, " unit of time"
, "\n" ])
$ unionWith sumCooc mCooc mCooc'
......@@ -423,10 +428,13 @@ groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(da
groupDocsByPeriod' f pds docs =
let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
periods = parMap rpar (inPeriode f docs') pds
in tracePhylo ("\n" <> "-- | Group "
<> show (length docs)
<> " docs by "
<> show (length pds) <> " periods" <> "\n" :: Text)
in tracePhylo (mconcat [ "\n" :: Text
, "-- | Group "
, show $ length docs
, " docs by "
, show $ length pds
, " periods"
,"\n" ])
$ fromList $ zip pds periods
where
--------------------------------------
......@@ -442,9 +450,13 @@ groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents
groupDocsByPeriod f pds es =
let periods = parMap rpar (inPeriode f es) pds
in tracePhylo ("\n" <> "-- | Group "
<> show (length es) <> " docs by "
<> show (length pds) <> " periods" <> "\n" :: Text)
in tracePhylo (mconcat [ "\n" :: Text
, "-- | Group "
, show $ length es
, " docs by "
, show $ length pds
, " periods"
, "\n" ])
$ fromList $ zip pds periods
where
--------------------------------------
......@@ -462,7 +474,13 @@ docsToTermFreq docs fdt =
$ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
$ group $ D.sort $ concat $ map (\d -> D.nub $ ngramsToIdx (text d) fdt) docs
sumFreqs = sum $ elems freqs
in map (/sumFreqs) freqs
in tracePhylo (mconcat [ "\n" :: Text
, "-- | Docs "
, show nbDocs
, " to term freq "
, show $ length freqs
, "\n" ])
$ map (/sumFreqs) freqs
docsToTermCount :: [Document] -> Vector Ngrams -> Map Int Double
......@@ -479,7 +497,13 @@ docsToTimeTermCount docs roots =
$ fromListWith (++)
$ map (\d -> (date d, D.nub $ ngramsToIdx (text d) roots)) docs
time = fromList $ map (\t -> (t,Map.empty)) $ toTimeScale (keys docs') 1
in unionWith (Map.union) time docs'
in tracePhylo (mconcat [ "\n" :: Text
, "-- | Docs "
, show $ length docs'
, " to time term count "
, show $ length time
, "\n" ])
$ unionWith (Map.union) time docs'
docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
......@@ -491,20 +515,30 @@ docsToLastTermFreq n docs fdt =
$ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
$ group $ D.sort $ concat $ map (\d -> D.nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
sumFreqs = sum $ elems freqs
in map (/sumFreqs) freqs
in tracePhylo (mconcat [ "\n" :: Text
, "-- | Docs "
, show nbDocs
, " to last term freq "
, show $ length freqs
, "\n" ])
$ map (/sumFreqs) freqs
-- To count the number of docs by unit of time
docsToTimeScaleNb :: [Document] -> Map Date Double
docsToTimeScaleNb :: [Document] -> HM.HashMap Date Double
docsToTimeScaleNb docs =
let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
in tracePhylo ("\n" <> "-- | Group "
<> show (length docs)
<> " docs by "
<> show (length time)
<> " unit of time" <> "\n" :: Text)
$ unionWith (+) time docs'
let docs' = HM.fromListWith (+) $ map (\d -> (date d,1)) docs
time = HM.fromList $ map (\t -> (t,0)) $ toTimeScale (HM.keys docs') 1
in tracePhylo (mconcat [ "\n" :: Text
, "-- | Group "
, show $ length docs
, " docs by "
, show $ length time
, " unit of time"
, ", docs keys: "
, show $ HM.keys docs'
, "\n" ])
$ HM.unionWith (+) time docs'
initPhyloScales :: Int -> Period -> Map PhyloScaleId PhyloScale
......@@ -555,16 +589,19 @@ initPhylo docs conf =
then defaultPhyloParam { _phyloParam_config = setDefault conf timeScale (length docs) }
else defaultPhyloParam { _phyloParam_config = conf }
periods = toPeriods (D.sort $ D.nub $ map date docs) (getTimePeriod timeScale) (getTimeStep timeScale)
in tracePhylo ("\n" <> "-- | Init a phylo out of "
<> show (length docs) <> " docs \n" :: Text)
$ tracePhylo ("\n" <> "-- | lambda "
<> show (_qua_granularity $ phyloQuality $ _phyloParam_config params) :: Text)
$ Phylo foundations
docsSources
docsCounts
[]
params
(fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods)
0
(_qua_granularity $ phyloQuality $ _phyloParam_config params)
Nothing
in tracePhylo (mconcat [ "\n" :: Text
, "-- | Init a phylo out of "
, show $ length docs
, " docs \n" ])
$ tracePhylo (mconcat [ "\n" :: Text
, "-- | lambda "
, show $ _qua_granularity $ phyloQuality $ _phyloParam_config params ])
$ Phylo { _phylo_foundations = foundations
, _phylo_sources = docsSources
, _phylo_counts = docsCounts
, _phylo_seaLadder = []
, _phylo_param = params
, _phylo_periods = fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods
, _phylo_quality = 0
, _phylo_level = _qua_granularity $ phyloQuality $ _phyloParam_config params
, _phylo_computeTime = Nothing }
......@@ -15,6 +15,7 @@ Portability : POSIX
module Gargantext.Core.Viz.Phylo.PhyloTools where
import Control.Lens ( over, filtered, view, (%~) )
import Data.HashMap.Strict qualified as HM
import Data.List (union, nub, init, tail, partition, nubBy, (!!))
import Data.List qualified as List
import Data.Map (elems, empty, fromList, findWithDefault, unionWith, keys, member, (!), filterWithKey, fromListWith, restrictKeys)
......@@ -184,7 +185,7 @@ getTimeScale p = case timeUnit $ getConfig p of
toTimeScale :: [Date] -> Int -> [Date]
toTimeScale dates step =
let (start,end) = findBounds dates
in [start, (start + step) .. end]
in [start, (start + fromIntegral step) .. end]
getTimeStep :: TimeUnit -> Int
......@@ -550,7 +551,7 @@ getCoocByDate phylo = coocByDate (phylo ^. phylo_counts)
getRootsCountByDate :: Phylo -> Map Date (Map Int Double)
getRootsCountByDate phylo = rootsCountByDate (phylo ^. phylo_counts)
getDocsByDate :: Phylo -> Map Date Double
getDocsByDate :: Phylo -> HM.HashMap Date Double
getDocsByDate phylo = docsByDate (phylo ^. phylo_counts)
getRootsCount :: Phylo -> Map Int Double
......
......@@ -14,6 +14,7 @@ module Gargantext.Core.Viz.Phylo.SynchronicClustering where
import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using)
import Data.HashMap.Strict qualified as HM
import Data.List (intersect, nub)
import Data.Map (fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member, unionWith)
import Data.Map qualified as Map
......@@ -21,6 +22,7 @@ import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.PhyloExport (processDynamics)
import Gargantext.Core.Viz.Phylo.PhyloTools
import Gargantext.Core.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
import Gargantext.Data.HashMap.Strict.Utils qualified as HMU
import Gargantext.Prelude hiding (empty)
......@@ -157,7 +159,12 @@ toParentId :: PhyloGroup -> PhyloGroupId
toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupScale + 1), child ^. phylo_groupIndex)
reduceGroups :: PhyloSimilarity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
reduceGroups :: PhyloSimilarity
-> Synchrony
-> HM.HashMap Date Double
-> Map Date Cooc
-> [PhyloGroup]
-> [PhyloGroup]
reduceGroups prox sync docs diagos branch =
-- 1) reduce a branch as a set of periods & groups
let periods = fromListWith (++)
......@@ -167,9 +174,9 @@ reduceGroups prox sync docs diagos branch =
$ mapWithKey (\prd groups ->
-- 2) for each period, transform the groups as a proximity graph filtered by a threshold
let diago = reduceDiagos $ filterDiago diagos [prd]
edgesLeft = fromList $ groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago groups
edgesLeft = fromList $ groupsToEdges prox sync ((sum . HM.elems) $ HMU.restrictKeys docs $ periodsToYears [prd]) diago groups
edgesRight = fromList $ map (\((k1,k2),v) -> ((k2,k1),v))
$ groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago (reverse groups)
$ groupsToEdges prox sync ((sum . HM.elems) $ HMU.restrictKeys docs $ periodsToYears [prd]) diago (reverse groups)
mergedEdges = Map.toList
$ unionWith (\v1 v2 -> if v1 >= v2
then v1
......
......@@ -15,6 +15,7 @@ module Gargantext.Core.Viz.Phylo.TemporalMatching where
import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using)
import Data.HashMap.Strict qualified as HM
import Data.List (tail, intersect, nub, nubBy, union, partition)
import Data.List qualified as List
import Data.Map (fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), empty, mapKeys, adjust, filterWithKey)
......@@ -25,6 +26,7 @@ import Data.Text qualified as T
import Data.Vector qualified as Vector
import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.PhyloTools
import Gargantext.Data.HashMap.Strict.Utils qualified as HMU
import Gargantext.Prelude hiding (empty)
import Text.Printf
......@@ -192,8 +194,8 @@ filterPointersByPeriod fil pts =
ToChildsMemory -> undefined
ToParentsMemory -> undefined
filterDocs :: Map Date Double -> [Period] -> Map Date Double
filterDocs d pds = restrictKeys d $ periodsToYears pds
filterDocs :: HM.HashMap Date Double -> [Period] -> HM.HashMap Date Double
filterDocs d pds = HMU.restrictKeys d $ periodsToYears pds
filterDiago :: Map Date Cooc -> [Period] -> Map Date Cooc
filterDiago diago pds = restrictKeys diago $ periodsToYears pds
......@@ -232,8 +234,16 @@ groupsToBranches groups =
{-
-- find the best pair/singleton of parents/childs for a given group
-}
makePairs :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [Period] -> [Pointer] -> Filiation -> Double -> PhyloSimilarity
-> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
makePairs :: (PhyloGroupId,[Int])
-> [(PhyloGroupId,[Int])]
-> [Period]
-> [Pointer]
-> Filiation
-> Double
-> PhyloSimilarity
-> HM.HashMap Date Double
-> Map Date Cooc
-> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
makePairs (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
if (null periods)
then []
......@@ -247,7 +257,7 @@ makePairs (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs di
inPairs :: [PhyloGroupId]
inPairs = map fst
$ filter (\(id,ngrams) ->
let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
let nbDocs = (sum . HM.elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
in (toSimilarity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
) candidates
......@@ -259,8 +269,15 @@ makePairs (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs di
{-
-- find the best temporal links between a given group and its parents/childs
-}
phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> PhyloSimilarity -> Map Date Double -> Map Date Cooc
-> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
phyloGroupMatching :: [[(PhyloGroupId,[Int])]]
-> Filiation
-> PhyloSimilarity
-> HM.HashMap Date Double
-> Map Date Cooc
-> Double
-> [Pointer]
-> (PhyloGroupId,[Int])
-> [Pointer]
phyloGroupMatching candidates filiation proxi docs diagos thr oldPointers (id,ngrams) =
if (null $ filterPointers proxi thr oldPointers)
-- if no previous pointers satisfy the current threshold then let's find new pointers
......@@ -282,7 +299,7 @@ phyloGroupMatching candidates filiation proxi docs diagos thr oldPointers (id,ng
$ scanl (\acc targets ->
let periods = nub $ map (fst . fst . fst) targets
lastPrd = findLastPeriod filiation periods
nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
nbdocs = sum $ HM.elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
diago = reduceDiagos
$ filterDiago diagos ([(fst . fst) id] ++ periods)
singletons = processSimilarity nbdocs diago $ map (\g -> (g,g)) $ filter (\g -> (fst . fst . fst) g == lastPrd) targets
......@@ -330,7 +347,14 @@ getCandidates minNgrams ego targets =
{-
-- set up and start performing the upstream/downstream inter‐temporal matching period by period
-}
reconstructTemporalLinks :: Int -> [Period] -> PhyloSimilarity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
reconstructTemporalLinks :: Int
-> [Period]
-> PhyloSimilarity
-> Double
-> HM.HashMap Date Double
-> Map Date Cooc
-> [PhyloGroup]
-> [PhyloGroup]
reconstructTemporalLinks frame periods similarity thr docs coocs groups =
let groups' = groupByField _phylo_groupPeriod groups
in foldl' (\acc prd ->
......@@ -397,7 +421,15 @@ filterByNgrams inf ngrams groups =
{-
-- perform the upstream/downstream inter‐temporal matching process group by group
-}
reconstructTemporalLinks' :: Int -> [Period] -> PhyloSimilarity -> Double -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [PhyloGroup] -> [PhyloGroup]
reconstructTemporalLinks' :: Int
-> [Period]
-> PhyloSimilarity
-> Double
-> HM.HashMap Date Double
-> Map Date Cooc
-> Map Int [PhyloGroupId]
-> [PhyloGroup]
-> [PhyloGroup]
reconstructTemporalLinks' frame periods similarity thr docs coocs roots groups =
let egos = map (\ego ->
let -- 1) find the parents/childs matching periods
......@@ -433,7 +465,15 @@ reconstructTemporalLinks' frame periods similarity thr docs coocs roots groups =
{-
-- reconstruct a phylomemetic network from a list of groups and from a given threshold
-}
toPhylomemeticNetwork :: Int -> [Period] -> PhyloSimilarity -> Double -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [PhyloGroup] -> [Branch]
toPhylomemeticNetwork :: Int
-> [Period]
-> PhyloSimilarity
-> Double
-> HM.HashMap Date Double
-> Map Date Cooc
-> Map Int [PhyloGroupId]
-> [PhyloGroup]
-> [Branch]
toPhylomemeticNetwork timescale periods similarity thr docs coocs roots groups =
groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
$ reconstructTemporalLinks' timescale periods similarity thr docs coocs roots groups
......@@ -593,10 +633,20 @@ thrToMeta thr branches =
-- done = all the already separated branches
-- rest = all the branches we still have to separate
-}
separateBranches :: Double -> PhyloSimilarity -> Double -> Map Int Double -> Int -> Double -> Double
-> Int -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [Period]
-> [(Branch,ShouldTry)] -> (Branch,ShouldTry) -> [(Branch,ShouldTry)]
-> [(Branch,ShouldTry)]
separateBranches :: Double
-> PhyloSimilarity
-> Double
-> Map Int Double
-> Int
-> Double
-> Double
-> Int
-> HM.HashMap Date Double
-> Map Date Cooc
-> Map Int [PhyloGroupId]
-> [Period]
-> [(Branch,ShouldTry)] -> (Branch,ShouldTry) -> [(Branch,ShouldTry)]
-> [(Branch,ShouldTry)]
separateBranches fdt similarity lambda frequency minBranch thr rise timescale docs coocs roots periods done currentBranch rest =
let done' = done ++ (if snd currentBranch
then
......@@ -651,7 +701,7 @@ separateBranches fdt similarity lambda frequency minBranch thr rise timescale do
seaLevelRise :: Double -> PhyloSimilarity -> Double -> Int -> Map Int Double
-> [Double] -> Double
-> Int -> [Period]
-> Map Date Double -> Map Date Cooc
-> HM.HashMap Date Double -> Map Date Cooc
-> Map Int [PhyloGroupId]
-> [(Branch,ShouldTry)]
-> ([(Branch,ShouldTry)],FinalQuality)
......
......@@ -13,6 +13,7 @@ module Gargantext.Data.HashMap.Strict.Utils where
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.Set qualified as Set
import Gargantext.Prelude
------------------------------------------------------------------------
......@@ -45,3 +46,7 @@ getKeysOrderedByValueMaxFirst m = go [] Nothing (HashMap.toList m)
| v > u = go [k] (Just v) rest
| otherwise = go (k:ks) (Just v) rest
-- | Similar to 'Data.Map.restrictKeys'
restrictKeys :: (Hashable k, Eq k, Ord k) => HashMap k v -> Set.Set k -> HashMap k v
restrictKeys hm keysToKeep = HashMap.filterWithKey (\k _ -> Set.member k keysToKeep) hm
......@@ -20,24 +20,26 @@ import Gargantext.Database.Admin.Types.Hyperdata.Prelude
import Codec.Serialise.Class hiding (decode)
------------------------------------------------------------------------
data HyperdataDocument = HyperdataDocument { _hd_bdd :: !(Maybe Text)
, _hd_doi :: !(Maybe Text)
, _hd_url :: !(Maybe Text)
, _hd_page :: !(Maybe Int)
, _hd_title :: !(Maybe Text)
, _hd_authors :: !(Maybe Text)
, _hd_institutes :: !(Maybe Text)
, _hd_source :: !(Maybe Text)
, _hd_abstract :: !(Maybe Text)
, _hd_publication_date :: !(Maybe Text)
, _hd_publication_year :: !(Maybe Int)
, _hd_publication_month :: !(Maybe Int)
, _hd_publication_day :: !(Maybe Int)
, _hd_publication_hour :: !(Maybe Int)
, _hd_publication_minute :: !(Maybe Int)
, _hd_publication_second :: !(Maybe Int)
, _hd_language_iso2 :: !(Maybe Text)
, _hd_institutes_tree :: !(Maybe (Map Text [Text]))
data HyperdataDocument = HyperdataDocument { _hd_bdd :: !(Maybe Text)
, _hd_doi :: !(Maybe Text)
, _hd_url :: !(Maybe Text)
, _hd_page :: !(Maybe Int)
, _hd_title :: !(Maybe Text)
, _hd_authors :: !(Maybe Text)
, _hd_institutes :: !(Maybe Text)
, _hd_source :: !(Maybe Text)
, _hd_abstract :: !(Maybe Text)
, _hd_language_iso2 :: !(Maybe Text)
, _hd_institutes_tree :: !(Maybe (Map Text [Text]))
, _hd_publication_date :: !(Maybe Text)
, _hd_publication_year :: !(Maybe Int)
, _hd_publication_month :: !(Maybe Int)
, _hd_publication_day :: !(Maybe Int)
, _hd_publication_hour :: !(Maybe Int)
, _hd_publication_minute :: !(Maybe Int)
, _hd_publication_second :: !(Maybe Int)
}
deriving (Show, Generic)
......@@ -73,23 +75,25 @@ data StatusV3 = StatusV3 { statusV3_error :: !(Maybe Text)
} deriving (Show, Generic)
------------------------------------------------------------------------
data HyperdataDocumentV3 = HyperdataDocumentV3 { _hdv3_publication_day :: !(Maybe Int)
, _hdv3_language_iso2 :: !(Maybe Text)
, _hdv3_publication_second :: !(Maybe Int)
, _hdv3_publication_minute :: !(Maybe Int)
, _hdv3_publication_month :: !(Maybe Int)
, _hdv3_publication_hour :: !(Maybe Int)
, _hdv3_error :: !(Maybe Text)
, _hdv3_language_iso3 :: !(Maybe Text)
, _hdv3_authors :: !(Maybe Text)
, _hdv3_publication_year :: !(Maybe Int)
, _hdv3_publication_date :: !(Maybe Text)
, _hdv3_language_name :: !(Maybe Text)
, _hdv3_statuses :: !(Maybe [StatusV3])
, _hdv3_realdate_full_ :: !(Maybe Text)
, _hdv3_source :: !(Maybe Text)
, _hdv3_abstract :: !(Maybe Text)
, _hdv3_title :: !(Maybe Text)
data HyperdataDocumentV3 = HyperdataDocumentV3 { _hdv3_language_iso2 :: !(Maybe Text)
, _hdv3_error :: !(Maybe Text)
, _hdv3_language_iso3 :: !(Maybe Text)
, _hdv3_authors :: !(Maybe Text)
, _hdv3_language_name :: !(Maybe Text)
, _hdv3_statuses :: !(Maybe [StatusV3])
, _hdv3_realdate_full_ :: !(Maybe Text)
, _hdv3_source :: !(Maybe Text)
, _hdv3_abstract :: !(Maybe Text)
, _hdv3_title :: !(Maybe Text)
, _hdv3_publication_date :: !(Maybe Text)
, _hdv3_publication_second :: !(Maybe Int)
, _hdv3_publication_minute :: !(Maybe Int)
, _hdv3_publication_hour :: !(Maybe Int)
, _hdv3_publication_day :: !(Maybe Int)
, _hdv3_publication_month :: !(Maybe Int)
, _hdv3_publication_year :: !(Maybe Int)
} deriving (Show, Generic)
......
......@@ -14,9 +14,12 @@ module Test.Core.Utils where
import Data.Time (toGregorian, utctDay, utctDayTime)
import Data.Time.LocalTime (TimeOfDay(..), timeOfDayToTime)
import Gargantext.Core.Utils
import Gargantext.Core.Utils.DateUtils (dateParts, parseFlexibleTime)
import Gargantext.Core.Utils.DateUtils (dateParts, parseFlexibleTime, toUTCTime, toUTCTimeR)
import Gargantext.Prelude
import Test.Hspec
import Test.Hspec.QuickCheck
import Test.Instances ()
-- | Core.Utils tests
test :: Spec
......@@ -40,7 +43,11 @@ test = do
it "simple list" $ nonemptyIntercalate "," ["x"] `shouldBe` "x"
it "two-element list" $ nonemptyIntercalate "," ["x", "y"] `shouldBe` "x,y"
it "with empty strings" $ nonemptyIntercalate "," ["a", "", "b", "", "c", ""] `shouldBe` "a,b,c"
describe "DateUtils works" $ do
describe "DateUtils" $ do
describe "UTCTimeR works" $ do
prop "can convert to/from" $
\utcTimeR -> toUTCTimeR (toUTCTime utcTimeR) == utcTimeR
describe "parseFlexibleTime works" $ do
it "ISO8601 format works 1" $ do
let parsed = parseFlexibleTime "2025-05-04T12:05:01.000Z"
......@@ -50,6 +57,14 @@ test = do
let parsed = parseFlexibleTime "2025-05-04T12:05:01Z"
(toGregorian . utctDay) <$> parsed `shouldBe` (Just (2025, 5, 4))
utctDayTime <$> parsed `shouldBe` (Just $ timeOfDayToTime $ TimeOfDay 12 5 1)
it "'2025-07-20T01:00:13' format works" $ do
let parsed = parseFlexibleTime "2025-07-20T01:00:13"
(toGregorian . utctDay) <$> parsed `shouldBe` (Just (2025, 7, 20))
utctDayTime <$> parsed `shouldBe` (Just $ timeOfDayToTime $ TimeOfDay 1 0 13)
it "'2025-07-20 01:00:13' format works" $ do
let parsed = parseFlexibleTime "2025-07-20 01:00:13"
(toGregorian . utctDay) <$> parsed `shouldBe` (Just (2025, 7, 20))
utctDayTime <$> parsed `shouldBe` (Just $ timeOfDayToTime $ TimeOfDay 1 0 13)
it "'2025-07-20 01:00:13 UTC' format works" $ do
let parsed = parseFlexibleTime "2025-07-20 01:00:13 UTC"
(toGregorian . utctDay) <$> parsed `shouldBe` (Just (2025, 7, 20))
......
......@@ -54,6 +54,7 @@ import Gargantext.Core.Text.Corpus.Query (RawQuery(..))
import Gargantext.Core.Types (TableResult)
import Gargantext.Core.Types.Individu qualified as Individu
import Gargantext.Core.Types.Main (ListType(CandidateTerm, StopTerm, MapTerm))
import Gargantext.Core.Utils.DateUtils (UTCTimeR(..))
import Gargantext.Core.Viz.Phylo qualified as Phylo
import Gargantext.Core.Worker.Jobs.Types (Job(..))
import Gargantext.Database.Admin.Types.Hyperdata qualified as Hyperdata
......@@ -212,6 +213,22 @@ instance Arbitrary TableQuery where
instance Arbitrary UTCTimeR where
arbitrary = do
year <- arbitrary
month <- chooseInt (1, 12)
day <- if month `elem` [1, 3, 5, 7, 8, 10, 12]
then chooseInt (1, 31)
else if month == 2
then chooseInt (1, 28)
else chooseInt (1, 30)
hour <- chooseInt (0, 23)
minute <- chooseInt (0, 59)
sec <- chooseInt (0, 59)
pure $ UTCTimeR { .. }
-- phylo
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment