[phylo] hourly phylo seems to work now

parent 32c2d5c8
Pipeline #7828 failed with stages
in 65 minutes and 25 seconds
......@@ -20,10 +20,12 @@ import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Core.Utils.DateUtils (UTCTimeR(..))
import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.API.Tools
import Gargantext.Core.Viz.Phylo.PhyloTools (toPeriods, getTimePeriod, getTimeStep)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), unwrapHyperdataDateTime)
import Gargantext.Defaults qualified as Def
import Gargantext.Prelude hiding (hash, replace)
import Prelude qualified
import System.Directory (listDirectory)
......@@ -59,15 +61,12 @@ wosToDocs limit patterns time path = do
abstr = if (isJust $ _hd_abstract d)
then fromJust $ _hd_abstract d
else ""
in Document (toPhyloDate
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d) time)
(toPhyloDate'
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d) time)
(termsInText patterns $ title <> " " <> abstr) Nothing [] time)
in Document { date = toPhyloDate (unwrapHyperdataDateTime d) time
, date' = toPhyloDate' (unwrapHyperdataDateTime d) time
, text = termsInText patterns $ title <> " " <> abstr
, weight = Nothing
, sources = []
, docTime = time })
<$> concat
<$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d)
......@@ -82,22 +81,36 @@ tsvToDocs parser patterns time path =
Wos _ -> errorTrace "tsvToDocs: unimplemented"
Tsv limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (Tsv.fromMIntOrDec Tsv.defaultYear $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
(toPhyloDate' (Tsv.fromMIntOrDec Tsv.defaultYear $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
(termsInText patterns $ (tsv_title row) <> " " <> (tsv_abstract row))
Nothing
[]
time
<$> Vector.map (\row -> Document { date = toPhyloDate (rowToUTCTimeR row) time
, date' = toPhyloDate' (rowToUTCTimeR row) time
, text = termsInText patterns $ (tsv_title row) <> " " <> (tsv_abstract row)
, weight = Nothing
, sources = []
, docTime = time }
) <$> snd <$> either (\err -> panicTrace $ "TSV error" <> (show err)) identity <$> Tsv.readTSVFile path
where
rowToUTCTimeR row = UTCTimeR { year = Tsv.fromMIntOrDec Tsv.defaultYear $ tsv_publication_year row
, month = fromMaybe Tsv.defaultMonth $ tsv_publication_month row
, day = fromMaybe Tsv.defaultDay $ tsv_publication_day row
, hour = Def.hour
, minute = Def.minute
, sec = Def.second }
Tsv' limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (tsv'_publication_year row) (tsv'_publication_month row) (tsv'_publication_day row) time)
(toPhyloDate' (tsv'_publication_year row) (tsv'_publication_month row) (tsv'_publication_day row) time)
(termsInText patterns $ (tsv'_title row) <> " " <> (tsv'_abstract row))
(Just $ tsv'_weight row)
(map (T.strip . pack) $ splitOn ";" (unpack $ (tsv'_source row)))
time
<$> Vector.map (\row -> Document { date = toPhyloDate (rowToUTCTimeR row) time
, date' = toPhyloDate' (rowToUTCTimeR row) time
, text = termsInText patterns $ (tsv'_title row) <> " " <> (tsv'_abstract row)
, weight = Just $ tsv'_weight row
, sources = map (T.strip . pack) $ splitOn ";" (unpack $ (tsv'_source row))
, docTime = time }
) <$> snd <$> Tsv.readWeightedTsv path
where
rowToUTCTimeR row = UTCTimeR { year = tsv'_publication_year row
, month = tsv'_publication_month row
, day = tsv'_publication_day row
, hour = Def.hour
, minute = Def.minute
, sec = Def.second }
Csv _ -> panicTrace "CSV is currently not supported."
Csv' _ -> panicTrace "CSV is currently not supported."
......@@ -162,6 +175,7 @@ timeToLabel config = case (timeUnit config) of
Month p s f -> ("time_months" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Hour _p _s _f -> panicTrace "hours not implemented"
seaToLabel :: PhyloConfig -> [Char]
......
......@@ -848,6 +848,7 @@ test-suite garg-test
Test.Core.LinearAlgebra
Test.Core.Notifications
Test.Core.Orchestrator
Test.Core.Phylo
Test.Core.Similarity
Test.Core.Text
Test.Core.Text.Corpus.Query
......
......@@ -109,7 +109,7 @@ type Weight = Double
------------------------------------------------------------------------
-- | Phylo 'GraphData' datatype descriptor. It must be isomorphic to
-- the 'GraphData' type of the purecript frontend.
-- This structure corresponds to the direct output of the 'dot' command.
data GraphData =
GraphData {
_gd__subgraph_cnt :: Int
......@@ -473,10 +473,12 @@ instance FromJSON GraphData where
parseJSON = withObject "GraphData" $ \o -> do
_gd__subgraph_cnt <- o .: "_subgraph_cnt"
_gd_directed <- o .: "directed"
_gd_edges <- o .: "edges"
-- | Sometimes, edges are missing in dot output
gd_edges <- o .:? "edges"
_gd_objects <- o .: "objects"
_gd_strict <- o .: "strict"
_gd_data <- parseJSON (Object o)
let _gd_edges = fromMaybe [] gd_edges
pure GraphData{..}
instance ToJSON EdgeData where
......
......@@ -135,6 +135,10 @@ data TimeUnit =
{ _day_period :: Int
, _day_step :: Int
, _day_matchingFrame :: Int }
| Hour
{ _hour_period :: Int
, _hour_step :: Int
, _hour_matchingFrame :: Int }
deriving (Show,Generic,Eq,NFData,ToExpr)
instance ToSchema TimeUnit where
......
......@@ -24,6 +24,7 @@ import Data.ByteString.Lazy qualified as Lazy
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Data.Text (pack)
import Data.Time (diffUTCTime)
import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
import Data.Time.Clock.POSIX(posixSecondsToUTCTime)
import Gargantext.API.Ngrams.Prelude (getTermList)
......@@ -33,12 +34,13 @@ import Gargantext.Core.NodeStory.Types (HasNodeStory, NodeStoryEnv, HasNodeStory
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, termsInText)
import Gargantext.Core.Types.Main (ListType(MapTerm))
import Gargantext.Core.Utils.DateUtils (UTCTimeR(..), defUTCTimeR, toUTCTime)
import Gargantext.Core.Viz.Phylo (TimeUnit(..), Date, Document(..), PhyloConfig(..), Phylo (_phylo_computeTime), trackComputeTime, ComputeTimeHistory)
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
import Gargantext.Core.Viz.Phylo.PhyloTools ({-printIOMsg, printIOComment,-} setConfig)
import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataCorpus(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..), unwrapHyperdataDateTime)
import Gargantext.Database.Admin.Types.Hyperdata.Phylo ( HyperdataPhylo(..) )
import Gargantext.Database.Admin.Types.Node (Context, CorpusId, ContextId, PhyloId, nodeId2ContextId)
import Gargantext.Database.Prelude
......@@ -54,6 +56,7 @@ import Prelude qualified
import System.FilePath ((</>))
import System.IO.Temp (withTempDirectory)
import System.Process qualified as Shell
import System.Directory (copyFile)
--------------------------------------------------------------------
getPhyloData :: HasNodeError err
......@@ -88,6 +91,10 @@ phylo2dot2json phylo = do
Shell.callProcess "dot" ["-Tdot", "-o", fileDot, fileFrom]
Shell.callProcess "dot" ["-Txdot_json", "-o", fileToJson, fileDot]
copyFile fileToJson "/home/przemek/phylo/output.json"
copyFile fileDot "/home/przemek/phylo/phylo.dot"
copyFile fileFrom "/home/przemek/phylo/phyloFrom.dot"
maybeValue <- decodeFileStrict fileToJson
-- print maybeValue
......@@ -155,14 +162,8 @@ toPhyloDocs :: Lang -> Patterns -> TimeUnit -> HyperdataDocument -> Document
toPhyloDocs lang patterns time d =
let title = fromMaybe "" (_hd_title d)
abstr = fromMaybe "" (_hd_abstract d)
in Document (toPhyloDate
(fromIntegral $ fromMaybe 1 $ _hd_publication_year d)
(fromMaybe 1 $ _hd_publication_month d)
(fromMaybe 1 $ _hd_publication_day d) time)
(toPhyloDate'
(fromIntegral $ fromMaybe 1 $ _hd_publication_year d)
(fromMaybe 1 $ _hd_publication_month d)
(fromMaybe 1 $ _hd_publication_day d) time)
in Document (toPhyloDate (unwrapHyperdataDateTime d) time)
(toPhyloDate' (unwrapHyperdataDateTime d) time)
(termsInText' lang patterns $ title <> " " <> abstr) Nothing [] time
......@@ -189,10 +190,8 @@ context2date :: Context HyperdataDocument -> TimeUnit -> Maybe (Date, Text)
context2date context timeUnit = do
let hyperdata = _context_hyperdata context
let
year = fromMaybe 1 $ _hd_publication_year hyperdata
month = fromMaybe 1 $ _hd_publication_month hyperdata
day = fromMaybe 1 $ _hd_publication_day hyperdata
pure (toPhyloDate year month day timeUnit, toPhyloDate' year month day timeUnit)
utcTimeR = unwrapHyperdataDateTime hyperdata
pure (toPhyloDate utcTimeR timeUnit, toPhyloDate' utcTimeR timeUnit)
---------------
......@@ -201,23 +200,30 @@ context2date context timeUnit = do
toMonths :: Integer -> Int -> Int -> Date
toMonths y m d = fromIntegral $ cdMonths
$ diffGregorianDurationClip (fromGregorian y m d)
(fromGregorian 0000 0 0)
(fromGregorian 0001 1 1)
toDays :: Integer -> Int -> Int -> Date
toDays y m d = fromIntegral
$ diffDays (fromGregorian y m d) (fromGregorian 0000 0 0)
toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
toPhyloDate y m d tu = case tu of
Year {} -> y
Month {} -> toMonths (Prelude.toInteger y) m d
Week {} -> div (toDays (Prelude.toInteger y) m d) 7
Day {} -> toDays (Prelude.toInteger y) m d
_ -> panic "[G.C.V.Phylo.API] toPhyloDate"
toPhyloDate' :: Int -> Int -> Int -> TimeUnit -> Text
toPhyloDate' y _m _d (Epoch {}) = pack $ show $ posixSecondsToUTCTime $ fromIntegral y
toPhyloDate' y m d _ = pack $ showGregorian $ fromGregorian (toInteger y) m d
$ diffDays (fromGregorian y m d) (fromGregorian 0001 1 1)
toHours :: UTCTimeR -> Date
toHours utcTimeR = floor diffSeconds `div` 3600
where
diffSeconds = diffUTCTime (toUTCTime utcTimeR) (toUTCTime defUTCTimeR)
-- | This must be in accordance with frontend's 'G.C.PE.Types' -> 'parseNodeDate'
toPhyloDate :: UTCTimeR -> TimeUnit -> Date
toPhyloDate utcTimeR@(UTCTimeR { .. }) tu = case tu of
Year {} -> year
Month {} -> toMonths (Prelude.toInteger year) month day
Week {} -> div (toDays (Prelude.toInteger year) month day) 7
Day {} -> toDays (Prelude.toInteger year) month day
Hour {} -> toHours utcTimeR
_ -> panic "[G.C.V.P.API.Tools] toPhyloDate"
toPhyloDate' :: UTCTimeR -> TimeUnit -> Text
toPhyloDate' (UTCTimeR { .. }) (Epoch {}) = pack $ show $ posixSecondsToUTCTime $ fromIntegral year
toPhyloDate' (UTCTimeR { .. }) _ = pack $ showGregorian $ fromGregorian (toInteger year) month day
-- Utils
......
......@@ -672,7 +672,7 @@ toPhyloExport phylo = exportToDot phylo
$ processMetrics phylo export
where
export :: PhyloExport
export = PhyloExport groups branches
export = PhyloExport { _export_groups = groups, _export_branches = branches }
--------------------------------------
branches :: [PhyloBranch]
branches = map (\g ->
......
......@@ -175,6 +175,7 @@ getTimeScale p = case timeUnit $ getConfig p of
Month {} -> "month"
Week {} -> "week"
Day {} -> "day"
Hour {} -> "hour"
-- | Get a regular & ascendante timeScale from a given list of dates
......@@ -191,6 +192,7 @@ getTimeStep time = case time of
Month { .. } -> _month_step
Week { .. } -> _week_step
Day { .. } -> _day_step
Hour { .. } -> _hour_step
getTimePeriod :: TimeUnit -> Int
getTimePeriod time = case time of
......@@ -199,6 +201,7 @@ getTimePeriod time = case time of
Month { .. } -> _month_period
Week { .. } -> _week_period
Day { .. } -> _day_period
Hour { .. } -> _hour_period
getTimeFrame :: TimeUnit -> Int
getTimeFrame time = case time of
......@@ -207,6 +210,7 @@ getTimeFrame time = case time of
Month { .. } -> _month_matchingFrame
Week { .. } -> _week_matchingFrame
Day { .. } -> _day_matchingFrame
Hour { .. } -> _hour_matchingFrame
-------------
-- | Fis | --
......
......@@ -17,6 +17,8 @@ import Gargantext.Prelude hiding (ByteString)
import Gargantext.Core.Text (HasText(..))
import Gargantext.Core.Utils.Prefix (unCapitalize, dropPrefix)
import Gargantext.Database.Admin.Types.Hyperdata.Prelude
import Gargantext.Defaults qualified as Def
import Gargantext.Core.Utils.DateUtils (UTCTimeR(..))
import Codec.Serialise.Class hiding (decode)
------------------------------------------------------------------------
......@@ -51,11 +53,24 @@ instance HasText HyperdataDocument
]
emptyHyperdataDocument :: HyperdataDocument
emptyHyperdataDocument = HyperdataDocument Nothing Nothing
Nothing Nothing Nothing Nothing
Nothing Nothing Nothing Nothing
Nothing Nothing Nothing Nothing
Nothing Nothing Nothing Nothing
emptyHyperdataDocument = HyperdataDocument { _hd_bdd = Nothing
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Nothing
, _hd_authors = Nothing
, _hd_institutes = Nothing
, _hd_source = Nothing
, _hd_abstract = Nothing
, _hd_publication_date = Nothing
, _hd_publication_year = Nothing
, _hd_publication_month = Nothing
, _hd_publication_day = Nothing
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
defaultHyperdataDocument :: HyperdataDocument
defaultHyperdataDocument = case decode docExample of
......@@ -66,6 +81,16 @@ defaultHyperdataDocument = case decode docExample of
docExample :: ByteString
docExample = "{\"doi\":\"sdfds\",\"publication_day\":6,\"language_iso2\":\"en\",\"publication_minute\":0,\"publication_month\":7,\"language_iso3\":\"eng\",\"publication_second\":0,\"authors\":\"Nils Hovdenak, Kjell Haram\",\"publication_year\":2012,\"publication_date\":\"2012-07-06 00:00:00+00:00\",\"language_name\":\"English\",\"realdate_full_\":\"2012 01 12\",\"source\":\"European journal of obstetrics, gynecology, and reproductive biology\",\"abstract\":\"The literature was searched for publications on minerals and vitamins during pregnancy and the possible influence of supplements on pregnancy outcome.\",\"title\":\"Influence of mineral and vitamin supplements on pregnancy outcome.\",\"publication_hour\":0}"
unwrapHyperdataDateTime :: HyperdataDocument -> UTCTimeR
unwrapHyperdataDateTime HyperdataDocument { .. } =
UTCTimeR { year = fromMaybe (fromIntegral Def.year) _hd_publication_year
, month = fromMaybe Def.month _hd_publication_month
, day = fromMaybe Def.day _hd_publication_day
, hour = fromMaybe Def.hour _hd_publication_hour
, minute = fromMaybe Def.minute _hd_publication_minute
, sec = fromMaybe Def.second _hd_publication_second }
------------------------------------------------------------------------
-- | Legacy Garg V3 compatibility (to be removed one year after release)
data StatusV3 = StatusV3 { statusV3_error :: !(Maybe Text)
......
......@@ -65,6 +65,7 @@ import Database.PostgreSQL.Simple.SqlQQ ( sql )
import Database.PostgreSQL.Simple.ToField (toField, Action{-, ToField-})
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Gargantext.Core (HasDBid(toDBid))
import Gargantext.Core.Utils.DateUtils (UTCTimeR(..), toUTCTime)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Database.Admin.Types.Node
......@@ -274,20 +275,42 @@ class ToNode a
toNode :: HasDBid NodeType => UserId -> Maybe ParentId -> a -> Node a
instance ToNode HyperdataDocument where
toNode u p h = Node 0 Nothing (toDBid NodeDocument) u p n date h
toNode u p h = Node { _node_id = 0
, _node_hash_id = Nothing
, _node_typename = toDBid NodeDocument
, _node_user_id = u
, _node_parent_id = p
, _node_name = n
, _node_date = date
, _node_hyperdata = h }
where
n = maybe "No Title" (DT.take 255) (_hd_title h)
date = jour y m d
date = toUTCTime $ UTCTimeR { year = y
, month = m
, day = d
, hour = hh
, minute = mm
, sec = ss }
-- NOTE: There is no year '0' in postgres, there is year 1 AD and beofre that year 1 BC:
-- select '0001-01-01'::date, '0001-01-01'::date - '1 day'::interval;
-- 0001-01-01 0001-12-31 00:00:00 BC
y = fromIntegral $ fromMaybe Defaults.day $ _hd_publication_year h
m = fromMaybe Defaults.month $ _hd_publication_month h
d = fromMaybe (fromIntegral Defaults.year) $ _hd_publication_day h
hh = fromMaybe (fromIntegral Defaults.hour) $ _hd_publication_hour h
mm = fromMaybe (fromIntegral Defaults.minute) $ _hd_publication_minute h
ss = fromMaybe (fromIntegral Defaults.second) $ _hd_publication_second h
-- TODO better Node
instance ToNode HyperdataContact where
toNode u p = Node 0 Nothing (toDBid NodeContact) u p "Contact" date
toNode u p h = Node { _node_id = 0
, _node_hash_id = Nothing
, _node_typename = toDBid NodeContact
, _node_user_id = u
, _node_parent_id = p
, _node_name = "Contact"
, _node_date = date
, _node_hyperdata = h }
where
date = jour 2020 01 01
......
......@@ -22,6 +22,7 @@ import Test.Core.Orchestrator qualified as Orchestrator
import Test.Core.Similarity qualified as Similarity
import Test.Core.Text.Corpus.Query qualified as CorpusQuery
import Test.Core.Text.Corpus.TSV qualified as TSVParser
import Test.Core.Phylo qualified as CorePhylo
import Test.Core.Utils qualified as Utils
import Test.Core.Worker qualified as Worker
import Test.Graph.Clustering qualified as Clustering
......@@ -122,6 +123,7 @@ main = do
DBT.tests
DB.nodeStoryTests
describe "Utils" $ Utils.test
describe "CorePhylo" $ CorePhylo.test
describe "Graph Clustering" $ Clustering.test
describe "Graph Distance" $ Distance.test
describe "Date split" $ PD.testDateSplit
......
{-|
Module : Core.Phylo
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Test.Core.Phylo where
import Gargantext.Core.Utils.DateUtils (UTCTimeR(..))
import Gargantext.Core.Viz.Phylo.API.Tools (toMonths, toDays, toHours)
import Gargantext.Prelude
import Test.Hspec
import Test.Instances ()
-- | Core.Utils tests
test :: Spec
test = do
describe "phylo tools works" $ do
it "toMonths works" $ do
toMonths 2025 1 1 `shouldBe` 24300
toMonths 2025 2 1 `shouldBe` 24301
toMonths 2025 4 1 `shouldBe` 24303
it "toDays works" $ do
toDays 2025 1 1 `shouldBe` 739617
toDays 2025 1 2 `shouldBe` 739618
toDays 2025 1 4 `shouldBe` 739620
it "toHours works" $ do
toHours (UTCTimeR { year = 2025, month = 1, day = 1, hour = 0, minute = 0, sec = 0 }) `shouldBe` 17742024
toHours (UTCTimeR { year = 2025, month = 1, day = 1, hour = 1, minute = 0, sec = 0 }) `shouldBe` 17742025
toHours (UTCTimeR { year = 2025, month = 1, day = 1, hour = 3, minute = 0, sec = 0 }) `shouldBe` 17742027
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment