Commit 67c70cdb authored by qlobbe's avatar qlobbe

adaptative time scale with phylo 1click

parent 03327f23
...@@ -21,7 +21,7 @@ import Control.Concurrent.Async (mapConcurrently) ...@@ -21,7 +21,7 @@ import Control.Concurrent.Async (mapConcurrently)
import Crypto.Hash.SHA256 (hash) import Crypto.Hash.SHA256 (hash)
import Data.Aeson import Data.Aeson
import Data.Either (Either(..), fromRight) import Data.Either (Either(..), fromRight)
import Data.List (concat, nub, isSuffixOf) import Data.List (concat, nub, isSuffixOf,sort,tail)
import Data.List.Split import Data.List.Split
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Data.String (String) import Data.String (String)
...@@ -39,7 +39,7 @@ import Gargantext.Core.Viz.Phylo ...@@ -39,7 +39,7 @@ import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.API.Tools import Gargantext.Core.Viz.Phylo.API.Tools
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile) import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink) import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig) import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig, toPeriods, getTimePeriod, getTimeStep)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude import Gargantext.Prelude
...@@ -89,7 +89,7 @@ wosToDocs limit patterns time path = do ...@@ -89,7 +89,7 @@ wosToDocs limit patterns time path = do
(fromIntegral $ fromJust $ _hd_publication_year d) (fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d) (fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d) time) (fromJust $ _hd_publication_day d) time)
(termsInText patterns $ title <> " " <> abstr) Nothing []) (termsInText patterns $ title <> " " <> abstr) Nothing [] time)
<$> concat <$> concat
<$> mapConcurrently (\file -> <$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d) filter (\d -> (isJust $ _hd_publication_year d)
...@@ -109,6 +109,7 @@ csvToDocs parser patterns time path = ...@@ -109,6 +109,7 @@ csvToDocs parser patterns time path =
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row)) (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing Nothing
[] []
time
) <$> snd <$> either (\err -> panic $ cs $ "CSV error" <> (show err)) identity <$> Csv.readCSVFile path ) <$> snd <$> either (\err -> panic $ cs $ "CSV error" <> (show err)) identity <$> Csv.readCSVFile path
Csv' limit -> Vector.toList Csv' limit -> Vector.toList
<$> Vector.take limit <$> Vector.take limit
...@@ -117,18 +118,35 @@ csvToDocs parser patterns time path = ...@@ -117,18 +118,35 @@ csvToDocs parser patterns time path =
(termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row)) (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
(Just $ csv'_weight row) (Just $ csv'_weight row)
(map (T.strip . pack) $ splitOn ";" (unpack $ (csv'_source row))) (map (T.strip . pack) $ splitOn ";" (unpack $ (csv'_source row)))
time
) <$> snd <$> Csv.readWeightedCsv path ) <$> snd <$> Csv.readWeightedCsv path
-- To parse a file into a list of Document -- To parse a file into a list of Document
fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document] fileToDocsAdvanced :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
fileToDocs' parser path time lst = do fileToDocsAdvanced parser path time lst = do
let patterns = buildPatterns lst let patterns = buildPatterns lst
case parser of case parser of
Wos limit -> wosToDocs limit patterns time path Wos limit -> wosToDocs limit patterns time path
Csv _ -> csvToDocs parser patterns time path Csv _ -> csvToDocs parser patterns time path
Csv' _ -> csvToDocs parser patterns time path Csv' _ -> csvToDocs parser patterns time path
fileToDocsDefault :: CorpusParser -> FilePath -> [TimeUnit] -> TermList -> IO [Document]
fileToDocsDefault parser path timeUnits lst =
if length timeUnits > 0
then
do
let timeUnit = (head' "fileToDocsDefault" timeUnits)
docs <- fileToDocsAdvanced parser path timeUnit lst
let periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod timeUnit) (getTimeStep timeUnit)
if (length periods < 3)
then fileToDocsDefault parser path (tail timeUnits) lst
else pure docs
else panic "this corpus is incompatible with the phylomemy reconstruction"
-- on passe à passer la time unit dans la conf envoyé au phyloMaker
-- dans le phyloMaker si default est true alors dans le setDefault ou pense à utiliser la TimeUnit de la conf
--------------- ---------------
-- | Label | -- -- | Label | --
...@@ -251,7 +269,11 @@ main = do ...@@ -251,7 +269,11 @@ main = do
printIOMsg "Parse the corpus" printIOMsg "Parse the corpus"
mapList <- fileToList (listParser config) (listPath config) mapList <- fileToList (listParser config) (listPath config)
corpus <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
corpus <- if (defaultMode config)
then fileToDocsDefault (corpusParser config) (corpusPath config) [Year 3 1 5,Month 3 1 5,Week 4 2 5] mapList
else fileToDocsAdvanced (corpusParser config) (corpusPath config) (timeUnit config) mapList
printIOComment (show (length corpus) <> " parsed docs from the corpus") printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOComment (show (length $ nub $ concat $ map text corpus) <> " Size ngs_coterms") printIOComment (show (length $ nub $ concat $ map text corpus) <> " Size ngs_coterms")
......
...@@ -135,7 +135,7 @@ data TimeUnit = ...@@ -135,7 +135,7 @@ data TimeUnit =
{ _day_period :: Int { _day_period :: Int
, _day_step :: Int , _day_step :: Int
, _day_matchingFrame :: Int } , _day_matchingFrame :: Int }
deriving (Show,Generic,Eq) deriving (Show,Generic,Eq,NFData)
instance ToSchema TimeUnit where instance ToSchema TimeUnit where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
...@@ -354,6 +354,7 @@ data Document = Document ...@@ -354,6 +354,7 @@ data Document = Document
, text :: [Ngrams] , text :: [Ngrams]
, weight :: Maybe Double , weight :: Maybe Double
, sources :: [Text] , sources :: [Text]
, docTime :: TimeUnit
} deriving (Eq,Show,Generic,NFData) } deriving (Eq,Show,Generic,NFData)
......
...@@ -128,7 +128,7 @@ context2phyloDocument timeUnit context (ngs_terms, ngs_sources) = do ...@@ -128,7 +128,7 @@ context2phyloDocument timeUnit context (ngs_terms, ngs_sources) = do
text' = maybe [] toText $ Map.lookup contextId ngs_terms text' = maybe [] toText $ Map.lookup contextId ngs_terms
sources' = maybe [] toText $ Map.lookup contextId ngs_sources sources' = maybe [] toText $ Map.lookup contextId ngs_sources
pure $ Document date date' text' Nothing sources' pure $ Document date date' text' Nothing sources' (Year 3 1 5)
context2date :: Context HyperdataDocument -> TimeUnit -> Maybe (Date, Text) context2date :: Context HyperdataDocument -> TimeUnit -> Maybe (Date, Text)
......
...@@ -111,6 +111,7 @@ docs = map (\(d,t) ...@@ -111,6 +111,7 @@ docs = map (\(d,t)
(filter (\n -> isRoots n (foundations ^. foundations_roots)) $ monoTexts t) (filter (\n -> isRoots n (foundations ^. foundations_roots)) $ monoTexts t)
Nothing Nothing
[] []
(Year 3 1 5)
) corpus ) corpus
......
...@@ -489,15 +489,15 @@ initPhyloScales lvlMax pId = ...@@ -489,15 +489,15 @@ initPhyloScales lvlMax pId =
fromList $ map (\lvl -> ((pId,lvl),PhyloScale pId ("","") lvl empty)) [1..lvlMax] fromList $ map (\lvl -> ((pId,lvl),PhyloScale pId ("","") lvl empty)) [1..lvlMax]
setDefault :: PhyloConfig -> PhyloConfig setDefault :: PhyloConfig -> TimeUnit -> PhyloConfig
setDefault conf = conf { setDefault conf timeScale = conf {
phyloScale = 2, phyloScale = 2,
similarity = WeightedLogJaccard 0.5 2, similarity = WeightedLogJaccard 0.5 2,
findAncestors = True, findAncestors = True,
phyloSynchrony = ByProximityThreshold 0.6 0 SiblingBranches MergeAllGroups, phyloSynchrony = ByProximityThreshold 0.6 0 SiblingBranches MergeAllGroups,
phyloQuality = Quality 0.5 3, phyloQuality = Quality 0.5 3,
timeUnit = Year 3 1 3, timeUnit = timeScale,
clique = MaxClique 5 30 ByNeighbours, clique = Fis 3 5,
exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2], exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2],
exportSort = ByHierarchy Desc, exportSort = ByHierarchy Desc,
exportFilter = [ByBranchSize 3] exportFilter = [ByBranchSize 3]
...@@ -509,6 +509,7 @@ setDefault conf = conf { ...@@ -509,6 +509,7 @@ setDefault conf = conf {
initPhylo :: [Document] -> PhyloConfig -> Phylo initPhylo :: [Document] -> PhyloConfig -> Phylo
initPhylo docs conf = initPhylo docs conf =
let roots = Vector.fromList $ nub $ concat $ map text docs let roots = Vector.fromList $ nub $ concat $ map text docs
timeScale = head' "initPhylo" $ map docTime docs
foundations = PhyloFoundations roots empty foundations = PhyloFoundations roots empty
docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs) docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
docsCounts = PhyloCounts (docsToTimeScaleCooc docs (foundations ^. foundations_roots)) docsCounts = PhyloCounts (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
...@@ -516,11 +517,11 @@ initPhylo docs conf = ...@@ -516,11 +517,11 @@ initPhylo docs conf =
(docsToTimeTermCount docs (foundations ^. foundations_roots)) (docsToTimeTermCount docs (foundations ^. foundations_roots))
(docsToTermCount docs (foundations ^. foundations_roots)) (docsToTermCount docs (foundations ^. foundations_roots))
(docsToTermFreq docs (foundations ^. foundations_roots)) (docsToTermFreq docs (foundations ^. foundations_roots))
(docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots)) (docsToLastTermFreq (getTimePeriod timeScale) docs (foundations ^. foundations_roots))
params = if (defaultMode conf) params = if (defaultMode conf)
then defaultPhyloParam { _phyloParam_config = setDefault conf } then defaultPhyloParam { _phyloParam_config = setDefault conf timeScale }
else defaultPhyloParam { _phyloParam_config = conf } else defaultPhyloParam { _phyloParam_config = conf }
periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf) periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod timeScale) (getTimeStep timeScale)
in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n") in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n")
$ Phylo foundations $ Phylo foundations
docsSources docsSources
...@@ -529,4 +530,4 @@ initPhylo docs conf = ...@@ -529,4 +530,4 @@ initPhylo docs conf =
params params
(fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods) (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods)
0 0
(_qua_granularity $ phyloQuality $ conf) (_qua_granularity $ phyloQuality $ _phyloParam_config params)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment