Main.hs 14.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
{-|
Module      : Main.hs
Description : Gargantext starter binary with Adaptative Phylo
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Adaptative Phylo binaries
 -}

{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators      #-}
{-# LANGUAGE Strict             #-}

module Main where

19 20
import Control.Concurrent.Async (mapConcurrently)
import Crypto.Hash.SHA256 (hash)
21
import Data.Aeson
22
import Data.Either (Either(..))
qlobbe's avatar
qlobbe committed
23
import Data.List  (concat, nub, isSuffixOf)
24
import Data.Maybe (fromMaybe)
25
import Data.String (String)
26 27 28 29
import GHC.IO (FilePath) 
import qualified Prelude as Prelude
import System.Environment
import System.Directory (listDirectory,doesFileExist)
qlobbe's avatar
qlobbe committed
30
import Data.Text  (Text, unwords, unpack, replace, pack)
31 32 33 34 35 36
import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)

import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Data.Text as T
37 38

import Gargantext.Prelude
39
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
40
import Gargantext.Core.Text.Context (TermList)
qlobbe's avatar
qlobbe committed
41 42
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
  csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
43
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
44
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
45
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
46
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
47
import Gargantext.Core.Viz.AdaptativePhylo
qlobbe's avatar
qlobbe committed
48 49
import Gargantext.Core.Viz.Phylo.PhyloMaker  (toPhylo, toPhyloStep)
import Gargantext.Core.Viz.Phylo.PhyloTools  (printIOMsg, printIOComment, setConfig)
50
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
qlobbe's avatar
qlobbe committed
51
-- import Gargantext.API.Ngrams.Prelude (toTermList)
52

qlobbe's avatar
qlobbe committed
53
-- import Debug.Trace (trace)
54

qlobbe's avatar
qlobbe committed
55
data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
qlobbe's avatar
qlobbe committed
56

57 58 59 60 61
---------------
-- | Tools | --
---------------


qlobbe's avatar
qlobbe committed
62 63 64 65
-- | To get all the files in a directory or just a file
getFilesFromPath :: FilePath -> IO([FilePath])
getFilesFromPath path = do 
  if (isSuffixOf "/" path) 
qlobbe's avatar
qlobbe committed
66
    then (listDirectory path)
qlobbe's avatar
qlobbe committed
67 68
    else return [path]

qlobbe's avatar
qlobbe committed
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
---------------
-- | Dates | --
---------------    


toMonths :: Integer -> Int -> Int -> Date
toMonths y m d = fromIntegral $ cdMonths 
              $ diffGregorianDurationClip (fromGregorian y m d) (fromGregorian 0000 0 0)


toDays :: Integer -> Int -> Int -> Date
toDays y m d = fromIntegral 
             $ diffDays (fromGregorian y m d) (fromGregorian 0000 0 0)


toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
toPhyloDate y m d tu = case tu of 
  Year  _ _ _ -> y
87 88 89
  Month _ _ _ -> toMonths (Prelude.toInteger y) m d
  Week  _ _ _ -> div (toDays (Prelude.toInteger y) m d) 7
  Day   _ _ _ -> toDays (Prelude.toInteger y) m d
qlobbe's avatar
qlobbe committed
90 91 92


toPhyloDate' :: Int -> Int -> Int -> Text
93
toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (Prelude.toInteger y) m d
qlobbe's avatar
qlobbe committed
94 95


qlobbe's avatar
qlobbe committed
96 97 98 99 100
--------------
-- | Json | --
--------------


101
-- | To read and decode a Json file
qlobbe's avatar
qlobbe committed
102
readJson :: FilePath -> IO Lazy.ByteString
103 104
readJson path = Lazy.readFile path

qlobbe's avatar
qlobbe committed
105 106 107 108 109

----------------
-- | Parser | --
----------------

110
-- | To filter the Ngrams of a document based on the termList
qlobbe's avatar
qlobbe committed
111 112 113 114 115
termsInText :: Patterns -> Text -> [Text]
termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt


-- | To transform a Wos file (or [file]) into a list of Docs
116
wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
qlobbe's avatar
qlobbe committed
117
wosToDocs limit patterns time path = do 
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
  files <- getFilesFromPath path
  let parseFile' file = do
        eParsed <- parseFile WOS (path <> file)
        case eParsed of
          Right ps -> pure ps
          Left e   -> panic $ "Error: " <> (pack e)
  take limit
    <$> map (\d -> let title = fromJust $ _hd_title d
                       abstr = if (isJust $ _hd_abstract d)
                               then fromJust $ _hd_abstract d
                               else ""
                    in Document (toPhyloDate
                                  (fromIntegral $ fromJust $ _hd_publication_year d) 
                                  (fromJust $ _hd_publication_month d) 
                                  (fromJust $ _hd_publication_day d) time)  
                                (toPhyloDate'
                                  (fromIntegral $ fromJust $ _hd_publication_year d) 
                                  (fromJust $ _hd_publication_month d) 
                                  (fromJust $ _hd_publication_day d)) 
                                (termsInText patterns $ title <> " " <> abstr) Nothing []) 
    <$> concat 
    <$> mapConcurrently (\file -> 
          filter (\d -> (isJust $ _hd_publication_year d)
                     && (isJust $ _hd_title d))
             <$> parseFile' file) files
qlobbe's avatar
qlobbe committed
143 144


qlobbe's avatar
qlobbe committed
145
-- To transform a Csv file into a list of Document
146
csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
qlobbe's avatar
qlobbe committed
147
csvToDocs parser patterns time path = 
qlobbe's avatar
qlobbe committed
148
  case parser of
qlobbe's avatar
qlobbe committed
149
    Wos  _     -> undefined
150 151 152 153 154 155
    Csv  limit -> do
      eR <- Csv.readFile path
      case eR of
        Right r ->
          pure $ Vector.toList
            $ Vector.take limit
156 157 158 159 160 161 162
            $ Vector.map (\row -> Document (toPhyloDate  (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
                                                         (fromMaybe Csv.defaultMonth $ csv_publication_month row)
                                                         (fromMaybe Csv.defaultDay $ csv_publication_day row)
                                                         time)
                                           (toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
                                                         (fromMaybe Csv.defaultMonth $ csv_publication_month row)
                                                         (fromMaybe Csv.defaultDay $ csv_publication_day row))
163 164 165 166 167
                                           (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
                                           Nothing
                                           []
                         ) $ snd r
        Left e -> panic $ "Error: " <> (pack e)
qlobbe's avatar
qlobbe committed
168
    Csv' limit -> Vector.toList
qlobbe's avatar
qlobbe committed
169
      <$> Vector.take limit
qlobbe's avatar
qlobbe committed
170 171 172 173 174
      <$> Vector.map (\row -> Document (toPhyloDate  (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
                                       (toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row))
                                       (termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
                                       (Just $ csv'_weight row)
                                       [csv'_source row]
qlobbe's avatar
qlobbe committed
175 176 177 178
                     ) <$> snd <$> Csv.readWeightedCsv path


-- To parse a file into a list of Document
qlobbe's avatar
qlobbe committed
179 180
fileToDocs' :: CorpusParser -> FilePath -> TimeUnit -> TermList -> IO [Document]
fileToDocs' parser path time lst = do
qlobbe's avatar
qlobbe committed
181 182
  let patterns = buildPatterns lst
  case parser of 
qlobbe's avatar
qlobbe committed
183 184 185
      Wos limit  -> wosToDocs limit  patterns time path
      Csv  _     -> csvToDocs parser patterns time path
      Csv' _     -> csvToDocs parser patterns time path
186 187


qlobbe's avatar
qlobbe committed
188 189 190
---------------
-- | Label | --
---------------
191

qlobbe's avatar
qlobbe committed
192 193 194 195

-- Config time parameters to label
timeToLabel :: Config -> [Char]
timeToLabel config = case (timeUnit config) of
qlobbe's avatar
qlobbe committed
196 197 198 199
      Year  p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
      Month p s f -> ("time_months"<> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
      Week  p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
      Day   p s f -> ("time_days"  <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
qlobbe's avatar
qlobbe committed
200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225


seaToLabel :: Config -> [Char]
seaToLabel config = case (seaElevation config) of
      Constante start step   -> ("sea_cst_"  <> (show start) <> "_" <> (show step))
      Adaptative granularity -> ("sea_adapt" <> (show granularity))


sensToLabel :: Config -> [Char]
sensToLabel config = case (phyloProximity config) of
      Hamming -> undefined
      WeightedLogJaccard s -> ("WeightedLogJaccard_"  <> show s)     
      WeightedLogSim s -> ( "WeightedLogSim-sens_"  <> show s)


cliqueToLabel :: Config -> [Char]
cliqueToLabel config = case (clique config) of
      Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
      MaxClique s t f ->  "clique_" <> (show s)<> "_"  <> (show f)<> "_"  <> (show t)


syncToLabel :: Config -> [Char]
syncToLabel config = case (phyloSynchrony config) of
      ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens)  <> "_"  <> (show scl))
      ByProximityDistribution _ _ -> undefined

qlobbe's avatar
qlobbe committed
226 227 228 229
qualToConfig :: Config -> [Char]
qualToConfig config = case (phyloQuality config) of
      Quality g m -> "quality_" <> (show g) <> "_" <> (show m)     

qlobbe's avatar
qlobbe committed
230 231 232 233 234 235 236 237 238 239 240 241 242 243

-- To set up the export file's label from the configuration
configToLabel :: Config -> [Char]
configToLabel config = outputPath config
                    <> (unpack $ phyloName config)
                    <> "-" <> (timeToLabel config)
                    <> "-scale_" <> (show (phyloLevel config))
                    <> "-" <> (seaToLabel config)
                    <> "-" <> (sensToLabel config)
                    <> "-" <> (cliqueToLabel config)
                    <> "-level_" <> (show (_qua_granularity $ phyloQuality config))
                    <> "-" <> (syncToLabel config)
                    <> ".dot"

qlobbe's avatar
qlobbe committed
244

qlobbe's avatar
qlobbe committed
245 246 247 248 249
-- To write a sha256 from a set of config's parameters
configToSha :: PhyloStage -> Config -> [Char]
configToSha stage config = unpack 
                         $ replace "/" "-" 
                         $ T.pack (show (hash $ C8.pack label))
qlobbe's avatar
qlobbe committed
250 251
  where 
    label :: [Char]
qlobbe's avatar
qlobbe committed
252 253 254 255 256 257 258 259 260 261 262 263 264 265
    label = case stage of
      PhyloWithCliques -> (corpusPath    config)
                       <> (listPath      config)
                       <> (timeToLabel   config)
                       <> (cliqueToLabel config)
      PhyloWithLinks   -> (corpusPath    config)
                       <> (listPath      config)
                       <> (timeToLabel   config)
                       <> (cliqueToLabel config)
                       <> (sensToLabel   config)
                       <> (seaToLabel    config)
                       <> (syncToLabel   config)
                       <> (qualToConfig  config)
                       <> (show (phyloLevel config))
qlobbe's avatar
qlobbe committed
266 267 268 269 270 271 272 273


writePhylo :: [Char] -> Phylo -> IO ()
writePhylo path phylo = Lazy.writeFile path $ encode phylo


readPhylo :: [Char] -> IO Phylo
readPhylo path = do
qlobbe's avatar
qlobbe committed
274 275
  phyloJson <- (eitherDecode <$> readJson path) :: IO (Either String Phylo)
  case phyloJson of 
qlobbe's avatar
qlobbe committed
276 277 278 279 280
    Left err -> do
      putStrLn err
      undefined
    Right phylo -> pure phylo 

281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300

--------------
-- | Main | --
--------------   


main :: IO ()
main = do

    printIOMsg "Starting the reconstruction"

    printIOMsg "Read the configuration file"
    [args]   <- getArgs
    jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)

    case jsonArgs of
        Left err     -> putStrLn err
        Right config -> do

            printIOMsg "Parse the corpus"
301
            mapList <- csvMapTermList (listPath config)
qlobbe's avatar
qlobbe committed
302
            corpus  <- fileToDocs' (corpusParser config) (corpusPath config) (timeUnit config) mapList
303 304
            printIOComment (show (length corpus) <> " parsed docs from the corpus")

qlobbe's avatar
qlobbe committed
305 306
            printIOMsg "Reconstruct the phylo"

qlobbe's avatar
qlobbe committed
307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
            let phyloWithCliquesFile =  (outputPath config) <> "phyloWithCliques_" <> (configToSha PhyloWithCliques config) <> ".json"
            let phyloWithLinksFile   =  (outputPath config) <> "phyloWithLinks_"   <> (configToSha PhyloWithLinks   config) <> ".json"

            phyloWithCliquesExists <- doesFileExist phyloWithCliquesFile
            phyloWithLinksExists   <- doesFileExist phyloWithLinksFile

            -- phyloStep <- if phyloWithCliquesExists
            --                   then do
            --                     printIOMsg "Reconstruct the phylo step from an existing file"
            --                     readPhylo phyloWithCliquesFile
            --                   else do
            --                     printIOMsg "Reconstruct the phylo step from scratch"
            --                     pure $ toPhyloStep corpus mapList config

            -- writePhylo phyloWithCliquesFile phyloStep
qlobbe's avatar
qlobbe committed
322

qlobbe's avatar
qlobbe committed
323
            -- let phylo = toPhylo (setConfig config phyloStep)
qlobbe's avatar
qlobbe committed
324

qlobbe's avatar
qlobbe committed
325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
            phyloWithLinks <- if phyloWithLinksExists
                                  then do 
                                    printIOMsg "Reconstruct the phylo from an existing file with intertemporal links"
                                    readPhylo phyloWithLinksFile
                                  else do 
                                    if phyloWithCliquesExists
                                      then do 
                                        printIOMsg "Reconstruct the phylo from an existing file with cliques"
                                        phyloWithCliques <- readPhylo phyloWithCliquesFile
                                        writePhylo phyloWithCliquesFile phyloWithCliques
                                        pure $ toPhylo (setConfig config phyloWithCliques)
                                      else do 
                                        printIOMsg "Reconstruct the phylo from scratch"
                                        phyloWithCliques <- pure $ toPhyloStep corpus mapList config
                                        writePhylo phyloWithCliquesFile phyloWithCliques
                                        pure $ toPhylo (setConfig config phyloWithCliques)
qlobbe's avatar
qlobbe committed
341

qlobbe's avatar
qlobbe committed
342
            writePhylo phyloWithLinksFile phyloWithLinks                                        
qlobbe's avatar
qlobbe committed
343

344

345
            -- probes
qlobbe's avatar
qlobbe committed
346 347 348 349 350 351 352

            -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt") 
            --          $ synchronicDistance' phylo 1

            -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt") 
            --         $ inflexionPoints phylo 1                    

qlobbe's avatar
qlobbe committed
353 354
            printIOMsg "End of reconstruction, start the export"

qlobbe's avatar
qlobbe committed
355
            let dot = toPhyloExport (setConfig config phyloWithLinks) 
qlobbe's avatar
qlobbe committed
356 357
                  
            let output = configToLabel config
qlobbe's avatar
qlobbe committed
358

359
            dotToFile output dot