Main.hs 5.69 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
{-|
Module      : Main.hs
Description : Gargantext starter binary with Adaptative Phylo
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Adaptative Phylo binaries
 -}

{-# LANGUAGE DataKinds          #-}
{-# LANGUAGE DeriveGeneric      #-}
{-# LANGUAGE FlexibleInstances  #-}
{-# LANGUAGE NoImplicitPrelude  #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators      #-}
{-# LANGUAGE OverloadedStrings  #-}
{-# LANGUAGE Strict             #-}

module Main where

import Data.Aeson
import Data.ByteString.Lazy (ByteString)
qlobbe's avatar
qlobbe committed
26 27
import Data.Maybe (isJust, fromJust)
import Data.List  (concat, nub, isSuffixOf, take)
28
import Data.String (String)
qlobbe's avatar
qlobbe committed
29
import Data.Text  (Text, unwords, unpack)
30 31

import Gargantext.Prelude
32
import Gargantext.Database.Admin.Types.Node (HyperdataDocument(..))
33 34
import Gargantext.Text.Context (TermList)
import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
qlobbe's avatar
qlobbe committed
35
import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
36 37 38
import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Viz.AdaptativePhylo
qlobbe's avatar
qlobbe committed
39 40 41
import Gargantext.Viz.Phylo.PhyloMaker  (toPhylo)
import Gargantext.Viz.Phylo.PhyloTools  (printIOMsg, printIOComment)
import Gargantext.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
qlobbe's avatar
qlobbe committed
42
-- import Gargantext.Viz.Phylo.SynchronicClustering (synchronicDistance')
43 44 45 46

import GHC.IO (FilePath) 
import Prelude (Either(..))
import System.Environment
qlobbe's avatar
qlobbe committed
47 48
import System.Directory (listDirectory)
import Control.Concurrent.Async (mapConcurrently)
49 50 51 52 53 54 55 56 57 58 59

import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv


---------------
-- | Tools | --
---------------


qlobbe's avatar
qlobbe committed
60 61 62 63 64 65 66 67 68 69 70 71 72
-- | To get all the files in a directory or just a file
getFilesFromPath :: FilePath -> IO([FilePath])
getFilesFromPath path = do 
  if (isSuffixOf "/" path) 
    then (listDirectory path) 
    else return [path]


--------------
-- | Json | --
--------------


73 74 75 76
-- | To read and decode a Json file
readJson :: FilePath -> IO ByteString
readJson path = Lazy.readFile path

qlobbe's avatar
qlobbe committed
77 78 79 80 81

----------------
-- | Parser | --
----------------

82 83 84 85 86 87 88 89 90 91
-- | To filter the Ngrams of a document based on the termList
filterTerms :: Patterns -> (a, Text) -> (a, [Text])
filterTerms patterns (y,d) = (y,termsInText patterns d)
  where
    --------------------------------------
    termsInText :: Patterns -> Text -> [Text]
    termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
    --------------------------------------


qlobbe's avatar
qlobbe committed
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
-- | To transform a Wos file (or [file]) into a readable corpus
wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
wosToCorpus limit path = do 
      files <- getFilesFromPath path
      take limit
        <$> map (\d -> let date' = fromJust $ _hyperdataDocument_publication_year d
                           title = fromJust $ _hyperdataDocument_title d
                           abstr = if (isJust $ _hyperdataDocument_abstract d)
                                   then fromJust $ _hyperdataDocument_abstract d
                                   else ""
                        in (date', title <> " " <> abstr)) 
        <$> concat 
        <$> mapConcurrently (\file -> 
              filter (\d -> (isJust $ _hyperdataDocument_publication_year d)
                         && (isJust $ _hyperdataDocument_title d))
                <$> parseFile WOS (path <> file) ) files


-- | To transform a Csv file into a readable corpus
111 112 113 114 115 116 117 118
csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
csvToCorpus limit path = Vector.toList
    <$> Vector.take limit
    <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
    <$> snd <$> Csv.readFile path


-- | To use the correct parser given a CorpusType
qlobbe's avatar
qlobbe committed
119 120 121 122
fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
fileToCorpus parser path = case parser of 
  Wos limit -> wosToCorpus limit path
  Csv limit -> csvToCorpus limit path
123 124 125


-- | To parse a file into a list of Document
qlobbe's avatar
qlobbe committed
126 127 128
fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
fileToDocs parser path lst = do
  corpus <- fileToCorpus parser path
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
  let patterns = buildPatterns lst
  pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus


--------------
-- | Main | --
--------------   


main :: IO ()
main = do

    printIOMsg "Starting the reconstruction"

    printIOMsg "Read the configuration file"
    [args]   <- getArgs
    jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)

    case jsonArgs of
        Left err     -> putStrLn err
        Right config -> do

            printIOMsg "Parse the corpus"
            mapList <- csvGraphTermList (listPath config)
qlobbe's avatar
qlobbe committed
153
            corpus  <- fileToDocs (corpusParser config) (corpusPath config) mapList
154 155
            printIOComment (show (length corpus) <> " parsed docs from the corpus")

156
            printIOMsg "Reconstruct the Phylo"
qlobbe's avatar
qlobbe committed
157
            
158 159
            let phylo = toPhylo corpus mapList config

qlobbe's avatar
qlobbe committed
160 161 162 163 164 165 166 167
            -- | probes

            -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt") 
            --          $ synchronicDistance' phylo 1

            -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt") 
            --         $ inflexionPoints phylo 1                    

qlobbe's avatar
qlobbe committed
168 169
            printIOMsg "End of reconstruction, start the export"

qlobbe's avatar
qlobbe committed
170
            let dot = toPhyloExport phylo        
qlobbe's avatar
qlobbe committed
171

qlobbe's avatar
qlobbe committed
172 173 174 175
            let output = (outputPath config) 
                      <> (unpack $ phyloName config)
                      <> "_V2.dot"

176
            dotToFile output dot