Commit 349ed2a2 authored by qlobbe's avatar qlobbe

add Wos parser

parent 40da8153
Pipeline #541 failed with stage
......@@ -23,14 +23,16 @@ module Main where
import Data.Aeson
import Data.ByteString.Lazy (ByteString)
import Data.Maybe ()
import Data.List (concat, nub)
import Data.Maybe (isJust, fromJust)
import Data.List (concat, nub, isSuffixOf, take)
import Data.String (String)
import Data.Text (Text, unwords)
import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Context (TermList)
import Gargantext.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Viz.AdaptativePhylo
......@@ -38,6 +40,8 @@ import Gargantext.Viz.AdaptativePhylo
import GHC.IO (FilePath)
import Prelude (Either(..))
import System.Environment
import System.Directory (listDirectory)
import Control.Concurrent.Async (mapConcurrently)
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
......@@ -64,10 +68,28 @@ printIOComment cmt =
putStrLn ( "\n" <> cmt <> "\n" )
-- | To get all the files in a directory or just a file
getFilesFromPath :: FilePath -> IO([FilePath])
getFilesFromPath path = do
if (isSuffixOf "/" path)
then (listDirectory path)
else return [path]
--------------
-- | Json | --
--------------
-- | To read and decode a Json file
readJson :: FilePath -> IO ByteString
readJson path = Lazy.readFile path
----------------
-- | Parser | --
----------------
-- | To filter the Ngrams of a document based on the termList
filterTerms :: Patterns -> (a, Text) -> (a, [Text])
filterTerms patterns (y,d) = (y,termsInText patterns d)
......@@ -78,7 +100,25 @@ filterTerms patterns (y,d) = (y,termsInText patterns d)
--------------------------------------
-- | To transform a Csv nfile into a readable corpus
-- | To transform a Wos file (or [file]) into a readable corpus
wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
wosToCorpus limit path = do
files <- getFilesFromPath path
take limit
<$> map (\d -> let date' = fromJust $ _hyperdataDocument_publication_year d
title = fromJust $ _hyperdataDocument_title d
abstr = if (isJust $ _hyperdataDocument_abstract d)
then fromJust $ _hyperdataDocument_abstract d
else ""
in (date', title <> " " <> abstr))
<$> concat
<$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hyperdataDocument_publication_year d)
&& (isJust $ _hyperdataDocument_title d))
<$> parseFile WOS (path <> file) ) files
-- | To transform a Csv file into a readable corpus
csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
csvToCorpus limit path = Vector.toList
<$> Vector.take limit
......@@ -89,8 +129,7 @@ csvToCorpus limit path = Vector.toList
-- | To use the correct parser given a CorpusType
fileToCorpus :: CorpusParser -> Int -> FilePath -> IO ([(Int,Text)])
fileToCorpus parser limit path = case parser of
-- To do Wos from legacy Main.hs
Wos -> undefined
Wos -> wosToCorpus limit path
Csv -> csvToCorpus limit path
......
......@@ -30,8 +30,10 @@ module Gargantext.Viz.AdaptativePhylo where
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Text (Text)
import Data.Text (Text, pack)
import Data.Vector (Vector)
import Data.Map (Map)
import Data.Matrix (Matrix)
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
......@@ -48,7 +50,7 @@ import Control.Lens (makeLenses)
----------------
data CorpusParser = Wos | Csv deriving (Show,Generic)
data CorpusParser = Wos | Csv deriving (Show,Generic,Eq)
data Config =
Config { corpusPath :: FilePath
......@@ -62,9 +64,23 @@ data Config =
, timeStep :: Int
, fisSupport :: Int
, fisSize :: Int
, branchSize :: Int
, safeParall :: Bool
} deriving (Show,Generic)
, branchSize :: Int
} deriving (Show,Generic,Eq)
defaultConfig =
Config { corpusPath = ""
, listPath = ""
, outputPath = ""
, corpusParser = Csv
, corpusLimit = 1000
, phyloName = pack "Default Phylo"
, phyloLevel = 2
, timePeriod = 3
, timeStep = 1
, fisSupport = 2
, fisSize = 4
, branchSize = 3
}
instance FromJSON Config
instance ToJSON Config
......@@ -72,6 +88,30 @@ instance FromJSON CorpusParser
instance ToJSON CorpusParser
-- | Software parameters
data Software =
Software { _software_name :: Text
, _software_version :: Text
} deriving (Generic, Show, Eq)
defaultSoftware =
Software { _software_name = pack "Gargantext"
, _software_version = pack "v4" }
-- | Global parameters of a Phylo
data PhyloParam =
PhyloParam { _phyloParam_version :: Text
, _phyloParam_software :: Software
, _phyloParam_config :: Config
} deriving (Generic, Show, Eq)
defaultPhyloParam =
PhyloParam { _phyloParam_version = pack "v2.adaptative"
, _phyloParam_software = defaultSoftware
, _phyloParam_config = defaultConfig }
------------------
-- | Document | --
------------------
......@@ -87,7 +127,7 @@ type Ngrams = Text
data Document = Document
{ date :: Date
, text :: [Ngrams]
} deriving (Show,Generic,NFData)
} deriving (Eq,Show,Generic,NFData)
--------------------
......@@ -102,10 +142,40 @@ data PhyloFoundations = PhyloFoundations
} deriving (Generic, Show, Eq)
---------------------------
-- | Coocurency Matrix | --
---------------------------
-- | Cooc : a weighted (Double) coocurency matrix
type Cooc = Matrix Double
-------------------
-- | Phylomemy | --
-------------------
-- | Phylo datatype of a phylomemy
-- foundations : the foundations of the phylo
-- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
-- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
-- param : the parameters of the phylomemy (with the user's configuration)
data Phylo =
Phylo { _phylo_foundations :: PhyloFoundations
, _phylo_timeCooc :: !(Map Date Cooc)
, _phylo_timeDocs :: !(Map Date Double)
, _phylo_param :: PhyloParam
}
deriving (Generic, Show, Eq)
----------------
-- | Lenses | --
----------------
makeLenses ''Config
makeLenses ''PhyloFoundations
------------------------
......
......@@ -20,6 +20,7 @@ Portability : POSIX
module Gargantext.Viz.Phylo.PhyloExample where
import Data.List (sortOn)
import Data.Map (Map)
import Data.Text (Text, toLower)
import Gargantext.Prelude
......@@ -27,6 +28,7 @@ import Gargantext.Text.Context (TermList)
import Gargantext.Text.Terms.Mono (monoTexts)
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.PhyloMaker
import Control.Lens
......@@ -38,7 +40,19 @@ import qualified Data.Vector as Vector
--------------------------------------------
-- Next is to build the config and the phyloLevel 0
-- cooc et phyloBase
nbDocsByYear :: Map Date Double
nbDocsByYear = nbDocsByTime docs 1
config :: Config
config =
defaultConfig { phyloName = "Cesar et Cleopatre"
, branchSize = 0
, fisSupport = 0
, fisSize = 0 }
docs :: [Document]
......
......@@ -13,4 +13,24 @@ Portability : POSIX
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module Gargantext.Viz.Phylo.PhyloMaker where
\ No newline at end of file
module Gargantext.Viz.Phylo.PhyloMaker where
import Data.Map (Map, fromListWith, keys, unionWith, fromList)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
--------------------
-- | to Phylo 0 | --
--------------------
nbDocsByTime :: [Document] -> Int -> Map Date Double
nbDocsByTime docs step =
let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') step
in unionWith (+) time docs'
......@@ -19,6 +19,7 @@ module Gargantext.Viz.Phylo.PhyloTools where
import Data.Maybe (Maybe, fromMaybe)
import Data.Text (Text)
import Data.Vector (Vector)
import Data.List (sort)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
......@@ -28,23 +29,6 @@ import GHC.IO (FilePath)
import qualified Data.Vector as Vector
----------------
-- | Config | --
----------------
-- | Define a default value
def :: a -> Maybe a -> a
def = fromMaybe
-- | To init a configuration
initConfig :: Maybe FilePath -> Maybe FilePath -> Maybe FilePath -> Maybe CorpusParser -> Maybe Int -> Maybe Text
-> Maybe Int -> Maybe Int -> Maybe Int -> Maybe Int -> Maybe Int -> Maybe Int -> Maybe Bool -> Config
initConfig (def "" -> corpus) (def "" -> mapList) (def "" -> output) (def Csv -> parser) (def 10000 -> limit) (def "A phylomemy" -> name)
(def 2 -> level) (def 3 -> period) (def 1 -> step) (def 3 -> support) (def 4 -> clique) (def 3 -> minBranchSize) (def True -> safe) =
Config corpus mapList output parser limit name level period step support clique minBranchSize safe
---------------------
-- | Foundations | --
---------------------
......@@ -52,4 +36,16 @@ initConfig (def "" -> corpus) (def "" -> mapList) (def "" -> output) (def Csv ->
-- | Is this Ngrams a Foundations Root ?
isRoots :: Ngrams -> Vector Ngrams -> Bool
isRoots n ns = Vector.elem n ns
\ No newline at end of file
isRoots n ns = Vector.elem n ns
--------------
-- | Time | --
--------------
-- | Get a regular & ascendante timeScale from a given list of dates
toTimeScale :: [Date] -> Int -> [Date]
toTimeScale dates step =
let dates' = sort dates
in [head' "toTimeScale" dates', ((head' "toTimeScale" dates') + step) .. last' "toTimeScale" dates']
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment