Commit 0b0ee22a authored by qlobbe

add weighted csv

parent 3e6c4d4a
@@ -25,7 +25,7 @@ import Crypto.Hash.SHA256 (hash)
 import Gargantext.Prelude
 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
 import Gargantext.Core.Text.Context (TermList)
-import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
+import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_w_title, csv_w_abstract, csv_w_publication_year, csv_w_weight)
 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
@@ -46,10 +46,9 @@ import qualified Data.Vector as Vector
 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
 import qualified Data.Text as T
--- import Debug.Trace (trace)

 data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)

 ---------------
 -- | Tools | --
@@ -63,7 +62,6 @@ getFilesFromPath path = do
     then (listDirectory path)
     else return [path]

 --------------
 -- | Json | --
 --------------
@@ -79,18 +77,13 @@ readJson path = Lazy.readFile path
 ----------------

 -- | To filter the Ngrams of a document based on the termList
-filterTerms :: Patterns -> (a, Text) -> (a, [Text])
-filterTerms patterns (y,d) = (y,termsInText patterns d)
-  where
-    --------------------------------------
-    termsInText :: Patterns -> Text -> [Text]
-    termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
-    --------------------------------------
-
--- | To transform a Wos file (or [file]) into a readable corpus
-wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
-wosToCorpus limit path = do
+termsInText :: Patterns -> Text -> [Text]
+termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
+
+-- | To transform a Wos file (or [file]) into a list of Docs
+wosToDocs :: Int -> Patterns -> FilePath -> IO ([Document])
+wosToDocs limit patterns path = do
   files <- getFilesFromPath path
   take limit
     <$> map (\d -> let date' = fromJust $ _hd_publication_year d
@@ -98,7 +91,7 @@ wosToCorpus limit path = do
                        abstr = if (isJust $ _hd_abstract d)
                                  then fromJust $ _hd_abstract d
                                  else ""
-                    in (date', title <> " " <> abstr))
+                    in Document date' (termsInText patterns $ title <> " " <> abstr) Nothing)
     <$> concat
     <$> mapConcurrently (\file ->
           filter (\d -> (isJust $ _hd_publication_year d)
@@ -106,27 +99,38 @@
           <$> parseFile WOS (path <> file) ) files
--- | To transform a Csv file into a readable corpus
-csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
-csvToCorpus limit path = Vector.toList
-    <$> Vector.take limit
-    <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
-    <$> snd <$> Csv.readFile path
-
--- | To use the correct parser given a CorpusType
-fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
-fileToCorpus parser path = case parser of
-  Wos limit -> wosToCorpus limit path
-  Csv limit -> csvToCorpus limit path
-
--- | To parse a file into a list of Document
-fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
-fileToDocs parser path lst = do
-  corpus <- fileToCorpus parser path
-  let patterns = buildPatterns lst
-  pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
+-- To transform a Csv file into a list of Document
+csvToDocs :: CorpusParser -> Patterns -> FilePath -> IO ([Document])
+csvToDocs parser patterns path =
+  case parser of
+    Wos _ -> undefined
+    Csv limit -> Vector.toList
+      <$> Vector.take limit
+      <$> Vector.map (\row -> Document (csv_publication_year row)
+                                       (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
+                                       Nothing
+                     ) <$> snd <$> Csv.readFile path
+    CsvWeighted limit -> Vector.toList
+      <$> Vector.take limit
+      <$> Vector.map (\row -> Document (csv_w_publication_year row)
+                                       (termsInText patterns $ (csv_w_title row) <> " " <> (csv_w_abstract row))
+                                       (Just $ csv_w_weight row)
+                     ) <$> snd <$> Csv.readWeightedCsv path
+
+-- To parse a file into a list of Document
+fileToDocs' :: CorpusParser -> FilePath -> TermList -> IO [Document]
+fileToDocs' parser path lst = do
+  let patterns = buildPatterns lst
+  case parser of
+    Wos limit     -> wosToDocs limit patterns path
+    Csv _         -> csvToDocs parser patterns path
+    CsvWeighted _ -> csvToDocs parser patterns path
+
+---------------
+-- | Label | --
+---------------

 -- Config time parameters to label
@@ -235,7 +239,7 @@ main = do
         printIOMsg "Parse the corpus"
         mapList <- csvMapTermList (listPath config)
-        corpus <- fileToDocs (corpusParser config) (corpusPath config) mapList
+        corpus <- fileToDocs' (corpusParser config) (corpusPath config) mapList
         printIOComment (show (length corpus) <> " parsed docs from the corpus")
         printIOMsg "Reconstruct the phylo"
......
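Usage note: a minimal sketch (not part of this commit) of driving the new weighted parser from code like main above. It assumes the imports already present in this file; the corpus and term-list paths are hypothetical.

-- Minimal sketch, assuming the imports of the file above; paths are hypothetical.
sketchParseWeighted :: IO ()
sketchParseWeighted = do
  mapList <- csvMapTermList "termlist.csv"                         -- hypothetical term list
  corpus  <- fileToDocs' (CsvWeighted 10000) "corpus_weighted.csv" mapList
  -- each Document now carries a date, its extracted ngrams and a Just weight
  mapM_ (\(Document d ts w) -> print (d, length ts, w)) (take 5 corpus)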
@@ -395,3 +395,34 @@ parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
 parseCsv' :: BL.ByteString -> [HyperdataDocument]
 parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
+
+------------------------------------------------------------------------
+-- Csv v3 weighted for phylo
+
+data WeightedCsv = WeightedCsv
+  { csv_w_title             :: !Text
+  , csv_w_source            :: !Text
+  , csv_w_publication_year  :: !Int
+  , csv_w_publication_month :: !Int
+  , csv_w_publication_day   :: !Int
+  , csv_w_abstract          :: !Text
+  , csv_w_authors           :: !Text
+  , csv_w_weight            :: !Double } deriving (Show)
+
+instance FromNamedRecord WeightedCsv where
+  parseNamedRecord r = WeightedCsv <$> r .: "title"
+                                   <*> r .: "source"
+                                   <*> r .: "publication_year"
+                                   <*> r .: "publication_month"
+                                   <*> r .: "publication_day"
+                                   <*> r .: "abstract"
+                                   <*> r .: "authors"
+                                   <*> r .: "weight"
+
+readWeightedCsv :: FilePath -> IO (Header, Vector WeightedCsv)
+readWeightedCsv fp =
+  fmap (\bs ->
+    case decodeByNameWith csvDecodeOptions bs of
+      Left e       -> panic (pack e)
+      Right corpus -> corpus
+    ) $ BL.readFile fp
\ No newline at end of file
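For reference (not part of the patch): the FromNamedRecord instance above implies a header row of title,source,publication_year,publication_month,publication_day,abstract,authors,weight, parsed with the module's csvDecodeOptions. A minimal reading sketch, assuming this module's imports and a hypothetical file path:

-- Minimal sketch, assuming the imports of this module; the path is hypothetical.
printWeights :: IO ()
printWeights = do
  (_header, rows) <- readWeightedCsv "corpus_weighted.csv"
  V.mapM_ (\r -> print (csv_w_publication_year r, csv_w_weight r)) rows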
@@ -21,6 +21,7 @@ module Gargantext.Core.Text.Metrics.FrequentItemSet
   , fisWithSizePoly
   , fisWithSizePoly2
   , fisWithSizePolyMap
+  , fisWithSizePolyMap'
   , module HLCM
   )
   where
@@ -35,6 +36,8 @@ import qualified Data.Map.Strict as Map
 import qualified Data.Set as Set
 import qualified Data.Vector as V
+
+import Control.Monad (sequence)

 data Size = Point Int | Segment Int Int

 ------------------------------------------------------------------------
@@ -120,6 +123,28 @@ fisWithSizePolyMap n f is =
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
+---- Weighted [[Item]]
+
+isSublistOf :: Ord a => [a] -> [a] -> Bool
+isSublistOf sub lst = all (\i -> elem i lst) sub
+
+reIndexFis :: Ord a => [([a],b)] -> [Fis' a] -> [(Fis' a,[b])]
+reIndexFis items fis = map (\f ->
+    let docs = filter (\(lst,_) -> isSublistOf (_fisItemSet f) lst) items
+     in (f, map snd docs)) fis
+
+wsum :: [Maybe Double] -> Maybe Double
+wsum lst = fmap sum $ sequence lst
+
+fisWithSizePolyMap' :: Ord a => Size -> Frequency -> [([a],Maybe Double)] -> Map (Set a) (Int, Maybe Double)
+fisWithSizePolyMap' n f is = Map.fromList
+  $ map (\(fis,ws) -> (Set.fromList (_fisItemSet fis),(_fisCount fis,(wsum ws))))
+  $ reIndexFis is
+  $ fisWithSizePoly2 n f (map fst is)
+
+------------------------------------------------------------------------
+------------------------------------------------------------------------
 --
 ---- | /!\ indexes are not the same:
......
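To illustrate the semantics of the weighted variant (a standalone sketch, not from the patch): for each frequent itemset, reIndexFis collects the weights of every transaction containing it, and wsum adds them up, returning Nothing as soon as any supporting transaction is unweighted.

-- Standalone sketch with local copies of the helpers above; values are made up.
module Main where

isSublistOf' :: Ord a => [a] -> [a] -> Bool
isSublistOf' sub lst = all (`elem` lst) sub

wsum' :: [Maybe Double] -> Maybe Double
wsum' = fmap sum . sequence

main :: IO ()
main = do
  let itemset = [1, 2] :: [Int]
      docs    = [([1,2,3], Just 0.5), ([1,2], Just 1.0), ([2,4], Just 2.0)]
      ws      = [ w | (lst, w) <- docs, itemset `isSublistOf'` lst ]
  print (wsum' ws)                   -- Just 1.5 : the two supporting docs
  print (wsum' [Just 0.5, Nothing])  -- Nothing  : one unweighted doc voids the sum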
@@ -52,6 +52,7 @@ import qualified Data.Text.Lazy as TextLazy
 data CorpusParser =
       Wos  {_wos_limit :: Int}
     | Csv  {_csv_limit :: Int}
+    | CsvWeighted {_csvw_limit :: Int}
   deriving (Show,Generic,Eq)

 data SeaElevation =
@@ -231,7 +232,6 @@ defaultPhyloParam =
 -- | Document | --
 ------------------

 -- | Date : a simple Integer
 type Date = Int
@@ -240,8 +240,9 @@ type Ngrams = Text
 -- | Document : a piece of Text linked to a Date
 data Document = Document
       { date   :: Date
       , text   :: [Ngrams]
+      , weight :: Maybe Double
       } deriving (Eq,Show,Generic,NFData)
@@ -335,6 +336,7 @@ data PhyloGroup =
      , _phylo_groupIndex    :: Int
      , _phylo_groupLabel    :: Text
      , _phylo_groupSupport  :: Support
+     , _phylo_groupWeight   :: Maybe Double
      , _phylo_groupNgrams   :: [Int]
      , _phylo_groupCooc     :: !(Cooc)
      , _phylo_groupBranchId :: PhyloBranchId
@@ -368,6 +370,7 @@ data PhyloClique = PhyloClique
   { _phyloClique_nodes   :: [Int]
   , _phyloClique_support :: Support
   , _phyloClique_period  :: (Date,Date)
+  , _phyloClique_weight  :: Maybe Double
   } deriving (Generic,NFData,Show,Eq)

 ----------------
......
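A small illustration (not from the patch) of the extended Document constructor; the values are made up and the ngram literals assume OverloadedStrings.

-- Sketch: the third field distinguishes weighted and unweighted corpora.
weightedDoc, plainDoc :: Document
weightedDoc = Document 2020 ["phylomemy", "clustering"] (Just 0.75)  -- e.g. from CsvWeighted
plainDoc    = Document 2020 ["phylomemy", "clustering"] Nothing      -- Wos or plain Csv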
@@ -109,8 +109,10 @@ config =
 docs :: [Document]
 docs = map (\(d,t)
-  -> Document d ( filter (\n -> isRoots n (foundations ^. foundations_roots))
-       $ monoTexts t)) corpus
+  -> Document d
+       (filter (\n -> isRoots n (foundations ^. foundations_roots)) $ monoTexts t)
+       Nothing
+     ) corpus

 foundations :: PhyloFoundations
......
@@ -139,6 +139,7 @@ groupToDotNode fdt g bId =
      , toAttr "branchId" (pack $ unwords (init $ map show $ snd $ g ^. phylo_groupBranchId))
      , toAttr "bId" (pack $ show bId)
      , toAttr "support" (pack $ show (g ^. phylo_groupSupport))
+     , toAttr "weight" (pack $ show (g ^. phylo_groupWeight))
      , toAttr "lbl" (pack $ show (ngramsToLabel fdt (g ^. phylo_groupNgrams)))
      , toAttr "foundation" (pack $ show (idxToLabel (g ^. phylo_groupNgrams)))
      , toAttr "role" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "dynamics")))
......
@@ -21,7 +21,7 @@ import Gargantext.Core.Viz.Phylo.PhyloTools
 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
 import Gargantext.Core.Text.Context (TermList)
-import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
+import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
 import Gargantext.Core.Methods.Distances (Distance(Conditional))
 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
@@ -128,6 +128,7 @@ appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ co
 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
                    (fis ^. phyloClique_support)
+                   (fis ^. phyloClique_weight)
                    (fis ^. phyloClique_nodes)
                    (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
                    (1,[0]) -- branchid (lvl,[path in the branching tree])
@@ -217,8 +218,14 @@ toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
     phyloClique = case (clique $ getConfig phylo) of
       Fis _ _ ->
         let fis = map (\(prd,docs) ->
-                    let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
-                     in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
+                    case (corpusParser $ getConfig phylo) of
+                      CsvWeighted _ -> let lst = toList
+                                             $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), weight d)) docs)
+                                        in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((snd . snd) f)) lst)
+                      _ -> let lst = toList
+                                 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
+                            in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd Nothing) lst)
+                    )
                  $ toList phyloDocs
             fis' = fis `using` parList rdeepseq
          in fromList fis'
@@ -228,7 +235,7 @@ toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
                          $ foldl sumCooc empty
                          $ map listToMatrix
                          $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
-                     in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques filterType Conditional thr cooc))
+                     in (prd, map (\cl -> PhyloClique cl 0 prd Nothing) $ getMaxCliques filterType Conditional thr cooc))
                  $ toList phyloDocs
             mcl' = mcl `using` parList rdeepseq
          in fromList mcl'
......
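To make the weighted branch above concrete (an illustrative sketch assuming this module's imports; the names and values are made up): fisWithSizePolyMap' yields one (support, summed weight) pair per itemset, which the lambda unpacks into the last two arguments of PhyloClique.

-- Illustrative only: how one entry of the weighted map becomes a PhyloClique.
exampleEntry :: (Set.Set Int, (Int, Maybe Double))
exampleEntry = (Set.fromList [3,7], (2, Just 1.5))   -- itemset, (support, summed doc weights)

exampleClique :: (Date, Date) -> PhyloClique
exampleClique prd =
  let (items, (support, w)) = exampleEntry
   in PhyloClique (Set.toList items) support prd w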
@@ -22,6 +22,7 @@ import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty,
 import Control.Lens hiding (Level)
 import Control.Parallel.Strategies (parList, rdeepseq, using)
+import Control.Monad (sequence)
 -- import Debug.Trace (trace)
 import qualified Data.Map as Map
@@ -36,7 +37,10 @@ mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [Phylo
 mergeGroups coocs id mapIds childs =
     let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
      in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id) ""
-                   (sum $ map _phylo_groupSupport childs) ngrams
+                   (sum $ map _phylo_groupSupport childs)
+                   (fmap sum $ sequence
+                   $ map _phylo_groupWeight childs)
+                   ngrams
                    (ngramsToCooc ngrams coocs)
                    ((snd $ fst id),bId)
                    (mergeMeta bId childs) [] (map (\g -> (getGroupId g, 1)) childs)
......
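The merged weight above is the sum of the children's weights, combined through sequence so that a single unweighted child leaves the merged group unweighted. A standalone sketch of that combination (values made up):

-- Standalone sketch of the weight-merging step used in mergeGroups.
mergeWeights :: [Maybe Double] -> Maybe Double
mergeWeights childWeights = fmap sum (sequence childWeights)

main :: IO ()
main = do
  print (mergeWeights [Just 1.0, Just 2.5, Just 0.5])  -- Just 4.0
  print (mergeWeights [Just 1.0, Nothing])             -- Nothing: one unweighted child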