Commit 0b0ee22a authored by qlobbe's avatar qlobbe

add weighted csv

parent 3e6c4d4a
......@@ -25,7 +25,7 @@ import Crypto.Hash.SHA256 (hash)
import Gargantext.Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_w_title, csv_w_abstract, csv_w_publication_year, csv_w_weight)
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
......@@ -46,10 +46,9 @@ import qualified Data.Vector as Vector
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import qualified Data.Text as T
-- import Debug.Trace (trace)
data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
---------------
-- | Tools | --
......@@ -63,7 +62,6 @@ getFilesFromPath path = do
then (listDirectory path)
else return [path]
--------------
-- | Json | --
--------------
......@@ -79,18 +77,13 @@ readJson path = Lazy.readFile path
----------------
-- | To filter the Ngrams of a document based on the termList
filterTerms :: Patterns -> (a, Text) -> (a, [Text])
filterTerms patterns (y,d) = (y,termsInText patterns d)
where
--------------------------------------
termsInText :: Patterns -> Text -> [Text]
termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
--------------------------------------
-- | To transform a Wos file (or [file]) into a readable corpus
wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
wosToCorpus limit path = do
termsInText :: Patterns -> Text -> [Text]
termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
-- | To transform a Wos file (or [file]) into a list of Docs
wosToDocs :: Int -> Patterns -> FilePath -> IO ([Document])
wosToDocs limit patterns path = do
files <- getFilesFromPath path
take limit
<$> map (\d -> let date' = fromJust $ _hd_publication_year d
......@@ -98,7 +91,7 @@ wosToCorpus limit path = do
abstr = if (isJust $ _hd_abstract d)
then fromJust $ _hd_abstract d
else ""
in (date', title <> " " <> abstr))
in Document date' (termsInText patterns $ title <> " " <> abstr) Nothing)
<$> concat
<$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d)
......@@ -106,27 +99,38 @@ wosToCorpus limit path = do
<$> parseFile WOS (path <> file) ) files
-- | To transform a Csv file into a readable corpus
csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
csvToCorpus limit path = Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
<$> snd <$> Csv.readFile path
-- | To use the correct parser given a CorpusType
fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
fileToCorpus parser path = case parser of
Wos limit -> wosToCorpus limit path
Csv limit -> csvToCorpus limit path
-- To transform a Csv file into a list of Document
csvToDocs :: CorpusParser -> Patterns -> FilePath -> IO ([Document])
csvToDocs parser patterns path =
case parser of
Wos _ -> undefined
Csv limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (csv_publication_year row)
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing
) <$> snd <$> Csv.readFile path
CsvWeighted limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (csv_w_publication_year row)
(termsInText patterns $ (csv_w_title row) <> " " <> (csv_w_abstract row))
(Just $ csv_w_weight row)
) <$> snd <$> Csv.readWeightedCsv path
-- To parse a file into a list of Document
fileToDocs' :: CorpusParser -> FilePath -> TermList -> IO [Document]
fileToDocs' parser path lst = do
let patterns = buildPatterns lst
case parser of
Wos limit -> wosToDocs limit patterns path
Csv _ -> csvToDocs parser patterns path
CsvWeighted _ -> csvToDocs parser patterns path
-- | To parse a file into a list of Document
fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
fileToDocs parser path lst = do
corpus <- fileToCorpus parser path
let patterns = buildPatterns lst
pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
---------------
-- | Label | --
---------------
-- Config time parameters to label
......@@ -235,7 +239,7 @@ main = do
printIOMsg "Parse the corpus"
mapList <- csvMapTermList (listPath config)
corpus <- fileToDocs (corpusParser config) (corpusPath config) mapList
corpus <- fileToDocs' (corpusParser config) (corpusPath config) mapList
printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOMsg "Reconstruct the phylo"
......
......@@ -395,3 +395,34 @@ parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
parseCsv' :: BL.ByteString -> [HyperdataDocument]
parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
data WeightedCsv = WeightedCsv
{ csv_w_title :: !Text
, csv_w_source :: !Text
, csv_w_publication_year :: !Int
, csv_w_publication_month :: !Int
, csv_w_publication_day :: !Int
, csv_w_abstract :: !Text
, csv_w_authors :: !Text
, csv_w_weight :: !Double } deriving (Show)
instance FromNamedRecord WeightedCsv where
parseNamedRecord r = WeightedCsv <$> r .: "title"
<*> r .: "source"
<*> r .: "publication_year"
<*> r .: "publication_month"
<*> r .: "publication_day"
<*> r .: "abstract"
<*> r .: "authors"
<*> r .: "weight"
readWeightedCsv :: FilePath -> IO (Header, Vector WeightedCsv)
readWeightedCsv fp =
fmap (\bs ->
case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right corpus -> corpus
) $ BL.readFile fp
\ No newline at end of file
......@@ -21,6 +21,7 @@ module Gargantext.Core.Text.Metrics.FrequentItemSet
, fisWithSizePoly
, fisWithSizePoly2
, fisWithSizePolyMap
, fisWithSizePolyMap'
, module HLCM
)
where
......@@ -35,6 +36,8 @@ import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import qualified Data.Vector as V
import Control.Monad (sequence)
data Size = Point Int | Segment Int Int
------------------------------------------------------------------------
......@@ -120,6 +123,28 @@ fisWithSizePolyMap n f is =
------------------------------------------------------------------------
------------------------------------------------------------------------
---- Weighted [[Item]]
isSublistOf :: Ord a => [a] -> [a] -> Bool
isSublistOf sub lst = all (\i -> elem i lst) sub
reIndexFis :: Ord a => [([a],b)] -> [Fis' a] -> [(Fis' a,[b])]
reIndexFis items fis = map (\f ->
let docs = filter (\(lst,_) -> isSublistOf (_fisItemSet f) lst) items
in (f, map snd docs)) fis
wsum :: [Maybe Double] -> Maybe Double
wsum lst = fmap sum $ sequence lst
fisWithSizePolyMap' :: Ord a => Size -> Frequency -> [([a],Maybe Double)] -> Map (Set a) (Int, Maybe Double)
fisWithSizePolyMap' n f is = Map.fromList
$ map (\(fis,ws) -> (Set.fromList (_fisItemSet fis),(_fisCount fis,(wsum ws))))
$ reIndexFis is
$ fisWithSizePoly2 n f (map fst is)
------------------------------------------------------------------------
------------------------------------------------------------------------
--
---- | /!\ indexes are not the same:
......
......@@ -52,6 +52,7 @@ import qualified Data.Text.Lazy as TextLazy
data CorpusParser =
Wos {_wos_limit :: Int}
| Csv {_csv_limit :: Int}
| CsvWeighted {_csvw_limit :: Int}
deriving (Show,Generic,Eq)
data SeaElevation =
......@@ -231,7 +232,6 @@ defaultPhyloParam =
-- | Document | --
------------------
-- | Date : a simple Integer
type Date = Int
......@@ -240,8 +240,9 @@ type Ngrams = Text
-- | Document : a piece of Text linked to a Date
data Document = Document
{ date :: Date
, text :: [Ngrams]
{ date :: Date
, text :: [Ngrams]
, weight :: Maybe Double
} deriving (Eq,Show,Generic,NFData)
......@@ -335,6 +336,7 @@ data PhyloGroup =
, _phylo_groupIndex :: Int
, _phylo_groupLabel :: Text
, _phylo_groupSupport :: Support
, _phylo_groupWeight :: Maybe Double
, _phylo_groupNgrams :: [Int]
, _phylo_groupCooc :: !(Cooc)
, _phylo_groupBranchId :: PhyloBranchId
......@@ -368,6 +370,7 @@ data PhyloClique = PhyloClique
{ _phyloClique_nodes :: [Int]
, _phyloClique_support :: Support
, _phyloClique_period :: (Date,Date)
, _phyloClique_weight :: Maybe Double
} deriving (Generic,NFData,Show,Eq)
----------------
......
......@@ -109,8 +109,10 @@ config =
docs :: [Document]
docs = map (\(d,t)
-> Document d ( filter (\n -> isRoots n (foundations ^. foundations_roots))
$ monoTexts t)) corpus
-> Document d
(filter (\n -> isRoots n (foundations ^. foundations_roots)) $ monoTexts t)
Nothing
) corpus
foundations :: PhyloFoundations
......
......@@ -139,6 +139,7 @@ groupToDotNode fdt g bId =
, toAttr "branchId" (pack $ unwords (init $ map show $ snd $ g ^. phylo_groupBranchId))
, toAttr "bId" (pack $ show bId)
, toAttr "support" (pack $ show (g ^. phylo_groupSupport))
, toAttr "weight" (pack $ show (g ^. phylo_groupWeight))
, toAttr "lbl" (pack $ show (ngramsToLabel fdt (g ^. phylo_groupNgrams)))
, toAttr "foundation" (pack $ show (idxToLabel (g ^. phylo_groupNgrams)))
, toAttr "role" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "dynamics")))
......
......@@ -21,7 +21,7 @@ import Gargantext.Core.Viz.Phylo.PhyloTools
import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
import Gargantext.Core.Methods.Distances (Distance(Conditional))
import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
......@@ -128,6 +128,7 @@ appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ co
cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
(fis ^. phyloClique_support)
(fis ^. phyloClique_weight)
(fis ^. phyloClique_nodes)
(ngramsToCooc (fis ^. phyloClique_nodes) coocs)
(1,[0]) -- branchid (lvl,[path in the branching tree])
......@@ -217,8 +218,14 @@ toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
phyloClique = case (clique $ getConfig phylo) of
Fis _ _ ->
let fis = map (\(prd,docs) ->
let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
case (corpusParser $ getConfig phylo) of
CsvWeighted _ -> let lst = toList
$ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), weight d)) docs)
in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((snd . snd) f)) lst)
_ -> let lst = toList
$ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd Nothing) lst)
)
$ toList phyloDocs
fis' = fis `using` parList rdeepseq
in fromList fis'
......@@ -228,7 +235,7 @@ toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
$ foldl sumCooc empty
$ map listToMatrix
$ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques filterType Conditional thr cooc))
in (prd, map (\cl -> PhyloClique cl 0 prd Nothing) $ getMaxCliques filterType Conditional thr cooc))
$ toList phyloDocs
mcl' = mcl `using` parList rdeepseq
in fromList mcl'
......
......@@ -22,6 +22,7 @@ import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty,
import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using)
import Control.Monad (sequence)
-- import Debug.Trace (trace)
import qualified Data.Map as Map
......@@ -36,7 +37,10 @@ mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [Phylo
mergeGroups coocs id mapIds childs =
let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id) ""
(sum $ map _phylo_groupSupport childs) ngrams
(sum $ map _phylo_groupSupport childs)
(fmap sum $ sequence
$ map _phylo_groupWeight childs)
ngrams
(ngramsToCooc ngrams coocs)
((snd $ fst id),bId)
(mergeMeta bId childs) [] (map (\g -> (getGroupId g, 1)) childs)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment