Commit fc1084bf authored by Przemyslaw Kaminski

Merge branch 'dev' into 70-dev-searx-parser

parents 966c3ed0 23a64f73
Pipeline #2220 failed in 10 minutes and 24 seconds
+## Version 0.0.4.9.5
+* [FEAT] Order 2 fixed with filtered edges
+## Version 0.0.4.9.4
+* [FEAT] Order 1 similarity validated and optimized
+## Version 0.0.4.9.3
+* [FIX] Node Calc import + more flexible delimiter for CSV parser
+## Version 0.0.4.9.2
+* [FEAT] Node Calc Parsing added (in tests)
## Version 0.0.4.9.1
* [FIX] Graph Screenshot
......
name: gargantext
-version: '0.0.4.9.1'
+version: '0.0.4.9.5'
synopsis: Search, map, share
description: Please see README.md
category: Data
......
......@@ -11,6 +11,7 @@ module Gargantext.API.Ngrams.Types where
import Codec.Serialise (Serialise())
import Control.Category ((>>>))
+import Control.DeepSeq (NFData)
import Control.Lens (makeLenses, makePrisms, Iso', iso, from, (.~), (?=), (#), to, folded, {-withIndex, ifolded,-} view, use, (^.), (^?), (%~), (.~), (%=), at, _Just, Each(..), itraverse_, both, forOf_, (?~))
import Control.Monad.State
import Data.Aeson hiding ((.=))
......@@ -121,7 +122,7 @@ instance (ToJSONKey a, ToSchema a) => ToSchema (MSet a) where
------------------------------------------------------------------------
newtype NgramsTerm = NgramsTerm { unNgramsTerm :: Text }
-  deriving (Ord, Eq, Show, Generic, ToJSONKey, ToJSON, FromJSON, Semigroup, Arbitrary, Serialise, ToSchema, Hashable)
+  deriving (Ord, Eq, Show, Generic, ToJSONKey, ToJSON, FromJSON, Semigroup, Arbitrary, Serialise, ToSchema, Hashable, NFData)
instance IsHashable NgramsTerm where
hash (NgramsTerm t) = hash t
......
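Note on the NFData deriving above: a normal-form instance is what lets NgramsTerm values be fully forced by the parallel strategies used in the rewritten Conditional module further down (parList rdeepseq). A minimal sketch of the pattern, assuming only the deepseq and parallel packages; Term and scores are illustrative names, not part of the commit:

{-# LANGUAGE DeriveGeneric #-}
import Control.DeepSeq (NFData)
import Control.Parallel.Strategies (parList, rdeepseq, using)
import GHC.Generics (Generic)

-- Illustrative stand-in for NgramsTerm; the commit instead adds
-- NFData directly to the newtype's deriving clause.
newtype Term = Term String
  deriving (Show, Generic)

instance NFData Term

-- parList rdeepseq forces every pair to normal form in parallel;
-- it is only usable because Term has an NFData instance.
scores :: [(Term, Double)]
scores = [ (Term (show i), fromIntegral i / 2) | i <- [1 .. 100 :: Int] ]
         `using` parList rdeepseq

main :: IO ()
main = print (length scores)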
......@@ -80,6 +80,7 @@ frameCalcUploadAsync uId nId _f logStatus jobLog = do
httpLbs req manager
let body = T.pack $ BSU8.toString $ BSL.toStrict $ responseBody res
-- printDebug "body" body
mCId <- getClosestParentIdByType nId NodeCorpus
-- printDebug "[frameCalcUploadAsync] mCId" mCId
......
......@@ -123,7 +123,7 @@ readCSVFile_Annuaire' :: FilePath -> IO (Header, Vector IMTUser)
readCSVFile_Annuaire' = fmap readCsvHalLazyBS' . BL.readFile
where
readCsvHalLazyBS' :: BL.ByteString -> (Header, Vector IMTUser)
-    readCsvHalLazyBS' bs = case decodeByNameWith csvDecodeOptions bs of
+    readCsvHalLazyBS' bs = case decodeByNameWith (csvDecodeOptions Tab) bs of
Left e -> panic (cs e)
Right rows -> rows
......
......@@ -15,108 +15,44 @@ Motivation and definition of the @Conditional@ distance.
module Gargantext.Core.Methods.Distances.Conditional
  where
-import Data.List (sortOn)
-import Data.Map (Map)
-import Data.Matrix hiding (identity)
-import Gargantext.Core.Viz.Graph.Utils
+import Control.DeepSeq (NFData)
+import Control.Parallel.Strategies (parList, rdeepseq, using)
+import Data.Hashable (Hashable)
+import Data.List (unzip)
+import Data.Maybe (catMaybes)
import Gargantext.Prelude
-import qualified Data.Map as M
-import qualified Data.Set as S
-import qualified Data.Vector as V
+import Gargantext.Core.Viz.Graph.Utils (getMax)
+import qualified Data.HashMap.Strict as Map
+import qualified Data.Set as Set
------------------------------------------------------------------------
--- | Optimisation issue
-toBeOptimized :: (Num a, Fractional a, Ord a) => Matrix a -> Matrix a
-toBeOptimized m = proba Col m
+type HashMap = Map.HashMap
------------------------------------------------------------------------
--- | Main Functions
--- Compute the probability from axis
--- x' = x / (sum Col x)
-proba :: (Num a, Fractional a) => Axis -> Matrix a -> Matrix a
-proba a m = mapOn a (\c x -> x / V.sum (axis a c m)) m
-mapOn :: Axis -> (AxisId -> a -> a) -> Matrix a -> Matrix a
-mapOn a f m = V.foldl' f' m (V.enumFromTo 1 (nOf a m))
-  where
-    f' m' c = mapOnly a f c m'
-mapOnly :: Axis -> (AxisId -> a -> a) -> AxisId -> Matrix a -> Matrix a
-mapOnly Col = mapCol
-mapOnly Row = mapRow
-mapAll :: (a -> a) -> Matrix a -> Matrix a
-mapAll f m = mapOn Col (\_ -> f) m
----------------------------------------------------------------
--- | Compute a distance from axis
--- xs = (sum Col x') - x'
-distFromSum :: (Num a, Fractional a)
-            => Axis -> Matrix a -> Matrix a
-distFromSum a m = mapOn a (\c x -> V.sum (axis a c m) - x) m
----------------------------------------------------------------
----------------------------------------------------------------
--- | To compute included/excluded or specific/generic scores
-opWith :: (Fractional a1, Num a1)
-       => (Matrix a2 -> t -> Matrix a1) -> Matrix a2 -> t -> Matrix a1
-opWith op xs ys = mapAll (\x -> x / (2*n -1)) (xs `op` ys)
-  where
-    n = fromIntegral $ nOf Col xs
----------------------------------------------------------------
-------------------------------------------------------
-conditional :: (Num a, Fractional a, Ord a) => Matrix a -> Matrix a
-conditional m = filterMat (threshold m') m'
+-- First version as first implementation
+-- - qualitatively verified
+-- - parallelized as main optimization
+conditional :: (Ord a, Hashable a, NFData a)
+            => HashMap (a,a) Int
+            -> HashMap (a,a) Double
+conditional m' = Map.fromList $ ((catMaybes results') `using` parList rdeepseq)
  where
-    ------------------------------------------------------------------------
-    -- | Main Operations
-    -- x' = x / (sum Col x)
-    x' = proba Col m
-    ------------------------------------------------------------------------
-    -- xs = (sum Col x') - x'
-    xs = distFromSum Col x'
-    -- ys = (sum Row x') - x'
-    ys = distFromSum Row x'
-    ------------------------------------------------------------------------
-    -- | Top included or excluded
-    ie = opWith (+) xs ys
-    -- ie = ( xs + ys) / (2 * (x.shape[0] - 1))
+    results' = [ let
+                   ij = (/) <$> Map.lookup (i,j) m <*> Map.lookup (i,i) m
+                   ji = (/) <$> Map.lookup (j,i) m <*> Map.lookup (j,j) m
+                   in getMax (i,j) ij ji
-    -- | Top specific or generic
-    sg = opWith (-) xs ys
-    -- sg = ( xs - ys) / (2 * (x.shape[0] - 1))
+               | i <- keys
+               , j <- keys
+               , i < j
+               ]
+    -- Converting from Int to Double
+    m = Map.map fromIntegral m'
-    nodes_kept :: [Int]
-    nodes_kept = take k' $ S.toList
-               $ foldl' (\s (n1,n2) -> insert [n1,n2] s) S.empty
-               $ map fst
-               $ nodes_included k <> nodes_specific k
+    -- Get the matrix coordinates, removing duplicates
+    keys = Set.toList $ Set.fromList (x <> y)
+    (x,y) = unzip $ Map.keys m
-    nodes_included n = take n $ sortOn snd $ toListsWithIndex ie
-    nodes_specific n = take n $ sortOn snd $ toListsWithIndex sg
-    insert as s = foldl' (\s' a -> S.insert a s') s as
-    k' = 2*k
-    k = 10
-    dico_nodes :: Map Int Int
-    dico_nodes = M.fromList $ zip ([1..] :: [Int]) nodes_kept
-    --dico_nodes_rev = M.fromList $ zip nodes_kept [1..]
-    m' = matrix (length nodes_kept)
-                (length nodes_kept)
-                (\(i,j) -> getElem ((M.!) dico_nodes i) ((M.!) dico_nodes j) x')
-    threshold m'' = V.minimum
-                  $ V.map (\cId -> V.maximum $ getCol cId m'')
-                          (V.enumFromTo 1 (nOf Col m''))
-    filterMat t m'' = mapAll (\x -> filter' t x) m''
-      where
-        filter' t' x = case (x >= t') of
-          True  -> x
-          False -> 0
------------------------------------------------------------------------
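The rewritten conditional above drops the matrix pipeline entirely: for every unordered pair (i,j) it computes the two directed conditional probabilities count(i,j)/count(i,i) and count(j,i)/count(j,j), keeps the larger one under the winning direction's key via getMax, and evaluates the candidate list in parallel. A standalone sketch of that formula without the parallelism; conditionalSketch and keep are illustrative names mirroring conditional and getMax:

import Data.Hashable (Hashable)
import qualified Data.HashMap.Strict as HM
import qualified Data.Set as Set

-- For each unordered pair, compare both directed conditional
-- probabilities and keep the larger, under the winning direction's key.
conditionalSketch :: (Ord a, Hashable a)
                  => HM.HashMap (a, a) Int -> HM.HashMap (a, a) Double
conditionalSketch m' = HM.fromList [ p | Just p <- results ]
  where
    m    = HM.map fromIntegral m'
    keys = Set.toList $ Set.fromList $ concatMap (\(i, j) -> [i, j]) (HM.keys m)
    results =
      [ let ij = (/) <$> HM.lookup (i, j) m <*> HM.lookup (i, i) m  -- P(j|i)
            ji = (/) <$> HM.lookup (j, i) m <*> HM.lookup (j, j) m  -- P(i|j)
        in keep (i, j) ij ji
      | i <- keys, j <- keys, i < j ]
    keep (i, j) (Just a) Nothing  = Just ((i, j), a)
    keep (i, j) Nothing  (Just b) = Just ((j, i), b)
    keep (i, j) (Just a) (Just b) = if a >= b then Just ((i, j), a)
                                              else Just ((j, i), b)
    keep _      Nothing  Nothing  = Nothing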
......@@ -218,14 +218,17 @@ hyperdataDocument2csvDoc h = CsvDoc { csv_title = m $ _hd_title h
mI = maybe 0 identity
-csvDecodeOptions :: DecodeOptions
-csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
+data Delimiter = Tab | Comma
-csvEncodeOptions :: EncodeOptions
-csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
+csvDecodeOptions :: Delimiter -> DecodeOptions
+csvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
-delimiter :: Word8
-delimiter = fromIntegral $ ord '\t'
+csvEncodeOptions :: Delimiter -> EncodeOptions
+csvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
+delimiter :: Delimiter -> Word8
+delimiter Tab = fromIntegral $ ord '\t'
+delimiter Comma = fromIntegral $ ord ','
------------------------------------------------------------------------
------------------------------------------------------------------------
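The new Delimiter type threads the separator choice to every cassava call site instead of the previous module-wide tab constant. A self-contained sketch of decoding with tab options, assuming only cassava; the Row type and its fields are hypothetical, not from the commit:

{-# LANGUAGE OverloadedStrings #-}
import Data.Char (ord)
import Data.Csv (DecodeOptions(..), FromNamedRecord(..), decodeByNameWith,
                 defaultDecodeOptions, (.:))
import qualified Data.Vector as V

-- Hypothetical two-column row type, only for this sketch.
data Row = Row { title :: String, source :: String } deriving Show

instance FromNamedRecord Row where
  parseNamedRecord r = Row <$> r .: "title" <*> r .: "source"

main :: IO ()
main = do
  let tabOpts = defaultDecodeOptions { decDelimiter = fromIntegral (ord '\t') }
  -- Equivalent in shape to decodeByNameWith (csvDecodeOptions Tab) in the patch.
  case decodeByNameWith tabOpts "title\tsource\nhello\tworld\n" of
    Left err       -> putStrLn err
    Right (_h, rs) -> mapM_ print (rs :: V.Vector Row)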
readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
......@@ -237,27 +240,44 @@ readCsvOn' fields fp = do
------------------------------------------------------------------------
-readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
-readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
-readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
-readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
-readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> Either Prelude.String (Header, Vector a)
-readByteStringLazy _f bs = decodeByNameWith csvDecodeOptions bs
-readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> Either Prelude.String (Header, Vector a)
-readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
+readFileLazy :: (FromNamedRecord a) => proxy a -> Delimiter -> FilePath -> IO (Either Prelude.String (Header, Vector a))
+readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
+readFileStrict :: (FromNamedRecord a)
+               => proxy a
+               -> Delimiter
+               -> FilePath
+               -> IO (Either Prelude.String (Header, Vector a))
+readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
+readByteStringLazy :: (FromNamedRecord a)
+                   => proxy a
+                   -> Delimiter
+                   -> BL.ByteString
+                   -> Either Prelude.String (Header, Vector a)
+readByteStringLazy _f d bs = decodeByNameWith (csvDecodeOptions d) bs
+readByteStringStrict :: (FromNamedRecord a)
+                     => proxy a
+                     -> Delimiter
+                     -> BS.ByteString
+                     -> Either Prelude.String (Header, Vector a)
+readByteStringStrict d ff = (readByteStringLazy d ff) . BL.fromStrict
------------------------------------------------------------------------
-- | TODO use readFileLazy
readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
-readFile = fmap readCsvLazyBS . BL.readFile
+readFile fp = do
+  result <- fmap (readCsvLazyBS Comma) $ BL.readFile fp
+  case result of
+    Left _err -> fmap (readCsvLazyBS Tab) $ BL.readFile fp
+    Right res -> pure $ Right res
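readFile now tries Comma first and retries the whole file with Tab on failure, so both plain CSV and tab-separated exports go through one entry point. The same try-then-fallback shape can be written once over any decoder; a generic sketch (withDelimiterFallback is an illustrative helper, not in the commit):

-- Decode with the first delimiter; if that fails, retry the same
-- input with the fallback. `decode` stands in for readCsvLazyBS.
withDelimiterFallback
  :: (delim -> bs -> Either err a)   -- decoder taking a delimiter
  -> delim                           -- first choice (Comma above)
  -> delim                           -- fallback (Tab above)
  -> bs
  -> Either err a
withDelimiterFallback decode d1 d2 bs =
  case decode d1 bs of
    Left _    -> decode d2 bs
    Right res -> Right res

-- e.g.: withDelimiterFallback readCsvLazyBS Comma Tab bs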
-- | TODO use readByteStringLazy
-readCsvLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
-readCsvLazyBS bs = decodeByNameWith csvDecodeOptions bs
+readCsvLazyBS :: Delimiter -> BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
+readCsvLazyBS d bs = decodeByNameWith (csvDecodeOptions d) bs
------------------------------------------------------------------------
-- | TODO use readFileLazy
......@@ -266,7 +286,7 @@ readCsvHal = fmap readCsvHalLazyBS . BL.readFile
-- | TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
-readCsvHalLazyBS bs = decodeByNameWith csvDecodeOptions bs
+readCsvHalLazyBS bs = decodeByNameWith (csvDecodeOptions Tab) bs
readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
......@@ -274,13 +294,13 @@ readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $
-  encodeByNameWith csvEncodeOptions h (V.toList vs)
+  encodeByNameWith (csvEncodeOptions Tab) h (V.toList vs)
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
-hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
+hyperdataDocument2csv hs = encodeByNameWith (csvEncodeOptions Tab) headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
------------------------------------------------------------------------
-- Hal Format
......@@ -425,13 +445,22 @@ parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
------------------------------------------------------------------------
parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
-parseCsv fp = do
-  r <- readFile fp
-  pure $ (V.toList . V.map csv2doc . snd) <$> r
+parseCsv fp = fmap (V.toList . V.map csv2doc . snd) <$> readFile fp
+{-
+parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
+parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS Comma bs
+-}
parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
-parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS bs
+parseCsv' bs = do
+  let
+    result = case readCsvLazyBS Comma bs of
+      Left _err -> readCsvLazyBS Tab bs
+      Right res -> Right res
+  (V.toList . V.map csv2doc . snd) <$> result
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
......@@ -460,9 +489,9 @@ instance FromNamedRecord Csv' where
pure $ Csv' { .. }
readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
-readWeightedCsv fp =
-  fmap (\bs ->
-    case decodeByNameWith csvDecodeOptions bs of
+readWeightedCsv fp =
+  fmap (\bs ->
+    case decodeByNameWith (csvDecodeOptions Tab) bs of
Left e -> panic (pack e)
Right corpus -> corpus
) $ BL.readFile fp
......@@ -18,13 +18,15 @@ module Gargantext.Core.Viz.Graph.Tools
import Data.HashMap.Strict (HashMap)
import Data.Map (Map)
import Data.Text (Text)
-import Debug.Trace (trace)
+-- import Debug.Trace (trace)
import GHC.Float (sin, cos)
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
+import Gargantext.Core.Methods.Distances.Conditional (conditional)
import Gargantext.Core.Methods.Distances (Distance(..), measure)
import Gargantext.Core.Methods.Graph.BAC.Proxemy (confluence)
import Gargantext.Core.Statistics
import Gargantext.Core.Viz.Graph
+import Gargantext.Core.Viz.Graph.Utils (edgesFilter)
import Gargantext.Core.Viz.Graph.Bridgeness (bridgeness, Partitions, ToComId(..))
import Gargantext.Core.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map, Index, MatrixShape(..))
import Gargantext.Core.Viz.Graph.Tools.IGraph (mkGraphUfromEdges, spinglass)
......@@ -49,6 +51,7 @@ defaultClustering x = spinglass 1 x
-------------------------------------------------------------
type Threshold = Double
cooc2graph' :: Ord t => Distance
-> Double
-> Map (t, t) Int
......@@ -96,7 +99,6 @@ cooc2graphWith' doPartitions distance threshold myCooc = do
where
(as, bs) = List.unzip $ Map.keys distanceMap
n' = Set.size $ Set.fromList $ as <> bs
-    ClustersParams rivers _level = clustersParams nodesApprox
{- -- Debug
saveAsFileDebug "debug/distanceMap" distanceMap
......@@ -108,9 +110,7 @@ cooc2graphWith' doPartitions distance threshold myCooc = do
else panic "Text.Flow: DistanceMap is empty"
let
-- bridgeness' = distanceMap
-    bridgeness' = trace ("Rivers: " <> show rivers)
-                $ bridgeness rivers partitions distanceMap
+    bridgeness' = bridgeness (fromIntegral nodesApprox) partitions distanceMap
confluence' = confluence (Map.keys bridgeness') 3 True False
......@@ -118,7 +118,6 @@ cooc2graphWith' doPartitions distance threshold myCooc = do
diag bridgeness' confluence' partitions
doDistanceMap :: Distance
-> Threshold
-> HashMap (NgramsTerm, NgramsTerm) Int
......@@ -126,7 +125,7 @@ doDistanceMap :: Distance
, Map (Index, Index) Int
, Map NgramsTerm Index
)
-doDistanceMap distance threshold myCooc = (distanceMap, toIndex ti diag, ti)
+doDistanceMap Distributional threshold myCooc = (distanceMap, toIndex ti diag, ti)
where
-- TODO remove below
(diag, theMatrix) = Map.partitionWithKey (\(x,y) _ -> x == y)
......@@ -136,43 +135,45 @@ doDistanceMap distance threshold myCooc = (distanceMap, toIndex ti diag, ti)
(ti, _it) = createIndices theMatrix
tiSize = Map.size ti
+    {-
    matCooc = case distance of -- Shape of the Matrix
                Conditional    -> map2mat Triangle 0 tiSize
                Distributional -> map2mat Square 0 tiSize
              {-$ case distance of -- Removing the Diagonal ?
                Conditional    -> Map.filterWithKey (\(a,b) _ -> a /= b)
                Distributional -> Map.filterWithKey (\(a,b) _ -> a /= b)
              -}
              $ toIndex ti theMatrix
    similarities = measure distance matCooc
+    -}
+    similarities = measure Distributional
+                 $ map2mat Square 0 tiSize
+                 $ toIndex ti theMatrix
    links = round (let n :: Double = fromIntegral tiSize in n * log n)
    distanceMap = Map.fromList
                $ List.take links
                $ List.sortOn snd
                $ Map.toList
+               $ edgesFilter
                $ Map.filter (> threshold)
                $ mat2map similarities
+doDistanceMap Conditional _threshold myCooc = (distanceMap, toIndex ti myCooc', ti)
+  where
+    myCooc' = Map.fromList $ HashMap.toList myCooc
+    (ti, _it) = createIndices myCooc'
+    -- tiSize = Map.size ti
+    -- links = round (let n :: Double = fromIntegral tiSize in n * log n)
+    distanceMap = toIndex ti
+                $ Map.fromList
+             -- $ List.take links
+             -- $ List.sortOn snd
+                $ HashMap.toList
+             -- $ HashMap.filter (> threshold)
+                $ conditional myCooc
------------------------------------------------------------------------
------------------------------------------------------------------------
-data ClustersParams = ClustersParams { bridgness :: Double
-                                     , louvain :: Text
-                                     } deriving (Show)
-clustersParams :: Int -> ClustersParams
-clustersParams x = ClustersParams (fromIntegral x) "0.00000001" -- y
-{- where
-  y | x < 100 = "0.000001"
-    | x < 350 = "0.000001"
-    | x < 500 = "0.000001"
-    | x < 1000 = "0.000001"
-    | otherwise = "1"
--}
----------------------------------------------------------
-- | From data to Graph
......@@ -187,18 +188,19 @@ data2graph :: ToComId a
-> [a]
-> Graph
data2graph labels occurences bridge conf partitions = Graph { _graph_nodes = nodes
-                                                           , _graph_edges = edges
-                                                           , _graph_metadata = Nothing }
+                                                           , _graph_edges = edges
+                                                           , _graph_metadata = Nothing
+                                                           }
where
community_id_by_node_id = Map.fromList
$ map nodeId2comId partitions
nodes = map (setCoord ForceAtlas labels bridge)
    [ (n, Node { node_size = maybe 0 identity (Map.lookup (n,n) occurences)
               , node_type = Terms -- or Unknown
               , node_id = cs (show n)
               , node_label = l
, node_x_coord = 0
, node_y_coord = 0
, node_attributes =
......@@ -215,15 +217,16 @@ data2graph labels occurences bridge conf partitions = Graph { _graph_nodes = nod
edges = [ Edge { edge_source = cs (show s)
, edge_target = cs (show t)
-                 , edge_weight = d
+                 , edge_weight = weight
, edge_confluence = maybe 0 identity $ Map.lookup (s,t) conf
-- , edge_confluence = maybe (panic "E: data2graph edges") identity $ Map.lookup (s,t) conf
, edge_id = cs (show i)
}
-            | (i, ((s,t), d)) <- zip ([0..]::[Integer])
-                                     (Map.toList bridge)
-            , s /= t, d > 0
-            ]
+            | (i, ((s,t), weight)) <- zip ([0..]::[Integer])
+                                          (Map.toList bridge)
+            , s /= t
+            , weight > 0
+            ]
------------------------------------------------------------------------
......
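In the Distributional branch above, the candidate edges are capped at n * log n of the index size before thresholding and edgesFilter. A quick sketch of that budget as a standalone function (edgeBudget is an illustrative name, not in the commit):

-- Edge budget used in the Distributional branch: n * log n, rounded.
edgeBudget :: Int -> Int
edgeBudget tiSize = round (n * log n)
  where
    n :: Double
    n = fromIntegral tiSize

-- edgeBudget 1000 == 6908, since 1000 * ln 1000 is about 6907.76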
......@@ -17,14 +17,16 @@ These functions are used for Vector.Matrix only.
module Gargantext.Core.Viz.Graph.Utils
where
import Data.Map (Map)
import Data.Matrix hiding (identity)
import Data.Vector (Vector)
-import qualified Data.Vector as V
import qualified Data.List as L
import qualified Data.Map as Map
import Gargantext.Prelude
+import Data.List (unzip)
+import qualified Data.Vector as V
+import Data.Maybe (catMaybes)
+import qualified Data.Set as Set
------------------------------------------------------------------------
-- | Some utils to build the matrix from cooccurrence results
......@@ -63,8 +65,35 @@ toListsWithIndex m = concat' $ zip [1..] $ map (\c -> zip [1..] c) $ toLists m
concat' :: [(Int, [(Int, a)])] -> [((Int, Int), a)]
concat' xs = L.concat $ map (\(x, ys) -> map (\(y, a) -> ((x,y), a)) ys ) xs
------------------------------------------------------------------------
+-- Utils to manage Graphs
+edgesFilter :: (Ord a, Ord b) => Map (a,a) b -> Map (a,a) b
+edgesFilter m = Map.fromList $ catMaybes results
+  where
+    results = [ let
+                  ij = Map.lookup (i,j) m
+                  ji = Map.lookup (j,i) m
+                  in getMax (i,j) ij ji
+              | i <- keys
+              , j <- keys
+              , i < j
+              ]
+    keys = Set.toList $ Set.fromList (x <> y)
+    (x,y) = unzip $ Map.keys m
+getMax :: Ord b
+       => (a,a)
+       -> Maybe b
+       -> Maybe b
+       -> Maybe ((a,a), b)
+getMax (i,j) (Just d) Nothing  = Just ((i,j), d)
+getMax (i,j) Nothing  (Just d) = Just ((j,i), d)
+getMax ij (Just di) (Just dj) = if di >= dj then getMax ij (Just di) Nothing
+                                            else getMax ij Nothing (Just dj)
+getMax _ _ _ = Nothing
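edgesFilter collapses a directed edge map to one entry per unordered pair, and getMax keeps whichever direction carries the larger weight, with ties going to (i,j) because of the >= comparison. A small usage sketch, assuming the module above is in scope:

import qualified Data.Map as Map
import Gargantext.Core.Viz.Graph.Utils (edgesFilter)

-- (1,2) beats (2,1) because 5 >= 3; (3,2) has no opposite direction,
-- so it survives under its original key.
demo :: Map.Map (Int, Int) Double
demo = edgesFilter $ Map.fromList [((1,2),5), ((2,1),3), ((3,2),7)]
-- => fromList [((1,2),5.0),((3,2),7.0)]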
resolver:
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/18/18.yaml
flags: {}
extra-package-dbs: []
skip-ghc-check: true
......