Commit 9e3e9e0f authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TextFlow] MapList building, score needs normalization

parent 48eb263b
...@@ -17,7 +17,6 @@ module Main where ...@@ -17,7 +17,6 @@ module Main where
import Data.Text (Text) import Data.Text (Text)
import Data.Either (Either(..)) import Data.Either (Either(..))
import Data.Maybe (Maybe(..))
import Gargantext.API.Admin.Settings (withDevEnv, runCmdDev) import Gargantext.API.Admin.Settings (withDevEnv, runCmdDev)
import Gargantext.API.Prelude (GargError) import Gargantext.API.Prelude (GargError)
import Gargantext.API.Node () -- instances only import Gargantext.API.Node () -- instances only
...@@ -29,7 +28,6 @@ import Gargantext.Database.Admin.Config (userMaster, corpusMasterName) ...@@ -29,7 +28,6 @@ import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Admin.Types.Node import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Admin.Trigger.Init (initFirstTriggers, initLastTriggers) import Gargantext.Database.Admin.Trigger.Init (initFirstTriggers, initLastTriggers)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataCorpus) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataCorpus)
import Gargantext.Database.Admin.Types.Node (CorpusId, RootId, ListId)
import Gargantext.Database.Prelude (Cmd, ) import Gargantext.Database.Prelude (Cmd, )
import Gargantext.Prelude import Gargantext.Prelude
import System.Environment (getArgs) import System.Environment (getArgs)
......
...@@ -968,9 +968,9 @@ putListNgrams' :: (HasInvalidError err, RepoCmdM env err m) ...@@ -968,9 +968,9 @@ putListNgrams' :: (HasInvalidError err, RepoCmdM env err m)
-> Map NgramsTerm NgramsRepoElement -> Map NgramsTerm NgramsRepoElement
-> m () -> m ()
putListNgrams' nodeId ngramsType ns = do putListNgrams' nodeId ngramsType ns = do
printDebug "[putListNgrams'] nodeId" nodeId -- printDebug "[putListNgrams'] nodeId" nodeId
printDebug "[putListNgrams'] ngramsType" ngramsType -- printDebug "[putListNgrams'] ngramsType" ngramsType
printDebug "[putListNgrams'] ns" ns -- printDebug "[putListNgrams'] ns" ns
let p1 = NgramsTablePatch . PM.fromMap $ NgramsReplace Nothing . Just <$> ns let p1 = NgramsTablePatch . PM.fromMap $ NgramsReplace Nothing . Just <$> ns
(p0, p0_validity) = PM.singleton nodeId p1 (p0, p0_validity) = PM.singleton nodeId p1
......
...@@ -14,9 +14,9 @@ Portability : POSIX ...@@ -14,9 +14,9 @@ Portability : POSIX
module Gargantext.Core.Text.List module Gargantext.Core.Text.List
where where
-- import Data.Either (partitionEithers, Either(..)) import Control.Lens (makeLenses)
import Control.Lens (makeLenses, set) import Data.Maybe (fromMaybe, catMaybes)
import Data.Maybe (fromMaybe) import Data.Ord (Down(..))
import Data.Map (Map) import Data.Map (Map)
import Data.Set (Set) import Data.Set (Set)
import Data.Text (Text) import Data.Text (Text)
...@@ -29,9 +29,9 @@ import qualified Data.Text as Text ...@@ -29,9 +29,9 @@ import qualified Data.Text as Text
import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList) import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
-- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..)) -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..)) import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Database.Admin.Types.Node (NodeId) import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Core.Text.Metrics (scored') import Gargantext.Core.Text.Metrics (scored', Scored(..))
import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser) import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf) import Gargantext.Database.Action.Metrics.TFICF (getTficf)
import Gargantext.Database.Query.Table.Node (defaultList) import Gargantext.Database.Query.Table.Node (defaultList)
...@@ -117,12 +117,14 @@ buildNgramsTermsList l n m s uCid mCid = do ...@@ -117,12 +117,14 @@ buildNgramsTermsList l n m s uCid mCid = do
-- Computing global speGen score -- Computing global speGen score
allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
-- printDebug "head candidates" (List.take 10 $ allTerms) -- printDebug "head candidates" (List.take 10 $ allTerms)
-- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms) -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
-- First remove stops terms -- First remove stops terms
let let
(stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms -- stopTerms ignored for now (need to be tagged already)
(_stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms
-- Grouping the ngrams and keeping the maximum score for label -- Grouping the ngrams and keeping the maximum score for label
let grouped = groupStems' let grouped = groupStems'
...@@ -134,22 +136,29 @@ buildNgramsTermsList l n m s uCid mCid = do ...@@ -134,22 +136,29 @@ buildNgramsTermsList l n m s uCid mCid = do
(groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
-- printDebug "groupedMult" groupedMult
-- splitting monterms and multiterms to take proportional candidates -- splitting monterms and multiterms to take proportional candidates
let let
listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
monoSizeGlobal = 0.6 :: Double monoSize = 0.4 :: Double
multSizeGlobal = 1 - monoSizeGlobal multSize = 1 - monoSize
splitAt n ns = List.splitAt (round $ n * listSizeGlobal) $ List.sort $ Map.elems ns splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
(groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
(groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
(groupedMonoHead, groupedMonoTail) = splitAt monoSizeGlobal groupedMono printDebug "groupedMonoHead" (List.length groupedMonoHead)
(groupedMultHead, groupedMultTail) = splitAt multSizeGlobal groupedMult printDebug "groupedMonoTail" (List.length groupedMonoHead)
printDebug "groupedMultHead" (List.length groupedMultHead)
printDebug "groupedMultTail" (List.length groupedMultTail)
let
-- Get Local Scores now for selected grouped ngrams -- Get Local Scores now for selected grouped ngrams
selectedTerms = Set.toList $ List.foldl' selectedTerms = Set.toList $ List.foldl'
(\set (GroupedText _ l _ g _ _ _ ) -> Set.union set (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
$ Set.union g $ Set.union g
$ Set.singleton l $ Set.singleton l'
) )
Set.empty Set.empty
(groupedMonoHead <> groupedMultHead) (groupedMonoHead <> groupedMultHead)
...@@ -176,50 +185,84 @@ buildNgramsTermsList l n m s uCid mCid = do ...@@ -176,50 +185,84 @@ buildNgramsTermsList l n m s uCid mCid = do
$ Map.keys mapTextDocIds $ Map.keys mapTextDocIds
-- compute cooccurrences -- compute cooccurrences
mapCooc = Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2) mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
| (t1, s1) <- mapStemNodeIds | (t1, s1) <- mapStemNodeIds
, (t2, s2) <- mapStemNodeIds , (t2, s2) <- mapStemNodeIds
, t1 /= t2 -- Null Diagonal
] ]
where where
mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
-- printDebug "mapCooc" mapCooc
let
-- computing scores -- computing scores
scores = scored' mapCooc mapScores f = Map.fromList $ map (\(Scored t g s') -> (t, f (g,s'))) $ scored' mapCooc
-- dilate scores
groupsWithScores = catMaybes
$ map (\(stem, g)
-- sort / filter -> case Map.lookup stem mapScores' of
Nothing -> Nothing
Just s' -> Just $ g { _gt_score = s'}
) $ Map.toList contextsAdded
where
mapScores' = mapScores adapt1 -- identity
adapt1 (s1,s2) = (log' 5 s1, log' 2 s2)
log' n' x = 1 + (if x <= 0 then 0 else log $ (10^(n'::Int)) * x)
-- adapt2 TOCHECK with DC
-- printDebug "groupsWithScores" groupsWithScores
let let
(mono, multi) = List.partition (\t -> (size . fst) t < 2) candidateTerms -- sort / partition / split
(monoHead , monoTail ) = List.splitAt (round $ 0.60 * listSizeGlobal) mono -- filter mono/multi again
(multiHead, multiTail) = List.splitAt (round $ 0.40 * listSizeGlobal) multi (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
-- filter with max score
partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
-- Computing local speGen score
listSizeLocal = 350 :: Double
-- Final Step building the Typed list
termList = (map (toGargList $ Just StopTerm) stopTerms)
<> (map (toGargList $ Just MapTerm) (monoHead <> multiHead))
<> (map (toGargList $ Just CandidateTerm) (monoTail <> multiTail))
(monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
(multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
-- splitAt
let
listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
inclSize = 0.4 :: Double
exclSize = 1 - inclSize
splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
(monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
(monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
(multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
(multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
ngs = List.concat
$ map toNgramsElement
$ groupStems
$ map (\(listType, (t,d)) -> let stem = ngramsGroup l n m t
in ( stem
, GroupedText listType t d Set.empty (size t) stem Set.empty
)
) termList
pure $ Map.fromList [(NgramsTerms, ngs)] -- Final Step building the Typed list
-- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
termListHead =
(map (\g -> g { _gt_listType = Just MapTerm} ) ( monoScoredInclHead
<> monoScoredExclHead
<> multScoredInclHead
<> multScoredExclHead
)
)
<> (map (\g -> g { _gt_listType = Just CandidateTerm }) ( monoScoredInclTail
<> monoScoredExclTail
<> multScoredInclTail
<> multScoredExclTail
)
)
termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
-- printDebug "monoScoredInclHead" monoScoredInclHead
-- printDebug "monoScoredExclHead" monoScoredExclTail
--
printDebug "multScoredInclHead" multScoredInclHead
printDebug "multScoredExclTail" multScoredExclTail
pure $ Map.fromList [(NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
<> (List.concat $ map toNgramsElement $ termListTail)
)
]
groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double] groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
groupStems = Map.elems . groupStems' groupStems = Map.elems . groupStems'
...@@ -239,7 +282,7 @@ groupStems' = Map.fromListWith grouping ...@@ -239,7 +282,7 @@ groupStems' = Map.fromListWith grouping
toNgramsElement :: GroupedText Double -> [NgramsElement] toNgramsElement :: GroupedText a -> [NgramsElement]
toNgramsElement (GroupedText listType label _ setNgrams _ _ _) = toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
[parentElem] <> childrenElems [parentElem] <> childrenElems
where where
...@@ -278,7 +321,8 @@ data GroupedText score = ...@@ -278,7 +321,8 @@ data GroupedText score =
, _gt_stem :: !Stem , _gt_stem :: !Stem
, _gt_nodes :: !(Set NodeId) , _gt_nodes :: !(Set NodeId)
} }
instance Show score => Show (GroupedText score) where
show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s
instance (Eq a) => Eq (GroupedText a) where instance (Eq a) => Eq (GroupedText a) where
(==) (GroupedText _ _ score1 _ _ _ _) (==) (GroupedText _ _ score1 _ _ _ _)
......
...@@ -18,18 +18,13 @@ module Gargantext.Core.Text.Metrics ...@@ -18,18 +18,13 @@ module Gargantext.Core.Text.Metrics
--import Data.Array.Accelerate ((:.)(..), Z(..)) --import Data.Array.Accelerate ((:.)(..), Z(..))
--import Math.KMeans (kmeans, euclidSq, elements) --import Math.KMeans (kmeans, euclidSq, elements)
--import GHC.Float (exp)
import Data.Tuple.Extra (both)
import Data.Map (Map) import Data.Map (Map)
import Data.List.Extra (sortOn)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Viz.Graph.Distances.Matrice import Gargantext.Core.Viz.Graph.Distances.Matrice
import Gargantext.Core.Viz.Graph.Index import Gargantext.Core.Viz.Graph.Index
import Gargantext.Core.Statistics (pcaReduceTo, Dimension(..)) import Gargantext.Core.Statistics (pcaReduceTo, Dimension(..))
import qualified Data.Array.Accelerate as DAA import qualified Data.Array.Accelerate as DAA
import qualified Data.Array.Accelerate.Interpreter as DAA import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.List as List
import qualified Data.Map as Map import qualified Data.Map as Map
import qualified Data.Vector.Storable as Vec import qualified Data.Vector.Storable as Vec
...@@ -37,14 +32,6 @@ import qualified Data.Vector.Storable as Vec ...@@ -37,14 +32,6 @@ import qualified Data.Vector.Storable as Vec
type MapListSize = Int type MapListSize = Int
type InclusionSize = Int type InclusionSize = Int
{-
toScored' :: Ord t => [Map t (Vec.Vector Double)] -> [Scored t]
toScored' = map2scored
. (pcaReduceTo (Dimension 2))
. (Map.filter (\v -> Vec.length v > 1))
. (Map.unionsWith (<>))
-}
scored :: Ord t => Map (t,t) Int -> [Scored t] scored :: Ord t => Map (t,t) Int -> [Scored t]
scored = map2scored . (pcaReduceTo (Dimension 2)) . scored2map scored = map2scored . (pcaReduceTo (Dimension 2)) . scored2map
where where
...@@ -61,6 +48,7 @@ data Scored ts = Scored ...@@ -61,6 +48,7 @@ data Scored ts = Scored
, _scored_speExc :: !SpecificityExclusion , _scored_speExc :: !SpecificityExclusion
} deriving (Show) } deriving (Show)
localMetrics' :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double) localMetrics' :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
localMetrics' m = Map.fromList $ zipWith (\(_,t) (inc,spe) -> (t, Vec.fromList [inc,spe])) localMetrics' m = Map.fromList $ zipWith (\(_,t) (inc,spe) -> (t, Vec.fromList [inc,spe]))
(Map.toList fi) (Map.toList fi)
...@@ -85,39 +73,3 @@ scored' m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (Map.toList fi) score ...@@ -85,39 +73,3 @@ scored' m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (Map.toList fi) score
$ DAA.zip (DAA.use is) (DAA.use ss) $ DAA.zip (DAA.use is) (DAA.use ss)
takeScored :: Ord t => MapListSize -> InclusionSize -> Map (t,t) Int -> ([t],[t])
takeScored listSize incSize = both (map _scored_terms)
. takeLinear listSize incSize _scored_genInc
_scored_speExc
. scored
-- | Filter Scored data
-- >>> takeLinear 2 3 fst snd $ Prelude.zip ([1..10] :: [Int]) (reverse $ [1..10] :: [Int])
-- [(3,8),(6,5)]
takeLinear :: (Ord b1, Ord b2)
=> MapListSize -> InclusionSize
-> (a -> b2) -> (a -> b1) -> [a] -> ([a],[a])
takeLinear mls incSize speGen incExc = (List.splitAt mls)
. List.concat
. map (take $ round
$ (fromIntegral mls :: Double)
/ (fromIntegral incSize :: Double)
)
. map (sortOn speGen)
. splitEvery incSize
. take 5000
. takePercent (0.70)
. sortOn incExc
takePercent :: Double -> [a] -> [a]
takePercent l xs = List.take l' xs
where
l' = round $ l * (fromIntegral $ List.length xs)
splitTake :: (Int, a -> Bool) -> (Int, a -> Bool) -> [a] -> ([a], [a])
splitTake (a, af) (b, bf) xs = (mpa <> mpb, ca <> cb)
where
(mpa, ca) = List.splitAt a $ List.filter af xs
(mpb, cb) = List.splitAt b $ List.filter bf xs
...@@ -46,7 +46,7 @@ getNgramsCooc :: (FlowCmdM env err m) ...@@ -46,7 +46,7 @@ getNgramsCooc :: (FlowCmdM env err m)
) )
getNgramsCooc cId maybeListId tabType maybeLimit = do getNgramsCooc cId maybeListId tabType maybeLimit = do
(ngs', ngs) <- getNgrams cId maybeListId tabType (ngs', ngs) <- getNgrams cId maybeListId tabType
let let
take' Nothing xs = xs take' Nothing xs = xs
take' (Just n) xs = take n xs take' (Just n) xs = take n xs
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment