Commit 4463a799 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TextFlow] MapList Global score that needs local score (WIP)

parent 3bb9fb2f
Pipeline #1085 failed with stage
...@@ -14,6 +14,7 @@ module Gargantext.Core.Text.List ...@@ -14,6 +14,7 @@ module Gargantext.Core.Text.List
where where
-- import Data.Either (partitionEithers, Either(..)) -- import Data.Either (partitionEithers, Either(..))
import Data.Maybe (fromMaybe)
import Data.Map (Map) import Data.Map (Map)
import Data.Set (Set) import Data.Set (Set)
import Data.Text (Text) import Data.Text (Text)
...@@ -105,28 +106,66 @@ buildNgramsTermsList :: Lang ...@@ -105,28 +106,66 @@ buildNgramsTermsList :: Lang
-> MasterCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList l n m s uCid mCid = do buildNgramsTermsList l n m s uCid mCid = do
candidates <- sortTficf Up <$> getTficf uCid mCid NgramsTerms
-- printDebug "head candidates" (List.take 10 $ candidates)
-- printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
-- Computing global speGen score
-- TODO sort is not needed, just take the score
allTerms <- sortTficf Up <$> getTficf uCid mCid NgramsTerms
-- printDebug "head candidates" (List.take 10 $ allTerms)
-- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
-- First remove stops terms
let
(stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms
-- Grouping the ngrams and keeping the maximum score for label
let grouped = groupStems'
$ map (\(t,d) -> let stem = ngramsGroup l n m t
in ( stem
, GroupedText Nothing t d Set.empty (size t) stem
)
) candidateTerms
(groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
-- splitting monterms and multiterms to take proportional candidates
let let
listSize = 400 :: Double listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
(candidatesHead, candidatesTail0) = List.splitAt 3 candidates monoSizeGlobal = 0.6 :: Double
multSizeGlobal = 1 - monoSizeGlobal
splitAt n ns = List.splitAt (round $ n * listSizeGlobal) $ List.sort $ Map.elems ns
(groupedMonoHead, groupedMonoTail) = splitAt monoSizeGlobal groupedMono
(groupedMultHead, groupedMultTail) = splitAt multSizeGlobal groupedMult
-- Get Local Scores now for selected grouped ngrams
selectedTerms = Set.toList $ List.foldl'
(\set (GroupedText _ l _ g _ _) -> Set.union set $ Set.union g $ Set.singleton l)
Set.empty
(groupedMonoHead <> groupedMultHead)
(mono, multi) = List.partition (\t -> (size . fst) t < 2) candidatesTail0
(monoHead , monoTail ) = List.splitAt (round $ 0.60 * listSize) mono
(multiHead, multiTail) = List.splitAt (round $ 0.40 * listSize) multi
termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead) (mono, multi) = List.partition (\t -> (size . fst) t < 2) candidateTerms
<> (map (toGargList ((isStopTerm s) . fst) MapTerm) (monoHead <> multiHead)) (monoHead , monoTail ) = List.splitAt (round $ 0.60 * listSizeGlobal) mono
<> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) (monoTail <> multiTail)) (multiHead, multiTail) = List.splitAt (round $ 0.40 * listSizeGlobal) multi
-- Computing local speGen score
listSizeLocal = 350 :: Double
-- Final Step building the Typed list
termList = (map (toGargList $ Just StopTerm) stopTerms)
<> (map (toGargList $ Just MapTerm) (monoHead <> multiHead))
<> (map (toGargList $ Just CandidateTerm) (monoTail <> multiTail))
ngs = List.concat ngs = List.concat
$ map toNgramsElement $ map toNgramsElement
$ groupStems $ groupStems
$ map (\(listType, (t,d)) -> ( ngramsGroup l n m t $ map (\(listType, (t,d)) -> let stem = ngramsGroup l n m t
, GroupedText listType t d Set.empty in ( stem
) , GroupedText listType t d Set.empty (size t) stem
)
) termList ) termList
pure $ Map.fromList [(NgramsTerms, ngs)] pure $ Map.fromList [(NgramsTerms, ngs)]
...@@ -134,42 +173,58 @@ buildNgramsTermsList l n m s uCid mCid = do ...@@ -134,42 +173,58 @@ buildNgramsTermsList l n m s uCid mCid = do
type Group = Lang -> Int -> Int -> Text -> Text type Group = Lang -> Int -> Int -> Text -> Text
type Stem = Text type Stem = Text
type Label = Text type Label = Text
data GroupedText = GroupedText { _gt_listType :: ListType data GroupedText score =
, _gt_label :: Label GroupedText { _gt_listType :: Maybe ListType
, _gt_score :: Double , _gt_label :: Label
, _gt_group :: Set Text , _gt_score :: score
} , _gt_group :: Set Text
groupStems :: [(Stem, GroupedText)] -> [GroupedText] , _gt_size :: Int
groupStems = Map.elems . Map.fromListWith grouping , _gt_stem :: Stem
}
instance (Eq a) => Eq (GroupedText a) where
(==) (GroupedText _ _ score1 _ _ _)
(GroupedText _ _ score2 _ _ _) = (==) score1 score2
instance (Eq a, Ord a) => Ord (GroupedText a) where
compare (GroupedText _ _ score1 _ _ _)
(GroupedText _ _ score2 _ _ _) = compare score1 score2
groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
groupStems = Map.elems . groupStems'
groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
groupStems' = Map.fromListWith grouping
where where
grouping (GroupedText lt1 label1 score1 group1) grouping (GroupedText lt1 label1 score1 group1 s1 stem1)
(GroupedText lt2 label2 score2 group2) (GroupedText lt2 label2 score2 group2 s2 stem2)
| score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1
| otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2
where where
lt = lt1 <> lt2 lt = lt1 <> lt2
gr = Set.union group1 group2 gr = Set.union group1 group2
toNgramsElement :: GroupedText -> [NgramsElement]
toNgramsElement (GroupedText listType label _ setNgrams) =
toNgramsElement :: GroupedText Double -> [NgramsElement]
toNgramsElement (GroupedText listType label _ setNgrams _ _) =
[parentElem] <> childrenElems [parentElem] <> childrenElems
where where
parent = label parent = label
children = Set.toList setNgrams children = Set.toList setNgrams
parentElem = mkNgramsElement (NgramsTerm parent) parentElem = mkNgramsElement (NgramsTerm parent)
listType (fromMaybe CandidateTerm listType)
Nothing Nothing
(mSetFromList (NgramsTerm <$> children)) (mSetFromList (NgramsTerm <$> children))
childrenElems = map (\t -> mkNgramsElement t listType childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
(Just $ RootParent (NgramsTerm parent) (NgramsTerm parent)) (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
(mSetFromList []) (mSetFromList [])
) (NgramsTerm <$> children) ) (NgramsTerm <$> children)
toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b) toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
toGargList isStop l n = case isStop n of toGargList l n = (l,n)
True -> (StopTerm, n)
False -> (l, n)
isStopTerm :: StopSize -> Text -> Bool isStopTerm :: StopSize -> Text -> Bool
......
{-|
Module : Gargantext.Core.Text.Metrics.SpeGen.IncExc
Description :
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Metrics.SpeGen.IncExc
where
{-
data IncExc = Inclusion { unInclusion :: !Double }
| Exclusion { unExclusion :: !Double }
-}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment