Commit 7fd045e8 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TextFlow] SpeGen scores (WIP)

parent 1e877937
Pipeline #1086 failed with stage
...@@ -28,9 +28,10 @@ import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), Ro ...@@ -28,9 +28,10 @@ import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), Ro
-- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..)) -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..)) import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith) import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf) import Gargantext.Database.Action.Metrics.TFICF (getTficf)
import Gargantext.Core.Text.Metrics.TFICF (sortTficf) import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Prelude (Cmd) import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Schema.Ngrams (NgramsType(..))
...@@ -40,25 +41,26 @@ import Gargantext.Core.Text.List.Learn (Model(..)) ...@@ -40,25 +41,26 @@ import Gargantext.Core.Text.List.Learn (Model(..))
-- import Gargantext.Core.Text.Metrics (takeScored) -- import Gargantext.Core.Text.Metrics (takeScored)
data NgramsListBuilder = BuilderStepO { stemSize :: Int data NgramsListBuilder = BuilderStepO { stemSize :: !Int
, stemX :: Int , stemX :: !Int
, stopSize :: Int , stopSize :: !Int
} }
| BuilderStep1 { withModel :: Model } | BuilderStep1 { withModel :: !Model }
| BuilderStepN { withModel :: Model } | BuilderStepN { withModel :: !Model }
| Tficf { nlb_lang :: Lang | Tficf { nlb_lang :: !Lang
, nlb_group1 :: Int , nlb_group1 :: !Int
, nlb_group2 :: Int , nlb_group2 :: !Int
, nlb_stopSize :: StopSize , nlb_stopSize :: !StopSize
, nlb_userCorpusId :: UserCorpusId , nlb_userCorpusId :: !UserCorpusId
, nlb_masterCorpusId :: MasterCorpusId , nlb_masterCorpusId :: !MasterCorpusId
} }
data StopSize = StopSize {unStopSize :: Int} data StopSize = StopSize {unStopSize :: !Int}
-- | TODO improve grouping functions of Authors, Sources, Institutes.. -- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: Lang buildNgramsLists :: HasNodeError err
=> Lang
-> Int -> Int
-> Int -> Int
-> StopSize -> StopSize
...@@ -98,7 +100,9 @@ buildNgramsOthersList uCid groupIt nt = do ...@@ -98,7 +100,9 @@ buildNgramsOthersList uCid groupIt nt = do
] ]
)] )]
buildNgramsTermsList :: Lang -- TODO use ListIds
buildNgramsTermsList :: HasNodeError err
=> Lang
-> Int -> Int
-> Int -> Int
-> StopSize -> StopSize
...@@ -108,8 +112,7 @@ buildNgramsTermsList :: Lang ...@@ -108,8 +112,7 @@ buildNgramsTermsList :: Lang
buildNgramsTermsList l n m s uCid mCid = do buildNgramsTermsList l n m s uCid mCid = do
-- Computing global speGen score -- Computing global speGen score
-- TODO sort is not needed, just take the score allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
allTerms <- sortTficf Up <$> getTficf uCid mCid NgramsTerms
-- printDebug "head candidates" (List.take 10 $ allTerms) -- printDebug "head candidates" (List.take 10 $ allTerms)
-- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms) -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
...@@ -147,8 +150,18 @@ buildNgramsTermsList l n m s uCid mCid = do ...@@ -147,8 +150,18 @@ buildNgramsTermsList l n m s uCid mCid = do
Set.empty Set.empty
(groupedMonoHead <> groupedMultHead) (groupedMonoHead <> groupedMultHead)
-- TO remove (and remove HasNodeError instance)
userListId <- defaultList uCid
masterListId <- defaultList mCid
mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
-- groups Set NodeId
-- compute cooccurrences
-- compute scores
-- sort / filter
let
(mono, multi) = List.partition (\t -> (size . fst) t < 2) candidateTerms (mono, multi) = List.partition (\t -> (size . fst) t < 2) candidateTerms
(monoHead , monoTail ) = List.splitAt (round $ 0.60 * listSizeGlobal) mono (monoHead , monoTail ) = List.splitAt (round $ 0.60 * listSizeGlobal) mono
(multiHead, multiTail) = List.splitAt (round $ 0.40 * listSizeGlobal) multi (multiHead, multiTail) = List.splitAt (round $ 0.40 * listSizeGlobal) multi
...@@ -177,12 +190,12 @@ type Group = Lang -> Int -> Int -> Text -> Text ...@@ -177,12 +190,12 @@ type Group = Lang -> Int -> Int -> Text -> Text
type Stem = Text type Stem = Text
type Label = Text type Label = Text
data GroupedText score = data GroupedText score =
GroupedText { _gt_listType :: Maybe ListType GroupedText { _gt_listType :: !(Maybe ListType)
, _gt_label :: Label , _gt_label :: !Label
, _gt_score :: score , _gt_score :: !score
, _gt_group :: Set Text , _gt_group :: !(Set Text)
, _gt_size :: Int , _gt_size :: !Int
, _gt_stem :: Stem , _gt_stem :: !Stem
} }
instance (Eq a) => Eq (GroupedText a) where instance (Eq a) => Eq (GroupedText a) where
......
...@@ -330,7 +330,7 @@ queryNgramsOnlyByNodeUser' = [sql| ...@@ -330,7 +330,7 @@ queryNgramsOnlyByNodeUser' = [sql|
|] |]
getNgramsByDocOnlyUser :: NodeId getNgramsByDocOnlyUser :: DocId
-> [ListId] -> [ListId]
-> NgramsType -> NgramsType
-> [Text] -> [Text]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment