From 97edf05ffa9a59e6a1d56c9faca668ec527e79f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20Delano=C3=AB?= <devel+git@delanoe.org> Date: Tue, 19 Jan 2021 16:00:12 +0100 Subject: [PATCH] [FEAT] group ngrams, connected (testing now) --- src/Gargantext/Core/Text/List.hs | 25 ++++++++-- .../Core/Text/List/Group/WithStem.hs | 4 ++ src/Gargantext/Core/Text/Terms.hs | 6 +-- src/Gargantext/Database/Action/Flow.hs | 8 +++- .../Database/Query/Table/NgramsPostag.hs | 46 +++++++++++++------ src/Gargantext/Database/Schema/Ngrams.hs | 3 +- src/Gargantext/Prelude.hs | 38 +++++++++++---- 7 files changed, 94 insertions(+), 36 deletions(-) diff --git a/src/Gargantext/Core/Text/List.hs b/src/Gargantext/Core/Text/List.hs index dfc4beb1..b4a7daac 100644 --- a/src/Gargantext/Core/Text/List.hs +++ b/src/Gargantext/Core/Text/List.hs @@ -37,9 +37,10 @@ import Gargantext.Database.Action.Metrics.TFICF (getTficf) import Gargantext.Database.Admin.Types.Node (NodeId) import Gargantext.Database.Prelude (CmdM) import Gargantext.Database.Query.Table.Node (defaultList) +import Gargantext.Database.Query.Table.NgramsPostag (selectLems) import Gargantext.Database.Query.Table.Node.Error (HasNodeError()) import Gargantext.Database.Query.Tree.Error (HasTreeError) -import Gargantext.Database.Schema.Ngrams (NgramsType(..)) +import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..)) import Gargantext.Prelude import qualified Data.HashMap.Strict as HashMap import qualified Data.List as List @@ -62,12 +63,12 @@ buildNgramsLists :: ( RepoCmdM env err m , HasTreeError err , HasNodeError err ) - => User - -> GroupParams + => GroupParams + -> User -> UserCorpusId -> MasterCorpusId -> m (Map NgramsType [NgramsElement]) -buildNgramsLists user gp uCid mCid = do +buildNgramsLists gp user uCid mCid = do ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350) othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity) [ (Authors , MapListSize 9) @@ -132,6 +133,20 @@ buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do )] +getGroupParams :: ( HasNodeError err + , CmdM env err m + , RepoCmdM env err m + , HasTreeError err + ) + => GroupParams -> Set Ngrams -> m GroupParams +getGroupParams gp@(GroupWithPosTag l a _m) ng = do + hashMap <- HashMap.fromList <$> selectLems l a (Set.toList ng) + pure $ over gwl_map (\x -> x <> hashMap) gp +getGroupParams gp _ = pure gp + + + + -- TODO use ListIds buildNgramsTermsList :: ( HasNodeError err , CmdM env err m @@ -160,7 +175,7 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do let socialLists_Stemmed = addScoreStem groupParams (HashMap.keysSet allTerms) socialLists printDebug "socialLists_Stemmed" socialLists_Stemmed - let groupedWithList = toGroupedTree {- groupParams -} socialLists_Stemmed allTerms + let groupedWithList = toGroupedTree socialLists_Stemmed allTerms (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList diff --git a/src/Gargantext/Core/Text/List/Group/WithStem.hs b/src/Gargantext/Core/Text/List/Group/WithStem.hs index 1f2923c3..38245270 100644 --- a/src/Gargantext/Core/Text/List/Group/WithStem.hs +++ b/src/Gargantext/Core/Text/List/Group/WithStem.hs @@ -17,6 +17,7 @@ Portability : POSIX module Gargantext.Core.Text.List.Group.WithStem where +import Control.Lens (makeLenses) import Data.HashMap.Strict (HashMap) import Data.HashSet (HashSet) import Data.Map (Map) @@ -53,6 +54,7 @@ data StopSize = StopSize {unStopSize :: !Int} -- discussed. Main purpose of this is offering -- a first grouping option to user and get some -- enriched data to better learn and improve that algo +-- | Lenses instances at the end of this file data GroupParams = GroupParams { unGroupParams_lang :: !Lang , unGroupParams_len :: !Int , unGroupParams_limit :: !Int @@ -124,3 +126,5 @@ toNgramsPatch children = NgramsPatch children' Patch.Keep $ PatchMap.fromList $ List.zip children (List.cycle [addPatch]) +-- | Instances +makeLenses ''GroupParams diff --git a/src/Gargantext/Core/Text/Terms.hs b/src/Gargantext/Core/Text/Terms.hs index 5c6a9a67..7ed5ec85 100644 --- a/src/Gargantext/Core/Text/Terms.hs +++ b/src/Gargantext/Core/Text/Terms.hs @@ -151,12 +151,12 @@ insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId) insertExtractedNgrams ngs = do let (s, e) = List.partition isSimpleNgrams ngs m1 <- insertNgrams (map unSimpleNgrams s) - printDebug "others" m1 + --printDebug "others" m1 m2 <- insertNgramsPostag (map unEnrichedNgrams e) - printDebug "terms" m2 + --printDebug "terms" m2 - let result = HashMap.unions [m1, m2] + let result = HashMap.union m1 m2 pure result isSimpleNgrams :: ExtractedNgrams -> Bool diff --git a/src/Gargantext/Database/Action/Flow.hs b/src/Gargantext/Database/Action/Flow.hs index 14fb8647..6b054a0c 100644 --- a/src/Gargantext/Database/Action/Flow.hs +++ b/src/Gargantext/Database/Action/Flow.hs @@ -68,7 +68,7 @@ import Gargantext.Core.Ext.IMT (toSchoolName) import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile) import Gargantext.Core.Flow.Types import Gargantext.Core.Text -import Gargantext.Core.Text.List.Group.WithStem (StopSize(..), GroupParams(..)) +import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..)) import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat) import Gargantext.Core.Text.List (buildNgramsLists) import Gargantext.Core.Text.Terms @@ -231,7 +231,11 @@ flowCorpusUser l user corpusName ctype ids = do -- User List Flow (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype - ngs <- buildNgramsLists user (GroupParams l 2 3 (StopSize 3)) userCorpusId masterCorpusId + + -- let gp = (GroupParams l 2 3 (StopSize 3)) + let gp = GroupWithPosTag l CoreNLP HashMap.empty + ngs <- buildNgramsLists gp user userCorpusId masterCorpusId + _userListId <- flowList_DbRepo listId ngs _mastListId <- getOrMkList masterCorpusId masterUserId -- _ <- insertOccsUpdates userCorpusId mastListId diff --git a/src/Gargantext/Database/Query/Table/NgramsPostag.hs b/src/Gargantext/Database/Query/Table/NgramsPostag.hs index a51034a2..287dfaa2 100644 --- a/src/Gargantext/Database/Query/Table/NgramsPostag.hs +++ b/src/Gargantext/Database/Query/Table/NgramsPostag.hs @@ -16,7 +16,7 @@ Portability : POSIX module Gargantext.Database.Query.Table.NgramsPostag where -import Control.Lens (view) +import Control.Lens (view, (^.)) import Data.HashMap.Strict (HashMap) import Data.Hashable (Hashable) import Data.Text (Text) @@ -25,6 +25,7 @@ import Gargantext.Core.Types import Gargantext.Database.Prelude (Cmd, runPGSQuery) import Gargantext.Database.Schema.Ngrams import Gargantext.Database.Schema.Prelude +import Gargantext.Database.Query.Table.Ngrams import Gargantext.Database.Types import Gargantext.Prelude import qualified Data.HashMap.Strict as HashMap @@ -38,11 +39,10 @@ data NgramsPostag = NgramsPostag { _np_lang :: Lang , _np_lem :: Ngrams } deriving (Eq, Ord, Generic, Show) - makeLenses ''NgramsPostag - instance Hashable NgramsPostag + type NgramsPostagInsert = ( Int , Int , Text @@ -64,12 +64,25 @@ toInsert (NgramsPostag l a p form lem) = ) insertNgramsPostag :: [NgramsPostag] -> Cmd err (HashMap Text NgramsId) -insertNgramsPostag ns = - if List.null ns +insertNgramsPostag xs = + if List.null xs then pure HashMap.empty - else HashMap.fromList - <$> map (\(Indexed t i) -> (t,i)) - <$> insertNgramsPostag' (map toInsert ns) + else do + -- We do not store the lem if it equals to its self form + let + (ns, nps) = + List.partition (\np -> np ^. np_form . ngramsTerms + /= np ^. np_lem . ngramsTerms + ) xs + + ns' <- insertNgrams (map (view np_form) ns) + + nps' <- HashMap.fromList + <$> map (\(Indexed t i) -> (t,i)) + <$> insertNgramsPostag' (map toInsert ns) + + pure $ HashMap.union ns' nps' + insertNgramsPostag' :: [NgramsPostagInsert] -> Cmd err [Indexed Text Int] insertNgramsPostag' ns = runPGSQuery queryInsertNgramsPostag (PGS.Only $ Values fields ns) @@ -119,13 +132,15 @@ queryInsertNgramsPostag = [sql| ) ------------------------------------------------ ------------------------------------------------ - , ins_postag AS ( INSERT INTO ngrams_postag (lang_id, algo_id, postag, ngrams_id, lemm_id,score) - SELECT ir.lang_id, ir.algo_id, ir.postag, form.id, lem.id, 1 + , ins_postag AS ( + INSERT INTO ngrams_postag (lang_id, algo_id, postag, ngrams_id, lemm_id,score) + SELECT ir.lang_id, ir.algo_id, ir.postag, form.id, lem.id, count(*) as s FROM input_rows ir JOIN ins_form_ret form ON form.terms = ir.form JOIN ins_lem_ret lem ON lem.terms = ir.lem - - + GROUP BY ir.lang_id, ir.algo_id, ir.postag, form.id, lem.id + ORDER BY s DESC + LIMIT 1 ON CONFLICT (lang_id,algo_id,postag,ngrams_id,lemm_id) DO UPDATE SET score = ngrams_postag.score + 1 ) @@ -135,9 +150,10 @@ SELECT terms,id FROM ins_form_ret |] - -selectLems :: [Ngrams] -> Cmd err [(Form, Lem)] -selectLems ns = runPGSQuery querySelectLems (PGS.Only $ Values fields (map toRow ns)) +-- TODO add lang and postag algo +-- TODO remove when form == lem in insert +selectLems :: Lang -> PosTagAlgo -> [Ngrams] -> Cmd err [(Form, Lem)] +selectLems l a ns = runPGSQuery querySelectLems (PGS.Only $ Values fields (map toRow ns)) where fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"] diff --git a/src/Gargantext/Database/Schema/Ngrams.hs b/src/Gargantext/Database/Schema/Ngrams.hs index 38516b60..ac51e514 100644 --- a/src/Gargantext/Database/Schema/Ngrams.hs +++ b/src/Gargantext/Database/Schema/Ngrams.hs @@ -180,7 +180,8 @@ instance Functor NgramsT where ----------------------------------------------------------------------- withMap :: HashMap Text NgramsId -> Text -> NgramsId -withMap m n = maybe (panic "withMap: should not happen") identity (HashMap.lookup n m) +withMap m n = maybe (panic $ "[G.D.S.Ngrams.withMap] Should not happen" <> (cs $ show n)) + identity (HashMap.lookup n m) indexNgramsT :: HashMap Text NgramsId -> NgramsT Ngrams -> NgramsT (Indexed Int Ngrams) indexNgramsT = fmap . indexNgramsWith . withMap diff --git a/src/Gargantext/Prelude.hs b/src/Gargantext/Prelude.hs index 87858f02..441bbf7a 100644 --- a/src/Gargantext/Prelude.hs +++ b/src/Gargantext/Prelude.hs @@ -16,21 +16,21 @@ Portability : POSIX module Gargantext.Prelude ( module Gargantext.Prelude , module Protolude - , headMay, lastMay , module GHC.Err.Located , module Text.Show , module Text.Read - , cs , module Data.Maybe - , round - , sortWith , module Prelude , MonadBase(..) , Typeable + , cs + , headMay, lastMay, sortWith + , round ) where import Control.Monad.Base (MonadBase(..)) +import Data.Set (Set) import GHC.Exts (sortWith) import GHC.Err.Located (undefined) import GHC.Real (round) @@ -71,15 +71,16 @@ import Prelude (Enum, Bounded, minBound, maxBound, putStrLn) -- TODO import functions optimized in Utils.Count -- import Protolude hiding (head, last, all, any, sum, product, length) -- import Gargantext.Utils.Count -import qualified Data.List as L hiding (head, sum) -import qualified Control.Monad as M -import qualified Data.Map as M import Data.Map.Strict (insertWith) -import qualified Data.Vector as V +import Data.String.Conversions (cs) import Safe (headMay, lastMay, initMay, tailMay) -import Text.Show (Show(), show) import Text.Read (Read()) -import Data.String.Conversions (cs) +import Text.Show (Show(), show) +import qualified Control.Monad as M +import qualified Data.List as L hiding (head, sum) +import qualified Data.Map as M +import qualified Data.Set as Set +import qualified Data.Vector as V printDebug :: (Show a, MonadBase IO m) => [Char] -> a -> m () @@ -338,3 +339,20 @@ instance Monoid Integer where instance Semigroup Integer where (<>) a b = a + b + +------------------------------------------------------------------------ + +hasDuplicates :: Ord a => [a] -> Bool +hasDuplicates = hasDuplicatesWith Set.empty + +hasDuplicatesWith :: Ord a => Set a -> [a] -> Bool +hasDuplicatesWith _seen [] = + False -- base case: empty lists never contain duplicates +hasDuplicatesWith seen (x:xs) = + -- If we have seen the current item before, we can short-circuit; otherwise, + -- we'll add it the the set of previously seen items and process the rest of the + -- list against that. + x `Set.member` seen || hasDuplicatesWith (Set.insert x seen) xs + + + -- 2.21.0