Commit 97edf05f authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] group ngrams, connected (testing now)

parent bb469f39
Pipeline #1354 failed with stage
......@@ -37,9 +37,10 @@ import Gargantext.Database.Action.Metrics.TFICF (getTficf)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.List as List
......@@ -62,12 +63,12 @@ buildNgramsLists :: ( RepoCmdM env err m
, HasTreeError err
, HasNodeError err
)
=> User
-> GroupParams
=> GroupParams
-> User
-> UserCorpusId
-> MasterCorpusId
-> m (Map NgramsType [NgramsElement])
buildNgramsLists user gp uCid mCid = do
buildNgramsLists gp user uCid mCid = do
ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
[ (Authors , MapListSize 9)
......@@ -132,6 +133,20 @@ buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
)]
getGroupParams :: ( HasNodeError err
, CmdM env err m
, RepoCmdM env err m
, HasTreeError err
)
=> GroupParams -> Set Ngrams -> m GroupParams
getGroupParams gp@(GroupWithPosTag l a _m) ng = do
hashMap <- HashMap.fromList <$> selectLems l a (Set.toList ng)
pure $ over gwl_map (\x -> x <> hashMap) gp
getGroupParams gp _ = pure gp
-- TODO use ListIds
buildNgramsTermsList :: ( HasNodeError err
, CmdM env err m
......@@ -160,7 +175,7 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
let socialLists_Stemmed = addScoreStem groupParams (HashMap.keysSet allTerms) socialLists
printDebug "socialLists_Stemmed" socialLists_Stemmed
let groupedWithList = toGroupedTree {- groupParams -} socialLists_Stemmed allTerms
let groupedWithList = toGroupedTree socialLists_Stemmed allTerms
(stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
$ view flc_scores groupedWithList
......
......@@ -17,6 +17,7 @@ Portability : POSIX
module Gargantext.Core.Text.List.Group.WithStem
where
import Control.Lens (makeLenses)
import Data.HashMap.Strict (HashMap)
import Data.HashSet (HashSet)
import Data.Map (Map)
......@@ -53,6 +54,7 @@ data StopSize = StopSize {unStopSize :: !Int}
-- discussed. Main purpose of this is offering
-- a first grouping option to user and get some
-- enriched data to better learn and improve that algo
-- | Lenses instances at the end of this file
data GroupParams = GroupParams { unGroupParams_lang :: !Lang
, unGroupParams_len :: !Int
, unGroupParams_limit :: !Int
......@@ -124,3 +126,5 @@ toNgramsPatch children = NgramsPatch children' Patch.Keep
$ PatchMap.fromList
$ List.zip children (List.cycle [addPatch])
-- | Instances
makeLenses ''GroupParams
......@@ -151,12 +151,12 @@ insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId)
insertExtractedNgrams ngs = do
let (s, e) = List.partition isSimpleNgrams ngs
m1 <- insertNgrams (map unSimpleNgrams s)
printDebug "others" m1
--printDebug "others" m1
m2 <- insertNgramsPostag (map unEnrichedNgrams e)
printDebug "terms" m2
--printDebug "terms" m2
let result = HashMap.unions [m1, m2]
let result = HashMap.union m1 m2
pure result
isSimpleNgrams :: ExtractedNgrams -> Bool
......
......@@ -68,7 +68,7 @@ import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
import Gargantext.Core.Text.List.Group.WithStem (StopSize(..), GroupParams(..))
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.Terms
......@@ -231,7 +231,11 @@ flowCorpusUser l user corpusName ctype ids = do
-- User List Flow
(masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
ngs <- buildNgramsLists user (GroupParams l 2 3 (StopSize 3)) userCorpusId masterCorpusId
-- let gp = (GroupParams l 2 3 (StopSize 3))
let gp = GroupWithPosTag l CoreNLP HashMap.empty
ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
_userListId <- flowList_DbRepo listId ngs
_mastListId <- getOrMkList masterCorpusId masterUserId
-- _ <- insertOccsUpdates userCorpusId mastListId
......
......@@ -16,7 +16,7 @@ Portability : POSIX
module Gargantext.Database.Query.Table.NgramsPostag
where
import Control.Lens (view)
import Control.Lens (view, (^.))
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.Text (Text)
......@@ -25,6 +25,7 @@ import Gargantext.Core.Types
import Gargantext.Database.Prelude (Cmd, runPGSQuery)
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.Prelude
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Types
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
......@@ -38,11 +39,10 @@ data NgramsPostag = NgramsPostag { _np_lang :: Lang
, _np_lem :: Ngrams
}
deriving (Eq, Ord, Generic, Show)
makeLenses ''NgramsPostag
instance Hashable NgramsPostag
type NgramsPostagInsert = ( Int
, Int
, Text
......@@ -64,12 +64,25 @@ toInsert (NgramsPostag l a p form lem) =
)
insertNgramsPostag :: [NgramsPostag] -> Cmd err (HashMap Text NgramsId)
insertNgramsPostag ns =
if List.null ns
insertNgramsPostag xs =
if List.null xs
then pure HashMap.empty
else HashMap.fromList
<$> map (\(Indexed t i) -> (t,i))
<$> insertNgramsPostag' (map toInsert ns)
else do
-- We do not store the lem if it equals to its self form
let
(ns, nps) =
List.partition (\np -> np ^. np_form . ngramsTerms
/= np ^. np_lem . ngramsTerms
) xs
ns' <- insertNgrams (map (view np_form) ns)
nps' <- HashMap.fromList
<$> map (\(Indexed t i) -> (t,i))
<$> insertNgramsPostag' (map toInsert ns)
pure $ HashMap.union ns' nps'
insertNgramsPostag' :: [NgramsPostagInsert] -> Cmd err [Indexed Text Int]
insertNgramsPostag' ns = runPGSQuery queryInsertNgramsPostag (PGS.Only $ Values fields ns)
......@@ -119,13 +132,15 @@ queryInsertNgramsPostag = [sql|
)
------------------------------------------------
------------------------------------------------
, ins_postag AS ( INSERT INTO ngrams_postag (lang_id, algo_id, postag, ngrams_id, lemm_id,score)
SELECT ir.lang_id, ir.algo_id, ir.postag, form.id, lem.id, 1
, ins_postag AS (
INSERT INTO ngrams_postag (lang_id, algo_id, postag, ngrams_id, lemm_id,score)
SELECT ir.lang_id, ir.algo_id, ir.postag, form.id, lem.id, count(*) as s
FROM input_rows ir
JOIN ins_form_ret form ON form.terms = ir.form
JOIN ins_lem_ret lem ON lem.terms = ir.lem
GROUP BY ir.lang_id, ir.algo_id, ir.postag, form.id, lem.id
ORDER BY s DESC
LIMIT 1
ON CONFLICT (lang_id,algo_id,postag,ngrams_id,lemm_id)
DO UPDATE SET score = ngrams_postag.score + 1
)
......@@ -135,9 +150,10 @@ SELECT terms,id FROM ins_form_ret
|]
selectLems :: [Ngrams] -> Cmd err [(Form, Lem)]
selectLems ns = runPGSQuery querySelectLems (PGS.Only $ Values fields (map toRow ns))
-- TODO add lang and postag algo
-- TODO remove when form == lem in insert
selectLems :: Lang -> PosTagAlgo -> [Ngrams] -> Cmd err [(Form, Lem)]
selectLems l a ns = runPGSQuery querySelectLems (PGS.Only $ Values fields (map toRow ns))
where
fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
......
......@@ -180,7 +180,8 @@ instance Functor NgramsT where
-----------------------------------------------------------------------
withMap :: HashMap Text NgramsId -> Text -> NgramsId
withMap m n = maybe (panic "withMap: should not happen") identity (HashMap.lookup n m)
withMap m n = maybe (panic $ "[G.D.S.Ngrams.withMap] Should not happen" <> (cs $ show n))
identity (HashMap.lookup n m)
indexNgramsT :: HashMap Text NgramsId -> NgramsT Ngrams -> NgramsT (Indexed Int Ngrams)
indexNgramsT = fmap . indexNgramsWith . withMap
......
......@@ -16,21 +16,21 @@ Portability : POSIX
module Gargantext.Prelude
( module Gargantext.Prelude
, module Protolude
, headMay, lastMay
, module GHC.Err.Located
, module Text.Show
, module Text.Read
, cs
, module Data.Maybe
, round
, sortWith
, module Prelude
, MonadBase(..)
, Typeable
, cs
, headMay, lastMay, sortWith
, round
)
where
import Control.Monad.Base (MonadBase(..))
import Data.Set (Set)
import GHC.Exts (sortWith)
import GHC.Err.Located (undefined)
import GHC.Real (round)
......@@ -71,15 +71,16 @@ import Prelude (Enum, Bounded, minBound, maxBound, putStrLn)
-- TODO import functions optimized in Utils.Count
-- import Protolude hiding (head, last, all, any, sum, product, length)
-- import Gargantext.Utils.Count
import qualified Data.List as L hiding (head, sum)
import qualified Control.Monad as M
import qualified Data.Map as M
import Data.Map.Strict (insertWith)
import qualified Data.Vector as V
import Data.String.Conversions (cs)
import Safe (headMay, lastMay, initMay, tailMay)
import Text.Show (Show(), show)
import Text.Read (Read())
import Data.String.Conversions (cs)
import Text.Show (Show(), show)
import qualified Control.Monad as M
import qualified Data.List as L hiding (head, sum)
import qualified Data.Map as M
import qualified Data.Set as Set
import qualified Data.Vector as V
printDebug :: (Show a, MonadBase IO m) => [Char] -> a -> m ()
......@@ -338,3 +339,20 @@ instance Monoid Integer where
instance Semigroup Integer where
(<>) a b = a + b
------------------------------------------------------------------------
hasDuplicates :: Ord a => [a] -> Bool
hasDuplicates = hasDuplicatesWith Set.empty
hasDuplicatesWith :: Ord a => Set a -> [a] -> Bool
hasDuplicatesWith _seen [] =
False -- base case: empty lists never contain duplicates
hasDuplicatesWith seen (x:xs) =
-- If we have seen the current item before, we can short-circuit; otherwise,
-- we'll add it the the set of previously seen items and process the rest of the
-- list against that.
x `Set.member` seen || hasDuplicatesWith (Set.insert x seen) xs
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment