From 97edf05ffa9a59e6a1d56c9faca668ec527e79f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexandre=20Delano=C3=AB?= <devel+git@delanoe.org>
Date: Tue, 19 Jan 2021 16:00:12 +0100
Subject: [PATCH] [FEAT] group ngrams, connected (testing now)

---
 src/Gargantext/Core/Text/List.hs              | 25 ++++++++--
 .../Core/Text/List/Group/WithStem.hs          |  4 ++
 src/Gargantext/Core/Text/Terms.hs             |  6 +--
 src/Gargantext/Database/Action/Flow.hs        |  8 +++-
 .../Database/Query/Table/NgramsPostag.hs      | 46 +++++++++++++------
 src/Gargantext/Database/Schema/Ngrams.hs      |  3 +-
 src/Gargantext/Prelude.hs                     | 38 +++++++++++----
 7 files changed, 94 insertions(+), 36 deletions(-)

diff --git a/src/Gargantext/Core/Text/List.hs b/src/Gargantext/Core/Text/List.hs
index dfc4beb1..b4a7daac 100644
--- a/src/Gargantext/Core/Text/List.hs
+++ b/src/Gargantext/Core/Text/List.hs
@@ -37,9 +37,10 @@ import Gargantext.Database.Action.Metrics.TFICF (getTficf)
 import Gargantext.Database.Admin.Types.Node (NodeId)
 import Gargantext.Database.Prelude (CmdM)
 import Gargantext.Database.Query.Table.Node (defaultList)
+import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
 import Gargantext.Database.Query.Tree.Error (HasTreeError)
-import Gargantext.Database.Schema.Ngrams (NgramsType(..))
+import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
 import Gargantext.Prelude
 import qualified Data.HashMap.Strict as HashMap
 import qualified Data.List as List
@@ -62,12 +63,12 @@ buildNgramsLists :: ( RepoCmdM env err m
                     , HasTreeError err
                     , HasNodeError err
                     )
-                 => User
-                 -> GroupParams
+                 => GroupParams
+                 -> User
                  -> UserCorpusId
                  -> MasterCorpusId
                  -> m (Map NgramsType [NgramsElement])
-buildNgramsLists user gp uCid mCid = do
+buildNgramsLists gp user uCid mCid = do
   ngTerms     <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
   othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
                       [ (Authors   , MapListSize 9)
@@ -132,6 +133,20 @@ buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
                           )]
 
 
+getGroupParams :: ( HasNodeError err
+                  , CmdM     env err m
+                  , RepoCmdM env err m
+                  , HasTreeError err
+                  )
+               => GroupParams -> Set Ngrams -> m GroupParams
+getGroupParams gp@(GroupWithPosTag l a _m) ng = do
+  hashMap <- HashMap.fromList <$> selectLems l a (Set.toList ng)
+  pure $ over gwl_map (\x -> x <> hashMap) gp
+getGroupParams gp _ = pure gp
+
+
+
+
 -- TODO use ListIds
 buildNgramsTermsList :: ( HasNodeError err
                         , CmdM     env err m
@@ -160,7 +175,7 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
 
   let socialLists_Stemmed = addScoreStem groupParams (HashMap.keysSet allTerms) socialLists
   printDebug "socialLists_Stemmed" socialLists_Stemmed
-  let groupedWithList = toGroupedTree {- groupParams -} socialLists_Stemmed allTerms
+  let groupedWithList = toGroupedTree socialLists_Stemmed allTerms
       (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                                   $ view flc_scores groupedWithList
 
diff --git a/src/Gargantext/Core/Text/List/Group/WithStem.hs b/src/Gargantext/Core/Text/List/Group/WithStem.hs
index 1f2923c3..38245270 100644
--- a/src/Gargantext/Core/Text/List/Group/WithStem.hs
+++ b/src/Gargantext/Core/Text/List/Group/WithStem.hs
@@ -17,6 +17,7 @@ Portability : POSIX
 module Gargantext.Core.Text.List.Group.WithStem
   where
 
+import Control.Lens (makeLenses)
 import Data.HashMap.Strict (HashMap)
 import Data.HashSet (HashSet)
 import Data.Map (Map)
@@ -53,6 +54,7 @@ data StopSize = StopSize {unStopSize :: !Int}
 -- discussed. Main purpose of this is offering
 -- a first grouping option to user and get some
 -- enriched data to better learn and improve that algo
+-- | Lenses instances at the end of this file
 data GroupParams = GroupParams { unGroupParams_lang     :: !Lang
                                , unGroupParams_len      :: !Int
                                , unGroupParams_limit    :: !Int
@@ -124,3 +126,5 @@ toNgramsPatch children = NgramsPatch children' Patch.Keep
               $ PatchMap.fromList
               $ List.zip children (List.cycle [addPatch])
 
+-- | Instances
+makeLenses ''GroupParams
diff --git a/src/Gargantext/Core/Text/Terms.hs b/src/Gargantext/Core/Text/Terms.hs
index 5c6a9a67..7ed5ec85 100644
--- a/src/Gargantext/Core/Text/Terms.hs
+++ b/src/Gargantext/Core/Text/Terms.hs
@@ -151,12 +151,12 @@ insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId)
 insertExtractedNgrams ngs = do
   let (s, e) = List.partition isSimpleNgrams ngs
   m1 <- insertNgrams       (map unSimpleNgrams   s)
-  printDebug "others" m1
+  --printDebug "others" m1
   
   m2 <- insertNgramsPostag (map unEnrichedNgrams e)
-  printDebug "terms" m2
+  --printDebug "terms" m2
  
-  let result = HashMap.unions [m1, m2]
+  let result = HashMap.union m1 m2
   pure result
 
 isSimpleNgrams :: ExtractedNgrams -> Bool
diff --git a/src/Gargantext/Database/Action/Flow.hs b/src/Gargantext/Database/Action/Flow.hs
index 14fb8647..6b054a0c 100644
--- a/src/Gargantext/Database/Action/Flow.hs
+++ b/src/Gargantext/Database/Action/Flow.hs
@@ -68,7 +68,7 @@ import Gargantext.Core.Ext.IMT (toSchoolName)
 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
 import Gargantext.Core.Flow.Types
 import Gargantext.Core.Text
-import Gargantext.Core.Text.List.Group.WithStem (StopSize(..), GroupParams(..))
+import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
 import Gargantext.Core.Text.List (buildNgramsLists)
 import Gargantext.Core.Text.Terms
@@ -231,7 +231,11 @@ flowCorpusUser l user corpusName ctype ids = do
 
   -- User List Flow
   (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
-  ngs         <- buildNgramsLists user (GroupParams l 2 3 (StopSize 3)) userCorpusId masterCorpusId
+
+  -- let gp = (GroupParams l 2 3 (StopSize 3)) 
+  let gp = GroupWithPosTag l CoreNLP HashMap.empty 
+  ngs         <- buildNgramsLists gp user userCorpusId masterCorpusId
+
   _userListId <- flowList_DbRepo listId ngs
   _mastListId <- getOrMkList masterCorpusId masterUserId
   -- _ <- insertOccsUpdates userCorpusId mastListId
diff --git a/src/Gargantext/Database/Query/Table/NgramsPostag.hs b/src/Gargantext/Database/Query/Table/NgramsPostag.hs
index a51034a2..287dfaa2 100644
--- a/src/Gargantext/Database/Query/Table/NgramsPostag.hs
+++ b/src/Gargantext/Database/Query/Table/NgramsPostag.hs
@@ -16,7 +16,7 @@ Portability : POSIX
 module Gargantext.Database.Query.Table.NgramsPostag
     where
 
-import Control.Lens (view)
+import Control.Lens (view, (^.))
 import Data.HashMap.Strict (HashMap)
 import Data.Hashable (Hashable)
 import Data.Text (Text)
@@ -25,6 +25,7 @@ import Gargantext.Core.Types
 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
 import Gargantext.Database.Schema.Ngrams
 import Gargantext.Database.Schema.Prelude
+import Gargantext.Database.Query.Table.Ngrams
 import Gargantext.Database.Types
 import Gargantext.Prelude
 import qualified Data.HashMap.Strict        as HashMap
@@ -38,11 +39,10 @@ data NgramsPostag = NgramsPostag { _np_lang   :: Lang
                                  , _np_lem    :: Ngrams
                                  }
   deriving (Eq, Ord, Generic, Show)
-
 makeLenses ''NgramsPostag
-
 instance Hashable NgramsPostag
 
+
 type NgramsPostagInsert = ( Int
                           , Int
                           , Text
@@ -64,12 +64,25 @@ toInsert (NgramsPostag l a p form lem) =
   )
 
 insertNgramsPostag :: [NgramsPostag] -> Cmd err (HashMap Text NgramsId)
-insertNgramsPostag ns =
-  if List.null ns
+insertNgramsPostag xs =
+  if List.null xs
      then pure HashMap.empty
-     else HashMap.fromList
-         <$> map (\(Indexed t i) -> (t,i))
-         <$> insertNgramsPostag' (map toInsert ns)
+     else do
+        -- We do not store the lem if it equals to its self form
+       let
+          (ns, nps) =
+            List.partition (\np -> np ^. np_form . ngramsTerms
+                                /= np ^. np_lem  . ngramsTerms
+                           ) xs
+
+       ns' <- insertNgrams (map (view np_form) ns)
+
+       nps' <- HashMap.fromList
+           <$> map (\(Indexed t i) -> (t,i))
+           <$> insertNgramsPostag' (map toInsert ns)
+
+       pure $ HashMap.union ns' nps'
+
 
 insertNgramsPostag' :: [NgramsPostagInsert] -> Cmd err [Indexed Text Int]
 insertNgramsPostag' ns = runPGSQuery queryInsertNgramsPostag (PGS.Only $ Values fields ns)
@@ -119,13 +132,15 @@ queryInsertNgramsPostag = [sql|
     )
   ------------------------------------------------
   ------------------------------------------------
-  , ins_postag AS ( INSERT INTO ngrams_postag (lang_id, algo_id, postag, ngrams_id, lemm_id,score)
-    SELECT ir.lang_id, ir.algo_id, ir.postag, form.id, lem.id, 1
+  , ins_postag AS (
+    INSERT INTO ngrams_postag (lang_id, algo_id, postag, ngrams_id, lemm_id,score)
+    SELECT ir.lang_id, ir.algo_id, ir.postag, form.id, lem.id, count(*) as s
     FROM input_rows ir
       JOIN ins_form_ret  form ON form.terms = ir.form
       JOIN ins_lem_ret   lem  ON lem.terms  = ir.lem
-
-
+       GROUP BY ir.lang_id, ir.algo_id, ir.postag, form.id, lem.id    
+       ORDER BY s DESC
+       LIMIT 1
       ON CONFLICT (lang_id,algo_id,postag,ngrams_id,lemm_id)
         DO UPDATE SET score = ngrams_postag.score + 1
     )
@@ -135,9 +150,10 @@ SELECT terms,id FROM ins_form_ret
 
   |]
 
-
-selectLems :: [Ngrams] -> Cmd err [(Form, Lem)]
-selectLems ns = runPGSQuery querySelectLems (PGS.Only $ Values fields (map toRow ns))
+-- TODO add lang and postag algo
+-- TODO remove when form == lem in insert
+selectLems :: Lang -> PosTagAlgo -> [Ngrams] -> Cmd err [(Form, Lem)]
+selectLems l a ns = runPGSQuery querySelectLems (PGS.Only $ Values fields (map toRow ns))
   where
     fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
 
diff --git a/src/Gargantext/Database/Schema/Ngrams.hs b/src/Gargantext/Database/Schema/Ngrams.hs
index 38516b60..ac51e514 100644
--- a/src/Gargantext/Database/Schema/Ngrams.hs
+++ b/src/Gargantext/Database/Schema/Ngrams.hs
@@ -180,7 +180,8 @@ instance Functor NgramsT where
 
 -----------------------------------------------------------------------
 withMap :: HashMap Text NgramsId -> Text -> NgramsId
-withMap m n = maybe (panic "withMap: should not happen") identity (HashMap.lookup n m)
+withMap m n = maybe (panic $ "[G.D.S.Ngrams.withMap] Should not happen" <> (cs $ show n))
+                    identity (HashMap.lookup n m)
 
 indexNgramsT :: HashMap Text NgramsId -> NgramsT Ngrams -> NgramsT (Indexed Int Ngrams)
 indexNgramsT = fmap . indexNgramsWith . withMap
diff --git a/src/Gargantext/Prelude.hs b/src/Gargantext/Prelude.hs
index 87858f02..441bbf7a 100644
--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -16,21 +16,21 @@ Portability : POSIX
 module Gargantext.Prelude
   ( module Gargantext.Prelude
   , module Protolude
-  , headMay, lastMay
   , module GHC.Err.Located
   , module Text.Show
   , module Text.Read
-  , cs
   , module Data.Maybe
-  , round
-  , sortWith
   , module Prelude
   , MonadBase(..)
   , Typeable
+  , cs
+  , headMay, lastMay, sortWith
+  , round
   )
   where
 
 import Control.Monad.Base (MonadBase(..))
+import Data.Set (Set)
 import GHC.Exts (sortWith)
 import GHC.Err.Located (undefined)
 import GHC.Real (round)
@@ -71,15 +71,16 @@ import Prelude (Enum, Bounded, minBound, maxBound, putStrLn)
 -- TODO import functions optimized in Utils.Count
 -- import Protolude hiding (head, last, all, any, sum, product, length)
 -- import Gargantext.Utils.Count
-import qualified Data.List     as L hiding (head, sum)
-import qualified Control.Monad as M
-import qualified Data.Map      as M
 import Data.Map.Strict (insertWith)
-import qualified Data.Vector as V
+import Data.String.Conversions (cs)
 import Safe (headMay, lastMay, initMay, tailMay)
-import Text.Show (Show(), show)
 import Text.Read (Read())
-import Data.String.Conversions (cs)
+import Text.Show (Show(), show)
+import qualified Control.Monad as M
+import qualified Data.List     as L hiding (head, sum)
+import qualified Data.Map      as M
+import qualified Data.Set      as Set
+import qualified Data.Vector   as V
 
 
 printDebug :: (Show a, MonadBase IO m) => [Char] -> a -> m ()
@@ -338,3 +339,20 @@ instance Monoid Integer where
 
 instance Semigroup Integer where
   (<>) a b = a + b
+
+------------------------------------------------------------------------
+
+hasDuplicates :: Ord a => [a] -> Bool
+hasDuplicates = hasDuplicatesWith Set.empty
+
+hasDuplicatesWith :: Ord a => Set a -> [a] -> Bool
+hasDuplicatesWith _seen [] =
+    False -- base case: empty lists never contain duplicates
+hasDuplicatesWith  seen (x:xs) =
+    -- If we have seen the current item before, we can short-circuit; otherwise,
+    -- we'll add it the the set of previously seen items and process the rest of the
+    -- list against that.
+    x `Set.member` seen || hasDuplicatesWith (Set.insert x seen) xs
+
+
+
-- 
2.21.0