Commit c67c2137 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FLOW][NGRAMS] Lists adding others Ngrams.

parent 9be9b7e0
...@@ -24,48 +24,48 @@ Portability : POSIX ...@@ -24,48 +24,48 @@ Portability : POSIX
module Gargantext.Database.Flow -- (flowDatabase, ngrams2list) module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
where where
--import Gargantext.Database.Metrics.Count (getNgramsElementsWithParentNodeId)
--import Gargantext.Database.Metrics.TFICF (getTficf)
--import Gargantext.Database.Node.Contact (HyperdataContact(..))
--import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
--import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
--import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
--import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
--import Gargantext.Text.Metrics.TFICF (Tficf(..))
import Control.Monad (mapM_) import Control.Monad (mapM_)
import Control.Monad.IO.Class (liftIO) import Control.Monad.IO.Class (liftIO)
--import Gargantext.Database.Node.Contact (HyperdataContact(..)) import Data.List (concat)
import Data.Map (Map, lookup, toList) import Data.Map (Map, lookup, toList)
import Data.Maybe (Maybe(..), catMaybes) import Data.Maybe (Maybe(..), catMaybes)
import Data.Monoid import Data.Monoid
import Data.Text (Text, splitOn, intercalate) import Data.Text (Text, splitOn, intercalate)
import qualified Data.Text as Text
import Data.List (concat)
import GHC.Show (Show) import GHC.Show (Show)
import Gargantext.API.Ngrams (HasRepoVar)
import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (NodePoly(..), Terms(..)) import Gargantext.Core.Types (NodePoly(..), Terms(..))
import Gargantext.Core.Types.Individu (Username) import Gargantext.Core.Types.Individu (Username)
import Gargantext.Core.Types.Main import Gargantext.Core.Types.Main
import Gargantext.Core (Lang(..))
import Gargantext.Database.Config (userMaster, corpusMasterName) import Gargantext.Database.Config (userMaster, corpusMasterName)
import Gargantext.Database.Flow.Utils (insertToNodeNgrams) import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
--import Gargantext.Database.Metrics.TFICF (getTficf)
import Gargantext.Text.Terms (extractTerms)
--import Gargantext.Text.Metrics.TFICF (Tficf(..))
import qualified Gargantext.Database.Node.Document.Add as Doc (add)
import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..)) import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Root (getRoot) import Gargantext.Database.Root (getRoot)
import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId) import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError) import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
-- import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
--import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
--import Gargantext.Database.Metrics.Count (getNgramsElementsWithParentNodeId)
import Gargantext.Database.Schema.User (getUser, UserLight(..)) import Gargantext.Database.Schema.User (getUser, UserLight(..))
import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId) import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Utils (Cmd, CmdM) import Gargantext.Database.Utils (Cmd, CmdM)
import Gargantext.Text.Terms (TermType(..))
import Gargantext.Ext.IMT (toSchoolName) import Gargantext.Ext.IMT (toSchoolName)
--import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Text.List import Gargantext.Text.List
import Gargantext.Text.Parsers (parseDocs, FileFormat) import Gargantext.Text.Parsers (parseDocs, FileFormat)
import System.FilePath (FilePath) import Gargantext.Text.Terms (TermType(..))
import Gargantext.API.Ngrams (HasRepoVar) import Gargantext.Text.Terms (extractTerms)
import Servant (ServantErr) import Servant (ServantErr)
import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM) import System.FilePath (FilePath)
--import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
import qualified Data.Map as DM import qualified Data.Map as DM
import qualified Data.Text as Text
import qualified Gargantext.Database.Node.Document.Add as Doc (add)
type FlowCmdM env err m = type FlowCmdM env err m =
( CmdM env err m ( CmdM env err m
...@@ -86,10 +86,9 @@ flowCorpus userName ff fp corpusName = do ...@@ -86,10 +86,9 @@ flowCorpus userName ff fp corpusName = do
-- TODO uniformize language of corpus -- TODO uniformize language of corpus
-- TODO ChunkAlong is not the right function here -- TODO ChunkAlong is not the right function here
-- chunkAlong 10 10 [1..15] == [1..10] -- chunkAlong 10 10 [1..15] == [1..10]
-- BUG: what about the rest (divMod 15 10)? -- BUG: what about the rest of (divMod 15 10)?
-- but if temporary enables big corpora insert for tests
-- TODO: chunkAlongNoRest or chunkAlongWithRest -- TODO: chunkAlongNoRest or chunkAlongWithRest
-- default: NoRest -- default behavior: NoRest
ids <- mapM insertMasterDocs $ chunkAlong 10000 10000 docs ids <- mapM insertMasterDocs $ chunkAlong 10000 10000 docs
-- User Flow -- User Flow
...@@ -100,9 +99,9 @@ flowCorpus userName ff fp corpusName = do ...@@ -100,9 +99,9 @@ flowCorpus userName ff fp corpusName = do
-- User List Flow -- User List Flow
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
-- /!\ this extract NgramsTerms Only -- /!\ this extract NgramsTerms Only
ngs <- buildNgramsList userCorpusId masterCorpusId ngs <- buildNgramsLists userCorpusId masterCorpusId
--printDebug "ngs" ngs --printDebug "ngs" ngs
-- TODO getNgramsElement of NgramsType... --TODO getNgramsElement of NgramsType...
--ngs <- getNgramsElementsWithParentNodeId masterCorpusId --ngs <- getNgramsElementsWithParentNodeId masterCorpusId
userListId <- flowList userId userCorpusId ngs userListId <- flowList userId userCorpusId ngs
printDebug "userListId" userListId printDebug "userListId" userListId
...@@ -125,7 +124,6 @@ insertMasterDocs hs = do ...@@ -125,7 +124,6 @@ insertMasterDocs hs = do
let hyperdataDocuments' = map (\h -> ToDbDocument h) hs let hyperdataDocuments' = map (\h -> ToDbDocument h) hs
-- TODO put in State Monad
(masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName
ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments' ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
...@@ -134,22 +132,13 @@ insertMasterDocs hs = do ...@@ -134,22 +132,13 @@ insertMasterDocs hs = do
let maps = mapNodeIdNgrams docsWithNgrams let maps = mapNodeIdNgrams docsWithNgrams
--printDebug "maps" (maps)
terms2id <- insertNgrams $ DM.keys maps terms2id <- insertNgrams $ DM.keys maps
let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
--printDebug "inserted ngrams" indexedNgrams
_ <- insertToNodeNgrams indexedNgrams _ <- insertToNodeNgrams indexedNgrams
pure $ map reId ids pure $ map reId ids
getUserCorpusNgrams :: FlowCmdM env ServantErr m
=> CorpusId -> m [Ngrams]
getUserCorpusNgrams = undefined
type CorpusName = Text type CorpusName = Text
getOrMkRootWithCorpus :: HasNodeError err getOrMkRootWithCorpus :: HasNodeError err
...@@ -188,7 +177,6 @@ getOrMkRootWithCorpus username cName = do ...@@ -188,7 +177,6 @@ getOrMkRootWithCorpus username cName = do
------------------------------------------------------------------------ ------------------------------------------------------------------------
toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d)) toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
...@@ -219,7 +207,6 @@ data DocumentIdWithNgrams = DocumentIdWithNgrams ...@@ -219,7 +207,6 @@ data DocumentIdWithNgrams = DocumentIdWithNgrams
, document_ngrams :: !(Map Ngrams (Map NgramsType Int)) , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
} deriving (Show) } deriving (Show)
-- TODO group terms
extractNgramsT :: HasNodeError err extractNgramsT :: HasNodeError err
=> HyperdataDocument => HyperdataDocument
...@@ -297,7 +284,6 @@ flowListBase :: FlowCmdM env err m ...@@ -297,7 +284,6 @@ flowListBase :: FlowCmdM env err m
=> ListId -> Map NgramsType [NgramsElement] => ListId -> Map NgramsType [NgramsElement]
-> m () -> m ()
flowListBase lId ngs = do flowListBase lId ngs = do
-- compute Candidate / Map
mapM_ (\(typeList, ngElmts) -> putListNgrams lId typeList ngElmts) $ toList ngs mapM_ (\(typeList, ngElmts) -> putListNgrams lId typeList ngElmts) $ toList ngs
flowList :: FlowCmdM env err m => UserId -> CorpusId flowList :: FlowCmdM env err m => UserId -> CorpusId
......
...@@ -245,7 +245,7 @@ countCorpusDocuments r cId = maybe 0 identity ...@@ -245,7 +245,7 @@ countCorpusDocuments r cId = maybe 0 identity
<$> runQuery' r cId <$> runQuery' r cId
where where
runQuery' RoleUser cId' = runPGSQuery runQuery' RoleUser cId' = runPGSQuery
"SELECT count(*) from nodes_nodes nn WHERE nn.node1_id = ?" "SELECT count(*) from nodes_nodes nn WHERE nn.node1_id = ? AND nn.delete = False"
(PGS.Only cId') (PGS.Only cId')
runQuery' RoleMaster cId' = runPGSQuery runQuery' RoleMaster cId' = runPGSQuery
"SELECT count(*) from nodes n WHERE n.parent_id = ? AND n.typename = ?" "SELECT count(*) from nodes n WHERE n.parent_id = ? AND n.typename = ?"
......
...@@ -60,7 +60,7 @@ sortTficf = List.sortOn (fst . snd) . toList ...@@ -60,7 +60,7 @@ sortTficf = List.sortOn (fst . snd) . toList
getTficf' :: UserCorpusId -> MasterCorpusId -> (Text -> Text) getTficf' :: UserCorpusId -> MasterCorpusId -> (Text -> Text)
-> Cmd err (Map Text (Double, Set Text)) -> Cmd err (Map Text (Double, Set Text))
getTficf' u m f = do getTficf' u m f = do
u' <- getNodesByNgramsUser u u' <- getNodesByNgramsUser u NgramsTerms
m' <- getNodesByNgramsMaster u m m' <- getNodesByNgramsMaster u m
pure $ toTficfData (countNodesByNgramsWith f u') pure $ toTficfData (countNodesByNgramsWith f u')
...@@ -104,16 +104,16 @@ groupNodesByNgramsWith f m = ...@@ -104,16 +104,16 @@ groupNodesByNgramsWith f m =
$ toList m $ toList m
------------------------------------------------------------------------ ------------------------------------------------------------------------
getNodesByNgramsUser :: CorpusId -> Cmd err (Map Text (Set NodeId)) getNodesByNgramsUser :: CorpusId -> NgramsType -> Cmd err (Map Text (Set NodeId))
getNodesByNgramsUser cId = fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n)) getNodesByNgramsUser cId nt = fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
<$> selectNgramsByNodeUser cId <$> selectNgramsByNodeUser cId nt
selectNgramsByNodeUser :: CorpusId -> Cmd err [(NodeId, Text)] selectNgramsByNodeUser :: CorpusId -> NgramsType -> Cmd err [(NodeId, Text)]
selectNgramsByNodeUser cId = runPGSQuery selectNgramsByNodeUser cId nt = runPGSQuery
queryNgramsByNodeUser queryNgramsByNodeUser
( cId ( cId
, nodeTypeId NodeDocument , nodeTypeId NodeDocument
, ngramsTypeId NgramsTerms , ngramsTypeId nt
) )
queryNgramsByNodeUser :: DPS.Query queryNgramsByNodeUser :: DPS.Query
......
...@@ -26,7 +26,7 @@ import Data.Text (Text) ...@@ -26,7 +26,7 @@ import Data.Text (Text)
import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, mSetFromList) import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, mSetFromList)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId) import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup) import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Utils (Cmd) import Gargantext.Database.Utils (Cmd)
import Gargantext.Prelude import Gargantext.Prelude
...@@ -35,9 +35,31 @@ import qualified Data.Set as Set ...@@ -35,9 +35,31 @@ import qualified Data.Set as Set
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Data.List as List import qualified Data.List as List
buildNgramsList :: UserCorpusId -> MasterCorpusId -- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsList uCid mCid = do buildNgramsLists uCid mCid = do
ngTerms <- buildNgramsTermsList uCid mCid
othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
pure $ Map.unions $ othersTerms <> [ngTerms]
buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsOthersList uCid groupIt nt = do
ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
pure $ Map.fromList [(nt, [ mkNgramsElement t CandidateTerm Nothing (mSetFromList [])
| (t,_ns) <- Map.toList ngs
]
)
]
buildNgramsTermsList :: UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 2) candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 2)
--printDebug "candidate" (length candidates) --printDebug "candidate" (length candidates)
...@@ -65,6 +87,9 @@ toNgramsElement (listType, (_stem, (_score, setNgrams))) = ...@@ -65,6 +87,9 @@ toNgramsElement (listType, (_stem, (_score, setNgrams))) =
) children ) children
toTermList :: (a -> Bool) -> [a] -> [(ListType, a)] toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
toTermList stop ns = map (toTermList' stop CandidateTerm) xs toTermList stop ns = map (toTermList' stop CandidateTerm) xs
<> map (toTermList' stop GraphTerm) ys <> map (toTermList' stop GraphTerm) ys
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment