Commit 8f0b7cd4 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[WIP] ngrams lists and groups.

parent 740badb8
Pipeline #294 failed with stage
...@@ -64,8 +64,9 @@ import Gargantext.Text.List (buildNgramsLists) ...@@ -64,8 +64,9 @@ import Gargantext.Text.List (buildNgramsLists)
import Gargantext.Text.Terms (TermType(..)) import Gargantext.Text.Terms (TermType(..))
import Gargantext.Text.Terms (extractTerms) import Gargantext.Text.Terms (extractTerms)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt) import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import qualified Gargantext.Text.Parsers.GrandDebat as GD
import Servant (ServantErr) import Servant (ServantErr)
--import System.FilePath (FilePath) import System.FilePath (FilePath)
import qualified Data.Map as DM import qualified Data.Map as DM
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Gargantext.Database.Node.Document.Add as Doc (add) import qualified Gargantext.Database.Node.Document.Add as Doc (add)
...@@ -77,6 +78,18 @@ type FlowCmdM env err m = ...@@ -77,6 +78,18 @@ type FlowCmdM env err m =
, HasRepoVar env , HasRepoVar env
) )
flowCorpusDebat :: FlowCmdM env ServantErr m
=> Username -> CorpusName -> Int -> FilePath -> m CorpusId
flowCorpusDebat u n l fp = do
docs <- liftIO ( splitEvery 500
<$> take l
<$> GD.readFile fp
:: IO [[GD.GrandDebatReference ]]
)
flowCorpus u n (Multi FR) docs
{- {-
flowCorpus :: FlowCmdM env ServantErr m flowCorpus :: FlowCmdM env ServantErr m
=> Username -> CorpusName -> TermType Lang -> FileFormat -> FilePath -> m CorpusId => Username -> CorpusName -> TermType Lang -> FileFormat -> FilePath -> m CorpusId
...@@ -91,7 +104,7 @@ flowCorpus :: (FlowCmdM env ServantErr m, ToHyperdataDocument a) ...@@ -91,7 +104,7 @@ flowCorpus :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
=> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
flowCorpus u cn la docs = do flowCorpus u cn la docs = do
ids <- mapM ((insertMasterDocs la) . (map toHyperdataDocument)) docs ids <- mapM ((insertMasterDocs la) . (map toHyperdataDocument)) docs
flowCorpusUser u cn (concat ids) flowCorpusUser FR u cn (concat ids)
-- TODO query with complex query -- TODO query with complex query
...@@ -100,12 +113,12 @@ flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m ...@@ -100,12 +113,12 @@ flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
flowCorpusSearchInDatabase u q = do flowCorpusSearchInDatabase u q = do
(_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster ""
ids <- map fst <$> searchInDatabase cId (stemIt q) ids <- map fst <$> searchInDatabase cId (stemIt q)
flowCorpusUser u q ids flowCorpusUser FR u q ids
flowCorpusUser :: FlowCmdM env ServantErr m flowCorpusUser :: FlowCmdM env ServantErr m
=> Username -> CorpusName -> [NodeId] -> m CorpusId => Lang -> Username -> CorpusName -> [NodeId] -> m CorpusId
flowCorpusUser userName corpusName ids = do flowCorpusUser l userName corpusName ids = do
-- User Flow -- User Flow
(userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName
-- TODO: check if present already, ignore -- TODO: check if present already, ignore
...@@ -113,12 +126,12 @@ flowCorpusUser userName corpusName ids = do ...@@ -113,12 +126,12 @@ flowCorpusUser userName corpusName ids = do
-- User List Flow -- User List Flow
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
ngs <- buildNgramsLists userCorpusId masterCorpusId ngs <- buildNgramsLists l 2 3 userCorpusId masterCorpusId
userListId <- flowList userId userCorpusId ngs userListId <- flowList userId userCorpusId ngs
printDebug "userListId" userListId printDebug "userListId" userListId
-- User Graph Flow -- User Graph Flow
--_ <- mkGraph userCorpusId userId _ <- mkGraph userCorpusId userId
-- User Dashboard Flow -- User Dashboard Flow
-- _ <- mkDashboard userCorpusId userId -- _ <- mkDashboard userCorpusId userId
......
...@@ -44,13 +44,13 @@ import qualified Database.PostgreSQL.Simple as DPS ...@@ -44,13 +44,13 @@ import qualified Database.PostgreSQL.Simple as DPS
-- a first grouping option to user and get some -- a first grouping option to user and get some
-- enriched data to better learn and improve that algo -- enriched data to better learn and improve that algo
ngramsGroup :: Lang -> Int -> Int -> Text -> Text ngramsGroup :: Lang -> Int -> Int -> Text -> Text
ngramsGroup l m n = Text.intercalate " " ngramsGroup l _m _n = Text.intercalate " "
. map (stem l) . map (stem l)
. take n -- . take n
. List.sort . List.sort
. (List.filter (\t -> Text.length t > m)) -- . (List.filter (\t -> Text.length t > m))
. Text.splitOn " " . Text.splitOn " "
. Text.replace "-" " " . Text.replace "-" " "
sortTficf :: (Map Text (Double, Set Text)) sortTficf :: (Map Text (Double, Set Text))
......
...@@ -26,6 +26,7 @@ import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGro ...@@ -26,6 +26,7 @@ import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGro
import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Utils (Cmd) import Gargantext.Database.Utils (Cmd)
import Gargantext.Prelude import Gargantext.Prelude
--import Gargantext.Text.Terms (TermType(..))
import qualified Data.Char as Char import qualified Data.Char as Char
import qualified Data.List as List import qualified Data.List as List
import qualified Data.Map as Map import qualified Data.Map as Map
...@@ -33,10 +34,10 @@ import qualified Data.Set as Set ...@@ -33,10 +34,10 @@ import qualified Data.Set as Set
import qualified Data.Text as Text import qualified Data.Text as Text
-- | TODO improve grouping functions of Authors, Sources, Institutes.. -- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: UserCorpusId -> MasterCorpusId buildNgramsLists :: Lang -> Int -> Int -> UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsLists uCid mCid = do buildNgramsLists l n m uCid mCid = do
ngTerms <- buildNgramsTermsList uCid mCid ngTerms <- buildNgramsTermsList l n m uCid mCid
othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes] othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
pure $ Map.unions $ othersTerms <> [ngTerms] pure $ Map.unions $ othersTerms <> [ngTerms]
...@@ -53,13 +54,14 @@ buildNgramsOthersList uCid groupIt nt = do ...@@ -53,13 +54,14 @@ buildNgramsOthersList uCid groupIt nt = do
] ]
-- TODO remove hard coded parameters -- TODO remove hard coded parameters
buildNgramsTermsList :: UserCorpusId -> MasterCorpusId buildNgramsTermsList :: Lang -> Int -> Int -> UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList uCid mCid = do buildNgramsTermsList l n m uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 4 2) candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup l n m)
--printDebug "candidate" (length candidates) --printDebug "candidate" (length candidates)
let termList = toTermList (isStopTerm . fst) candidates --let termList = toTermList (isStopTerm . fst) candidates
let termList = toTermList ((\_ -> False) . fst) candidates
--printDebug "termlist" (length termList) --printDebug "termlist" (length termList)
let ngs = List.concat $ map toNgramsElement termList let ngs = List.concat $ map toNgramsElement termList
...@@ -98,14 +100,14 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs ...@@ -98,14 +100,14 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs
ys = take b $ drop a ns ys = take b $ drop a ns
zs = drop b $ drop a ns zs = drop b $ drop a ns
a = 50 a = 10
b = 1000 b = 400
isStopTerm :: Text -> Bool isStopTerm :: Text -> Bool
isStopTerm x = Text.length x < 3 isStopTerm x = Text.length x < 3
|| not (all Char.isAlpha (Text.unpack x')) || not (all Char.isAlpha (Text.unpack x'))
where where
x' = foldl (\t -> Text.replace t "") x' = foldl (\t -> Text.replace t "a")
x x
["-"," ","/","(",")"] ["-"," ","/","(",")"]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment