Commit 8f0b7cd4 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[WIP] ngrams lists and groups.

parent 740badb8
Pipeline #294 failed with stage
......@@ -64,8 +64,9 @@ import Gargantext.Text.List (buildNgramsLists)
import Gargantext.Text.Terms (TermType(..))
import Gargantext.Text.Terms (extractTerms)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import qualified Gargantext.Text.Parsers.GrandDebat as GD
import Servant (ServantErr)
--import System.FilePath (FilePath)
import System.FilePath (FilePath)
import qualified Data.Map as DM
import qualified Data.Text as Text
import qualified Gargantext.Database.Node.Document.Add as Doc (add)
......@@ -77,6 +78,18 @@ type FlowCmdM env err m =
, HasRepoVar env
)
flowCorpusDebat :: FlowCmdM env ServantErr m
=> Username -> CorpusName -> Int -> FilePath -> m CorpusId
flowCorpusDebat u n l fp = do
docs <- liftIO ( splitEvery 500
<$> take l
<$> GD.readFile fp
:: IO [[GD.GrandDebatReference ]]
)
flowCorpus u n (Multi FR) docs
{-
flowCorpus :: FlowCmdM env ServantErr m
=> Username -> CorpusName -> TermType Lang -> FileFormat -> FilePath -> m CorpusId
......@@ -91,7 +104,7 @@ flowCorpus :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
=> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
flowCorpus u cn la docs = do
ids <- mapM ((insertMasterDocs la) . (map toHyperdataDocument)) docs
flowCorpusUser u cn (concat ids)
flowCorpusUser FR u cn (concat ids)
-- TODO query with complex query
......@@ -100,12 +113,12 @@ flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
flowCorpusSearchInDatabase u q = do
(_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster ""
ids <- map fst <$> searchInDatabase cId (stemIt q)
flowCorpusUser u q ids
flowCorpusUser FR u q ids
flowCorpusUser :: FlowCmdM env ServantErr m
=> Username -> CorpusName -> [NodeId] -> m CorpusId
flowCorpusUser userName corpusName ids = do
=> Lang -> Username -> CorpusName -> [NodeId] -> m CorpusId
flowCorpusUser l userName corpusName ids = do
-- User Flow
(userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName
-- TODO: check if present already, ignore
......@@ -113,12 +126,12 @@ flowCorpusUser userName corpusName ids = do
-- User List Flow
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
ngs <- buildNgramsLists userCorpusId masterCorpusId
ngs <- buildNgramsLists l 2 3 userCorpusId masterCorpusId
userListId <- flowList userId userCorpusId ngs
printDebug "userListId" userListId
-- User Graph Flow
--_ <- mkGraph userCorpusId userId
_ <- mkGraph userCorpusId userId
-- User Dashboard Flow
-- _ <- mkDashboard userCorpusId userId
......
......@@ -44,13 +44,13 @@ import qualified Database.PostgreSQL.Simple as DPS
-- a first grouping option to user and get some
-- enriched data to better learn and improve that algo
ngramsGroup :: Lang -> Int -> Int -> Text -> Text
ngramsGroup l m n = Text.intercalate " "
. map (stem l)
. take n
. List.sort
. (List.filter (\t -> Text.length t > m))
. Text.splitOn " "
. Text.replace "-" " "
ngramsGroup l _m _n = Text.intercalate " "
. map (stem l)
-- . take n
. List.sort
-- . (List.filter (\t -> Text.length t > m))
. Text.splitOn " "
. Text.replace "-" " "
sortTficf :: (Map Text (Double, Set Text))
......
......@@ -26,6 +26,7 @@ import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGro
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Utils (Cmd)
import Gargantext.Prelude
--import Gargantext.Text.Terms (TermType(..))
import qualified Data.Char as Char
import qualified Data.List as List
import qualified Data.Map as Map
......@@ -33,10 +34,10 @@ import qualified Data.Set as Set
import qualified Data.Text as Text
-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: UserCorpusId -> MasterCorpusId
buildNgramsLists :: Lang -> Int -> Int -> UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsLists uCid mCid = do
ngTerms <- buildNgramsTermsList uCid mCid
buildNgramsLists l n m uCid mCid = do
ngTerms <- buildNgramsTermsList l n m uCid mCid
othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
pure $ Map.unions $ othersTerms <> [ngTerms]
......@@ -53,13 +54,14 @@ buildNgramsOthersList uCid groupIt nt = do
]
-- TODO remove hard coded parameters
buildNgramsTermsList :: UserCorpusId -> MasterCorpusId
buildNgramsTermsList :: Lang -> Int -> Int -> UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 4 2)
buildNgramsTermsList l n m uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup l n m)
--printDebug "candidate" (length candidates)
let termList = toTermList (isStopTerm . fst) candidates
--let termList = toTermList (isStopTerm . fst) candidates
let termList = toTermList ((\_ -> False) . fst) candidates
--printDebug "termlist" (length termList)
let ngs = List.concat $ map toNgramsElement termList
......@@ -98,14 +100,14 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs
ys = take b $ drop a ns
zs = drop b $ drop a ns
a = 50
b = 1000
a = 10
b = 400
isStopTerm :: Text -> Bool
isStopTerm x = Text.length x < 3
|| not (all Char.isAlpha (Text.unpack x'))
where
x' = foldl (\t -> Text.replace t "")
x' = foldl (\t -> Text.replace t "a")
x
["-"," ","/","(",")"]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment