Commit c898b780 authored by Alexandre Delanoë

[FLOW][LIST] NgramsTerms typed.

parent c6b1adf0
@@ -24,20 +24,17 @@ Portability : POSIX
 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
   where
-import Debug.Trace (trace)
---import Control.Lens (view)
 import Control.Monad (mapM_)
 import Control.Monad.IO.Class (liftIO)
---import Gargantext.Core.Types
 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
-import Data.Map (Map, lookup, fromListWith, toList)
+import Data.Map (Map, lookup, toList)
 import Data.Maybe (Maybe(..), catMaybes)
 import Data.Monoid
 import Data.Text (Text, splitOn, intercalate)
 import qualified Data.Text as Text
 import Data.List (concat)
 import GHC.Show (Show)
-import Gargantext.Core.Types (NodePoly(..), ListType(..), Terms(..))
+import Gargantext.Core.Types (NodePoly(..), Terms(..))
 import Gargantext.Core.Types.Individu (Username)
 import Gargantext.Core.Types.Main
 import Gargantext.Core (Lang(..))
@@ -47,14 +44,13 @@ import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
 import Gargantext.Text.Terms (extractTerms)
 --import Gargantext.Text.Metrics.TFICF (Tficf(..))
 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
-import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup)
 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
 import Gargantext.Database.Root (getRoot)
 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
 -- import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
 --import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
-import Gargantext.Database.Metrics.Count (getNgramsElementsWithParentNodeId)
+--import Gargantext.Database.Metrics.Count (getNgramsElementsWithParentNodeId)
 import Gargantext.Database.Schema.User (getUser, UserLight(..))
 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
 import Gargantext.Database.Utils (Cmd, CmdM)
@@ -62,11 +58,12 @@ import Gargantext.Text.Terms (TermType(..))
 import Gargantext.Ext.IMT (toSchoolName)
 --import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
 import Gargantext.Prelude
+import Gargantext.Text.List
 import Gargantext.Text.Parsers (parseDocs, FileFormat)
 import System.FilePath (FilePath)
 import Gargantext.API.Ngrams (HasRepoVar)
 import Servant (ServantErr)
-import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, putListNgrams, RepoCmdM)
+import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
 import qualified Data.Map as DM
@@ -103,19 +100,12 @@ flowCorpus userName ff fp corpusName = do
   -- User List Flow
   (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
   -- /!\ this extract NgramsTerms Only
-  _ngs <- toTermList (isStopTerm . fst) <$> sortTficf
-              <$> getTficf' userCorpusId masterCorpusId (ngramsGroup EN 2)
-  --printDebug "tficf size ngs" (take 100 $ ngs)
+  ngs <- buildNgramsList userCorpusId masterCorpusId
+  --printDebug "ngs" ngs
   -- TODO getNgramsElement of NgramsType...
-  ngs <- getNgramsElementsWithParentNodeId masterCorpusId
-  printDebug "getNgramsElementsWithParentNodeId size ngs" (length ngs)
-  -- TEMP fix
-  let masterUserId = 2
-  _masterListId <- flowList masterUserId masterCorpusId ngs
-  _userListId <- flowListUser userId userCorpusId ngs 100
+  --ngs <- getNgramsElementsWithParentNodeId masterCorpusId
+  userListId <- flowList userId userCorpusId ngs
+  printDebug "userListId" userListId

   -- User Graph Flow
   _ <- mkGraph userCorpusId userId
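Note on the hunk above: flowList now receives the typed result of buildNgramsList, a Map NgramsType [NgramsElement], instead of an untyped term list. A minimal sketch of that shape, built with the mkNgramsElement/mSetFromList helpers this commit uses in Gargantext.Text.List; the term strings below are invented for illustration and are not part of the commit:

{-# LANGUAGE OverloadedStrings #-}
import Data.Map (Map)
import qualified Data.Map as Map
import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, mSetFromList)
import Gargantext.Core.Types (ListType(..))
import Gargantext.Database.Schema.Ngrams (NgramsType(..))

-- Example value only; terms and groupings are made up.
exampleNgs :: Map NgramsType [NgramsElement]
exampleNgs = Map.fromList
  [ ( NgramsTerms
    , [ mkNgramsElement "brain imaging" GraphTerm
          (Just "brain imaging")                          -- root form (here, the term itself)
          (mSetFromList ["brain image", "brain images"])  -- grouped variants
      ]
    )
  ]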
@@ -288,6 +278,9 @@ documentIdWithNgrams f = mapM toDocumentIdWithNgrams
       e <- f $ documentData d
       pure $ DocumentIdWithNgrams d e

+
+-- FLOW LIST
+
 -- | TODO check optimization
 mapNodeIdNgrams :: [DocumentIdWithNgrams]
                 -> Map Ngrams (Map NgramsType (Map NodeId Int))
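Only the type signature of mapNodeIdNgrams is visible in this hunk. As a reading aid for the nested structure it returns, one conventional way to merge such per-document maps is a stack of unionWith calls; this is an illustrative sketch, not the body of mapNodeIdNgrams, and mergeCounts is a hypothetical name:

import Data.Map (Map)
import qualified Data.Map as DM

-- Merge per-document counts into Map ngrams (Map ngramsType (Map nodeId Int)),
-- summing counts when the same (ngrams, type, node) key occurs more than once.
mergeCounts :: (Ord ngrams, Ord ngramsType, Ord nodeId)
            => [Map ngrams (Map ngramsType (Map nodeId Int))]
            -> Map ngrams (Map ngramsType (Map nodeId Int))
mergeCounts = DM.unionsWith (DM.unionWith (DM.unionWith (+)))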
@@ -316,70 +309,8 @@ flowList uId cId ngs = do
   flowListBase lId ngs
   pure lId

-flowListUser :: FlowCmdM env err m
-             => UserId -> CorpusId
-             -> Map NgramsType [NgramsElement]
-             -> Int
-             -> m ListId
-flowListUser uId cId ngsM _n = do
-  lId <- getOrMkList cId uId
-  let ngs =
-        [ "test" <> Text.pack [x,y]
-        | x <- ['A'..'Z']
-        , y <- ['A'..'Z']
-        ]
-  trace ("flowListBase" <> show lId) flowListBase lId ngsM
-  putListNgrams lId NgramsTerms $
-    [ mkNgramsElement ng GraphTerm Nothing mempty
-    | ng <- ngs
-    ]
-  pure lId
-
-------------------------------------------------------------------------
-ngrams2list :: Map NgramsIndexed (Map NgramsType a)
-            -> [(ListType, (NgramsType, NgramsIndexed))]
-ngrams2list m =
-  [ (CandidateTerm, (t, ng))
-  | (ng, tm) <- DM.toList m
-  , t <- DM.keys tm
-  ]
-
-ngrams2list' :: Map NgramsIndexed (Map NgramsType a)
-             -> Map NgramsType [NgramsElement]
-ngrams2list' m = fromListWith (<>)
-  [ (t, [mkNgramsElement (_ngramsTerms $ _ngrams ng) CandidateTerm Nothing mempty])
-  | (ng, tm) <- DM.toList m
-  , t <- DM.keys tm
-  ]
-
 ------------------------------------------------------------------------
-toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
-toTermList stop ns = map (toTermList' stop CandidateTerm) xs
-                  <> map (toTermList' stop GraphTerm)     ys
-                  <> map (toTermList' stop CandidateTerm) zs
-  where
-    toTermList' stop' l n = case stop' n of
-      True  -> (StopTerm, n)
-      False -> (l, n)
-    -- TODO use % of size of list
-    -- TODO user ML
-    xs = take a ns
-    ys = take b $ drop a xs
-    zs = drop b ys
-    a = 100
-    b = 1000
-
-isStopTerm :: Text -> Bool
-isStopTerm x = Text.length x < 3
 ------------------------------------------------------------------------
 {-
@@ -14,50 +14,61 @@ commentary with @some markup@.
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RankNTypes        #-}

 module Gargantext.Text.List
   where

+import Data.Map (Map)
 import Data.Text (Text)
-import qualified Data.Text as DT
+import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, mSetFromList)
+import Gargantext.Core (Lang(..))
+import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
+import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup)
+import Gargantext.Database.Schema.Ngrams (NgramsType(..))
+import Gargantext.Database.Utils (Cmd)
 import Gargantext.Prelude
+import qualified Data.Map as Map
+import qualified Data.Set as Set
+import qualified Data.Text as Text
+
+buildNgramsList :: UserCorpusId -> MasterCorpusId -> Cmd err (Map NgramsType [NgramsElement])
+buildNgramsList uCid mCid = do
+  candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 2)
+  printDebug "candidate" (length candidates)
+  let termList = toTermList (isStopTerm . fst) candidates
+  printDebug "termlist" (length termList)
+  let ngs = map (\(lt, (stm, (_score, setext)))
+                   -> mkNgramsElement stm lt
+                        (Just stm)
+                        (mSetFromList $ Set.toList setext)
+                ) termList
+  pure $ Map.fromList [(NgramsTerms, ngs)]
+
+toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
+toTermList stop ns = map (toTermList' stop CandidateTerm) xs
+                  <> map (toTermList' stop GraphTerm)     ys
+                  <> map (toTermList' stop CandidateTerm) zs
+  where
+    toTermList' stop' l n = case stop' n of
+      True  -> (StopTerm, n)
+      False -> (l, n)
+    -- TODO use % of size of list
+    -- TODO user ML
+    xs = take a ns
+    ys = take b $ drop a ns
+    zs = drop b $ drop a ns
+    a = 100
+    b = 1000
+
--- | TODO normalize text
+isStopTerm :: Text -> Bool
+isStopTerm x = Text.length x < 3
-
--- | TODO Order the seperators in probability of apparition
-separators :: [Text]
-separators = [" ", ",", ".", "?", "!", "\""]
-
-isIn :: Text -> Text -> Bool
-isIn term context = any (\x -> DT.isInfixOf x context)
-                  $ map (\sep -> term <> sep) separators
-------------------------------------------------------------------------
---graph :: [Ngrams] -> [Ngrams]
---graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs
---
---candidates :: [Ngrams] -> [Ngrams]
---candidates ngs = filter (\ng -> _ngramsListName ng == Just Candidate) ngs
---
---stop :: [Ngrams] -> [Ngrams]
---stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
-------------------------------------------------------------------------
--- | Attoparsec solution to index test
---import Data.Attoparsec.ByteString (Parser, parseOnly, try, string
---                                  , takeTill, take
---                                  , manyTill, many1)
---import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
---import Data.ByteString (ByteString, concat)
---import Data.ByteString.Char8 (pack)
---import Control.Applicative
-
--- | Attoparsec version
---indexParser :: (ByteString -> b) -> ByteString -> Parser b
---indexParser form2label x = do
---  _ <- manyTill anyChar (string x)
---  pure $ form2label x
-
---doIndex :: Applicative f => ByteString -> ByteString -> f (Either String [ByteString]
---doIndex f x txt = pure $ parseOnly (many $ indexParser f x) txt
-------------------------------------------------------------------------
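To make the new list-building policy concrete, here is a standalone sketch of the bucketing performed by toTermList and isStopTerm. The thresholds a = 100 and b = 1000 are shrunk so the behaviour is visible on a short input; names ending in Demo are hypothetical, and the local ListType stands in for the one imported from Gargantext.Core.Types:

{-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text)
import qualified Data.Text as Text

data ListType = CandidateTerm | GraphTerm | StopTerm deriving (Show)

-- Same bucketing as toTermList: first a terms are CandidateTerm, the next b
-- are GraphTerm, the rest CandidateTerm; the stop predicate overrides to StopTerm.
toTermListDemo :: (a -> Bool) -> [a] -> [(ListType, a)]
toTermListDemo stop ns = map (label CandidateTerm) xs
                      <> map (label GraphTerm)     ys
                      <> map (label CandidateTerm) zs
  where
    label l n = if stop n then (StopTerm, n) else (l, n)
    xs = take a ns
    ys = take b $ drop a ns
    zs = drop b $ drop a ns
    a  = 2      -- the commit uses 100
    b  = 3      -- the commit uses 1000

isStopDemo :: Text -> Bool
isStopDemo x = Text.length x < 3   -- same rule as isStopTerm

main :: IO ()
main = mapM_ print $ toTermListDemo isStopDemo
         ["ai", "graph", "corpus", "ngram", "tficf", "list", "db"]
-- The first 2 terms fall in the Candidate bucket, the next 3 in the Graph
-- bucket, the rest back in Candidate; "ai" and "db" (shorter than 3
-- characters) are overridden to StopTerm.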