Commit 020e78de authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FLOW.GRAPH.METRICS] spec gen inc exc.

parent 07305554
...@@ -282,8 +282,8 @@ graphAPI nId = do ...@@ -282,8 +282,8 @@ graphAPI nId = do
] ]
-- (map (\n -> LegendField n "#FFFFFF" (pack $ show n)) [1..10]) -- (map (\n -> LegendField n "#FFFFFF" (pack $ show n)) [1..10])
let cId = maybe (panic "no parentId") identity $ _node_parentId nodeGraph let cId = maybe (panic "no parentId") identity $ _node_parentId nodeGraph
lId <- defaultList cId
lId <- defaultList cId
ngs <- filterListWithRoot GraphTerm <$> mapTermListRoot [lId] NgramsTerms ngs <- filterListWithRoot GraphTerm <$> mapTermListRoot [lId] NgramsTerms
myCooc <- Map.filter (>1) <$> getCoocByNgrams myCooc <- Map.filter (>1) <$> getCoocByNgrams
...@@ -296,7 +296,7 @@ graphAPI nId = do ...@@ -296,7 +296,7 @@ graphAPI nId = do
instance HasNodeError ServantErr where instance HasNodeError ServantErr where
_NodeError = prism' mk (const Nothing) -- $ panic "HasNodeError ServantErr: not a prism") _NodeError = prism' mk (const Nothing) -- $ panic "HasNodeError ServantErr: not a prism")
where where
e = "NodeError: " e = "Gargantext.NodeError: "
mk NoListFound = err404 { errBody = e <> "No list found" } mk NoListFound = err404 { errBody = e <> "No list found" }
mk NoRootFound = err404 { errBody = e <> "No Root found" } mk NoRootFound = err404 { errBody = e <> "No Root found" }
mk NoCorpusFound = err404 { errBody = e <> "No Corpus found" } mk NoCorpusFound = err404 { errBody = e <> "No Corpus found" }
...@@ -333,18 +333,20 @@ rename nId (RenameNode name') = U.update (U.Rename nId name') ...@@ -333,18 +333,20 @@ rename nId (RenameNode name') = U.update (U.Rename nId name')
getTable :: NodeId -> Maybe TabType getTable :: NodeId -> Maybe TabType
-> Maybe Offset -> Maybe Limit -> Maybe Offset -> Maybe Limit
-> Maybe OrderBy -> Cmd err [FacetDoc] -> Maybe OrderBy -> Cmd err [FacetDoc]
getTable cId ft o l order = case ft of getTable cId ft o l order =
(Just Docs) -> runViewDocuments cId False o l order case ft of
(Just Trash) -> runViewDocuments cId True o l order (Just Docs) -> runViewDocuments cId False o l order
_ -> panic "not implemented" (Just Trash) -> runViewDocuments cId True o l order
_ -> panic "not implemented"
getPairing :: ContactId -> Maybe TabType getPairing :: ContactId -> Maybe TabType
-> Maybe Offset -> Maybe Limit -> Maybe Offset -> Maybe Limit
-> Maybe OrderBy -> Cmd err [FacetDoc] -> Maybe OrderBy -> Cmd err [FacetDoc]
getPairing cId ft o l order = case ft of getPairing cId ft o l order =
(Just Docs) -> runViewAuthorsDoc cId False o l order case ft of
(Just Trash) -> runViewAuthorsDoc cId True o l order (Just Docs) -> runViewAuthorsDoc cId False o l order
_ -> panic "not implemented" (Just Trash) -> runViewAuthorsDoc cId True o l order
_ -> panic "not implemented"
getChart :: NodeId -> Maybe UTCTime -> Maybe UTCTime getChart :: NodeId -> Maybe UTCTime -> Maybe UTCTime
......
...@@ -83,6 +83,7 @@ flowCorpus u cn ff fp = do ...@@ -83,6 +83,7 @@ flowCorpus u cn ff fp = do
ids <- flowCorpusMaster ff fp ids <- flowCorpusMaster ff fp
flowCorpusUser u cn ids flowCorpusUser u cn ids
-- TODO query with complex query
flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
=> Username -> Text -> m CorpusId => Username -> Text -> m CorpusId
flowCorpusSearchInDatabase u q = do flowCorpusSearchInDatabase u q = do
...@@ -112,11 +113,7 @@ flowCorpusUser userName corpusName ids = do ...@@ -112,11 +113,7 @@ flowCorpusUser userName corpusName ids = do
-- User List Flow -- User List Flow
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
-- /!\ this extract NgramsTerms Only ngs <- buildNgramsLists userCorpusId masterCorpusId
ngs <- buildNgramsLists userCorpusId masterCorpusId
--printDebug "ngs" ngs
--TODO getNgramsElement of NgramsType...
--ngs <- getNgramsElementsWithParentNodeId masterCorpusId
userListId <- flowList userId userCorpusId ngs userListId <- flowList userId userCorpusId ngs
printDebug "userListId" userListId printDebug "userListId" userListId
...@@ -124,11 +121,10 @@ flowCorpusUser userName corpusName ids = do ...@@ -124,11 +121,10 @@ flowCorpusUser userName corpusName ids = do
_ <- mkGraph userCorpusId userId _ <- mkGraph userCorpusId userId
-- User Dashboard Flow -- User Dashboard Flow
_ <- mkDashboard userCorpusId userId -- _ <- mkDashboard userCorpusId userId
-- Annuaire Flow -- Annuaire Flow
-- _ <- mkAnnuaire rootUserId userId -- _ <- mkAnnuaire rootUserId userId
pure userCorpusId pure userCorpusId
...@@ -142,7 +138,7 @@ insertMasterDocs hs = do ...@@ -142,7 +138,7 @@ insertMasterDocs hs = do
ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments' ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
let documentsWithId = mergeData (toInserted ids) (toInsert hs) let documentsWithId = mergeData (toInserted ids) (toInsert hs)
docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
let maps = mapNodeIdNgrams docsWithNgrams let maps = mapNodeIdNgrams docsWithNgrams
......
...@@ -24,13 +24,12 @@ import Data.Set (Set) ...@@ -24,13 +24,12 @@ import Data.Set (Set)
import Data.Text (Text) import Data.Text (Text)
import Data.Tuple.Extra (second, swap) import Data.Tuple.Extra (second, swap)
import Database.PostgreSQL.Simple.SqlQQ (sql) import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Database.Config (nodeTypeId) import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..)) import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId) import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Gargantext.Database.Utils (Cmd, runPGSQuery) import Gargantext.Database.Utils (Cmd, runPGSQuery)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Text.Metrics.TFICF import Gargantext.Text.Metrics.TFICF
import Gargantext.Text.Terms.Mono.Stem (stem) import Gargantext.Text.Terms.Mono.Stem (stem)
...@@ -54,7 +53,7 @@ ngramsGroup l n = Text.intercalate " " ...@@ -54,7 +53,7 @@ ngramsGroup l n = Text.intercalate " "
sortTficf :: (Map Text (Double, Set Text)) sortTficf :: (Map Text (Double, Set Text))
-> [(Text, (Double, Set Text))] -> [ (Text,(Double, Set Text))]
sortTficf = List.sortOn (fst . snd) . toList sortTficf = List.sortOn (fst . snd) . toList
...@@ -69,8 +68,8 @@ getTficf' u m f = do ...@@ -69,8 +68,8 @@ getTficf' u m f = do
type Context = (Double, Map Text (Double, Set Text)) type Context = (Double, Map Text (Double, Set Text))
type Supra = Context type Supra = Context
type Infra = Context type Infra = Context
toTficfData :: Infra -> Supra toTficfData :: Infra -> Supra
-> Map Text (Double, Set Text) -> Map Text (Double, Set Text)
...@@ -105,17 +104,20 @@ groupNodesByNgramsWith f m = ...@@ -105,17 +104,20 @@ groupNodesByNgramsWith f m =
$ toList m $ toList m
------------------------------------------------------------------------ ------------------------------------------------------------------------
getNodesByNgramsUser :: CorpusId -> NgramsType -> Cmd err (Map Text (Set NodeId)) getNodesByNgramsUser :: CorpusId -> NgramsType
getNodesByNgramsUser cId nt = fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n)) -> Cmd err (Map Text (Set NodeId))
<$> selectNgramsByNodeUser cId nt getNodesByNgramsUser cId nt =
fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
selectNgramsByNodeUser :: CorpusId -> NgramsType -> Cmd err [(NodeId, Text)] <$> selectNgramsByNodeUser cId nt
selectNgramsByNodeUser cId nt = runPGSQuery
queryNgramsByNodeUser selectNgramsByNodeUser :: CorpusId -> NgramsType
( cId -> Cmd err [(NodeId, Text)]
, nodeTypeId NodeDocument selectNgramsByNodeUser cId nt =
, ngramsTypeId nt runPGSQuery queryNgramsByNodeUser
) ( cId
, nodeTypeId NodeDocument
, ngramsTypeId nt
)
queryNgramsByNodeUser :: DPS.Query queryNgramsByNodeUser :: DPS.Query
queryNgramsByNodeUser = [sql| queryNgramsByNodeUser = [sql|
...@@ -137,13 +139,17 @@ getOccByNgramsOnly :: CorpusId -> NgramsType -> [Text] ...@@ -137,13 +139,17 @@ getOccByNgramsOnly :: CorpusId -> NgramsType -> [Text]
getOccByNgramsOnly cId nt ngs = Map.map Set.size getOccByNgramsOnly cId nt ngs = Map.map Set.size
<$> getNodesByNgramsOnlyUser cId nt ngs <$> getNodesByNgramsOnlyUser cId nt ngs
getNodesByNgramsOnlyUser :: CorpusId -> NgramsType -> [Text] -> Cmd err (Map Text (Set NodeId)) getNodesByNgramsOnlyUser :: CorpusId -> NgramsType -> [Text]
getNodesByNgramsOnlyUser cId nt ngs = fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n)) -> Cmd err (Map Text (Set NodeId))
<$> selectNgramsOnlyByNodeUser cId nt ngs getNodesByNgramsOnlyUser cId nt ngs =
fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
<$> selectNgramsOnlyByNodeUser cId nt ngs
selectNgramsOnlyByNodeUser :: CorpusId -> NgramsType -> [Text] -> Cmd err [(NodeId, Text)] selectNgramsOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
selectNgramsOnlyByNodeUser cId nt tms = runPGSQuery queryNgramsOnlyByNodeUser (DPS.Only $ Values fields tms' ) -> Cmd err [(NodeId, Text)]
selectNgramsOnlyByNodeUser cId nt tms =
runPGSQuery queryNgramsOnlyByNodeUser (DPS.Only $ Values fields tms' )
where where
fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4", "int4", "int4"] fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4", "int4", "int4"]
tms' = map (\t -> (t,cId,nodeTypeId NodeDocument, ngramsTypeId nt)) tms tms' = map (\t -> (t,cId,nodeTypeId NodeDocument, ngramsTypeId nt)) tms
......
{-| {-|
Module : Gargantext.Text.Ngrams.Lists Module : Gargantext.Text.Ngrams.Lists
Description : Description : Tools to build lists
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-} -}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-} {-# LANGUAGE RankNTypes #-}
module Gargantext.Text.List module Gargantext.Text.List
where where
import Data.Set (Set)
import Data.Map (Map) import Data.Map (Map)
import Data.Set (Set)
import Data.Text (Text) import Data.Text (Text)
import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList) import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
...@@ -30,11 +26,11 @@ import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGro ...@@ -30,11 +26,11 @@ import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGro
import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Utils (Cmd) import Gargantext.Database.Utils (Cmd)
import Gargantext.Prelude import Gargantext.Prelude
import qualified Data.Char as Char
import qualified Data.List as List
import qualified Data.Map as Map import qualified Data.Map as Map
import qualified Data.Set as Set import qualified Data.Set as Set
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Data.List as List
import qualified Data.Char as Char
-- | TODO improve grouping functions of Authors, Sources, Institutes.. -- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: UserCorpusId -> MasterCorpusId buildNgramsLists :: UserCorpusId -> MasterCorpusId
...@@ -56,20 +52,19 @@ buildNgramsOthersList uCid groupIt nt = do ...@@ -56,20 +52,19 @@ buildNgramsOthersList uCid groupIt nt = do
) )
] ]
-- TODO remove hard coded parameters
buildNgramsTermsList :: UserCorpusId -> MasterCorpusId buildNgramsTermsList :: UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList uCid mCid = do buildNgramsTermsList uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 2) candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 2)
--printDebug "candidate" (length candidates) --printDebug "candidate" (length candidates)
let termList = toTermList (isStopTerm . fst) candidates let termList = toTermList (isStopTerm . fst) candidates
--printDebug "termlist" (length termList) --printDebug "termlist" (length termList)
let ngs = List.concat $ map toNgramsElement termList let ngs = List.concat $ map toNgramsElement termList
pure $ Map.fromList [(NgramsTerms, ngs)] pure $ Map.fromList [(NgramsTerms, ngs)]
toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement] toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
...@@ -87,10 +82,7 @@ toNgramsElement (listType, (_stem, (_score, setNgrams))) = ...@@ -87,10 +82,7 @@ toNgramsElement (listType, (_stem, (_score, setNgrams))) =
(mSetFromList []) (mSetFromList [])
) children ) children
-- TODO remove hard coded parameters
toTermList :: (a -> Bool) -> [a] -> [(ListType, a)] toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
toTermList stop ns = map (toTermList' stop CandidateTerm) xs toTermList stop ns = map (toTermList' stop CandidateTerm) xs
<> map (toTermList' stop GraphTerm) ys <> map (toTermList' stop GraphTerm) ys
...@@ -106,8 +98,8 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs ...@@ -106,8 +98,8 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs
ys = take b $ drop a ns ys = take b $ drop a ns
zs = drop b $ drop a ns zs = drop b $ drop a ns
a = 1 a = 10
b = 10000 b = 3000
isStopTerm :: Text -> Bool isStopTerm :: Text -> Bool
isStopTerm x = Text.length x < 3 isStopTerm x = Text.length x < 3
...@@ -118,4 +110,3 @@ isStopTerm x = Text.length x < 3 ...@@ -118,4 +110,3 @@ isStopTerm x = Text.length x < 3
. Text.replace "/" "" . Text.replace "/" ""
) x ) x
...@@ -22,7 +22,7 @@ module Gargantext.Text.Metrics ...@@ -22,7 +22,7 @@ module Gargantext.Text.Metrics
--import Debug.Trace (trace) --import Debug.Trace (trace)
--import Math.KMeans (kmeans, euclidSq, elements) --import Math.KMeans (kmeans, euclidSq, elements)
import Data.Map (Map) import Data.Map (Map)
import Data.Ord (Down(..)) import Data.List.Extra (sortOn)
import GHC.Real (round) import GHC.Real (round)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.Graph.Distances.Matrice import Gargantext.Viz.Graph.Distances.Matrice
...@@ -32,65 +32,14 @@ import qualified Data.Array.Accelerate.Interpreter as DAA ...@@ -32,65 +32,14 @@ import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.List as L import qualified Data.List as L
import qualified Data.Map as M import qualified Data.Map as M
data MapListSize = MapListSize Int type GraphListSize = Int
data InclusionSize = InclusionSize Int type InclusionSize = Int
data SampleBins = SampleBins Double
data Clusters = Clusters Int
data DefaultValue = DefaultValue Int
data FilterConfig = FilterConfig
{ fc_mapListSize :: MapListSize
, fc_inclusionSize :: InclusionSize
, fc_sampleBins :: SampleBins
, fc_clusters :: Clusters
, fc_defaultValue :: DefaultValue
}
filterCooc :: (Show t, Ord t)
=> FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc
where
ts = map _scored_terms $ takeSome fc $ coocScored cc
filterCooc' :: (Show t, Ord t)
=> FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m =
-- trace ("coocScored " <> show ts) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty (listToCombi identity ts)
-- | Map list creation
-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
-- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * SampleBins = MapListSize.
takeSome :: Ord t
=> FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
$ takeSample n m
$ L.take l'
$ reverse $ sortWith (Down . _scored_incExc) scores
-- splitKmeans k scores
where
-- TODO: benchmark with accelerate-example kmeans version
--splitKmeans x xs = L.concat $ map elements
-- $ V.take (k-1)
-- $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
-- euclidSq x xs
n = round ((fromIntegral l)/s)
m = round $ (fromIntegral $ length scores) / (s)
takeSample n' m' xs = -- trace ("splitKmeans " <> show (length xs)) $
L.concat $ map (L.take n')
$ map (sortWith (Down . _scored_incExc))
-- TODO use kmeans s instead of splitEvery
-- in order to split in s heteregenous parts
-- without homogeneous order hypothesis
$ splitEvery m'
$ sortWith (Down . _scored_speGen) xs
takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> [t]
takeScored listSize incSize = map _scored_terms
. linearTakes listSize incSize _scored_speGen
_scored_incExc
. scored
data Scored ts = Scored data Scored ts = Scored
{ _scored_terms :: !ts { _scored_terms :: !ts
...@@ -98,11 +47,45 @@ data Scored ts = Scored ...@@ -98,11 +47,45 @@ data Scored ts = Scored
, _scored_speGen :: !SpecificityGenericity , _scored_speGen :: !SpecificityGenericity
} deriving (Show) } deriving (Show)
-- TODO in the textflow we end up needing these indices, it might be better -- TODO in the textflow we end up needing these indices, it might be better
-- to compute them earlier and pass them around. -- to compute them earlier and pass them around.
coocScored :: Ord t => Map (t,t) Int -> [Scored t] scored :: Ord t => Map (t,t) Int -> [Scored t]
coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores scored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
where where
(ti, fi) = createIndices m (ti, fi) = createIndices m
(is, ss) = incExcSpeGen $ cooc2mat ti m (is, ss) = incExcSpeGen $ cooc2mat ti m
scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss) scores = DAA.toList
$ DAA.run
$ DAA.zip (DAA.use is) (DAA.use ss)
-- | Filter Scored data
-- >>> linearTakes 2 3 fst snd $ Prelude.zip ([1..10] :: [Int]) (reverse $ [1..10] :: [Int])
-- [(3,8),(6,5)]
linearTakes :: (Ord b1, Ord b2)
=> GraphListSize -> InclusionSize -> (a -> b2) -> (a -> b1) -> [a] -> [a]
linearTakes gls incSize speGen incExc = take gls
. L.concat
. map (take $ round
$ (fromIntegral gls :: Double)
/ (fromIntegral incSize :: Double)
)
. map (sortOn incExc)
. splitEvery incSize
. sortOn speGen
-- | Filters
{- splitKmeans k scores
TODO: benchmark with accelerate-example kmeans version
splitKmeans x xs = L.concat $ map elements
$ V.take (k-1)
$ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
euclidSq x xs
-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
-- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * SampleBins = MapListSize.
-}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment