Commit 020e78de authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FLOW.GRAPH.METRICS] spec gen inc exc.

parent 07305554
Pipeline #267 failed with stage
......@@ -282,8 +282,8 @@ graphAPI nId = do
]
-- (map (\n -> LegendField n "#FFFFFF" (pack $ show n)) [1..10])
let cId = maybe (panic "no parentId") identity $ _node_parentId nodeGraph
lId <- defaultList cId
lId <- defaultList cId
ngs <- filterListWithRoot GraphTerm <$> mapTermListRoot [lId] NgramsTerms
myCooc <- Map.filter (>1) <$> getCoocByNgrams
......@@ -296,7 +296,7 @@ graphAPI nId = do
instance HasNodeError ServantErr where
_NodeError = prism' mk (const Nothing) -- $ panic "HasNodeError ServantErr: not a prism")
where
e = "NodeError: "
e = "Gargantext.NodeError: "
mk NoListFound = err404 { errBody = e <> "No list found" }
mk NoRootFound = err404 { errBody = e <> "No Root found" }
mk NoCorpusFound = err404 { errBody = e <> "No Corpus found" }
......@@ -333,18 +333,20 @@ rename nId (RenameNode name') = U.update (U.Rename nId name')
getTable :: NodeId -> Maybe TabType
-> Maybe Offset -> Maybe Limit
-> Maybe OrderBy -> Cmd err [FacetDoc]
getTable cId ft o l order = case ft of
(Just Docs) -> runViewDocuments cId False o l order
(Just Trash) -> runViewDocuments cId True o l order
_ -> panic "not implemented"
getTable cId ft o l order =
case ft of
(Just Docs) -> runViewDocuments cId False o l order
(Just Trash) -> runViewDocuments cId True o l order
_ -> panic "not implemented"
getPairing :: ContactId -> Maybe TabType
-> Maybe Offset -> Maybe Limit
-> Maybe OrderBy -> Cmd err [FacetDoc]
getPairing cId ft o l order = case ft of
(Just Docs) -> runViewAuthorsDoc cId False o l order
(Just Trash) -> runViewAuthorsDoc cId True o l order
_ -> panic "not implemented"
getPairing cId ft o l order =
case ft of
(Just Docs) -> runViewAuthorsDoc cId False o l order
(Just Trash) -> runViewAuthorsDoc cId True o l order
_ -> panic "not implemented"
getChart :: NodeId -> Maybe UTCTime -> Maybe UTCTime
......
......@@ -83,6 +83,7 @@ flowCorpus u cn ff fp = do
ids <- flowCorpusMaster ff fp
flowCorpusUser u cn ids
-- TODO query with complex query
flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
=> Username -> Text -> m CorpusId
flowCorpusSearchInDatabase u q = do
......@@ -112,11 +113,7 @@ flowCorpusUser userName corpusName ids = do
-- User List Flow
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
-- /!\ this extract NgramsTerms Only
ngs <- buildNgramsLists userCorpusId masterCorpusId
--printDebug "ngs" ngs
--TODO getNgramsElement of NgramsType...
--ngs <- getNgramsElementsWithParentNodeId masterCorpusId
ngs <- buildNgramsLists userCorpusId masterCorpusId
userListId <- flowList userId userCorpusId ngs
printDebug "userListId" userListId
......@@ -124,11 +121,10 @@ flowCorpusUser userName corpusName ids = do
_ <- mkGraph userCorpusId userId
-- User Dashboard Flow
_ <- mkDashboard userCorpusId userId
-- _ <- mkDashboard userCorpusId userId
-- Annuaire Flow
-- _ <- mkAnnuaire rootUserId userId
pure userCorpusId
......@@ -142,7 +138,7 @@ insertMasterDocs hs = do
ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
let documentsWithId = mergeData (toInserted ids) (toInsert hs)
docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
let maps = mapNodeIdNgrams docsWithNgrams
......
......@@ -24,13 +24,12 @@ import Data.Set (Set)
import Data.Text (Text)
import Data.Tuple.Extra (second, swap)
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Gargantext.Core (Lang(..))
import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Gargantext.Database.Utils (Cmd, runPGSQuery)
import Gargantext.Prelude
import Gargantext.Text.Metrics.TFICF
import Gargantext.Text.Terms.Mono.Stem (stem)
......@@ -54,7 +53,7 @@ ngramsGroup l n = Text.intercalate " "
sortTficf :: (Map Text (Double, Set Text))
-> [(Text, (Double, Set Text))]
-> [ (Text,(Double, Set Text))]
sortTficf = List.sortOn (fst . snd) . toList
......@@ -69,8 +68,8 @@ getTficf' u m f = do
type Context = (Double, Map Text (Double, Set Text))
type Supra = Context
type Infra = Context
type Supra = Context
type Infra = Context
toTficfData :: Infra -> Supra
-> Map Text (Double, Set Text)
......@@ -105,17 +104,20 @@ groupNodesByNgramsWith f m =
$ toList m
------------------------------------------------------------------------
getNodesByNgramsUser :: CorpusId -> NgramsType -> Cmd err (Map Text (Set NodeId))
getNodesByNgramsUser cId nt = fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
<$> selectNgramsByNodeUser cId nt
selectNgramsByNodeUser :: CorpusId -> NgramsType -> Cmd err [(NodeId, Text)]
selectNgramsByNodeUser cId nt = runPGSQuery
queryNgramsByNodeUser
( cId
, nodeTypeId NodeDocument
, ngramsTypeId nt
)
getNodesByNgramsUser :: CorpusId -> NgramsType
-> Cmd err (Map Text (Set NodeId))
getNodesByNgramsUser cId nt =
fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
<$> selectNgramsByNodeUser cId nt
selectNgramsByNodeUser :: CorpusId -> NgramsType
-> Cmd err [(NodeId, Text)]
selectNgramsByNodeUser cId nt =
runPGSQuery queryNgramsByNodeUser
( cId
, nodeTypeId NodeDocument
, ngramsTypeId nt
)
queryNgramsByNodeUser :: DPS.Query
queryNgramsByNodeUser = [sql|
......@@ -137,13 +139,17 @@ getOccByNgramsOnly :: CorpusId -> NgramsType -> [Text]
getOccByNgramsOnly cId nt ngs = Map.map Set.size
<$> getNodesByNgramsOnlyUser cId nt ngs
getNodesByNgramsOnlyUser :: CorpusId -> NgramsType -> [Text] -> Cmd err (Map Text (Set NodeId))
getNodesByNgramsOnlyUser cId nt ngs = fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
<$> selectNgramsOnlyByNodeUser cId nt ngs
getNodesByNgramsOnlyUser :: CorpusId -> NgramsType -> [Text]
-> Cmd err (Map Text (Set NodeId))
getNodesByNgramsOnlyUser cId nt ngs =
fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
<$> selectNgramsOnlyByNodeUser cId nt ngs
selectNgramsOnlyByNodeUser :: CorpusId -> NgramsType -> [Text] -> Cmd err [(NodeId, Text)]
selectNgramsOnlyByNodeUser cId nt tms = runPGSQuery queryNgramsOnlyByNodeUser (DPS.Only $ Values fields tms' )
selectNgramsOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
-> Cmd err [(NodeId, Text)]
selectNgramsOnlyByNodeUser cId nt tms =
runPGSQuery queryNgramsOnlyByNodeUser (DPS.Only $ Values fields tms' )
where
fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4", "int4", "int4"]
tms' = map (\t -> (t,cId,nodeTypeId NodeDocument, ngramsTypeId nt)) tms
......
{-|
Module : Gargantext.Text.Ngrams.Lists
Description :
Description : Tools to build lists
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
module Gargantext.Text.List
where
import Data.Set (Set)
import Data.Map (Map)
import Data.Set (Set)
import Data.Text (Text)
import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
import Gargantext.Core (Lang(..))
......@@ -30,11 +26,11 @@ import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGro
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Utils (Cmd)
import Gargantext.Prelude
import qualified Data.Char as Char
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Data.List as List
import qualified Data.Char as Char
-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: UserCorpusId -> MasterCorpusId
......@@ -56,20 +52,19 @@ buildNgramsOthersList uCid groupIt nt = do
)
]
-- TODO remove hard coded parameters
buildNgramsTermsList :: UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement])
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 2)
--printDebug "candidate" (length candidates)
candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 2)
--printDebug "candidate" (length candidates)
let termList = toTermList (isStopTerm . fst) candidates
--printDebug "termlist" (length termList)
let termList = toTermList (isStopTerm . fst) candidates
--printDebug "termlist" (length termList)
let ngs = List.concat $ map toNgramsElement termList
let ngs = List.concat $ map toNgramsElement termList
pure $ Map.fromList [(NgramsTerms, ngs)]
pure $ Map.fromList [(NgramsTerms, ngs)]
toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
......@@ -87,10 +82,7 @@ toNgramsElement (listType, (_stem, (_score, setNgrams))) =
(mSetFromList [])
) children
-- TODO remove hard coded parameters
toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
toTermList stop ns = map (toTermList' stop CandidateTerm) xs
<> map (toTermList' stop GraphTerm) ys
......@@ -106,8 +98,8 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs
ys = take b $ drop a ns
zs = drop b $ drop a ns
a = 1
b = 10000
a = 10
b = 3000
isStopTerm :: Text -> Bool
isStopTerm x = Text.length x < 3
......@@ -118,4 +110,3 @@ isStopTerm x = Text.length x < 3
. Text.replace "/" ""
) x
......@@ -22,7 +22,7 @@ module Gargantext.Text.Metrics
--import Debug.Trace (trace)
--import Math.KMeans (kmeans, euclidSq, elements)
import Data.Map (Map)
import Data.Ord (Down(..))
import Data.List.Extra (sortOn)
import GHC.Real (round)
import Gargantext.Prelude
import Gargantext.Viz.Graph.Distances.Matrice
......@@ -32,65 +32,14 @@ import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.List as L
import qualified Data.Map as M
data MapListSize = MapListSize Int
data InclusionSize = InclusionSize Int
data SampleBins = SampleBins Double
data Clusters = Clusters Int
data DefaultValue = DefaultValue Int
data FilterConfig = FilterConfig
{ fc_mapListSize :: MapListSize
, fc_inclusionSize :: InclusionSize
, fc_sampleBins :: SampleBins
, fc_clusters :: Clusters
, fc_defaultValue :: DefaultValue
}
filterCooc :: (Show t, Ord t)
=> FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc
where
ts = map _scored_terms $ takeSome fc $ coocScored cc
filterCooc' :: (Show t, Ord t)
=> FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m =
-- trace ("coocScored " <> show ts) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty (listToCombi identity ts)
-- | Map list creation
-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
-- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * SampleBins = MapListSize.
takeSome :: Ord t
=> FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
$ takeSample n m
$ L.take l'
$ reverse $ sortWith (Down . _scored_incExc) scores
-- splitKmeans k scores
where
-- TODO: benchmark with accelerate-example kmeans version
--splitKmeans x xs = L.concat $ map elements
-- $ V.take (k-1)
-- $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
-- euclidSq x xs
n = round ((fromIntegral l)/s)
m = round $ (fromIntegral $ length scores) / (s)
takeSample n' m' xs = -- trace ("splitKmeans " <> show (length xs)) $
L.concat $ map (L.take n')
$ map (sortWith (Down . _scored_incExc))
-- TODO use kmeans s instead of splitEvery
-- in order to split in s heteregenous parts
-- without homogeneous order hypothesis
$ splitEvery m'
$ sortWith (Down . _scored_speGen) xs
type GraphListSize = Int
type InclusionSize = Int
takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> [t]
takeScored listSize incSize = map _scored_terms
. linearTakes listSize incSize _scored_speGen
_scored_incExc
. scored
data Scored ts = Scored
{ _scored_terms :: !ts
......@@ -98,11 +47,45 @@ data Scored ts = Scored
, _scored_speGen :: !SpecificityGenericity
} deriving (Show)
-- TODO in the textflow we end up needing these indices, it might be better
-- to compute them earlier and pass them around.
coocScored :: Ord t => Map (t,t) Int -> [Scored t]
coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
scored :: Ord t => Map (t,t) Int -> [Scored t]
scored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
where
(ti, fi) = createIndices m
(is, ss) = incExcSpeGen $ cooc2mat ti m
scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
scores = DAA.toList
$ DAA.run
$ DAA.zip (DAA.use is) (DAA.use ss)
-- | Filter Scored data
-- >>> linearTakes 2 3 fst snd $ Prelude.zip ([1..10] :: [Int]) (reverse $ [1..10] :: [Int])
-- [(3,8),(6,5)]
linearTakes :: (Ord b1, Ord b2)
=> GraphListSize -> InclusionSize -> (a -> b2) -> (a -> b1) -> [a] -> [a]
linearTakes gls incSize speGen incExc = take gls
. L.concat
. map (take $ round
$ (fromIntegral gls :: Double)
/ (fromIntegral incSize :: Double)
)
. map (sortOn incExc)
. splitEvery incSize
. sortOn speGen
-- | Filters
{- splitKmeans k scores
TODO: benchmark with accelerate-example kmeans version
splitKmeans x xs = L.concat $ map elements
$ V.take (k-1)
$ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
euclidSq x xs
-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
-- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * SampleBins = MapListSize.
-}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment