Commit 8804c4e7 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[CLEAN] metrics

parent 80fbde18
...@@ -69,7 +69,9 @@ flowList_DbRepo lId ngs = do ...@@ -69,7 +69,9 @@ flowList_DbRepo lId ngs = do
-- Inserting groups of ngrams -- Inserting groups of ngrams
_r <- insert_Node_NodeNgrams_NodeNgrams _r <- insert_Node_NodeNgrams_NodeNgrams
$ map (\(a,b) -> Node_NodeNgrams_NodeNgrams lId a b Nothing) toInsert $ map (\(a,b) -> Node_NodeNgrams_NodeNgrams lId a b Nothing) toInsert
listInsert lId ngs listInsert lId ngs
--trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
pure lId pure lId
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -88,6 +90,7 @@ toNodeNgramsW l ngs = List.concat $ map (toNodeNgramsW'' l) ngs ...@@ -88,6 +90,7 @@ toNodeNgramsW l ngs = List.concat $ map (toNodeNgramsW'' l) ngs
(NgramsElement ngrams_terms' _size list_type _occ _root _parent _children) <- elms (NgramsElement ngrams_terms' _size list_type _occ _root _parent _children) <- elms
] ]
toNodeNgramsW' :: ListId toNodeNgramsW' :: ListId
-> [(Text, [NgramsType])] -> [(Text, [NgramsType])]
-> [NodeNgramsW] -> [NodeNgramsW]
...@@ -102,8 +105,7 @@ listInsert :: FlowCmdM env err m ...@@ -102,8 +105,7 @@ listInsert :: FlowCmdM env err m
-> Map NgramsType [NgramsElement] -> Map NgramsType [NgramsElement]
-> m () -> m ()
listInsert lId ngs = mapM_ (\(typeList, ngElmts) listInsert lId ngs = mapM_ (\(typeList, ngElmts)
-> putListNgrams lId typeList ngElmts -> putListNgrams lId typeList ngElmts) (toList ngs)
) $ toList ngs
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -46,7 +46,11 @@ import qualified Database.PostgreSQL.Simple as DPS ...@@ -46,7 +46,11 @@ import qualified Database.PostgreSQL.Simple as DPS
-- discussed. Main purpose of this is offering -- discussed. Main purpose of this is offering
-- a first grouping option to user and get some -- a first grouping option to user and get some
-- enriched data to better learn and improve that algo -- enriched data to better learn and improve that algo
ngramsGroup :: Lang -> Int -> Int -> Text -> Text ngramsGroup :: Lang
-> Int
-> Int
-> Text
-> Text
ngramsGroup l _m _n = Text.intercalate " " ngramsGroup l _m _n = Text.intercalate " "
. map (stem l) . map (stem l)
-- . take n -- . take n
...@@ -61,17 +65,24 @@ sortTficf :: (Map Text (Double, Set Text)) ...@@ -61,17 +65,24 @@ sortTficf :: (Map Text (Double, Set Text))
sortTficf = List.sortOn (fst . snd) . toList sortTficf = List.sortOn (fst . snd) . toList
getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text) getTficf :: UserCorpusId
-> MasterCorpusId
-> NgramsType
-> (Text -> Text)
-> Cmd err (Map Text (Double, Set Text)) -> Cmd err (Map Text (Double, Set Text))
getTficf' u m nt f = do getTficf u m nt f = do
u' <- getNodesByNgramsUser u nt u' <- getNodesByNgramsUser u nt
m' <- getNodesByNgramsMaster u m m' <- getNodesByNgramsMaster u m
pure $ toTficfData (countNodesByNgramsWith f u') pure $ toTficfData (countNodesByNgramsWith f u')
(countNodesByNgramsWith f m') (countNodesByNgramsWith f m')
getTficfWith :: UserCorpusId -> MasterCorpusId -> [ListId] {-
-> NgramsType -> Map Text (Maybe Text) getTficfWith :: UserCorpusId
-> MasterCorpusId
-> [ListId]
-> NgramsType
-> Map Text (Maybe Text)
-> Cmd err (Map Text (Double, Set Text)) -> Cmd err (Map Text (Double, Set Text))
getTficfWith u m ls nt mtxt = do getTficfWith u m ls nt mtxt = do
u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt) u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
...@@ -83,13 +94,14 @@ getTficfWith u m ls nt mtxt = do ...@@ -83,13 +94,14 @@ getTficfWith u m ls nt mtxt = do
pure $ toTficfData (countNodesByNgramsWith f u') pure $ toTficfData (countNodesByNgramsWith f u')
(countNodesByNgramsWith f m') (countNodesByNgramsWith f m')
-}
type Context = (Double, Map Text (Double, Set Text)) type Context = (Double, Map Text (Double, Set Text))
type Supra = Context type Supra = Context
type Infra = Context type Infra = Context
toTficfData :: Infra -> Supra toTficfData :: Infra
-> Supra
-> Map Text (Double, Set Text) -> Map Text (Double, Set Text)
toTficfData (ti, mi) (ts, ms) = toTficfData (ti, mi) (ts, ms) =
fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti)) fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
...@@ -129,7 +141,8 @@ getNodesByNgramsUser cId nt = ...@@ -129,7 +141,8 @@ getNodesByNgramsUser cId nt =
<$> selectNgramsByNodeUser cId nt <$> selectNgramsByNodeUser cId nt
where where
selectNgramsByNodeUser :: CorpusId -> NgramsType selectNgramsByNodeUser :: CorpusId
-> NgramsType
-> Cmd err [(NodeId, Text)] -> Cmd err [(NodeId, Text)]
selectNgramsByNodeUser cId' nt' = selectNgramsByNodeUser cId' nt' =
runPGSQuery queryNgramsByNodeUser runPGSQuery queryNgramsByNodeUser
...@@ -202,8 +215,6 @@ getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $ ...@@ -202,8 +215,6 @@ getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
|] |]
-- just slower than getOccByNgramsOnlyFast -- just slower than getOccByNgramsOnlyFast
getOccByNgramsOnlySlow :: NodeType getOccByNgramsOnlySlow :: NodeType
-> CorpusId -> CorpusId
...@@ -228,11 +239,14 @@ getOccByNgramsOnlySafe cId ls nt ngs = do ...@@ -228,11 +239,14 @@ getOccByNgramsOnlySafe cId ls nt ngs = do
fast <- getOccByNgramsOnlyFast cId nt ngs fast <- getOccByNgramsOnlyFast cId nt ngs
slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
when (fast /= slow) $ when (fast /= slow) $
printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int))) printDebug "getOccByNgramsOnlySafe: difference"
(diff slow fast :: PatchMap Text (Replace (Maybe Int)))
pure slow pure slow
selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text] selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
-> NgramsType
-> [Text]
-> Cmd err [(Text, Int)] -> Cmd err [(Text, Int)]
selectNgramsOccurrencesOnlyByNodeUser cId nt tms = selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
runPGSQuery queryNgramsOccurrencesOnlyByNodeUser runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
...@@ -262,8 +276,6 @@ queryNgramsOccurrencesOnlyByNodeUser = [sql| ...@@ -262,8 +276,6 @@ queryNgramsOccurrencesOnlyByNodeUser = [sql|
GROUP BY nng.node2_id, ng.terms GROUP BY nng.node2_id, ng.terms
|] |]
queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
queryNgramsOccurrencesOnlyByNodeUser' = [sql| queryNgramsOccurrencesOnlyByNodeUser' = [sql|
WITH input_rows(terms) AS (?) WITH input_rows(terms) AS (?)
...@@ -280,8 +292,10 @@ queryNgramsOccurrencesOnlyByNodeUser' = [sql| ...@@ -280,8 +292,10 @@ queryNgramsOccurrencesOnlyByNodeUser' = [sql|
|] |]
------------------------------------------------------------------------ ------------------------------------------------------------------------
getNodesByNgramsOnlyUser :: NodeId
getNodesByNgramsOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text] -> [ListId]
-> NgramsType
-> [Text]
-> Cmd err (Map Text (Set NodeId)) -> Cmd err (Map Text (Set NodeId))
getNodesByNgramsOnlyUser cId ls nt ngs = getNodesByNgramsOnlyUser cId ls nt ngs =
Map.unionsWith (<>) Map.unionsWith (<>)
...@@ -305,12 +319,16 @@ getNgramsByNodeOnlyUser cId ls nt ngs = ...@@ -305,12 +319,16 @@ getNgramsByNodeOnlyUser cId ls nt ngs =
(splitEvery 1000 ngs) (splitEvery 1000 ngs)
------------------------------------------------------------------------ ------------------------------------------------------------------------
selectNgramsOnlyByNodeUser :: CorpusId -> [ListId] -> NgramsType -> [Text] selectNgramsOnlyByNodeUser :: CorpusId
-> [ListId]
-> NgramsType
-> [Text]
-> Cmd err [(Text, NodeId)] -> Cmd err [(Text, NodeId)]
selectNgramsOnlyByNodeUser cId ls nt tms = selectNgramsOnlyByNodeUser cId ls nt tms =
runPGSQuery queryNgramsOnlyByNodeUser runPGSQuery queryNgramsOnlyByNodeUser
( Values fields (DPS.Only <$> tms) ( Values fields (DPS.Only <$> tms)
, Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls)) , Values [QualifiedIdentifier Nothing "int4"]
(DPS.Only <$> (map (\(NodeId n) -> n) ls))
, cId , cId
, nodeTypeId NodeDocument , nodeTypeId NodeDocument
, ngramsTypeId nt , ngramsTypeId nt
...@@ -336,13 +354,16 @@ queryNgramsOnlyByNodeUser = [sql| ...@@ -336,13 +354,16 @@ queryNgramsOnlyByNodeUser = [sql|
|] |]
selectNgramsOnlyByNodeUser' :: CorpusId
selectNgramsOnlyByNodeUser' :: CorpusId -> [ListId] -> NgramsType -> [Text] -> [ListId]
-> NgramsType
-> [Text]
-> Cmd err [(Text, Int)] -> Cmd err [(Text, Int)]
selectNgramsOnlyByNodeUser' cId ls nt tms = selectNgramsOnlyByNodeUser' cId ls nt tms =
runPGSQuery queryNgramsOnlyByNodeUser runPGSQuery queryNgramsOnlyByNodeUser
( Values fields (DPS.Only <$> tms) ( Values fields (DPS.Only <$> tms)
, Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls)) , Values [QualifiedIdentifier Nothing "int4"]
(DPS.Only <$> (map (\(NodeId n) -> n) ls))
, cId , cId
, nodeTypeId NodeDocument , nodeTypeId NodeDocument
, ngramsTypeId nt , ngramsTypeId nt
...@@ -365,8 +386,10 @@ queryNgramsOnlyByNodeUser' = [sql| ...@@ -365,8 +386,10 @@ queryNgramsOnlyByNodeUser' = [sql|
|] |]
getNgramsByDocOnlyUser :: NodeId
getNgramsByDocOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text] -> [ListId]
-> NgramsType
-> [Text]
-> Cmd err (Map Text (Set NodeId)) -> Cmd err (Map Text (Set NodeId))
getNgramsByDocOnlyUser cId ls nt ngs = getNgramsByDocOnlyUser cId ls nt ngs =
Map.unionsWith (<>) Map.unionsWith (<>)
...@@ -374,18 +397,23 @@ getNgramsByDocOnlyUser cId ls nt ngs = ...@@ -374,18 +397,23 @@ getNgramsByDocOnlyUser cId ls nt ngs =
<$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs) <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
selectNgramsOnlyByDocUser :: DocId -> [ListId] -> NgramsType -> [Text] selectNgramsOnlyByDocUser :: DocId
-> [ListId]
-> NgramsType
-> [Text]
-> Cmd err [(Text, NodeId)] -> Cmd err [(Text, NodeId)]
selectNgramsOnlyByDocUser dId ls nt tms = selectNgramsOnlyByDocUser dId ls nt tms =
runPGSQuery queryNgramsOnlyByDocUser runPGSQuery queryNgramsOnlyByDocUser
( Values fields (DPS.Only <$> tms) ( Values fields (DPS.Only <$> tms)
, Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls)) , Values [QualifiedIdentifier Nothing "int4"]
(DPS.Only <$> (map (\(NodeId n) -> n) ls))
, dId , dId
, ngramsTypeId nt , ngramsTypeId nt
) )
where where
fields = [QualifiedIdentifier Nothing "text"] fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOnlyByDocUser :: DPS.Query queryNgramsOnlyByDocUser :: DPS.Query
queryNgramsOnlyByDocUser = [sql| queryNgramsOnlyByDocUser = [sql|
WITH input_rows(terms) AS (?), WITH input_rows(terms) AS (?),
...@@ -408,8 +436,11 @@ getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>) ...@@ -408,8 +436,11 @@ getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
-- . takeWhile (\l -> List.length l > 3) -- . takeWhile (\l -> List.length l > 3)
<$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000] <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
selectNgramsByNodeMaster :: Int
selectNgramsByNodeMaster :: Int -> UserCorpusId -> MasterCorpusId -> Int -> Cmd err [(NodeId, Text)] -> UserCorpusId
-> MasterCorpusId
-> Int
-> Cmd err [(NodeId, Text)]
selectNgramsByNodeMaster n ucId mcId p = runPGSQuery selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
queryNgramsByNodeMaster' queryNgramsByNodeMaster'
( ucId ( ucId
...@@ -458,5 +489,3 @@ queryNgramsByNodeMaster' = [sql| ...@@ -458,5 +489,3 @@ queryNgramsByNodeMaster' = [sql|
SELECT m.id, m.terms FROM nodesByNgramsMaster m SELECT m.id, m.terms FROM nodesByNgramsMaster m
RIGHT JOIN nodesByNgramsUser u ON u.id = m.id RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
|] |]
...@@ -25,7 +25,7 @@ import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mS ...@@ -25,7 +25,7 @@ import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mS
import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..)) import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId) import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId)
import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith) import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
import Gargantext.Database.Admin.Utils (Cmd) import Gargantext.Database.Admin.Utils (Cmd)
import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude import Gargantext.Prelude
...@@ -56,7 +56,12 @@ data NgramsListBuilder = BuilderStepO { stemSize :: Int ...@@ -56,7 +56,12 @@ data NgramsListBuilder = BuilderStepO { stemSize :: Int
data StopSize = StopSize {unStopSize :: Int} data StopSize = StopSize {unStopSize :: Int}
-- | TODO improve grouping functions of Authors, Sources, Institutes.. -- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId buildNgramsLists :: Lang
-> Int
-> Int
-> StopSize
-> UserCorpusId
-> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsLists l n m s uCid mCid = do buildNgramsLists l n m s uCid mCid = do
ngTerms <- buildNgramsTermsList l n m s uCid mCid ngTerms <- buildNgramsTermsList l n m s uCid mCid
...@@ -64,7 +69,9 @@ buildNgramsLists l n m s uCid mCid = do ...@@ -64,7 +69,9 @@ buildNgramsLists l n m s uCid mCid = do
pure $ Map.unions $ othersTerms <> [ngTerms] pure $ Map.unions $ othersTerms <> [ngTerms]
buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType buildNgramsOthersList :: UserCorpusId
-> (Text -> Text)
-> NgramsType
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsOthersList uCid groupIt nt = do buildNgramsOthersList uCid groupIt nt = do
ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
...@@ -83,12 +90,14 @@ buildNgramsOthersList uCid groupIt nt = do ...@@ -83,12 +90,14 @@ buildNgramsOthersList uCid groupIt nt = do
) )
] ]
--{- {-
buildNgramsTermsList' :: UserCorpusId buildNgramsTermsList' :: UserCorpusId
-> (Text -> Text) -> (Text -> Text)
-> ((Text, (Set Text, Set NodeId)) -> Bool) -> Int -> Int -> ((Text, (Set Text, Set NodeId)) -> Bool)
-> Int
-> Int
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
--}
buildNgramsTermsList' uCid groupIt stop gls is = do buildNgramsTermsList' uCid groupIt stop gls is = do
ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
...@@ -117,21 +126,31 @@ buildNgramsTermsList' uCid groupIt stop gls is = do ...@@ -117,21 +126,31 @@ buildNgramsTermsList' uCid groupIt stop gls is = do
<> map (\t -> (GraphTerm, toList' t)) m <> map (\t -> (GraphTerm, toList' t)) m
pure $ Map.fromList [(NgramsTerms, ngs')] pure $ Map.fromList [(NgramsTerms, ngs')]
-}
buildNgramsTermsList :: Lang
buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId -> Int
-> Int
-> StopSize
-> UserCorpusId
-> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList l n m s uCid mCid = do buildNgramsTermsList l n m s uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m) candidates <- sortTficf <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m)
let let
candidatesSize = 2000 candidatesSize = 2000
a = 10 a = 10
b = 10 b = 10
candidatesHead = List.take candidatesSize candidates candidatesHead = List.take candidatesSize candidates
candidatesTail = List.drop candidatesSize candidates candidatesTail = List.drop candidatesSize candidates
termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead) termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead)
<> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail) <> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
let ngs = List.concat $ map toNgramsElement termList
ngs = List.concat $ map toNgramsElement termList
pure $ Map.fromList [(NgramsTerms, ngs)] pure $ Map.fromList [(NgramsTerms, ngs)]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment