Commit 8804c4e7 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[CLEAN] metrics

parent 80fbde18
......@@ -69,7 +69,9 @@ flowList_DbRepo lId ngs = do
-- Inserting groups of ngrams
_r <- insert_Node_NodeNgrams_NodeNgrams
$ map (\(a,b) -> Node_NodeNgrams_NodeNgrams lId a b Nothing) toInsert
listInsert lId ngs
--trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
pure lId
------------------------------------------------------------------------
......@@ -88,6 +90,7 @@ toNodeNgramsW l ngs = List.concat $ map (toNodeNgramsW'' l) ngs
(NgramsElement ngrams_terms' _size list_type _occ _root _parent _children) <- elms
]
toNodeNgramsW' :: ListId
-> [(Text, [NgramsType])]
-> [NodeNgramsW]
......@@ -102,8 +105,7 @@ listInsert :: FlowCmdM env err m
-> Map NgramsType [NgramsElement]
-> m ()
listInsert lId ngs = mapM_ (\(typeList, ngElmts)
-> putListNgrams lId typeList ngElmts
) $ toList ngs
-> putListNgrams lId typeList ngElmts) (toList ngs)
------------------------------------------------------------------------
------------------------------------------------------------------------
......@@ -46,7 +46,11 @@ import qualified Database.PostgreSQL.Simple as DPS
-- discussed. Main purpose of this is offering
-- a first grouping option to user and get some
-- enriched data to better learn and improve that algo
ngramsGroup :: Lang -> Int -> Int -> Text -> Text
ngramsGroup :: Lang
-> Int
-> Int
-> Text
-> Text
ngramsGroup l _m _n = Text.intercalate " "
. map (stem l)
-- . take n
......@@ -61,18 +65,25 @@ sortTficf :: (Map Text (Double, Set Text))
sortTficf = List.sortOn (fst . snd) . toList
getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text)
getTficf :: UserCorpusId
-> MasterCorpusId
-> NgramsType
-> (Text -> Text)
-> Cmd err (Map Text (Double, Set Text))
getTficf' u m nt f = do
getTficf u m nt f = do
u' <- getNodesByNgramsUser u nt
m' <- getNodesByNgramsMaster u m
pure $ toTficfData (countNodesByNgramsWith f u')
(countNodesByNgramsWith f m')
getTficfWith :: UserCorpusId -> MasterCorpusId -> [ListId]
-> NgramsType -> Map Text (Maybe Text)
-> Cmd err (Map Text (Double, Set Text))
{-
getTficfWith :: UserCorpusId
-> MasterCorpusId
-> [ListId]
-> NgramsType
-> Map Text (Maybe Text)
-> Cmd err (Map Text (Double, Set Text))
getTficfWith u m ls nt mtxt = do
u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
m' <- getNodesByNgramsMaster u m
......@@ -83,13 +94,14 @@ getTficfWith u m ls nt mtxt = do
pure $ toTficfData (countNodesByNgramsWith f u')
(countNodesByNgramsWith f m')
-}
type Context = (Double, Map Text (Double, Set Text))
type Supra = Context
type Infra = Context
toTficfData :: Infra -> Supra
toTficfData :: Infra
-> Supra
-> Map Text (Double, Set Text)
toTficfData (ti, mi) (ts, ms) =
fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
......@@ -129,7 +141,8 @@ getNodesByNgramsUser cId nt =
<$> selectNgramsByNodeUser cId nt
where
selectNgramsByNodeUser :: CorpusId -> NgramsType
selectNgramsByNodeUser :: CorpusId
-> NgramsType
-> Cmd err [(NodeId, Text)]
selectNgramsByNodeUser cId' nt' =
runPGSQuery queryNgramsByNodeUser
......@@ -172,10 +185,10 @@ getOccByNgramsOnlyFast' :: CorpusId
-> Cmd err (Map Text Int)
getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
fromListWith (+) <$> map (second round) <$> run cId lId nt tms
where
fields = [QualifiedIdentifier Nothing "text"]
run :: CorpusId
-> ListId
-> NgramsType
......@@ -202,8 +215,6 @@ getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
|]
-- just slower than getOccByNgramsOnlyFast
getOccByNgramsOnlySlow :: NodeType
-> CorpusId
......@@ -228,12 +239,15 @@ getOccByNgramsOnlySafe cId ls nt ngs = do
fast <- getOccByNgramsOnlyFast cId nt ngs
slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
when (fast /= slow) $
printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
printDebug "getOccByNgramsOnlySafe: difference"
(diff slow fast :: PatchMap Text (Replace (Maybe Int)))
pure slow
selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
-> Cmd err [(Text, Int)]
selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
-> NgramsType
-> [Text]
-> Cmd err [(Text, Int)]
selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
( Values fields (DPS.Only <$> tms)
......@@ -262,8 +276,6 @@ queryNgramsOccurrencesOnlyByNodeUser = [sql|
GROUP BY nng.node2_id, ng.terms
|]
queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
queryNgramsOccurrencesOnlyByNodeUser' = [sql|
WITH input_rows(terms) AS (?)
......@@ -280,8 +292,10 @@ queryNgramsOccurrencesOnlyByNodeUser' = [sql|
|]
------------------------------------------------------------------------
getNodesByNgramsOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
getNodesByNgramsOnlyUser :: NodeId
-> [ListId]
-> NgramsType
-> [Text]
-> Cmd err (Map Text (Set NodeId))
getNodesByNgramsOnlyUser cId ls nt ngs =
Map.unionsWith (<>)
......@@ -305,12 +319,16 @@ getNgramsByNodeOnlyUser cId ls nt ngs =
(splitEvery 1000 ngs)
------------------------------------------------------------------------
selectNgramsOnlyByNodeUser :: CorpusId -> [ListId] -> NgramsType -> [Text]
selectNgramsOnlyByNodeUser :: CorpusId
-> [ListId]
-> NgramsType
-> [Text]
-> Cmd err [(Text, NodeId)]
selectNgramsOnlyByNodeUser cId ls nt tms =
runPGSQuery queryNgramsOnlyByNodeUser
( Values fields (DPS.Only <$> tms)
, Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
, Values [QualifiedIdentifier Nothing "int4"]
(DPS.Only <$> (map (\(NodeId n) -> n) ls))
, cId
, nodeTypeId NodeDocument
, ngramsTypeId nt
......@@ -336,13 +354,16 @@ queryNgramsOnlyByNodeUser = [sql|
|]
selectNgramsOnlyByNodeUser' :: CorpusId -> [ListId] -> NgramsType -> [Text]
-> Cmd err [(Text, Int)]
selectNgramsOnlyByNodeUser' :: CorpusId
-> [ListId]
-> NgramsType
-> [Text]
-> Cmd err [(Text, Int)]
selectNgramsOnlyByNodeUser' cId ls nt tms =
runPGSQuery queryNgramsOnlyByNodeUser
( Values fields (DPS.Only <$> tms)
, Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
, Values [QualifiedIdentifier Nothing "int4"]
(DPS.Only <$> (map (\(NodeId n) -> n) ls))
, cId
, nodeTypeId NodeDocument
, ngramsTypeId nt
......@@ -365,27 +386,34 @@ queryNgramsOnlyByNodeUser' = [sql|
|]
getNgramsByDocOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
-> Cmd err (Map Text (Set NodeId))
getNgramsByDocOnlyUser :: NodeId
-> [ListId]
-> NgramsType
-> [Text]
-> Cmd err (Map Text (Set NodeId))
getNgramsByDocOnlyUser cId ls nt ngs =
Map.unionsWith (<>)
. map (fromListWith (<>) . map (second Set.singleton))
<$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
selectNgramsOnlyByDocUser :: DocId -> [ListId] -> NgramsType -> [Text]
-> Cmd err [(Text, NodeId)]
selectNgramsOnlyByDocUser :: DocId
-> [ListId]
-> NgramsType
-> [Text]
-> Cmd err [(Text, NodeId)]
selectNgramsOnlyByDocUser dId ls nt tms =
runPGSQuery queryNgramsOnlyByDocUser
( Values fields (DPS.Only <$> tms)
, Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
, Values [QualifiedIdentifier Nothing "int4"]
(DPS.Only <$> (map (\(NodeId n) -> n) ls))
, dId
, ngramsTypeId nt
)
where
fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOnlyByDocUser :: DPS.Query
queryNgramsOnlyByDocUser = [sql|
WITH input_rows(terms) AS (?),
......@@ -408,8 +436,11 @@ getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
-- . takeWhile (\l -> List.length l > 3)
<$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
selectNgramsByNodeMaster :: Int -> UserCorpusId -> MasterCorpusId -> Int -> Cmd err [(NodeId, Text)]
selectNgramsByNodeMaster :: Int
-> UserCorpusId
-> MasterCorpusId
-> Int
-> Cmd err [(NodeId, Text)]
selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
queryNgramsByNodeMaster'
( ucId
......@@ -458,5 +489,3 @@ queryNgramsByNodeMaster' = [sql|
SELECT m.id, m.terms FROM nodesByNgramsMaster m
RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
|]
......@@ -25,7 +25,7 @@ import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mS
import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId)
import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
import Gargantext.Database.Admin.Utils (Cmd)
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude
......@@ -33,8 +33,8 @@ import Gargantext.Text.List.Learn (Model(..))
import Gargantext.Text.Metrics (takeScored)
import qualified Data.Char as Char
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
......@@ -44,11 +44,11 @@ data NgramsListBuilder = BuilderStepO { stemSize :: Int
}
| BuilderStep1 { withModel :: Model }
| BuilderStepN { withModel :: Model }
| Tficf { nlb_lang :: Lang
, nlb_group1 :: Int
, nlb_group2 :: Int
, nlb_stopSize :: StopSize
, nlb_userCorpusId :: UserCorpusId
| Tficf { nlb_lang :: Lang
, nlb_group1 :: Int
, nlb_group2 :: Int
, nlb_stopSize :: StopSize
, nlb_userCorpusId :: UserCorpusId
, nlb_masterCorpusId :: MasterCorpusId
}
......@@ -56,7 +56,12 @@ data NgramsListBuilder = BuilderStepO { stemSize :: Int
data StopSize = StopSize {unStopSize :: Int}
-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
buildNgramsLists :: Lang
-> Int
-> Int
-> StopSize
-> UserCorpusId
-> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsLists l n m s uCid mCid = do
ngTerms <- buildNgramsTermsList l n m s uCid mCid
......@@ -64,7 +69,9 @@ buildNgramsLists l n m s uCid mCid = do
pure $ Map.unions $ othersTerms <> [ngTerms]
buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
buildNgramsOthersList :: UserCorpusId
-> (Text -> Text)
-> NgramsType
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsOthersList uCid groupIt nt = do
ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
......@@ -83,12 +90,14 @@ buildNgramsOthersList uCid groupIt nt = do
)
]
--{-
{-
buildNgramsTermsList' :: UserCorpusId
-> (Text -> Text)
-> ((Text, (Set Text, Set NodeId)) -> Bool) -> Int -> Int
-> ((Text, (Set Text, Set NodeId)) -> Bool)
-> Int
-> Int
-> Cmd err (Map NgramsType [NgramsElement])
--}
buildNgramsTermsList' uCid groupIt stop gls is = do
ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
......@@ -117,21 +126,31 @@ buildNgramsTermsList' uCid groupIt stop gls is = do
<> map (\t -> (GraphTerm, toList' t)) m
pure $ Map.fromList [(NgramsTerms, ngs')]
-}
buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
buildNgramsTermsList :: Lang
-> Int
-> Int
-> StopSize
-> UserCorpusId
-> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList l n m s uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
candidates <- sortTficf <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m)
let
candidatesSize = 2000
a = 10
b = 10
candidatesHead = List.take candidatesSize candidates
candidatesTail = List.drop candidatesSize candidates
termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead)
<> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
let ngs = List.concat $ map toNgramsElement termList
ngs = List.concat $ map toNgramsElement termList
pure $ Map.fromList [(NgramsTerms, ngs)]
......@@ -167,7 +186,7 @@ toTermList a b stop ns = -- trace ("computing toTermList") $
where
xs = take a ns
ta = drop a ns
ys = take b ta
zs = drop b ta
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment