Some more refactorings, plus comments on code I didn't understand

parent 166f9c19
Pipeline #5855 passed with stages in 151 minutes and 22 seconds
@@ -65,7 +65,6 @@ common defaults
   build-depends:
       base >=4.7 && <5
-  optimization: 2
 common optimized
   ghc-options:
     -O2
@@ -817,7 +816,6 @@ executable gargantext-server
     , text ^>= 1.2.4.1
     , unordered-containers ^>= 0.2.16.0
     , vector ^>= 0.7.3
-  optimization: 2
 executable gargantext-upgrade
   import:
......
@@ -58,6 +58,11 @@ isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
 -}
 
+-- | Good value from users' requests and anthropological analysis
+goodMapListSize :: Int
+goodMapListSize = 350
+
 -- | TODO improve grouping functions of Authors, Sources, Institutes..
 buildNgramsLists :: ( HasNodeStory env err m
                     , HasNLPServer env
@@ -71,7 +76,7 @@ buildNgramsLists :: ( HasNodeStory env err m
                  -> GroupParams
                  -> m (Map NgramsType [NgramsElement])
 buildNgramsLists user uCid mCid mfslw gp = do
-  ngTerms     <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
+  ngTerms     <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize goodMapListSize)
   othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
                       [ (Authors , MapListSize 9, MaxListSize 1000)
                       , (Sources , MapListSize 9, MaxListSize 1000)
@@ -195,6 +200,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
         $ HashMap.filter (\g -> view gts'_score g > 1)
         $ view flc_scores groupedWithList
 
+      -- | Split candidateTerms into mono-terms and multi-terms.
       !(groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
 
       -- void $ panicTrace $ "groupedWithList: " <> show groupedWithList
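A minimal sketch of the mono/multi split described by the new comment, assuming `size` counts the words of a term (the real `NgramsTerm` keys and group scores are simplified to `Text` and `Double` here). Since not every unordered-containers version exports `partitionWithKey`, the sketch builds it from two `filterWithKey` passes:

```haskell
{-# LANGUAGE OverloadedStrings #-}
import           Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import           Data.Text (Text)
import qualified Data.Text as Text

-- Stand-in for the `size` used above: number of words in a term (assumption).
size :: Text -> Int
size = length . Text.words

-- Split a score map into mono-terms and multi-terms, mirroring
-- HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) above.
splitMonoMult :: HashMap Text Double -> (HashMap Text Double, HashMap Text Double)
splitMonoMult m = ( HashMap.filterWithKey (\t _ -> size t <  2) m
                  , HashMap.filterWithKey (\t _ -> size t >= 2) m )

main :: IO ()
main = print $ splitMonoMult $ HashMap.fromList
         [ ("cat", 3.0), ("neural network", 5.0), ("dog", 1.0) ]
-- mono: "cat", "dog"; multi: "neural network" (HashMap print order may vary)
```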
@@ -211,6 +217,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
       !monoSize = 0.4  :: Double
       !multSize = 1 - monoSize
 
+      -- | Splits the given hashmap into two pieces, based on score.
       splitAt' n' ns = both (HashMap.fromListWith (<>))
                      $ List.splitAt (round $ n' * listSizeGlobal)
                      $ List.sortOn (viewScore . snd)
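A simplified sketch of the same shape as `splitAt'`, assuming `listSizeGlobal` is a `Double` budget and dropping the `both (HashMap.fromListWith (<>))` rebuild so plain lists come out:

```haskell
import qualified Data.List as List

-- Sort scored terms ascending and cut at a fraction of the global budget
-- (names and the Double budget are assumptions based on the code above).
splitByScore :: Double              -- ^ fraction, e.g. monoSize = 0.4
             -> Double              -- ^ listSizeGlobal-style budget
             -> [(t, Double)]       -- ^ terms with their scores
             -> ([(t, Double)], [(t, Double)])
splitByScore frac budget = List.splitAt (round (frac * budget))
                         . List.sortOn snd

-- splitByScore 0.4 5 [("a",1.0),("b",9.0),("c",5.0)]
--   == ([("a",1.0),("c",5.0)], [("b",9.0)])
```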
@@ -254,8 +261,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
                     ]
         where
           mapStemNodeIds = HashMap.toList
-                         $ HashMap.map viewScores
-                         $ groupedTreeScores_SetNodeId
+                         $ HashMap.map viewScores groupedTreeScores_SetNodeId
     let
       -- computing scores
       mapScores f = HashMap.fromList
......
@@ -14,14 +14,15 @@ TFICF is a generalization of [TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93id
 -}
 
-module Gargantext.Core.Text.Metrics.TFICF ( TFICF
-                                          , TficfContext(..)
-                                          , Total(..)
-                                          , Count(..)
-                                          , tficf
-                                          , sortTficf
-                                          )
-  where
+module Gargantext.Core.Text.Metrics.TFICF
+  ( TFICF
+  , TficfContext(..)
+  , Total(..)
+  , Count(..)
+  , tficf
+  , sortTficf
+  )
+where
 
 import Data.List qualified as List
 import Data.Map.Strict (toList)
@@ -34,12 +35,19 @@ path = "[G.T.Metrics.TFICF]"
 
 type TFICF = Double
 
+-- https://www.researchgate.net/publication/221226686_TF-ICF_A_New_Term_Weighting_Scheme_for_Clustering_Dynamic_Data_Streams
+-- TficfSupra n m
+--   - m is the total number of documents in the corpus
+--   - n is the number of documents where the given term occurred more than once
+-- TficfInfra n m
+--   -
 data TficfContext n m = TficfInfra n m
                       | TficfSupra n m
   deriving (Show)
 
-data Total = Total {unTotal :: !Double}
-data Count = Count {unCount :: !Double}
+newtype Total = Total { unTotal :: Double }
+newtype Count = Count { unCount :: Double }
 
 tficf :: TficfContext Count Total
       -> TficfContext Count Total
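Turning `Total` and `Count` into newtypes keeps them zero-cost while the distinct types stop the two `Double`s from being swapped silently. A hedged usage sketch (the infra context presumably carries the analogous counts at the local level; that is exactly what the `TficfInfra` comment above still leaves open):

```haskell
-- Hypothetical numbers: a term seen 3 times against a local total of 100
-- (infra context), and 40 times against a global total of 10000 (supra).
localCtx, globalCtx :: TficfContext Count Total
localCtx  = TficfInfra (Count 3)  (Total 100)
globalCtx = TficfSupra (Count 40) (Total 10000)

-- tficf takes the infra context first, as its defining equation shows.
score :: TFICF
score = tficf localCtx globalCtx
```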
@@ -50,7 +58,11 @@ tficf (TficfInfra (Count ic) (Total it) )
   | otherwise = panicTrace
               $ "[ERR]"
               <> path
-              <> " Frequency impossible"
+              <> " Frequency impossible: "
+              <> "ic = " <> show ic
+              <> ", it = " <> show it
+              <> ", sc = " <> show sc
+              <> ", st = " <> show st
 tficf _ _ = panicTrace $ "[ERR]" <> path <> "Undefined for these contexts"
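The extended panic message now carries the offending values. A standalone sketch of the concatenation with hypothetical inputs (`sc`/`st` are bound by the `TficfSupra` pattern elided from the hunk above):

```haskell
-- Builds the same style of message as the panicTrace above.
errMsg :: Double -> Double -> Double -> Double -> String
errMsg ic it sc st =
  "[ERR][G.T.Metrics.TFICF] Frequency impossible: "
    <> "ic = " <> show ic
    <> ", it = " <> show it
    <> ", sc = " <> show sc
    <> ", st = " <> show st

-- errMsg 3 2 40 100
--   == "[ERR][G.T.Metrics.TFICF] Frequency impossible: ic = 3.0, it = 2.0, sc = 40.0, st = 100.0"
```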
......
@@ -114,7 +114,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat
 
 --------------------------------------------------------------------------
 addSpaces :: Text -> Text
-addSpaces = (Text.intercalate " ") . (Text.chunksOf 1)
+addSpaces = Text.unwords . (Text.chunksOf 1)
 
 --------------------------------------------------------------------------
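This change is behavior-preserving: `Text.unwords` joins with single spaces, exactly what `Text.intercalate " "` did. A quick check:

```haskell
{-# LANGUAGE OverloadedStrings #-}
import           Data.Text (Text)
import qualified Data.Text as Text

-- Interleave every character of the input with spaces.
addSpaces :: Text -> Text
addSpaces = Text.unwords . Text.chunksOf 1

-- addSpaces "abc"                              == "a b c"
-- Text.intercalate " " (Text.chunksOf 1 "abc") == "a b c"
```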
......
@@ -185,10 +185,13 @@ selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
             ( int
             , toDBid NodeDocument
             , cId
+            -- , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
             , DPS.In (unNgramsTerm <$> (List.take 10000 tms))
             , cId
             , toDBid nt
             )
+    -- where
+    --   fields = [QualifiedIdentifier Nothing "text"]
 
 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
@@ -211,6 +214,27 @@ queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
   GROUP BY cng.node_id, ir.terms
 |]
 
+-- queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
+-- queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
+--   WITH nodes_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
+--                         JOIN nodes_contexts nc ON c.id = nc.context_id
+--                         WHERE c.typename = ?
+--                         AND nc.node_id = ?),
+--        input_rows(terms) AS (?)
+--   SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
+--     JOIN ngrams ng ON cng.ngrams_id = ng.id
+--     JOIN input_rows ir ON ir.terms = ng.terms
+--     JOIN nodes_contexts nc ON nc.context_id = cng.context_id
+--     JOIN nodes_sample ns ON nc.context_id = ns.id
+--     WHERE nc.node_id = ?      -- CorpusId
+--       AND cng.ngrams_type = ? -- NgramsTypeId
+--       AND nc.category > 0
+--       -- AND nc.context_id IN (SELECT id FROM nodes_sample)
+--     GROUP BY cng.node_id, ng.terms
+--   |]
 
 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
                                                      => CorpusId
                                                      -> Int
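On the live (non-commented) variant: `DPS.In` is postgresql-simple's way to splice a Haskell list into a SQL `IN ?` clause, which is why the query above can take up to 10000 terms as a single parameter. A minimal standalone sketch (connection string and result shape are assumptions):

```haskell
{-# LANGUAGE OverloadedStrings #-}
import           Data.Text (Text)
import           Database.PostgreSQL.Simple (Only (..), connectPostgreSQL, query)
import           Database.PostgreSQL.Simple.Types (In (..))

-- `In ts` expands to "(?, ?, ...)" with one placeholder per list element,
-- so "terms IN ?" becomes "terms IN ('t1', 't2', ...)" after substitution.
fetchNgramsIds :: [Text] -> IO [(Int, Text)]
fetchNgramsIds ts = do
  conn <- connectPostgreSQL "dbname=gargantext"  -- hypothetical DSN
  query conn "SELECT id, terms FROM ngrams WHERE terms IN ?" (Only (In ts))
```

Note that `In []` would produce invalid SQL, so the caller should guard against an empty term list.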
......
@@ -70,6 +70,9 @@ getTficf_withSample cId mId nt = do
                      <$> getOccByNgramsOnlyFast_withSample mId countGlobal nt
                                                            (HM.keys mapTextDoubleLocal)
 
+  printDebug "[getTficf_withSample] mapTextDoubleLocal: " mapTextDoubleLocal
+  printDebug "[getTficf_withSample] mapTextDoubleGlobal: " mapTextDoubleGlobal
   --printDebug "getTficf_withSample" (mapTextDoubleLocal, mapTextDoubleGlobal, countLocal, countGlobal)
 
   pure $ HM.mapWithKey (\t n ->
           tficf (TficfInfra (Count n )
......
@@ -9,7 +9,6 @@ Portability : POSIX
 -}
 
-{-# LANGUAGE Arrows #-}
 {-# LANGUAGE ConstraintKinds, ScopedTypeVariables #-}
 {-# LANGUAGE LambdaCase #-}
@@ -210,9 +209,9 @@ fromField' field mb = do
     valueToHyperdata v = case fromJSON v of
       Success a  -> pure a
       Error _err -> returnError ConversionFailed field
-                  $ DL.intercalate " " [ "cannot parse hyperdata for JSON: "
-                                       , show v
-                                       ]
+                  $ DL.unwords [ "cannot parse hyperdata for JSON: "
+                               , show v
+                               ]
 
 printSqlOpa :: Default Unpackspec a a => Select a -> IO ()
 printSqlOpa = putStrLn . maybe "Empty query" identity . showSql
......
@@ -164,7 +164,7 @@ querySelectLems = [sql|
        AS (SELECT id, terms
            FROM ngrams
            WHERE terms IN ?)
-     , input_rows(lang_id, algo_id, terms,n)
+     , input_rows
        AS (SELECT ? as lang_id, ? as algo_id, terms, id
            FROM trms)
      , lems AS ( select ir.terms as t1, n2.terms as t2, sum(np.score) as score from input_rows ir
......
@@ -317,6 +317,15 @@ nodeContextsScore inputData = map (\(PGS.Only a) -> a)
 ------------------------------------------------------------------------
 
+-- | Counts the number of documents in a corpus.
+--   Also filters on category being at least 1 (i.e. not in trash):
+--     select count(*)
+--     from contexts c
+--     join nodes_contexts nc on c.id = nc.context_id
+--     where nc.node_id = 88
+--       and nc.category >= 1
+--       and c.typename = 4
 selectCountDocs :: HasDBid NodeType => CorpusId -> DBCmd err Int
 selectCountDocs cId = runCountOpaQuery (queryCountDocs cId)
   where
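The new comment documents the generated SQL with concrete values (corpus id 88, typename 4). For reference, the same count as a parameterized query, a sketch only, since the real code goes through Opaleye's `runCountOpaQuery`:

```haskell
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes       #-}
import           Database.PostgreSQL.Simple (Connection, Only (..), query)
import           Database.PostgreSQL.Simple.SqlQQ (sql)

-- Count non-trashed documents (category >= 1) of a given corpus;
-- 4 stands in for `toDBid NodeDocument` (assumption from the comment).
countDocs :: Connection -> Int -> IO Int
countDocs conn corpusId = do
  [Only n] <- query conn [sql|
      SELECT COUNT(*)
      FROM contexts c
      JOIN nodes_contexts nc ON c.id = nc.context_id
      WHERE nc.node_id  = ?
        AND nc.category >= 1
        AND c.typename  = ?
    |] (corpusId, 4 :: Int)
  pure n
```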
......