Some more refactorings; comments for code I didn't understand

parent 166f9c19
Pipeline #5855 passed with stages in 151 minutes and 22 seconds
@@ -65,7 +65,6 @@ common defaults
build-depends:
base >=4.7 && <5
optimization: 2
common optimized
ghc-options:
-O2
@@ -817,7 +816,6 @@ executable gargantext-server
, text ^>= 1.2.4.1
, unordered-containers ^>= 0.2.16.0
, vector ^>= 0.7.3
optimization: 2
executable gargantext-upgrade
import:
@@ -58,6 +58,11 @@ isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
-}
-- | Good value from users' requests and anthropological analysis
goodMapListSize :: Int
goodMapListSize = 350
-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: ( HasNodeStory env err m
, HasNLPServer env
@@ -71,7 +76,7 @@ buildNgramsLists :: ( HasNodeStory env err m
-> GroupParams
-> m (Map NgramsType [NgramsElement])
buildNgramsLists user uCid mCid mfslw gp = do
ngTerms <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
ngTerms <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize goodMapListSize)
othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
[ (Authors , MapListSize 9, MaxListSize 1000)
, (Sources , MapListSize 9, MaxListSize 1000)
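For orientation, the MapListSize / MaxListSize values above act as caps when ranked candidates are cut down to the final lists; a minimal sketch of that cap (takeMapList is hypothetical, not part of the codebase):

import Data.List (sortOn)
import Data.Ord (Down(..))
import Data.Text (Text)

-- Hypothetical sketch: keep only the k best-scoring candidates,
-- which is what a MapListSize of 350 (goodMapListSize) would bound.
takeMapList :: Int -> [(Text, Double)] -> [Text]
takeMapList k = map fst . take k . sortOn (Down . snd)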
@@ -195,6 +200,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
$ HashMap.filter (\g -> view gts'_score g > 1)
$ view flc_scores groupedWithList
-- | Split candidateTerms into mono-terms and multi-terms.
!(groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
-- void $ panicTrace $ "groupedWithList: " <> show groupedWithList
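As a reading aid: partitionWithKey sends entries satisfying the predicate to the first map, so single-word terms land in groupedMono (assuming the project's `size` counts words); a self-contained toy version:

import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import Data.Text (Text)
import qualified Data.Text as Text

-- Toy version of the mono/multi split above; Text.words stands in for
-- the project's `size`, assumed to count words.
partitionMono :: HashMap Text v -> (HashMap Text v, HashMap Text v)
partitionMono = HashMap.partitionWithKey (\t _ -> length (Text.words t) < 2)

-- ghci> partitionMono (HashMap.fromList [("cat", 1), ("wild cat", 2)])
-- (fromList [("cat",1)], fromList [("wild cat",2)])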
@@ -211,6 +217,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
!monoSize = 0.4 :: Double
!multSize = 1 - monoSize
-- | Splits given hashmap into 2 pieces, based on score
splitAt' n' ns = both (HashMap.fromListWith (<>))
$ List.splitAt (round $ n' * listSizeGlobal)
$ List.sortOn (viewScore . snd)
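The helper cuts the association list, sorted ascending by score, at a fraction of the target list size and rebuilds a map on each side; an approximation under assumed names (the original uses fromListWith (<>) because its values are semigroups):

import Data.Bifunctor (bimap)
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import qualified Data.List as List
import Data.Text (Text)

-- Approximation of splitAt': sort ascending by score, cut after
-- round (frac * total) elements, and collect each side into a map.
splitByScore :: Double -> Double
             -> [(Text, Double)]
             -> (HashMap Text Double, HashMap Text Double)
splitByScore frac total =
    bimap HashMap.fromList HashMap.fromList
  . List.splitAt (round (frac * total))
  . List.sortOn snd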
@@ -254,8 +261,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
]
where
mapStemNodeIds = HashMap.toList
$ HashMap.map viewScores
$ groupedTreeScores_SetNodeId
$ HashMap.map viewScores groupedTreeScores_SetNodeId
let
-- computing scores
mapScores f = HashMap.fromList
@@ -14,14 +14,15 @@ TFICF is a generalization of [TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)
-}
module Gargantext.Core.Text.Metrics.TFICF ( TFICF
, TficfContext(..)
, Total(..)
, Count(..)
, tficf
, sortTficf
)
where
module Gargantext.Core.Text.Metrics.TFICF
( TFICF
, TficfContext(..)
, Total(..)
, Count(..)
, tficf
, sortTficf
)
where
import Data.List qualified as List
import Data.Map.Strict (toList)
@@ -34,12 +35,19 @@ path = "[G.T.Metrics.TFICF]"
type TFICF = Double
-- https://www.researchgate.net/publication/221226686_TF-ICF_A_New_Term_Weighting_Scheme_for_Clustering_Dynamic_Data_Streams
-- TficfSupra n m
-- - m is the total number of documents in the corpus
-- - n is the number of documents in which the given term occurred more than once
-- TficfInfra n m
-- -
data TficfContext n m = TficfInfra n m
| TficfSupra n m
deriving (Show)
data Total = Total {unTotal :: !Double}
data Count = Count {unCount :: !Double}
newtype Total = Total { unTotal :: Double }
newtype Count = Count { unCount :: Double }
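Switching from data with a strict field to newtype removes the wrapper at runtime, which also makes conversions free via coerce; a small illustration (totals is hypothetical):

import Data.Coerce (coerce)

-- A newtype shares its runtime representation with the wrapped Double,
-- so a whole list can be converted without allocation:
totals :: [Double] -> [Total]
totals = coerce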
tficf :: TficfContext Count Total
-> TficfContext Count Total
@@ -50,7 +58,11 @@ tficf (TficfInfra (Count ic) (Total it) )
| otherwise = panicTrace
$ "[ERR]"
<> path
<> " Frequency impossible"
<> " Frequency impossible: "
<> "ic = " <> show ic
<> ", it = " <> show it
<> ", sc = " <> show sc
<> ", st = " <> show st
tficf _ _ = panicTrace $ "[ERR]" <> path <> " Undefined for these contexts"
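As a reading aid for the counts in the guards above, TF-ICF weighs a term's local frequency against its global spread; a toy formulation in the spirit of the linked paper (an assumption for illustration, not necessarily the exact formula used by tficf):

-- Toy TF-ICF: local term frequency times log-inverse global frequency.
-- ic / it come from the Infra (local) context,
-- sc / st come from the Supra (global) context.
tficfToy :: Double -> Double -> Double -> Double -> Double
tficfToy ic it sc st = (ic / it) * log (st / sc)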
@@ -114,7 +114,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat
--------------------------------------------------------------------------
addSpaces :: Text -> Text
addSpaces = (Text.intercalate " ") . (Text.chunksOf 1)
addSpaces = Text.unwords . (Text.chunksOf 1)
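Both versions interleave a single space between characters, since unwords is intercalate " "; for example:

-- ghci> addSpaces "abc"
-- "a b c"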
--------------------------------------------------------------------------
@@ -185,10 +185,13 @@ selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
( int
, toDBid NodeDocument
, cId
-- , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
, DPS.In (unNgramsTerm <$> (List.take 10000 tms))
, cId
, toDBid nt
)
-- where
-- fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
@@ -211,6 +214,27 @@ queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
GROUP BY cng.node_id, ir.terms
|]
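The switch from a VALUES join to DPS.In passes the (up to 10000) terms as a single IN (...) parameter; a minimal standalone sketch of that postgresql-simple pattern (connection string and query are hypothetical):

{-# LANGUAGE OverloadedStrings #-}

import Data.Text (Text)
import Database.PostgreSQL.Simple (Only (..), connectPostgreSQL, query)
import Database.PostgreSQL.Simple.Types (In (..))

-- `In xs` renders as a parenthesised list, e.g. IN ('brown fox','lazy dog').
example :: IO [Only Text]
example = do
  conn <- connectPostgreSQL "dbname=gargantext"  -- hypothetical DSN
  query conn "SELECT terms FROM ngrams WHERE terms IN ?"
        (Only (In ["brown fox" :: Text, "lazy dog"]))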
-- queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
-- queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
-- WITH nodes_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
-- JOIN nodes_contexts nc ON c.id = nc.context_id
-- WHERE c.typename = ?
-- AND nc.node_id = ?),
-- input_rows(terms) AS (?)
-- SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
-- JOIN ngrams ng ON cng.ngrams_id = ng.id
-- JOIN input_rows ir ON ir.terms = ng.terms
-- JOIN nodes_contexts nc ON nc.context_id = cng.context_id
-- JOIN nodes_sample ns ON nc.context_id = ns.id
-- WHERE nc.node_id = ? -- CorpusId
-- AND cng.ngrams_type = ? -- NgramsTypeId
-- AND nc.category > 0
-- -- AND nc.context_id IN (SELECT id FROM nodes_sample)
-- GROUP BY cng.node_id, ng.terms
-- |]
selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
=> CorpusId
-> Int
@@ -70,6 +70,9 @@ getTficf_withSample cId mId nt = do
<$> getOccByNgramsOnlyFast_withSample mId countGlobal nt
(HM.keys mapTextDoubleLocal)
printDebug "[getTficf_withSample] mapTextDoubleLocal: " mapTextDoubleLocal
printDebug "[getTficf_withSample] mapTextDoubleGlobal: " mapTextDoubleGlobal
--printDebug "getTficf_withSample" (mapTextDoubleLocal, mapTextDoubleGlobal, countLocal, countGlobal)
pure $ HM.mapWithKey (\t n ->
tficf (TficfInfra (Count n )
@@ -9,7 +9,6 @@ Portability : POSIX
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE ConstraintKinds, ScopedTypeVariables #-}
{-# LANGUAGE LambdaCase #-}
@@ -210,9 +209,9 @@ fromField' field mb = do
valueToHyperdata v = case fromJSON v of
Success a -> pure a
Error _err -> returnError ConversionFailed field
$ DL.intercalate " " [ "cannot parse hyperdata for JSON: "
, show v
]
$ DL.unwords [ "cannot parse hyperdata for JSON: "
, show v
]
printSqlOpa :: Default Unpackspec a a => Select a -> IO ()
printSqlOpa = putStrLn . maybe "Empty query" identity . showSql
@@ -164,7 +164,7 @@ querySelectLems = [sql|
AS (SELECT id, terms
FROM ngrams
WHERE terms IN ?)
, input_rows(lang_id, algo_id, terms,n)
, input_rows
AS (SELECT ? as lang_id, ? as algo_id, terms, id
FROM trms)
, lems AS ( select ir.terms as t1, n2.terms as t2, sum(np.score) as score from input_rows ir
@@ -317,6 +317,15 @@ nodeContextsScore inputData = map (\(PGS.Only a) -> a)
------------------------------------------------------------------------
-- | Counts the number of documents in a corpus.
-- Also applies a filter requiring category to be at least 1 (i.e. not in the trash).
-- select count(*)
-- from contexts c
-- join nodes_contexts nc on c.id = nc.context_id
-- where
-- nc.node_id = 88
-- and nc.category >= 1
-- and c.typename = 4
selectCountDocs :: HasDBid NodeType => CorpusId -> DBCmd err Int
selectCountDocs cId = runCountOpaQuery (queryCountDocs cId)
where