Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
195
Issues
195
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
956df688
Verified
Commit
956df688
authored
Apr 02, 2024
by
Przemyslaw Kaminski
1
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Some more refactorings; added comments to code I didn't understand
parent
166f9c19
Pipeline
#5855
passed with stages
in 151 minutes and 22 seconds
Changes
9
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
73 additions
and
22 deletions
+73
-22
gargantext.cabal
gargantext.cabal
+0
-2
List.hs
src/Gargantext/Core/Text/List.hs
+9
-3
TFICF.hs
src/Gargantext/Core/Text/Metrics/TFICF.hs
+23
-11
WithList.hs
src/Gargantext/Core/Text/Terms/WithList.hs
+1
-1
NgramsByContext.hs
src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
+24
-0
TFICF.hs
src/Gargantext/Database/Action/Metrics/TFICF.hs
+3
-0
Prelude.hs
src/Gargantext/Database/Prelude.hs
+3
-4
NgramsPostag.hs
src/Gargantext/Database/Query/Table/NgramsPostag.hs
+1
-1
NodeContext.hs
src/Gargantext/Database/Query/Table/NodeContext.hs
+9
-0
No files found.
gargantext.cabal
View file @
956df688
...
...
@@ -65,7 +65,6 @@ common defaults
build-depends:
base >=4.7 && <5
optimization: 2
common optimized
ghc-options:
-O2
...
...
@@ -817,7 +816,6 @@ executable gargantext-server
, text ^>= 1.2.4.1
, unordered-containers ^>= 0.2.16.0
, vector ^>= 0.7.3
optimization: 2
executable gargantext-upgrade
import:
...
...
src/Gargantext/Core/Text/List.hs
View file @
956df688
...
...
@@ -58,6 +58,11 @@ isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
-}
-- | Good value from users' requests and anthropological analysis
goodMapListSize
::
Int
goodMapListSize
=
350
-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists
::
(
HasNodeStory
env
err
m
,
HasNLPServer
env
...
...
@@ -71,7 +76,7 @@ buildNgramsLists :: ( HasNodeStory env err m
->
GroupParams
->
m
(
Map
NgramsType
[
NgramsElement
])
buildNgramsLists
user
uCid
mCid
mfslw
gp
=
do
ngTerms
<-
buildNgramsTermsList
user
uCid
mCid
mfslw
gp
(
NgramsTerms
,
MapListSize
350
)
ngTerms
<-
buildNgramsTermsList
user
uCid
mCid
mfslw
gp
(
NgramsTerms
,
MapListSize
goodMapListSize
)
othersTerms
<-
mapM
(
buildNgramsOthersList
user
uCid
mfslw
GroupIdentity
)
[
(
Authors
,
MapListSize
9
,
MaxListSize
1000
)
,
(
Sources
,
MapListSize
9
,
MaxListSize
1000
)
...
...
@@ -195,6 +200,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
$
HashMap
.
filter
(
\
g
->
view
gts'_score
g
>
1
)
$
view
flc_scores
groupedWithList
-- | Split candidateTerms into mono-terms and multi-terms.
!
(
groupedMono
,
groupedMult
)
=
HashMap
.
partitionWithKey
(
\
(
NgramsTerm
t
)
_v
->
size
t
<
2
)
candidateTerms
-- void $ panicTrace $ "groupedWithList: " <> show groupedWithList
...
...
@@ -211,6 +217,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
!
monoSize
=
0.4
::
Double
!
multSize
=
1
-
monoSize
-- | Splits given hashmap into 2 pieces, based on score
splitAt'
n'
ns
=
both
(
HashMap
.
fromListWith
(
<>
))
$
List
.
splitAt
(
round
$
n'
*
listSizeGlobal
)
$
List
.
sortOn
(
viewScore
.
snd
)
...
...
@@ -254,8 +261,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
]
where
mapStemNodeIds
=
HashMap
.
toList
$
HashMap
.
map
viewScores
$
groupedTreeScores_SetNodeId
$
HashMap
.
map
viewScores
groupedTreeScores_SetNodeId
let
-- computing scores
mapScores
f
=
HashMap
.
fromList
...
...
src/Gargantext/Core/Text/Metrics/TFICF.hs
View file @
956df688
...
...
@@ -14,14 +14,15 @@ TFICF is a generalization of [TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93id
-}
module
Gargantext.Core.Text.Metrics.TFICF
(
TFICF
module
Gargantext.Core.Text.Metrics.TFICF
(
TFICF
,
TficfContext
(
..
)
,
Total
(
..
)
,
Count
(
..
)
,
tficf
,
sortTficf
)
where
where
import
Data.List
qualified
as
List
import
Data.Map.Strict
(
toList
)
...
...
@@ -34,12 +35,19 @@ path = "[G.T.Metrics.TFICF]"
type
TFICF
=
Double
-- https://www.researchgate.net/publication/221226686_TF-ICF_A_New_Term_Weighting_Scheme_for_Clustering_Dynamic_Data_Streams
-- TficfSupra n m
-- - m is the total number of documents in the corpus
-- - n is the number of documents, where given term occured more than once
-- TficfInfra n m
-- -
data
TficfContext
n
m
=
TficfInfra
n
m
|
TficfSupra
n
m
deriving
(
Show
)
data
Total
=
Total
{
unTotal
::
!
Double
}
data
Count
=
Count
{
unCount
::
!
Double
}
newtype
Total
=
Total
{
unTotal
::
Double
}
newtype
Count
=
Count
{
unCount
::
Double
}
tficf
::
TficfContext
Count
Total
->
TficfContext
Count
Total
...
...
@@ -50,7 +58,11 @@ tficf (TficfInfra (Count ic) (Total it) )
|
otherwise
=
panicTrace
$
"[ERR]"
<>
path
<>
" Frequency impossible"
<>
" Frequency impossible: "
<>
"ic = "
<>
show
ic
<>
", it = "
<>
show
it
<>
", sc = "
<>
show
sc
<>
", st = "
<>
show
st
tficf
_
_
=
panicTrace
$
"[ERR]"
<>
path
<>
"Undefined for these contexts"
...
...
src/Gargantext/Core/Text/Terms/WithList.hs
View file @
956df688
...
...
@@ -114,7 +114,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat
--------------------------------------------------------------------------
addSpaces
::
Text
->
Text
addSpaces
=
(
Text
.
intercalate
" "
)
.
(
Text
.
chunksOf
1
)
addSpaces
=
Text
.
unwords
.
(
Text
.
chunksOf
1
)
--------------------------------------------------------------------------
...
...
src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
View file @
956df688
...
...
@@ -185,10 +185,13 @@ selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
(
int
,
toDBid
NodeDocument
,
cId
-- , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
,
DPS
.
In
(
unNgramsTerm
<$>
(
List
.
take
10000
tms
))
,
cId
,
toDBid
nt
)
-- where
-- fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOccurrencesOnlyByContextUser_withSample
::
DPS
.
Query
queryNgramsOccurrencesOnlyByContextUser_withSample
=
[
sql
|
...
...
@@ -211,6 +214,27 @@ queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
GROUP BY cng.node_id, ir.terms
|]
-- queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
-- queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
-- WITH nodes_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
-- JOIN nodes_contexts nc ON c.id = nc.context_id
-- WHERE c.typename = ?
-- AND nc.node_id = ?),
-- input_rows(terms) AS (?)
-- SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
-- JOIN ngrams ng ON cng.ngrams_id = ng.id
-- JOIN input_rows ir ON ir.terms = ng.terms
-- JOIN nodes_contexts nc ON nc.context_id = cng.context_id
-- JOIN nodes_sample ns ON nc.context_id = ns.id
-- WHERE nc.node_id = ? -- CorpusId
-- AND cng.ngrams_type = ? -- NgramsTypeId
-- AND nc.category > 0
-- -- AND nc.context_id IN (SELECT id FROM nodes_sample)
-- GROUP BY cng.node_id, ng.terms
-- |]
selectNgramsOccurrencesOnlyByContextUser_withSample'
::
HasDBid
NodeType
=>
CorpusId
->
Int
...
...
src/Gargantext/Database/Action/Metrics/TFICF.hs
View file @
956df688
...
...
@@ -70,6 +70,9 @@ getTficf_withSample cId mId nt = do
<$>
getOccByNgramsOnlyFast_withSample
mId
countGlobal
nt
(
HM
.
keys
mapTextDoubleLocal
)
printDebug
"[getTficf_withSample] mapTextDoubleLocal: "
mapTextDoubleLocal
printDebug
"[getTficf_withSample] mapTextDoubleGlobal: "
mapTextDoubleGlobal
--printDebug "getTficf_withSample" (mapTextDoubleLocal, mapTextDoubleGlobal, countLocal, countGlobal)
pure
$
HM
.
mapWithKey
(
\
t
n
->
tficf
(
TficfInfra
(
Count
n
)
...
...
src/Gargantext/Database/Prelude.hs
View file @
956df688
...
...
@@ -9,7 +9,6 @@ Portability : POSIX
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE ConstraintKinds, ScopedTypeVariables #-}
{-# LANGUAGE LambdaCase #-}
...
...
@@ -210,7 +209,7 @@ fromField' field mb = do
valueToHyperdata
v
=
case
fromJSON
v
of
Success
a
->
pure
a
Error
_err
->
returnError
ConversionFailed
field
$
DL
.
intercalate
" "
[
"cannot parse hyperdata for JSON: "
$
DL
.
unwords
[
"cannot parse hyperdata for JSON: "
,
show
v
]
...
...
src/Gargantext/Database/Query/Table/NgramsPostag.hs
View file @
956df688
...
...
@@ -164,7 +164,7 @@ querySelectLems = [sql|
AS (SELECT id, terms
FROM ngrams
WHERE terms IN ?)
, input_rows
(lang_id, algo_id, terms,n)
, input_rows
AS (SELECT ? as lang_id, ? as algo_id, terms, id
FROM trms)
, lems AS ( select ir.terms as t1, n2.terms as t2, sum(np.score) as score from input_rows ir
...
...
src/Gargantext/Database/Query/Table/NodeContext.hs
View file @
956df688
...
...
@@ -317,6 +317,15 @@ nodeContextsScore inputData = map (\(PGS.Only a) -> a)
------------------------------------------------------------------------
-- | Counts the number of documents in a corpus.
-- Also applies filter for category to be at least 1 (i.e. not in trash).
-- select count(*)
-- from contexts c
-- join nodes_contexts nc on c.id = nc.context_id
-- where
-- nc.node_id = 88
-- and nc.category >= 1
-- and c.typename = 4
selectCountDocs
::
HasDBid
NodeType
=>
CorpusId
->
DBCmd
err
Int
selectCountDocs
cId
=
runCountOpaQuery
(
queryCountDocs
cId
)
where
...
...
Przemyslaw Kaminski
@cgenie
mentioned in commit
5660aec0
·
Oct 08, 2024
mentioned in commit
5660aec0
mentioned in commit 5660aec07ec5a0a0a5468f440092c1a8f57a864e
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment