Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
956df688
Verified
Commit
956df688
authored
Apr 02, 2024
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
some more refactorings, comments to code i didn't understand
parent
166f9c19
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
73 additions
and
22 deletions
+73
-22
gargantext.cabal
gargantext.cabal
+0
-2
List.hs
src/Gargantext/Core/Text/List.hs
+9
-3
TFICF.hs
src/Gargantext/Core/Text/Metrics/TFICF.hs
+23
-11
WithList.hs
src/Gargantext/Core/Text/Terms/WithList.hs
+1
-1
NgramsByContext.hs
src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
+24
-0
TFICF.hs
src/Gargantext/Database/Action/Metrics/TFICF.hs
+3
-0
Prelude.hs
src/Gargantext/Database/Prelude.hs
+3
-4
NgramsPostag.hs
src/Gargantext/Database/Query/Table/NgramsPostag.hs
+1
-1
NodeContext.hs
src/Gargantext/Database/Query/Table/NodeContext.hs
+9
-0
No files found.
gargantext.cabal
View file @
956df688
...
...
@@ -65,7 +65,6 @@ common defaults
build-depends:
base >=4.7 && <5
optimization: 2
common optimized
ghc-options:
-O2
...
...
@@ -817,7 +816,6 @@ executable gargantext-server
, text ^>= 1.2.4.1
, unordered-containers ^>= 0.2.16.0
, vector ^>= 0.7.3
optimization: 2
executable gargantext-upgrade
import:
...
...
src/Gargantext/Core/Text/List.hs
View file @
956df688
...
...
@@ -58,6 +58,11 @@ isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
-}
-- | Good value from users' requests and anthropological analysis
goodMapListSize
::
Int
goodMapListSize
=
350
-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists
::
(
HasNodeStory
env
err
m
,
HasNLPServer
env
...
...
@@ -71,7 +76,7 @@ buildNgramsLists :: ( HasNodeStory env err m
->
GroupParams
->
m
(
Map
NgramsType
[
NgramsElement
])
buildNgramsLists
user
uCid
mCid
mfslw
gp
=
do
ngTerms
<-
buildNgramsTermsList
user
uCid
mCid
mfslw
gp
(
NgramsTerms
,
MapListSize
350
)
ngTerms
<-
buildNgramsTermsList
user
uCid
mCid
mfslw
gp
(
NgramsTerms
,
MapListSize
goodMapListSize
)
othersTerms
<-
mapM
(
buildNgramsOthersList
user
uCid
mfslw
GroupIdentity
)
[
(
Authors
,
MapListSize
9
,
MaxListSize
1000
)
,
(
Sources
,
MapListSize
9
,
MaxListSize
1000
)
...
...
@@ -195,6 +200,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
$
HashMap
.
filter
(
\
g
->
view
gts'_score
g
>
1
)
$
view
flc_scores
groupedWithList
-- | Split candidateTerms into mono-terms and multi-terms.
!
(
groupedMono
,
groupedMult
)
=
HashMap
.
partitionWithKey
(
\
(
NgramsTerm
t
)
_v
->
size
t
<
2
)
candidateTerms
-- void $ panicTrace $ "groupedWithList: " <> show groupedWithList
...
...
@@ -211,6 +217,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
!
monoSize
=
0.4
::
Double
!
multSize
=
1
-
monoSize
-- | Splits given hashmap into 2 pieces, based on score
splitAt'
n'
ns
=
both
(
HashMap
.
fromListWith
(
<>
))
$
List
.
splitAt
(
round
$
n'
*
listSizeGlobal
)
$
List
.
sortOn
(
viewScore
.
snd
)
...
...
@@ -254,8 +261,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
]
where
mapStemNodeIds
=
HashMap
.
toList
$
HashMap
.
map
viewScores
$
groupedTreeScores_SetNodeId
$
HashMap
.
map
viewScores
groupedTreeScores_SetNodeId
let
-- computing scores
mapScores
f
=
HashMap
.
fromList
...
...
src/Gargantext/Core/Text/Metrics/TFICF.hs
View file @
956df688
...
...
@@ -14,14 +14,15 @@ TFICF is a generalization of [TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93id
-}
module
Gargantext.Core.Text.Metrics.TFICF
(
TFICF
,
TficfContext
(
..
)
,
Total
(
..
)
,
Count
(
..
)
,
tficf
,
sortTficf
)
where
module
Gargantext.Core.Text.Metrics.TFICF
(
TFICF
,
TficfContext
(
..
)
,
Total
(
..
)
,
Count
(
..
)
,
tficf
,
sortTficf
)
where
import
Data.List
qualified
as
List
import
Data.Map.Strict
(
toList
)
...
...
@@ -34,12 +35,19 @@ path = "[G.T.Metrics.TFICF]"
type
TFICF
=
Double
-- https://www.researchgate.net/publication/221226686_TF-ICF_A_New_Term_Weighting_Scheme_for_Clustering_Dynamic_Data_Streams
-- TficfSupra n m
-- - m is the total number of documents in the corpus
-- - n is the number of documents in which the given term occurred more than once
-- TficfInfra n m
-- -
data
TficfContext
n
m
=
TficfInfra
n
m
|
TficfSupra
n
m
deriving
(
Show
)
data
Total
=
Total
{
unTotal
::
!
Double
}
data
Count
=
Count
{
unCount
::
!
Double
}
newtype
Total
=
Total
{
unTotal
::
Double
}
newtype
Count
=
Count
{
unCount
::
Double
}
tficf
::
TficfContext
Count
Total
->
TficfContext
Count
Total
...
...
@@ -50,7 +58,11 @@ tficf (TficfInfra (Count ic) (Total it) )
|
otherwise
=
panicTrace
$
"[ERR]"
<>
path
<>
" Frequency impossible"
<>
" Frequency impossible: "
<>
"ic = "
<>
show
ic
<>
", it = "
<>
show
it
<>
", sc = "
<>
show
sc
<>
", st = "
<>
show
st
tficf
_
_
=
panicTrace
$
"[ERR]"
<>
path
<>
"Undefined for these contexts"
...
...
src/Gargantext/Core/Text/Terms/WithList.hs
View file @
956df688
...
...
@@ -114,7 +114,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat
--------------------------------------------------------------------------
addSpaces
::
Text
->
Text
addSpaces
=
(
Text
.
intercalate
" "
)
.
(
Text
.
chunksOf
1
)
addSpaces
=
Text
.
unwords
.
(
Text
.
chunksOf
1
)
--------------------------------------------------------------------------
...
...
src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
View file @
956df688
...
...
@@ -185,10 +185,13 @@ selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
(
int
,
toDBid
NodeDocument
,
cId
-- , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
,
DPS
.
In
(
unNgramsTerm
<$>
(
List
.
take
10000
tms
))
,
cId
,
toDBid
nt
)
-- where
-- fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOccurrencesOnlyByContextUser_withSample
::
DPS
.
Query
queryNgramsOccurrencesOnlyByContextUser_withSample
=
[
sql
|
...
...
@@ -211,6 +214,27 @@ queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
GROUP BY cng.node_id, ir.terms
|]
-- queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
-- queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
-- WITH nodes_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
-- JOIN nodes_contexts nc ON c.id = nc.context_id
-- WHERE c.typename = ?
-- AND nc.node_id = ?),
-- input_rows(terms) AS (?)
-- SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
-- JOIN ngrams ng ON cng.ngrams_id = ng.id
-- JOIN input_rows ir ON ir.terms = ng.terms
-- JOIN nodes_contexts nc ON nc.context_id = cng.context_id
-- JOIN nodes_sample ns ON nc.context_id = ns.id
-- WHERE nc.node_id = ? -- CorpusId
-- AND cng.ngrams_type = ? -- NgramsTypeId
-- AND nc.category > 0
-- -- AND nc.context_id IN (SELECT id FROM nodes_sample)
-- GROUP BY cng.node_id, ng.terms
-- |]
selectNgramsOccurrencesOnlyByContextUser_withSample'
::
HasDBid
NodeType
=>
CorpusId
->
Int
...
...
src/Gargantext/Database/Action/Metrics/TFICF.hs
View file @
956df688
...
...
@@ -70,6 +70,9 @@ getTficf_withSample cId mId nt = do
<$>
getOccByNgramsOnlyFast_withSample
mId
countGlobal
nt
(
HM
.
keys
mapTextDoubleLocal
)
printDebug
"[getTficf_withSample] mapTextDoubleLocal: "
mapTextDoubleLocal
printDebug
"[getTficf_withSample] mapTextDoubleGlobal: "
mapTextDoubleGlobal
--printDebug "getTficf_withSample" (mapTextDoubleLocal, mapTextDoubleGlobal, countLocal, countGlobal)
pure
$
HM
.
mapWithKey
(
\
t
n
->
tficf
(
TficfInfra
(
Count
n
)
...
...
src/Gargantext/Database/Prelude.hs
View file @
956df688
...
...
@@ -9,7 +9,6 @@ Portability : POSIX
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE ConstraintKinds, ScopedTypeVariables #-}
{-# LANGUAGE LambdaCase #-}
...
...
@@ -210,9 +209,9 @@ fromField' field mb = do
valueToHyperdata
v
=
case
fromJSON
v
of
Success
a
->
pure
a
Error
_err
->
returnError
ConversionFailed
field
$
DL
.
intercalate
" "
[
"cannot parse hyperdata for JSON: "
,
show
v
]
$
DL
.
unwords
[
"cannot parse hyperdata for JSON: "
,
show
v
]
printSqlOpa
::
Default
Unpackspec
a
a
=>
Select
a
->
IO
()
printSqlOpa
=
putStrLn
.
maybe
"Empty query"
identity
.
showSql
...
...
src/Gargantext/Database/Query/Table/NgramsPostag.hs
View file @
956df688
...
...
@@ -164,7 +164,7 @@ querySelectLems = [sql|
AS (SELECT id, terms
FROM ngrams
WHERE terms IN ?)
, input_rows
(lang_id, algo_id, terms,n)
, input_rows
AS (SELECT ? as lang_id, ? as algo_id, terms, id
FROM trms)
, lems AS ( select ir.terms as t1, n2.terms as t2, sum(np.score) as score from input_rows ir
...
...
src/Gargantext/Database/Query/Table/NodeContext.hs
View file @
956df688
...
...
@@ -317,6 +317,15 @@ nodeContextsScore inputData = map (\(PGS.Only a) -> a)
------------------------------------------------------------------------
-- | Counts the number of documents in a corpus.
-- Also applies filter for category to be at least 1 (i.e. not in trash).
-- select count(*)
-- from contexts c
-- join nodes_contexts nc on c.id = nc.context_id
-- where
-- nc.node_id = 88
-- and nc.category >= 1
-- and c.typename = 4
selectCountDocs
::
HasDBid
NodeType
=>
CorpusId
->
DBCmd
err
Int
selectCountDocs
cId
=
runCountOpaQuery
(
queryCountDocs
cId
)
where
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment