Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
049dc862
Commit
049dc862
authored
May 19, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
remove sql IN from global ranking score
parent
389583e9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
73 additions
and
54 deletions
+73
-54
metric_tfidf.py
gargantext/util/toolchain/metric_tfidf.py
+73
-54
No files found.
gargantext/util/toolchain/metric_tfidf.py
View file @
049dc862
...
...
@@ -9,9 +9,10 @@ FIXME: "having the same source" means we need to select inside hyperdata
"""
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNodeNgram
,
NodeNgramNgram
from
gargantext.util.db
import
session
,
bulk_insert
,
func
# = sqlalchemy.func like sum() or count()
from
sqlalchemy
import
text
# for query from raw SQL statement
from
gargantext.util.db
import
session
,
bulk_insert
,
aliased
,
\
func
# = sqlalchemy.func like sum() or count()
from
sqlalchemy.sql.expression
import
case
# for choice if ngram has mainform or not
from
sqlalchemy
import
distinct
# for list of unique ngram_ids within a corpus
from
math
import
log
# £TODO
# from gargantext.util.lists import WeightedContextIndex
...
...
@@ -140,7 +141,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
"""
# TODO check if cumulated tfs correspond to app's use cases and intention
Calculates tfidf ranking (cumulated tfidf
) within the
given scope
Calculates tfidf ranking (cumulated tfidf
for each ngram) within
given scope
Parameters:
- the corpus itself
...
...
@@ -148,92 +149,110 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
- local <=> frequencies counted in the current corpus
- global <=> frequencies counted in all corpora of this type
when the count_scope is global, there is another parameter:
- termset_scope: {"local" or "global"}
- local <=> output list of terms limited to the current corpus
(SELECT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
(SELECT
DISTINCT
ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
- global <=> output list of terms from all corpora of this type
!!!! (more terms)
!!!! (m
any m
ore terms)
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
corpus_docids_subquery
=
(
session
.
query
(
Node
.
id
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
subquery
()
# MAIN QUERY SKELETON
tf_nd_query
=
(
session
.
query
(
NodeNgram
.
ngram_id
,
func
.
sum
(
NodeNgram
.
weight
),
# tf: same as occurrences
# -----------------------
func
.
count
(
NodeNgram
.
node_id
)
# nd: n docs with term
# --------------------
)
.
group_by
(
NodeNgram
.
ngram_id
)
# optional *count_scope*: if we'll restrict the doc nodes
# -------------
# .join(countdocs_subquery,
# countdocs_subquery.c.id == NodeNgram.node_id)
# optional *termset_scope*: if we'll restrict the ngrams
# ---------------
# .join(termset_subquery,
# termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
)
# validate string params
if
count_scope
not
in
[
"local"
,
"global"
]:
raise
ValueError
(
"compute_ti_ranking: count_scope param allowed values: 'local', 'global'"
)
if
termset_scope
not
in
[
"local"
,
"global"
]:
raise
ValueError
(
"compute_ti_ranking: termset_scope param allowed values: 'local', 'global'"
)
if
count_scope
==
"local"
and
termset_scope
==
"global"
:
raise
ValueError
(
"compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too."
)
# local <=> within this corpus
if
count_scope
==
"local"
:
# All docs of this corpus
count_scope_subquery
=
corpus_docids_subquery
termset_scope_subquery
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
.
in_
(
corpus_docids_subquery
))
countdocs_subquery
=
(
session
.
query
(
Node
.
id
)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
.
subquery
()
)
# both scopes are the same: no need to independantly restrict the ngrams
tf_nd_query
=
tf_nd_query
.
join
(
countdocs_subquery
,
countdocs_subquery
.
c
.
id
==
NodeNgram
.
node_id
)
# global <=> within all corpora of this source
elif
count_scope
==
"global"
:
this_source_type
=
corpus
.
resources
()[
0
][
'type'
]
# all corpora with the same source type
# (we need raw SQL query for postgres JSON operators) (TODO test speed)
same_source_corpora_query
=
(
session
.
query
(
Node
.
id
)
.
from_statement
(
text
(
"""
SELECT id FROM nodes
WHERE hyperdata->'resources' @> '[{
\"
type
\"
\
:
%
s}]'
"""
%
this_source_type
))
)
CorpusNode
=
aliased
(
Node
)
# All docs **in all corpora of the same source**
ressource_docid
s_subquery
=
(
session
countdoc
s_subquery
=
(
session
.
query
(
Node
.
id
)
.
filter
(
Node
.
parent_id
.
in_
(
same_source_corpora_query
))
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
# join on parent_id with selected corpora nodes
.
join
(
CorpusNode
,
CorpusNode
.
id
==
Node
.
parent_id
)
.
filter
(
CorpusNode
.
typename
==
"CORPUS"
)
.
filter
(
CorpusNode
.
hyperdata
[
'resources'
][
0
][
'type'
]
.
astext
==
str
(
this_source_type
))
.
subquery
()
)
count_scope_subquery
=
ressource_docids_subquery
if
termset_scope
==
"global"
:
termset_scope_subquery
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
.
in_
(
ressource_docids_subquery
))
.
subquery
()
)
else
:
termset_scope_subquery
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
.
in_
(
corpus_docids_subquery
))
# both scopes are the same: no need to independantly restrict the ngrams
tf_nd_query
=
tf_nd_query
.
join
(
countdocs_subquery
,
countdocs_subquery
.
c
.
id
==
NodeNgram
.
node_id
)
elif
termset_scope
==
"local"
:
# All unique terms in the original corpus
termset_subquery
=
(
session
.
query
(
distinct
(
NodeNgram
.
ngram_id
)
.
label
(
"uniq_ngid"
))
.
join
(
Node
)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
.
subquery
()
)
# only case of independant restrictions on docs and terms
tf_nd_query
=
(
tf_nd_query
.
join
(
countdocs_subquery
,
countdocs_subquery
.
c
.
id
==
NodeNgram
.
node_id
)
.
join
(
termset_subquery
,
termset_subquery
.
c
.
uniq_ngid
==
NodeNgram
.
ngram_id
)
)
# N
total_docs
=
session
.
query
(
ressource_docid
s_subquery
)
.
count
()
total_docs
=
session
.
query
(
countdoc
s_subquery
)
.
count
()
# nb: possible to do the occurrences right now at the same time
tf_nd
=
(
session
.
query
(
NodeNgram
.
ngram_id
,
func
.
sum
(
NodeNgram
.
weight
),
# tf: same as occnode
func
.
count
(
NodeNgram
.
node_id
)
# nd: n docs with term
)
.
filter
(
NodeNgram
.
node_id
.
in_
(
count_scope_subquery
))
.
filter
(
NodeNgram
.
ngram_id
.
in_
(
termset_scope_subquery
))
.
group_by
(
NodeNgram
.
ngram_id
)
.
all
()
)
# result
tf_nd
=
tf_nd_query
.
all
()
# -------------------------------------------------
tfidfs
=
{}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment