Commit 049dc862 authored by Romain Loth

remove sql IN from global ranking score

parent 389583e9
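The change described by the commit title amounts to replacing `WHERE node_id IN (subquery)` filters with explicit joins on the same subqueries when building the ranking query. A minimal sketch of that pattern, reusing the models and helpers imported in this file (`docs_sq`, `old_q`, `new_q` are illustrative names; the query bodies are simplified, not copied verbatim from the diff):

    # before: restrict NodeNgram rows to the corpus docs with an SQL IN over a subquery
    docs_sq = (session.query(Node.id)
                      .filter(Node.parent_id == corpus.id)
                      .filter(Node.typename == "DOCUMENT")
                      .subquery())
    old_q = (session.query(NodeNgram.ngram_id,
                           func.sum(NodeNgram.weight),        # tf
                           func.count(NodeNgram.node_id))     # nd
                    .filter(NodeNgram.node_id.in_(docs_sq))   # <- SQL IN (subquery)
                    .group_by(NodeNgram.ngram_id))

    # after: express the same restriction as a JOIN on the subquery
    new_q = (session.query(NodeNgram.ngram_id,
                           func.sum(NodeNgram.weight),
                           func.count(NodeNgram.node_id))
                    .join(docs_sq, docs_sq.c.id == NodeNgram.node_id)
                    .group_by(NodeNgram.ngram_id))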
@@ -9,9 +9,10 @@ FIXME: "having the same source" means we need to select inside hyperdata
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
from sqlalchemy import text # for query from raw SQL statement
from gargantext.util.db import session, bulk_insert, aliased, \
func # = sqlalchemy.func like sum() or count()
from sqlalchemy.sql.expression import case # to choose whether the ngram has a mainform or not
from sqlalchemy import distinct # for list of unique ngram_ids within a corpus
from math import log
# £TODO
# from gargantext.util.lists import WeightedContextIndex
@@ -140,7 +141,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
"""
# TODO check if cumulated tfs correspond to app's use cases and intention
Calculates tfidf ranking (cumulated tfidf) within the given scope
Calculates tfidf ranking (cumulated tfidf for each ngram) within the given scope
Parameters:
- the corpus itself
@@ -148,92 +149,110 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
- local <=> frequencies counted in the current corpus
- global <=> frequencies counted in all corpora of this type
when the count_scope is global, there is another parameter:
- termset_scope: {"local" or "global"}
- local <=> output list of terms limited to the current corpus
(SELECT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
(SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
- global <=> output list of terms from all corpora of this type
!!!! (more terms)
!!!! (many more terms)
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
corpus_docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
# MAIN QUERY SKELETON
tf_nd_query = (session
.query(
NodeNgram.ngram_id,
func.sum(NodeNgram.weight), # tf: same as occurrences
# -----------------------
func.count(NodeNgram.node_id) # nd: n docs with term
# --------------------
)
.group_by(NodeNgram.ngram_id)
# optional *count_scope*: in case we restrict the doc nodes
# -------------
# .join(countdocs_subquery,
# countdocs_subquery.c.id == NodeNgram.node_id)
# optional *termset_scope*: in case we restrict the ngrams
# ---------------
# .join(termset_subquery,
# termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
)
# validate string params
if count_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
if termset_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
if count_scope == "local" and termset_scope == "global":
raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
# local <=> within this corpus
if count_scope == "local":
# All docs of this corpus
count_scope_subquery = corpus_docids_subquery
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(corpus_docids_subquery))
countdocs_subquery = (session
.query(Node.id)
.filter(Node.typename == "DOCUMENT")
.filter(Node.parent_id == corpus.id)
.subquery()
)
# both scopes are the same: no need to independently restrict the ngrams
tf_nd_query = tf_nd_query.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
# global <=> within all corpora of this source
elif count_scope == "global":
this_source_type = corpus.resources()[0]['type']
# all corpora with the same source type
# (we need raw SQL query for postgres JSON operators) (TODO test speed)
same_source_corpora_query = (session
.query(Node.id)
.from_statement(text(
"""
SELECT id FROM nodes
WHERE hyperdata->'resources' @> '[{\"type\"\:%s}]'
""" % this_source_type
))
)
CorpusNode = aliased(Node)
# All docs **in all corpora of the same source**
ressource_docids_subquery = (session
countdocs_subquery = (session
.query(Node.id)
.filter(Node.parent_id.in_(same_source_corpora_query))
.filter(Node.typename == "DOCUMENT")
# join on parent_id with selected corpora nodes
.join(CorpusNode, CorpusNode.id == Node.parent_id)
.filter(CorpusNode.typename == "CORPUS")
.filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(this_source_type))
.subquery()
)
count_scope_subquery = ressource_docids_subquery
if termset_scope == "global":
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(ressource_docids_subquery))
.subquery()
)
else:
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(corpus_docids_subquery))
# both scopes are the same: no need to independently restrict the ngrams
tf_nd_query = tf_nd_query.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
elif termset_scope == "local":
# All unique terms in the original corpus
termset_subquery = (session
.query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
.join(Node)
.filter(Node.typename == "DOCUMENT")
.filter(Node.parent_id == corpus.id)
.subquery()
)
# only case of independent restrictions on docs and terms
tf_nd_query = (tf_nd_query
.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
.join(termset_subquery,
termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
)
# N
total_docs = session.query(ressource_docids_subquery).count()
total_docs = session.query(countdocs_subquery).count()
# nb: possible to do the occurrences right now at the same time
tf_nd = (session
.query(
NodeNgram.ngram_id,
func.sum(NodeNgram.weight), # tf: same as occnode
func.count(NodeNgram.node_id) # nd: n docs with term
)
.filter(NodeNgram.node_id.in_(count_scope_subquery))
.filter(NodeNgram.ngram_id.in_(termset_scope_subquery))
.group_by(NodeNgram.ngram_id)
.all()
)
# result
tf_nd = tf_nd_query.all()
# -------------------------------------------------
tfidfs = {}
......
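The hunk is collapsed right after `tfidfs = {}`, before the score computation itself. Purely for orientation, a hypothetical sketch of how the collected `(ngram_id, tf, nd)` rows and `total_docs` could be combined into a cumulated tf-idf, assuming the usual tf * log(N/nd) weighting hinted at by the `from math import log` import; the actual formula lives in the collapsed part of the file:

    for (ngram_id, tf, nd) in tf_nd:
        # hypothetical weighting: idf grows when the term appears in few of the N counted docs
        tfidfs[ngram_id] = tf * log(total_docs / nd)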