Commit 049dc862 authored by Romain Loth's avatar Romain Loth

remove sql IN from global ranking score

parent 389583e9
...@@ -9,9 +9,10 @@ FIXME: "having the same source" means we need to select inside hyperdata ...@@ -9,9 +9,10 @@ FIXME: "having the same source" means we need to select inside hyperdata
""" """
from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count() from gargantext.util.db import session, bulk_insert, aliased, \
from sqlalchemy import text # for query from raw SQL statement func # = sqlalchemy.func like sum() or count()
from sqlalchemy.sql.expression import case # for choice if ngram has mainform or not from sqlalchemy.sql.expression import case # for choice if ngram has mainform or not
from sqlalchemy import distinct # for list of unique ngram_ids within a corpus
from math import log from math import log
# £TODO # £TODO
# from gargantext.util.lists import WeightedContextIndex # from gargantext.util.lists import WeightedContextIndex
...@@ -140,7 +141,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw ...@@ -140,7 +141,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
""" """
# TODO check if cumulated tfs correspond to app's use cases and intention # TODO check if cumulated tfs correspond to app's use cases and intention
Calculates tfidf ranking (cumulated tfidf) within the given scope Calculates tfidf ranking (cumulated tfidf for each ngram) within given scope
Parameters: Parameters:
- the corpus itself - the corpus itself
...@@ -148,92 +149,110 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw ...@@ -148,92 +149,110 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
- local <=> frequencies counted in the current corpus - local <=> frequencies counted in the current corpus
- global <=> frequencies counted in all corpora of this type - global <=> frequencies counted in all corpora of this type
when the count_scope is global, there is another parameter: when the count_scope is global, there is another parameter:
- termset_scope: {"local" or "global"} - termset_scope: {"local" or "global"}
- local <=> output list of terms limited to the current corpus - local <=> output list of terms limited to the current corpus
(SELECT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>) (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
- global <=> output list of terms from all corpora of this type - global <=> output list of terms from all corpora of this type
!!!! (more terms) !!!! (many more terms)
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced) (the Node and its previous NodeNodeNgram rows will be replaced)
""" """
corpus_docids_subquery = (session # MAIN QUERY SKELETON
.query(Node.id) tf_nd_query = (session
.filter(Node.parent_id == corpus.id) .query(
.filter(Node.typename == "DOCUMENT") NodeNgram.ngram_id,
.subquery() func.sum(NodeNgram.weight), # tf: same as occurrences
# -----------------------
func.count(NodeNgram.node_id) # nd: n docs with term
# --------------------
)
.group_by(NodeNgram.ngram_id)
# optional *count_scope*: if we'll restrict the doc nodes
# -------------
# .join(countdocs_subquery,
# countdocs_subquery.c.id == NodeNgram.node_id)
# optional *termset_scope*: if we'll restrict the ngrams
# ---------------
# .join(termset_subquery,
# termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
) )
# validate string params
if count_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
if termset_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
if count_scope == "local" and termset_scope == "global":
raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
# local <=> within this corpus # local <=> within this corpus
if count_scope == "local": if count_scope == "local":
# All docs of this corpus # All docs of this corpus
count_scope_subquery = corpus_docids_subquery countdocs_subquery = (session
.query(Node.id)
termset_scope_subquery = (session .filter(Node.typename == "DOCUMENT")
.query(NodeNgram.ngram_id) .filter(Node.parent_id == corpus.id)
.filter(NodeNgram.node_id.in_(corpus_docids_subquery))
.subquery() .subquery()
) )
# both scopes are the same: no need to independantly restrict the ngrams
tf_nd_query = tf_nd_query.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
# global <=> within all corpora of this source # global <=> within all corpora of this source
elif count_scope == "global": elif count_scope == "global":
this_source_type = corpus.resources()[0]['type'] this_source_type = corpus.resources()[0]['type']
# all corpora with the same source type CorpusNode = aliased(Node)
# (we need raw SQL query for postgres JSON operators) (TODO test speed)
same_source_corpora_query = (session
.query(Node.id)
.from_statement(text(
"""
SELECT id FROM nodes
WHERE hyperdata->'resources' @> '[{\"type\"\:%s}]'
""" % this_source_type
))
)
# All docs **in all corpora of the same source** # All docs **in all corpora of the same source**
ressource_docids_subquery = (session countdocs_subquery = (session
.query(Node.id) .query(Node.id)
.filter(Node.parent_id.in_(same_source_corpora_query))
.filter(Node.typename == "DOCUMENT") .filter(Node.typename == "DOCUMENT")
# join on parent_id with selected corpora nodes
.join(CorpusNode, CorpusNode.id == Node.parent_id)
.filter(CorpusNode.typename == "CORPUS")
.filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(this_source_type))
.subquery() .subquery()
) )
count_scope_subquery = ressource_docids_subquery
if termset_scope == "global": if termset_scope == "global":
termset_scope_subquery = (session # both scopes are the same: no need to independantly restrict the ngrams
.query(NodeNgram.ngram_id) tf_nd_query = tf_nd_query.join(countdocs_subquery,
.filter(NodeNgram.node_id.in_(ressource_docids_subquery)) countdocs_subquery.c.id == NodeNgram.node_id)
.subquery()
) elif termset_scope == "local":
else:
termset_scope_subquery = (session # All unique terms in the original corpus
.query(NodeNgram.ngram_id) termset_subquery = (session
.filter(NodeNgram.node_id.in_(corpus_docids_subquery)) .query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
.join(Node)
.filter(Node.typename == "DOCUMENT")
.filter(Node.parent_id == corpus.id)
.subquery() .subquery()
) )
# only case of independant restrictions on docs and terms
tf_nd_query = (tf_nd_query
.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
.join(termset_subquery,
termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
)
# N # N
total_docs = session.query(ressource_docids_subquery).count() total_docs = session.query(countdocs_subquery).count()
# nb: possible to do the occurrences right now at the same time # result
tf_nd = (session tf_nd = tf_nd_query.all()
.query(
NodeNgram.ngram_id,
func.sum(NodeNgram.weight), # tf: same as occnode
func.count(NodeNgram.node_id) # nd: n docs with term
)
.filter(NodeNgram.node_id.in_(count_scope_subquery))
.filter(NodeNgram.ngram_id.in_(termset_scope_subquery))
.group_by(NodeNgram.ngram_id)
.all()
)
# ------------------------------------------------- # -------------------------------------------------
tfidfs = {} tfidfs = {}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment