Commit 2785cf15 authored by Romain Loth

workflow: fix set of terms used in global ranking score (still using IN)

parent 242acca7
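In substance: the cumulated-tfidf step becomes a ranking step whose document counts and scored term set are scoped independently. A minimal sketch of the new call, using only names that appear in this diff:

    # count_scope="global"  => tf and nd counted over all corpora of the same source type
    # termset_scope="local" => but the scored term set is restricted to this corpus's terms
    #                          (currently via a SQL IN subquery, hence the commit title)
    tirank_id = compute_ti_ranking(corpus,
                                   count_scope="global",
                                   termset_scope="local")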
@@ -195,9 +195,9 @@ RESOURCETYPES = [
 ]
 # linguistic extraction parameters ---------------------------------------------
-DEFAULT_TFIDF_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
-DEFAULT_TFIDF_HARD_LIMIT = 5000  # MAINLIST maximum terms abs
+DEFAULT_RANK_CUTOFF_RATIO = .75  # MAINLIST maximum terms in %
+DEFAULT_RANK_HARD_LIMIT = 5000   # MAINLIST maximum terms abs
                                  # (makes COOCS larger ~ O(N²) /!\)
 DEFAULT_COOC_THRESHOLD = 2       # inclusive minimum for COOCS coefs
...
@@ -6,7 +6,7 @@ from .hyperdata_indexing import index_hyperdata
 # in usual run order
 from .list_stop import do_stoplist
-from .metric_tfidf import compute_occs, compute_tfidf_local, compute_cumulated_tfidf
+from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
 from .list_main import do_mainlist
 from .ngram_coocs import compute_coocs
 from .metric_specificity import compute_specificity
@@ -116,13 +116,15 @@ def parse_extract_indexhyperdata(corpus):
     ltfidf_id = compute_tfidf_local(corpus)
     print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
-    # -> write global and cumulated tfidf to Node and NodeNodeNgram
-    gtfidf_id = compute_cumulated_tfidf(corpus, scope="global")
-    print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
+    # -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
+    tirank_id = compute_ti_ranking(corpus,
+                                   count_scope="global",
+                                   termset_scope="local")
+    print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
     # -> mainlist: filter + write (to Node and NodeNgram)
     mainlist_id = do_mainlist(corpus,
-                              tfidf_id = gtfidf_id,
+                              ranking_scores_id = tirank_id,
                               stoplist_id = stop_id)
     print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
@@ -143,7 +145,7 @@ def parse_extract_indexhyperdata(corpus):
     print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
     print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))
     corpus.status('Lists', progress=0, complete=True)
     corpus.save_hyperdata()
     session.commit()
...
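For orientation, the ngram-lists phase runs its steps in the "usual run order" given by the imports above; a condensed sketch (the exact call shapes of steps not shown in this diff are assumptions):

    stop_id     = do_stoplist(corpus)            # stoplist (assumed call shape)
    occ_id      = compute_occs(corpus)           # occurrence counts
    ltfidf_id   = compute_tfidf_local(corpus)    # local tfidf
    tirank_id   = compute_ti_ranking(corpus,     # global ranking (this commit)
                                     count_scope="global",
                                     termset_scope="local")
    mainlist_id = do_mainlist(corpus, ranking_scores_id=tirank_id,
                                      stoplist_id=stop_id)
    cooc_id     = compute_coocs(corpus)          # coocs on mainlist (assumed call shape)
    spec_id     = compute_specificity(corpus, cooc_id=cooc_id)
    # ... then the maplist, as printed in the hunk above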
@@ -2,14 +2,14 @@ from gargantext.models import Node, NodeNgram, NodeNodeNgram
 from gargantext.util.db import session
 from gargantext.util.lists import UnweightedList
 from sqlalchemy import desc
-from gargantext.constants import DEFAULT_TFIDF_CUTOFF_RATIO, \
-                                 DEFAULT_TFIDF_HARD_LIMIT
+from gargantext.constants import DEFAULT_RANK_CUTOFF_RATIO, \
+                                 DEFAULT_RANK_HARD_LIMIT

 def do_mainlist(corpus,
                 overwrite_id = None,
-                tfidf_id=None, stoplist_id=None,
-                hard_limit=DEFAULT_TFIDF_HARD_LIMIT,
-                ratio_limit=DEFAULT_TFIDF_CUTOFF_RATIO
+                ranking_scores_id=None, stoplist_id=None,
+                hard_limit=DEFAULT_RANK_HARD_LIMIT,
+                ratio_limit=DEFAULT_RANK_CUTOFF_RATIO
                 ):
""" """
Select top n terms according to a global tfidf ranking and stoplist filter. Select top n terms according to a global tfidf ranking and stoplist filter.
...@@ -18,7 +18,7 @@ def do_mainlist(corpus, ...@@ -18,7 +18,7 @@ def do_mainlist(corpus,
min(hard_limit, number_of_terms * ratio_limit) min(hard_limit, number_of_terms * ratio_limit)
NB : We use a global tfidf node where the values are global but the ngrams NB : We use a global tfidf node where the values are global but the ngrams
are already selected (== only within this corpus documents). are already selected (termset_scope == only within this corpus docs).
TO DISCUSS: allow influence of the local tfidf scores too TO DISCUSS: allow influence of the local tfidf scores too
Parameters: Parameters:
@@ -37,12 +37,12 @@ def do_mainlist(corpus,
     """
     # retrieve helper nodes if not provided
-    if not tfidf_id:
-        tfidf_id = session.query(Node.id).filter(
+    if not ranking_scores_id:
+        ranking_scores_id = session.query(Node.id).filter(
             Node.typename == "TFIDF-GLOBAL",
             Node.parent_id == corpus.id
         ).first()
-        if not tfidf_id:
+        if not ranking_scores_id:
             raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")

     if not stoplist_id:
@@ -64,7 +64,7 @@ def do_mainlist(corpus,
     # tfidf-ranked query
     ordered_filtered_tfidf = (session
         .query(NodeNodeNgram.ngram_id)
-        .filter(NodeNodeNgram.node1_id == tfidf_id)
+        .filter(NodeNodeNgram.node1_id == ranking_scores_id)
         .filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
         .order_by(desc(NodeNodeNgram.score))
     )
...
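The cut itself is outside this hunk; per the docstring rule min(hard_limit, number_of_terms * ratio_limit), the ordered query is presumably truncated along these lines (the .count()/.limit() continuation is an assumption, not shown in the diff):

    # hypothetical continuation of do_mainlist, applying the renamed constants
    n_ranked = ordered_filtered_tfidf.count()
    n_kept = min(hard_limit, round(n_ranked * ratio_limit))
    top_ngram_ids = [r.ngram_id for r in ordered_filtered_tfidf.limit(n_kept)]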
@@ -44,11 +44,11 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
     # v = d.sum(axis=1) (- itself)
     xs = x.sum(axis=1) - x
     ys = x.sum(axis=0) - x

     # top included or excluded
     #n = ( xs + ys) / (2 * (x.shape[0] - 1))

     # top generic or specific (asc is spec, desc is generic)
     v = ( xs - ys) / ( 2 * (x.shape[0] - 1))
@@ -105,11 +105,14 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
     # print(v)
     pd.options.display.float_format = '${:,.2f}'.format
-    data = WeightedList(
-        zip( v.index.tolist()
-           , v.values.tolist()[0]
-           )
-    )
-    data.save(the_id)
+    if not v.empty:
+        data = WeightedList(
+            zip( v.index.tolist()
+               , v.values.tolist()[0]
+               )
+        )
+        data.save(the_id)
+    else:
+        print("WARNING: had no terms in COOCS => empty SPECIFICITY node")
     return(the_id)
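What v holds above: for each term, (row sum - column sum) of the cooccurrence matrix over 2(n-1). Because the Series-minus-DataFrame subtraction broadcasts over columns, every row of v comes out identical, which is why the code reads v.values.tolist()[0]. A toy check with made-up weights:

    import pandas as pd

    # 3x3 toy cooccurrence matrix (made-up numbers)
    x = pd.DataFrame([[0, 4, 1],
                      [2, 0, 3],
                      [1, 1, 0]],
                     index=list('abc'), columns=list('abc'))
    xs = x.sum(axis=1) - x                  # rowsum[term] - x (broadcast on columns)
    ys = x.sum(axis=0) - x                  # colsum[term] - x (broadcast on columns)
    v = (xs - ys) / (2 * (x.shape[0] - 1))  # per term: (rowsum - colsum) / (2*(n-1))
    print(v.values.tolist()[0])             # [0.5, 0.0, -0.5]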
@@ -88,7 +88,7 @@ def compute_occs(corpus, overwrite_id = None):
     return the_id

-def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
+def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overwrite_id=None):
     """
     # TODO check if cumulated tfs correspond to app's use cases and intention
@@ -96,55 +96,93 @@ def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
     Parameters:
       - the corpus itself
-      - scope: {"local" or "global"}
+      - count_scope: {"local" or "global"}
+         - local  <=> frequencies counted in the current corpus
+         - global <=> frequencies counted in all corpora of this type
+        when the count_scope is global, there is another parameter:
+          - termset_scope: {"local" or "global"}
+             - local  <=> output list of terms limited to the current corpus
+               (SELECT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
+             - global <=> output list of terms from all corpora of this type
+                          !!!! (more terms)
       - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
         (the Node and its previous NodeNodeNgram rows will be replaced)
     """
+    corpus_docids_subquery = (session
+        .query(Node.id)
+        .filter(Node.parent_id == corpus.id)
+        .filter(Node.typename == "DOCUMENT")
+        .subquery()
+    )

     # local <=> within this corpus
-    if scope == "local":
+    if count_scope == "local":
         # All docs of this corpus
-        docids_subquery = (session
-            .query(Node.id)
-            .filter(Node.parent_id == corpus.id)
-            .filter(Node.typename == "DOCUMENT")
-            .subquery()
-        )
+        count_scope_subquery = corpus_docids_subquery
+        termset_scope_subquery = (session
+            .query(NodeNgram.ngram_id)
+            .filter(NodeNgram.node_id.in_(corpus_docids_subquery))
+            .subquery()
+        )
     # global <=> within all corpora of this source
-    elif scope == "global":
+    elif count_scope == "global":
         this_source_type = corpus.resources()[0]['type']

         # all corpora with the same source type
         # (we need raw SQL query for postgres JSON operators) (TODO test speed)
         same_source_corpora_query = (session
             .query(Node.id)
             .from_statement(text(
                 """
                 SELECT id FROM nodes
                 WHERE hyperdata->'resources' @> '[{\"type\"\:%s}]'
                 """ % this_source_type
             ))
         )

         # All docs **in all corpora of the same source**
-        docids_subquery = (session
+        ressource_docids_subquery = (session
             .query(Node.id)
             .filter(Node.parent_id.in_(same_source_corpora_query))
             .filter(Node.typename == "DOCUMENT")
             .subquery()
         )
+        count_scope_subquery = ressource_docids_subquery
+
+        if termset_scope == "global":
+            termset_scope_subquery = (session
+                .query(NodeNgram.ngram_id)
+                .filter(NodeNgram.node_id.in_(ressource_docids_subquery))
+                .subquery()
+            )
+        else:
+            termset_scope_subquery = (session
+                .query(NodeNgram.ngram_id)
+                .filter(NodeNgram.node_id.in_(corpus_docids_subquery))
+                .subquery()
+            )
     # N
-    total_docs = session.query(docids_subquery).count()
+    total_docs = session.query(count_scope_subquery).count()

-    # or perhaps at least do the occurrences right now at the same time
+    # nb: possible to do the occurrences right now at the same time
     tf_nd = (session
         .query(
             NodeNgram.ngram_id,
             func.sum(NodeNgram.weight),    # tf: same as occnode
             func.count(NodeNgram.node_id)  # nd: n docs with term
         )
-        .filter(NodeNgram.node_id.in_(docids_subquery))
+        .filter(NodeNgram.node_id.in_(count_scope_subquery))
+        .filter(NodeNgram.ngram_id.in_(termset_scope_subquery))
         .group_by(NodeNgram.ngram_id)
         .all()
     )
@@ -162,10 +200,10 @@ def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
     else:
         # create the new TFIDF-XXXX node
         tfidf_nd = corpus.add_child()
-        if scope == "local": # TODO discuss use and find new typename
+        if count_scope == "local": # TODO discuss use and find new typename
             tfidf_nd.typename = "TFIDF-CORPUS"
             tfidf_nd.name = "tfidf-cumul-corpus (in:%s)" % corpus.id
-        elif scope == "global":
+        elif count_scope == "global":
             tfidf_nd.typename = "TFIDF-GLOBAL"
             tfidf_nd.name = "tfidf-cumul-global (in type:%s)" % this_source_type
         session.add(tfidf_nd)
...
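On the "(still using IN)" in the commit title: with count_scope="global" and termset_scope="local", the tf/nd aggregation above boils down to SQL of roughly this shape (a paraphrase for readability, not the actual generated query); replacing these IN subqueries with JOINs reads as the intended follow-up:

    # rough shape of the aggregation, as plain SQL in a comment:
    #   SELECT nn.ngram_id, SUM(nn.weight) AS tf, COUNT(nn.node_id) AS nd
    #   FROM nodes_ngrams nn
    #   WHERE nn.node_id IN (<docs of all same-source corpora>)      -- count_scope
    #     AND nn.ngram_id IN (SELECT ngram_id FROM nodes_ngrams
    #                         WHERE node_id IN (<this corpus docs>)) -- termset_scope
    #   GROUP BY nn.ngram_id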