Commit 87f75264 authored by delanoe

Merge branch 'romain-refactoring' into merge

parents 4362b85b 4c3aa4b9
......@@ -194,9 +194,9 @@ RESOURCETYPES = [
]
# linguistic extraction parameters ---------------------------------------------
DEFAULT_TFIDF_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
DEFAULT_RANK_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
DEFAULT_TFIDF_HARD_LIMIT = 5000 # MAINLIST maximum terms abs
DEFAULT_RANK_HARD_LIMIT = 5000 # MAINLIST maximum terms abs
# (makes COOCS larger ~ O(N²) /!\)
DEFAULT_COOC_THRESHOLD = 2 # inclusive minimum for COOCS coefs
......
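The two renamed constants combine as described in do_mainlist's docstring further down (`min(hard_limit, number_of_terms * ratio_limit)`); a minimal sketch of that cutoff, where the candidate count is a hypothetical input:

```python
from math import floor

DEFAULT_RANK_CUTOFF_RATIO = .75  # MAINLIST maximum terms in %
DEFAULT_RANK_HARD_LIMIT = 5000   # MAINLIST maximum terms abs

def mainlist_size(n_candidate_terms):
    """Effective MAINLIST size: min(hard_limit, number_of_terms * ratio_limit)."""
    return min(DEFAULT_RANK_HARD_LIMIT,
               floor(n_candidate_terms * DEFAULT_RANK_CUTOFF_RATIO))

# e.g. 1000 candidates -> 750 kept; 10000 candidates -> capped at 5000
```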
......@@ -6,7 +6,7 @@ from .hyperdata_indexing import index_hyperdata
# in usual run order
from .list_stop import do_stoplist
from .metric_tfidf import compute_occs, compute_tfidf_local, compute_cumulated_tfidf
from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
from .list_main import do_mainlist
from .ngram_coocs import compute_coocs
from .metric_specificity import compute_specificity
......@@ -116,13 +116,15 @@ def parse_extract_indexhyperdata(corpus):
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# -> write global and cumulated tfidf to Node and NodeNodeNgram
gtfidf_id = compute_cumulated_tfidf(corpus, scope="global")
print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
# -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
tirank_id = compute_ti_ranking(corpus,
count_scope="global",
termset_scope="local")
print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
# -> mainlist: filter + write (to Node and NodeNgram)
mainlist_id = do_mainlist(corpus,
tfidf_id = gtfidf_id,
ranking_scores_id = tirank_id,
stoplist_id = stop_id)
print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
......@@ -143,7 +145,7 @@ def parse_extract_indexhyperdata(corpus):
print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))
corpus.status('Lists', progress=0, complete=True)
corpus.save_hyperdata()
session.commit()
......
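In short, the refactor splits the old single `scope` into two independent parameters; a sketch of the before/after call shape in this pipeline (names taken from the diff, `stop_id` from the surrounding pipeline code):

```python
# before: one parameter controlled both the doc counts and the term set
# gtfidf_id = compute_cumulated_tfidf(corpus, scope="global")

# after: document frequencies can be counted over all corpora of this
# source while the scored term set stays limited to this corpus
tirank_id = compute_ti_ranking(corpus,
                               count_scope="global",
                               termset_scope="local")
mainlist_id = do_mainlist(corpus,
                          ranking_scores_id=tirank_id,  # renamed from tfidf_id
                          stoplist_id=stop_id)
```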
......@@ -2,14 +2,14 @@ from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from gargantext.constants import DEFAULT_TFIDF_CUTOFF_RATIO, \
DEFAULT_TFIDF_HARD_LIMIT
from gargantext.constants import DEFAULT_RANK_CUTOFF_RATIO, \
DEFAULT_RANK_HARD_LIMIT
def do_mainlist(corpus,
overwrite_id = None,
tfidf_id=None, stoplist_id=None,
hard_limit=DEFAULT_TFIDF_HARD_LIMIT,
ratio_limit=DEFAULT_TFIDF_CUTOFF_RATIO
ranking_scores_id=None, stoplist_id=None,
hard_limit=DEFAULT_RANK_HARD_LIMIT,
ratio_limit=DEFAULT_RANK_CUTOFF_RATIO
):
"""
Select top n terms according to a global tfidf ranking and stoplist filter.
......@@ -18,7 +18,7 @@ def do_mainlist(corpus,
min(hard_limit, number_of_terms * ratio_limit)
NB : We use a global tfidf node where the values are global but the ngrams
are already selected (== only within this corpus documents).
are already selected (termset_scope == only within this corpus docs).
TO DISCUSS: allow influence of the local tfidf scores too
Parameters:
......@@ -37,12 +37,12 @@ def do_mainlist(corpus,
"""
# retrieve helper nodes if not provided
if not tfidf_id:
tfidf_id = session.query(Node.id).filter(
if not ranking_scores_id:
ranking_scores_id = session.query(Node.id).filter(
Node.typename == "TFIDF-GLOBAL",
Node.parent_id == corpus.id
).first()
if not tfidf_id:
if not ranking_scores_id:
raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")
if not stoplist_id:
......@@ -64,7 +64,7 @@ def do_mainlist(corpus,
# tfidf-ranked query
ordered_filtered_tfidf = (session
.query(NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == tfidf_id)
.filter(NodeNodeNgram.node1_id == ranking_scores_id)
.filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
.order_by(desc(NodeNodeNgram.score))
)
......
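The hunk ends just before the ranked query is truncated; a hedged sketch of how the cutoff and save step would complete it (the `.count()`/`.limit()` wiring and `the_id` are assumptions, not shown in this diff; `UnweightedList` comes from the imports above):

```python
# number of candidate terms once stopterms are excluded
n_candidates = ordered_filtered_tfidf.count()

# keep at most min(hard_limit, n_candidates * ratio_limit) top-ranked terms
n_kept = min(hard_limit, int(n_candidates * ratio_limit))
top_ngram_ids = [row.ngram_id for row in ordered_filtered_tfidf.limit(n_kept)]

# persist under the mainlist node (overwrite_id or a fresh child node)
UnweightedList(top_ngram_ids).save(the_id)
```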
......@@ -44,11 +44,11 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
# v = d.sum(axis=1) (minus itself)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top included or excluded
#n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific (asc is spec, desc is generic)
v = ( xs - ys) / ( 2 * (x.shape[0] - 1))
......@@ -105,11 +105,14 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
# print(v)
pd.options.display.float_format = '${:,.2f}'.format
data = WeightedList(
zip( v.index.tolist()
, v.values.tolist()[0]
)
)
data.save(the_id)
if not v.empty:
data = WeightedList(
zip( v.index.tolist()
, v.values.tolist()[0]
)
)
data.save(the_id)
else:
print("WARNING: had no terms in COOCS => empty SPECIFICITY node")
return(the_id)
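For readers outside the codebase, a self-contained toy reproduction of the specificity formula in this hunk (assumes pandas; the real `x` is the cooccurrence matrix loaded from the COOCS node):

```python
import pandas as pd

# toy (non-symmetric) cooccurrence counts between three ngrams
x = pd.DataFrame([[0, 3, 1],
                  [1, 0, 4],
                  [2, 2, 0]],
                 index=list("abc"), columns=list("abc"))

xs = x.sum(axis=1) - x   # each term's total coocs minus the pair's own count
ys = x.sum(axis=0) - x   # the same, counted from the other direction
# per-term score: ascending sort reads as specific, descending as generic
v = (xs - ys) / (2 * (x.shape[0] - 1))

if not v.empty:          # same guard as the new code above
    print(v)
```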
......@@ -88,7 +88,7 @@ def compute_occs(corpus, overwrite_id = None):
return the_id
def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overwrite_id=None):
"""
# TODO check if cumulated tfs correspond to app's use cases and intention
......@@ -96,55 +96,93 @@ def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
Parameters:
- the corpus itself
- scope: {"local" or "global"}
- count_scope: {"local" or "global"}
- local <=> frequencies counted in the current corpus
- global <=> frequencies counted in all corpora of this type
when the count_scope is global, there is another parameter:
- termset_scope: {"local" or "global"}
- local <=> output list of terms limited to the current corpus
(SELECT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
- global <=> output list of terms from all corpora of this type
!!!! (more terms)
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
corpus_docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
)
# local <=> within this corpus
if scope == "local":
if count_scope == "local":
# All docs of this corpus
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
)
count_scope_subquery = corpus_docids_subquery
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(corpus_docids_subquery))
.subquery()
)
# global <=> within all corpora of this source
elif scope == "global":
elif count_scope == "global":
this_source_type = corpus.resources()[0]['type']
# all corpora with the same source type
# (we need raw SQL query for postgres JSON operators) (TODO test speed)
same_source_corpora_query = (session
.query(Node.id)
.from_statement(text(
"""
SELECT id FROM nodes
WHERE hyperdata->'resources' @> '[{\"type\"\:%s}]'
""" % this_source_type
))
)
.query(Node.id)
.from_statement(text(
"""
SELECT id FROM nodes
WHERE hyperdata->'resources' @> '[{\"type\"\:%s}]'
""" % this_source_type
))
)
# All docs **in all corpora of the same source**
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id.in_(same_source_corpora_query))
.filter(Node.typename == "DOCUMENT")
ressource_docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id.in_(same_source_corpora_query))
.filter(Node.typename == "DOCUMENT")
.subquery()
)
count_scope_subquery = ressource_docids_subquery
if termset_scope == "global":
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(ressource_docids_subquery))
.subquery()
)
else:
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(corpus_docids_subquery))
.subquery()
)
# N
total_docs = session.query(docids_subquery).count()
total_docs = session.query(ressource_docids_subquery).count()
# or perhaps at least do the occurrences right now at the same time
# nb: possible to do the occurrences right now at the same time
tf_nd = (session
.query(
NodeNgram.ngram_id,
func.sum(NodeNgram.weight), # tf: same as occnode
func.count(NodeNgram.node_id) # nd: n docs with term
)
.filter(NodeNgram.node_id.in_(docids_subquery))
.filter(NodeNgram.node_id.in_(count_scope_subquery))
.filter(NodeNgram.ngram_id.in_(termset_scope_subquery))
.group_by(NodeNgram.ngram_id)
.all()
)
......@@ -162,10 +200,10 @@ def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
else:
# create the new TFIDF-XXXX node
tfidf_nd = corpus.add_child()
if scope == "local": # TODO discuss use and find new typename
if count_scope == "local": # TODO discuss use and find new typename
tfidf_nd.typename = "TFIDF-CORPUS"
tfidf_nd.name = "tfidf-cumul-corpus (in:%s)" % corpus.id
elif scope == "global":
elif count_scope == "global":
tfidf_nd.typename = "TFIDF-GLOBAL"
tfidf_nd.name = "tfidf-cumul-global (in type:%s)" % this_source_type
session.add(tfidf_nd)
......
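The hunk stops before the scores themselves are computed from `tf_nd`; a hedged sketch of the usual cumulated tf-idf those `(ngram_id, tf, nd)` tuples feed (the exact weighting and log base used by the app are assumptions, not shown in this diff):

```python
from math import log

# tf = summed occurrences of the ngram over the counted docs (count_scope)
# nd = number of those docs containing the ngram; total_docs = N
tfidf = {
    ngram_id: tf * log(total_docs / nd)
    for (ngram_id, tf, nd) in tf_nd
}
```

Each score would then be stored as a NodeNodeNgram row keyed on the new ranking node, which do_mainlist orders by `desc(NodeNodeNgram.score)`.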