Commit 3b2d568c authored by Romain Loth

add groups to ngram_coocs + fix date params + fix stoplist param + remove sql IN operators there

parent 92d5dfcd
@@ -111,15 +111,11 @@ def parse_extract_indexhyperdata(corpus):
     group_id = compute_groups(corpus, stoplist_id = None)
     print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))

-    # ------------
     # -> write occurrences to Node and NodeNodeNgram  # (todo: NodeNgram)
     occ_id = compute_occs(corpus, groupings_id = group_id)
     print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

-    # ------------
-    # -> write local tfidf similarities to Node and NodeNodeNgram
-    ltfidf_id = compute_tfidf_local(corpus)
-    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))

     # -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
     tirank_id = compute_ti_ranking(corpus,
                                    groupings_id = group_id,
@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
                               stoplist_id = stop_id)
     print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))

+    # -> write local tfidf similarities to Node and NodeNodeNgram
+    # TODO only on mainlist
+    ltfidf_id = compute_tfidf_local(corpus)
+    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
+    # => used for doc <=> ngram association

     # ------------
     # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
-    cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id)
+    cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
     print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))

     # -> specificity: compute + write (=> NodeNodeNgram)
-    spec_id = compute_specificity(corpus, cooc_id=cooc_id)
+    spec_id = compute_specificity(corpus, cooc_id=cooc_id
+                                  # ,groupings_id = group_id
+                                  )
     print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))

     # ?? maplist: compute + write (to Node and NodeNgram)
...
@@ -65,6 +65,9 @@ def do_mainlist(corpus,
     ordered_filtered_tfidf = (session
         .query(NodeNodeNgram.ngram_id)
        .filter(NodeNodeNgram.node1_id == ranking_scores_id)
+       # NOT IN, but speed is theoretically OK here
+       # see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
+       # but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
        .filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
        .order_by(desc(NodeNodeNgram.score))
     )
...
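The links in the comment above contrast NOT IN with a left anti-join. For reference, a minimal sketch of the anti-join variant of that same filter, assuming the same stopterms_subquery (a subquery exposing a single ngram_id column), could look like:

    # hedged sketch, not part of the commit: anti-join form of the NOT IN above
    ordered_filtered_tfidf = (session
        .query(NodeNodeNgram.ngram_id)
        .filter(NodeNodeNgram.node1_id == ranking_scores_id)
        # LEFT OUTER JOIN against the stoplist, then keep only unmatched rows
        .outerjoin(stopterms_subquery,
                   stopterms_subquery.c.ngram_id == NodeNodeNgram.ngram_id)
        .filter(stopterms_subquery.c.ngram_id == None)
        .order_by(desc(NodeNodeNgram.score))
    )

This is the same outerjoin + IS NULL pattern this commit itself adopts for the stoplist in compute_coocs below.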
@@ -63,7 +63,6 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
     # ------------
     # (the occurrences are the sums for each ngram's mainform)
     else:
-        print ("gtoup mode")
         # sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
         syn = (session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id)
...
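The syn subquery above holds (mainform, subform) pairs as (ngram1_id, ngram2_id). A self-contained sketch of how such a subquery typically feeds the grouped occurrence count, assuming the module's usual session / func / Node / NodeNgram imports (an illustration, not the function's exact body):

    # hedged sketch: count each occurrence under the ngram's mainform
    # coalesce() picks the mainform when the indexed ngram is a subform,
    # and the ngram itself otherwise (syn columns are NULL after the outer join)
    counted_form = func.coalesce(syn.c.ngram1_id, NodeNgram.ngram_id)
    occs_query = (session
        .query(counted_form, func.sum(NodeNgram.weight))
        .join(Node, Node.id == NodeNgram.node_id)
        .outerjoin(syn, syn.c.ngram2_id == NodeNgram.ngram_id)
        .filter(Node.parent_id == corpus.id)
        .filter(Node.typename == "DOCUMENT")
        .group_by(counted_form)
    )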
 from gargantext.models           import Node, NodeNgram, NodeNgramNgram, \
-                                        NodeHyperdata
+                                        NodeHyperdata, Ngram
 from gargantext.util.lists       import WeightedMatrix
 from gargantext.util.db          import session, aliased, func
 from gargantext.util.db_cache    import cache
 from gargantext.constants        import DEFAULT_COOC_THRESHOLD
 from datetime                    import datetime
+from sqlalchemy.sql.expression   import case   # to pick the mainform when the ngram has one

 def compute_coocs(  corpus,
                     overwrite_id   = None,
                     threshold      = DEFAULT_COOC_THRESHOLD,
+                    groupings_id   = None,
                     mainlist_id    = None,
                     stoplist_id    = None,
                     start          = None,
@@ -41,9 +44,11 @@ def compute_coocs( corpus,
         - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                         (all hyperdata and previous NodeNgramNgram rows will be replaced)
         - threshold: on output cooc count (previously called hapax)
+        - groupings_id: optional synonym relations used to add each subform's
+                        counts to its mainform's counts
         - mainlist_id: mainlist to constrain the input ngrams
         - stoplist_id: stoplist for filtering input ngrams
-                       (normally unnecessary if a mainlist is provided)
+                       (normally unnecessary if a mainlist is already provided)
         - start, end: provide one or both temporal limits to filter on doc date
                       NB the expected type of parameter value is datetime.datetime
                          (string is also possible but format must follow
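A usage sketch for the new signature (ids and corpus assumed to exist; both accepted forms of the date params shown):

    # hedged usage sketch, not from the codebase
    from datetime import datetime
    cooc_id = compute_coocs(corpus,
                            mainlist_id  = mainlist_id,
                            groupings_id = group_id,
                            start = datetime(2015, 1, 1),  # datetime object...
                            end   = "2016-12-31")          # ...or '%Y-%m-%d' string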
@@ -56,25 +61,24 @@ def compute_coocs( corpus,
     basic idea for one doc
     ======================
     each pair of ngrams sharing same doc (node_id)
-        SELECT idx1.ngram_id, idx2.ngram_id
-        FROM nodes_ngrams AS idx1, nodes_ngrams AS idx2
+        SELECT idxa.ngram_id, idxb.ngram_id
+        FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
         ---------------------------------
-        WHERE idx1.node_id = idx2.node_id      <== that's cooc
+        WHERE idxa.node_id = idxb.node_id      <== that's cooc
         ---------------------------------
-        AND idx1.ngram_id <> idx2.ngram_id
-        AND idx1.node_id = MY_DOC ;
+        AND idxa.ngram_id <> idxb.ngram_id
+        AND idxa.node_id = MY_DOC ;

     on entire corpus
     =================
     coocs for each doc :
       - each given pair like (termA, termB) will likely appear several times
-        => we do GROUP BY (x1.ngram_id, x2.ngram_id)
+        => we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
       - we count unique appearances of the pair (cooc)
     """

+    # - TODO add each grouped element's values to its mainform ('chief ngram')
     # - TODO cvalue_id: allow a metric as additional input filter
     # - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
     # - TODO weighted: if False normal cooc to be saved as result
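Extending the docstring's pseudo-SQL, the corpus-wide grouped count sketched by the two bullets above would read roughly:

    -- hedged pseudo-SQL continuation of the docstring example
    SELECT idxa.ngram_id, idxb.ngram_id, COUNT(*) AS ucooc
    FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
    WHERE idxa.node_id = idxb.node_id          -- <== that's cooc
    AND   idxa.ngram_id <> idxb.ngram_id
    GROUP BY idxa.ngram_id, idxb.ngram_id ;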
@@ -86,124 +90,194 @@ def compute_coocs( corpus,
     #   71,134 rows when limiting to ngrams with occ > 1 (weight)

     # 2 x the occurrence index table
-    x1 = aliased(NodeNgram)
-    x2 = aliased(NodeNgram)
-
-    # cooccurrences columns definition
-    ucooc = func.count(x1.ngram_id).label("ucooc")
-
-    # 1) MAIN DB QUERY
-    coocs_query = (
-        session.query(x1.ngram_id, x2.ngram_id, ucooc)
-        .join(Node, Node.id == x1.node_id)    # <- b/c within corpus
-        .join(x2, x1.node_id == Node.id )     # <- b/c within corpus
-        .filter(Node.parent_id == corpus.id)  # <- b/c within corpus
-        .filter(Node.typename == "DOCUMENT")  # <- b/c within corpus
-        .filter(x1.node_id == x2.node_id)     # <- by definition of cooc
-        .filter(x1.ngram_id != x2.ngram_id)   # <- b/c not with itself
-        .group_by(x1.ngram_id, x2.ngram_id)
-    )
+    Xindex = aliased(NodeNgram)
+    Yindex = aliased(NodeNgram)
+
+    # for debug (1/4)
+    # Xngram = aliased(Ngram)
+    # Yngram = aliased(Ngram)
+
+    # 1) prepare definition of counted forms
+    if not groupings_id:
+        # no groupings => the counted forms are the ngrams
+        Xindex_ngform_id = Xindex.ngram_id
+        Yindex_ngform_id = Yindex.ngram_id
+
+    # groupings: see the detailed comment in compute_occs() + TODO refactor
+    else:
+        # prepare translations
+        Xsyno = (session.query(NodeNgramNgram.ngram1_id,
+                               NodeNgramNgram.ngram2_id)
+                 .filter(NodeNgramNgram.node_id == groupings_id)
+                 .subquery()
+                )
+
+        # further use as anon tables prevents doing Ysyno = Xsyno
+        Ysyno = (session.query(NodeNgramNgram.ngram1_id,
+                               NodeNgramNgram.ngram2_id)
+                 .filter(NodeNgramNgram.node_id == groupings_id)
+                 .subquery()
+                )

+        # groupings => define the counted form depending on the existence of a synonym
+        Xindex_ngform_id = case([
+            (Xsyno.c.ngram1_id != None, Xsyno.c.ngram1_id),
+            (Xsyno.c.ngram1_id == None, Xindex.ngram_id)
+            #        condition               value
+        ])
+        Yindex_ngform_id = case([
+            (Ysyno.c.ngram1_id != None, Ysyno.c.ngram1_id),
+            (Ysyno.c.ngram1_id == None, Yindex.ngram_id)
+        ])
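Since the outer joins added below leave Xsyno.c.ngram1_id NULL exactly when an ngram has no mainform, each case() above amounts to a coalesce. A more compact equivalent, as a sketch (not what the commit uses):

    # hedged equivalent of the two case() definitions above
    Xindex_ngform_id = func.coalesce(Xsyno.c.ngram1_id, Xindex.ngram_id)
    Yindex_ngform_id = func.coalesce(Ysyno.c.ngram1_id, Yindex.ngram_id)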
+    # ---
+
+    # 2) BASE DB QUERY
+
+    # cooccurrences columns definition ----------------
+    ucooc = func.count(Xindex_ngform_id).label("ucooc")
+    # NB this could be either X or Y in this line
+    #    (we're counting grouped rows and just happen to do it on this column)
+
+    base_query = (
+        session.query(
+            Xindex_ngform_id,
+            Yindex_ngform_id,
+            ucooc
+            # for debug (2/4)
+            #, Xngram.terms.label("w_x")
+            #, Yngram.terms.label("w_y")
+        )
+        .join(Yindex, Xindex.node_id == Yindex.node_id)   # <- by definition of cooc
+        .join(Node, Node.id == Xindex.node_id)            # <- b/c within corpus
+        .filter(Node.parent_id == corpus.id)              # <- b/c within corpus
+        .filter(Node.typename == "DOCUMENT")              # <- b/c within corpus
+        .filter(Xindex_ngform_id != Yindex_ngform_id)     # <- b/c not with itself
+    )
+
+    # outerjoin the synonyms if needed
+    if groupings_id:
+        base_query = (base_query
+            .outerjoin(Xsyno,   # <- synonyms for Xindex.ngrams
+                       Xsyno.c.ngram2_id == Xindex.ngram_id)
+            .outerjoin(Ysyno,   # <- synonyms for Yindex.ngrams
+                       Ysyno.c.ngram2_id == Yindex.ngram_id)
+        )
+
+    # 3) counting clause in any case
+    coocs_query = (base_query
+        .group_by(
+            Xindex_ngform_id, Yindex_ngform_id   # <- what we're counting
+            # for debug (3/4)
+            #, "w_x", "w_y"
+        )
+        # for debug (4/4)
+        #.join(Xngram, Xngram.id == Xindex_ngform_id)
+        #.join(Yngram, Yngram.id == Yindex_ngform_id)
+        .order_by(ucooc)
+    )
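To check what SQL this actually emits before running it, the Query object can simply be rendered to a string (a debugging aid, not part of the commit):

    # str() on a SQLAlchemy Query compiles it to SQL with bound-parameter placeholders
    print(str(coocs_query))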
-    # 2) INPUT FILTERS (reduce N before O(N²))
+    # 4) INPUT FILTERS (reduce N before O(N²))
     if mainlist_id:
         m1 = aliased(NodeNgram)
         m2 = aliased(NodeNgram)

         coocs_query = ( coocs_query
-            .join(m1, m1.ngram_id == x1.ngram_id)
-            .join(m2, m2.ngram_id == x2.ngram_id)
+            .join(m1, m1.ngram_id == Xindex_ngform_id)
+            .join(m2, m2.ngram_id == Yindex_ngform_id)

             .filter( m1.node_id == mainlist_id )
             .filter( m2.node_id == mainlist_id )
         )
     if stoplist_id:
-        s1 = aliased(NodeNgram)
-        s2 = aliased(NodeNgram)
+        s1 = (session.query(NodeNgram.ngram_id)
+              .filter(NodeNgram.node_id == stoplist_id)
+              .subquery()
+             )
+
+        # further use as anon tables prevents doing s2 = s1
+        s2 = (session.query(NodeNgram.ngram_id)
+              .filter(NodeNgram.node_id == stoplist_id)
+              .subquery()
+             )

         coocs_query = ( coocs_query
-            .join(m1, s1.ngram_id == x1.ngram_id)
-            .join(m2, s2.ngram_id == x2.ngram_id)
+            .outerjoin(s1, s1.c.ngram_id == Xindex_ngform_id)
+            .outerjoin(s2, s2.c.ngram_id == Yindex_ngform_id)

+            # equivalent to NOT IN stoplist
+            .filter( s1.c.ngram_id == None )
+            .filter( s2.c.ngram_id == None )
         )
-    if start:
-        if isinstance(start, datetime):
-            start_str = start.strftime("%Y-%m-%d %H:%M:%S")
-        else:
-            start_str = str(start)
-
-        # doc_ids matching this limit
-        # TODO s/subqueries/inner joins/ && thanks!
-        starttime_subquery = (session
-                              .query(NodeHyperdata.node_id)
-                              .filter(NodeHyperdata.key=="publication_date")
-                              .filter(NodeHyperdata.value_str >= start_str)
-                              .subquery()
-                             )
-        # direct use of str comparison op because there is consistency b/w
-        # sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
-
-        # the filtering by start limit
-        coocs_query = coocs_query.filter(x1.node_id.in_(starttime_subquery))
-
-    if end:
-        if isinstance(end, datetime):
-            end_str = end.strftime("%Y-%m-%d %H:%M:%S")
-        else:
-            end_str = str(end)
-
-        # TODO s/subqueries/inner joins/ && thanks!
-        endtime_subquery = (session
-                            .query(NodeHyperdata.node_id)
-                            .filter(NodeHyperdata.key=="publication_date")
-                            .filter(NodeHyperdata.value_str <= end_str)
-                            .subquery()
-                           )
-
-        # the filtering by end limit
-        coocs_query = coocs_query.filter(x1.node_id.in_(endtime_subquery))
+    if start or end:
+        Time = aliased(NodeHyperdata)
+
+        coocs_query = (coocs_query
+            .join(Time, Time.node_id == Xindex.node_id)
+            .filter(Time.key == "publication_date")
+        )
+
+    if start:
+        if not isinstance(start, datetime):
+            try:
+                start = datetime.strptime(start, '%Y-%m-%d')
+            except ValueError:
+                raise TypeError("'start' param expects datetime object or '%Y-%m-%d' string")
+
+        # the filtering by start limit
+        coocs_query = coocs_query.filter(Time.value_utc >= start)
+
+    if end:
+        if not isinstance(end, datetime):
+            try:
+                end = datetime.strptime(end, '%Y-%m-%d')
+            except ValueError:
+                raise TypeError("'end' param expects datetime object or '%Y-%m-%d' string")
+
+        # the filtering by end limit
+        coocs_query = coocs_query.filter(Time.value_utc <= end)
     if symmetry_filter:
         # 1 filter taking the symmetry into account
         #  -> halves the work !!
-        #  -> but will prevent direct access to x2's cooccurrences
-        #  -> they will be scattered: recorded under the x1 that preceded x2
-        #  -> retrieval will be more costly, via OR queries like:
+        #  -> but retrieval will be more costly, via OR queries like:
         #        WHERE ngram1 = my_ngram OR ngram2 = my_ngram
-        coocs_query = coocs_query.filter(x1.ngram_id < x2.ngram_id)
+        coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)
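The OR retrieval mentioned in the comment would look like this on the saved matrix (a sketch; cooc_id and my_ngram_id are placeholders):

    # hedged sketch: with symmetry_filter, a given ngram's cooccurrents may sit
    # in either column, so lookups must check both sides
    from sqlalchemy import or_
    pair_rows = (session.query(NodeNgramNgram)
                 .filter(NodeNgramNgram.node_id == cooc_id)   # the COOCCURRENCES node
                 .filter(or_(NodeNgramNgram.ngram1_id == my_ngram_id,
                             NodeNgramNgram.ngram2_id == my_ngram_id))
                 .all()
                )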
-    # ------------
-    # 2 possible upstream filters to reduce the combinatorics
-    #    - e.g. 929k rows => 35k rows
-    #    - here on weight, but it degrades the results
-    #    => conceivable on another metric (cvalue or tfidf?)
-    # coocs_query = coocs_query.filter(x1.weight > 1)
-    # coocs_query = coocs_query.filter(x2.weight > 1)
-    # ------------
-    # 3) OUTPUT FILTERS
+    # 5) OUTPUT FILTERS
     # ------------------
     # threshold
     # £TODO adjust COOC_THRESHOLD a posteriori:
     # ex: sometimes 2 sometimes 4 depending on sparsity
     coocs_query = coocs_query.having(ucooc >= threshold)

-    # 4) EXECUTE QUERY
+    # 6) EXECUTE QUERY
     # ----------------
     #  => storage in our matrix structure
     matrix = WeightedMatrix(coocs_query.all())

-    # -------------------
     # fyi
-    #shape_0 = len({pair[0] for pair in matrix.items})
-    #shape_1 = len({pair[1] for pair in matrix.items})
-    #print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
+    shape_0 = len({pair[0] for pair in matrix.items})
+    shape_1 = len({pair[1] for pair in matrix.items})
+    print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

-    # 5) SAVE
+    # 7) SAVE
     # --------
...