Commit 3b2d568c authored by Romain Loth

add groups to ngram_coocs + fix date params + fix stoplist param + remove sql IN operators there

parent 92d5dfcd
......@@ -111,15 +111,11 @@ def parse_extract_indexhyperdata(corpus):
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
# ------------
# -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
occ_id = compute_occs(corpus, groupings_id = group_id)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# ------------
# -> write local tfidf similarities to Node and NodeNodeNgram
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
tirank_id = compute_ti_ranking(corpus,
groupings_id = group_id,
......@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
stoplist_id = stop_id)
print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
# -> write local tfidf similarities to Node and NodeNodeNgram
# TODO only on mainlist
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# => used for doc <=> ngram association
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus, cooc_id=cooc_id)
spec_id = compute_specificity(corpus, cooc_id=cooc_id
# ,groupings_id = group_id
)
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram)
......
......@@ -65,6 +65,9 @@ def do_mainlist(corpus,
ordered_filtered_tfidf = (session
.query(NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == ranking_scores_id)
# NOT IN but speed theoretically ok here
# see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
# but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
.filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
.order_by(desc(NodeNodeNgram.score))
)
......
......@@ -63,7 +63,6 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
# ------------
# (the occurrences are the sums for each ngram's mainform)
else:
print ("gtoup mode")
# sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
syn = (session.query(NodeNgramNgram.ngram1_id,
NodeNgramNgram.ngram2_id)
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment