Commit aa848fd9 authored by Romain Loth

clarify

parent 049dc862
......@@ -111,8 +111,8 @@ def parse_extract_indexhyperdata(corpus):
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occs(corpus)
# -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
occ_id = compute_occs(corpus, groupings_id = group_id)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# ------------
......@@ -120,11 +120,11 @@ def parse_extract_indexhyperdata(corpus):
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
# -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
tirank_id = compute_ti_ranking(corpus,
count_scope="global",
termset_scope="local")
print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
groupings_id = group_id,
count_scope="global")
print('CORPUS #%d: [%s] new ti ranking node #%i' % (corpus.id, t(), tirank_id))
# -> mainlist: filter + write (to Node and NodeNgram)
mainlist_id = do_mainlist(corpus,
......
......@@ -85,14 +85,6 @@ def compute_coocs( corpus,
# 1,859,408 rows for the simple cooc query
# 71,134 rows when restricted to ngrams that have an occ > 1 (weight)
# docs of our corpus
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
)
# 2 x the occurrence index table
x1 = aliased(NodeNgram)
x2 = aliased(NodeNgram)
......@@ -105,11 +97,9 @@ def compute_coocs( corpus,
session.query(x1.ngram_id, x2.ngram_id, ucooc)
.join(Node, Node.id == x1.node_id) # <- b/c within corpus
.join(x2, x1.node_id == Node.id ) # <- b/c within corpus
.filter(Node.parent_id == corpus.id) # <- b/c within corpus
.filter(Node.typename == "DOCUMENT") # <- b/c within corpus
.filter(x1.node_id == x2.node_id) # <- by definition of cooc
.filter(x1.ngram_id != x2.ngram_id) # <- b/c not with itself
.group_by(x1.ngram_id, x2.ngram_id)
......@@ -120,7 +110,7 @@ def compute_coocs( corpus,
m1 = aliased(NodeNgram)
m2 = aliased(NodeNgram)
coocs_query = ( coocs_query
.join(m1, m1.ngram_id == x1.ngram_id)
.join(m2, m2.ngram_id == x2.ngram_id)
......@@ -211,9 +201,9 @@ def compute_coocs( corpus,
matrix = WeightedMatrix(coocs_query.all())
# fyi
shape_0 = len({pair[0] for pair in matrix.items})
shape_1 = len({pair[1] for pair in matrix.items})
print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
#shape_0 = len({pair[0] for pair in matrix.items})
#shape_1 = len({pair[1] for pair in matrix.items})
#print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
# 5) SAVE
# --------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment