Commit aa848fd9 authored by Romain Loth

clarify

parent 049dc862
@@ -111,8 +111,8 @@ def parse_extract_indexhyperdata(corpus):
     group_id = compute_groups(corpus, stoplist_id = None)
     print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))

-    # -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
-    occ_id = compute_occs(corpus)
+    # -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
+    occ_id = compute_occs(corpus, groupings_id = group_id)
     print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

     # ------------
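Note: the new groupings_id argument passed to compute_occs() suggests that occurrence counts are now cumulated over term groups (a mainform together with its grouped variants) instead of being counted per raw ngram. The body of compute_occs is not part of this diff, so the following is only an illustrative sketch of that aggregation idea on in-memory data; cumulate_occs_over_groups and its inputs are hypothetical names, not gargantext code.

from collections import Counter

def cumulate_occs_over_groups(occurrences, groups):
    """Fold the occurrence count of each grouped subform onto its mainform.

    occurrences: Counter {ngram_id: occurrence count}
    groups:      dict {subform_id: mainform_id}
    (hypothetical helper, for illustration only)
    """
    cumulated = Counter()
    for ngram_id, occ in occurrences.items():
        # a subform contributes its occurrences to its mainform
        cumulated[groups.get(ngram_id, ngram_id)] += occ
    return cumulated

# toy data: ngram 3 is grouped under mainform 1
print(cumulate_occs_over_groups(Counter({1: 10, 2: 4, 3: 6}), {3: 1}))
# -> Counter({1: 16, 2: 4})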
@@ -120,11 +120,11 @@ def parse_extract_indexhyperdata(corpus):
     ltfidf_id = compute_tfidf_local(corpus)
     print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))

-    # -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
+    # -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
     tirank_id = compute_ti_ranking(corpus,
-                                   count_scope="global",
-                                   termset_scope="local")
-    print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
+                                   groupings_id = group_id,
+                                   count_scope="global")
+    print('CORPUS #%d: [%s] new ti ranking node #%i' % (corpus.id, t(), tirank_id))

     # -> mainlist: filter + write (to Node and NodeNgram)
     mainlist_id = do_mainlist(corpus,
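For context on the two functions touched above: compute_tfidf_local and compute_ti_ranking both build on tf-idf style weighting, but the exact variant and its cumulation over groups are defined elsewhere and not visible in this diff. The snippet below is only a generic reminder of the textbook formula, with illustrative names.

from math import log

def tfidf(term_count_in_doc, doc_length, n_docs, n_docs_with_term):
    # tf: relative frequency of the term in one document
    tf = term_count_in_doc / doc_length
    # idf: penalise terms that appear in many documents of the corpus
    idf = log(n_docs / (1 + n_docs_with_term))
    return tf * idf

# toy example: a term occurring 3 times in a 100-word document,
# present in 10 out of 1000 documents
print(tfidf(3, 100, 1000, 10))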
@@ -85,14 +85,6 @@ def compute_coocs( corpus,
     # 1.859.408 rows for the simple cooc query
     # 71.134 rows when restricting to ngrams with occ > 1 (weight)

-    # docs of our corpus
-    docids_subquery = (session
-                        .query(Node.id)
-                        .filter(Node.parent_id == corpus.id)
-                        .filter(Node.typename == "DOCUMENT")
-                        .subquery()
-                       )
-
     # 2 x the occurrence index table
     x1 = aliased(NodeNgram)
     x2 = aliased(NodeNgram)
@@ -105,11 +97,9 @@ def compute_coocs( corpus,
         session.query(x1.ngram_id, x2.ngram_id, ucooc)
         .join(Node, Node.id == x1.node_id)     # <- b/c within corpus
         .join(x2, x1.node_id == Node.id )      # <- b/c within corpus
         .filter(Node.parent_id == corpus.id)   # <- b/c within corpus
         .filter(Node.typename == "DOCUMENT")   # <- b/c within corpus
         .filter(x1.node_id == x2.node_id)      # <- by definition of cooc
         .filter(x1.ngram_id != x2.ngram_id)    # <- b/c not with itself
         .group_by(x1.ngram_id, x2.ngram_id)
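The query above restricts the co-occurrence self-join to the documents of the corpus directly through the join on Node, which is why the docids_subquery removed in the previous hunk had become redundant. To make that join pattern concrete, here is a self-contained sketch on a toy SQLite schema; the Node and NodeNgram classes below are simplified stand-ins for gargantext's models (the real tables have more columns), and the second join condition is written directly as x1.node_id == x2.node_id instead of going through the extra filter.

from sqlalchemy import Column, Integer, String, create_engine, func
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import aliased, sessionmaker

Base = declarative_base()

class Node(Base):
    __tablename__ = 'nodes'
    id = Column(Integer, primary_key=True)
    parent_id = Column(Integer)
    typename = Column(String)

class NodeNgram(Base):
    __tablename__ = 'nodes_ngrams'
    node_id = Column(Integer, primary_key=True)
    ngram_id = Column(Integer, primary_key=True)
    weight = Column(Integer)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

# corpus #1 with two documents; ngram 100 appears in both,
# 101 only in doc 10 and 102 only in doc 11
session.add_all([
    Node(id=1, parent_id=None, typename="CORPUS"),
    Node(id=10, parent_id=1, typename="DOCUMENT"),
    Node(id=11, parent_id=1, typename="DOCUMENT"),
    NodeNgram(node_id=10, ngram_id=100, weight=1),
    NodeNgram(node_id=10, ngram_id=101, weight=1),
    NodeNgram(node_id=11, ngram_id=100, weight=1),
    NodeNgram(node_id=11, ngram_id=102, weight=1),
])
session.commit()

corpus_id = 1
x1 = aliased(NodeNgram)
x2 = aliased(NodeNgram)
ucooc = func.count(x1.ngram_id).label("ucooc")

coocs_query = (
    session.query(x1.ngram_id, x2.ngram_id, ucooc)
    .join(Node, Node.id == x1.node_id)       # <- b/c within corpus
    .join(x2, x1.node_id == x2.node_id)      # <- same document => cooccurrence
    .filter(Node.parent_id == corpus_id)     # <- b/c within corpus
    .filter(Node.typename == "DOCUMENT")     # <- b/c within corpus
    .filter(x1.ngram_id != x2.ngram_id)      # <- not with itself
    .group_by(x1.ngram_id, x2.ngram_id)
)

for ngram1, ngram2, count in coocs_query.all():
    print(ngram1, ngram2, count)   # e.g. 100 101 1, 100 102 1, 101 100 1, 102 100 1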
@@ -120,7 +110,7 @@ def compute_coocs( corpus,
         m1 = aliased(NodeNgram)
         m2 = aliased(NodeNgram)
         coocs_query = ( coocs_query
                            .join(m1, m1.ngram_id == x1.ngram_id)
                            .join(m2, m2.ngram_id == x2.ngram_id)
@@ -211,9 +201,9 @@ def compute_coocs( corpus,
     matrix = WeightedMatrix(coocs_query.all())

     # fyi
-    shape_0 = len({pair[0] for pair in matrix.items})
-    shape_1 = len({pair[1] for pair in matrix.items})
-    print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
+    #shape_0 = len({pair[0] for pair in matrix.items})
+    #shape_1 = len({pair[1] for pair in matrix.items})
+    #print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

     # 5) SAVE
     # --------
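The three lines commented out above only logged the shape of the co-occurrence matrix; the idea reduces to counting the distinct first and second members of the coordinate pairs. Below, items is a toy stand-in for WeightedMatrix.items, whose exact structure is not shown in this diff.

# toy stand-in: (ngram_id_1, ngram_id_2) -> weight coordinate pairs
items = {(100, 101): 3, (100, 102): 1, (101, 100): 3}

shape_0 = len({pair[0] for pair in items})   # distinct row ngrams    -> 2
shape_1 = len({pair[1] for pair in items})   # distinct column ngrams -> 3
print("COOCS: matrix shape [%ix%i]" % (shape_0, shape_1))   # COOCS: matrix shape [2x3]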