Commit 7cea952c authored by Romain Loth

FIX: avoid unnecessarily writing the big cooccurrence (cooc) matrix to the database, since it is used only once, in the specificity computation + update maplist

parent 51bc0bf5
...@@ -136,21 +136,23 @@ def parse_extract_indexhyperdata(corpus): ...@@ -136,21 +136,23 @@ def parse_extract_indexhyperdata(corpus):
# ------------ # ------------
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram) # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
# todo: no need to write it ? coocs = compute_coocs(corpus,
cooc_id = compute_coocs(corpus, on_list_id = mainlist_id, groupings_id = group_id) on_list_id = mainlist_id,
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id)) groupings_id = group_id,
just_pass_result = True)
print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))
# -> specificity: compute + write (=> NodeNodeNgram) # -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus, cooc_id=cooc_id spec_id = compute_specificity(corpus,cooc_matrix = coocs)
# ,groupings_id = group_id # no need here for subforms because cooc already counted them in mainform
)
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id)) print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram) # maplist: compute + write (to Node and NodeNgram)
map_id = do_maplist(corpus, map_id = do_maplist(corpus,
mainlist_id = mainlist_id, mainlist_id = mainlist_id,
specificity_id=spec_id, specificity_id=spec_id,
grouplist_id=group_id) grouplist_id=group_id
)
print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id)) print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t())) print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))
...@@ -161,7 +163,7 @@ def parse_extract_indexhyperdata(corpus): ...@@ -161,7 +163,7 @@ def parse_extract_indexhyperdata(corpus):
if DEBUG is False: if DEBUG is False:
print('CORPUS #%d: [%s] FINISHED Sendind email notification' % (corpus.id, t())) print('CORPUS #%d: [%s] FINISHED Sending email notification' % (corpus.id, t()))
notify_owner(corpus) notify_owner(corpus)
corpus.status('Workflow', progress=10, complete=True) corpus.status('Workflow', progress=10, complete=True)
......
...@@ -43,15 +43,11 @@ def do_maplist(corpus, ...@@ -43,15 +43,11 @@ def do_maplist(corpus,
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id) #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
mainterms_subquery = (session MainlistTable = aliased(NodeNgram)
# we want only terms within mainlist
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == mainlist_id)
.subquery()
)
primary_groupterms_subquery = (session IsSubform = (session
# we want only primary terms (ngram1) # we want only secondary terms (ngram2)
# to be able to filter them out
.query(NodeNgramNgram.ngram2_id) .query(NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == grouplist_id) .filter(NodeNgramNgram.node_id == grouplist_id)
.subquery() .subquery()
...@@ -63,8 +59,15 @@ def do_maplist(corpus, ...@@ -63,8 +59,15 @@ def do_maplist(corpus,
query = (session.query(ScoreSpec.ngram_id) query = (session.query(ScoreSpec.ngram_id)
.join(Ngram, Ngram.id == ScoreSpec.ngram_id) .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specificity_id) .filter(ScoreSpec.node_id == specificity_id)
.filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
.filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery)) # we want only terms within mainlist
.join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
.filter(MainlistTable.node_id == mainlist_id)
# we remove all ngrams matching an ngram2_id from the synonyms
.outerjoin(IsSubform,
IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
.filter(IsSubform.c.ngram2_id == None)
) )
# TODO: move these 2 pools up to mainlist selection # TODO: move these 2 pools up to mainlist selection
...@@ -94,7 +97,7 @@ def do_maplist(corpus, ...@@ -94,7 +97,7 @@ def do_maplist(corpus,
new_hyperdata = { 'corpus': corpus.id, new_hyperdata = { 'corpus': corpus.id,
'limit' : limit, 'limit' : limit,
'monograms_part' : monograms_part, 'monograms_part' : monograms_part,
'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else obtained_mono 'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else 0
} }
if overwrite_id: if overwrite_id:
# overwrite pre-existing node # overwrite pre-existing node
......
...@@ -9,7 +9,7 @@ from collections import defaultdict ...@@ -9,7 +9,7 @@ from collections import defaultdict
from pandas import DataFrame from pandas import DataFrame
import pandas as pd import pandas as pd
def compute_specificity(corpus, cooc_id=None, overwrite_id = None): def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id = None):
''' '''
Compute the specificity, simple calculus. Compute the specificity, simple calculus.
...@@ -18,17 +18,25 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None): ...@@ -18,17 +18,25 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
- overwrite_id: optional preexisting specificity node to overwrite - overwrite_id: optional preexisting specificity node to overwrite
''' '''
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: new choice cooc already filtered on tfidf before creation
matrix = defaultdict(lambda : defaultdict(float)) matrix = defaultdict(lambda : defaultdict(float))
# £TODO re-rename weight => score if cooc_id == None and cooc_matrix == None:
for cooccurrence in cooccurrences: raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight elif cooc_id:
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: cooc already filtered on mainlist_id at creation
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
elif cooc_matrix:
# copy WeightedMatrix into local matrix structure
for (ngram1_id, ngram2_id) in cooc_matrix.items:
w = cooc_matrix.items[(ngram1_id, ngram2_id)]
matrix[ngram1_id][ngram2_id] = w
nb_ngrams = len(matrix) nb_ngrams = len(matrix)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment