Commit 7cea952c authored by Romain Loth

FIX: avoid unnecessarily writing the big cooc matrix (it is used only once, in the specificity computation) + update maplist

parent 51bc0bf5
......@@ -136,21 +136,23 @@ def parse_extract_indexhyperdata(corpus):
# ------------
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
# todo: no need to write it ?
cooc_id = compute_coocs(corpus, on_list_id = mainlist_id, groupings_id = group_id)
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
coocs = compute_coocs(corpus,
on_list_id = mainlist_id,
groupings_id = group_id,
just_pass_result = True)
print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus, cooc_id=cooc_id
# ,groupings_id = group_id
)
spec_id = compute_specificity(corpus,cooc_matrix = coocs)
# no need here for subforms because cooc already counted them in mainform
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram)
# maplist: compute + write (to Node and NodeNgram)
map_id = do_maplist(corpus,
mainlist_id = mainlist_id,
specificity_id=spec_id,
grouplist_id=group_id)
grouplist_id=group_id
)
print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))
......@@ -161,7 +163,7 @@ def parse_extract_indexhyperdata(corpus):
if DEBUG is False:
print('CORPUS #%d: [%s] FINISHED Sendind email notification' % (corpus.id, t()))
print('CORPUS #%d: [%s] FINISHED Sending email notification' % (corpus.id, t()))
notify_owner(corpus)
corpus.status('Workflow', progress=10, complete=True)
......
......@@ -43,15 +43,11 @@ def do_maplist(corpus,
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
mainterms_subquery = (session
# we want only terms within mainlist
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == mainlist_id)
.subquery()
)
MainlistTable = aliased(NodeNgram)
primary_groupterms_subquery = (session
# we want only primary terms (ngram1)
IsSubform = (session
# we want only secondary terms (ngram2)
# to be able to filter them out
.query(NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == grouplist_id)
.subquery()
......@@ -63,8 +59,15 @@ def do_maplist(corpus,
query = (session.query(ScoreSpec.ngram_id)
.join(Ngram, Ngram.id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specificity_id)
.filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
.filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery))
# we want only terms within mainlist
.join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
.filter(MainlistTable.node_id == mainlist_id)
# we remove all ngrams matching an ngram2_id from the synonyms
.outerjoin(IsSubform,
IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
.filter(IsSubform.c.ngram2_id == None)
)
# TODO: move these 2 pools up to mainlist selection
......@@ -94,7 +97,7 @@ def do_maplist(corpus,
new_hyperdata = { 'corpus': corpus.id,
'limit' : limit,
'monograms_part' : monograms_part,
'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else obtained_mono
'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else 0
}
if overwrite_id:
# overwrite pre-existing node
......
......@@ -9,7 +9,7 @@ from collections import defaultdict
from pandas import DataFrame
import pandas as pd
def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id = None):
'''
Compute the specificity, simple calculus.
......@@ -18,17 +18,25 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
- overwrite_id: optional preexisting specificity node to overwrite
'''
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: new choice cooc already filtered on tfidf before creation
matrix = defaultdict(lambda : defaultdict(float))
# £TODO re-rename weight => score
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
if cooc_id == None and cooc_matrix == None:
raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
elif cooc_id:
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: cooc already filtered on mainlist_id at creation
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
elif cooc_matrix:
# copy WeightedMatrix into local matrix structure
for (ngram1_id, ngram2_id) in cooc_matrix.items:
w = cooc_matrix.items[(ngram1_id, ngram2_id)]
matrix[ngram1_id][ngram2_id] = w
nb_ngrams = len(matrix)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment