Commit 51bc0bf5 authored by Romain Loth's avatar Romain Loth

add option to not write coocs but just pass the matrix result

parent e52afd97
......@@ -6,12 +6,12 @@ from .hyperdata_indexing import index_hyperdata
# in usual run order
from .list_stop import do_stoplist
from .ngram_groups import compute_groups
from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
from .list_main import do_mainlist
from .ngram_coocs import compute_coocs
from .metric_specificity import compute_specificity
from .list_map import do_maplist # TEST
from .ngram_groups import compute_groups
from .mail_notification import notify_owner
from gargantext.util.db import session
from gargantext.models import Node
......@@ -135,8 +135,9 @@ def parse_extract_indexhyperdata(corpus):
# => used for doc <=> ngram association
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
# TODO: check whether the coocs really need to be written to the DB here
cooc_id = compute_coocs(corpus, on_list_id = mainlist_id, groupings_id = group_id)
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
# -> specificity: compute + write (=> NodeNodeNgram)
......
......@@ -10,9 +10,11 @@ from sqlalchemy.sql.expression import case # for choice if ngram has mainform or
def compute_coocs( corpus,
overwrite_id = None,
just_pass_result= True, # just return the WeightedMatrix,
# (don't write to DB)
threshold = DEFAULT_COOC_THRESHOLD,
groupings_id = None,
mainlist_id = None,
on_list_id = None,
stoplist_id = None,
start = None,
end = None,
......@@ -46,7 +48,7 @@ def compute_coocs( corpus,
- threshold: on output cooc count (previously called hapax)
- groupings_id: optional synonym relations to add all subform counts
with their mainform's counts
- mainlist_id: mainlist to constrain the input ngrams
- on_list_id: id of a list node (mainlist or maplist) used to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is already provided)
- start, end: provide one or both temporal limits to filter on doc date
......@@ -62,9 +64,10 @@ def compute_coocs( corpus,
======================
each pair of ngrams sharing same doc (node_id)
SELECT idxa.ngram_id, idxb.ngram_id
FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
FROM nodes_ngrams AS idxa
---------------------------------
WHERE idxa.node_id = idxb.node_id <== that's cooc
JOIN nodes_ngrams AS idxb
ON idxa.node_id = idxb.node_id <== that's cooc
---------------------------------
AND idxa.ngram_id <> idxb.ngram_id
AND idxa.node_id = MY_DOC ;
......@@ -188,7 +191,7 @@ def compute_coocs( corpus,
# 4) INPUT FILTERS (reduce N before O(N²))
if mainlist_id:
if on_list_id:
m1 = aliased(NodeNgram)
m2 = aliased(NodeNgram)
......@@ -197,8 +200,8 @@ def compute_coocs( corpus,
.join(m1, m1.ngram_id == Xindex_ngform_id)
.join(m2, m2.ngram_id == Yindex_ngform_id)
.filter( m1.node_id == mainlist_id )
.filter( m2.node_id == mainlist_id )
.filter( m1.node_id == on_list_id )
.filter( m2.node_id == on_list_id )
)
if stoplist_id:
......@@ -279,31 +282,36 @@ def compute_coocs( corpus,
shape_1 = len({pair[1] for pair in matrix.items})
print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
# 5) SAVE
# --------
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus' : corpus.id,
'threshold': threshold }
if overwrite_id:
# overwrite pre-existing id
the_cooc = cache.Node[overwrite_id]
the_cooc.hyperdata = new_hyperdata
the_cooc.save_hyperdata()
session.commit()
the_id = overwrite_id
else:
# create the new cooc node
the_cooc = corpus.add_child(
typename = "COOCCURRENCES",
name = "Coocs (in:%s)" % corpus.name[0:10],
hyperdata = new_hyperdata,
)
session.add(the_cooc)
session.commit()
the_id = the_cooc.id
# ==> save all NodeNgramNgram with link to new cooc node id
matrix.save(the_id)
return the_id
if just_pass_result:
return matrix
else:
# 5) SAVE
# --------
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus' : corpus.id,
'threshold': threshold }
if overwrite_id:
# overwrite pre-existing id
the_cooc = cache.Node[overwrite_id]
the_cooc.hyperdata = new_hyperdata
the_cooc.save_hyperdata()
session.commit()
the_id = overwrite_id
else:
# create the new cooc node
the_cooc = corpus.add_child(
typename = "COOCCURRENCES",
name = "Coocs (in:%s)" % corpus.name[0:10],
hyperdata = new_hyperdata,
)
session.add(the_cooc)
session.commit()
the_id = the_cooc.id
# ==> save all NodeNgramNgram with link to new cooc node id
matrix.save(the_id)
return the_id
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment