Commit 7a141a02 authored by delanoe

[NGRAMS] workflow fixes.

parent 91e14e3e
......@@ -52,7 +52,7 @@ def do_maplist(corpus,
primary_groupterms_subquery = (session
# we want only primary terms (ngram1)
.query(NodeNgramNgram.ngram1_id)
.query(NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == grouplist_id)
.subquery()
)
......@@ -64,7 +64,7 @@ def do_maplist(corpus,
.join(Ngram, Ngram.id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specificity_id)
.filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
.filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
.filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery))
)
# TODO: move these 2 pools up to mainlist selection
......@@ -81,7 +81,7 @@ def do_maplist(corpus,
.limit(multigrams_limit)
.all()
)
obtained_mono = len(top_monograms)
obtained_mono = len(top_monograms)
obtained_multi = len(top_multigrams)
obtained_total = obtained_mono + obtained_multi
# print("MAPLIST: top_monograms =", obtained_mono)
......
......@@ -8,7 +8,7 @@ FIXME: "having the same source" means we need to select inside hyperdata
with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
from sqlalchemy import text # for query from raw SQL statement
from math import log
......@@ -29,6 +29,13 @@ def compute_occs(corpus, overwrite_id = None):
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
# 0) Get the groups
group_id = (session.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "GROUPLIST")
.first()
)
# 1) all the doc_ids of our corpus (scope of counts for filter)
# slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
......@@ -45,6 +52,7 @@ def compute_occs(corpus, overwrite_id = None):
NodeNgram.ngram_id,
func.sum(NodeNgram.weight)
)
#.join(NodeNgramNgram, NodeNgramNgram.node_id == group_id)
.filter(NodeNgram.node_id.in_(docids_subquery))
.group_by(NodeNgram.ngram_id)
.all()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment