Commit 7a141a02 authored by delanoe

[NGRAMS] workflow fixes.

parent 91e14e3e
...@@ -52,7 +52,7 @@ def do_maplist(corpus, ...@@ -52,7 +52,7 @@ def do_maplist(corpus,
primary_groupterms_subquery = (session primary_groupterms_subquery = (session
# we want only primary terms (ngram1) # we want only primary terms (ngram1)
.query(NodeNgramNgram.ngram1_id) .query(NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == grouplist_id) .filter(NodeNgramNgram.node_id == grouplist_id)
.subquery() .subquery()
) )
...@@ -64,7 +64,7 @@ def do_maplist(corpus, ...@@ -64,7 +64,7 @@ def do_maplist(corpus,
.join(Ngram, Ngram.id == ScoreSpec.ngram_id) .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specificity_id) .filter(ScoreSpec.node_id == specificity_id)
.filter(ScoreSpec.ngram_id.in_(mainterms_subquery)) .filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
.filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery)) .filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery))
) )
# TODO: move these 2 pools up to mainlist selection # TODO: move these 2 pools up to mainlist selection
...@@ -81,7 +81,7 @@ def do_maplist(corpus, ...@@ -81,7 +81,7 @@ def do_maplist(corpus,
.limit(multigrams_limit) .limit(multigrams_limit)
.all() .all()
) )
obtained_mono = len(top_monograms) obtained_mono = len(top_monograms)
obtained_multi = len(top_multigrams) obtained_multi = len(top_multigrams)
obtained_total = obtained_mono + obtained_multi obtained_total = obtained_mono + obtained_multi
# print("MAPLIST: top_monograms =", obtained_mono) # print("MAPLIST: top_monograms =", obtained_mono)
......
...@@ -8,7 +8,7 @@ FIXME: "having the same source" means we need to select inside hyperdata ...@@ -8,7 +8,7 @@ FIXME: "having the same source" means we need to select inside hyperdata
with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ... with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
""" """
from gargantext.models import Node, NodeNgram, NodeNodeNgram from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count() from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
from sqlalchemy import text # for query from raw SQL statement from sqlalchemy import text # for query from raw SQL statement
from math import log from math import log
...@@ -29,6 +29,13 @@ def compute_occs(corpus, overwrite_id = None): ...@@ -29,6 +29,13 @@ def compute_occs(corpus, overwrite_id = None):
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced) (the Node and its previous NodeNodeNgram rows will be replaced)
""" """
# 0) Get the groups
group_id = (session.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "GROUPLIST")
.first()
)
# 1) all the doc_ids of our corpus (scope of counts for filter) # 1) all the doc_ids of our corpus (scope of counts for filter)
# slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()] # slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
...@@ -45,6 +52,7 @@ def compute_occs(corpus, overwrite_id = None): ...@@ -45,6 +52,7 @@ def compute_occs(corpus, overwrite_id = None):
NodeNgram.ngram_id, NodeNgram.ngram_id,
func.sum(NodeNgram.weight) func.sum(NodeNgram.weight)
) )
#.join(NodeNgramNgram, NodeNgramNgram.node_id == group_id)
.filter(NodeNgram.node_id.in_(docids_subquery)) .filter(NodeNgram.node_id.in_(docids_subquery))
.group_by(NodeNgram.ngram_id) .group_by(NodeNgram.ngram_id)
.all() .all()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment