Commit 1ad1ec6b authored by sim

Notebook: corpus_list(..., with_count=True) to get ngram occurrence count

parent fd3d5bbc
...@@ -16,7 +16,7 @@ django.setup() ...@@ -16,7 +16,7 @@ django.setup()
from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from gargantext.models import (Node, ProjectNode, DocumentNode, from gargantext.models import (Node, ProjectNode, DocumentNode,
Ngram, NodeNgram, NodeNgramNgram) Ngram, NodeNgram, NodeNgramNgram, NodeNodeNgram)
from gargantext.util.db import session, get_engine, func, aliased, case from gargantext.util.db import session, get_engine, func, aliased, case
from collections import Counter from collections import Counter
import importlib import importlib
...@@ -223,7 +223,8 @@ def _ngrams(corpus_id, list_types, entities): ...@@ -223,7 +223,8 @@ def _ngrams(corpus_id, list_types, entities):
Node.typename.in_(list_typenames))) Node.typename.in_(list_typenames)))
def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=True): def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=False,
with_count=False):
# Link between a GROUPLIST, a normal form (ngram1), and a synonym (ngram2) # Link between a GROUPLIST, a normal form (ngram1), and a synonym (ngram2)
NNN = NodeNgramNgram NNN = NodeNgramNgram
...@@ -236,6 +237,9 @@ def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=True): ...@@ -236,6 +237,9 @@ def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=True):
# We will retrieve each ngram as the following tuple: # We will retrieve each ngram as the following tuple:
entities = (list_type, Ngram.terms.label('ng')) entities = (list_type, Ngram.terms.label('ng'))
if with_count:
entities += (Ngram.id.label('id'),)
# First, get ngrams from wanted lists # First, get ngrams from wanted lists
ngrams = _ngrams(corpus_id, list_types, entities) ngrams = _ngrams(corpus_id, list_types, entities)
...@@ -252,7 +256,8 @@ def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=True): ...@@ -252,7 +256,8 @@ def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=True):
# source for them # source for them
if with_synonyms: if with_synonyms:
Synonym = aliased(Ngram) Synonym = aliased(Ngram)
synonyms = (ngrams.with_entities(list_type, Synonym.terms.label('ng')) ent = (list_type, Synonym.terms.label('ng'), Synonym.id.label('id'))
synonyms = (ngrams.with_entities(*ent)
.filter(NNN.ngram1_id==Ngram.id, .filter(NNN.ngram1_id==Ngram.id,
NNN.ngram2_id==Synonym.id, NNN.ngram2_id==Synonym.id,
NNN.node_id==Groups.id, NNN.node_id==Groups.id,
...@@ -264,8 +269,15 @@ def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=True): ...@@ -264,8 +269,15 @@ def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=True):
# we don't want that # we don't want that
if 'main' in list_types and 'map' not in list_types: if 'main' in list_types and 'map' not in list_types:
# Exclude MAPLIST ngrams from MAINLIST # Exclude MAPLIST ngrams from MAINLIST
entities = ("'main'", entities[1])
query = query.except_(_ngrams(corpus_id, 'map', entities)) query = query.except_(_ngrams(corpus_id, 'map', entities))
if with_count:
N = query.subquery()
return (session.query(N.c.type, N.c.ng, NodeNodeNgram.score)
.join(Node, (Node.parent_id==corpus_id) & (Node.typename=='OCCURRENCES'))
.outerjoin(NodeNodeNgram, (NodeNodeNgram.ngram_id==N.c.id) &
(NodeNodeNgram.node1_id==Node.id) &
(NodeNodeNgram.node2_id==corpus_id)))
# Return found ngrams sorted by list type, and then alphabetically # Return found ngrams sorted by list type, and then alphabetically
return query.order_by('type', 'ng') return query.order_by('type', 'ng')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment