Commit 2791e98e authored by Alexandre Delanoë

Merge branch 'stable' into stable-imt

parents 4e9dc26a 0f3ecfc8
......@@ -25,6 +25,7 @@ session = scoped_session(sessionmaker(bind=engine))
########################################################################
from sqlalchemy.orm import aliased
from sqlalchemy import func, desc
from sqlalchemy.sql.expression import case
########################################################################
# bulk insertions
......
......@@ -62,12 +62,12 @@ def parse_extract_indexhyperdata(corpus):
# apply actions
print('CORPUS #%d' % (corpus.id))
corpus.status('Docs', progress=1)
corpus.save_hyperdata()
session.commit()
parse(corpus)
docs = corpus.children("DOCUMENT").count()
print('CORPUS #%d: parsed %d' % (corpus.id, docs))
extract_ngrams(corpus)
......@@ -242,6 +242,19 @@ def recount(corpus_id):
corpus.save_hyperdata()
session.commit()
# START OF KLUDGE...
from gargantext.models import NodeNgram, DocumentNode
from .ngrams_addition import index_new_ngrams
maplist_id = corpus.children("MAPLIST").first().id
ngram_ids = session.query(NodeNgram.ngram_id.distinct())
indexed_ngrams = ngram_ids.join(DocumentNode).filter(DocumentNode.parent_id==corpus.id)
not_indexed_ngrams = ngram_ids.filter(NodeNgram.node_id==maplist_id,
~NodeNgram.ngram_id.in_(indexed_ngrams))
not_indexed_ngrams = [x[0] for x in not_indexed_ngrams]
added = index_new_ngrams(not_indexed_ngrams, corpus)
print('RECOUNT #%d: [%s] indexed %s ngrams' % (corpus.id, t(), added))
# ...END OF KLUDGE
# -> overwrite occurrences (=> NodeNodeNgram)
occ_id = compute_occs(corpus,
groupings_id = group_id,
......
......@@ -15,8 +15,9 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django.setup()
from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from gargantext.models import Node, ProjectNode, DocumentNode
from gargantext.util.db import session, get_engine, func
from gargantext.models import (Node, ProjectNode, DocumentNode,
Ngram, NodeNgram, NodeNgramNgram, NodeNodeNgram)
from gargantext.util.db import session, get_engine, func, aliased, case
from collections import Counter
import importlib
from django.http import Http404
......@@ -53,20 +54,32 @@ def scan_hal(request):
return hal.scan_results(request)
def _search_docs(corpus_id, request):
    """Query the DOCUMENT children of corpus `corpus_id` whose indexed
    title/abstract full-text column matches `request`."""
    docs = session.query(DocumentNode).filter_by(parent_id=corpus_id)
    return docs.filter(Node.title_abstract.match(request))
# NOTE(review): this span comes from a unified diff; the removed one-argument
# `scan_gargantext` (count-only version) is interleaved below with the body of
# the new `_search_docs` -- the final `scan_gargantext` appears further down.
def _search_docs(corpus_id, request, fast=False):
# Base query: DOCUMENT nodes that are children of the corpus
q = session.query(DocumentNode).filter_by(parent_id=corpus_id)
# Search ngram <request> in hyperdata <field>
# ('~*' is the PostgreSQL case-insensitive regex-match operator)
H = lambda field, request: Node.hyperdata[field].astext.op('~*')(request)
def scan_gargantext(corpus_id, request):
return (_search_docs(corpus_id, request)
.with_entities(func.count(DocumentNode.id.distinct()))
.one())[0]
if not fast:
# Only match <request> starting and ending with word boundary
# Sequence of spaces will match any sequence of spaces
request = '\s+'.join(filter(None, r'\m{}\M'.format(request).split(' ')))
# Fast path: indexed full-text match on title_abstract; slow path:
# regex match on the raw 'title'/'abstract' hyperdata fields
return q.filter(Node.title_abstract.match(request)) if fast else \
q.filter(H('title', request) | H('abstract', request))
def scan_gargantext_and_delete(corpus_id, request):
r = _search_docs(corpus_id, request).delete(synchronize_session='fetch')
def scan_gargantext(corpus_id, request, fast=False, documents=False):
    """Search `request` in a corpus.

    Returns the matching document nodes when `documents` is True, otherwise
    the number of distinct matching documents.
    """
    matching = _search_docs(corpus_id, request, fast)
    if documents:
        return matching.all()
    distinct_count = func.count(DocumentNode.id.distinct())
    return matching.with_entities(distinct_count).one()[0]
def scan_gargantext_and_delete(corpus_id, request, fast=False):
    """Delete every document of the corpus matching `request` and commit;
    return the number of rows deleted."""
    deleted_count = (_search_docs(corpus_id, request, fast)
                     .delete(synchronize_session='fetch'))
    session.commit()
    return deleted_count
......@@ -191,3 +204,80 @@ def run_moissonneur(moissonneur, project, name, query):
session.commit()
return corpus
# Term-list flavors recognized by corpus_list/_ngrams, mapping to the
# MAINLIST/MAPLIST/STOPLIST node typenames (GROUPLIST is handled separately)
ALL_LIST_TYPES = ['main', 'map', 'stop']
def _ngrams(corpus_id, list_types, entities):
    """Build a query selecting `entities` for the ngrams belonging to the
    given list type(s) ('main'/'map'/'stop') of corpus `corpus_id`.

    `list_types` may be a single string or an iterable of strings; values
    not in ALL_LIST_TYPES are silently ignored.
    """
    # Accept a bare string as a one-element tuple of list types
    if isinstance(list_types, str):
        list_types = (list_types,)
    # Translate to Node typenames, eg. 'map' -> 'MAPLIST'
    wanted_typenames = ['{}LIST'.format(t.upper())
                        for t in list_types if t in ALL_LIST_TYPES]
    # `Node` is our list, ie. MAINLIST and/or MAPLIST and/or STOPLIST
    query = session.query(*entities).select_from(Ngram)
    return query.filter(NodeNgram.ngram_id==Ngram.id,
                        NodeNgram.node_id==Node.id,
                        Node.parent_id==corpus_id,
                        Node.typename.in_(wanted_typenames))
def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=False,
with_count=False):
"""Retrieve the term lists of a corpus as (type, terms[, ...]) rows.

Returns a SQLAlchemy query yielding one row per ngram: its list type
('main'/'map'/'stop') and its terms; with `with_synonyms`, grouped
synonyms from the GROUPLIST are appended; with `with_count`, occurrence
scores are joined in (rows become (type, terms, score)).
"""
# Link between a GROUPLIST, a normal form (ngram1), and a synonym (ngram2)
NNN = NodeNgramNgram
# Get the list type from the Node type -- as in CSV export
list_type = (case([(Node.typename=='MAINLIST', 'main'),
(Node.typename=='MAPLIST', 'map'),
(Node.typename=='STOPLIST', 'stop')])
.label('type'))
# We will retrieve each ngram as the following tuple:
entities = (list_type, Ngram.terms.label('ng'))
if with_count:
# The ngram id is needed below to join with the occurrences scores
entities += (Ngram.id.label('id'),)
# First, get ngrams from wanted lists
ngrams = _ngrams(corpus_id, list_types, entities)
# Secondly, exclude "synonyms" (grouped ngrams that are not normal forms).
# We have to exclude synonyms first because data is inconsistent and some
# of them can be both in GROUPLIST and in MAIN/MAP/STOP lists. We want to
# take synonyms from GROUPLIST only -- see below.
Groups = aliased(Node, name='groups')
# Keep only ngrams that never appear as ngram2 (= synonym) of a group
query = (ngrams.outerjoin(Groups, (Groups.parent_id==corpus_id) & (Groups.typename=='GROUPLIST'))
.outerjoin(NNN, (NNN.node_id==Groups.id) & (NNN.ngram2_id==Ngram.id))
.filter(NNN.ngram1_id==None))
# If `with_synonyms` is True, add them from GROUPLIST: this is the reliable
# source for them
if with_synonyms:
Synonym = aliased(Ngram)
# A synonym inherits the list type of its normal form (Ngram)
ent = (list_type, Synonym.terms.label('ng'), Synonym.id.label('id'))
synonyms = (ngrams.with_entities(*ent)
.filter(NNN.ngram1_id==Ngram.id,
NNN.ngram2_id==Synonym.id,
NNN.node_id==Groups.id,
Groups.parent_id==corpus_id,
Groups.typename=='GROUPLIST'))
query = query.union(synonyms)
# Again, data is inconsistent: MAINLIST may intersect with MAPLIST and
# we don't want that
if 'main' in list_types and 'map' not in list_types:
# Exclude MAPLIST ngrams from MAINLIST
query = query.except_(_ngrams(corpus_id, 'map', entities))
if with_count:
# Wrap the term query and left-join each ngram with its score in the
# corpus' OCCURRENCES node (NULL score when never counted)
N = query.subquery()
return (session.query(N.c.type, N.c.ng, NodeNodeNgram.score)
.join(Node, (Node.parent_id==corpus_id) & (Node.typename=='OCCURRENCES'))
.outerjoin(NodeNodeNgram, (NodeNodeNgram.ngram_id==N.c.id) &
(NodeNodeNgram.node1_id==Node.id) &
(NodeNodeNgram.node2_id==corpus_id)))
# Return found ngrams sorted by list type, and then alphabetically
return query.order_by('type', 'ng')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment