Commit 2791e98e authored by Alexandre Delanoë

Merge branch 'stable' into stable-imt

parents 4e9dc26a 0f3ecfc8
......@@ -25,6 +25,7 @@ session = scoped_session(sessionmaker(bind=engine))
########################################################################
from sqlalchemy.orm import aliased
from sqlalchemy import func, desc
from sqlalchemy.sql.expression import case
########################################################################
# bulk insertions
......
......@@ -62,12 +62,12 @@ def parse_extract_indexhyperdata(corpus):
# apply actions
print('CORPUS #%d' % (corpus.id))
corpus.status('Docs', progress=1)
corpus.save_hyperdata()
session.commit()
parse(corpus)
docs = corpus.children("DOCUMENT").count()
print('CORPUS #%d: parsed %d' % (corpus.id, docs))
extract_ngrams(corpus)
......@@ -242,6 +242,19 @@ def recount(corpus_id):
corpus.save_hyperdata()
session.commit()
# START OF KLUDGE...
from gargantext.models import NodeNgram, DocumentNode
from .ngrams_addition import index_new_ngrams
maplist_id = corpus.children("MAPLIST").first().id
ngram_ids = session.query(NodeNgram.ngram_id.distinct())
indexed_ngrams = ngram_ids.join(DocumentNode).filter(DocumentNode.parent_id==corpus.id)
not_indexed_ngrams = ngram_ids.filter(NodeNgram.node_id==maplist_id,
~NodeNgram.ngram_id.in_(indexed_ngrams))
not_indexed_ngrams = [x[0] for x in not_indexed_ngrams]
added = index_new_ngrams(not_indexed_ngrams, corpus)
print('RECOUNT #%d: [%s] indexed %s ngrams' % (corpus.id, t(), added))
# ...END OF KLUDGE
# -> overwrite occurrences (=> NodeNodeNgram)
occ_id = compute_occs(corpus,
groupings_id = group_id,
......
......@@ -15,8 +15,9 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django.setup()
from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from gargantext.models import Node, ProjectNode, DocumentNode
from gargantext.util.db import session, get_engine, func
from gargantext.models import (Node, ProjectNode, DocumentNode,
Ngram, NodeNgram, NodeNgramNgram, NodeNodeNgram)
from gargantext.util.db import session, get_engine, func, aliased, case
from collections import Counter
import importlib
from django.http import Http404
......@@ -53,20 +54,32 @@ def scan_hal(request):
return hal.scan_results(request)
def _search_docs(corpus_id, request):
    """Query the DOCUMENT children of corpus `corpus_id` whose indexed
    title/abstract full-text column matches `request`."""
    docs = session.query(DocumentNode).filter_by(parent_id=corpus_id)
    return docs.filter(Node.title_abstract.match(request))
# NOTE(review): this span comes from a unified diff; the removed one-argument
# `scan_gargantext` (count-only version) is interleaved below with the body of
# the new `_search_docs` -- the final `scan_gargantext` appears further down.
def _search_docs(corpus_id, request, fast=False):
# Base query: DOCUMENT nodes that are children of the corpus
q = session.query(DocumentNode).filter_by(parent_id=corpus_id)
# Search ngram <request> in hyperdata <field>
# ('~*' is the PostgreSQL case-insensitive regex-match operator)
H = lambda field, request: Node.hyperdata[field].astext.op('~*')(request)
def scan_gargantext(corpus_id, request):
return (_search_docs(corpus_id, request)
.with_entities(func.count(DocumentNode.id.distinct()))
.one())[0]
if not fast:
# Only match <request> starting and ending with word boundary
# Sequence of spaces will match any sequence of spaces
request = '\s+'.join(filter(None, r'\m{}\M'.format(request).split(' ')))
# Fast path: indexed full-text match on title_abstract; slow path:
# regex match on the raw 'title'/'abstract' hyperdata fields
return q.filter(Node.title_abstract.match(request)) if fast else \
q.filter(H('title', request) | H('abstract', request))
def scan_gargantext_and_delete(corpus_id, request):
r = _search_docs(corpus_id, request).delete(synchronize_session='fetch')
def scan_gargantext(corpus_id, request, fast=False, documents=False):
    """Search `request` in a corpus.

    Returns the matching document nodes when `documents` is True, otherwise
    the number of distinct matching documents.
    """
    matching = _search_docs(corpus_id, request, fast)
    if documents:
        return matching.all()
    distinct_count = func.count(DocumentNode.id.distinct())
    return matching.with_entities(distinct_count).one()[0]
def scan_gargantext_and_delete(corpus_id, request, fast=False):
    """Delete every document of the corpus matching `request` and commit;
    return the number of rows deleted."""
    deleted_count = (_search_docs(corpus_id, request, fast)
                     .delete(synchronize_session='fetch'))
    session.commit()
    return deleted_count
......@@ -191,3 +204,80 @@ def run_moissonneur(moissonneur, project, name, query):
session.commit()
return corpus
# Term-list flavors recognized by corpus_list/_ngrams, mapping to the
# MAINLIST/MAPLIST/STOPLIST node typenames (GROUPLIST is handled separately)
ALL_LIST_TYPES = ['main', 'map', 'stop']
def _ngrams(corpus_id, list_types, entities):
    """Build a query selecting `entities` for the ngrams belonging to the
    given list type(s) ('main'/'map'/'stop') of corpus `corpus_id`.

    `list_types` may be a single string or an iterable of strings; values
    not in ALL_LIST_TYPES are silently ignored.
    """
    # Accept a bare string as a one-element tuple of list types
    if isinstance(list_types, str):
        list_types = (list_types,)
    # Translate to Node typenames, eg. 'map' -> 'MAPLIST'
    wanted_typenames = ['{}LIST'.format(t.upper())
                        for t in list_types if t in ALL_LIST_TYPES]
    # `Node` is our list, ie. MAINLIST and/or MAPLIST and/or STOPLIST
    query = session.query(*entities).select_from(Ngram)
    return query.filter(NodeNgram.ngram_id==Ngram.id,
                        NodeNgram.node_id==Node.id,
                        Node.parent_id==corpus_id,
                        Node.typename.in_(wanted_typenames))
def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=False,
with_count=False):
"""Retrieve the term lists of a corpus as (type, terms[, ...]) rows.

Returns a SQLAlchemy query yielding one row per ngram: its list type
('main'/'map'/'stop') and its terms; with `with_synonyms`, grouped
synonyms from the GROUPLIST are appended; with `with_count`, occurrence
scores are joined in (rows become (type, terms, score)).
"""
# Link between a GROUPLIST, a normal form (ngram1), and a synonym (ngram2)
NNN = NodeNgramNgram
# Get the list type from the Node type -- as in CSV export
list_type = (case([(Node.typename=='MAINLIST', 'main'),
(Node.typename=='MAPLIST', 'map'),
(Node.typename=='STOPLIST', 'stop')])
.label('type'))
# We will retrieve each ngram as the following tuple:
entities = (list_type, Ngram.terms.label('ng'))
if with_count:
# The ngram id is needed below to join with the occurrences scores
entities += (Ngram.id.label('id'),)
# First, get ngrams from wanted lists
ngrams = _ngrams(corpus_id, list_types, entities)
# Secondly, exclude "synonyms" (grouped ngrams that are not normal forms).
# We have to exclude synonyms first because data is inconsistent and some
# of them can be both in GROUPLIST and in MAIN/MAP/STOP lists. We want to
# take synonyms from GROUPLIST only -- see below.
Groups = aliased(Node, name='groups')
# Keep only ngrams that never appear as ngram2 (= synonym) of a group
query = (ngrams.outerjoin(Groups, (Groups.parent_id==corpus_id) & (Groups.typename=='GROUPLIST'))
.outerjoin(NNN, (NNN.node_id==Groups.id) & (NNN.ngram2_id==Ngram.id))
.filter(NNN.ngram1_id==None))
# If `with_synonyms` is True, add them from GROUPLIST: this is the reliable
# source for them
if with_synonyms:
Synonym = aliased(Ngram)
# A synonym inherits the list type of its normal form (Ngram)
ent = (list_type, Synonym.terms.label('ng'), Synonym.id.label('id'))
synonyms = (ngrams.with_entities(*ent)
.filter(NNN.ngram1_id==Ngram.id,
NNN.ngram2_id==Synonym.id,
NNN.node_id==Groups.id,
Groups.parent_id==corpus_id,
Groups.typename=='GROUPLIST'))
query = query.union(synonyms)
# Again, data is inconsistent: MAINLIST may intersect with MAPLIST and
# we don't want that
if 'main' in list_types and 'map' not in list_types:
# Exclude MAPLIST ngrams from MAINLIST
query = query.except_(_ngrams(corpus_id, 'map', entities))
if with_count:
# Wrap the term query and left-join each ngram with its score in the
# corpus' OCCURRENCES node (NULL score when never counted)
N = query.subquery()
return (session.query(N.c.type, N.c.ng, NodeNodeNgram.score)
.join(Node, (Node.parent_id==corpus_id) & (Node.typename=='OCCURRENCES'))
.outerjoin(NodeNodeNgram, (NodeNodeNgram.ngram_id==N.c.id) &
(NodeNodeNgram.node1_id==Node.id) &
(NodeNodeNgram.node2_id==corpus_id)))
# Return found ngrams sorted by list type, and then alphabetically
return query.order_by('type', 'ng')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment