[FEAT] ngrams are parsed when a corpus is uploaded

(yeeehhhah!)

[FEAT] ngrams are parsed when a corpus is uploaded
(yeeehhhah!)
a5fc141c · Mathieu Rodic · edf1f157 · a5fc141c · a5fc141c · a5fc141c
Commit a5fc141c authored Feb 18, 2016 by Mathieu Rodic
16 changed files
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ sudo pip3 install virtualenv
 sudo apt-get install rabbitmq-server
 virtualenv-3.4 VENV
 source VENV/bin/activate
+pip install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1
 pip install -U -r requirements.txt
 ```

--- a/TODO.md
+++ b/TODO.md
@@ -7,3 +7,15 @@
 ## Single project view
 - re-implement deletion
+# Taggers
+Path for data used by taggers should be defined in `gargantext.constants`.
+# Database
+## Bulk insertion
+The replacement of spaces should be more elegant.
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -9,12 +9,18 @@ NODETYPES = [
    'DOCUMENT',
 ]
+from gargantext.util.taggers import *
 LANGUAGES = {
-    'fr': {
-        # 'tagger': FrenchNgramsTagger
-    },
    'en': {
-        # 'tagger': EnglishNgramsTagger
+        'tagger': TurboTagger,
+        # 'tagger': EnglishMeltTagger,
+        # 'tagger': NltkTagger,
+    },
+    'fr': {
+        'tagger': FrenchMeltTagger,
+        # 'tagger': TreeTagger,
    },
 }

--- a/gargantext/models/__init__.py
+++ b/gargantext/models/__init__.py
 from .nodes import *
 from .users import *
+from .ngrams import *
--- a/gargantext/models/ngrams.py
+++ b/gargantext/models/ngrams.py
+from gargantext.util.db import *
+from gargantext.util.files import upload
+from gargantext.constants import *
+from .nodes import Node
+__all__ = ['Ngram', 'NodeNgram']
+class Ngram(Base):
+    """A sequence of terms, stored once in the `ngrams` table."""
+    __tablename__ = 'ngrams'
+    id = Column(Integer, primary_key=True)
+    # the terms as a single string of at most 255 characters; declared unique
+    terms = Column(String(255), unique=True)
+    # `extract_ngrams` fills this with the number of tokens of the ngram
+    n = Column(Integer)
+class NodeNgram(Base):
+    """Weighted association between a node and an ngram (`nodes_ngrams` table)."""
+    __tablename__ = 'nodes_ngrams'
+    # NOTE(review): plain, non-key integer column sitting next to the composite
+    # primary key below; `extract_ngrams` never fills it -- confirm it is needed
+    id = Column(Integer)
+    # (node_id, ngram_id) together form the primary key
+    node_id = Column(Integer, ForeignKey(Node.id), primary_key=True)
+    ngram_id = Column(Integer, ForeignKey(Ngram.id), primary_key=True)
+    # `extract_ngrams` stores here how many times the ngram occurs in the node
+    weight = Column(Float)
--- a/gargantext/models/nodes.py
+++ b/gargantext/models/nodes.py
@@ -30,12 +30,13 @@ class Node(Base):
    # main data
    name = Column(String(255))
    date  = Column(DateTime(), default=datetime.now)
-    # metadata
+    # metadata (see https://bashelton.com/2014/03/updating-postgresql-json-fields-via-sqlalchemy/)
-    hyperdata = Column(JSONB, default={})
+    hyperdata = Column(JSONB, default=dict)
    def __init__(self, **kwargs):
+        if 'hyperdata' not in kwargs:
+            kwargs['hyperdata'] = kwargs.get('hyperdata', {})
        Base.__init__(self, **kwargs)
-        self.hyperdata = {}
    def __getitem__(self, key):
        return self.hyperdata[key]
@@ -43,6 +44,22 @@ class Node(Base):
    def __setitem__(self, key, value):
        self.hyperdata[key] = value
+    def save_hyperdata(self):
+        """Flag `hyperdata` as modified, so that the next flush persists it.
+        Changes made in place inside the JSONB value (for instance
+        `self['status']['progress'] = ...`) are not reliably detected by the
+        ORM, hence the attribute has to be marked as dirty by hand (see
+        https://bashelton.com/2014/03/updating-postgresql-json-fields-via-sqlalchemy/).
+        This only flags the attribute: it neither adds the node to a session
+        nor commits anything.
+        """
+        # imported locally, exactly where it is needed
+        from sqlalchemy.orm import attributes
+        attributes.flag_modified(self, 'hyperdata')
+        # This replaces the former workaround, which set `hyperdata` to None,
+        # committed, restored the value and committed again (two commits per
+        # update).
    def children(self, typename=None):
        """Return a query to all the direct children of the current node.
        Allows filtering by typename (see `constants.py`)
@@ -63,26 +80,22 @@ class Node(Base):
    def resources(self):
        if 'resources' not in self.hyperdata:
-            self.hyperdata['resources'] = []
+            self['resources'] = MutableList()
        return self['resources']
    def add_resource(self, type, path=None, url=None):
-        self.resources().append({'type': type, 'path':path, 'url':url})
+        self.resources().append(MutableDict(
+            {'type': type, 'path':path, 'url':url, 'extracted': False}
+        ))
-    def status(self, action=None, progress=None, autocommit=False):
+    def status(self, action=None, progress=None):
        if 'status' not in self.hyperdata:
-            self['status'] = {'action': action, 'progress': progress}
+            self['status'] = MutableDict(
+                {'action': action, 'progress': progress}
+            )
        else:
            if action is not None:
                self['status']['action'] = action
            if progress is not None:
                self['status']['progress'] = progress
-        if autocommit:
-            hyperdata = self.hyperdata.copy()
-            self.hyperdata = None
-            session.add(self)
-            session.commit()
-            self.hyperdata = hyperdata
-            session.add(self)
-            session.commit()
        return self['status']
--- a/gargantext/settings.py
+++ b/gargantext/settings.py
@@ -36,7 +36,7 @@ djcelery.setup_loader()
 BROKER_URL = 'amqp://guest:guest@localhost:5672/'
 CELERY_ACCEPT_CONTENT = ['pickle', 'json', 'msgpack', 'yaml']
 CELERY_IMPORTS = (
-    'gargantext.util.workflow',
+    'gargantext.util.toolchain',
    # 'gargantext.models',
    # 'gargantext.util.db',
 )

--- a/gargantext/util/db.py
+++ b/gargantext/util/db.py
@@ -51,11 +51,11 @@ def get_cursor():
 class bulk_insert:
-    def __init__(self, table, keys, data, cursor=None):
+    def __init__(self, table, fields, data, cursor=None):
        # prepare the iterator
        self.iter = iter(data)
        # template
-        self.template = '%s' + (len(keys) - 1) * '\t%s' + '\n'
+        self.template = '%s' + (len(fields) - 1) * '\t%s' + '\n'
        # prepare the cursor
        if cursor is None:
            db, cursor = get_cursor()
@@ -65,7 +65,7 @@ class bulk_insert:
        # insert data
        if not isinstance(table, str):
            table = table.__tablename__
-        cursor.copy_from(self, table, columns=keys)
+        cursor.copy_from(self, table, columns=fields)
        # commit if necessary
        if mustcommit:
            db.commit()
@@ -81,15 +81,19 @@ class bulk_insert:
    readline = read
-def bulk_insert_ifnotexists(model, uniquekey, fields, values):
+def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
-    db, cursor = get_cursor()
+    if cursor is None:
-    # create temporary table with given values
+        db, cursor = get_cursor()
+        mustcommit = True
+    else:
+        mustcommit = False
+    # create temporary table with given data
    sql_columns = 'id INTEGER'
    for field in fields:
        column = getattr(model, field)
        sql_columns += ', %s %s' % (field, column.type, )
    cursor.execute('CREATE TEMPORARY TABLE __tmp__ (%s)' % (sql_columns, ))
-    bulk_insert('__tmp__', fields, values, cursor=cursor)
+    bulk_insert('__tmp__', fields, data, cursor=cursor)
    # update ids of the temporary table
    cursor.execute('''
        UPDATE __tmp__
@@ -124,5 +128,6 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, values):
        row[1]: row[0] for row in cursor.fetchall()
    }
    # this is the end!
-    db.commit()
+    if mustcommit:
+        db.commit()
    return result
--- a/gargantext/util/ngramsextractors.py
+++ b/gargantext/util/ngramsextractors.py
+from gargantext.util.languages import languages
+from gargantext.constants import LANGUAGES
+import nltk
+import re
+class NgramsExtractor:
+    """Extract ngrams from raw text with a POS tagger and a chunk grammar."""
+    def __init__(self, tagger):
+        # `tagger` is a class (or any factory): it is instantiated here, once
+        self._tagger = tagger()
+    @staticmethod
+    def clean_text(text):
+        """Return `text` without its short XML tags (`<...>` enclosing at most
+        45 characters), so that markup does not disturb the POS tagging.
+        """
+        short_xml_tag = r'<[^>]{0,45}>'
+        return re.sub(short_xml_tag, '', text)
+    def extract(self, text, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', label='NP'):
+        """Yield the leaves of every chunk labelled `label` found in `text`.
+        The text is cleaned, tagged by the tagger given at construction, then
+        chunked with the NLTK regular expression grammar `label: rule`.
+        Nothing is yielded when the tagger returns no token.
+        """
+        cleaned = self.clean_text(text)
+        chunker = nltk.RegexpParser(label + ': ' + rule)
+        tagged = list(self._tagger.tag_text(cleaned))
+        if len(tagged) == 0:
+            return
+        for chunk in chunker.parse(tagged).subtrees():
+            if chunk.label() == label:
+                yield chunk.leaves()
+class NgramsExtractors(dict):
+    """Lazy cache of `NgramsExtractor` instances, indexed by language.
+    An extractor is only built the first time its language is requested.
+    Keys that are not lowercase two-letter codes are resolved through the
+    `languages` registry and share the extractor of the matching ISO2 code.
+    A `KeyError` is raised for languages without a tagger in `LANGUAGES`.
+    """
+    def __missing__(self, key):
+        if len(key) == 2 and key == key.lower():
+            # canonical key: build the extractor from the configured tagger
+            tagger = LANGUAGES[key]['tagger']
+            self[key] = NgramsExtractor(tagger)
+        else:
+            # `LANGUAGES` only holds settings dicts indexed by ISO2 code, so
+            # other spellings must be resolved via the `languages` registry.
+            # NOTE(review): assumes its entries expose `.iso2` -- confirm
+            # against `gargantext.util.languages`.
+            self[key] = self[languages[key].iso2]
+        return self[key]
+# this below will be shared within the current thread
+ngramsextractors = NgramsExtractors()
--- a/gargantext/util/scheduling.py
+++ b/gargantext/util/scheduling.py
 """This module defines three distinct decorators for scheduling.
+ - `scheduled_now` is only there for debugging purposes: the decorated method
+   is executed as is
+ - `scheduled_thread` starts the decorated method as a new thread, but does not
+   really "follow" it
+ - `scheduled_celery` manages tasks via Celery, but is best avoided while
+   debugging
+Note that it is strongly discouraged to use database objects (model instances,
+etc.) as parameters of methods decorated with those decorators.
+Prefer using built-in types, such as `float`, `str`, `dict` (for a complete
+list, see https://docs.python.org/3/library/stdtypes.html).
 """
 def scheduled_now(func):
    """Provides a decorator to execute the task right away.
    Mostly useful for debugging purpose.
@@ -11,7 +24,7 @@ def scheduled_now(func):
 import threading
 def scheduled_thread(func):
    """Provides a decorator to schedule a task as a new thread.
-    Problem: a shutdown may lose the task forever...
+    Problem: an unexpected shutdown may lose the task forever...
    """
    def go(*args, **kwargs):
        thread = threading.Thread(target=func, args=args, kwargs=kwargs)

--- a/gargantext/util/taggers/TreeTagger.py
+++ b/gargantext/util/taggers/TreeTagger.py
@@ -42,11 +42,12 @@ def _readOutput(output, buffer):
            time.sleep(0.1)
-"""Use TreeTagger for the tagging.
-Shall be used for french texts.
-"""
 class TreeTagger(Tagger):
+    """Use TreeTagger for the tagging.
+    Shall be used for French texts.
+    """
    def start(self, treeTaggerPath='./lib/treetagger'):
        print(treeTaggerPath)
        if treeTaggerPath[0] == '.':

--- a/gargantext/util/toolchain/__init__.py
+++ b/gargantext/util/toolchain/__init__.py
+from .parsing import parse
+from .ngrams_extraction import extract_ngrams
+from gargantext.util.db import session
+from gargantext.models import Node
+def parse_extract(corpus):
+    """Parse the resources of a corpus, then extract the ngrams of its documents.
+    `corpus` is either a corpus node or the integer id of one; an id is looked
+    up in the database first, and an unknown id is reported on standard output
+    before returning without doing anything.
+    """
+    if isinstance(corpus, int):
+        # only an id was given: fetch the matching node
+        requested_id = corpus
+        corpus = session.query(Node).filter(Node.id == requested_id).first()
+        if corpus is None:
+            print('NO SUCH CORPUS: #%d' % requested_id)
+            return
+    # run every step of the toolchain, in order
+    for step in (parse, extract_ngrams):
+        step(corpus)
--- a/gargantext/util/toolchain/ngrams_extraction.py
+++ b/gargantext/util/toolchain/ngrams_extraction.py
+from gargantext.util.db import *
+from gargantext.models import *
+from gargantext.constants import *
+from gargantext.util.ngramsextractors import ngramsextractors
+from collections import defaultdict
+def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
+    """Extract ngrams for every document below the given corpus.
+    Only the hyperdata fields listed in `keys` are tagged, using `rule` as
+    chunk grammar. The language of a document is its `language_iso2`
+    hyperdata, or by default the one of the type of the first resource of the
+    corpus (which is therefore expected to have at least one resource).
+    The ngrams, then their weighted associations with the documents, are
+    inserted into database.
+    """
+    db, cursor = get_cursor()
+    nodes_ngrams_count = defaultdict(int)
+    ngrams_data = set()
+    # the first resource of the corpus decides the default language
+    resource_type_index = corpus.resources()[0]['type']
+    resource_type = RESOURCETYPES[resource_type_index]
+    default_language_iso2 = resource_type['default_language']
+    # extract ngrams
+    for document in corpus.children('DOCUMENT'):
+        for key in keys:
+            value = document.hyperdata.get(key, '')
+            # skip missing, empty or null fields
+            if not value:
+                continue
+            language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
+            ngramsextractor = ngramsextractors[language_iso2]
+            # forward `rule`, which used to be silently ignored
+            for ngram in ngramsextractor.extract(value, rule=rule):
+                tokens = tuple(token[0] for token in ngram)
+                # truncate once, to the size of `Ngram.terms`, so that the
+                # counting key always matches a key of `ngrams_ids` below
+                terms = ' '.join(tokens)[:255]
+                nodes_ngrams_count[(document.id, terms)] += 1
+                ngrams_data.add((terms, len(tokens), ))
+    # integrate ngrams
+    ngrams_ids = bulk_insert_ifnotexists(
+        model=Ngram,
+        uniquekey='terms',
+        fields=('terms', 'n'),
+        data=ngrams_data,
+        cursor=cursor,
+    )
+    db.commit()
+    # integrate node-ngram associations, weighted by number of occurrences
+    nodes_ngrams_data = tuple(
+        (node_id, ngrams_ids[terms], count)
+        for (node_id, terms), count in nodes_ngrams_count.items()
+    )
+    bulk_insert(
+        table=NodeNgram,
+        fields=('node_id', 'ngram_id', 'weight'),
+        data=nodes_ngrams_data,
+        cursor=cursor,
+    )
+    db.commit()
+    # the end!
--- a/gargantext/util/workflow.py
+++ b/gargantext/util/workflow.py
 from gargantext.util.db import *
 from gargantext.models import *
-from gargantext.util.scheduling import scheduled
 from gargantext.constants import *
-@scheduled
+def parse(corpus):
-def parse(corpus_id):
-    # retrieve corpus from database
-    corpus = session.query(Node).filter(Node.id == corpus_id).first()
-    if corpus is None:
-        print('NO SUCH CORPUS: #%d' % corpus_id)
-        return
    # retrieve resource information
    documents_count = 0
-    for resource in corpus['resources']:
+    for resource in corpus.resources():
        # information about the resource
+        if resource['extracted']:
+            continue
        resource_parser = RESOURCETYPES[resource['type']]['parser']
        resource_path = resource['path']
        # extract and insert documents from corpus resource into database
@@ -27,8 +21,12 @@ def parse(corpus_id):
            )
            session.add(document)
            if documents_count % 64 == 0:
-                corpus.status(action='parsing', progress=documents_count, autocommit=True)
+                corpus.status(action='parsing', progress=documents_count)
+                corpus.save_hyperdata()
            documents_count += 1
+        # update info about the resource
+        resource['extracted'] = True
+        corpus.save_hyperdata()
    # commit all changes
    corpus.status(action='parsing', progress=documents_count)
    session.commit()
--- a/gargantext/views/pages/projects.py
+++ b/gargantext/views/pages/projects.py
-from gargantext.util import workflow
 from gargantext.util.http import *
 from gargantext.util.db import *
 from gargantext.util.db_cache import cache
@@ -6,6 +5,9 @@ from gargantext.util.files import upload
 from gargantext.models import *
 from gargantext.constants import *
+from gargantext.util.scheduling import scheduled
+from gargantext.util.toolchain import parse_extract
 from datetime import datetime
 from collections import defaultdict
 import re
@@ -92,7 +94,8 @@ def project(request, project_id):
        )
        session.add(corpus)
        session.commit()
-        workflow.parse(corpus.id)
+        parse_extract
+        scheduled(parse_extract)(corpus.id)
    # corpora within this project
    corpora = project.children('CORPUS').all()

--- a/requirements.txt
+++ b/requirements.txt
 Django==1.9.2
 PyYAML==3.11
 RandomWords==0.1.12
-SQLAlchemy==1.0.11
+SQLAlchemy==1.1.0b1dev
 amqp==1.4.9
 anyjson==0.3.3
 billiard==3.3.0.22
@@ -15,6 +15,7 @@ jdatetime==1.7.2
 kombu==3.0.33
 lxml==3.5.0
 nltk==3.1
+numpy==1.10.4
 psycopg2==2.6.1
 pycountry==1.20
 python-dateutil==2.4.2