Commit 67d60c9d authored by Mathieu Rodic

[BUG] fixed some issues with ngrams extraction

[CODE] improved bulk_insert rows management
[OPTI] ngrams extraction is processed in small batches
parent a5fc141c
@@ -15,7 +15,3 @@ Path for data used by taggers should be defined in `gargantext.constants`.
 # Database
 ## Bulk insertion
-The replacement of spaces should be more elegant.
@@ -54,8 +54,6 @@ class bulk_insert:
     def __init__(self, table, fields, data, cursor=None):
         # prepare the iterator
         self.iter = iter(data)
-        # template
-        self.template = '%s' + (len(fields) - 1) * '\t%s' + '\n'
         # prepare the cursor
         if cursor is None:
             db, cursor = get_cursor()
@@ -71,10 +69,13 @@
         db.commit()

     def read(self, size=None):
+        # see http://www.postgresql.org/docs/9.4/static/sql-copy.html#AEN72054
         try:
-            return self.template % tuple(
-                str(x).replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').replace("\\","") for x in next(self.iter)
-            )
+            return '\t'.join(
+                value.replace('\\', '\\\\').replace('\n', '\\\n').replace('\r', '\\\r').replace('\t', '\\\t')
+                if isinstance(value, str) else str(value)
+                for value in next(self.iter)
+            ) + '\n'
         except StopIteration:
             return ''
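For illustration, the new escaping can be exercised in isolation. A minimal sketch, where copy_escape_row is a hypothetical stand-in for the body of read(); it follows the PostgreSQL COPY text format referenced in the comment above:

def copy_escape_row(row):
    # escape backslash, newline, carriage return and tab as the PostgreSQL
    # COPY text format expects; non-string values are serialized with str()
    return '\t'.join(
        value.replace('\\', '\\\\').replace('\n', '\\\n').replace('\r', '\\\r').replace('\t', '\\\t')
        if isinstance(value, str) else str(value)
        for value in row
    ) + '\n'

# a tab inside a value no longer collapses into a column separator
print(repr(copy_escape_row((1, 'hello\tworld'))))  # -> '1\thello\\\tworld\n'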
@@ -128,6 +129,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
         row[1]: row[0] for row in cursor.fetchall()
     }
     # this is the end!
+    cursor.execute('DROP TABLE __tmp__')
     if mustcommit:
         db.commit()
     return result
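The added DROP TABLE cleans up the temporary table the function stages its rows in. The rest of the body is not shown in this diff; as a rough, hypothetical sketch of the insert-if-not-exists pattern it implies (table and column names here are assumptions, not the project's schema):

def insert_if_not_exists_sketch(cursor, rows):
    # stage candidate rows in a temporary table (e.g. bulk-loaded with COPY)
    cursor.execute('CREATE TEMPORARY TABLE __tmp__ (terms VARCHAR(255), n INTEGER)')
    cursor.executemany('INSERT INTO __tmp__ (terms, n) VALUES (%s, %s)', rows)
    # insert only the rows whose unique key is not already present
    cursor.execute('''
        INSERT INTO ngrams (terms, n)
        SELECT terms, n FROM __tmp__
        WHERE terms NOT IN (SELECT terms FROM ngrams)
    ''')
    # map each unique key back to its id, as bulk_insert_ifnotexists returns
    cursor.execute('SELECT id, terms FROM ngrams WHERE terms IN (SELECT terms FROM __tmp__)')
    result = {row[1]: row[0] for row in cursor.fetchall()}
    cursor.execute('DROP TABLE __tmp__')
    return result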
@@ -31,6 +31,8 @@ class NgramsExtractor:
 class NgramsExtractors(dict):
     def __missing__(self, key):
+        if not isinstance(key, str):
+            raise KeyError
         if len(key) == 2 and key == key.lower():
             tagger = LANGUAGES[key]['tagger']
             self[key] = NgramsExtractor(tagger)
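The new isinstance guard keeps non-string keys from ever reaching the LANGUAGES lookup. Since dict.__missing__ only runs when a key is absent, each extractor is built once and then cached; a minimal sketch of the same pattern, with hypothetical names:

class LazyFactory(dict):
    # __missing__ is called only on a failed lookup; the built value is cached
    def __missing__(self, key):
        if not isinstance(key, str):
            raise KeyError(key)
        self[key] = value = 'built-for-%s' % key
        return value

factory = LazyFactory()
print(factory['en'])  # builds and caches on first access
print(factory['en'])  # served from the dict, __missing__ not called again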
@@ -15,5 +15,8 @@ def parse_extract(corpus):
         print('NO SUCH CORPUS: #%d' % corpus_id)
         return
     # apply actions
+    print('CORPUS #%d' % (corpus.id))
     parse(corpus)
+    print('CORPUS #%d: parsed' % (corpus.id))
     extract_ngrams(corpus)
+    print('CORPUS #%d: extracted ngrams' % (corpus.id))
@@ -6,6 +6,31 @@ from gargantext.util.ngramsextractors import ngramsextractors
 from collections import defaultdict
+
+def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
+    print('INTEGRATE')
+    # integrate ngrams
+    ngrams_ids = bulk_insert_ifnotexists(
+        model = Ngram,
+        uniquekey = 'terms',
+        fields = ('terms', 'n'),
+        data = ngrams_data,
+        cursor = cursor,
+    )
+    db.commit()
+    # integrate node-ngram associations
+    nodes_ngrams_data = tuple(
+        (node_ngram[0], ngrams_ids[node_ngram[1]], count)
+        for node_ngram, count in nodes_ngrams_count.items()
+    )
+    bulk_insert(
+        table = NodeNgram,
+        fields = ('node_id', 'ngram_id', 'weight'),
+        data = nodes_ngrams_data,
+        cursor = cursor,
+    )
+    db.commit()
+
 def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
     """Extract ngrams for every document below the given corpus.
     Default language is given by the resource type.
@@ -20,37 +45,28 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstr
     resource_type = RESOURCETYPES[resource_type_index]
     default_language_iso2 = resource_type['default_language']
     for document in corpus.children('DOCUMENT'):
+        # get ngrams extractor for the current document
+        language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
+        try:
+            ngramsextractor = ngramsextractors[language_iso2]
+        except KeyError:
+            print('Unrecognized language: `%s`' % (language_iso2, ))
+            continue
         # extract ngrams on each of the considered keys
         for key in keys:
-            value = document.hyperdata.get(key, '')
-            if len(value) == 0:
+            value = document.hyperdata.get(key, None)
+            if not isinstance(value, str):
                 continue
             # get ngrams
-            language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
-            ngramsextractor = ngramsextractors[language_iso2]
             for ngram in ngramsextractor.extract(value):
                 tokens = tuple(token[0] for token in ngram)
                 terms = ' '.join(tokens)
                 nodes_ngrams_count[(document.id, terms)] += 1
                 ngrams_data.add((terms[:255], len(tokens), ))
-    # integrate ngrams
-    ngrams_ids = bulk_insert_ifnotexists(
-        model = Ngram,
-        uniquekey = 'terms',
-        fields = ('terms', 'n'),
-        data = ngrams_data,
-        cursor = cursor,
-    )
-    db.commit()
-    # integrate node-ngram associations
-    nodes_ngrams_data = tuple(
-        (node_ngram[0], ngrams_ids[node_ngram[1]], count)
-        for node_ngram, count in nodes_ngrams_count.items()
-    )
-    bulk_insert(
-        table = NodeNgram,
-        fields = ('node_id', 'ngram_id', 'weight'),
-        data = nodes_ngrams_data,
-        cursor = cursor,
-    )
-    db.commit()
-    # the end!
+        # integrate ngrams and nodes-ngrams
+        if len(nodes_ngrams_count) >= 4096:
+            _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
+            nodes_ngrams_count.clear()
+            ngrams_data.clear()
+    # integrate ngrams and nodes-ngrams
+    _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
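This hunk is where the extraction becomes incremental: accumulated counts are flushed to the database once 4096 pairs have piled up, instead of all at once after the whole corpus, which bounds memory use. The control flow, reduced to a sketch (names are illustrative, not the project's code):

from collections import defaultdict

BATCH_SIZE = 4096  # same threshold as the commit

def process(documents, flush):
    counts = defaultdict(int)
    for document in documents:
        for term in document:
            counts[term] += 1
        # flush a batch as soon as enough pairs have accumulated
        if len(counts) >= BATCH_SIZE:
            flush(counts)
            counts.clear()
    # flush whatever remains after the last document
    flush(counts)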