Commit 67d60c9d authored by Mathieu Rodic

[BUG] fixed some issues with ngrams extraction

[CODE] improved bulk_insert rows management
[OPTI] ngrams extraction is processed in small batches
parent a5fc141c
@@ -15,7 +15,3 @@ Path for data used by taggers should be defined in `gargantext.constants`.
 # Database
 ## Bulk insertion
-The replacement of spaces should be more elegant.
@@ -54,8 +54,6 @@ class bulk_insert:
     def __init__(self, table, fields, data, cursor=None):
         # prepare the iterator
         self.iter = iter(data)
-        # template
-        self.template = '%s' + (len(fields) - 1) * '\t%s' + '\n'
         # prepare the cursor
         if cursor is None:
             db, cursor = get_cursor()
@@ -71,10 +69,13 @@
         db.commit()

     def read(self, size=None):
+        # see http://www.postgresql.org/docs/9.4/static/sql-copy.html#AEN72054
         try:
-            return self.template % tuple(
-                str(x).replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').replace("\\","") for x in next(self.iter)
-            )
+            return '\t'.join(
+                value.replace('\\', '\\\\').replace('\n', '\\\n').replace('\r', '\\\r').replace('\t', '\\\t')
+                if isinstance(value, str) else str(value)
+                for value in next(self.iter)
+            ) + '\n'
         except StopIteration:
             return ''
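For illustration, the new escaping can be exercised in isolation. A minimal sketch, where copy_escape_row is a hypothetical stand-in for the body of read(); it follows the PostgreSQL COPY text format referenced in the comment above:

def copy_escape_row(row):
    # escape backslash, newline, carriage return and tab as the PostgreSQL
    # COPY text format expects; non-string values are serialized with str()
    return '\t'.join(
        value.replace('\\', '\\\\').replace('\n', '\\\n').replace('\r', '\\\r').replace('\t', '\\\t')
        if isinstance(value, str) else str(value)
        for value in row
    ) + '\n'

# a tab inside a value no longer collapses into a column separator
print(repr(copy_escape_row((1, 'hello\tworld'))))  # -> '1\thello\\\tworld\n'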
@@ -128,6 +129,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
         row[1]: row[0] for row in cursor.fetchall()
     }
     # this is the end!
+    cursor.execute('DROP TABLE __tmp__')
     if mustcommit:
         db.commit()
     return result
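The added DROP TABLE cleans up the temporary table the function stages its rows in. The rest of the body is not shown in this diff; as a rough, hypothetical sketch of the insert-if-not-exists pattern it implies (table and column names here are assumptions, not the project's schema):

def insert_if_not_exists_sketch(cursor, rows):
    # stage candidate rows in a temporary table (e.g. bulk-loaded with COPY)
    cursor.execute('CREATE TEMPORARY TABLE __tmp__ (terms VARCHAR(255), n INTEGER)')
    cursor.executemany('INSERT INTO __tmp__ (terms, n) VALUES (%s, %s)', rows)
    # insert only the rows whose unique key is not already present
    cursor.execute('''
        INSERT INTO ngrams (terms, n)
        SELECT terms, n FROM __tmp__
        WHERE terms NOT IN (SELECT terms FROM ngrams)
    ''')
    # map each unique key back to its id, as bulk_insert_ifnotexists returns
    cursor.execute('SELECT id, terms FROM ngrams WHERE terms IN (SELECT terms FROM __tmp__)')
    result = {row[1]: row[0] for row in cursor.fetchall()}
    cursor.execute('DROP TABLE __tmp__')
    return result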
@@ -31,6 +31,8 @@ class NgramsExtractor:
 class NgramsExtractors(dict):
     def __missing__(self, key):
+        if not isinstance(key, str):
+            raise KeyError
         if len(key) == 2 and key == key.lower():
             tagger = LANGUAGES[key]['tagger']
             self[key] = NgramsExtractor(tagger)
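The new isinstance guard keeps non-string keys from ever reaching the LANGUAGES lookup. Since dict.__missing__ only runs when a key is absent, each extractor is built once and then cached; a minimal sketch of the same pattern, with hypothetical names:

class LazyFactory(dict):
    # __missing__ is called only on a failed lookup; the built value is cached
    def __missing__(self, key):
        if not isinstance(key, str):
            raise KeyError(key)
        self[key] = value = 'built-for-%s' % key
        return value

factory = LazyFactory()
print(factory['en'])  # builds and caches on first access
print(factory['en'])  # served from the dict, __missing__ not called again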
@@ -15,5 +15,8 @@ def parse_extract(corpus):
         print('NO SUCH CORPUS: #%d' % corpus_id)
         return
     # apply actions
+    print('CORPUS #%d' % (corpus.id))
     parse(corpus)
+    print('CORPUS #%d: parsed' % (corpus.id))
     extract_ngrams(corpus)
+    print('CORPUS #%d: extracted ngrams' % (corpus.id))
@@ -6,6 +6,31 @@ from gargantext.util.ngramsextractors import ngramsextractors
 from collections import defaultdict
+
+def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
+    print('INTEGRATE')
+    # integrate ngrams
+    ngrams_ids = bulk_insert_ifnotexists(
+        model = Ngram,
+        uniquekey = 'terms',
+        fields = ('terms', 'n'),
+        data = ngrams_data,
+        cursor = cursor,
+    )
+    db.commit()
+    # integrate node-ngram associations
+    nodes_ngrams_data = tuple(
+        (node_ngram[0], ngrams_ids[node_ngram[1]], count)
+        for node_ngram, count in nodes_ngrams_count.items()
+    )
+    bulk_insert(
+        table = NodeNgram,
+        fields = ('node_id', 'ngram_id', 'weight'),
+        data = nodes_ngrams_data,
+        cursor = cursor,
+    )
+    db.commit()
+
 def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
     """Extract ngrams for every document below the given corpus.
     Default language is given by the resource type.
@@ -20,37 +45,28 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstr
     resource_type = RESOURCETYPES[resource_type_index]
     default_language_iso2 = resource_type['default_language']
     for document in corpus.children('DOCUMENT'):
+        # get ngrams extractor for the current document
+        language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
+        try:
+            ngramsextractor = ngramsextractors[language_iso2]
+        except KeyError:
+            print('Unrecognized language: `%s`' % (language_iso2, ))
+            continue
         # extract ngrams on each of the considered keys
         for key in keys:
-            value = document.hyperdata.get(key, '')
-            if len(value) == 0:
+            value = document.hyperdata.get(key, None)
+            if not isinstance(value, str):
                 continue
             # get ngrams
-            language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
-            ngramsextractor = ngramsextractors[language_iso2]
             for ngram in ngramsextractor.extract(value):
                 tokens = tuple(token[0] for token in ngram)
                 terms = ' '.join(tokens)
                 nodes_ngrams_count[(document.id, terms)] += 1
                 ngrams_data.add((terms[:255], len(tokens), ))
-    # integrate ngrams
-    ngrams_ids = bulk_insert_ifnotexists(
-        model = Ngram,
-        uniquekey = 'terms',
-        fields = ('terms', 'n'),
-        data = ngrams_data,
-        cursor = cursor,
-    )
-    db.commit()
-    # integrate node-ngram associations
-    nodes_ngrams_data = tuple(
-        (node_ngram[0], ngrams_ids[node_ngram[1]], count)
-        for node_ngram, count in nodes_ngrams_count.items()
-    )
-    bulk_insert(
-        table = NodeNgram,
-        fields = ('node_id', 'ngram_id', 'weight'),
-        data = nodes_ngrams_data,
-        cursor = cursor,
-    )
-    db.commit()
-    # the end!
+        # integrate ngrams and nodes-ngrams
+        if len(nodes_ngrams_count) >= 4096:
+            _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
+            nodes_ngrams_count.clear()
+            ngrams_data.clear()
+    # integrate ngrams and nodes-ngrams
+    _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
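This hunk is where the extraction becomes incremental: accumulated counts are flushed to the database once 4096 pairs have piled up, instead of all at once after the whole corpus, which bounds memory use. The control flow, reduced to a sketch (names are illustrative, not the project's code):

from collections import defaultdict

BATCH_SIZE = 4096  # same threshold as the commit

def process(documents, flush):
    counts = defaultdict(int)
    for document in documents:
        for term in document:
            counts[term] += 1
        # flush a batch as soon as enough pairs have accumulated
        if len(counts) >= BATCH_SIZE:
            flush(counts)
            counts.clear()
    # flush whatever remains after the last document
    flush(counts)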