Commit 67d60c9d authored by Mathieu Rodic

[BUG] fixed some issues with ngrams extraction

[CODE] improved bulk_insert rows management
[OPTI] ngrams extraction is treated little by little
parent a5fc141c
...@@ -15,7 +15,3 @@ Path for data used by taggers should be defined in `gargantext.constants`. ...@@ -15,7 +15,3 @@ Path for data used by taggers should be defined in `gargantext.constants`.
# Database # Database
## Bulk insertion
The replacement of spaces should be more elegant.
...@@ -54,8 +54,6 @@ class bulk_insert: ...@@ -54,8 +54,6 @@ class bulk_insert:
def __init__(self, table, fields, data, cursor=None): def __init__(self, table, fields, data, cursor=None):
# prepare the iterator # prepare the iterator
self.iter = iter(data) self.iter = iter(data)
# template
self.template = '%s' + (len(fields) - 1) * '\t%s' + '\n'
# prepare the cursor # prepare the cursor
if cursor is None: if cursor is None:
db, cursor = get_cursor() db, cursor = get_cursor()
...@@ -71,10 +69,13 @@ class bulk_insert: ...@@ -71,10 +69,13 @@ class bulk_insert:
db.commit() db.commit()
def read(self, size=None): def read(self, size=None):
# see http://www.postgresql.org/docs/9.4/static/sql-copy.html#AEN72054
try: try:
return self.template % tuple( return '\t'.join(
str(x).replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').replace("\\","") for x in next(self.iter) value.replace('\\', '\\\\').replace('\n', '\\\n').replace('\r', '\\\r').replace('\t', '\\\t')
) if isinstance(value, str) else str(value)
for value in next(self.iter)
) + '\n'
except StopIteration: except StopIteration:
return '' return ''
...@@ -128,6 +129,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None): ...@@ -128,6 +129,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
row[1]: row[0] for row in cursor.fetchall() row[1]: row[0] for row in cursor.fetchall()
} }
# this is the end! # this is the end!
cursor.execute('DROP TABLE __tmp__')
if mustcommit: if mustcommit:
db.commit() db.commit()
return result return result
...@@ -31,6 +31,8 @@ class NgramsExtractor: ...@@ -31,6 +31,8 @@ class NgramsExtractor:
class NgramsExtractors(dict): class NgramsExtractors(dict):
def __missing__(self, key): def __missing__(self, key):
if not isinstance(key, str):
raise KeyError
if len(key) == 2 and key == key.lower(): if len(key) == 2 and key == key.lower():
tagger = LANGUAGES[key]['tagger'] tagger = LANGUAGES[key]['tagger']
self[key] = NgramsExtractor(tagger) self[key] = NgramsExtractor(tagger)
......
...@@ -15,5 +15,8 @@ def parse_extract(corpus): ...@@ -15,5 +15,8 @@ def parse_extract(corpus):
print('NO SUCH CORPUS: #%d' % corpus_id) print('NO SUCH CORPUS: #%d' % corpus_id)
return return
# apply actions # apply actions
print('CORPUS #%d' % (corpus.id))
parse(corpus) parse(corpus)
print('CORPUS #%d: parsed' % (corpus.id))
extract_ngrams(corpus) extract_ngrams(corpus)
print('CORPUS #%d: extracted ngrams' % (corpus.id))
...@@ -6,6 +6,31 @@ from gargantext.util.ngramsextractors import ngramsextractors ...@@ -6,6 +6,31 @@ from gargantext.util.ngramsextractors import ngramsextractors
from collections import defaultdict from collections import defaultdict
def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
    """Persist a batch of extracted ngrams and their node associations.

    First inserts any ngrams not already present (unique on `terms`) and
    obtains their database ids, then bulk-inserts one (node_id, ngram_id,
    weight) row per counted node/term pair, committing after each step.
    """
    print('INTEGRATE')
    # insert missing ngrams; returns a mapping of terms -> ngram id
    terms_to_id = bulk_insert_ifnotexists(
        model=Ngram,
        uniquekey='terms',
        fields=('terms', 'n'),
        data=ngrams_data,
        cursor=cursor,
    )
    db.commit()
    # turn the (node_id, terms) -> count accumulator into insertable rows
    associations = tuple(
        (node_id, terms_to_id[terms], weight)
        for (node_id, terms), weight in nodes_ngrams_count.items()
    )
    bulk_insert(
        table=NodeNgram,
        fields=('node_id', 'ngram_id', 'weight'),
        data=associations,
        cursor=cursor,
    )
    db.commit()
def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )): def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
"""Extract ngrams for every document below the given corpus. """Extract ngrams for every document below the given corpus.
Default language is given by the resource type. Default language is given by the resource type.
...@@ -20,37 +45,28 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstr ...@@ -20,37 +45,28 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstr
resource_type = RESOURCETYPES[resource_type_index] resource_type = RESOURCETYPES[resource_type_index]
default_language_iso2 = resource_type['default_language'] default_language_iso2 = resource_type['default_language']
for document in corpus.children('DOCUMENT'): for document in corpus.children('DOCUMENT'):
# get ngrams extractor for the current document
language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
try:
ngramsextractor = ngramsextractors[language_iso2]
except KeyError:
print('Unrecognized language: `%s`' % (language_iso2, ))
continue
# extract ngrams on each of the considered keys
for key in keys: for key in keys:
value = document.hyperdata.get(key, '') value = document.hyperdata.get(key, None)
if len(value) == 0: if not isinstance(value, str):
continue continue
# get ngrams # get ngrams
language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
ngramsextractor = ngramsextractors[language_iso2]
for ngram in ngramsextractor.extract(value): for ngram in ngramsextractor.extract(value):
tokens = tuple(token[0] for token in ngram) tokens = tuple(token[0] for token in ngram)
terms = ' '.join(tokens) terms = ' '.join(tokens)
nodes_ngrams_count[(document.id, terms)] += 1 nodes_ngrams_count[(document.id, terms)] += 1
ngrams_data.add((terms[:255], len(tokens), )) ngrams_data.add((terms[:255], len(tokens), ))
# integrate ngrams # integrate ngrams and nodes-ngrams
ngrams_ids = bulk_insert_ifnotexists( if len(nodes_ngrams_count) >= 4096:
model = Ngram, _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
uniquekey = 'terms', nodes_ngrams_count.clear()
fields = ('terms', 'n'), ngrams_data.clear()
data = ngrams_data, # integrate ngrams and nodes-ngrams
cursor = cursor, _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
)
db.commit()
# integrate node-ngram associations
nodes_ngrams_data = tuple(
(node_ngram[0], ngrams_ids[node_ngram[1]], count)
for node_ngram, count in nodes_ngrams_count.items()
)
bulk_insert(
table = NodeNgram,
fields = ('node_id', 'ngram_id', 'weight'),
data = nodes_ngrams_data,
cursor = cursor,
)
db.commit()
# the end!
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment