Commit 407b96ab authored by Administrator

[FIX] Bug: ngram terms must be at most 255 characters long

parent 852f71b6
...@@ -128,7 +128,7 @@ def parse_resources(corpus, user=None, user_id=None): ...@@ -128,7 +128,7 @@ def parse_resources(corpus, user=None, user_id=None):
nodes.append(node) nodes.append(node)
# #
# TODO: mark node-resources associations as parsed # TODO: mark node-resources associations as parsed
# #
dbg.show('insert %d documents' % len(nodes)) dbg.show('insert %d documents' % len(nodes))
session.add_all(nodes) session.add_all(nodes)
session.commit() session.commit()
...@@ -205,7 +205,7 @@ def extract_ngrams(corpus, keys): ...@@ -205,7 +205,7 @@ def extract_ngrams(corpus, keys):
language.id: language.iso2 language.id: language.iso2
for language in session.query(Language) for language in session.query(Language)
} }
ngrams_data = set() ngrams_data = set()
ngrams_language_data = set() ngrams_language_data = set()
ngrams_tag_data = set() ngrams_tag_data = set()
...@@ -241,7 +241,7 @@ def extract_ngrams(corpus, keys): ...@@ -241,7 +241,7 @@ def extract_ngrams(corpus, keys):
#tag_id = 14 #tag_id = 14
#print('tag_id_2', tag_id) #print('tag_id_2', tag_id)
node_ngram_list[node_id][terms] += 1 node_ngram_list[node_id][terms] += 1
ngrams_data.add((n, terms)) ngrams_data.add((n, terms[:255]))
ngrams_language_data.add((terms, language_id)) ngrams_language_data.add((terms, language_id))
ngrams_tag_data.add((terms, tag_id)) ngrams_tag_data.add((terms, tag_id))
...@@ -268,7 +268,7 @@ def extract_ngrams(corpus, keys): ...@@ -268,7 +268,7 @@ def extract_ngrams(corpus, keys):
ngram.terms = tmp__ngrams.terms ngram.terms = tmp__ngrams.terms
''' % (Ngram.__table__.name, )) ''' % (Ngram.__table__.name, ))
# insert, then get the ids back # insert, then get the ids back
cursor.execute(''' cursor.execute('''
INSERT INTO INSERT INTO
%s (n, terms) %s (n, terms)
...@@ -279,8 +279,8 @@ def extract_ngrams(corpus, keys): ...@@ -279,8 +279,8 @@ def extract_ngrams(corpus, keys):
WHERE WHERE
id IS NULL id IS NULL
''' % (Ngram.__table__.name, )) ''' % (Ngram.__table__.name, ))
cursor.execute(''' cursor.execute('''
UPDATE UPDATE
tmp__ngrams tmp__ngrams
...@@ -293,14 +293,14 @@ def extract_ngrams(corpus, keys): ...@@ -293,14 +293,14 @@ def extract_ngrams(corpus, keys):
AND AND
tmp__ngrams.id IS NULL tmp__ngrams.id IS NULL
''' % (Ngram.__table__.name, )) ''' % (Ngram.__table__.name, ))
# get all ids # get all ids
ngram_ids = dict() ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngrams') cursor.execute('SELECT id, terms FROM tmp__ngrams')
for row in cursor.fetchall(): for row in cursor.fetchall():
ngram_ids[row[1]] = row[0] ngram_ids[row[1]] = row[0]
# #
dbg.show('insert associations') dbg.show('insert associations')
node_ngram_data = list() node_ngram_data = list()
for node_id, ngrams in node_ngram_list.items(): for node_id, ngrams in node_ngram_list.items():
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment