Commit 403913fc authored by c24b

filtering out ignored docs

parent 961ba2d8
@@ -36,7 +36,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
     db.commit()
-def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_INDEX_SUBGRAMS):
+def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_INDEX_SUBGRAMS):
     """Extract ngrams for every document below the given corpus.
     Default language is given by the resource type.
     The result is then inserted into the database.
@@ -52,78 +52,52 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_INDEX_SUBGRAMS):
         documents_count = 0
         # load available taggers for the source's default languages
         tagger_bots = {lang: load_tagger(lang) for lang in source['default_languages']}
-        # skip documents that have an unsupported language
-        corpus.skipped_docs = [doc.id for doc in corpus.children('DOCUMENT') if doc.hyperdata["language_iso2"] not in source["default_languages"]]
-        print(corpus.hyperdata["languages"])
-        # add it to the corpus language info
-        #diff = set(corpus.hyperdata["languages"].keys()) - set(source["default_languages"])
-        #if len(diff) > 1:
-        #    if lang_doc in corpus.hyperdata['languages']:
-        #        skipped_lg_infos = corpus.hyperdata['languages'].pop(lang_doc)
-        #        corpus.hyperdata['languages']['__skipped__'][lang_doc] = skipped_lg_infos
-        #        corpus.save_hyperdata()
-        #        session.commit()
-        #        continue
-        for documents_count, document in enumerate(corpus.children('DOCUMENT')):
+        # skip documents that were skipped previously for a parsing error or an unsupported language
+        docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
+        #print(corpus.hyperdata["languages"])
+        for documents_count, document in enumerate(docs):
             lang_doc = document.hyperdata['language_iso2']
-            if document not in corpus.skipped_docs:
+            # if document.id in corpus.skipped_docs:
+            #     # get the language of the current document
+            #     # skip document
+            #     print('Unsupported language: `%s` (doc #%i)' % (lang_doc, document.id))
+            #     # and remember that for later processes (e.g. stemming)
+            #     #document.hyperdata['__skipped__'] = 'ngrams_extraction'
+            #     #document.save_hyperdata()
+            #     #session.commit()
+            #     continue
+            #
+            #
+            # else:
+            # extract ngrams on each of the considered keys
-                ngramextractor = tagger_bots[lang_doc]
-                for key in keys:
-                    value = document.hyperdata.get(key, None)
-                    if not isinstance(value, str):
-                        continue
+            ngramextractor = tagger_bots[lang_doc]
+            for key in keys:
+                value = document.hyperdata.get(key, None)
+                if not isinstance(value, str):
+                    continue
-                    # get ngrams
-                    for ngram in ngramextractor.extract(value):
-                        tokens = tuple(normalize_forms(token[0]) for token in ngram)
-                        if do_subngrams:
-                            # ex tokens = ["very", "cool", "exemple"]
-                            # subterms = [['very', 'cool'],
-                            #             ['very', 'cool', 'exemple'],
-                            #             ['cool', 'exemple']]
-                            subterms = subsequences(tokens)
-                        else:
-                            subterms = [tokens]
-                        for seqterm in subterms:
-                            ngram = ' '.join(seqterm)
-                            if len(ngram) > 1:
-                                # doc <=> ngram index
-                                nodes_ngrams_count[(document.id, ngram)] += 1
-                                # add fields: terms, n
-                                ngrams_data.add((ngram[:255], len(seqterm), ))
-                # integrate ngrams and nodes-ngrams
-                if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
-                    _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
-                    nodes_ngrams_count.clear()
-                    ngrams_data.clear()
-                if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
-                    corpus.status('Ngrams', progress=documents_count+1)
-                    corpus.save_hyperdata()
-                    session.commit()
+                for ngram in ngramextractor.extract(value):
+                    tokens = tuple(normalize_forms(token[0]) for token in ngram)
+                    if do_subngrams:
+                        # ex tokens = ["very", "cool", "exemple"]
+                        # subterms = [['very', 'cool'],
+                        #             ['very', 'cool', 'exemple'],
+                        #             ['cool', 'exemple']]
+                        subterms = subsequences(tokens)
+                    else:
+                        subterms = [tokens]
+                    for seqterm in subterms:
+                        ngram = ' '.join(seqterm)
+                        if len(ngram) > 1:
+                            # doc <=> ngram index
+                            nodes_ngrams_count[(document.id, ngram)] += 1
+                            # add fields: terms, n
+                            ngrams_data.add((ngram[:255], len(seqterm), ))
-        # integrate ngrams and nodes-ngrams
-        _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
-        corpus.status('Ngrams', progress=documents_count+1, complete=True)
-        corpus.save_hyperdata()
-        session.commit()
+            if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
+                _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
+                nodes_ngrams_count.clear()
+                ngrams_data.clear()
+            if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
+                corpus.status('Ngrams', progress=documents_count+1)
+                corpus.save_hyperdata()
+                session.commit()
+        # integrate ngrams and nodes-ngrams
+        _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
+        corpus.status('Ngrams', progress=documents_count+1, complete=True)
+        corpus.save_hyperdata()
+        session.commit()
     except Exception as error:
         corpus.status('Ngrams', error=error)
         corpus.save_hyperdata()
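
Note: the removed lines computed corpus.skipped_docs inside this function from each document's language; after this commit the function assumes corpus.skipped_docs was already populated upstream (at parsing time, per the new comment) and only filters on it. A minimal sketch of that assumed upstream contract, with `error` as a hypothetical parsing-failure flag:

# Hypothetical sketch (not part of this commit): how a parsing step could
# populate corpus.skipped_docs so that the filter above has ids to work with.
corpus.skipped_docs = [
    doc.id
    for doc in corpus.children('DOCUMENT')
    if doc.hyperdata.get('error')  # assumed flag set on a parsing error
    or doc.hyperdata.get('language_iso2') not in source['default_languages']
]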
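
The do_subngrams branch calls subsequences, which is defined elsewhere in the code base. Judging only from the inline example in the diff, it yields every contiguous sub-sequence of at least two tokens; a minimal sketch under that assumption:

def subsequences(tokens):
    # Hypothetical reimplementation inferred from the comment's example;
    # the real helper lives elsewhere in the gargantext code base.
    n = len(tokens)
    return [list(tokens[start:end])
            for start in range(n)
            for end in range(start + 2, n + 1)]

# subsequences(("very", "cool", "exemple"))
# -> [['very', 'cool'], ['very', 'cool', 'exemple'], ['cool', 'exemple']]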
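
The two call sites of _integrate_associations (guarded inside the loop, unconditional after it) are the standard batched-insert pattern: flush whenever the buffer reaches BATCH_NGRAMSEXTRACTION_SIZE, then flush once more so the final partial batch is not lost. A standalone sketch of the pattern, with an assumed batch size and a generic integrate callback:

from collections import defaultdict

BATCH_NGRAMSEXTRACTION_SIZE = 5000  # assumed value; the real one lives in the project constants

def index_pairs(pairs, integrate):
    # pairs yields (doc_id, ngram) tuples; integrate(counts, data) persists one batch.
    nodes_ngrams_count = defaultdict(int)
    ngrams_data = set()
    for doc_id, ngram in pairs:
        nodes_ngrams_count[(doc_id, ngram)] += 1
        ngrams_data.add((ngram[:255], len(ngram.split())))
        if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
            integrate(nodes_ngrams_count, ngrams_data)
            nodes_ngrams_count.clear()
            ngrams_data.clear()
    # final flush: persist the last, possibly partial, batch
    integrate(nodes_ngrams_count, ngrams_data)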