Commit fa1a268e authored by c24b

load corresponding tagger without ngramsextractors

parent 9fcf90b5
gargantext/util/ngramsextractors.py

from gargantext.util.languages import languages
from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_NPN

import nltk
import re


class NgramsExtractor:

    def __init__(self, tagger):
        self._tagger = tagger()

    @staticmethod
    def clean_text(text):
        """Clean the text for better POS tagging.
        For now, only removes (short) XML tags.
        """
        return re.sub(r'<[^>]{0,45}>', '', text)

    def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
        text = self.clean_text(text)
        grammar = nltk.RegexpParser(label + ': ' + rule)
        tagged_tokens = list(self._tagger.tag_text(text))
        if len(tagged_tokens):
            grammar_parsed = grammar.parse(tagged_tokens)
            for subtree in grammar_parsed.subtrees():
                if subtree.label() == label:
                    if len(subtree) < max_n_words:
                        yield subtree.leaves()
                        # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]


class NgramsExtractors(dict):

    def __missing__(self, key):
        if not isinstance(key, str):
            raise KeyError
        if len(key) == 2 and key == key.lower():
            tagger = LANGUAGES[key]['tagger']
            self[key] = NgramsExtractor(tagger)
        else:
            self[key] = self[LANGUAGES[key].iso3]
        return self[key]


# this below will be shared within the current thread
ngramsextractors = NgramsExtractors()
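For context, the chunking step that extract() delegates to nltk can be reproduced standalone. The NP rule below is a simplified stand-in for RULE_JJNN from gargantext.constants (an assumption, not the project's exact pattern), and the tagged input mimics what a tagger's tag_text() returns:

import nltk

# simplified stand-in for RULE_JJNN (illustrative only)
RULE_NP = '{<JJ.*>*<NN.*>+}'

grammar = nltk.RegexpParser('NP: ' + RULE_NP)
# a tagger's tag_text() yields (token, POS) pairs like these
tagged_tokens = [('wild', 'JJ'), ('pollinators', 'NNS'),
                 ('visit', 'VBP'), ('crops', 'NNS')]
for subtree in grammar.parse(tagged_tokens).subtrees():
    if subtree.label() == 'NP':
        print(subtree.leaves())
# [('wild', 'JJ'), ('pollinators', 'NNS')]
# [('crops', 'NNS')]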
gargantext/util/toolchain/ngrams_extraction.py

from gargantext.util.db import *
from gargantext.models import *
from gargantext.constants import *
#from gargantext.util.ngramsextractors import ngramsextractors

from collections import defaultdict
from re import sub
@@ -47,75 +47,83 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
        nodes_ngrams_count = defaultdict(int)
        ngrams_data = set()
        # extract ngrams
        resource = corpus.resources()[0]
        documents_count = 0
        # load the available taggers for the source's default languages
        taggers_bots = {lang: load_tagger(lang) for lang in resource['default_languages']}
        # skip documents whose language is not among the resource's default languages
        corpus.skipped_docs = [doc.id for doc in corpus.children('DOCUMENT')
                               if doc.hyperdata["language_iso2"] not in resource["default_languages"]]
        print(set(corpus.languages.keys()).intersection(resource["default_languages"]))
        for documents_count, document in enumerate(corpus.children('DOCUMENT')):
            lang_doc = document.hyperdata['language_iso2']
            if document.id not in corpus.skipped_docs:
                # extract ngrams on each of the considered keys
                ngramextractor = taggers_bots[lang_doc]
                for key in keys:
                    value = document.hyperdata.get(key, None)
                    if not isinstance(value, str):
                        continue
                    # get ngrams
                    for ngram in ngramextractor.extract(value):
                        tokens = tuple(normalize_forms(token[0]) for token in ngram)
                        if do_subngrams:
                            # ex: tokens   = ["very", "cool", "exemple"]
                            #     subterms = [['very', 'cool'],
                            #                 ['very', 'cool', 'exemple'],
                            #                 ['cool', 'exemple']]
                            subterms = subsequences(tokens)
                        else:
                            subterms = [tokens]
                        for seqterm in subterms:
                            ngram = ' '.join(seqterm)
                            if len(ngram) > 1:
                                # doc <=> ngram index
                                nodes_ngrams_count[(document.id, ngram)] += 1
                                # add fields: terms, n
                                ngrams_data.add((ngram[:255], len(seqterm), ))
            # integrate ngrams and nodes-ngrams
            if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
                _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
                nodes_ngrams_count.clear()
                ngrams_data.clear()
            if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
                corpus.status('Ngrams', progress=documents_count+1)
                corpus.save_hyperdata()
                session.commit()
        # integrate ngrams and nodes-ngrams
        _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
        corpus.status('Ngrams', progress=documents_count+1, complete=True)
        corpus.save_hyperdata()
        session.commit()
    except Exception as error:
        corpus.status('Ngrams', error=error)
        corpus.save_hyperdata()
        session.commit()
        raise error
def normalize_forms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
......
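subsequences() is defined elsewhere in the project; the inline example in the extraction loop implies it yields every contiguous run of two or more tokens, the full sequence included. A minimal sketch consistent with that example (an assumption, not gargantext's actual helper):

def subsequences(tokens):
    # all contiguous runs of >= 2 tokens, full sequence included
    # (inferred from the documented example, not the real implementation)
    n = len(tokens)
    return [list(tokens[i:j]) for i in range(n) for j in range(i + 2, n + 1)]

# subsequences(("very", "cool", "exemple"))
# -> [['very', 'cool'], ['very', 'cool', 'exemple'], ['cool', 'exemple']]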