Commit fa1a268e authored by c24b

load corresponding tagger without ngramsextractors

parent 9fcf90b5
gargantext/util/ngramsextractors.py

from gargantext.util.languages import languages
from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_NPN

import nltk
import re


class NgramsExtractor:

    def __init__(self, tagger):
        self._tagger = tagger()

    @staticmethod
    def clean_text(text):
        """Clean the text for better POS tagging.
        For now, only removes (short) XML tags.
        """
        return re.sub(r'<[^>]{0,45}>', '', text)

    def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
        text = self.clean_text(text)
        grammar = nltk.RegexpParser(label + ': ' + rule)
        tagged_tokens = list(self._tagger.tag_text(text))
        if len(tagged_tokens):
            grammar_parsed = grammar.parse(tagged_tokens)
            for subtree in grammar_parsed.subtrees():
                if subtree.label() == label:
                    if len(subtree) < max_n_words:
                        yield subtree.leaves()
                        # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]


class NgramsExtractors(dict):

    def __missing__(self, key):
        if not isinstance(key, str):
            raise KeyError
        if len(key) == 2 and key == key.lower():
            tagger = LANGUAGES[key]['tagger']
            self[key] = NgramsExtractor(tagger)
        else:
            self[key] = self[LANGUAGES[key].iso3]
        return self[key]


# this below will be shared within the current thread
ngramsextractors = NgramsExtractors()
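
For reference, NgramsExtractors builds its entries lazily through dict.__missing__: a two-letter lowercase key instantiates the tagger registered for that language in LANGUAGES, and any other key falls back to the iso3 alias of its LANGUAGES entry. A minimal usage sketch (illustrative only; it assumes LANGUAGES['en']['tagger'] is a POS-tagger class exposing tag_text(), as the extractor expects):

# Illustrative sketch: the 'en' key and the sample sentence are assumptions.
extractor = ngramsextractors['en']            # first access builds and caches the extractor
assert ngramsextractors['en'] is extractor    # later accesses reuse the cached instance
for leaves in extractor.extract('Wild pollinators visit the flowers.'):
    print(leaves)    # e.g. [('wild', 'JJ'), ('pollinators', 'NNS')]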
 from gargantext.util.db import *
 from gargantext.models import *
 from gargantext.constants import *
-from gargantext.util.ngramsextractors import ngramsextractors
+#from gargantext.util.ngramsextractors import ngramsextractors
 from collections import defaultdict
 from re import sub
...
@@ -47,30 +47,38 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
     nodes_ngrams_count = defaultdict(int)
     ngrams_data = set()
     # extract ngrams
-    resource_type_index = corpus.resources()[0]['type']
+    resource = corpus.resources()[0]
     documents_count = 0
-    resource_type = RESOURCETYPES[resource_type_index]
-    default_language_iso2 = resource_type['default_languages']
+    # load the available taggers for the source's default languages
+    taggers_bots = {lang: load_tagger(lang) for lang in resource['default_languages']}
+    # skip documents that have an unsupported language
+    corpus.skipped_docs = [doc.id for doc in corpus.children('DOCUMENT') if doc.hyperdata['language_iso2'] not in resource['default_languages']]
+    print(set(corpus.languages.keys()).intersection(resource['default_languages']))
     for documents_count, document in enumerate(corpus.children('DOCUMENT')):
-        # get ngrams extractor for the current document
-        language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
-        try:
-            # this looks for a tagger in constants.LANGUAGES
-            ngramsextractor = ngramsextractors[language_iso2]
-        except KeyError:
-            # skip document
-            print('Unsupported language: `%s` (doc #%i)' % (language_iso2, document.id))
-            # and remember that for later processes (eg stemming)
-            document.hyperdata['__skipped__'] = 'ngrams_extraction'
-            document.save_hyperdata()
-            session.commit()
-            if language_iso2 in corpus.hyperdata['languages']:
-                skipped_lg_infos = corpus.hyperdata['languages'].pop(language_iso2)
-                corpus.hyperdata['languages']['__skipped__'][language_iso2] = skipped_lg_infos
-                corpus.save_hyperdata()
-                session.commit()
-            continue
-        # extract ngrams on each of the considered keys
-        for key in keys:
-            value = document.hyperdata.get(key, None)
-            if not isinstance(value, str):
+        lang_doc = document.hyperdata['language_iso2']
+        if document.id not in corpus.skipped_docs:
+            # extract ngrams on each of the considered keys
+            ngramextractor = taggers_bots[lang_doc]
+            for key in keys:
+                value = document.hyperdata.get(key, None)
+                if not isinstance(value, str):
...
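
The load_tagger helper used above is defined outside this hunk. A minimal sketch consistent with the commit message (assumption only: it instantiates the tagger class registered in constants.LANGUAGES directly, without the NgramsExtractor wrapper that the old code path used):

# Hypothetical sketch; load_tagger is not shown in this diff.
from gargantext.constants import LANGUAGES

def load_tagger(lang):
    # instantiate the POS-tagger class registered for this iso2 code
    return LANGUAGES[lang]['tagger']()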