Commit f93d4266 authored by c24b

LANG => tagger + stemmer

parent 7c61a9fa
......@@ -131,8 +131,7 @@ def get_resource_by_name(sourcename):
# taggers -----------------------------------------------
def get_tagger(lang):
'''
lang => default language[0] => Tagger
lang => observed language[0] => Tagger
'''
name = LANGUAGES[lang]["tagger"]
module = "gargantext.util.taggers.%s" %(name)
......@@ -150,7 +149,6 @@ RESOURCETYPES = [
'parser': "EuropresseParser",
'file_formats':["zip", "txt"],
'crawler': None,
'default_languages': ['en', 'fr'],
},
{ 'type': 2,
'name': 'Jstor [RIS]',
......@@ -158,7 +156,6 @@ RESOURCETYPES = [
'parser': "RISParser",
'file_formats':["zip", "txt"],
'crawler': None,
'default_languages': ['en'],
},
{ 'type': 3,
'name': 'Pubmed [XML]',
......@@ -166,7 +163,6 @@ RESOURCETYPES = [
'parser': "PubmedParser",
'file_formats':["zip", "xml"],
'crawler': "PubmedCrawler",
'default_languages': ['en'],
},
{ 'type':4,
'name': 'Scopus [RIS]',
......@@ -174,7 +170,6 @@ RESOURCETYPES = [
'parser': "RISParser",
'file_formats':["zip", "txt"],
'crawler': None,
'default_languages': ['en'],
},
{ 'type':5,
'name': 'Web of Science [ISI]',
......@@ -183,7 +178,6 @@ RESOURCETYPES = [
'file_formats':["zip", "txt"],
#'crawler': "ISICrawler",
'crawler': None,
'default_languages': ['en'],
},
{ 'type':6,
'name': 'Zotero [RIS]',
......@@ -191,7 +185,6 @@ RESOURCETYPES = [
'parser': 'RISParser',
'file_formats':["zip", "ris", "txt"],
'crawler': None,
'default_languages': ['en'],
},
{ 'type':7,
'name': 'CSV',
......@@ -199,7 +192,6 @@ RESOURCETYPES = [
'parser': 'CSVParser',
'file_formats':["zip", "csv"],
'crawler': None,
'default_languages': ['en'],
},
{ 'type': 8,
'name': 'ISTex',
......@@ -207,7 +199,6 @@ RESOURCETYPES = [
'parser': "ISTexParser",
'file_formats':["zip", "txt"],
'crawler': None,
'default_languages': ['en', 'fr'],
},
{ "type":9,
"name": 'SCOAP [XML]',
......@@ -215,7 +206,6 @@ RESOURCETYPES = [
"format": 'MARC21',
'file_formats':["zip","xml"],
"crawler": "CernCrawler",
'default_languages': ['en'],
},
{ "type":10,
"name": 'REPEC [RIS]',
......@@ -223,7 +213,6 @@ RESOURCETYPES = [
"format": 'RIS',
'file_formats':["zip","ris", "txt"],
"crawler": None,
'default_languages': ['en'],
},
]
#shortcut for resources declaration in template
......@@ -278,13 +267,11 @@ def load_tagger(lang):
given a LANG load the corresponding tagger
lang(str) > Tagger(Object)
'''
try:
filename = LANGUAGES[lang]["tagger"]
module = 'gargantext.util.taggers.%s' %(filename)
module = importlib.import_module(module)
return getattr(module, filename)()
except:
raise ImportError("No tagger for this lang %s TIP: declare a new parser in LANGUAGES" %lang)
filename = LANGUAGES[lang]["tagger"]
module = 'gargantext.util.taggers.%s' %(filename)
module = importlib.import_module(module)
return getattr(module, filename)()
# linguistic extraction parameters ---------------------------------------------
......@@ -361,14 +348,6 @@ QUERY_SIZE_N_MAX = 1000
QUERY_SIZE_N_DEFAULT = 1000
# Grammar rules for chunking
RULE_JJNN = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
RULE_NPN = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
RULE_TINA = "^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)\
+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?\
,){0,2}?(N.?.?,|\?,)+?)+?)*?$"
# ------------------------------------------------------------------------------
# Graph constraints to compute the graph:
# Modes: live graph generation, graph asynchronously computed or errors detected
......
......@@ -8,7 +8,7 @@ class NltkTagger(Tagger):
#import nltk
def __init__(self, *args, **kwargs):
self.tagr = PerceptronTagger()
super(self.__class__, self).__init__(*args, **kwargs)
#super(self.__class__, self).__init__(*args, **kwargs)
#def __start__(self):
#~ self.tagr = PerceptronTagger()
......@@ -16,14 +16,14 @@ class NltkTagger(Tagger):
def tag_tokens(self, tokens, single=True):
return self.tagr.tag(tokens)
def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
self.text = self.clean_text(text)
grammar = nltk.RegexpParser(label + ': ' + rule)
tagged_tokens = list(self.tag_text(self.text))
if len(tagged_tokens):
grammar_parsed = grammar.parse(tagged_tokens)
for subtree in grammar_parsed.subtrees():
if subtree.label() == label:
if len(subtree) < max_n_words:
yield subtree.leaves()
# ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
# def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
# self.text = self.clean_text(text)
# grammar = nltk.RegexpParser(label + ': ' + rule)
# tagged_tokens = list(self.tag_text(self.text))
# if len(tagged_tokens):
# grammar_parsed = grammar.parse(tagged_tokens)
# for subtree in grammar_parsed.subtrees():
# if subtree.label() == label:
# if len(subtree) < max_n_words:
# yield subtree.leaves()
# # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
......@@ -26,8 +26,7 @@ def prepare_stemmers(corpus):
and formatted
"""
supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"]\
if lang != "__unknown__" \
if lang in LANGUAGES.keys()]
if lang != "__unknown__" ]
stemmers = {lang:SnowballStemmer(languages[lang].name.lower()) for lang \
in supported_stemmers_lang}
stemmers['__unknown__'] = SnowballStemmer("english")
......@@ -51,9 +50,8 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
# and group if same stem/lemma
stemmers = prepare_stemmers(corpus)
print("# STEMMERS LOADED", stemmers)
supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"] \
if lang != "__unknown__" \
and lang in LANGUAGES.keys()]
supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"] if lang != "__unknown__"]
print("#SUPPORTED STEMMERS LANGS", supported_stemmers_lang)
# todo dict {lg => {ngrams_todo} }
todo_ngrams_per_lg = defaultdict(set)
......@@ -64,8 +62,8 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
# preloop per doc to sort ngrams by language
for doc in corpus.children('DOCUMENT'):
if doc.id not in corpus.hyperdata['skipped_docs']:
if ('language_iso2' in doc.hyperdata) \
and doc.hyperdata['language_iso2'] in supported_stemmers_lang:
if ('language_iso2' in doc.hyperdata) and doc.hyperdata['language_iso2'] \
in supported_stemmers_lang:
lgid = doc.hyperdata['language_iso2']
else:
......
......@@ -52,7 +52,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
#load available taggers for default language of platform
#print(LANGUAGES.keys())
tagger_bots = {lang: load_tagger(lang) for lang in corpus.hyperdata["languages"] \
if lang != "__unknown__" and lang in LANGUAGES.keys()}
if lang != "__unknown__"}
print("#TAGGERS LOADED: ", tagger_bots)
supported_taggers_lang = tagger_bots.keys()
print("#SUPPORTED TAGGER LANGS", supported_taggers_lang)
......
......@@ -14,8 +14,10 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
'''
if "language_iso2" in hyperdata.keys():
observed_languages.append(hyperdata["language_iso2"])
if hyperdata["language_iso2"] not in LANGUAGES.keys():
skipped_languages.append(hyperdata["language_iso2"])
return observed_languages,skipped_languages
observed_languages[hyperdata["language_iso2"]]
return observed_languages,skipped_languages
......@@ -23,6 +25,9 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
#convert
try:
lang = languages[hyperdata["language_iso3"]].iso2
if lang not in LANGUAGES.keys():
skipped_languages.append(lang)
return observed_languages,skipped_languages
observed_languages.append(lang)
return observed_languages,skipped_languages
except KeyError:
......@@ -35,6 +40,9 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
try:
#convert
lang = hyperdata["language_fullname"].iso2
if lang not in LANGUAGES.keys():
skipped_languages.append(lang)
return observed_languages,skipped_languages
observed_languages.append(lang)
return observed_languages,skipped_languages
except KeyError:
......@@ -59,6 +67,9 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
else:
#detect_lang return iso2
lang = detect_lang(text)
if lang not in LANGUAGES.keys():
skipped_languages.append(lang)
return observed_languages,skipped_languages
observed_languages.append(lang)
return observed_languages,skipped_languages
......@@ -80,6 +91,7 @@ def parse(corpus):
#corpus.status(error)
raise ValueError("Resource '%s' has no Parser" %resource["name"])
parserbot = load_parser(source)
print(parserbot)
#observed languages in default languages
observed_languages = []
#skipped_languages
......@@ -167,14 +179,7 @@ def parse(corpus):
corpus.save_hyperdata()
#TODO: assign main lang of the corpus to unsupported languages docs
# for d_id in corpus.skipped_docs:
# document = session.query(Node).filter(Node.id == d_id, Node.typename == "DOCUMENT").first()
# if document.hyperdata["error"].startswith("Error: unsupported language"):
# print(document.hyperdata["language_iso2"])
# document.hyperdata["language_iso2"] = corpus.language_id
# document.save_hyperdata()
# session.commit()
except Exception as error:
corpus.status('Docs', error=error)
corpus.save_hyperdata()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment