Commit c450b765 authored by c24b

languages + skipped_docs => parsing documents: ERROR when no lang detected in parsing

parent 38939cdb
......@@ -102,7 +102,7 @@ def do_maplist(corpus,
if n_ngrams == 0:
raise ValueError("No ngrams in cooc table ?")
#return
# results, with same structure as quotas
chosen_ngrams = {
'topgen':{'monograms':[], 'multigrams':[]},
......
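The quota-filling logic hinted at above fills a result dict that mirrors the quota structure, one bucket per (group, gram-size) pair. A minimal, self-contained sketch of that pattern; the quota values and candidate ngrams are made up, not the real do_maplist inputs:

# minimal sketch of the "same structure as quotas" idea (illustrative values)
quotas = {'topgen': {'monograms': 2, 'multigrams': 2}}

chosen_ngrams = {group: {'monograms': [], 'multigrams': []} for group in quotas}

candidates = [('cell',), ('stem', 'cell'), ('biology',), ('stem', 'cell', 'therapy')]
for ngram in candidates:
    kind = 'monograms' if len(ngram) == 1 else 'multigrams'
    bucket = chosen_ngrams['topgen'][kind]
    if len(bucket) < quotas['topgen'][kind]:
        bucket.append(ngram)

print(chosen_ngrams)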
#!/usr/bin/env python3
"""
For initial ngram groups via stemming
Example:
......@@ -26,7 +27,7 @@ def prepare_stemmers(corpus):
# always get a generic stemmer in case language code unknown
'__unknown__' : SnowballStemmer("english")
}
for lang in corpus.hyperdata["languages"].keys():
for lang in corpus.languages.keys():
print(lang)
if (lang != '__skipped__'):
lgname = languages[lang].name.lower()
......
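After this change prepare_stemmers builds one SnowballStemmer per language recorded in corpus.languages (skipping the '__skipped__' bucket), plus an English fallback for unknown codes. A standalone sketch of that mapping, with stand-ins for corpus.languages and for the languages constant:

from nltk.stem.snowball import SnowballStemmer

# stand-in for corpus.languages: {language code: document count}
corpus_languages = {'en': 12, 'fr': 5, '__skipped__': 2}
# stand-in for the languages constant used to map a code to a stemmer name
lang_names = {'en': 'english', 'fr': 'french'}

stemmers = {'__unknown__': SnowballStemmer("english")}
for lang in corpus_languages:
    if lang != '__skipped__' and lang in lang_names:
        stemmers[lang] = SnowballStemmer(lang_names[lang])

print(stemmers['fr'].stem("cellules"))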
from gargantext.util.db import *
from gargantext.models import *
from gargantext.constants import *
#from gargantext.util.ngramsextractors import ngramsextractors
from collections import defaultdict
from re import sub
......@@ -51,23 +49,28 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
source = get_resource(resource["type"])
documents_count = 0
#load available taggers for the source's default languages
tagger_bots = {lang: load_tagger(lang) for lang in source['default_languages']}
#skip documents that were previously excluded because of a parsing error or an unsupported language
print(corpus.skipped_docs)
tagger_bots = {lang: load_tagger(lang) for lang in corpus.languages if lang != "__skipped__"}
docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
#sort docs by lang?
docs = sorted(docs, key= lambda k: k.language_iso2)
#print(corpus.hyperdata["languages"])
for documents_count, document in enumerate(docs):
lang_doc = document.language_iso2
print(lang_doc)
try:
lang_doc = document.hyperdata["language_iso2"]
except (KeyError, AttributeError):
print("NO LANG DETECTED")
document.status("NGRAMS", error="No lang detected?")
corpus.skipped_docs.append(document.id)
continue
for key in keys:
value = document.hyperdata.get(key, None)
value = document.get(key, None)
print("VAL", value)
if not isinstance(value, str):
continue
# get ngrams
for ngram in tagger_bots[lang_doc](value):
tokens = tuple(normalize_forms(token[0]) for token in ngram)
print("tk", tokens)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],
......
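The new flow in extract_ngrams — one tagger per corpus language, documents filtered against corpus.skipped_docs and grouped by language, and documents without a detected language skipped — can be sketched without the gargantext models. load_tagger, the documents and the corpus attributes below are all fakes:

def load_tagger(lang):
    # stand-in for the real per-language tagger loader
    return lambda text: text.split()

corpus_languages = {'en': 2, 'fr': 1, '__skipped__': 1}
skipped_docs = set()
docs = [
    {'id': 1, 'language_iso2': 'en', 'abstract': 'stem cell biology'},
    {'id': 2, 'language_iso2': 'fr', 'abstract': 'biologie des cellules'},
    {'id': 3, 'language_iso2': None, 'abstract': 'no language detected'},
]

# one tagger per corpus language, never one for the '__skipped__' bucket
tagger_bots = {lang: load_tagger(lang)
               for lang in corpus_languages if lang != '__skipped__'}

# sort so that each tagger works on a contiguous run of documents
docs = sorted(docs, key=lambda d: d['language_iso2'] or '')
for doc in docs:
    lang_doc = doc['language_iso2']
    if lang_doc not in tagger_bots:
        skipped_docs.add(doc['id'])   # plays the role of corpus.skipped_docs
        continue
    print(doc['id'], lang_doc, tagger_bots[lang_doc](doc['abstract']))

print("skipped:", skipped_docs)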
......@@ -9,32 +9,35 @@ def parse(corpus):
try:
documents_count = 0
corpus.status('Docs', progress=0)
#print(corpus.resources())
#1 corpus => 1 resource
resources = corpus.resources()
#get the sources capabilities for a given corpus
resource = corpus.resources()[0]
print(resource)
sources = [get_resource(resource["type"]) for resource in corpus.resources()]
#print(resource)
sources = [get_resource(resource["type"]) for resource in corpus.resources() if resource["extracted"] is False]
print(sources)
if len(sources) == 0:
#>>> documents have already been parsed?????
raise ValueError(len(sources))
pass
if len(sources) > 0:
#>>> necessarily 1 corpus = 1 source in the current architecture
source = sources[0]
resource = resources[0]
#source.extend(resource)
if source["parser"] is None:
#corpus.status(error)
raise ValueError("Resource '%s' has no Parser" %resource["name"])
else:
corpus.languages = defaultdict.fromkeys(sources[0]["default_languages"], 0)
corpus.languages = defaultdict.fromkeys(source["default_languages"], 0)
corpus.skipped_docs = []
session.add(corpus)
session.commit()
#load the corresponding parser
resource_parser = load_parser(source)
parserbot = load_parser(source)
skipped_languages = []
# extract and insert documents from resource.path into database
print(resource)
for hyperdata in resource_parser(resource["path"]):
#print(resource["path"])
for hyperdata in parserbot(resource["path"]):
# indexed text fields defined in constants
for k in DEFAULT_INDEX_FIELDS:
if k in hyperdata.keys():
......@@ -46,12 +49,22 @@ def parse(corpus):
# a simple census to raise language info at corpus level
if "language_iso2" in hyperdata.keys():
try:
corpus.hyperdata["languages"][hyperdata["language_iso2"]] += 1
corpus.languages[hyperdata["language_iso2"]] += 1
except KeyError:
print("KeyError", hyperdata["language_iso2"])
hyperdata["error"] = "Error: unsupported language"
skipped_languages.append(hyperdata["language_iso2"])
elif "language_iso3" in hyperdata.keys():
try:
lang = languages[hyperdata["language_iso3"]].iso2
corpus.languages[lang] += 1
except KeyError:
print("KeyError", hyperdata["language_iso3"])
hyperdata["error"] = "Error: unsupported language"
skipped_languages.append(hyperdata["language_iso3"])
else:
hyperdata["error"] = "Error: no language found"
raise ValueError("PARSING ERROR: No lang detected")
# save as DB child
# ----------------
document = corpus.add_child(
......@@ -60,17 +73,25 @@ def parse(corpus):
hyperdata = hyperdata,
)
session.add(document)
if "error" in document.hyperdata.keys():
if "error" in hyperdata.keys():
#document.status("error")
print(hyperdata["error"])
document.status('Parsing', error= document.hyperdata["error"])
#session.delete(document)
document.save_hyperdata()
session.commit()
corpus.skipped_docs.append(document.id)
# logging
if documents_count % BATCH_PARSING_SIZE == 0:
corpus.status('Docs', progress=documents_count)
corpus.save_hyperdata()
session.add(corpus)
session.commit()
documents_count += 1
# update info about the resource
resource['extracted'] = True
......
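The language census added to parse() boils down to the pattern below. Note that defaultdict.fromkeys(keys, 0), as used in the commit, leaves default_factory unset, so an unsupported language code still raises KeyError and is routed into skipped_languages; the parser output here is faked:

from collections import defaultdict

default_languages = ['en', 'fr']
# same construction as in the commit: missing keys still raise KeyError
corpus_languages = defaultdict.fromkeys(default_languages, 0)
skipped_languages = []

parsed_hyperdata = [{'language_iso2': 'en'}, {'language_iso2': 'de'}, {'language_iso2': 'fr'}]
for hyperdata in parsed_hyperdata:
    try:
        corpus_languages[hyperdata['language_iso2']] += 1
    except KeyError:
        hyperdata['error'] = "Error: unsupported language"
        skipped_languages.append(hyperdata['language_iso2'])

print(dict(corpus_languages))   # {'en': 1, 'fr': 1}
print(skipped_languages)        # ['de']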