Commit c450b765 authored by c24b

languages + skipped_docs => parsing documents ERROR no lang detected in parsing

parent 38939cdb
@@ -102,7 +102,7 @@ def do_maplist(corpus,
     if n_ngrams == 0:
         raise ValueError("No ngrams in cooc table ?")
-    #return
     # results, with same structure as quotas
     chosen_ngrams = {
         'topgen':{'monograms':[], 'multigrams':[]},
...
+#!/usr/bin/python3 env
 """
 For initial ngram groups via stemming
 Exemple:
@@ -26,7 +27,7 @@ def prepare_stemmers(corpus):
         # always get a generic stemmer in case language code unknown
         '__unknown__' : SnowballStemmer("english")
     }
-    for lang in corpus.hyperdata["languages"].keys():
+    for lang in corpus.languages.keys():
         print(lang)
         if (lang != '__skipped__'):
             lgname = languages[lang].name.lower()
...
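Note on the hunk above: prepare_stemmers() now reads the language census from corpus.languages (filled in at parsing time) instead of corpus.hyperdata["languages"], and the '__skipped__' bookkeeping key is excluded before looking up a stemmer. A minimal sketch of the same selection pattern, assuming NLTK is installed and using a small stand-in mapping in place of the project's languages table:

    from nltk.stem.snowball import SnowballStemmer

    # stand-in for the project's `languages` lookup (ISO code -> language name)
    ISO2NAME = {'en': 'english', 'fr': 'french', 'pt': 'portuguese'}

    def build_stemmers(lang_codes):
        # always keep a generic stemmer in case a language code is unknown
        stemmers = {'__unknown__': SnowballStemmer("english")}
        for code in lang_codes:
            if code == '__skipped__':
                continue                    # bookkeeping key, not a language
            name = ISO2NAME.get(code)
            # unknown codes simply fall back to stemmers['__unknown__']
            if name is not None:
                stemmers[code] = SnowballStemmer(name)
        return stemmers

    stemmers = build_stemmers(['en', 'fr', '__skipped__'])
    print(stemmers['fr'].stem("chanteuses"))   # prints the stemmed French form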
 from gargantext.util.db import *
 from gargantext.models import *
 from gargantext.constants import *
-#from gargantext.util.ngramsextractors import ngramsextractors
 from collections import defaultdict
 from re import sub
@@ -51,23 +49,28 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
     source = get_resource(resource["type"])
     documents_count = 0
     #load available taggers for source default langage
-    tagger_bots = {lang: load_tagger(lang) for lang in source['default_languages']}
     #skipped documents that have been skipped previously for parsing error or unsupported language
-    print(corpus.skipped_docs)
+    tagger_bots = {lang: load_tagger(lang) for lang in corpus.languages if lang != "__skipped__"}
     docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
     #sort docs by lang?
-    docs = sorted(docs, key= lambda k: k.language_iso2)
-    #print(corpus.hyperdata["languages"])
     for documents_count, document in enumerate(docs):
-        lang_doc = document.language_iso2
-        print(lang_doc)
+        try:
+            lang_doc = document.hyperdata["language_iso2"]
+        except AttributeError:
+            print("NO LANG DETECTED")
+            document.status("NGRAMS", error="No lang detected?")
+            corpus.skipped_docs.append(document.id)
+            continue
         for key in keys:
-            value = document.hyperdata.get(key, None)
+            value = document.get(key, None)
+            print("VAL", value)
             if not isinstance(value, str):
                 continue
             # get ngrams
             for ngram in tagger_bots[lang_doc](value):
                 tokens = tuple(normalize_forms(token[0]) for token in ngram)
+                print("tk", tokens)
                 if do_subngrams:
                     # ex tokens = ["very", "cool", "exemple"]
                     # subterms = [['very', 'cool'],
...
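Note on the hunk above: taggers are now built from the corpus's own language census (corpus.languages) rather than from the source defaults, and a document whose hyperdata carries no language_iso2 is flagged, appended to corpus.skipped_docs, and skipped rather than aborting the whole extraction. A rough sketch of that guard, with plain dicts standing in for the ORM documents and a toy callable for the tagger (none of these names are the project's real API):

    # docs and taggers are illustrative stand-ins, not the gargantext ORM
    def tag_documents(docs, taggers, skipped_ids):
        for doc in docs:
            lang = doc.get("language_iso2")
            if lang is None or lang not in taggers:
                # record the document and move on instead of crashing mid-corpus
                skipped_ids.append(doc["id"])
                continue
            yield doc["id"], taggers[lang](doc["abstract"])

    taggers = {"en": str.split}   # toy "tagger": a whitespace tokenizer
    docs = [{"id": 1, "language_iso2": "en", "abstract": "a first document"},
            {"id": 2, "abstract": "no language was detected here"}]
    skipped = []
    for doc_id, tokens in tag_documents(docs, taggers, skipped):
        print(doc_id, tokens)
    print("skipped:", skipped)    # -> skipped: [2]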
@@ -9,32 +9,35 @@ def parse(corpus):
     try:
         documents_count = 0
         corpus.status('Docs', progress=0)
-        #print(corpus.resources())
+        #1 corpus => 1 resource
+        resources = corpus.resources()
         #get the sources capabilities for a given corpus
-        resource = corpus.resources()[0]
-        print(resource)
-        sources = [get_resource(resource["type"]) for resource in corpus.resources()]
+        #print(resource)
+        sources = [get_resource(resource["type"]) for resource in corpus.resources() if resource["extracted"] is False]
         print(sources)
         if len(sources) == 0:
             #>>> documents have already been parsed?????
-            raise ValueError(len(sources))
+            pass
         if len(sources) > 0:
             #>>> necessairement 1 corpus = 1 source dans l'archi actuelle
             source = sources[0]
+            resource = resources[0]
+            #source.extend(resource)
             if source["parser"] is None:
                 #corpus.status(error)
                 raise ValueError("Resource '%s' has no Parser" %resource["name"])
             else:
-                corpus.languages = defaultdict.fromkeys(sources[0]["default_languages"], 0)
+                corpus.languages = defaultdict.fromkeys(source["default_languages"], 0)
                 corpus.skipped_docs = []
                 session.add(corpus)
                 session.commit()
                 #load the corresponding parser
-                resource_parser = load_parser(source)
+                parserbot = load_parser(source)
                 skipped_languages = []
                 # extract and insert documents from resource.path into database
-                print(resource)
-                for hyperdata in resource_parser(resource["path"]):
+                #print(resource["path"])
+                for hyperdata in parserbot(resource["path"]):
                     # indexed text fields defined in constants
                     for k in DEFAULT_INDEX_FIELDS:
                         if k in hyperdata.keys():
@@ -46,12 +49,22 @@ def parse(corpus):
                     # a simple census to raise language info at corpus level
                     if "language_iso2" in hyperdata.keys():
                         try:
-                            corpus.hyperdata["languages"][hyperdata["language_iso2"]] += 1
+                            corpus.languages[hyperdata["language_iso2"]] += 1
                         except KeyError:
+                            print("KeyError", hyperdata["language_iso2"])
+                            hyperdata["error"] = "Error: unsupported language"
+                            skipped_languages.append(hyperdata["language_iso2"])
+                    elif "language_iso3" in hyperdata.keys():
+                        try:
+                            lang = language[hyperdata["language_iso3"]]
+                            corpus.languages[lang] += 1
+                        except KeyError:
+                            print("KeyError", lang)
                             hyperdata["error"] = "Error: unsupported language"
                             skipped_languages.append(hyperdata["language_iso2"])
                     else:
-                        hyperdata["error"] = "Error: no language found"
+                        raise ValueError("PARSING ERROR: No lang detected")
                     # save as DB child
                     # ----------------
                     document = corpus.add_child(
@@ -60,17 +73,25 @@ def parse(corpus):
                         hyperdata = hyperdata,
                     )
                     session.add(document)
-                    if "error" in document.hyperdata.keys():
+                    if "error" in hyperdata.keys():
                         #document.status("error")
+                        print(hyperdata["error"])
                         document.status('Parsing', error= document.hyperdata["error"])
-                        #session.delete(document)
+                        document.save_hyperdata()
+                        session.commit()
                         corpus.skipped_docs.append(document.id)
                     # logging
                     if documents_count % BATCH_PARSING_SIZE == 0:
                         corpus.status('Docs', progress=documents_count)
                         corpus.save_hyperdata()
                         session.add(corpus)
                         session.commit()
                     documents_count += 1
                 # update info about the resource
                 resource['extracted'] = True
...
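Two notes on the parsing hunks above. First, defaultdict.fromkeys(source["default_languages"], 0) pre-registers counters only for the languages the source declares, and fromkeys() does not set a default_factory, so incrementing any other code still raises KeyError; the except branches rely on exactly that to catch unsupported languages. Second, in the new language_iso3 branch, a failed lookup leaves lang unbound when the except clause prints it, and the append reuses the language_iso2 key that this branch by construction does not have, so unsupported iso3 codes look like they will error out here. A self-contained illustration of the census mechanism:

    from collections import defaultdict

    # counters exist only for the declared languages and start at 0;
    # fromkeys() leaves default_factory unset, so other keys still raise
    languages = defaultdict.fromkeys(["en", "fr"], 0)

    skipped = []
    for code in ["en", "fr", "en", "zz"]:   # "zz": an unsupported language
        try:
            languages[code] += 1
        except KeyError:
            skipped.append(code)            # census it as skipped instead

    print(dict(languages))   # {'en': 2, 'fr': 1}
    print(skipped)           # ['zz']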