Commit a9b2e3e2 authored by Romain Loth

restore resources loop and make the 'extracted' boolean work with save_hyperdata

parent ff681f29
...@@ -10,31 +10,33 @@ def parse(corpus): ...@@ -10,31 +10,33 @@ def parse(corpus):
try: try:
documents_count = 0 documents_count = 0
corpus.status('Docs', progress=0) corpus.status('Docs', progress=0)
#1 corpus => 1 resource
# shortcut to hyperdata's list of added resources (packs of docs)
resources = corpus.resources() resources = corpus.resources()
#get the sources capabilities for a given corpus resource
sources = [get_resource(resource["type"]) for resource in corpus.resources() if resource["extracted"] is False] # each resource contains a path to a file with the docs
if len(sources) == 0: for i, resource in enumerate(resources):
#>>> documents have already been parsed?????
return # we'll only want the resources that have never been extracted
if len(sources) > 0: if resource["extracted"]:
#>>> necessairement 1 corpus = 1 source dans l'archi actuelle continue
source = sources[0]
resource = resources[0] # the sourcetype's infos
#source.extend(resource) source_infos = get_resource(resource['type'])
if source["parser"] is None:
if source_infos["parser"] is None:
#corpus.status(error) #corpus.status(error)
raise ValueError("Resource '%s' has no Parser" %resource["name"]) raise ValueError("Resource '%s' has no Parser" %resource["name"])
else: else:
#observed langages in corpus docs #observed langages in corpus docs
corpus.languages = defaultdict.fromkeys(source["default_languages"], 0) corpus.languages = defaultdict.fromkeys(source_infos["default_languages"], 0)
#remember the skipped docs in parsing #remember the skipped docs in parsing
skipped_languages = [] skipped_languages = []
corpus.skipped_docs = [] corpus.skipped_docs = []
session.add(corpus) session.add(corpus)
session.commit() session.commit()
#load the corresponding parser #load the corresponding parser
parserbot = load_parser(source) parserbot = load_parser(source_infos)
# extract and insert documents from resource.path into database # extract and insert documents from resource.path into database
default_lang_field = ["language_"+l for l in ["iso2", "iso3", "full_name"]] default_lang_field = ["language_"+l for l in ["iso2", "iso3", "full_name"]]
...@@ -48,7 +50,6 @@ def parse(corpus): ...@@ -48,7 +50,6 @@ def parse(corpus):
hyperdata["error"] = "Error normalize_chars" hyperdata["error"] = "Error normalize_chars"
#any parser should implement a language_iso2 #any parser should implement a language_iso2
if "language_iso2" in hyperdata.keys(): if "language_iso2" in hyperdata.keys():
try: try:
...@@ -109,7 +110,9 @@ def parse(corpus): ...@@ -109,7 +110,9 @@ def parse(corpus):
session.commit() session.commit()
# update info about the resource # update info about the resource
resource['extracted'] = True corpus.hyperdata['resources'][i]['extracted'] = True
corpus.save_hyperdata()
session.commit()
# add a corpus-level info about languages adding a __skipped__ info # add a corpus-level info about languages adding a __skipped__ info
corpus.languages['__skipped__'] = Counter(skipped_languages) corpus.languages['__skipped__'] = Counter(skipped_languages)
print("LANGUES") print("LANGUES")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment