Commit 97f70d54 authored by c24b

EUROPRESSE

parent cc674dea
......@@ -145,9 +145,9 @@ def get_tagger(lang):
RESOURCETYPES = [
{ "type":1,
'name': 'Europress',
'format': 'Europress',
'parser': "EuropressParser",
'name': 'Europresse',
'format': 'Europresse',
'parser': "EuropresseParser",
'file_formats':["zip"],
'crawler': None,
'default_languages': ['en', 'fr'],
......
......@@ -29,7 +29,7 @@ import sys
from ._Parser import Parser
class EuropressParser(Parser):
class EuropresseParser(Parser):
def parse(self, file):
#print("europr_parser file", file)
......@@ -266,13 +266,3 @@ class EuropressParser(Parser):
except:
print('Something bad happened.')
if __name__ == "__main__":
e = EuropressFileParser()
hyperdata = e.parse(str(sys.argv[1]))
for h in hyperdata:
try:
print(h['journal'], ":", h['publication_date'])
except:
pass
......@@ -48,7 +48,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
documents_count = 0
source = get_resource(resource["type"])
#load only the docs that have passed the parsing without error
docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.hyperdata["skipped_docs"]]
#load the available taggers for the platform's default languages
#print(LANGUAGES.keys())
tagger_bots = {lang: load_tagger(lang) for lang in corpus.hyperdata["languages"] \
......@@ -56,53 +56,54 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
supported_taggers_lang = tagger_bots.keys()
#sort docs by lang?
# for lang, tagger in tagger_bots.items():
for documents_count, document in enumerate(docs):
language_iso2 = document.hyperdata.get('language_iso2')
if language_iso2 not in supported_taggers_lang:
#print("ERROR NO language_iso2")
document.status("NGRAMS", error="Error: unsupported language for tagging")
session.add(document)
session.commit()
corpus.hyperdata["skipped_docs"].append(document.id)
corpus.save_hyperdata()
continue
else:
tagger = tagger_bots[language_iso2]
#print(language_iso2)
#>>> romain-stable-patch
#TODO: verify whether the document has no KEYS to index
for key in keys:
try:
value = document.hyperdata[str(key)]
if not isinstance(value, str):
#print("DBG wrong content in doc for key", key)
for documents_count, document in enumerate(corpus.children('DOCUMENT')):
if doc.id not in corpus.hyperdata["skipped_docs"]:
language_iso2 = document.hyperdata.get('language_iso2')
if language_iso2 not in supported_taggers_lang:
#print("ERROR NO language_iso2")
document.status("NGRAMS", error="Error: unsupported language for tagging")
session.add(document)
session.commit()
corpus.hyperdata["skipped_docs"].append(document.id)
corpus.save_hyperdata()
continue
else:
tagger = tagger_bots[language_iso2]
#print(language_iso2)
#>>> romain-stable-patch
#TODO: verify whether the document has no KEYS to index
for key in keys:
try:
value = document.hyperdata[str(key)]
if not isinstance(value, str):
#print("DBG wrong content in doc for key", key)
continue
# get ngrams
for ngram in tagger.extract(value):
tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],
# ['very', 'cool', 'exemple'],
# ['cool', 'exemple']]
subterms = subsequences(tokens)
else:
subterms = [tokens]
for seqterm in subterms:
ngram = ' '.join(seqterm)
if len(ngram) > 1:
# doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1
# add fields : terms n
ngrams_data.add((ngram[:255], len(seqterm), ))
except:
#value not in doc
continue
# get ngrams
for ngram in tagger.extract(value):
tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],
# ['very', 'cool', 'exemple'],
# ['cool', 'exemple']]
subterms = subsequences(tokens)
else:
subterms = [tokens]
for seqterm in subterms:
ngram = ' '.join(seqterm)
if len(ngram) > 1:
# doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1
# add fields : terms n
ngrams_data.add((ngram[:255], len(seqterm), ))
except:
#value not in doc
continue
# integrate ngrams and nodes-ngrams
if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
......
......@@ -164,7 +164,6 @@ def parse(corpus):
except Exception as error:
corpus.status('Docs', error=error)
corpus.save_hyperdata()
session.commit()
raise error
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment