Commit 6a2e10fb authored by Administrator

[BUG FIX] Adding try/except for badly encoded corpora.

parent 0c97f772
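
The hunk below wraps the existing per-article loop in a single try/except, so a corpus containing one badly encoded article logs the error instead of crashing the parser. A minimal sketch of that pattern, using illustrative names (parse_article and parse_all are not part of EuropressFileParser):

    # Sketch of the guard pattern this commit applies; parse_article and
    # parse_all are illustrative names, not part of EuropressFileParser.
    def parse_article(raw_bytes):
        # Toy stand-in for the real per-article parsing: decoding is where a
        # badly encoded corpus typically fails.
        return raw_bytes.decode("utf-8")

    def parse_all(articles):
        results = []
        try:
            for article in articles:
                results.append(parse_article(article))
        except Exception as error:
            # Log the failure and return whatever was parsed before it,
            # instead of letting the exception escape the parser.
            print(error)
        return results

    # parse_all([b"ok", b"\xff broken", b"never reached"]) returns ["ok"]

Note that with the guard around the whole loop, articles after the first failure are skipped; a per-article try/except inside the loop would keep processing the rest instead.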
@@ -42,141 +42,150 @@ class EuropressFileParser(FileParser):
        # initialize the list of metadata
        metadata_list = []
        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                metadata = {}
                if len(html_article):
                    for name in html_article.xpath("./tr/td/span[@class = 'DocPublicationName']"):
                        if name.text is not None:
                            format_journal = re.compile('(.*), (.*)', re.UNICODE)
                            test_journal = format_journal.match(name.text)
                            if test_journal is not None:
                                metadata['source'] = test_journal.group(1)
                                metadata['volume'] = test_journal.group(2)
                            else:
                                metadata['source'] = name.text.encode(codif)
                    for header in html_article.xpath("./tr/td/span[@class = 'DocHeader']"):
                        try:
                            text = header.text
                        except Exception as error:
                            print(error)
                        if isinstance(text, bytes):
                            text = text.decode(encoding)
                        format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
                        test_date_fr = format_date_fr.match(text)
                        format_date_en = re.compile('\w+\s+\d+,\s+\d{4}', re.UNICODE)
                        test_date_en = format_date_en.match(text)
                        format_sect = re.compile('(\D+),', re.UNICODE)
                        test_sect = format_sect.match(text)
                        format_page = re.compile(', p. (\w+)', re.UNICODE)
                        test_page = format_page.match(text)
                        if test_date_fr is not None:
                            self.localeEncoding = "fr_FR"
                            locale.setlocale(locale.LC_ALL, localeEncoding)
                            if encoding != "utf-8":
                                text = text.replace('י', 'é')
                                text = text.replace('ű', 'û')
                                text = text.replace(' aot ', ' août ')
                            try:
                                metadata['publication_date'] = datetime.strptime(text, '%d %B %Y')
                            except:
                                try:
                                    metadata['publication_date'] = datetime.strptime(text, '%B %Y')
                                except:
                                    try:
                                        metadata['publication_date'] = dateutil.parser.parse(text)
                                    except Exception as error:
                                        print(error)
                                        print(text)
                                        pass
                        if test_date_en is not None:
                            localeEncoding = "en_GB.UTF-8"
                            locale.setlocale(locale.LC_ALL, localeEncoding)
                            try:
                                metadata['publication_date'] = datetime.strptime(text, '%B %d, %Y')
                            except:
                                try:
                                    metadata['publication_date'] = datetime.strptime(text, '%B %Y')
                                except:
                                    pass
                        if test_sect is not None:
                            metadata['section'] = test_sect.group(1).encode(codif)
                        if test_page is not None:
                            metadata['page'] = test_page.group(1).encode(codif)
                    metadata['title'] = html_article.xpath("string(./tr/td/span[@class = 'TitreArticleVisu'])").encode(codif)
                    metadata['text'] = html_article.xpath("./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()")

                    line = 0
                    br_tag = 10
                    for i in html_articles[count].iter():
                        # print line, br, i, i.tag, i.attrib, i.tail
                        if i.tag == "span":
                            if "class" in i.attrib:
                                if i.attrib['class'] == 'TitreArticleVisu':
                                    line = 1
                                    br_tag = 2
                        if line == 1 and i.tag == "br":
                            br_tag -= 1
                        if line == 1 and br_tag == 0:
                            try:
                                metadata['authors'] = str.title(etree.tostring(i, method="text", encoding=codif)).encode(codif)#.split(';')
                            except:
                                metadata['authors'] = 'not found'
                            line = 0
                            br_tag = 10

                    try:
                        if metadata['publication_date'] is not None or metadata['publication_date'] != '':
                            try:
                                back = metadata['publication_date']
                            except Exception as e:
                                #print(e)
                                pass
                        else:
                            try:
                                metadata['publication_date'] = back
                            except Exception as e:
                                print(e)
                    except:
                        metadata['publication_date'] = timezone.now()

                    #if lang == 'fr':
                    #    metadata['language_iso2'] = 'fr'
                    #elif lang == 'en':
                    #    metadata['language_iso2'] = 'en'

                    metadata['publication_year'] = metadata['publication_date'].strftime('%Y')
                    metadata['publication_month'] = metadata['publication_date'].strftime('%m')
                    metadata['publication_day'] = metadata['publication_date'].strftime('%d')
                    metadata['publication_date'] = ""

                    metadata['object_id'] = str(metadata['text'][-9])
                    metadata['text'].pop()
                    metadata['text'] = str(' '.join(metadata['text']))
                    metadata['text'] = str(re.sub('Tous droits réservés.*$', '', metadata['text']))

                    metadata['bdd'] = u'europresse'
                    metadata['url'] = u''

                    #metadata_str = {}
                    for key, value in metadata.items():
                        metadata[key] = value.decode() if isinstance(value, bytes) else value
                    metadata_list.append(metadata)
                    count += 1
        except Exception as error:
            print(error)
            pass

        # from pprint import pprint
        # pprint(metadata_list)
        # return []
...
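
For the date headers, the parser above tries locale-dependent strptime formats and then falls back to dateutil. A self-contained sketch of that fallback chain, reusing the format strings from the hunk; parse_header_date is an illustrative name, and the requested locale may or may not be installed:

    import locale
    from datetime import datetime
    import dateutil.parser

    def parse_header_date(text, loc="fr_FR"):
        # Illustrative helper mirroring the fallback chain above: set the
        # locale so %B matches French or English month names, try the
        # formats used by the parser, then let dateutil guess.
        try:
            locale.setlocale(locale.LC_TIME, loc)
        except locale.Error:
            pass  # locale not available; strptime keeps the current one
        for fmt in ('%d %B %Y', '%B %d, %Y', '%B %Y'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
        return dateutil.parser.parse(text)

    # e.g. parse_header_date('3 juillet 2014') if the fr_FR locale is
    # installed, or parse_header_date('July 3, 2014', loc='en_GB.UTF-8')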