Commit 87df8fc9 authored by Administrator

[BUGFIX] Europresse parser, bytes to strings.

parent 53f48588
...@@ -12,7 +12,7 @@ from parsing.NgramsExtractors import * ...@@ -12,7 +12,7 @@ from parsing.NgramsExtractors import *
class EuropressFileParser(FileParser): class EuropressFileParser(FileParser):
def _parse(self, file, lang='en'): def _parse(self, file):
localeEncoding = "fr_FR" localeEncoding = "fr_FR"
codif = "UTF-8" codif = "UTF-8"
count = 0 count = 0
...@@ -124,7 +124,7 @@ class EuropressFileParser(FileParser): ...@@ -124,7 +124,7 @@ class EuropressFileParser(FileParser):
metadata['date'] = datetime.now() metadata['date'] = datetime.now()
#if lang == 'fr': #if lang == 'fr':
metadata['language_iso2'] = 'fr' #metadata['language_iso2'] = 'fr'
#elif lang == 'en': #elif lang == 'en':
# metadata['language_iso2'] = 'en' # metadata['language_iso2'] = 'en'
...@@ -142,12 +142,17 @@ class EuropressFileParser(FileParser): ...@@ -142,12 +142,17 @@ class EuropressFileParser(FileParser):
metadata['bdd'] = u'europresse' metadata['bdd'] = u'europresse'
metadata['url'] = u'' metadata['url'] = u''
#metadata_str = {}
for key, value in metadata.items():
metadata[key] = value.decode() if isinstance(value, bytes) else value
metadata_list.append(metadata) metadata_list.append(metadata)
count += 1 count += 1
# from pprint import pprint
# pprint(metadata_list)
# return []
return metadata_list return metadata_list
#
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment