Commit 4b37259d authored by PkSM3

[BUGFIX] europress: handle empty text when saving metadata

parent 187f3efa
...@@ -13,13 +13,14 @@ from ..NgramsExtractors import * ...@@ -13,13 +13,14 @@ from ..NgramsExtractors import *
class EuropressFileParser(FileParser): class EuropressFileParser(FileParser):
def _parse(self, file): def _parse(self, file):
localeEncoding = "fr_FR" localeEncoding = "fr_FR"
codif = "UTF-8" codif = "UTF-8"
count = 0 count = 0
if isinstance(file, str): if isinstance(file, str):
file = open(file, 'rb') file = open(file, 'rb')
#print(file) # print(file)
contents = file.read() contents = file.read()
#print(len(contents)) #print(len(contents))
#return [] #return []
...@@ -174,11 +175,14 @@ class EuropressFileParser(FileParser): ...@@ -174,11 +175,14 @@ class EuropressFileParser(FileParser):
metadata['publication_month'] = metadata['publication_date'].strftime('%m') metadata['publication_month'] = metadata['publication_date'].strftime('%m')
metadata['publication_day'] = metadata['publication_date'].strftime('%d') metadata['publication_day'] = metadata['publication_date'].strftime('%d')
metadata['publication_date'] = "" metadata['publication_date'] = ""
if len(metadata['text'])>0:
metadata['doi'] = str(metadata['text'][-9])
metadata['text'].pop()
metadata['text'] = str(' '.join(metadata['text']))
metadata['text'] = str(re.sub('Tous droits réservés.*$', '', metadata['text']))
metadata['object_id'] = str(metadata['text'][-9]) else: metadata['doi'] = "not found"
metadata['text'].pop()
metadata['text'] = str(' '.join(metadata['text']))
metadata['text'] = str(re.sub('Tous droits réservés.*$', '', metadata['text']))
metadata['bdd'] = u'europresse' metadata['bdd'] = u'europresse'
metadata['url'] = u'' metadata['url'] = u''
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment