Commit 69333ff9 authored by Administrator

[CLEAN] removing print debug.

parent 40cee908
...@@ -11,7 +11,7 @@ from ..NgramsExtractors import * ...@@ -11,7 +11,7 @@ from ..NgramsExtractors import *
from admin.utils import PrintException from admin.utils import PrintException
class EuropressFileParser(FileParser): class EuropressFileParser(FileParser):
def _parse(self, file): def _parse(self, file):
localeEncoding = "fr_FR" localeEncoding = "fr_FR"
...@@ -39,21 +39,21 @@ class EuropressFileParser(FileParser): ...@@ -39,21 +39,21 @@ class EuropressFileParser(FileParser):
try: try:
html_parser = etree.HTMLParser(encoding=codif) html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser) html = etree.fromstring(contents, html_parser)
try : try :
format_europresse = 50 format_europresse = 50
html_articles = html.xpath('/html/body/table/tbody') html_articles = html.xpath('/html/body/table/tbody')
if len(html_articles) < 1: if len(html_articles) < 1:
html_articles = html.xpath('/html/body/table') html_articles = html.xpath('/html/body/table')
if len(html_articles) < 1: if len(html_articles) < 1:
format_europresse = 1 format_europresse = 1
html_articles = html.xpath('//div[@id="docContain"]') html_articles = html.xpath('//div[@id="docContain"]')
except : except :
PrintException() PrintException()
if format_europresse == 50 : if format_europresse == 50 :
name_xpath = "./tr/td/span[@class = 'DocPublicationName']" name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
header_xpath = "./tr/td/span[@class = 'DocHeader']" header_xpath = "./tr/td/span[@class = 'DocHeader']"
...@@ -77,7 +77,7 @@ class EuropressFileParser(FileParser): ...@@ -77,7 +77,7 @@ class EuropressFileParser(FileParser):
or self::td[@class='txtCertificat'] \ or self::td[@class='txtCertificat'] \
)]/text()" )]/text()"
doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()" doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
except Exception as error : except Exception as error :
PrintException() PrintException()
...@@ -85,9 +85,9 @@ class EuropressFileParser(FileParser): ...@@ -85,9 +85,9 @@ class EuropressFileParser(FileParser):
# parse all the articles, one by one # parse all the articles, one by one
try: try:
for html_article in html_articles: for html_article in html_articles:
hyperdata = {} hyperdata = {}
if len(html_article): if len(html_article):
for name in html_article.xpath(name_xpath): for name in html_article.xpath(name_xpath):
if name.text is not None: if name.text is not None:
...@@ -98,26 +98,26 @@ class EuropressFileParser(FileParser): ...@@ -98,26 +98,26 @@ class EuropressFileParser(FileParser):
hyperdata['volume'] = test_journal.group(2) hyperdata['volume'] = test_journal.group(2)
else: else:
hyperdata['journal'] = name.text.encode(codif) hyperdata['journal'] = name.text.encode(codif)
countbis = 0 countbis = 0
for header in html_article.xpath(header_xpath): for header in html_article.xpath(header_xpath):
# print(count) # print(count)
# countbis += 1 # countbis += 1
# try: # try:
# print('109', hyperdata['publication_date']) # print('109', hyperdata['publication_date'])
# except: # except:
# print('no date yet') # print('no date yet')
# pass # pass
try: try:
text = header.text text = header.text
#print("header", text) #print("header", text)
except Exception as error: except Exception as error:
print(error) print(error)
if isinstance(text, bytes): if isinstance(text, bytes):
text = text.decode(encoding) text = text.decode(encoding)
format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE) format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
...@@ -134,9 +134,9 @@ class EuropressFileParser(FileParser): ...@@ -134,9 +134,9 @@ class EuropressFileParser(FileParser):
test_date_en = None test_date_en = None
test_sect = None test_sect = None
test_page = None test_page = None
if test_date_fr is not None: if test_date_fr is not None:
self.localeEncoding = "fr_FR" self.localeEncoding = "fr_FR"
locale.setlocale(locale.LC_ALL, localeEncoding) locale.setlocale(locale.LC_ALL, localeEncoding)
...@@ -158,7 +158,7 @@ class EuropressFileParser(FileParser): ...@@ -158,7 +158,7 @@ class EuropressFileParser(FileParser):
except Exception as error: except Exception as error:
print(error, text) print(error, text)
pass pass
if test_date_en is not None: if test_date_en is not None:
localeEncoding = "en_GB.UTF-8" localeEncoding = "en_GB.UTF-8"
...@@ -173,20 +173,20 @@ class EuropressFileParser(FileParser): ...@@ -173,20 +173,20 @@ class EuropressFileParser(FileParser):
if test_sect is not None: if test_sect is not None:
hyperdata['section'] = test_sect.group(1).encode(codif) hyperdata['section'] = test_sect.group(1).encode(codif)
if test_page is not None: if test_page is not None:
hyperdata['page'] = test_page.group(1).encode(codif) hyperdata['page'] = test_page.group(1).encode(codif)
try: # try:
print('183', hyperdata['publication_date']) # print('183', hyperdata['publication_date'])
except: # except:
print('no date yet') # print('no date yet')
pass # pass
#
hyperdata['title'] = html_article.xpath(title_xpath).encode(codif) hyperdata['title'] = html_article.xpath(title_xpath).encode(codif)
hyperdata['abstract'] = html_article.xpath(text_xpath) hyperdata['abstract'] = html_article.xpath(text_xpath)
line = 0 line = 0
br_tag = 10 br_tag = 10
for i in html_articles[count].iter(): for i in html_articles[count].iter():
...@@ -205,13 +205,13 @@ class EuropressFileParser(FileParser): ...@@ -205,13 +205,13 @@ class EuropressFileParser(FileParser):
hyperdata['authors'] = 'not found' hyperdata['authors'] = 'not found'
line = 0 line = 0
br_tag = 10 br_tag = 10
try: try:
if hyperdata['publication_date'] is not None or hyperdata['publication_date'] != '': if hyperdata['publication_date'] is not None or hyperdata['publication_date'] != '':
try: try:
back = hyperdata['publication_date'] back = hyperdata['publication_date']
except Exception as e: except Exception as e:
#print(e) #print(e)
pass pass
else: else:
...@@ -226,14 +226,14 @@ class EuropressFileParser(FileParser): ...@@ -226,14 +226,14 @@ class EuropressFileParser(FileParser):
#hyperdata['language_iso2'] = 'fr' #hyperdata['language_iso2'] = 'fr'
#elif lang == 'en': #elif lang == 'en':
# hyperdata['language_iso2'] = 'en' # hyperdata['language_iso2'] = 'en'
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y') hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m') hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d') hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
#hyperdata.pop('publication_date') #hyperdata.pop('publication_date')
if len(hyperdata['abstract'])>0 and format_europresse == 50: if len(hyperdata['abstract'])>0 and format_europresse == 50:
hyperdata['doi'] = str(hyperdata['abstract'][-9]) hyperdata['doi'] = str(hyperdata['abstract'][-9])
hyperdata['abstract'].pop() hyperdata['abstract'].pop()
# Here add separator for paragraphs # Here add separator for paragraphs
...@@ -245,15 +245,15 @@ class EuropressFileParser(FileParser): ...@@ -245,15 +245,15 @@ class EuropressFileParser(FileParser):
# Here add separator for paragraphs # Here add separator for paragraphs
hyperdata['abstract'] = str(' '.join(hyperdata['abstract'])) hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))
else: else:
hyperdata['doi'] = "not found" hyperdata['doi'] = "not found"
hyperdata['length_words'] = len(hyperdata['abstract'].split(' ')) hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
hyperdata['length_letters'] = len(hyperdata['abstract']) hyperdata['length_letters'] = len(hyperdata['abstract'])
hyperdata['bdd'] = u'europresse' hyperdata['bdd'] = u'europresse'
hyperdata['url'] = u'' hyperdata['url'] = u''
#hyperdata_str = {} #hyperdata_str = {}
for key, value in hyperdata.items(): for key, value in hyperdata.items():
hyperdata[key] = value.decode() if isinstance(value, bytes) else value hyperdata[key] = value.decode() if isinstance(value, bytes) else value
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment