import re
import locale
from lxml import etree
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser

from .FileParser import FileParser
from ..NgramsExtractors import *
from admin.utils import PrintException


# Encoding the raw bytes are normalised to before being handed to lxml.
_CODIF = "UTF-8"

# Internal identifiers of the two page layouts this parser recognises.
_LAYOUT_TABLE = 50  # articles are /html/body/table (optionally with /tbody)
_LAYOUT_DIV = 1     # articles are <div id="docContain"> elements

# Locales used to read month names in the article headers.
_LOCALE_FR = "fr_FR"
_LOCALE_EN = "en_GB.UTF-8"

# Header patterns; all are applied with ``match`` (anchored at the start).
_JOURNAL_RE = re.compile(r'(.*), (.*)', re.UNICODE)
_DATE_FR_RE = re.compile(r'\d*\s*\w+\s+\d{4}', re.UNICODE)
_DATE_EN_RE = re.compile(r'\w+\s+\d+,\s+\d{4}', re.UNICODE)
_SECTION_RE = re.compile(r'(\D+),', re.UNICODE)
_PAGE_RE = re.compile(r', p. (\w+)', re.UNICODE)

# XPath expressions per layout.  In the div layout the name/header/title/doi
# expressions are absolute ("//"), i.e. evaluated against the whole document.
_XPATHS = {
    _LAYOUT_TABLE: {
        'name': "./tr/td/span[@class = 'DocPublicationName']",
        'header': "./tr/td/span[@class = 'DocHeader']",
        'title': "string(./tr/td/span[@class = 'TitreArticleVisu'])",
        'text': "./tr/td/descendant-or-self::*"
                "[not(self::span[@class='DocHeader'])]/text()",
    },
    _LAYOUT_DIV: {
        'name': "//span[@class = 'DocPublicationName']",
        'header': "//span[@class = 'DocHeader']",
        'title': "string(//div[@class = 'titreArticleVisu'])",
        'text': (
            "./descendant::*["
            "not("
            "self::div[@class='Doc-SourceText'] "
            "or self::span[@class='DocHeader'] "
            "or self::span[@class='DocPublicationName'] "
            "or self::span[@id='docNameVisu'] "
            "or self::div[@class='titreArticleVisu'] "
            "or self::span[@id='docNameContType'] "
            "or descendant-or-self::span[@id='ucPubliC_lblCertificatIssuedTo'] "
            "or descendant-or-self::span[@id='ucPubliC_lblEndDate'] "
            "or self::td[@class='txtCertificat'] "
            ")]/text()"
        ),
        'doi': "//span[@id='ucPubliC_lblNodoc']/text()",
    },
}


def _find_articles(html):
    """Return ``(layout, article_elements)`` for a parsed Europresse page.

    The table layout is tried first (with, then without, ``<tbody>``); the
    ``docContain`` div layout is the fallback, even if it matches nothing.
    """
    for path in ('/html/body/table/tbody', '/html/body/table'):
        articles = html.xpath(path)
        if articles:
            return _LAYOUT_TABLE, articles
    return _LAYOUT_DIV, html.xpath('//div[@id="docContain"]')


def _parse_date(text, locale_name, formats):
    """Parse *text* with the first matching ``strptime`` format, or return None.

    The process-wide locale is switched to *locale_name* so that ``%B``
    recognises that language's month names.
    NOTE(review): the locale is left switched afterwards, as it always was.
    """
    try:
        locale.setlocale(locale.LC_ALL, locale_name)
    except locale.Error as error:
        # Locale not installed on this host: report it, do not abort parsing.
        print(error, text)
        return None
    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None


def _find_authors(html_article):
    """Return the title-cased text following the 2nd ``<br>`` after the title.

    Scans the article for ``<span class="TitreArticleVisu">`` and then counts
    two ``<br>`` elements; the text serialisation of that second ``<br>`` is
    taken as the author line.  Returns ``None`` when the pattern never occurs
    and ``'not found'`` when serialisation fails.  If the pattern occurs
    several times, the last occurrence wins.
    """
    authors = None
    counting = False
    breaks_left = 0
    for node in html_article.iter():
        if node.tag == "span" and node.attrib.get('class') == 'TitreArticleVisu':
            counting = True
            breaks_left = 2
        if counting and node.tag == "br":
            breaks_left -= 1
        if counting and breaks_left == 0:
            try:
                # tostring() returns bytes for a named encoding; decode before
                # title-casing (str.title on bytes raises TypeError).
                raw = etree.tostring(node, method="text", encoding=_CODIF)
                authors = raw.decode(_CODIF).title()
            except Exception:
                authors = 'not found'
            counting = False
    return authors


class EuropressFileParser(FileParser):
    """Parser for Europresse HTML exports, yielding one dict per article."""

    def _parse(self, file):
        """Yield a ``hyperdata`` dict for every non-empty article in *file*.

        Parameters:
            file: a path (``str``), opened here in binary mode, or an already
                open file object.  It is read completely and then closed.

        Yields:
            dict with, when available, the keys ``journal``, ``volume``,
            ``section``, ``page``, ``publication_date`` (``datetime``),
            ``publication_year``/``_month``/``_day`` (strings), ``title``,
            ``authors``, ``abstract``, ``doi``, ``length_words``,
            ``length_letters``, ``bdd`` and ``url``.

        A document that cannot be parsed yields nothing; an article that
        fails is reported and skipped without affecting the others.
        """
        if isinstance(file, str):
            file = open(file, 'rb')
        try:
            contents = file.read()
        finally:
            # Everything is in memory now; close even if read() failed.
            file.close()

        encoding = self.detect_encoding(contents)
        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(_CODIF)
            except Exception:
                PrintException()

        try:
            html = etree.fromstring(contents, etree.HTMLParser(encoding=_CODIF))
            layout, html_articles = _find_articles(html)
        except Exception:
            PrintException()
            return

        for html_article in html_articles:
            # len() of an element is its number of children; skip empty ones.
            if not len(html_article):
                continue
            try:
                hyperdata = self._article_hyperdata(html_article, layout, encoding)
            except Exception as error:
                print(error)
                continue
            yield hyperdata

    def _article_hyperdata(self, html_article, layout, encoding):
        """Build the hyperdata dict of a single article element."""
        xpaths = _XPATHS[layout]
        hyperdata = {}

        # Publication name, optionally "journal, volume".
        for name in html_article.xpath(xpaths['name']):
            if name.text is None:
                continue
            journal_match = _JOURNAL_RE.match(name.text)
            if journal_match is not None:
                hyperdata['journal'] = journal_match.group(1)
                hyperdata['volume'] = journal_match.group(2)
            else:
                hyperdata['journal'] = name.text

        # Each header span may carry a date, a section and/or a page number.
        for header in html_article.xpath(xpaths['header']):
            text = header.text
            if text is None:
                continue
            if isinstance(text, bytes):
                text = text.decode(encoding)

            if _DATE_FR_RE.match(text) is not None:
                # NOTE(review): attribute kept because the original code set
                # it; nothing in this file reads it.
                self.localeEncoding = _LOCALE_FR
                date_text = text
                if encoding != "utf-8":
                    # Repair accented month names damaged by the re-decoding.
                    date_text = date_text.replace('י', 'é')
                    date_text = date_text.replace('ű', 'û')
                    date_text = date_text.replace(' aot ', ' août ')
                parsed = _parse_date(date_text, _LOCALE_FR, ('%d %B %Y', '%B %Y'))
                if parsed is None:
                    # The pattern is language-agnostic ("[day] month year"),
                    # so English month names can land here as well.
                    parsed = _parse_date(date_text, _LOCALE_EN, ('%d %B %Y', '%B %Y'))
                if parsed is None:
                    print("unparsed date", date_text)
                else:
                    hyperdata['publication_date'] = parsed

            if _DATE_EN_RE.match(text) is not None:
                parsed = _parse_date(text, _LOCALE_EN, ('%B %d, %Y', '%B %Y'))
                if parsed is not None:
                    hyperdata['publication_date'] = parsed

            section_match = _SECTION_RE.match(text)
            if section_match is not None:
                hyperdata['section'] = section_match.group(1)

            page_match = _PAGE_RE.match(text)
            if page_match is not None:
                hyperdata['page'] = page_match.group(1)

        # string(...) XPath results are str subclasses; store a plain str.
        hyperdata['title'] = str(html_article.xpath(xpaths['title']))

        # Scan this very article (not an index that can drift out of step).
        authors = _find_authors(html_article)
        if authors is not None:
            hyperdata['authors'] = authors

        # No usable date in the headers: fall back to the current time.
        if 'publication_date' not in hyperdata:
            hyperdata['publication_date'] = timezone.now()
        published = hyperdata['publication_date']
        hyperdata['publication_year'] = published.strftime('%Y')
        hyperdata['publication_month'] = published.strftime('%m')
        hyperdata['publication_day'] = published.strftime('%d')

        # Text fragments of the article; trailing ones are page boilerplate.
        # NOTE(review): the table layout reads the id from the 9th fragment
        # from the end but drops only the last one, while the div layout
        # drops the last nine -- kept as in the original, confirm intent.
        fragments = html_article.xpath(xpaths['text'])
        if fragments and layout == _LAYOUT_TABLE:
            if len(fragments) >= 9:
                hyperdata['doi'] = str(fragments[-9])
            else:
                hyperdata['doi'] = "not found"
            abstract = ' '.join(fragments[:-1])
            abstract = re.sub('Tous droits réservés.*$', '', abstract)
        elif layout == _LAYOUT_DIV:
            hyperdata['doi'] = ' '.join(html_article.xpath(xpaths['doi']))
            abstract = ' '.join(fragments[:-9])
        else:
            hyperdata['doi'] = "not found"
            abstract = ''
        hyperdata['abstract'] = abstract

        hyperdata['length_words'] = len(abstract.split(' '))
        hyperdata['length_letters'] = len(abstract)

        hyperdata['bdd'] = u'europresse'
        hyperdata['url'] = u''
        return hyperdata