# EuropressFileParser.py 9.29 KB
import re
import locale
from lxml import etree
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser

from .FileParser import FileParser
from ..NgramsExtractors import *



class EuropressFileParser(FileParser):
    """Parser for Europresse HTML exports.

    Every ``<table>`` (or its ``<tbody>``) found directly under
    ``/html/body`` is treated as one article and converted into a metadata
    dictionary.
    """

    # Encoding handed to the lxml HTML parser and used when re-encoding input.
    _CODIF = "UTF-8"

    # Patterns applied to the 'DocPublicationName' and 'DocHeader' spans.
    # They are raw strings (no invalid escape sequences) compiled once for
    # the class instead of once per header.
    _JOURNAL_RE = re.compile(r'(.*), (.*)', re.UNICODE)
    _DATE_FR_RE = re.compile(r'\d*\s*\w+\s+\d{4}', re.UNICODE)
    _DATE_EN_RE = re.compile(r'\w+\s+\d+,\s+\d{4}', re.UNICODE)
    _SECTION_RE = re.compile(r'(\D+),', re.UNICODE)
    _PAGE_RE = re.compile(r', p. (\w+)', re.UNICODE)

    def _parse(self, file):
        """Parse a Europresse HTML export into a list of metadata dicts.

        Args:
            file: either a path (``str``), which is opened in binary mode and
                closed again here, or an already opened object exposing
                ``read()``, which is left open for the caller.

        Returns:
            A list with one dictionary per non-empty article table. An empty
            list is returned when the document cannot be parsed at all.
            Articles that raise while being processed are reported on stdout
            and skipped; the remaining articles are still returned.
        """
        codif = self._CODIF

        if isinstance(file, str):
            # We own the handle in this case, so make sure it gets closed.
            with open(file, 'rb') as handle:
                contents = handle.read()
        else:
            contents = file.read()

        encoding = self.detect_encoding(contents)
        if encoding != "utf-8":
            # Anything not detected as UTF-8 is read as latin1 and re-encoded.
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except Exception as error:
                print(error)

        try:
            html_parser = etree.HTMLParser(encoding=codif)
            html = etree.fromstring(contents, html_parser)
            html_articles = html.xpath('/html/body/table/tbody')
            if len(html_articles) < 1:
                html_articles = html.xpath('/html/body/table')
        except Exception as error:
            print(error)
            return []

        # Date parsing switches the process-wide locale; remember the current
        # setting so it can be put back once every article has been handled.
        saved_locale = locale.setlocale(locale.LC_ALL)

        metadata_list = []
        try:
            for html_article in html_articles:
                # Tables without child elements carry no article.
                if not len(html_article):
                    continue
                try:
                    metadata = self._parse_europress_article(html_article, encoding)
                except Exception as error:
                    # One malformed article must not discard the others.
                    print(error)
                    continue
                metadata_list.append(metadata)
        finally:
            try:
                locale.setlocale(locale.LC_ALL, saved_locale)
            except locale.Error as error:
                print(error)

        return metadata_list

    def _parse_europress_article(self, html_article, encoding):
        """Build the metadata dictionary for one article table.

        Args:
            html_article: lxml element holding the article rows.
            encoding: encoding reported by ``detect_encoding`` for the file.

        Returns:
            The metadata dictionary of the article.
        """
        metadata = {}

        # Journal name, optionally followed by ", <volume>".
        for name in html_article.xpath("./tr/td/span[@class = 'DocPublicationName']"):
            if name.text is None:
                continue
            journal = self._JOURNAL_RE.match(name.text)
            if journal is not None:
                metadata['source'] = journal.group(1)
                metadata['volume'] = journal.group(2)
            else:
                metadata['source'] = name.text

        # Header spans carry the date, section and page; later spans
        # overwrite values set by earlier ones.
        for header in html_article.xpath("./tr/td/span[@class = 'DocHeader']"):
            text = header.text
            if text is None:
                continue
            if isinstance(text, bytes):
                text = text.decode(encoding)
            self._apply_europress_header(text, encoding, metadata)

        metadata['title'] = str(
            html_article.xpath("string(./tr/td/span[@class = 'TitreArticleVisu'])")
        )
        metadata['text'] = html_article.xpath(
            "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
        )

        authors = self._find_europress_authors(html_article)
        if authors is not None:
            metadata['authors'] = authors

        # Articles without a parsable date are stamped with the current time.
        publication_date = metadata.get('publication_date')
        if publication_date is None:
            publication_date = timezone.now()
        metadata['publication_year'] = publication_date.strftime('%Y')
        metadata['publication_month'] = publication_date.strftime('%m')
        metadata['publication_day'] = publication_date.strftime('%d')
        metadata['publication_date'] = ""

        texts = metadata['text']
        if len(texts) > 0:
            # The ninth text node from the end is used as the DOI.
            # NOTE(review): presumably this relies on a fixed footer layout
            # of the export - confirm. Shorter articles get "not found"
            # instead of raising IndexError.
            metadata['doi'] = str(texts[-9]) if len(texts) >= 9 else "not found"
            texts.pop()
            joined = ' '.join(texts)
            metadata['text'] = re.sub('Tous droits réservés.*$', '', joined)
        else:
            metadata['doi'] = "not found"

        metadata['bdd'] = u'europresse'
        metadata['url'] = u''
        return metadata

    def _apply_europress_header(self, text, encoding, metadata):
        """Extract date, section and page from one header text into metadata.

        Args:
            text: text of a 'DocHeader' span (never ``None``).
            encoding: encoding reported by ``detect_encoding`` for the file.
            metadata: dictionary updated in place.
        """
        date_fr = self._DATE_FR_RE.match(text)
        date_en = self._DATE_EN_RE.match(text)
        section = self._SECTION_RE.match(text)
        # The pattern starts with ", " so it can only occur after other
        # header content: search the whole text instead of anchoring at 0.
        page = self._PAGE_RE.search(text)

        if date_fr is not None:
            if encoding != "utf-8":
                # Presumably repairs accented month names damaged by the
                # latin1 fallback decoding - confirm against real exports.
                text = text.replace('י', 'é')
                text = text.replace('ű', 'û')
                text = text.replace(' aot ', ' août ')
            parsed = self._strptime_with_locale(
                text, "fr_FR", ('%d %B %Y', '%B %Y'))
            if parsed is not None:
                metadata['publication_date'] = parsed

        if date_en is not None:
            parsed = self._strptime_with_locale(
                text, "en_GB.UTF-8", ('%B %d, %Y', '%B %Y'))
            if parsed is not None:
                metadata['publication_date'] = parsed

        if section is not None:
            metadata['section'] = section.group(1)

        if page is not None:
            metadata['page'] = page.group(1)

    @staticmethod
    def _strptime_with_locale(text, locale_name, formats):
        """Parse ``text`` as a date using month names of ``locale_name``.

        Args:
            text: the complete string to parse.
            locale_name: locale activated (``LC_ALL``) before parsing.
            formats: ``strptime`` formats tried in order.

        Returns:
            The first successfully parsed ``datetime``, or ``None`` when no
            format matches (the last error and the text are printed).
        """
        try:
            locale.setlocale(locale.LC_ALL, locale_name)
        except locale.Error as error:
            # Locale not available: report it and still try with the
            # currently active locale rather than aborting the article.
            print(error)

        last_error = None
        for date_format in formats:
            try:
                return datetime.strptime(text, date_format)
            except ValueError as error:
                last_error = error
        print(last_error)
        print(text)
        return None

    def _find_europress_authors(self, html_article):
        """Return the text following the second ``<br>`` after the title.

        The article is walked in document order; once a
        ``span.TitreArticleVisu`` has been seen, the second ``<br>`` that
        follows is serialised as text (its tail) and title-cased. When
        several titles occur, the value found last wins.

        Returns:
            The title-cased string, ``'not found'`` if serialisation fails,
            or ``None`` when no such ``<br>`` exists in the article.
        """
        authors = None
        after_title = False
        breaks_left = 0
        for node in html_article.iter():
            if node.tag == "span" and node.attrib.get('class') == 'TitreArticleVisu':
                after_title = True
                breaks_left = 2
            if after_title and node.tag == "br":
                breaks_left -= 1
            if after_title and breaks_left == 0:
                try:
                    # tostring() returns bytes for a named encoding; decode
                    # before title-casing.
                    raw = etree.tostring(node, method="text", encoding=self._CODIF)
                    authors = raw.decode(self._CODIF).title()
                except Exception:
                    authors = 'not found'
                after_title = False
        return authors