# EuropressFileParser_old.py (13.9 KB) -- legacy Europresse HTML importer
import sys
sys.path.append('/srv/gargantext')

from admin.env import *
import re
import locale
from lxml import etree
from lxml.html import html5parser
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser

from parsing.FileParsers.FileParser import FileParser
from parsing.NgramsExtractors import *

from admin.utils import PrintException

class EuropressFileParser(FileParser):
    """Parse Europresse HTML export files and yield one hyperdata dict per article.

    Three export layouts are handled, detected by probing the DOM:
      * 50   -- legacy exports: one article per ``/html/body/table[/tbody]``
      * 1    -- single-article pages rooted at ``div#docContain``
      * 50.2 -- HTML5 exports using ``<article>`` elements

    ``_parse`` is a generator yielding dicts with keys such as ``journal``,
    ``title``, ``abstract``, ``publication_date`` (plus derived
    year/month/day strings), ``section``, ``page``, ``authors``, ``doi``,
    ``bdd`` and ``url``.
    """

    # Header patterns, compiled once (they were rebuilt on every loop pass).
    _RE_JOURNAL    = re.compile(r'(.*), (.*)', re.UNICODE)
    _RE_DATE_FR    = re.compile(r'\d*\s*\w+\s+\d{4}', re.UNICODE)
    _RE_DATE_FR_V2 = re.compile(r'\s*\w+\s+\d+\s+\w+\s+\d{4}', re.UNICODE)
    _RE_DATE_EN    = re.compile(r'\w+\s+\d+,\s+\d{4}', re.UNICODE)
    _RE_SECTION    = re.compile(r'(\D+),', re.UNICODE)
    _RE_PAGE       = re.compile(r', p. (\w+)', re.UNICODE)

    def _parse_date_fr(self, text):
        """Return a datetime parsed from a French header string, or None.

        NOTE: ``dateparser.parse`` returns None instead of raising on
        failure, so in the original code every strptime fallback below was
        dead code; here they are actually reachable.
        """
        try:
            parsed = dateparser.parse(text, languages=['fr'])
        except Exception:
            parsed = None
        if parsed is not None:
            return parsed
        # Same fallback order as the original nested try/except chain.
        for fmt in ('%d %B %Y', '%B %Y'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
        try:
            # Retry with a differently-spelled locale name, as the
            # original did; may raise locale.Error if unavailable.
            locale.setlocale(locale.LC_ALL, "fr_FR")
            return datetime.strptime(text, '%d %B %Y')
        except Exception:
            try:
                # Format with a leading weekday, e.g. ' mercredi 26 novembre 2014'
                return datetime.strptime(text, ' %A %d %B %Y')
            except Exception as error:
                print(error, text)
                return None

    def _parse_date_en(self, text):
        """Return a datetime parsed from an English header string, or None."""
        for fmt in ('%B %d, %Y', '%B %Y'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
        return None

    def _parse(self, file):
        """Generator: yield one hyperdata dict per article found in *file*.

        ``file`` is either a path (opened here in binary mode) or an
        already-open binary file object; it is closed when the generator
        finishes (including on error -- the original leaked it on error).
        """
        codif = "UTF-8"

        if isinstance(file, str):
            file = open(file, 'rb')
        print(file)
        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            # Old exports are frequently latin1; best-effort re-encode to
            # UTF-8 so a single parser configuration can be used below.
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except Exception:
                PrintException()

        format_europresse = 50
        html_articles = []
        doi_xpath = None
        try:
            html_parser = etree.HTMLParser(encoding=codif)
            html = etree.fromstring(contents, html_parser)

            try:
                html_articles = html.xpath('/html/body/table/tbody')
                if len(html_articles) < 1:
                    html_articles = html.xpath('/html/body/table')
                    if len(html_articles) < 1:
                        format_europresse = 1
                        html_articles = html.xpath('//div[@id="docContain"]')
                        if len(html_articles) < 1:
                            format_europresse = 50.2
                            # HTML5 exports need the html5 parser.
                            html_parser = html5parser.etree.HTMLParser(encoding=codif)
                            html = html5parser.etree.fromstring(contents, html_parser)
                            html_articles = html.xpath('//article')
                            if len(html_articles) < 1:
                                print("no article found")
            except Exception:
                PrintException()

            if format_europresse == 50:
                name_xpath      = "./tr/td/span[@class = 'DocPublicationName']"
                header_xpath    = "./tr/td/span[@class = 'DocHeader']"
                title_xpath     = "string(./tr/td/span[@class = 'TitreArticleVisu'])"
                text_xpath      = "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
            elif format_europresse == 1:
                name_xpath      = "//span[@class = 'DocPublicationName']"
                header_xpath    = "//span[@class = 'DocHeader']"
                title_xpath     = "string(//div[@class = 'titreArticleVisu'])"
                text_xpath      = "./descendant::*[\
                        not(\
                           self::div[@class='Doc-SourceText'] \
                        or self::span[@class='DocHeader'] \
                        or self::span[@class='DocPublicationName'] \
                        or self::span[@id='docNameVisu'] \
                        or self::span[@class='DocHeader'] \
                        or self::div[@class='titreArticleVisu'] \
                        or self::span[@id='docNameContType'] \
                        or descendant-or-self::span[@id='ucPubliC_lblCertificatIssuedTo'] \
                        or descendant-or-self::span[@id='ucPubliC_lblEndDate'] \
                        or self::td[@class='txtCertificat'] \
                        )]/text()"
                doi_xpath  = "//span[@id='ucPubliC_lblNodoc']/text()"
            elif format_europresse == 50.2:
                name_xpath      = "./header/div/span[@class = 'DocPublicationName']"
                header_xpath    = "./header/div/span[@class = 'DocHeader']"
                # Fixed: the original expression had a stray ')' and no
                # 'string(' wrapper, making it an invalid XPath that raised
                # on every format-50.2 article.
                title_xpath     = "string(./header/div/span[@class = 'TitreArticleVisu'])"
                text_xpath      = "./header/div/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
        except Exception:
            PrintException()

        # Parse the articles one by one.
        try:
            for html_article in html_articles:
                hyperdata = {}
                if not len(html_article):
                    continue

                # --- journal / volume ---------------------------------
                for name in html_article.xpath(name_xpath):
                    if name.text is None:
                        continue
                    test_journal = self._RE_JOURNAL.match(name.text)
                    if test_journal is not None:
                        hyperdata['journal'] = test_journal.group(1)
                        hyperdata['volume'] = test_journal.group(2)
                    else:
                        hyperdata['journal'] = name.text.encode(codif)

                # --- header: publication date, section, page ----------
                for header in html_article.xpath(header_xpath):
                    text = header.text
                    if isinstance(text, bytes):
                        text = text.decode(encoding)

                    if format_europresse == 50.2 and text is not None:
                        # Header looks like 'Brest Ville, mercredi 26 novembre 2014':
                        # keep the part after the first comma when present.
                        try:
                            text = text.split(', ')[1]
                        except IndexError:
                            pass

                    if text is None:
                        continue

                    test_date_fr    = self._RE_DATE_FR.match(text)
                    test_date_fr_v2 = self._RE_DATE_FR_V2.match(text)
                    test_date_en    = self._RE_DATE_EN.match(text)
                    test_sect       = self._RE_SECTION.match(text)
                    test_page       = self._RE_PAGE.match(text)

                    if test_date_fr is not None or test_date_fr_v2 is not None:
                        self.localeEncoding = "fr_FR"
                        locale.setlocale(locale.LC_ALL, "fr_FR.utf-8")
                        if encoding != "utf-8":
                            # Undo common mojibake seen in mis-encoded exports.
                            text = text.replace('י', 'é')
                            text = text.replace('ű', 'û')
                            text = text.replace(' aot ', ' août ')
                        date = self._parse_date_fr(text)
                        if date is not None:
                            # Only set on success so the 'back' fallback
                            # below can kick in (the original stored None
                            # here, which poisoned later strftime calls).
                            hyperdata['publication_date'] = date

                    if test_date_en is not None:
                        locale.setlocale(locale.LC_ALL, "en_GB.UTF-8")
                        date = self._parse_date_en(text)
                        if date is not None:
                            hyperdata['publication_date'] = date

                    if test_sect is not None:
                        hyperdata['section'] = test_sect.group(1).encode(codif)
                    if test_page is not None:
                        hyperdata['page'] = test_page.group(1).encode(codif)

                hyperdata['title']    = html_article.xpath(title_xpath).encode(codif)
                hyperdata['abstract'] = html_article.xpath(text_xpath)

                # --- authors: the text between the title span and the
                # second following <br> (format 50 layout) --------------
                line = 0
                br_tag = 10
                # Fixed: iterate the current article instead of
                # html_articles[count]; 'count' drifted whenever an empty
                # article was skipped.
                for node in html_article.iter():
                    if node.tag == "span" and node.attrib.get('class') == 'TitreArticleVisu':
                        line = 1
                        br_tag = 2
                    if line == 1 and node.tag == "br":
                        br_tag -= 1
                    if line == 1 and br_tag == 0:
                        try:
                            # tostring(..., encoding="UTF-8") returns bytes;
                            # decode before .title() (the original called
                            # str.title on bytes, which always raised, so
                            # authors were always 'not found').
                            raw = etree.tostring(node, method="text", encoding=codif)
                            hyperdata['authors'] = raw.decode(codif).title().encode(codif)
                        except Exception:
                            hyperdata['authors'] = 'not found'
                        line = 0
                        br_tag = 10

                # --- carry the last good date forward ------------------
                # Fixed: the original tested `is not None or != ''`, which
                # is always true when the key exists, so 'back' could be
                # clobbered with None and the fallback branch was dead.
                pub_date = hyperdata.get('publication_date')
                if pub_date is not None and pub_date != '':
                    back = pub_date
                else:
                    try:
                        hyperdata['publication_date'] = back
                    except NameError:
                        # First article and no parsable date anywhere.
                        hyperdata['publication_date'] = timezone.now()

                hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                hyperdata['publication_day']   = hyperdata['publication_date'].strftime('%d')

                if len(hyperdata['abstract']) > 0 and format_europresse == 50:
                    # NOTE(review): the document id is assumed to sit 9 text
                    # nodes from the end of the abstract -- confirm against
                    # real exports.
                    hyperdata['doi'] = str(hyperdata['abstract'][-9])
                    hyperdata['abstract'].pop()
                    # TODO: add a real separator between paragraphs.
                    hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))
                    hyperdata['abstract'] = str(re.sub('Tous droits réservés.*$', '', hyperdata['abstract']))
                elif format_europresse == 1:
                    hyperdata['doi'] = ' '.join(html_article.xpath(doi_xpath))
                    hyperdata['abstract'] = hyperdata['abstract'][:-9]
                    # TODO: add a real separator between paragraphs.
                    hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))
                else:
                    hyperdata['doi'] = "not found"

                # NOTE(review): for format 50.2 the abstract is still a list
                # of text nodes at this point, so this counts nodes rather
                # than letters.
                hyperdata['length_letters'] = len(hyperdata['abstract'])

                hyperdata['bdd'] = 'europresse'
                hyperdata['url'] = ''

                # Normalise every bytes value back to str before yielding.
                for key, value in hyperdata.items():
                    hyperdata[key] = value.decode() if isinstance(value, bytes) else value
                yield hyperdata
        except Exception as error:
            print(error)
        finally:
            # Always release the handle (the original leaked it on error).
            file.close()

if __name__ == "__main__":
    # CLI entry point: parse the Europresse export given as first argument.
    # Guard against a missing argument (the original crashed with
    # IndexError); sys.argv items are already str, so no str() needed.
    if len(sys.argv) < 2:
        sys.exit("usage: EuropressFileParser_old.py <export.html>")
    e = EuropressFileParser()
    e.parse(sys.argv[1])