[BUGFIX] europress fr parser lala

f3e15144 · PkSM3 · 1fcead98 · f3e15144
Commit f3e15144 authored Oct 08, 2015 by PkSM3
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 43 deletions

EuropressFileParser.py parsing/FileParsers/EuropressFileParser.py +9 -43

No files found.
--- a/parsing/FileParsers/EuropressFileParser.py
+++ b/parsing/FileParsers/EuropressFileParser.py
 import re
 import locale
 from lxml import etree
-from lxml.html import html5parser
 from datetime import datetime, date
 from django.utils import timezone
 import dateutil.parser
@@ -52,16 +51,6 @@ class EuropressFileParser(FileParser):
                    if len(html_articles) < 1:
                        format_europresse = 1
                        html_articles = html.xpath('//div[@id="docContain"]')
-
-                        if len(html_articles) < 1 :
-                            format_europresse = 50.2
-                            html_parser = html5parser.etree.HTMLParser(encoding=codif)
-                            html = html5parser.etree.fromstring(contents, html_parser)
-                            html_articles = html.xpath('//article')
-
-                            if len(html_articles) < 1:
-                                print("no article found")
-
            except :
                PrintException()

@@ -88,11 +77,6 @@ class EuropressFileParser(FileParser):
                        or self::td[@class='txtCertificat'] \
                        )]/text()"
                doi_xpath  = "//span[@id='ucPubliC_lblNodoc']/text()"
-            elif format_europresse == 50.2 :
-                name_xpath      = "./header/div/span[@class = 'DocPublicationName']"
-                header_xpath    = "./header/div/span[@class = 'DocHeader']"
-                title_xpath     = "string(./header/div/span[@class = 'TitreArticleVisu'])"
-                text_xpath      = "./header/div/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"


        except Exception as error :
@@ -106,9 +90,7 @@ class EuropressFileParser(FileParser):

                if len(html_article):
                    for name in html_article.xpath(name_xpath):
-                        #print("test name.text")
                        if name.text is not None:
-                            #print(name.text)
                            format_journal = re.compile('(.*), (.*)', re.UNICODE)
                            test_journal = format_journal.match(name.text)
                            if test_journal is not None:
@@ -122,6 +104,7 @@ class EuropressFileParser(FileParser):
                    for header in html_article.xpath(header_xpath):
 #                        print(count)
 #                        countbis += 1
+
 #                        try:
 #                            print('109', hyperdata['publication_date'])
 #                        except:
@@ -130,26 +113,16 @@ class EuropressFileParser(FileParser):

                        try:
                            text = header.text
-                            print("header", text)
+                            #print("header", text)
                        except Exception as error:
                            print(error)


                        if isinstance(text, bytes):
                            text = text.decode(encoding)
-
-                        if format_europresse == 50.2:
-                            # TODO here check the split if needed: 'Brest Ville, mercredi 26 novembre 2014'
-                            text = text.split(', ')[1]
-
                        format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
-                        format_date_fr_v2 = re.compile('\s*\w+\s+\d+\s+\w+\s+\d{4}', re.UNICODE)
                        if text is not None:
                            test_date_fr = format_date_fr.match(text)
-
-                            #TODO check the v2 format here
-                            test_date_fr_v2 = format_date_fr_v2.match(text)
-
                            format_date_en = re.compile('\w+\s+\d+,\s+\d{4}', re.UNICODE)
                            test_date_en = format_date_en.match(text)
                            format_sect = re.compile('(\D+),', re.UNICODE)
@@ -158,13 +131,13 @@ class EuropressFileParser(FileParser):
                            test_page = format_page.match(text)
                        else:
                            test_date_fr = None
-                            test_date_fr_v2 = None
                            test_date_en = None
                            test_sect = None
                            test_page = None


-                        if test_date_fr is not None or test_date_fr_v2 is not None:
+
+                        if test_date_fr is not None:
                            self.localeEncoding = "fr_FR"
                            locale.setlocale(locale.LC_ALL, localeEncoding)
                            if encoding != "utf-8":
@@ -182,13 +155,9 @@ class EuropressFileParser(FileParser):
                                        locale.setlocale(locale.LC_ALL, "fr_FR")
                                        hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
                                        # hyperdata['publication_date'] = dateutil.parser.parse(text)
-                                    except :
-                                        # TODO format to parse: ' mercredi 26 novembre 2014'
-                                        try :
-                                            hyperdata['publication_date'] = datetime.strptime(text, ' %A %d %B %Y')
-                                        except Exception as error:
-                                            print(error, text)
-                                            pass
+                                    except Exception as error:
+                                        print(error, text)
+                                        pass


                        if test_date_en is not None:
@@ -258,6 +227,7 @@ class EuropressFileParser(FileParser):
                    #elif lang == 'en':
                    #    hyperdata['language_iso2'] = 'en'

+
                    hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day']  = hyperdata['publication_date'].strftime('%d')
@@ -278,11 +248,7 @@ class EuropressFileParser(FileParser):
                    else:
                        hyperdata['doi'] = "not found"

-                    try:
-                        hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
-                    except:
-                        PrintException()
-
+                    hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
                    hyperdata['length_letters'] = len(hyperdata['abstract'])

                    hyperdata['bdd']  = u'europresse'