[FIX] dates for Europresse parser: ok.

a83006e4 · delanoe · 2fac6579 · a83006e4 · a83006e4
Commit a83006e4 authored Nov 10, 2015 by delanoe
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 15 deletions

EuropressFileParser.py parsing/FileParsers/EuropressFileParser.py +18 -13

FileParser.py parsing/FileParsers/FileParser.py +2 -2

No files found.
--- a/parsing/FileParsers/EuropressFileParser.py
+++ b/parsing/FileParsers/EuropressFileParser.py
@@ -29,9 +29,9 @@ class EuropressFileParser(FileParser):
        format_date = re.compile('.*\d{4}.*', re.UNICODE)
        if isinstance(file, str):
-            file_open = open(file, 'rb')
+            file = open(file, 'rb')
-        contents = file_open.read()
+        contents = file.read()
        encoding = self.detect_encoding(contents)
        if encoding != "utf-8":
@@ -91,19 +91,27 @@ class EuropressFileParser(FileParser):
                    header = header.split(', ')
                    if format_date.match(header[0]):
                        date       = header[0]
-                    else:
+                    elif format_date.match(header[1]):
                        hyperdata['rubrique']   = header[0]
                        date       = header[1]
+                        try:
-                    try:
+                            hyperdata['page']       = header[2].split(' ')[1]
-                        hyperdata['page']       = header[2].split(' ')[1]
+                        except:
-                    except:
+                            pass
-                        pass
+                    else:
+                        date       = header[2]
                try:
-                    hyperdata['publication_date'] = dateparser.parse(date, languages=['fr', 'en'])
+                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
                except:
-                    hyperdata['publication_date'] = timezone.now()
+                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
+                hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
+                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
+                hyperdata['publication_day']  = hyperdata['publication_date'].strftime('%d')
+                #print(hyperdata['publication_date'])
                try:
                    title   = paragraph_list(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
@@ -118,8 +126,6 @@ class EuropressFileParser(FileParser):
                yield hyperdata
-            file_open.close()
        except :
            PrintException()
            pass
@@ -133,4 +139,3 @@ if __name__ == "__main__":
        except:
            pass
--- a/parsing/FileParsers/FileParser.py
+++ b/parsing/FileParsers/FileParser.py
@@ -126,7 +126,7 @@ class FileParser:
        # initialize the list of hyperdata
        hyperdata_list = []
        if zipfile.is_zipfile(file):
-            print(file, "# is the file is a ZIP archive, recurse on each of its files...")
+            #print(file, "# is the file is a ZIP archive, recurse on each of its files...")
            zipArchive = zipfile.ZipFile(file)
            for filename in zipArchive.namelist():
                try:
@@ -137,7 +137,7 @@ class FileParser:
                    print(error)
        # ...otherwise, let's parse it directly!
        else:
-            print(file, "it is not a zip file")
+            #print(file, "it is not a zip file")
            try:
                for hyperdata in self._parse(file):
                    hyperdata_list.append(self.format_hyperdata(hyperdata))