Commit 2a7f88bd authored by delanoe

[CLEAN] Old europress script for archives.

parent 842efbe4
import sys
sys.path.append('/srv/gargantext')
from admin.env import *
import re import re
import locale import locale
from lxml import etree from lxml import etree
...@@ -7,27 +11,23 @@ from django.utils import timezone ...@@ -7,27 +11,23 @@ from django.utils import timezone
import dateutil.parser import dateutil.parser
import dateparser import dateparser
from .FileParser import FileParser from parsing.FileParsers.FileParser import FileParser
from ..NgramsExtractors import * from parsing.NgramsExtractors import *
from admin.utils import PrintException from admin.utils import PrintException
class EuropressFileParser(FileParser): class EuropressFileParser(FileParser):
def _parse(self, file): def _parse(self, file):
localeEncoding = "fr_FR" localeEncoding = "fr_FR"
codif = "UTF-8" codif = "UTF-8"
count = 0 count = 0
if isinstance(file, str): if isinstance(file, str):
file = open(file, 'rb') file = open(file, 'rb')
# print(file) print(file)
contents = file.read() contents = file.read()
#print(len(contents))
#return []
encoding = self.detect_encoding(contents) encoding = self.detect_encoding(contents)
#print(encoding)
if encoding != "utf-8": if encoding != "utf-8":
try: try:
contents = contents.decode("latin1", errors='replace').encode(codif) contents = contents.decode("latin1", errors='replace').encode(codif)
...@@ -92,7 +92,7 @@ class EuropressFileParser(FileParser): ...@@ -92,7 +92,7 @@ class EuropressFileParser(FileParser):
elif format_europresse == 50.2 : elif format_europresse == 50.2 :
name_xpath = "./header/div/span[@class = 'DocPublicationName']" name_xpath = "./header/div/span[@class = 'DocPublicationName']"
header_xpath = "./header/div/span[@class = 'DocHeader']" header_xpath = "./header/div/span[@class = 'DocHeader']"
title_xpath = "string(./header/div/span[@class = 'TitreArticleVisu'])" title_xpath = "./header/div/span[@class = 'TitreArticleVisu'])"
text_xpath = "./header/div/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()" text_xpath = "./header/div/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
...@@ -104,7 +104,6 @@ class EuropressFileParser(FileParser): ...@@ -104,7 +104,6 @@ class EuropressFileParser(FileParser):
for html_article in html_articles: for html_article in html_articles:
hyperdata = {} hyperdata = {}
if len(html_article): if len(html_article):
for name in html_article.xpath(name_xpath): for name in html_article.xpath(name_xpath):
#print("test name.text") #print("test name.text")
...@@ -131,7 +130,7 @@ class EuropressFileParser(FileParser): ...@@ -131,7 +130,7 @@ class EuropressFileParser(FileParser):
try: try:
text = header.text text = header.text
print("header", text) #print("header", text)
except Exception as error: except Exception as error:
print(error) print(error)
...@@ -305,3 +304,9 @@ class EuropressFileParser(FileParser): ...@@ -305,3 +304,9 @@ class EuropressFileParser(FileParser):
except Exception as error: except Exception as error:
print(error) print(error)
pass pass
if __name__ == "__main__":
e = EuropressFileParser()
e.parse(str(sys.argv[1]))
...@@ -125,8 +125,8 @@ class FileParser: ...@@ -125,8 +125,8 @@ class FileParser:
""" """
# initialize the list of hyperdata # initialize the list of hyperdata
hyperdata_list = [] hyperdata_list = []
# is the file is a ZIP archive, recurse on each of its files...
if zipfile.is_zipfile(file): if zipfile.is_zipfile(file):
print(file, "# is the file is a ZIP archive, recurse on each of its files...")
zipArchive = zipfile.ZipFile(file) zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist(): for filename in zipArchive.namelist():
try: try:
...@@ -137,6 +137,7 @@ class FileParser: ...@@ -137,6 +137,7 @@ class FileParser:
print(error) print(error)
# ...otherwise, let's parse it directly! # ...otherwise, let's parse it directly!
else: else:
print(file, "it is not a zip file")
try: try:
for hyperdata in self._parse(file): for hyperdata in self._parse(file):
hyperdata_list.append(self.format_hyperdata(hyperdata)) hyperdata_list.append(self.format_hyperdata(hyperdata))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment