Commit 56f0df4a authored by PkSM3

[IDK] trying to push

parents e920cbb4 e592af7f
......@@ -3,6 +3,7 @@ import locale
from lxml import etree
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
from .FileParser import FileParser
from ..NgramsExtractors import *
......@@ -23,9 +24,15 @@ class EuropressFileParser(FileParser):
#print(len(contents))
#return []
encoding = self.detect_encoding(contents)
print(encoding)
#print(encoding)
if encoding != "utf-8":
try:
contents = contents.decode("latin1", errors='replace').encode(codif)
except :
try:
contents = contents.decode(encoding, errors='replace').encode(codif)
except Exception as error:
print(error)
try:
html_parser = etree.HTMLParser(encoding=codif)
......@@ -37,7 +44,7 @@ class EuropressFileParser(FileParser):
# initialize the list of metadata
metadata_list = []
# parse all the articles, one by one
try:
for html_article in html_articles:
metadata = {}
......@@ -54,7 +61,12 @@ class EuropressFileParser(FileParser):
metadata['source'] = name.text.encode(codif)
for header in html_article.xpath("./tr/td/span[@class = 'DocHeader']"):
try:
text = header.text
except Exception as error:
print(error)
if isinstance(text, bytes):
text = text.decode(encoding)
......@@ -78,12 +90,17 @@ class EuropressFileParser(FileParser):
text = text.replace('ű', 'û')
text = text.replace(' aot ', ' août ')
try :
metadata['publication_date'] = datetime.strptime(text, '%d %B %Y')
except :
try:
metadata['publication_date'] = datetime.strptime(text, '%B %Y')
except :
try:
metadata['publication_date'] = dateutil.parser.parse(text)
except Exception as error:
print(error)
print(text)
pass
......@@ -167,6 +184,10 @@ class EuropressFileParser(FileParser):
metadata_list.append(metadata)
count += 1
except Exception as error:
print(error)
pass
# from pprint import pprint
# pprint(metadata_list)
# return []
......
......@@ -36,6 +36,10 @@ class RisFileParser(FileParser):
#print(metadata)
try:
#print("append")
if 'language_fullname' not in metadata.keys():
if 'language_iso3' not in metadata.keys():
if 'language_iso2' not in metadata.keys():
metadata['language_iso2'] = 'en'
metadata_list.append(metadata)
metadata = {}
#print("append succeeded")
......
......@@ -23,12 +23,12 @@
<div class="navbar-collapse collapse">
<ul class="nav navbar-nav">
<!-- <li><a href="/admin/">Admin/</a></li> --!>
<li><a href="/projects/">Projects/</a></li>
<li><a href="/projects/">Projects</a></li>
{% if project %}
<li><a href="/project/{{project.id}}">{{project.name}}/</a></li>
<li><a href="/project/{{project.id}}">{{project.name}}</a></li>
{% endif %}
{% if corpus %}
<li><a href="/project/{{project.id}}/corpus/{{corpus.id}}">{{corpus.name}}/</a></li>
<li><a href="/project/{{project.id}}/corpus/{{corpus.id}}">{{corpus.name}}</a></li>
{% endif %}
......@@ -61,7 +61,7 @@
<hr>
<footer>
<p>Gargantext v.1.0 (Copyrights {{ date.year }})</p>
<p>Gargantext v.1.0 (Copyrights CNRS {{ date.year }})</p>
</footer>
......
......@@ -56,6 +56,7 @@
{{ formResource.as_p}}
<input type="submit" class="btn" value="Add this corpus" />
</form>
</center>
</p>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment