Commit b9062d6c authored by sim

[FIX] Bug #103: use timezone aware datetimes while parsing docs

parent 663a31db
......@@ -36,7 +36,7 @@ import os
import re
import importlib
from gargantext.util.lists import *
from gargantext.util.tools import datetime, convert_to_date
from gargantext.util import datetime, convert_to_datetime
from .settings import BASE_DIR
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
......@@ -108,9 +108,9 @@ INDEXED_HYPERDATA = {
'publication_date':
{ 'id' : 2
, 'type' : datetime.datetime
, 'convert_to_db' : convert_to_date
, 'convert_from_db': datetime.datetime.fromtimestamp
, 'type' : datetime
, 'convert_to_db' : convert_to_datetime
, 'convert_from_db': convert_to_datetime
},
'title':
......
from .dates import datetime, convert_to_datetime, MINYEAR
import os
from gargantext.settings import MEDIA_ROOT
import datetime
import dateutil
from datetime import MINYEAR
from django.utils.dateparse import parse_datetime
from django.utils.timezone import datetime as _datetime, utc as UTC, now as utcnow
__all__ = ['convert_to_datetime', 'datetime', 'MINYEAR']
class datetime(_datetime):
@staticmethod
def now():
return utcnow()
@staticmethod
def utcfromtimestamp(ts):
return _datetime.utcfromtimestamp(ts).replace(tzinfo=UTC)
@staticmethod
def parse(s):
dt = parse_datetime(s)
return dt.astimezone(UTC) if dt.tzinfo else dt.replace(tzinfo=UTC)
def convert_to_datetime(dt):
if isinstance(dt, (int, float)):
return datetime.utcfromtimestamp(dt)
elif isinstance(dt, str):
return datetime.parse(dt)
elif isinstance(dt, _datetime):
args = (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
return datetime(*args, tzinfo=dt.tzinfo or UTC).astimezone(UTC)
def convert_to_date(date):
if isinstance(date, (int, float)):
return datetime.datetime.timestamp(date)
else:
return dateutil.parser.parse(date)
raise ValueError("Can't convert to datetime: %r" % dt)
......@@ -73,9 +73,6 @@ class MultivacParser(Parser):
date = datetime.now()
hyperdata["publication_date"] = date
hyperdata["publication_year"] = str(date.year)
hyperdata["publication_month"] = str(date.month)
hyperdata["publication_day"] = str(date.day)
hyperdata_list.append(hyperdata)
......
import datetime
import dateutil.parser
import zipfile
import re
import dateparser as date_parser
from gargantext.util.languages import languages
from gargantext.util import datetime, convert_to_datetime, MINYEAR
# Fallback datetime handed to dateutil's ``default=`` argument below, so that
# fields missing from a parsed string are taken from it (year MINYEAR,
# January 1st).
# NOTE(review): two assignments appear here because this text carries both
# sides of the diff; the second one looks like the post-commit form -- confirm.
DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
DEFAULT_DATE = datetime(MINYEAR, 1, 1)
class Parser:
......@@ -34,29 +34,29 @@ class Parser:
def format_hyperdata_dates(self, hyperdata):
    """Normalise the date fields of *hyperdata* in place and return it.

    The publication date is resolved in this order:

    1. ``publication_date_to_parse`` is parsed with dateutil, fields missing
       from the string being taken from ``DEFAULT_DATE``; if parsing fails
       the current time is used.
    2. Otherwise, when ``publication_year`` is set, every ``<prefix>_year``
       key is joined with the consecutive ``_month``, ``_day``, ``_hour``,
       ``_minute`` and ``_second`` keys that are present, and the result is
       parsed (dateutil first, then dateparser on a shortened string).
    3. Otherwise ``publication_date`` is set to the current time.

    Finally every ``<prefix>_date`` value is passed through
    ``convert_to_datetime``, stored back as its ``str()`` form, and split
    into ``<prefix>_year`` ... ``<prefix>_second`` integer fields.

    Examples:
        {"publication_date_to_parse": "2014-10-23 09:57:42"}
        -> {"publication_date": "2014-10-23 09:57:42+00:00", "publication_year": 2014, ...}
        {"publication_year": "2014"}
        -> {"publication_date": "2014-01-01 00:00:00+00:00", "publication_year": 2014, ...}

    NOTE(review): a pre-existing ``publication_date`` with neither
    ``publication_date_to_parse`` nor ``publication_year`` falls into case 3
    and is overwritten with the current time -- confirm this is intended.
    """
    # First, check the split dates...
    raw_date = hyperdata.get('publication_date_to_parse')
    if raw_date is not None:
        # This part mainly deals with Zotero data but can be useful for
        # other parsers too.
        raw_date = re.sub(r'\/\/+(\w*|\d*)', '', raw_date)
        try:
            hyperdata['publication_date'] = dateutil.parser.parse(
                raw_date,
                default=DEFAULT_DATE,
            )
        except Exception as error:
            print(error, 'Date not parsed for:', raw_date)
            hyperdata['publication_date'] = datetime.now()

    elif hyperdata.get('publication_year') is not None:
        # e.g. prefixes: ['publication']
        prefixes = [key[:-5] for key in hyperdata if key.endswith("_year")]
        for prefix in prefixes:
            date_string = str(hyperdata[prefix + "_year"])
            for part in ('month', 'day', 'hour', 'minute', 'second'):
                key = prefix + '_' + part
                if key not in hyperdata:
                    # Stop at the first gap: later parts are meaningless
                    # without the ones before them.
                    break
                # "YYYY MM DD HH:MM:SS" -- the test must look at the part
                # name, not at the prefixed key.
                sep = ":" if part in ('minute', 'second') else " "
                date_string += sep + str(hyperdata[key])

            parsed = None
            try:
                # Without ``default`` dateutil fills missing fields from
                # today's date instead of January 1st.
                parsed = dateutil.parser.parse(date_string, default=DEFAULT_DATE)
            except Exception as error:
                print("_Parser: error in full date parse", error, date_string)

            if parsed is None:
                # Fall back to dateparser on a shortened string: 8 characters
                # cover formats such as "1994 NOV-DEC"; 4 characters keep the
                # year only (e.g. "1994 SPR").
                fallbacks = (
                    (8, "_Parser: error in short date parse"),
                    (4, "_Parser:"),
                )
                for length, label in fallbacks:
                    try:
                        parsed = date_parser.parse(date_string[:length])
                    except Exception as error:
                        print(label, error)
                    # dateparser reports failure by returning None, so only
                    # a real result ends the fallback chain.
                    if parsed is not None:
                        break

            if parsed is None:
                print("_Parser: no date could be parsed from", date_string)
            else:
                hyperdata[prefix + "_date"] = parsed

    else:
        print("WARNING: Date unknown at _Parser level, using now()")
        hyperdata['publication_date'] = datetime.now()

    # ...then split every "date" field into its separate elements
    date_prefixes = [key[:-5] for key in hyperdata if key.endswith("_date")]
    for prefix in date_prefixes:
        name = prefix + "_date"
        # Convert first, then read the parts from the converted value: the
        # stored value may be a string, or a datetime in another timezone.
        date = convert_to_datetime(hyperdata[name])
        hyperdata[name] = str(date)
        for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
            hyperdata[prefix + '_' + part] = getattr(date, part)

    # finally, return the transformed result!
    return hyperdata
......
......@@ -43,8 +43,7 @@ def _nodes_hyperdata_generator(corpus):
key['id'],
None,
None,
value.strftime("%Y-%m-%d %H:%M:%S"),
# FIXME check timestamp +%Z
str(value),
None,
None,
)
......
......@@ -9,7 +9,6 @@ from gargantext.util.db import get_engine
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD, NODETYPES
from gargantext.constants import INDEXED_HYPERDATA
from gargantext.util.tools import datetime, convert_to_date
def compute_coocs( corpus,
overwrite_id = None,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment