Commit c7e81064 authored by Alexandre Delanoë

Merge branch 'testing-merge' into stable-imt-merge

parents 5a9f3b3a aa325e73
@@ -14,7 +14,7 @@ TELL ALEMBIC TO NOT START FROM SCRATCH
 # "upgrade head" command. If you don't want to do this, you can of course
 # drop your database and really start from scratch.
-alembic stamp 601e9d9baa4c
+alembic stamp bedce47c9e34
 UPGRADE TO LATEST DATABASE VERSION
...
@@ -7,7 +7,7 @@ Create Date: 2017-07-06 10:52:16.161118
 """
 from alembic import op
 import sqlalchemy as sa
-from gargantext.tools.alembic import ReplaceableObject
+from gargantext.util.alembic import ReplaceableObject
 # revision identifiers, used by Alembic.
...
"""Fix issue with Node.hyperdata index
Revision ID: bedce47c9e34
Revises: 08230100f262
Create Date: 2017-07-10 11:30:59.168190
"""
from alembic import op
import sqlalchemy as sa
import gargantext
# revision identifiers, used by Alembic.
revision = 'bedce47c9e34'
down_revision = '08230100f262'
branch_labels = None
depends_on = None
def upgrade():
op.drop_index('nodes_hyperdata_idx', table_name='nodes')
op.create_index('nodes_hyperdata_idx', 'nodes', ['hyperdata'], unique=False, postgresql_using="gin")
def downgrade():
# We won't unfix the bug when downgrading...
pass
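This migration swaps the default btree index on the JSONB `hyperdata` column for a GIN index: btree can only compare whole JSONB values, while GIN lets Postgres serve JSONB operators from the index. A minimal sketch of the kind of query that benefits (the DSN and filter value are assumptions, not taken from this commit):

    from sqlalchemy import create_engine, text

    engine = create_engine('postgresql:///gargandb')  # assumed database name

    with engine.connect() as conn:
        # JSONB containment (@>) can use nodes_hyperdata_idx once it is GIN;
        # the previous btree index was useless for this predicate
        rows = conn.execute(text(
            '''SELECT id FROM nodes WHERE hyperdata @> '{"publication_year": 2014}' '''
        )).fetchall()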
@@ -36,7 +36,7 @@ import os
 import re
 import importlib
 from gargantext.util.lists import *
-from gargantext.util.tools import datetime, convert_to_date
+from gargantext.util import datetime, convert_to_datetime
 from .settings import BASE_DIR
 # types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
@@ -108,9 +108,9 @@ INDEXED_HYPERDATA = {
     'publication_date':
       { 'id' : 2
-      , 'type' : datetime.datetime
-      , 'convert_to_db' : convert_to_date
-      , 'convert_from_db': datetime.datetime.fromtimestamp
+      , 'type' : datetime
+      , 'convert_to_db' : convert_to_datetime
+      , 'convert_from_db': convert_to_datetime
       },
     'title':
...
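Both conversion directions for `publication_date` now route through `convert_to_datetime`: the value is no longer stored as an epoch float, so `datetime.datetime.fromtimestamp` would be wrong on the way out. A sketch of the round-trip this entry implies (the caller shape is hypothetical, not from this commit):

    field = INDEXED_HYPERDATA['publication_date']

    db_value = field['convert_to_db']('2014-10-23 09:57:42')  # -> UTC-aware datetime
    py_value = field['convert_from_db'](str(db_value))        # same helper both ways
    assert isinstance(py_value, field['type'])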
 from django.core.management.base import BaseCommand, CommandError
-from gargantext.tools.show_nodes import tree_show, nodes
+from gargantext.util.show_nodes import tree_show, nodes
 import colorama
...
+from django.core.management.base import BaseCommand, CommandError
+from gargantext.models import Node
+
+
+class Command(BaseCommand):
+    help = 'Something'
+
+    def handle(self, *args, **options):
+        self.stdout.write(self.style.SUCCESS('Oh yeah!'))
@@ -58,7 +58,7 @@ class Node(Base):
     __tablename__ = 'nodes'
     __table_args__ = (
         Index('nodes_user_id_typename_parent_id_idx', 'user_id', 'typename', 'parent_id'),
-        Index('nodes_hyperdata_idx', 'hyperdata'))
+        Index('nodes_hyperdata_idx', 'hyperdata', postgresql_using='gin'))
     # TODO
     # create INDEX full_text_idx on nodes using gin(to_tsvector('english', hyperdata ->> 'abstract' || 'title'));
...
+from .dates import datetime, convert_to_datetime, MINYEAR
 import os
 from gargantext.settings import MEDIA_ROOT
...
+from datetime import MINYEAR
+from django.utils.dateparse import parse_datetime
+from django.utils.timezone import datetime as _datetime, utc as UTC, now as utcnow
+
+__all__ = ['convert_to_datetime', 'datetime', 'MINYEAR']
+
+
+class datetime(_datetime):
+    @staticmethod
+    def now():
+        return utcnow()
+
+    @staticmethod
+    def utcfromtimestamp(ts):
+        return _datetime.utcfromtimestamp(ts).replace(tzinfo=UTC)
+
+    @staticmethod
+    def parse(s):
+        dt = parse_datetime(s)
+        return dt.astimezone(UTC) if dt.tzinfo else dt.replace(tzinfo=UTC)
+
+
+def convert_to_datetime(dt):
+    if isinstance(dt, (int, float)):
+        return datetime.utcfromtimestamp(dt)
+    elif isinstance(dt, str):
+        return datetime.parse(dt)
+    elif isinstance(dt, _datetime):
+        args = (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
+        return datetime(*args, tzinfo=dt.tzinfo or UTC).astimezone(UTC)
+    else:
+        raise ValueError("Can't convert to datetime: %r" % dt)
@@ -29,6 +29,7 @@ class ModelCache(dict):
                 continue
             if formatted_key in self:
                 self[key] = self[formatted_key]
+                element = self[key]
             else:
                 element = session.query(self._model).filter(or_(*conditions)).first()
                 if element is None:
...
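The added line closes a cache-hit path where `element` was never bound: only the `else:` branch assigned it, so a hit on `formatted_key` blew up when `element` was used afterwards. Stripped-down repro of the pattern (names simplified, control flow assumed from the visible hunk):

    cache = {}

    def run_query(key):                  # stand-in for session.query(...).first()
        return {'id': key}

    def lookup(key, formatted_key):
        if formatted_key in cache:
            cache[key] = cache[formatted_key]
            element = cache[key]         # the fix: bind element on the hit path too
        else:
            element = cache[key] = run_query(key)
        return element                   # used to raise UnboundLocalError on a hit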
@@ -461,6 +461,7 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
                 raise ValueError('Wrong header "%s" on line %i (only possible headers are "label", "forms" and "status")' % (colname, n_read_lines))
             if 'label' not in columns:
                 raise ValueError('CSV must contain at least one column with the header "label"')
+            continue
         if not len(csv_row):
             continue
@@ -567,7 +568,8 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
         'map': UnweightedList(),
         'main': UnweightedList(),
         'stop': UnweightedList(),
-        'groupings' : Translations()
+        'groupings' : Translations(),
+        'new_ngram_count': n_added_ng,
     }
     for list_type in imported_nodes_ngrams:
@@ -663,6 +665,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
         for ng_id in new_lists[list_type].items:
             collect(ng_id)
+    if new_lists.get('new_ngram_count', 0) > 0:
         from gargantext.util.toolchain.main import t
         print("MERGE DEBUG: starting index_new_ngrams", t())
         n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
@@ -827,7 +830,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
 @shared_task
-def import_and_merge_ngramlists(file_contents, onto_corpus_id):
+def import_and_merge_ngramlists(file_contents, onto_corpus_id, overwrite=False):
     """
     A single function to run import_ngramlists and merge_ngramlists together
     """
@@ -837,6 +840,7 @@ def import_and_merge_ngramlists(file_contents, onto_corpus_id):
     corpus_node = session.query(Node).filter(Node.id == onto_corpus_id).first()
     # merge the new_lists onto those of the target corpus
-    log_msg = merge_ngramlists(new_lists, onto_corpus=corpus_node)
+    del_originals = ['stop', 'main', 'map'] if overwrite else []
+    log_msg = merge_ngramlists(new_lists, onto_corpus=corpus_node, del_originals=del_originals)
     return log_msg
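Callers now pick between merging and replacing: `overwrite=True` translates to `del_originals=['stop', 'main', 'map']`, so the corpus's previous lists are dropped before the imported ones are merged in. Usage sketch (the corpus id and file name are made up):

    from gargantext.util.ngramlists_tools import import_and_merge_ngramlists

    csv_contents = open('terms_export.csv').read()   # assumed CSV export

    # default: merge into the existing stop/main/map lists
    import_and_merge_ngramlists(csv_contents, onto_corpus_id=123)

    # replace them wholesale instead
    import_and_merge_ngramlists(csv_contents, onto_corpus_id=123, overwrite=True)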
@@ -73,9 +73,6 @@ class MultivacParser(Parser):
                 date = datetime.now()
                 hyperdata["publication_date"] = date
-                hyperdata["publication_year"] = str(date.year)
-                hyperdata["publication_month"] = str(date.month)
-                hyperdata["publication_day"] = str(date.day)
                 hyperdata_list.append(hyperdata)
...
-import datetime
 import dateutil.parser
 import zipfile
 import re
 import dateparser as date_parser
 from gargantext.util.languages import languages
+from gargantext.util import datetime, convert_to_datetime, MINYEAR

-DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
+DEFAULT_DATE = datetime(MINYEAR, 1, 1)

 class Parser:
@@ -34,29 +34,29 @@ class Parser:
     def format_hyperdata_dates(self, hyperdata):
         """Format the dates found in the hyperdata.
         Examples:
-            {"publication_date": "2014-10-23 09:57:42"}
-                -> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
+            {"publication_date": "2014-10-23 09:57:42+00:00"}
+                -> {"publication_date": "2014-10-23 09:57:42+00:00", "publication_year": "2014", ...}
             {"publication_year": "2014"}
-                -> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
+                -> {"publication_date": "2014-01-01 00:00:00+00:00", "publication_year": "2014", ...}
         """
         # First, check the split dates...
         # This part mainly deal with Zotero data but can be usefull for others
         # parts
-        date_string = hyperdata.get('publication_date_to_parse', None)
+        date_string = hyperdata.get('publication_date_to_parse')
         if date_string is not None:
             date_string = re.sub(r'\/\/+(\w*|\d*)', '', date_string)
             try:
-                hyperdata['publication' + "_date"] = dateutil.parser.parse(
+                hyperdata['publication_date'] = dateutil.parser.parse(
                     date_string,
                     default=DEFAULT_DATE
-                ).strftime("%Y-%m-%d %H:%M:%S")
+                )
             except Exception as error:
                 print(error, 'Date not parsed for:', date_string)
-                hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                hyperdata['publication_date'] = datetime.now()
-        elif hyperdata.get('publication_year', None) is not None:
+        elif hyperdata.get('publication_year') is not None:
             prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
             # eg prefixes : ['publication']
@@ -64,56 +64,45 @@ class Parser:
             for prefix in prefixes:
                 date_string = hyperdata[prefix + "_year"]
-                # FIXME: except for year is it necessary to test that key exists
-                # when we have a default value in .get(key, "01") ??
-                key = prefix + "_month"
-                if key in hyperdata:
-                    date_string += " " + hyperdata.get(key, "01")
-                key = prefix + "_day"
-                if key in hyperdata:
-                    date_string += " " + hyperdata.get(key, "01")
-                key = prefix + "_hour"
-                if key in hyperdata:
-                    date_string += " " + hyperdata.get(key, "01")
-                key = prefix + "_minute"
-                if key in hyperdata:
-                    date_string += ":" + hyperdata.get(key, "01")
-                key = prefix + "_second"
-                if key in hyperdata:
-                    date_string += ":" + hyperdata.get(key, "01")
+                for part in ('month', 'day', 'hour', 'minute', 'second'):
+                    key = prefix + '_' + part
+                    if key not in hyperdata:
+                        break
+                    sep = ":" if key in ('minute', 'second') else " "
+                    date_string += sep + hyperdata.get(key, '01')
                 try:
-                    hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
+                    hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string)
                 except Exception as error:
                     try:
                         print("_Parser: error in full date parse", error, date_string)
                         # Date format: 1994 NOV-DEC
-                        hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:8]).strftime("%Y-%m-%d %H:%M:%S")
+                        hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:8])
                     except Exception as error:
                         try:
                             print("_Parser: error in short date parse", error)
                             # FIXME Date format: 1994 SPR
                             # By default, we take the year only
-                            hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4]).strftime("%Y-%m-%d %H:%M:%S")
+                            hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4])
                         except Exception as error:
                             print("_Parser:", error)
         else:
             print("WARNING: Date unknown at _Parser level, using now()")
-            hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            hyperdata['publication_date'] = datetime.now()
         # ...then parse all the "date" fields, to parse it into separate elements
         prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
         for prefix in prefixes:
-            date = dateutil.parser.parse(hyperdata[prefix + "_date"])
-            #print(date)
-            hyperdata[prefix + "_year"] = date.strftime("%Y")
-            hyperdata[prefix + "_month"] = date.strftime("%m")
-            hyperdata[prefix + "_day"] = date.strftime("%d")
-            hyperdata[prefix + "_hour"] = date.strftime("%H")
-            hyperdata[prefix + "_minute"] = date.strftime("%M")
-            hyperdata[prefix + "_second"] = date.strftime("%S")
+            name = prefix + "_date"
+            date = hyperdata[name]
+            hyperdata[name] = str(convert_to_datetime(date))
+            for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
+                hyperdata[prefix + '_' + part] = getattr(date, part)
         # print("line 116", hyperdata['publication_date'])
         # finally, return the transformed result!
         return hyperdata
...
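Net effect in `_Parser`: dates stay `datetime` objects until the final pass, the stored `*_date` string gains an explicit UTC offset via `str(convert_to_datetime(...))`, and the split fields become ints from `getattr` instead of zero-padded strings. Illustrative run (a concrete `Parser` subclass instance and the input value are assumptions):

    hyperdata = {'publication_date_to_parse': '23 Oct 2014'}
    parser.format_hyperdata_dates(hyperdata)

    # hyperdata['publication_date']  == '2014-10-23 00:00:00+00:00'
    # hyperdata['publication_year']  == 2014   (int, previously '2014')
    # hyperdata['publication_month'] == 10     (int, previously '10')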
 # Make this a standalone script...
-# Can be called this way: python3 gargantext/tools/show_nodes.py
+# Can be called this way: python3 gargantext/util/show_nodes.py
 import os
 import django
...
@@ -43,8 +43,7 @@ def _nodes_hyperdata_generator(corpus):
                 key['id'],
                 None,
                 None,
-                value.strftime("%Y-%m-%d %H:%M:%S"),
-                # FIXME check timestamp +%Z
+                str(value),
                 None,
                 None,
             )
...
@@ -9,7 +9,6 @@ from gargantext.util.db import get_engine
 from gargantext.util.db_cache import cache
 from gargantext.constants import DEFAULT_COOC_THRESHOLD, NODETYPES
 from gargantext.constants import INDEXED_HYPERDATA
-from gargantext.util.tools import datetime, convert_to_date

 def compute_coocs( corpus,
                    overwrite_id = None,
...
 import os
 from gargantext.settings import MEDIA_ROOT
-import datetime
-import dateutil

-def convert_to_date(date):
-    if isinstance(date, (int, float)):
-        return datetime.datetime.timestamp(date)
-    else:
-        return dateutil.parser.parse(date)

 def ensure_dir(user):
     '''
     If user is new, folder does not exist yet, create it then
     '''
     dirpath = '%s/corpora/%s' % (MEDIA_ROOT, user.username)
     if not os.path.exists(dirpath):
         print("Creating folder %s" % dirpath)
         os.makedirs(dirpath)
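Dropping `convert_to_date` also buries a latent bug: on numeric input it called `datetime.datetime.timestamp()`, an instance method, with a raw number, which can only raise `TypeError`. The replacement in `dates.py` goes the correct way around (demonstration, not part of the commit):

    import datetime

    datetime.datetime.timestamp(1404986400)
    # TypeError: descriptor 'timestamp' ... requires a 'datetime.datetime' object

    datetime.datetime.utcfromtimestamp(1404986400)
    # -> datetime.datetime(2014, 7, 10, 10, 0)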
@@ -92,7 +92,8 @@ class CSVLists(APIView):
             log_msg = "Async generation"
             corpus_node_id = corpus_node.id
-            scheduled(import_and_merge_ngramlists)(csv_contents, corpus_node_id)
+            scheduled(import_and_merge_ngramlists)(csv_contents, corpus_node_id,
+                                                   overwrite=bool(params.get('overwrite')))
             return JsonHttpResponse({
                 'log': log_msg,
@@ -153,7 +154,8 @@ class CSVLists(APIView):
         # attempt to merge and send response
         try:
             # merge the source_lists onto those of the target corpus
-            log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node)
+            delete = todo_lists if bool(params.get('overwrite')) else []
+            log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node, del_originals=delete)
             return JsonHttpResponse({
                 'log': log_msg,
             }, 200)
...
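Seen from a client, the flag is just a query parameter, as the template changes below confirm. Hypothetical call (host, multipart field name and auth are all assumptions; the route shape follows the JS comment in the template):

    import requests

    with open('terms_export.csv', 'rb') as f:
        resp = requests.post(
            'http://localhost:8000/api/ngramlists/import?onto_corpus=2&overwrite=1',
            files={'csvfile': f},            # field name is a guess
            cookies={'sessionid': '...'},    # assumed authenticated session
        )
    print(resp.json()['log'])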
@@ -250,6 +250,23 @@ em {
     <br/>
+    <div class="checkbox">
+      <label>
+        <input type="checkbox" id="importoverwrite"> Overwrite old lists
+        <script>
+          function updateSubmitLabel() {
+            $('#importsubmit').val($(this).is(':checked') ? 'Overwrite current table' : 'Import and merge with current table');
+          }
+          $(function() {
+            updateSubmitLabel.call($('#importoverwrite'));
+            $('#importoverwrite').change(updateSubmitLabel);
+          });
+        </script>
+      </label>
+    </div>
+    <br/>
     <input type="submit" class="btn btn-xs btn-info" id="importsubmit" value="Import and merge with current table" />
   </form>
 </div>
@@ -372,6 +389,8 @@ function listmergeUpdate(aFormData){
     // all params are added in the url like a GET
     theUrl += "&from_corpus="+sourceCorpusId
     theUrl += "&todo="+todoLists.join(',')
+    if ($('#importoverwrite').is(':checked'))
+        theUrl += "&overwrite=1"
     // result url looks like this : /api/ngramlists/import?onto_corpus=2&from=13308&todo=map,stop
     // console.log(theUrl)
@@ -424,7 +443,7 @@ function listmergeCsvPost(theFile){
     //postCorpusFile
     $.ajax({
-        url: "{{importroute | safe}}",
+        url: "{{importroute | safe}}" + ($('#importoverwrite').is(':checked') ? '&overwrite=1' : ''),
         type: 'POST',
         async: true,
         contentType: false,
@@ -436,11 +455,11 @@ function listmergeCsvPost(theFile){
         success: function(response) {
            my_html = '<h3 style="color:green">File upload, you will receive a notification email</h3>'
            my_html += "<p class='note'>" + response['log'].replace(/\n/g, '<br/>') + "</p>"
-           my_html += "<p'>(this page will reload in 3s)</p>"
+           my_html += "<p'>(this page will reload in 30s)</p>"
            $('#formanswer').html(my_html);
            console.log(response) ;
            // reload after 3s
-           setTimeout("location.reload(true)", 3000);
+           setTimeout("location.reload(true)", 30000);
        },
        error: function(result, t) {
            if (t != 'timeout') {
...