Commit c7e81064 authored by Alexandre Delanoë

Merge branch 'testing-merge' into stable-imt-merge

parents 5a9f3b3a aa325e73
......@@ -14,7 +14,7 @@ TELL ALEMBIC TO NOT START FROM SCRATCH
# "upgrade head" command. If you don't want to do this, you can of course
# drop your database and really start from scratch.
alembic stamp 601e9d9baa4c
alembic stamp bedce47c9e34
UPGRADE TO LATEST DATABASE VERSION
......
......@@ -7,7 +7,7 @@ Create Date: 2017-07-06 10:52:16.161118
"""
from alembic import op
import sqlalchemy as sa
from gargantext.tools.alembic import ReplaceableObject
from gargantext.util.alembic import ReplaceableObject
# revision identifiers, used by Alembic.
......
"""Fix issue with Node.hyperdata index
Revision ID: bedce47c9e34
Revises: 08230100f262
Create Date: 2017-07-10 11:30:59.168190
"""
from alembic import op
import sqlalchemy as sa
import gargantext
# revision identifiers, used by Alembic.
revision = 'bedce47c9e34'
down_revision = '08230100f262'
branch_labels = None
depends_on = None
def upgrade():
op.drop_index('nodes_hyperdata_idx', table_name='nodes')
op.create_index('nodes_hyperdata_idx', 'nodes', ['hyperdata'], unique=False, postgresql_using="gin")
def downgrade():
# We won't unfix the bug when downgrading...
pass
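Since the fix concerns the index method, a minimal sketch of the kind of query the GIN index serves may help; it assumes the session object is importable from gargantext.util.db as elsewhere in the codebase, and the filter value is illustrative:

from gargantext.util.db import session
from gargantext.models import Node

# A GIN (jsonb_ops) index on nodes.hyperdata lets PostgreSQL use it for the
# JSONB containment operator @>, which SQLAlchemy exposes as .contains():
docs = (session.query(Node)
               .filter(Node.hyperdata.contains({'publication_year': '2014'}))
               .all())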
......@@ -36,7 +36,7 @@ import os
import re
import importlib
from gargantext.util.lists import *
from gargantext.util.tools import datetime, convert_to_date
from gargantext.util import datetime, convert_to_datetime
from .settings import BASE_DIR
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
......@@ -108,9 +108,9 @@ INDEXED_HYPERDATA = {
'publication_date':
{ 'id' : 2
, 'type' : datetime.datetime
, 'convert_to_db' : convert_to_date
, 'convert_from_db': datetime.datetime.fromtimestamp
, 'type' : datetime
, 'convert_to_db' : convert_to_datetime
, 'convert_from_db': convert_to_datetime
},
'title':
......
from django.core.management.base import BaseCommand, CommandError
from gargantext.tools.show_nodes import tree_show, nodes
from gargantext.util.show_nodes import tree_show, nodes
import colorama
......
from django.core.management.base import BaseCommand, CommandError
from gargantext.models import Node
class Command(BaseCommand):
help = 'Something'
def handle(self, *args, **options):
self.stdout.write(self.style.SUCCESS('Oh yeah!'))
......@@ -58,26 +58,26 @@ class Node(Base):
__tablename__ = 'nodes'
__table_args__ = (
Index('nodes_user_id_typename_parent_id_idx', 'user_id', 'typename', 'parent_id'),
Index('nodes_hyperdata_idx', 'hyperdata'))
# TODO
Index('nodes_hyperdata_idx', 'hyperdata', postgresql_using='gin'))
# TODO
# create INDEX full_text_idx on nodes using gin(to_tsvector('english', hyperdata ->> 'abstract' || 'title'));
id = Column(Integer, primary_key=True)
typename = Column(NodeType, index=True)
__mapper_args__ = { 'polymorphic_on': typename }
# foreign keys
user_id = Column(Integer, ForeignKey(User.id, ondelete='CASCADE'))
user = relationship(User)
parent_id = Column(Integer, ForeignKey('nodes.id', ondelete='CASCADE'))
parent = relationship('Node', remote_side=[id])
name = Column(String(255))
date = Column(DateTime(timezone=True), default=datetime.now)
hyperdata = Column(JSONB, default=dict)
# metadata (see https://bashelton.com/2014/03/updating-postgresql-json-fields-via-sqlalchemy/)
# To make search possible uncomment the line below
......
from .dates import datetime, convert_to_datetime, MINYEAR
import os
from gargantext.settings import MEDIA_ROOT
from datetime import MINYEAR
from django.utils.dateparse import parse_datetime
from django.utils.timezone import datetime as _datetime, utc as UTC, now as utcnow
__all__ = ['convert_to_datetime', 'datetime', 'MINYEAR']
class datetime(_datetime):
@staticmethod
def now():
return utcnow()
@staticmethod
def utcfromtimestamp(ts):
return _datetime.utcfromtimestamp(ts).replace(tzinfo=UTC)
@staticmethod
def parse(s):
dt = parse_datetime(s)
return dt.astimezone(UTC) if dt.tzinfo else dt.replace(tzinfo=UTC)
def convert_to_datetime(dt):
if isinstance(dt, (int, float)):
return datetime.utcfromtimestamp(dt)
elif isinstance(dt, str):
return datetime.parse(dt)
elif isinstance(dt, _datetime):
args = (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
return datetime(*args, tzinfo=dt.tzinfo or UTC).astimezone(UTC)
else:
raise ValueError("Can't convert to datetime: %r" % dt)
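A short usage sketch of the new helpers (the input values are illustrative assumptions; the import path mirrors the gargantext.util re-export above):

from gargantext.util import datetime, convert_to_datetime

convert_to_datetime(1499682659)              # int/float UNIX timestamp -> aware UTC datetime
convert_to_datetime("2014-10-23 09:57:42")   # naive string -> parsed, assumed UTC
convert_to_datetime(datetime(2014, 10, 23))  # datetime -> normalised to UTC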
......@@ -29,6 +29,7 @@ class ModelCache(dict):
continue
if formatted_key in self:
self[key] = self[formatted_key]
element = self[key]
else:
element = session.query(self._model).filter(or_(*conditions)).first()
if element is None:
......
......@@ -461,6 +461,7 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
raise ValueError('Wrong header "%s" on line %i (only possible headers are "label", "forms" and "status")' % (colname, n_read_lines))
if 'label' not in columns:
raise ValueError('CSV must contain at least one column with the header "label"')
continue
if not len(csv_row):
continue
......@@ -567,7 +568,8 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
'map': UnweightedList(),
'main': UnweightedList(),
'stop': UnweightedList(),
'groupings' : Translations()
'groupings' : Translations(),
'new_ngram_count': n_added_ng,
}
for list_type in imported_nodes_ngrams:
......@@ -663,12 +665,13 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
for ng_id in new_lists[list_type].items:
collect(ng_id)
from gargantext.util.toolchain.main import t
print("MERGE DEBUG: starting index_new_ngrams", t())
n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
print("MERGE DEBUG: finished index_new_ngrams", t())
if new_lists.get('new_ngram_count', 0) > 0:
from gargantext.util.toolchain.main import t
print("MERGE DEBUG: starting index_new_ngrams", t())
n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
print("MERGE DEBUG: finished index_new_ngrams", t())
my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)
my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)
# ======== Get the old lists =========
old_lists = {}
......@@ -827,7 +830,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
@shared_task
def import_and_merge_ngramlists(file_contents, onto_corpus_id):
def import_and_merge_ngramlists(file_contents, onto_corpus_id, overwrite=False):
"""
A single function to run import_ngramlists and merge_ngramlists together
"""
......@@ -837,6 +840,7 @@ def import_and_merge_ngramlists(file_contents, onto_corpus_id):
corpus_node = session.query(Node).filter(Node.id == onto_corpus_id).first()
# merge the new_lists onto those of the target corpus
log_msg = merge_ngramlists(new_lists, onto_corpus=corpus_node)
del_originals = ['stop', 'main', 'map'] if overwrite else []
log_msg = merge_ngramlists(new_lists, onto_corpus=corpus_node, del_originals=del_originals)
return log_msg
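A hedged sketch of calling the updated task directly; the module path, corpus id and CSV source are assumptions, only the signature comes from the diff above:

from gargantext.util.ngramlists_tools import import_and_merge_ngramlists

file_contents = open('exported_terms.csv')   # hypothetical CSV export from another corpus
# overwrite=True asks merge_ngramlists to delete the existing stop/main/map
# lists before merging, instead of the default merge-and-keep behaviour.
log = import_and_merge_ngramlists(file_contents, onto_corpus_id=1234, overwrite=True)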
......@@ -18,30 +18,30 @@ class MultivacParser(Parser):
'''
contents = filebuf.read().decode("UTF-8")
data = json.loads(contents)
filebuf.close()
json_docs = data
hyperdata_list = []
hyperdata_path = { "id" : "id"
, "title" : "title"
, "abstract" : "abstract"
, "type" : "type"
}
for json_doc in json_docs:
hyperdata = {}
doc = json_doc["_source"]
for key, path in hyperdata_path.items():
hyperdata[key] = doc.get(path, "")
hyperdata["source"] = doc.get("serial" , {})\
.get("journaltitle", "REPEC Database")
try:
hyperdata["url"] = doc.get("file", {})\
.get("url" , "")
......@@ -51,15 +51,15 @@ class MultivacParser(Parser):
hyperdata["authors"] = ", ".join(
[ p.get("person", {})
.get("name" , "")
for p in doc.get("hasauthor", [])
]
)
year = doc.get("serial" , {})\
.get("issuedate", None)
if year == "Invalide date":
year = doc.get("issuedate" , None)
......@@ -73,10 +73,7 @@ class MultivacParser(Parser):
date = datetime.now()
hyperdata["publication_date"] = date
hyperdata["publication_year"] = str(date.year)
hyperdata["publication_month"] = str(date.month)
hyperdata["publication_day"] = str(date.day)
hyperdata_list.append(hyperdata)
return hyperdata_list
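For reference, a hedged sketch of the Multivac JSON shape this parser expects; only the key names come from the code above, the values are illustrative placeholders:

json_docs = [
    { "_source": { "id"        : "doc-123"
                 , "title"     : "An illustrative title"
                 , "abstract"  : "An illustrative abstract"
                 , "type"      : "article"
                 , "serial"    : { "journaltitle": "Some Journal"
                                 , "issuedate"   : "2014-10-23" }
                 , "hasauthor" : [ { "person": { "name": "A. Author" } } ]
                 , "file"      : { "url": "http://example.org/doc-123.pdf" }
                 }
    }
]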
import datetime
import dateutil.parser
import zipfile
import re
import dateparser as date_parser
from gargantext.util.languages import languages
from gargantext.util import datetime, convert_to_datetime, MINYEAR
DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
DEFAULT_DATE = datetime(MINYEAR, 1, 1)
class Parser:
......@@ -34,29 +34,29 @@ class Parser:
def format_hyperdata_dates(self, hyperdata):
"""Format the dates found in the hyperdata.
Examples:
{"publication_date": "2014-10-23 09:57:42"}
-> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
{"publication_date": "2014-10-23 09:57:42+00:00"}
-> {"publication_date": "2014-10-23 09:57:42+00:00", "publication_year": "2014", ...}
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
-> {"publication_date": "2014-01-01 00:00:00+00:00", "publication_year": "2014", ...}
"""
# First, check the split dates...
# This part mainly deals with Zotero data but can be useful for other
# parts
date_string = hyperdata.get('publication_date_to_parse', None)
date_string = hyperdata.get('publication_date_to_parse')
if date_string is not None:
date_string = re.sub(r'\/\/+(\w*|\d*)', '', date_string)
try:
hyperdata['publication' + "_date"] = dateutil.parser.parse(
hyperdata['publication_date'] = dateutil.parser.parse(
date_string,
default=DEFAULT_DATE
).strftime("%Y-%m-%d %H:%M:%S")
)
except Exception as error:
print(error, 'Date not parsed for:', date_string)
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
hyperdata['publication_date'] = datetime.now()
elif hyperdata.get('publication_year', None) is not None:
elif hyperdata.get('publication_year') is not None:
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
# eg prefixes : ['publication']
......@@ -64,56 +64,45 @@ class Parser:
for prefix in prefixes:
date_string = hyperdata[prefix + "_year"]
# FIXME: except for year is it necessary to test that key exists
# when we have a default value in .get(key, "01") ??
key = prefix + "_month"
if key in hyperdata:
date_string += " " + hyperdata.get(key, "01")
key = prefix + "_day"
if key in hyperdata:
date_string += " " + hyperdata.get(key, "01")
key = prefix + "_hour"
if key in hyperdata:
date_string += " " + hyperdata.get(key, "01")
key = prefix + "_minute"
if key in hyperdata:
date_string += ":" + hyperdata.get(key, "01")
key = prefix + "_second"
if key in hyperdata:
date_string += ":" + hyperdata.get(key, "01")
for part in ('month', 'day', 'hour', 'minute', 'second'):
key = prefix + '_' + part
if key not in hyperdata:
break
sep = ":" if key in ('minute', 'second') else " "
date_string += sep + hyperdata.get(key, '01')
try:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string)
except Exception as error:
try:
print("_Parser: error in full date parse", error, date_string)
# Date format: 1994 NOV-DEC
hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:8]).strftime("%Y-%m-%d %H:%M:%S")
hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:8])
except Exception as error:
try:
print("_Parser: error in short date parse", error)
# FIXME Date format: 1994 SPR
# By default, we take the year only
hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4]).strftime("%Y-%m-%d %H:%M:%S")
hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4])
except Exception as error:
print("_Parser:", error)
else:
print("WARNING: Date unknown at _Parser level, using now()")
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
hyperdata['publication_date'] = datetime.now()
# ...then parse all the "date" fields, to parse it into separate elements
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
for prefix in prefixes:
date = dateutil.parser.parse(hyperdata[prefix + "_date"])
#print(date)
hyperdata[prefix + "_year"] = date.strftime("%Y")
hyperdata[prefix + "_month"] = date.strftime("%m")
hyperdata[prefix + "_day"] = date.strftime("%d")
hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S")
name = prefix + "_date"
date = hyperdata[name]
hyperdata[name] = str(convert_to_datetime(date))
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
hyperdata[prefix + '_' + part] = getattr(date, part)
# print("line 116", hyperdata['publication_date'])
# finally, return the transformed result!
return hyperdata
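A hedged before/after illustration of the reworked method on split Zotero-style fields; the input values are assumptions:

hyperdata = {'publication_year': '2014', 'publication_month': '10', 'publication_day': '23'}
# after format_hyperdata_dates(hyperdata):
#   hyperdata['publication_date']  ~ '2014-10-23 00:00:00+00:00'  (stringified aware datetime)
#   hyperdata['publication_month'] ~ 10   (int attribute of the parsed date; previously a zero-padded '%m' string)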
......
# Make this a standalone script...
# Can be called this way: python3 gargantext/tools/show_nodes.py
# Can be called this way: python3 gargantext/util/show_nodes.py
import os
import django
......
......@@ -43,8 +43,7 @@ def _nodes_hyperdata_generator(corpus):
key['id'],
None,
None,
value.strftime("%Y-%m-%d %H:%M:%S"),
# FIXME check timestamp +%Z
str(value),
None,
None,
)
......
......@@ -9,7 +9,6 @@ from gargantext.util.db import get_engine
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD, NODETYPES
from gargantext.constants import INDEXED_HYPERDATA
from gargantext.util.tools import datetime, convert_to_date
def compute_coocs( corpus,
overwrite_id = None,
......@@ -95,7 +94,7 @@ def compute_coocs( corpus,
# 2b) stating the filters
cooc_filter_sql = """
WHERE
WHERE
n.typename = {nodetype_id}
AND n.parent_id = {corpus_id}
GROUP BY 1,2
......@@ -105,7 +104,7 @@ def compute_coocs( corpus,
""".format( nodetype_id = NODETYPES.index('DOCUMENT')
, corpus_id=corpus.id
)
# 3) taking the cooccurrences of ngram x2
ngram_filter_A_sql += """
-- STEP 1: X axis of the matrix
......@@ -162,25 +161,25 @@ def compute_coocs( corpus,
# 4) prepare the synonyms
if groupings_id:
ngram_filter_A_sql += """
LEFT JOIN nodes_ngrams_ngrams
AS grA ON wlA.ngram_id = grA.ngram1_id
LEFT JOIN nodes_ngrams_ngrams
AS grA ON wlA.ngram_id = grA.ngram1_id
AND grA.node_id = {groupings_id}
-- \--> adding (joining) ngrams that are grouped
LEFT JOIN nodes_ngrams
AS wlAA ON grA.ngram2_id = wlAA.ngram_id
AND wlAA.node_id = wlA.node_id
AND wlAA.node_id = wlA.node_id
-- \--> adding (joining) ngrams that are not grouped
--LEFT JOIN ngrams AS wlAA ON grA.ngram2_id = wlAA.id
-- \--> for joining all synonyms even if they are not in the main list (white list)
""".format(groupings_id = groupings_id)
ngram_filter_B_sql += """
LEFT JOIN nodes_ngrams_ngrams
AS grB ON wlB.ngram_id = grB.ngram1_id
AS grB ON wlB.ngram_id = grB.ngram1_id
AND grB.node_id = {groupings_id}
-- \--> adding (joining) ngrams that are grouped
LEFT JOIN nodes_ngrams
LEFT JOIN nodes_ngrams
AS wlBB ON grB.ngram2_id = wlBB.ngram_id
AND wlBB.node_id = wlB.node_id
-- \--> adding (joining) ngrams that are not grouped
......
import os
from gargantext.settings import MEDIA_ROOT
import datetime
import dateutil
def convert_to_date(date):
if isinstance(date, (int, float)):
return datetime.datetime.timestamp(date)
else:
return dateutil.parser.parse(date)
def ensure_dir(user):
'''
If user is new, folder does not exist yet, create it then
'''
dirpath = '%s/corpora/%s' % (MEDIA_ROOT, user.username)
if not os.path.exists(dirpath):
print("Creating folder %s" % dirpath)
os.makedirs(dirpath)
......@@ -90,10 +90,11 @@ class CSVLists(APIView):
# import the csv
# try:
log_msg = "Async generation"
corpus_node_id = corpus_node.id
scheduled(import_and_merge_ngramlists)(csv_contents, corpus_node_id)
scheduled(import_and_merge_ngramlists)(csv_contents, corpus_node_id,
overwrite=bool(params.get('overwrite')))
return JsonHttpResponse({
'log': log_msg,
}, 200)
......@@ -153,7 +154,8 @@ class CSVLists(APIView):
# attempt to merge and send response
try:
# merge the source_lists onto those of the target corpus
log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node)
delete = todo_lists if bool(params.get('overwrite')) else []
log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node, del_originals=delete)
return JsonHttpResponse({
'log': log_msg,
}, 200)
......
......@@ -250,6 +250,23 @@ em {
<br/>
<div class="checkbox">
<label>
<input type="checkbox" id="importoverwrite"> Overwrite old lists
<script>
function updateSubmitLabel() {
$('#importsubmit').val($(this).is(':checked') ? 'Overwrite current table' : 'Import and merge with current table');
}
$(function() {
updateSubmitLabel.call($('#importoverwrite'));
$('#importoverwrite').change(updateSubmitLabel);
});
</script>
</label>
</div>
<br/>
<input type="submit" class="btn btn-xs btn-info" id="importsubmit" value="Import and merge with current table" />
</form>
</div>
......@@ -372,6 +389,8 @@ function listmergeUpdate(aFormData){
// all params are added in the url like a GET
theUrl += "&from_corpus="+sourceCorpusId
theUrl += "&todo="+todoLists.join(',')
if ($('#importoverwrite').is(':checked'))
theUrl += "&overwrite=1"
// result url looks like this : /api/ngramlists/import?onto_corpus=2&from=13308&todo=map,stop
// console.log(theUrl)
......@@ -424,7 +443,7 @@ function listmergeCsvPost(theFile){
//postCorpusFile
$.ajax({
url: "{{importroute | safe}}",
url: "{{importroute | safe}}" + ($('#importoverwrite').is(':checked') ? '&overwrite=1' : ''),
type: 'POST',
async: true,
contentType: false,
......@@ -436,11 +455,11 @@ function listmergeCsvPost(theFile){
success: function(response) {
my_html = '<h3 style="color:green">File upload, you will receive a notification email</h3>'
my_html += "<p class='note'>" + response['log'].replace(/\n/g, '<br/>') + "</p>"
my_html += "<p'>(this page will reload in 3s)</p>"
my_html += "<p'>(this page will reload in 30s)</p>"
$('#formanswer').html(my_html);
console.log(response) ;
// reload after 3s
setTimeout("location.reload(true)", 3000);
setTimeout("location.reload(true)", 30000);
},
error: function(result, t) {
if (t != 'timeout') {
......