Commit c7e81064 authored by Alexandre Delanoë

Merge branch 'testing-merge' into stable-imt-merge

parents 5a9f3b3a aa325e73
@@ -14,7 +14,7 @@ TELL ALEMBIC TO NOT START FROM SCRATCH
 # "upgrade head" command. If you don't want to do this, you can of course
 # drop your database and really start from scratch.
-alembic stamp 601e9d9baa4c
+alembic stamp bedce47c9e34
 UPGRADE TO LATEST DATABASE VERSION
......
@@ -7,7 +7,7 @@ Create Date: 2017-07-06 10:52:16.161118
 """
 from alembic import op
 import sqlalchemy as sa
-from gargantext.tools.alembic import ReplaceableObject
+from gargantext.util.alembic import ReplaceableObject
 # revision identifiers, used by Alembic.
......
"""Fix issue with Node.hyperdata index
Revision ID: bedce47c9e34
Revises: 08230100f262
Create Date: 2017-07-10 11:30:59.168190
"""
from alembic import op
import sqlalchemy as sa
import gargantext
# revision identifiers, used by Alembic.
revision = 'bedce47c9e34'
down_revision = '08230100f262'
branch_labels = None
depends_on = None
def upgrade():
op.drop_index('nodes_hyperdata_idx', table_name='nodes')
op.create_index('nodes_hyperdata_idx', 'nodes', ['hyperdata'], unique=False, postgresql_using="gin")
def downgrade():
# We won't unfix the bug when downgrading...
pass
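Note (not part of the patch): the revision above boils down to rebuilding the index as GIN. A rough manual equivalent, should it ever need to be applied outside Alembic, might look like this (the connection string is a placeholder; in practice "alembic upgrade head" does this for you):

from sqlalchemy import create_engine, text

engine = create_engine("postgresql:///gargandb")  # placeholder DSN, adjust to your setup

with engine.begin() as conn:
    # same effect as upgrade() above: drop the old index on the JSONB column...
    conn.execute(text("DROP INDEX IF EXISTS nodes_hyperdata_idx"))
    # ...and recreate it as a GIN index
    conn.execute(text("CREATE INDEX nodes_hyperdata_idx ON nodes USING gin (hyperdata)"))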
@@ -36,7 +36,7 @@ import os
 import re
 import importlib
 from gargantext.util.lists import *
-from gargantext.util.tools import datetime, convert_to_date
+from gargantext.util import datetime, convert_to_datetime
 from .settings import BASE_DIR
 # types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
@@ -108,9 +108,9 @@ INDEXED_HYPERDATA = {
     'publication_date':
         { 'id'             : 2
-        , 'type'           : datetime.datetime
-        , 'convert_to_db'  : convert_to_date
-        , 'convert_from_db': datetime.datetime.fromtimestamp
+        , 'type'           : datetime
+        , 'convert_to_db'  : convert_to_datetime
+        , 'convert_from_db': convert_to_datetime
         },
     'title':
......
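Note (not part of the patch): a rough sketch of how an INDEXED_HYPERDATA entry is consumed. Only the dict layout and convert_to_datetime come from this commit; the lookup and values below are illustrative.

from gargantext.constants import INDEXED_HYPERDATA

entry = INDEXED_HYPERDATA['publication_date']
db_value = entry['convert_to_db']("2014-10-23 09:57:42")   # -> timezone-aware datetime (UTC)
py_value = entry['convert_from_db'](db_value)              # same converter in both directions now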
 from django.core.management.base import BaseCommand, CommandError
-from gargantext.tools.show_nodes import tree_show, nodes
+from gargantext.util.show_nodes import tree_show, nodes
 import colorama
......
from django.core.management.base import BaseCommand, CommandError
from gargantext.models import Node


class Command(BaseCommand):
    help = 'Something'

    def handle(self, *args, **options):
        self.stdout.write(self.style.SUCCESS('Oh yeah!'))
@@ -58,26 +58,26 @@ class Node(Base):
     __tablename__ = 'nodes'
     __table_args__ = (
         Index('nodes_user_id_typename_parent_id_idx', 'user_id', 'typename', 'parent_id'),
-        Index('nodes_hyperdata_idx', 'hyperdata'))
+        Index('nodes_hyperdata_idx', 'hyperdata', postgresql_using='gin'))
     # TODO
     # create INDEX full_text_idx on nodes using gin(to_tsvector('english', hyperdata ->> 'abstract' || 'title'));
     id = Column(Integer, primary_key=True)
     typename = Column(NodeType, index=True)
     __mapper_args__ = { 'polymorphic_on': typename }
     # foreign keys
     user_id = Column(Integer, ForeignKey(User.id, ondelete='CASCADE'))
     user = relationship(User)
     parent_id = Column(Integer, ForeignKey('nodes.id', ondelete='CASCADE'))
     parent = relationship('Node', remote_side=[id])
     name = Column(String(255))
     date = Column(DateTime(timezone=True), default=datetime.now)
     hyperdata = Column(JSONB, default=dict)
     # metadata (see https://bashelton.com/2014/03/updating-postgresql-json-fields-via-sqlalchemy/)
     # To make search possible uncomment the line below
......
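Note (not part of the patch): the point of declaring the index with postgresql_using='gin' is that JSONB containment queries can actually use it. An illustrative query (the session import path and filter value are assumptions, not part of this commit):

from gargantext.util.db import session   # assumed import path
from gargantext.models import Node

# @>-style containment filter that a GIN index on hyperdata can serve
docs = (session.query(Node)
               .filter(Node.hyperdata.contains({'publication_year': 2014}))
               .all())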
from .dates import datetime, convert_to_datetime, MINYEAR
import os
from gargantext.settings import MEDIA_ROOT
from datetime import MINYEAR

from django.utils.dateparse import parse_datetime
from django.utils.timezone import datetime as _datetime, utc as UTC, now as utcnow

__all__ = ['convert_to_datetime', 'datetime', 'MINYEAR']


class datetime(_datetime):
    @staticmethod
    def now():
        return utcnow()

    @staticmethod
    def utcfromtimestamp(ts):
        return _datetime.utcfromtimestamp(ts).replace(tzinfo=UTC)

    @staticmethod
    def parse(s):
        dt = parse_datetime(s)
        return dt.astimezone(UTC) if dt.tzinfo else dt.replace(tzinfo=UTC)


def convert_to_datetime(dt):
    if isinstance(dt, (int, float)):
        return datetime.utcfromtimestamp(dt)
    elif isinstance(dt, str):
        return datetime.parse(dt)
    elif isinstance(dt, _datetime):
        args = (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
        return datetime(*args, tzinfo=dt.tzinfo or UTC).astimezone(UTC)
    else:
        raise ValueError("Can't convert to datetime: %r" % dt)
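Note (not part of the patch): these helpers normalize everything to timezone-aware UTC datetimes. A minimal usage sketch, with illustrative values:

from gargantext.util import datetime, convert_to_datetime

convert_to_datetime(1499680800)              # int/float -> datetime.utcfromtimestamp, tzinfo=UTC
convert_to_datetime("2017-07-10 11:30:59")   # str       -> datetime.parse, coerced to UTC
convert_to_datetime(datetime.now())          # datetime  -> rebuilt and normalized to UTC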
@@ -29,6 +29,7 @@ class ModelCache(dict):
                 continue
             if formatted_key in self:
                 self[key] = self[formatted_key]
+                element = self[key]
             else:
                 element = session.query(self._model).filter(or_(*conditions)).first()
                 if element is None:
......
@@ -461,6 +461,7 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
                     raise ValueError('Wrong header "%s" on line %i (only possible headers are "label", "forms" and "status")' % (colname, n_read_lines))
             if 'label' not in columns:
                 raise ValueError('CSV must contain at least one column with the header "label"')
+            continue
         if not len(csv_row):
             continue
@@ -567,7 +568,8 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
         'map': UnweightedList(),
         'main': UnweightedList(),
         'stop': UnweightedList(),
-        'groupings' : Translations()
+        'groupings' : Translations(),
+        'new_ngram_count': n_added_ng,
         }
     for list_type in imported_nodes_ngrams:
@@ -663,12 +665,13 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
         for ng_id in new_lists[list_type].items:
             collect(ng_id)
-    from gargantext.util.toolchain.main import t
-    print("MERGE DEBUG: starting index_new_ngrams", t())
-    n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
-    print("MERGE DEBUG: finished index_new_ngrams", t())
+    if new_lists.get('new_ngram_count', 0) > 0:
+        from gargantext.util.toolchain.main import t
+        print("MERGE DEBUG: starting index_new_ngrams", t())
+        n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
+        print("MERGE DEBUG: finished index_new_ngrams", t())
     my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)
     # ======== Get the old lists =========
     old_lists = {}
@@ -827,7 +830,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
 @shared_task
-def import_and_merge_ngramlists(file_contents, onto_corpus_id):
+def import_and_merge_ngramlists(file_contents, onto_corpus_id, overwrite=False):
     """
     A single function to run import_ngramlists and merge_ngramlists together
     """
@@ -837,6 +840,7 @@ def import_and_merge_ngramlists(file_contents, onto_corpus_id):
     corpus_node = session.query(Node).filter(Node.id == onto_corpus_id).first()
     # merge the new_lists onto those of the target corpus
-    log_msg = merge_ngramlists(new_lists, onto_corpus=corpus_node)
+    del_originals = ['stop', 'main', 'map'] if overwrite else []
+    log_msg = merge_ngramlists(new_lists, onto_corpus=corpus_node, del_originals=del_originals)
     return log_msg
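Note (not part of the patch): with the new keyword, the call path looks roughly like this (corpus id and contents are illustrative; import_and_merge_ngramlists is the task defined just above):

# overwrite=True makes import_and_merge_ngramlists pass
# del_originals=['stop', 'main', 'map'] to merge_ngramlists, so the corpus'
# existing lists are replaced instead of merged into.
log = import_and_merge_ngramlists(csv_contents, onto_corpus_id=42, overwrite=True)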
@@ -18,30 +18,30 @@ class MultivacParser(Parser):
         '''
         contents = filebuf.read().decode("UTF-8")
         data = json.loads(contents)
         filebuf.close()
         json_docs = data
         hyperdata_list = []
         hyperdata_path = { "id"       : "id"
                          , "title"    : "title"
                          , "abstract" : "abstract"
                          , "type"     : "type"
                          }
         for json_doc in json_docs:
             hyperdata = {}
             doc = json_doc["_source"]
             for key, path in hyperdata_path.items():
                 hyperdata[key] = doc.get(path, "")
             hyperdata["source"] = doc.get("serial" , {})\
                                      .get("journaltitle", "REPEC Database")
             try:
                 hyperdata["url"] = doc.get("file", {})\
                                       .get("url" , "")
@@ -51,15 +51,15 @@ class MultivacParser(Parser):
             hyperdata["authors"] = ", ".join(
                     [ p.get("person", {})
                        .get("name"  , "")
                       for p in doc.get("hasauthor", [])
                     ]
                     )
             year = doc.get("serial"   , {})\
                       .get("issuedate", None)
             if year == "Invalide date":
                 year = doc.get("issuedate" , None)
@@ -73,10 +73,7 @@ class MultivacParser(Parser):
                 date = datetime.now()
             hyperdata["publication_date"] = date
-            hyperdata["publication_year"]  = str(date.year)
-            hyperdata["publication_month"] = str(date.month)
-            hyperdata["publication_day"]   = str(date.day)
             hyperdata_list.append(hyperdata)
         return hyperdata_list
-import datetime
 import dateutil.parser
 import zipfile
 import re
 import dateparser as date_parser
 from gargantext.util.languages import languages
+from gargantext.util import datetime, convert_to_datetime, MINYEAR
-DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
+DEFAULT_DATE = datetime(MINYEAR, 1, 1)
 class Parser:
@@ -34,29 +34,29 @@ class Parser:
     def format_hyperdata_dates(self, hyperdata):
         """Format the dates found in the hyperdata.
         Examples:
-            {"publication_date": "2014-10-23 09:57:42"}
-                -> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
+            {"publication_date": "2014-10-23 09:57:42+00:00"}
+                -> {"publication_date": "2014-10-23 09:57:42+00:00", "publication_year": "2014", ...}
             {"publication_year": "2014"}
-                -> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
+                -> {"publication_date": "2014-01-01 00:00:00+00:00", "publication_year": "2014", ...}
         """
         # First, check the split dates...
         # This part mainly deal with Zotero data but can be usefull for others
         # parts
-        date_string = hyperdata.get('publication_date_to_parse', None)
+        date_string = hyperdata.get('publication_date_to_parse')
         if date_string is not None:
             date_string = re.sub(r'\/\/+(\w*|\d*)', '', date_string)
             try:
-                hyperdata['publication' + "_date"] = dateutil.parser.parse(
+                hyperdata['publication_date'] = dateutil.parser.parse(
                     date_string,
                     default=DEFAULT_DATE
-                ).strftime("%Y-%m-%d %H:%M:%S")
+                )
             except Exception as error:
                 print(error, 'Date not parsed for:', date_string)
-                hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        elif hyperdata.get('publication_year', None) is not None:
+                hyperdata['publication_date'] = datetime.now()
+        elif hyperdata.get('publication_year') is not None:
             prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
             # eg prefixes : ['publication']
@@ -64,56 +64,45 @@ class Parser:
             for prefix in prefixes:
                 date_string = hyperdata[prefix + "_year"]
-                # FIXME: except for year is it necessary to test that key exists
-                # when we have a default value in .get(key, "01") ??
-                key = prefix + "_month"
-                if key in hyperdata:
-                    date_string += " " + hyperdata.get(key, "01")
-                key = prefix + "_day"
-                if key in hyperdata:
-                    date_string += " " + hyperdata.get(key, "01")
-                key = prefix + "_hour"
-                if key in hyperdata:
-                    date_string += " " + hyperdata.get(key, "01")
-                key = prefix + "_minute"
-                if key in hyperdata:
-                    date_string += ":" + hyperdata.get(key, "01")
-                key = prefix + "_second"
-                if key in hyperdata:
-                    date_string += ":" + hyperdata.get(key, "01")
+                for part in ('month', 'day', 'hour', 'minute', 'second'):
+                    key = prefix + '_' + part
+                    if key not in hyperdata:
+                        break
+                    sep = ":" if key in ('minute', 'second') else " "
+                    date_string += sep + hyperdata.get(key, '01')
                 try:
-                    hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
+                    hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string)
                 except Exception as error:
                     try:
                         print("_Parser: error in full date parse", error, date_string)
                         # Date format: 1994 NOV-DEC
-                        hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:8]).strftime("%Y-%m-%d %H:%M:%S")
+                        hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:8])
                     except Exception as error:
                         try:
                             print("_Parser: error in short date parse", error)
                             # FIXME Date format: 1994 SPR
                             # By default, we take the year only
-                            hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4]).strftime("%Y-%m-%d %H:%M:%S")
+                            hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4])
                         except Exception as error:
                             print("_Parser:", error)
         else:
             print("WARNING: Date unknown at _Parser level, using now()")
-            hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            hyperdata['publication_date'] = datetime.now()
         # ...then parse all the "date" fields, to parse it into separate elements
         prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
         for prefix in prefixes:
-            date = dateutil.parser.parse(hyperdata[prefix + "_date"])
-            #print(date)
-            hyperdata[prefix + "_year"]   = date.strftime("%Y")
-            hyperdata[prefix + "_month"]  = date.strftime("%m")
-            hyperdata[prefix + "_day"]    = date.strftime("%d")
-            hyperdata[prefix + "_hour"]   = date.strftime("%H")
-            hyperdata[prefix + "_minute"] = date.strftime("%M")
-            hyperdata[prefix + "_second"] = date.strftime("%S")
+            name = prefix + "_date"
+            date = hyperdata[name]
+            hyperdata[name] = str(convert_to_datetime(date))
+            for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
+                hyperdata[prefix + '_' + part] = getattr(date, part)
         # print("line 116", hyperdata['publication_date'])
         # finally, return the transformed result!
         return hyperdata
......
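Note (not part of the patch): a worked example of what the rewritten year/month/day loop produces, with illustrative hyperdata:

hyperdata = {'publication_year': '2014', 'publication_month': '10', 'publication_day': '23'}
# date_string is built as "2014 10 23" (the loop breaks at the first missing
# part, here 'publication_hour'); dateutil turns it into datetime(2014, 10, 23),
# publication_date now stays a real datetime object, and the final pass fills
# publication_year/_month/_day/... back in with getattr(date, part) instead of
# strftime strings.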
 # Make this a standalone script...
-# Can be called this way: python3 gargantext/tools/show_nodes.py
+# Can be called this way: python3 gargantext/util/show_nodes.py
 import os
 import django
......
@@ -43,8 +43,7 @@ def _nodes_hyperdata_generator(corpus):
                         key['id'],
                         None,
                         None,
-                        value.strftime("%Y-%m-%d %H:%M:%S"),
-                        # FIXME check timestamp +%Z
+                        str(value),
                         None,
                         None,
                     )
......
@@ -9,7 +9,6 @@ from gargantext.util.db import get_engine
 from gargantext.util.db_cache import cache
 from gargantext.constants import DEFAULT_COOC_THRESHOLD, NODETYPES
 from gargantext.constants import INDEXED_HYPERDATA
-from gargantext.util.tools import datetime, convert_to_date
 def compute_coocs( corpus,
                    overwrite_id = None,
@@ -95,7 +94,7 @@ def compute_coocs( corpus,
     # 2b) stating the filters
     cooc_filter_sql = """
             WHERE
                 n.typename = {nodetype_id}
             AND n.parent_id = {corpus_id}
             GROUP BY 1,2
@@ -105,7 +104,7 @@ def compute_coocs( corpus,
         """.format( nodetype_id = NODETYPES.index('DOCUMENT')
                   , corpus_id = corpus.id
                   )
     # 3) taking the cooccurrences of ngram x2
     ngram_filter_A_sql += """
         -- STEP 1: X axis of the matrix
@@ -162,25 +161,25 @@ def compute_coocs( corpus,
     # 4) prepare the synonyms
     if groupings_id:
         ngram_filter_A_sql += """
            LEFT JOIN nodes_ngrams_ngrams
                   AS grA ON wlA.ngram_id = grA.ngram1_id
                  AND grA.node_id = {groupings_id}
            -- \--> adding (joining) ngrams that are grouped
            LEFT JOIN nodes_ngrams
                   AS wlAA ON grA.ngram2_id = wlAA.ngram_id
                  AND wlAA.node_id = wlA.node_id
            -- \--> adding (joining) ngrams that are not grouped
            --LEFT JOIN ngrams AS wlAA ON grA.ngram2_id = wlAA.id
            -- \--> for joining all synonyms even if they are not in the main list (white list)
            """.format(groupings_id = groupings_id)
        ngram_filter_B_sql += """
            LEFT JOIN nodes_ngrams_ngrams
                   AS grB ON wlB.ngram_id = grB.ngram1_id
                  AND grB.node_id = {groupings_id}
            -- \--> adding (joining) ngrams that are grouped
            LEFT JOIN nodes_ngrams
                   AS wlBB ON grB.ngram2_id = wlBB.ngram_id
                  AND wlBB.node_id = wlB.node_id
            -- \--> adding (joining) ngrams that are not grouped
......
import os
from gargantext.settings import MEDIA_ROOT

import datetime
import dateutil


def convert_to_date(date):
    if isinstance(date, (int, float)):
        return datetime.datetime.timestamp(date)
    else:
        return dateutil.parser.parse(date)


def ensure_dir(user):
    '''
    If user is new, folder does not exist yet, create it then
    '''
    dirpath = '%s/corpora/%s' % (MEDIA_ROOT, user.username)
    if not os.path.exists(dirpath):
        print("Creating folder %s" % dirpath)
        os.makedirs(dirpath)
@@ -90,10 +90,11 @@ class CSVLists(APIView):
             # import the csv
             # try:
             log_msg = "Async generation"
             corpus_node_id = corpus_node.id
-            scheduled(import_and_merge_ngramlists)(csv_contents, corpus_node_id)
+            scheduled(import_and_merge_ngramlists)(csv_contents, corpus_node_id,
+                                                   overwrite=bool(params.get('overwrite')))
             return JsonHttpResponse({
                     'log': log_msg,
                 }, 200)
@@ -153,7 +154,8 @@ class CSVLists(APIView):
             # attempt to merge and send response
             try:
                 # merge the source_lists onto those of the target corpus
-                log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node)
+                delete = todo_lists if bool(params.get('overwrite')) else []
+                log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node, del_originals=delete)
                 return JsonHttpResponse({
                         'log': log_msg,
                     }, 200)
......
@@ -250,6 +250,23 @@ em {
         <br/>
+        <div class="checkbox">
+          <label>
+            <input type="checkbox" id="importoverwrite"> Overwrite old lists
+            <script>
+              function updateSubmitLabel() {
+                  $('#importsubmit').val($(this).is(':checked') ? 'Overwrite current table' : 'Import and merge with current table');
+              }
+              $(function() {
+                  updateSubmitLabel.call($('#importoverwrite'));
+                  $('#importoverwrite').change(updateSubmitLabel);
+              });
+            </script>
+          </label>
+        </div>
+        <br/>
         <input type="submit" class="btn btn-xs btn-info" id="importsubmit" value="Import and merge with current table" />
     </form>
 </div>
@@ -372,6 +389,8 @@ function listmergeUpdate(aFormData){
     // all params are added in the url like a GET
     theUrl += "&from_corpus="+sourceCorpusId
     theUrl += "&todo="+todoLists.join(',')
+    if ($('#importoverwrite').is(':checked'))
+        theUrl += "&overwrite=1"
     // result url looks like this : /api/ngramlists/import?onto_corpus=2&from=13308&todo=map,stop
     // console.log(theUrl)
@@ -424,7 +443,7 @@ function listmergeCsvPost(theFile){
     //postCorpusFile
     $.ajax({
-        url: "{{importroute | safe}}",
+        url: "{{importroute | safe}}" + ($('#importoverwrite').is(':checked') ? '&overwrite=1' : ''),
         type: 'POST',
         async: true,
         contentType: false,
@@ -436,11 +455,11 @@ function listmergeCsvPost(theFile){
         success: function(response) {
            my_html = '<h3 style="color:green">File upload, you will receive a notification email</h3>'
            my_html += "<p class='note'>" + response['log'].replace(/\n/g, '<br/>') + "</p>"
-           my_html += "<p'>(this page will reload in 3s)</p>"
+           my_html += "<p'>(this page will reload in 30s)</p>"
            $('#formanswer').html(my_html);
            console.log(response) ;
            // reload after 3s
-           setTimeout("location.reload(true)", 3000);
+           setTimeout("location.reload(true)", 30000);
         },
         error: function(result, t) {
             if (t != 'timeout') {
......