humanities / gargantext

Commit e24efe96
Authored Sep 08, 2017 by Alexandre Delanoë

    Merge remote-tracking branch 'origin/simon-unstable-lists-fix' into unstable

Parents: 06f55400, 224eae66
Showing 6 changed files with 170 additions and 195 deletions:

    gargantext/util/group_tools.py                   +17  -14
    gargantext/util/lists.py                          +3   -0
    gargantext/util/ngramlists_tools.py              +42  -31
    gargantext/util/parsers/CSV.py                   +51 -112
    gargantext/util/toolchain/ngrams_extraction.py   +38  -37
    templates/pages/projects/project.html            +19   -1
gargantext/util/group_tools.py

@@ -7,7 +7,7 @@ from gargantext.util.db import session, aliased
 from gargantext.models import Ngram, NodeNgramNgram
 from igraph import Graph # for group_union

-def query_groups(groupings_id, details=False):
+def query_groups(groupings_id, details=False, sort=False):
     """
     Listing of couples (mainform, subform)
       aka (ngram1_id, ngram2_id)
@@ -15,24 +15,27 @@ def query_groups(groupings_id, details=False):
     Parameter:
       - details: if False, just send the array of couples
                  if True, send quadruplets with (ngram1_id, term1, ngram2_id, term2)
+      - sort: order results by terms of ngram1 then ngram2
     """
+    if details or sort:
+        Ngram1, Ngram2 = Ngram, aliased(Ngram)
+
     if not details:
         # simple contents
-        query = session.query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
+        columns = (NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
     else:
         # detailed contents (id + terms)
-        Ngram1 = aliased(Ngram)
-        Ngram2 = aliased(Ngram)
-        query = (session.query(
-                     NodeNgramNgram.ngram1_id,
-                     Ngram1.terms,
-                     NodeNgramNgram.ngram2_id,
-                     Ngram2.terms,
-                 )
-                 .join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
-                 .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id)
-                )
+        columns = (Ngram1.id, Ngram1.terms,
+                   Ngram2.id, Ngram2.terms)
+
+    query = session.query(*columns)
+
+    if details or sort:
+        query = (query.join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
+                      .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id))
+
+    if sort:
+        query = query.order_by(Ngram1.terms, Ngram2.terms)

     # main filter
     # -----------
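Note: to see the refactored query construction in action, here is a minimal self-contained sketch of the same pattern, using throwaway models on an in-memory SQLite database. The models, the sample data, and the explicit select_from() are stand-ins (the real function also filters on the grouping node, elided above); assumes SQLAlchemy 1.4+.

    from sqlalchemy import Column, Integer, Text, create_engine
    from sqlalchemy.orm import aliased, declarative_base, sessionmaker

    Base = declarative_base()

    class Ngram(Base):
        __tablename__ = 'ngrams'
        id    = Column(Integer, primary_key=True)
        terms = Column(Text)

    class NodeNgramNgram(Base):
        __tablename__ = 'nodes_ngrams_ngrams'
        id        = Column(Integer, primary_key=True)
        ngram1_id = Column(Integer)
        ngram2_id = Column(Integer)

    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    session = sessionmaker(engine)()
    session.add_all([Ngram(id=1, terms='b-term'), Ngram(id=2, terms='a-term'),
                     Ngram(id=3, terms='sub'),
                     NodeNgramNgram(ngram1_id=1, ngram2_id=3),
                     NodeNgramNgram(ngram1_id=2, ngram2_id=3)])
    session.commit()

    def query_groups(details=False, sort=False):
        # same shape as the patched function (grouping-node filter omitted)
        if details or sort:
            Ngram1, Ngram2 = Ngram, aliased(Ngram)

        if not details:
            # simple contents: couples of ids
            columns = (NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
        else:
            # detailed contents: (id, terms) for mainform and subform
            columns = (Ngram1.id, Ngram1.terms, Ngram2.id, Ngram2.terms)

        # pin the FROM clause so the joins below always start from the
        # relation table (the sketch's addition, for safety)
        query = session.query(*columns).select_from(NodeNgramNgram)

        if details or sort:
            # the join is now also performed when needed only for sorting
            query = (query.join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
                          .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id))

        if sort:
            query = query.order_by(Ngram1.terms, Ngram2.terms)

        return query

    print(query_groups().all())                         # => [(1, 3), (2, 3)]
    print(query_groups(details=True, sort=True).all())  # quadruplets, sorted by terms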
gargantext/util/lists.py

@@ -50,6 +50,9 @@ class _BaseClass:
         else:
             return NotImplemented

+    def __len__(self):
+        return len(self.items)
+
     def __repr__(self):
         items = self.items
         if isinstance(items, defaultdict):
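Note: the practical effect of the added __len__ is that every list wrapper built on _BaseClass (which stores its content in .items) now answers len() directly. A minimal sketch with a stand-in class:

    class _BaseClass:
        # stand-in: only the bits relevant to the new dunder
        def __init__(self, items):
            self.items = items

        def __len__(self):
            return len(self.items)

    stoplist = _BaseClass({101, 205, 333})   # e.g. a set of ngram ids
    print(len(stoplist))                     # => 3, instead of len(stoplist.items)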
gargantext/util/ngramlists_tools.py

@@ -8,8 +8,7 @@ Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
 """
 from gargantext.util.group_tools import query_groups, group_union
-from gargantext.util.db          import session, desc, func, \
-                                        bulk_insert_ifnotexists
+from gargantext.util.db          import session, bulk_insert_ifnotexists
 from gargantext.models           import Ngram, NodeNgram, NodeNodeNgram, \
                                         NodeNgramNgram, Node
@@ -25,7 +24,6 @@ from gargantext.util.toolchain.ngrams_extraction import normalize_forms
 # merge will also index the new ngrams in the docs of the corpus
 from gargantext.util.toolchain.ngrams_addition import index_new_ngrams
-from sqlalchemy.sql    import exists
 from os                import path
 from csv               import writer, reader, QUOTE_MINIMAL
 from collections       import defaultdict
@@ -35,8 +33,8 @@ from celery import shared_task
 def query_list(list_id,
                pagination_limit=None, pagination_offset=None,
-               details=False, scoring_metric_id=None, groupings_id=None
-               ):
+               details=False, scoring_metric_id=None, groupings_id=None,
+               sort=False):
     """
     Paginated listing of ngram_ids in a NodeNgram lists.
@@ -51,6 +49,7 @@ def query_list(list_id,
                  (for details and sorting)
     - groupings_id: optional id of a list of grouping relations (synonyms)
                     (each synonym will be added to the list if not already in there)
+    - sort: order by Ngram.terms (not possible if details is False)

     FIXME: subforms appended recently and not generalized enough
            => add a common part for all "if groupings_id"
@@ -114,7 +113,10 @@ def query_list(list_id,
         query = query.limit(pagination_limit)

     if pagination_offset:
-        query = query.offset(pagination_offsets)
+        query = query.offset(pagination_offset)
+
+    if details and sort:
+        query = query.order_by(Ngram.terms)

     return query
@@ -175,9 +177,7 @@ def ngrams_to_csv_rows(ngram_objs, ngram_dico={}, group_infos={},
         # 3 columns = |status,       | mainform,  | forms
         #             (type_of_list)   ( term )     ( subterm1|&|subterm2 )
-        csv_rows.append(
-                [list_type, ng_obj.terms, this_grouped_terms]
-                )
+        csv_rows.append([list_type, ng_obj.terms, this_grouped_terms])

     return csv_rows
@@ -220,9 +220,10 @@ def export_ngramlists(node,fname=None,delimiter=DEFAULT_CSV_DELIM,titles=True):
     # listes de ngram_ids correspondantes
     # ------------------------------------
     #  contenu: liste des objets ngrammes [(2562,"monterme",1),...]
-    stop_ngrams = query_list(stoplist_node.id, details=True, groupings_id=group_node.id).all()
-    main_ngrams = query_list(mainlist_node.id, details=True, groupings_id=group_node.id).all()
-    map_ngrams  = query_list(maplist_node.id, details=True, groupings_id=group_node.id).all()
+    stop_ngrams, main_ngrams, map_ngrams = (
+        query_list(n.id, details=True, groupings_id=group_node.id, sort=True).all()
+        for n in (stoplist_node, mainlist_node, maplist_node)
+    )

     # pour debug ---------->8 --------------------
     #~ stop_ngrams = stop_ngrams[0:10]
@@ -239,7 +240,7 @@ def export_ngramlists(node,fname=None,delimiter=DEFAULT_CSV_DELIM,titles=True):
     # for the groups we got couples of ids in the DB
     # -------------------
     # ex: [(3544, 2353), (2787, 4032), ...]
-    group_ngram_id_couples = query_groups(group_node.id).all()
+    group_ngram_id_couples = query_groups(group_node.id, sort=True)

     # we expend this to double structure for groups lookup
     # 1) g['links'] = k couples (x,y_i) as a set [x => {y1,y2}]
@@ -386,6 +387,9 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
     NB: To merge the imported lists into a corpus node's lists,
         chain this function with merge_ngramlists()
     '''
+    list_types = ['stop','main','map']
+
     # ---------------
     # ngram storage
     # ---------------
@@ -450,7 +454,6 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
             # headers
             if i == 0:
-                n_cols = len(csv_row)
                 for j, colname in enumerate(csv_row):
                     if colname in ['label', 'status', 'forms']:
                         columns[colname] = j
@@ -497,31 +500,30 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
                 continue

             # --- check correct list type
-            if not this_list_type in ['stop','main','map']:
+            if not this_list_type in list_types:
                 print("IMPORT WARN: (skip line) wrong list type at CSV %s:l.%i" % (fname, i))
                 continue

             # subforms can be duplicated (in forms and another label)
             # but we must take care of unwanted other duplicates too
-            if this_row_label in imported_unique_ngramstrs:
-                print("TODO IMPORT DUPL: (skip line) term appears more than once at CSV %s:l.%i"
-                        % (fname, i))
+            if imported_unique_ngramstrs.get(this_row_label) == 1:
+                print("TODO IMPORT DUPL: (skip line) term %r appears more than once at CSV %s:l.%i"
+                        % (this_row_label, fname, i))

             # ================= Store the data ====================
             # the ngram census
-            imported_unique_ngramstrs[this_row_label] = True
+            imported_unique_ngramstrs[this_row_label] = 1

             # and the "list to ngram" relation
             imported_nodes_ngrams[this_list_type].append(this_row_label)

             # ====== Store synonyms from the import (if any) ======
             if len(this_row_forms) != 0:
-                other_terms = []
                 for raw_term_str in this_row_forms.split(group_delimiter):

                     # each subform is also like an ngram declaration
                     term_str = normalize_forms(normalize_chars(raw_term_str))
-                    imported_unique_ngramstrs[term_str] = True
+                    imported_unique_ngramstrs[term_str] = 2
                     imported_nodes_ngrams[this_list_type].append(term_str)

                     # the optional repeated mainform doesn't interest us
@@ -599,7 +601,10 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
           % (n_total_ng, n_added_ng, n_total_ng-n_added_ng)
          )
     print("IMPORT: read %i grouping relations" % n_group_relations)

-    # print("IMPORT RESULT", result)
+    list_counts = [(typ, len(result.get(typ))) for typ in list_types]
+    list_counts.append(('total', sum(x[1] for x in list_counts)))
+    print("IMPORT: " + '; '.join('%s %s' % stats for stats in list_counts))

     return result

 def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
@@ -707,9 +712,11 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
     # ======== Merging all involved ngrams =========
-    # all memberships with resolved conflicts of interfering memberships
+    # all ngram memberships with resolved conflicts of interfering memberships
+    # (associates ngram ids with list types -- see linfos definition above)
     resolved_memberships = {}

+    # iterates over each ngram of each list type for both old and new lists
     for list_set in [old_lists, new_lists]:
         for lid, info in enumerate(linfos):
             list_type = info['key']
@@ -811,7 +818,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
             list_type = linfos[lid]['key']
             merged_results[list_type].items.add(ng_id)

-    # print("IMPORT: added %i elements in the lists indices" % added_nd_ng)
+    print("IMPORT: added %i elements in the lists indices" % added_nd_ng)

     # ======== Overwrite old data with new =========
     for lid, info in enumerate(linfos):
@@ -834,10 +841,14 @@ def import_and_merge_ngramlists(file_contents, onto_corpus_id, overwrite=False):
     """
     A single function to run import_ngramlists and merge_ngramlists together
     """
-    print("import list")
+    print("IMPORT CSV termlists file with %s lines in corpus %s (%s)"
+          % (len(file_contents), onto_corpus_id, 'overwrite' if overwrite else 'merge'))
+
     new_lists = import_ngramlists(file_contents)

-    corpus_node = session.query(Node).filter(Node.id == onto_corpus_id).first()
+    corpus_node = session.query(Node).get(onto_corpus_id)

     # merge the new_lists onto those of the target corpus
     del_originals = ['stop','main','map'] if overwrite else []
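Note: two small idioms from this file are worth spelling out: the three near-identical query_list(...) calls collapse into one generator expression unpacked over the three list nodes, and the new import summary is driven by list_types. A runnable sketch with hypothetical stand-ins for query_list and result:

    from collections import namedtuple

    ListNode = namedtuple('ListNode', 'id')
    stoplist_node, mainlist_node, maplist_node = ListNode(1), ListNode(2), ListNode(3)

    def query_list(list_id, details=True, groupings_id=None, sort=True):
        # stand-in returning fake (ngram_id, terms, weight) triples
        return [(list_id * 10 + k, 'term%i' % k, 1) for k in range(list_id)]

    # one generator expression, unpacked into the three variables
    stop_ngrams, main_ngrams, map_ngrams = (
        query_list(n.id, details=True, groupings_id=None, sort=True)
        for n in (stoplist_node, mainlist_node, maplist_node)
    )

    # the new import summary: a count per list type, plus a grand total
    list_types = ['stop', 'main', 'map']
    result = {'stop': stop_ngrams, 'main': main_ngrams, 'map': map_ngrams}
    list_counts = [(typ, len(result.get(typ))) for typ in list_types]
    list_counts.append(('total', sum(x[1] for x in list_counts)))
    print("IMPORT: " + '; '.join('%s %s' % stats for stats in list_counts))
    # => IMPORT: stop 1; main 2; map 3; total 6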
gargantext/util/parsers/CSV.py

@@ -4,128 +4,67 @@ import sys
 import csv
 csv.field_size_limit(sys.maxsize)

 import numpy as np
-import os

 class CSVParser(Parser):
+    DELIMITERS = ", \t;|:"

-    def CSVsample(self, small_contents, delim):
-        reader = csv.reader(small_contents, delimiter=delim)
-
-        Freqs = []
-        for row in reader:
-            Freqs.append(len(row))
-
-        return Freqs
+    def detect_delimiter(self, lines, sample_size=10):
+        sample = lines[:sample_size]
+
+        # Compute frequency of each delimiter on each input line
+        delimiters_freqs = {
+            d: [line.count(d) for line in sample]
+            for d in self.DELIMITERS
+        }
+
+        # Select delimiters with a standard deviation of zero, ie. delimiters
+        # for which we have the same number of fields on each line
+        selected_delimiters = [
+            (d, np.sum(freqs))
+            for d, freqs in delimiters_freqs.items()
+            if any(freqs) and np.std(freqs) == 0
+        ]
+
+        if selected_delimiters:
+            # Choose the delimiter with highest frequency amongst selected ones
+            sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
+            return sorted_delimiters[-1][0]

     def parse(self, filebuf):
         print("CSV: parsing (assuming UTF-8 and LF line endings)")
+
         contents = filebuf.read().decode("UTF-8").split("\n")
-        sample_size = 10
-        sample_contents = contents[0:sample_size]

-        hyperdata_list = []
+        # Filter out empty lines
+        contents = [line for line in contents if line.strip()]

-        # # = = = = [ Getting delimiters frequency ] = = = = #
-        PossibleDelimiters = [ ',',' ','\t', ';', '|', ':' ]
-        AllDelimiters = {}
-        for delim in PossibleDelimiters:
-            AllDelimiters[delim] = self.CSVsample( sample_contents , delim )
-        # # = = = = [ / Getting delimiters frequency ] = = = = #
-        # # OUTPUT example:
-        # #  AllDelimiters = {
-        # #     '\t': [1, 1, 1, 1, 1],
-        # #     ' ': [1, 13, 261, 348, 330],
-        # #     ',': [15, 15, 15, 15, 15],
-        # #     ';': [1, 1, 1, 1, 1],
-        # #     '|': [1, 1, 1, 1, 1]
-        # #  }
+        # Delimiter auto-detection
+        delimiter = self.detect_delimiter(contents, sample_size=10)

-        # # = = = = [ Stand.Dev=0 & Sum of delimiters ] = = = = #
-        Delimiters = []
-        for d in AllDelimiters:
-            freqs = AllDelimiters[d]
-            suma = np.sum( freqs )
-            if suma > 0:
-                std = np.std( freqs )
-                # print [ d , suma , len(freqs) , std]
-                if std == 0:
-                    Delimiters.append( [ d , suma , len(freqs) , std ] )
-        # # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
-        # # OUTPUT example:
-        # #  Delimiters = [
-        # #    ['\t', 5, 5, 0.0],
-        # #    [',', 75, 5, 0.0],
-        # #    ['|', 5, 5, 0.0]
-        # #  ]
+        if delimiter is None:
+            raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")

-        # # = = = = [ Delimiter selection ] = = = = #
-        Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
-        HighestDelim = Sorted_Delims[0][0]
-        # HighestDelim = ","
-        print("CSV selected delimiter:",[HighestDelim])
-        # # = = = = [ / Delimiter selection ] = = = = #
+        print("CSV: selected delimiter: %r" % delimiter)

-        # # = = = = [ First data coordinate ] = = = = #
-        Coords = {"row": -1, "column": -1}
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum % 250 == 0:
-                print("CSV row: ", rownum)
-            joined_tokens = "".join(tokens)
-            if Coords["row"] < 0 and len(joined_tokens) > 0:
-                Coords["row"] = rownum
-                for columnum in range(len(tokens)):
-                    t = tokens[columnum]
-                    if len(t) > 0:
-                        Coords["column"] = columnum
-                        break
-        # # = = = = [ / First data coordinate ] = = = = #
+        # Parse CSV
+        reader = csv.reader(contents, delimiter=delimiter)

-        # # = = = = [ Setting Headers ] = = = = #
-        Headers_Int2Str = {}
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum >= Coords["row"]:
-                for columnum in range( Coords["column"], len(tokens) ):
-                    t = tokens[columnum]
-                    Headers_Int2Str[columnum] = t
-                break
-        # print("Headers_Int2Str")
-        # print(Headers_Int2Str)
-        # # = = = = [ / Setting Headers ] = = = = #
-        # # OUTPUT example:
-        # #  Headers_Int2Str = {
-        # #     0: 'publication_date',
-        # #     1: 'publication_month',
-        # #     2: 'publication_second',
-        # #     3: 'abstract'
-        # #  }
+        # Get first not empty row and its fields (ie. header row), or (0, [])
+        first_row, headers = \
+            next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
+                 (0, []))

-        # # = = = = [ Reading the whole CSV and saving ] = = = = #
-        hyperdata_list = []
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum > Coords["row"]:
-                RecordDict = {}
-                for columnum in range( Coords["column"], len(tokens) ):
-                    data = tokens[columnum]
-                    RecordDict[ Headers_Int2Str[columnum] ] = data
-                if len(RecordDict.keys()) > 0:
-                    hyperdata_list.append( RecordDict )
-        # # = = = = [ / Reading the whole CSV and saving ] = = = = #
-        return hyperdata_list
+        # Get first not empty column of the first row, or 0
+        first_col = next((i for i, field in enumerate(headers) if field), 0)
+
+        # Strip out potential empty fields in headers
+        headers = headers[first_col:]
+
+        # Return a generator of dictionaries with column labels as keys,
+        # filtering out empty rows
+        for i, fields in enumerate(reader):
+            if i % 500 == 0:
+                print("CSV: parsing row #%s..." % (i+1))
+            if any(fields):
+                yield dict(zip(headers, fields[first_col:]))
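Note: the new detect_delimiter heuristic (keep candidates whose per-line count is constant, i.e. standard deviation zero, then pick the most frequent) can be exercised on its own. A standalone sketch with made-up input lines:

    import numpy as np

    DELIMITERS = ", \t;|:"

    def detect_delimiter(lines, sample_size=10):
        sample = lines[:sample_size]
        # count each candidate delimiter on each sample line
        delimiters_freqs = {d: [line.count(d) for line in sample]
                            for d in DELIMITERS}
        # keep candidates that occur at all (any(freqs)) with a constant
        # per-line count (std == 0), i.e. a fixed number of fields per row
        selected = [(d, np.sum(freqs))
                    for d, freqs in delimiters_freqs.items()
                    if any(freqs) and np.std(freqs) == 0]
        if selected:
            # most frequent surviving candidate wins
            return sorted(selected, key=lambda x: x[1])[-1][0]

    lines = ["title;year;abstract",
             "Foo;2017;Some text, with a comma",
             "Bar;2016;More text"]
    print(repr(detect_delimiter(lines)))   # => ';' (the comma count varies per line)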
gargantext/util/toolchain/ngrams_extraction.py

@@ -81,7 +81,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                     corpus.hyperdata["skipped_docs"].append(document.id)
                     corpus.save_hyperdata()
                     continue
-            else:
+
                 # ready !
                 tagger = tagger_bots[language_iso2]
@@ -95,7 +95,8 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                     continue

                 # get ngrams
                 for ngram in tagger.extract(value):
-                    tokens = tuple(normalize_forms(token[0]) for token in ngram)
+                    normal_forms = (normalize_forms(t[0]) for t in ngram)
+                    tokens = tuple(nf for nf in normal_forms if nf)

                     if do_subngrams:
                         # ex tokens = ["very", "cool", "exemple"]
                         #    subterms = [['very', 'cool'],...]
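Note: the point of the two-step rewrite above is that tokens whose normalized form is empty no longer end up inside the ngram tuple. A sketch with a hypothetical stand-in for normalize_forms:

    # stand-in normalizer: the real one lives in ngrams_extraction.py
    def normalize_forms(term):
        return term.strip().lower()

    ngram = [("Big", "JJ"), ("  ", "SP"), ("Data", "NN")]   # (form, POS) pairs

    # before: empty normalized forms were kept
    tokens_before = tuple(normalize_forms(token[0]) for token in ngram)
    # after: generator + filter drops them
    normal_forms = (normalize_forms(t[0]) for t in ngram)
    tokens_after = tuple(nf for nf in normal_forms if nf)

    print(tokens_before)  # => ('big', '', 'data')
    print(tokens_after)   # => ('big', 'data')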
templates/pages/projects/project.html

@@ -440,11 +440,12 @@
         // in the form "Add a corpus"
         var type = $("#id_type").val()
+        var file = $("#id_file").val()

         // 5 booleans
         var nameField = $("#id_name").val() != ""
         var typeField = (type != "") && (type != "0")
-        var fileField = $("#id_file").val() != ""
+        var fileField = file != ""
         var wantfileField = $("#file_yes").prop("checked")
         var crawling = ((type==3)||(type==8)||(type==9)) && ! wantfileField
@@ -457,6 +458,23 @@
         if (!crawling) {
             $("#submit_thing").prop('disabled' , !(nameField && typeField && fileField))
         }
+
+        // Automatically select CSV when type is undefined
+        // and we have a .csv file
+        if (!typeField && file && file.match(/.csv$/i)) {
+            // Get CSV type id
+            var csv = $('#id_type > option')
+                .filter(function () {
+                    return $(this).text() === 'CSV'
+                })
+                .attr('value')
+            // Select CSV type
+            $('#id_type').val(csv)
+            // Focus on name field
+            setTimeout(function () {
+                $("#id_name").focus()
+            })
+        }
     }

     function bringDaNoise() {