merge in refactoring

2b82c054 · Romain Loth · 8499ab9a · 5e7c5603 · 2b82c054 · 2b82c054
Commit 2b82c054 authored May 11, 2016 by Romain Loth
15 changed files
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -195,17 +195,17 @@ RESOURCETYPES = [
 ]

 # linguistic extraction parameters ---------------------------------------------
-DEFAULT_TFIDF_CUTOFF_RATIO      = .45        # MAINLIST maximum terms in %
+DEFAULT_TFIDF_CUTOFF_RATIO      = .75        # MAINLIST maximum terms in %

-DEFAULT_TFIDF_HARD_LIMIT        = 750        # MAINLIST maximum terms abs
+DEFAULT_TFIDF_HARD_LIMIT        = 5000       # MAINLIST maximum terms abs
                                             # (makes COOCS larger ~ O(N²) /!\)

 DEFAULT_COOC_THRESHOLD          = 2          # inclusive minimum for COOCS coefs
                                             # (makes COOCS more sparse)

-DEFAULT_MAPLIST_MAX             = 300        # MAPLIST maximum terms
+DEFAULT_MAPLIST_MAX             = 350        # MAPLIST maximum terms

-DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5         # part of monograms in MAPLIST
+DEFAULT_MAPLIST_MONOGRAMS_RATIO = .15         # part of monograms in MAPLIST

 DEFAULT_MAX_NGRAM_LEN           = 7          # limit used after POStagging rule
                                             # (initial ngrams number is a power law of this /!\)

--- a/gargantext/util/parsers/CSV.py
+++ b/gargantext/util/parsers/CSV.py
@@ -124,7 +124,8 @@ class CSVParser(Parser):
                for columnum in range( Coords["column"],len(tokens) ):
                    data = tokens[columnum]
                    RecordDict[ Headers_Int2Str[columnum] ] = data
-                hyperdata_list.append( RecordDict )
+                if len(RecordDict.keys())>0:
+                    hyperdata_list.append( RecordDict )
        # # = = = = [ / Reading the whole CSV and saving ] = = = = #

        return hyperdata_list
--- a/gargantext/util/toolchain/list_map.py
+++ b/gargantext/util/toolchain/list_map.py
@@ -7,7 +7,7 @@ from gargantext.models.ngrams import Node, Ngram, NodeNgram, \
 from gargantext.util.db       import session, aliased, func
 from gargantext.util.db_cache import cache
 from gargantext.util.lists    import UnweightedList
-from sqlalchemy               import desc
+from sqlalchemy               import desc, asc
 from gargantext.constants     import DEFAULT_MAPLIST_MAX,\
                                     DEFAULT_MAPLIST_MONOGRAMS_RATIO

@@ -52,7 +52,7 @@ def do_maplist(corpus,

    primary_groupterms_subquery = (session
                            # we want only primary terms (ngram1)
-                            .query(NodeNgramNgram.ngram1_id)
+                            .query(NodeNgramNgram.ngram2_id)
                            .filter(NodeNgramNgram.node_id == grouplist_id)
                            .subquery()
                         )
@@ -64,13 +64,13 @@ def do_maplist(corpus,
                .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
                .filter(ScoreSpec.node_id == specificity_id)
                .filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
-                .filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
+                .filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery))
            )

    # TODO: move these 2 pools up to mainlist selection
    top_monograms = (query
                .filter(Ngram.n == 1)
-                .order_by(desc(ScoreSpec.weight))
+                .order_by(asc(ScoreSpec.weight))
                .limit(monograms_limit)
                .all()
               )
@@ -81,7 +81,7 @@ def do_maplist(corpus,
                .limit(multigrams_limit)
                .all()
               )
-    obtained_mono = len(top_monograms)
+    obtained_mono  = len(top_monograms)
    obtained_multi = len(top_multigrams)
    obtained_total = obtained_mono + obtained_multi
    # print("MAPLIST: top_monograms =", obtained_mono)

--- a/gargantext/util/toolchain/list_stop.py
+++ b/gargantext/util/toolchain/list_stop.py
@@ -27,10 +27,10 @@ def is_stop_word(ngram, stop_words=None):
            # , "(.*)(\.)(.*)"         trop fort (enlève les sigles !)
            , "(.*)(\,)(.*)"
            , "(.*)(< ?/?p ?>)(.*)"       # marques de paragraphes
-            , "(.*)(study)(.*)"
+            , "(.*)(study|elsevier)(.*)"
            , "(.*)\b(xx|xi|xv)\b(.*)"
            , "(.*)(result)(.*)"
-            , "(.*)(année|nombre|moitié)(.*)"
+            , "(.*)(year|année|nombre|moitié)(.*)"
            , "(.*)(temps)(.*)"
            , "(.*)(%)(.*)"
            , "(.*)(\{)(.*)"

--- a/gargantext/util/toolchain/metric_specificity.py
+++ b/gargantext/util/toolchain/metric_specificity.py
@@ -7,6 +7,7 @@ from gargantext.util.db       import session, aliased, func, bulk_insert
 from gargantext.util.lists    import WeightedList
 from collections              import defaultdict
 from pandas                   import DataFrame
+import pandas as pd

 def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
    '''
@@ -33,13 +34,23 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):

    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

-    d = DataFrame(matrix).fillna(0)
+    x = DataFrame(matrix).fillna(0)

-    # proba (x/y) ( <= on divise chaque colonne par son total)
-    d = d / d.sum(axis=0)
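+    # proba (x/y): meant to divide each row by its total -- NOTE(review): x / x.sum(axis=1) aligns the sums on the columns; x.div(x.sum(axis=1), axis=0) would be row-wise, confirm intent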
+    x = x / x.sum(axis=1)

+    # vectorisation
    # d:Matrix => v: Vector (len = nb_ngrams)
-    v = d.sum(axis=1)
+    # v = d.sum(axis=1) (minus the cell itself)
+    xs = x.sum(axis=1) - x
+    ys = x.sum(axis=0) - x
+    
+
+    # top included or excluded
+    #n = ( xs + ys) / (2 * (x.shape[0] - 1))
+    
+    # top generic or specific (asc is spec, desc is generic)
+    v = ( xs - ys) / ( 2 * (x.shape[0] - 1))

    ## d ##
    #######
@@ -66,7 +77,7 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
    # pour l'instant on va utiliser les sommes en ligne comme ranking de spécificité
    # (**même** ordre qu'avec la formule d'avant le refactoring mais calcul + simple)
    # TODO analyser la cohérence math ET sem de cet indicateur
-    v.sort_values(inplace=True)
+    #v.sort_values(inplace=True)

    # [ ('biodiversité' , 0.333 ),
    #   ('Grenelle'     , 0.5   ),
@@ -92,10 +103,11 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
        the_id = specnode.id

    # print(v)
+    pd.options.display.float_format = '${:,.2f}'.format

    data = WeightedList(
            zip(  v.index.tolist()
-                , v.values.tolist()
+                , v.values.tolist()[0]
             )
           )
    data.save(the_id)

--- a/gargantext/util/toolchain/metric_tfidf.py
+++ b/gargantext/util/toolchain/metric_tfidf.py
@@ -8,7 +8,7 @@ FIXME: "having the same source" means we need to select inside hyperdata
       with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
 """

-from gargantext.models   import Node, NodeNgram, NodeNodeNgram
+from gargantext.models   import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
 from gargantext.util.db  import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
 from sqlalchemy          import text  # for query from raw SQL statement
 from math                import log
@@ -29,6 +29,13 @@ def compute_occs(corpus, overwrite_id = None):
        - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
                     (the Node and its previous NodeNodeNgram rows will be replaced)
    """
+    # 0) Get the groups
+    group_id = (session.query(Node.id)
+                       .filter(Node.parent_id == corpus.id)
+                       .filter(Node.typename  == "GROUPLIST")
+                       .first()
+                )
+

    # 1) all the doc_ids of our corpus (scope of counts for filter)
    # slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
@@ -45,6 +52,7 @@ def compute_occs(corpus, overwrite_id = None):
                    NodeNgram.ngram_id,
                    func.sum(NodeNgram.weight)
                 )
+                #.join(NodeNgramNgram, NodeNgramNgram.node_id == group_id)
                .filter(NodeNgram.node_id.in_(docids_subquery))
                .group_by(NodeNgram.ngram_id)
                .all()

--- a/gargantext/views/api/nodes.py
+++ b/gargantext/views/api/nodes.py
@@ -84,9 +84,10 @@ class NodeListResource(APIView):
            response = HttpResponse(content_type='text/csv')
            response['Content-Disposition'] = 'attachment; filename="Gargantext_Corpus.csv"'

-            writer = csv.writer(response)
+            writer = csv.writer(response, delimiter='\t')

-            keys =  [ 'title'   , 'journal', 'publication_date'
+            keys =  [ 'title'   , 'journal'
+                    , 'publication_year', 'publication_month', 'publication_day'
                    , 'abstract', 'authors']

            writer.writerow(keys)

--- a/static/lib/gargantext/Docs_dyna_chart_and_table.js
+++ b/static/lib/gargantext/Docs_dyna_chart_and_table.js
@@ -170,7 +170,7 @@ function toggleFavstatus (rec_id) {
    var myHttpAction = statusBefore ? 'DELETE' : 'PUT'

    $.ajax({
-      url: 'http://localhost:8000/api/nodes/'+corpus_id+'/favorites?docs='+doc_id,
+      url: window.location.origin + '/api/nodes/'+corpus_id+'/favorites?docs='+doc_id,
      type: myHttpAction,
      beforeSend: function(xhr) {
        xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
@@ -602,7 +602,7 @@ $.ajax({
  success: function(maindata){
      // unfortunately favorites info is in a separate request (other nodes)
      $.ajax({
-        url: 'http://localhost:8000/api/nodes/'+corpus_id+'/favorites',
+        url: window.location.origin + '/api/nodes/'+corpus_id+'/favorites',
        success: function(favdata){
          // initialize favs lookup
          for (var i in favdata['favdocs']) {

--- a/static/lib/gargantext/Journals_dyna_chart_and_table.js
+++ b/static/lib/gargantext/Journals_dyna_chart_and_table.js
@@ -290,7 +290,7 @@ function Main_test( data , initial) {
    var div_table = '<p align="right">'+"\n"
      div_table += '<table id="my-ajax-table" class="table table-bordered table-hover">'+"\n"
      div_table += "\t"+'<thead>'+"\n"
-      div_table += "\t"+"\t"+'<th data-dynatable-column="name">Title</th>'+"\n"
+      div_table += "\t"+"\t"+'<th data-dynatable-column="name"><span class="glyphicon glyphicon-text-size"></span> Title</th>'+"\n"
      div_table += "\t"+"\t"+'<th data-dynatable-column="score" data-dynatable-sorts="score">No. Pubs</th>'+"\n"
      // div_table += "\t"+"\t"+'<th id="score_column_id" data-dynatable-sorts="score" data-dynatable-column="score">Score</th>'+"\n"
      div_table += "\t"+"\t"+'</th>'+"\n"

--- a/templates/pages/corpora/journals.html
+++ b/templates/pages/corpora/journals.html
@@ -59,9 +59,12 @@
        <div class="panel panel-default">

            <div class="panel-heading">
-              <h4 class="panel-title">
-                Publications by source
-              </h4>
+              <h2 class="panel-title">
+                <center>
+                    <span class="glyphicon glyphicon-hand-down" aria-hidden="true"></span>
+                    Publications by source
+                </center>
+              </h2>
            </div>



--- a/templates/pages/corpora/terms.html
+++ b/templates/pages/corpora/terms.html
@@ -55,12 +55,15 @@
                <div class="panel panel-default">

                    <div class="panel-heading">
-                      <h4 class="panel-title">
+                      <h2 class="panel-title">
+                        <center>
+                          <span class="glyphicon glyphicon-hand-down" aria-hidden="true"></span>
                          Extracted terms
                            <!-- <button title='run test function' onclick="doATest()">
                                TEST
                            </button> -->
-                        </a>
+                        </center>
+                      </h2>

                        <!-- see in javascript function queries.functions['my_state_filter'] -->
                        <div class="pull-left" style="margin-top:1.85em;">

--- a/templates/pages/corpora/titles.html
+++ b/templates/pages/corpora/titles.html
@@ -54,9 +54,12 @@
    <div class="jumbotron">
        <div class="panel panel-default">
            <div class="panel-heading">
-                <h4 class="panel-title">
-                    Publications by title
-                </h4>
+                <h2 class="panel-title">
+                    <center>
+                        <span class="glyphicon glyphicon-hand-down" aria-hidden="true"></span>
+                        Publications by title
+                    </center>
+                </h2>
                <!-- search box with custom function in Docs_dyna_chart_and_tables.js -->
                <div class="pull-left" style="margin-top:1.85em; font-size: 16px;">
                    <span class="glyphicon glyphicon-search" aria-hidden="true"></span>

--- a/templates/pages/main/about.html
+++ b/templates/pages/main/about.html
@@ -41,7 +41,7 @@
                            <li>
                                Version 3.0.0
                                <ul>
-                                    <li>[NAME] Blue Jasmine</li>
+                                    <li>[NAME] Blue Jasmin</li>
                                    <li>[CODE] Refactored</li>
                                    <li>[DATABASE] New schema</li>
                                </ul>

--- a/templates/pages/menu.html
+++ b/templates/pages/menu.html
@@ -111,14 +111,26 @@
                                        </a>
                                        <i class="caret"></i>
                                    <ul class="dropdown-menu">
-                                                    <li>
-                                                        <a tabindex="-1"
-                                                                data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=conditional&amp;bridgeness=5" onclick='gotoexplorer(this)'  >With conditional distance        </a>
-                                                    </li>
-                                                    <li>
-                                                        <a tabindex="-1"
-                                                                data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=distributional&amp;bridgeness=5" onclick='gotoexplorer(this)'  >With distributional distance</a>
-                                                    </li>
+                                        {% if view != "graph" %}
+                                            <li>
+                                                <a tabindex="-1"
+                                                    data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=conditional&amp;bridgeness=5" onclick='gotoexplorer(this)'  >With conditional distance        </a>
+                                            </li>
+                                            <li>
+                                                <a tabindex="-1"
+                                                        data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=distributional&amp;bridgeness=5" onclick='gotoexplorer(this)'  >With distributional distance</a>
+                                            </li>
+                                        {% else %}
+                                            <li>
+                                                <a tabindex="-1"
+                                                    href="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=conditional&amp;bridgeness=5" >With conditional distance        </a>
+                                            </li>
+                                            <li>
+                                                <a tabindex="-1"
+                                                        href="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=distributional&amp;bridgeness=5">With distributional distance</a>
+                                            </li>
+
+                                        {% endif %}
                                    </ul>
                                    </li>
                                    {% endif %}
@@ -138,30 +150,30 @@
                        <div class="jumbotron" style="margin-bottom:0">
                            <br>
                            <br>
-                                            <!--
-                                            <a type="button" class="btn btn-default
-                                                    href="/projects/{{project.id}}/corpora/{{ corpus.id }}/">Export corpus</a>
-                                            --!>
-                                            <!-- <li class="divider"></li> --!>
-
                                <div class="row">
-                                    <div class="col-md-5">
-                                        {% if project %}
-                                            <h3><a href="/projects/{{project.id}}">
-                                                    <span class="glyphicon glyphicon-book" aria-hidden="true"></span>
-                                                    {{ project.name }}
+                                    <h3>
+                                        <a href="/projects/{{project.id}}">
+                                            <span class="glyphicon glyphicon-book" aria-hidden="true"></span>
+                                            {{ project.name | truncatechars:50}}
+                                        </a>
+                                    </h3>
+                                </div>
+                                <div class="row">
+                                    <div class="col-md-1">
+                                    </div>
+                                    <div class="col-md-6">
+                                            <h3>
+                                                <span class="glyphicon glyphicon-cd" aria-hidden="true"></span>
+                                                {{ resourcename | truncatechars:20 }}
+                                            </h3>
+                                            <h3>
+                                                <span class="glyphicon glyphicon-file" aria-hidden="true"></span>
+                                                {{ corpus.name | truncatechars:20 }}
+                                                <a class="btn btn-primary" role="button" href="/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv">
+                                                    <span class="glyphicon glyphicon-download" aria-hidden="true"></span>
                                                </a>
-                                            <br>
-                                                    <span class="glyphicon glyphicon-cd" aria-hidden="true"></span>
-                                                    {{ resourcename | truncatechars:20 }}
-                                            <br>
-                                                    <span class="glyphicon glyphicon-file" aria-hidden="true"></span>
-                                                    {{ corpus.name }}
-                                            <br>
-                                                    <span class="glyphicon glyphicon-calendar" aria-hidden="true"></span>
-                                                    {{ corpus.date }}
                                            </h3>
-                                        {% endif %}
+
                                    </div>
                                    <div class="col-md-5">
                                        <h3>

--- a/templates/pages/projects/project.html
+++ b/templates/pages/projects/project.html
@@ -73,6 +73,7 @@
        {% for key, corpora in list_corpora.items %}
            <h2>
                <div class="row">
+                <div class="col-md-1 content"></div>
                    <span class="glyphicon glyphicon-cd" aria-hidden="true"></span>
                    {{ key }}
            </h2>
@@ -80,6 +81,7 @@
                        <div id="corpus_{{corpus.id}}">
                            <div class="row">
                                <h4>
+                                    <div class="col-md-1 content"></div>
                                    <div class="col-md-5 content">
                                        <a href="/projects/{{project.id}}/corpora/{{corpus.id}}">
                                            <span class="glyphicon glyphicon-file" aria-hidden="true"></span>
@@ -108,8 +110,7 @@
                                            <span class="glyphicon glyphicon-trash" aria-hidden="true"></span>
                                        </button>
                                    </div>
-
-                                    <div class="col-md-5 content">
+                                    <div class="col-md-3 content">
                                        {% for state in corpus.hyperdata.statuses %}
                                                {% ifequal state.action "ngrams_extraction" %}
                                                    {% if state.complete %}
@@ -169,6 +170,7 @@
                                                {% endifequal %}
                                        {% endfor %}
                                    </div>
+                                    <div class="col-md-1 content"></div>
                                </h4>
                            </div>
                        </div>