Commit 2b82c054 authored by Romain Loth

merge in refactoring

parents 8499ab9a 5e7c5603
...@@ -195,17 +195,17 @@ RESOURCETYPES = [ ...@@ -195,17 +195,17 @@ RESOURCETYPES = [
] ]
# linguistic extraction parameters --------------------------------------------- # linguistic extraction parameters ---------------------------------------------
DEFAULT_TFIDF_CUTOFF_RATIO = .45 # MAINLIST maximum terms in % DEFAULT_TFIDF_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
DEFAULT_TFIDF_HARD_LIMIT = 750 # MAINLIST maximum terms abs DEFAULT_TFIDF_HARD_LIMIT = 5000 # MAINLIST maximum terms abs
# (makes COOCS larger ~ O(N²) /!\) # (makes COOCS larger ~ O(N²) /!\)
DEFAULT_COOC_THRESHOLD = 2 # inclusive minimum for COOCS coefs DEFAULT_COOC_THRESHOLD = 2 # inclusive minimum for COOCS coefs
# (makes COOCS more sparse) # (makes COOCS more sparse)
DEFAULT_MAPLIST_MAX = 300 # MAPLIST maximum terms DEFAULT_MAPLIST_MAX = 350 # MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5 # part of monograms in MAPLIST DEFAULT_MAPLIST_MONOGRAMS_RATIO = .15 # part of monograms in MAPLIST
DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule
# (initial ngrams number is a power law of this /!\) # (initial ngrams number is a power law of this /!\)
......
...@@ -124,7 +124,8 @@ class CSVParser(Parser): ...@@ -124,7 +124,8 @@ class CSVParser(Parser):
for columnum in range( Coords["column"],len(tokens) ): for columnum in range( Coords["column"],len(tokens) ):
data = tokens[columnum] data = tokens[columnum]
RecordDict[ Headers_Int2Str[columnum] ] = data RecordDict[ Headers_Int2Str[columnum] ] = data
hyperdata_list.append( RecordDict ) if len(RecordDict.keys())>0:
hyperdata_list.append( RecordDict )
# # = = = = [ / Reading the whole CSV and saving ] = = = = # # # = = = = [ / Reading the whole CSV and saving ] = = = = #
return hyperdata_list return hyperdata_list
...@@ -7,7 +7,7 @@ from gargantext.models.ngrams import Node, Ngram, NodeNgram, \ ...@@ -7,7 +7,7 @@ from gargantext.models.ngrams import Node, Ngram, NodeNgram, \
from gargantext.util.db import session, aliased, func from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache from gargantext.util.db_cache import cache
from gargantext.util.lists import UnweightedList from gargantext.util.lists import UnweightedList
from sqlalchemy import desc from sqlalchemy import desc, asc
from gargantext.constants import DEFAULT_MAPLIST_MAX,\ from gargantext.constants import DEFAULT_MAPLIST_MAX,\
DEFAULT_MAPLIST_MONOGRAMS_RATIO DEFAULT_MAPLIST_MONOGRAMS_RATIO
...@@ -52,7 +52,7 @@ def do_maplist(corpus, ...@@ -52,7 +52,7 @@ def do_maplist(corpus,
primary_groupterms_subquery = (session primary_groupterms_subquery = (session
# we want only primary terms (ngram1) # we want only primary terms (ngram1)
.query(NodeNgramNgram.ngram1_id) .query(NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == grouplist_id) .filter(NodeNgramNgram.node_id == grouplist_id)
.subquery() .subquery()
) )
...@@ -64,13 +64,13 @@ def do_maplist(corpus, ...@@ -64,13 +64,13 @@ def do_maplist(corpus,
.join(Ngram, Ngram.id == ScoreSpec.ngram_id) .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specificity_id) .filter(ScoreSpec.node_id == specificity_id)
.filter(ScoreSpec.ngram_id.in_(mainterms_subquery)) .filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
.filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery)) .filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery))
) )
# TODO: move these 2 pools up to mainlist selection # TODO: move these 2 pools up to mainlist selection
top_monograms = (query top_monograms = (query
.filter(Ngram.n == 1) .filter(Ngram.n == 1)
.order_by(desc(ScoreSpec.weight)) .order_by(asc(ScoreSpec.weight))
.limit(monograms_limit) .limit(monograms_limit)
.all() .all()
) )
...@@ -81,7 +81,7 @@ def do_maplist(corpus, ...@@ -81,7 +81,7 @@ def do_maplist(corpus,
.limit(multigrams_limit) .limit(multigrams_limit)
.all() .all()
) )
obtained_mono = len(top_monograms) obtained_mono = len(top_monograms)
obtained_multi = len(top_multigrams) obtained_multi = len(top_multigrams)
obtained_total = obtained_mono + obtained_multi obtained_total = obtained_mono + obtained_multi
# print("MAPLIST: top_monograms =", obtained_mono) # print("MAPLIST: top_monograms =", obtained_mono)
......
...@@ -27,10 +27,10 @@ def is_stop_word(ngram, stop_words=None): ...@@ -27,10 +27,10 @@ def is_stop_word(ngram, stop_words=None):
# , "(.*)(\.)(.*)" trop fort (enlève les sigles !) # , "(.*)(\.)(.*)" trop fort (enlève les sigles !)
, "(.*)(\,)(.*)" , "(.*)(\,)(.*)"
, "(.*)(< ?/?p ?>)(.*)" # marques de paragraphes , "(.*)(< ?/?p ?>)(.*)" # marques de paragraphes
, "(.*)(study)(.*)" , "(.*)(study|elsevier)(.*)"
, "(.*)\b(xx|xi|xv)\b(.*)" , "(.*)\b(xx|xi|xv)\b(.*)"
, "(.*)(result)(.*)" , "(.*)(result)(.*)"
, "(.*)(année|nombre|moitié)(.*)" , "(.*)(year|année|nombre|moitié)(.*)"
, "(.*)(temps)(.*)" , "(.*)(temps)(.*)"
, "(.*)(%)(.*)" , "(.*)(%)(.*)"
, "(.*)(\{)(.*)" , "(.*)(\{)(.*)"
......
...@@ -7,6 +7,7 @@ from gargantext.util.db import session, aliased, func, bulk_insert ...@@ -7,6 +7,7 @@ from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList from gargantext.util.lists import WeightedList
from collections import defaultdict from collections import defaultdict
from pandas import DataFrame from pandas import DataFrame
import pandas as pd
def compute_specificity(corpus, cooc_id=None, overwrite_id = None): def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
''' '''
...@@ -33,13 +34,23 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None): ...@@ -33,13 +34,23 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
print("SPECIFICITY: computing on %i ngrams" % nb_ngrams) print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
d = DataFrame(matrix).fillna(0) x = DataFrame(matrix).fillna(0)
# proba (x/y) ( <= on divise chaque colonne par son total) # proba (x/y) ( <= on divise chaque ligne par son total)
d = d / d.sum(axis=0) x = x / x.sum(axis=1)
# vectorisation
# d:Matrix => v: Vector (len = nb_ngrams) # d:Matrix => v: Vector (len = nb_ngrams)
v = d.sum(axis=1) # v = d.sum(axis=1) (- lui-même)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top inclus ou exclus
#n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific (asc is spec, desc is generic)
v = ( xs - ys) / ( 2 * (x.shape[0] - 1))
## d ## ## d ##
####### #######
...@@ -66,7 +77,7 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None): ...@@ -66,7 +77,7 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
# pour l'instant on va utiliser les sommes en ligne comme ranking de spécificité # pour l'instant on va utiliser les sommes en ligne comme ranking de spécificité
# (**même** ordre qu'avec la formule d'avant le refactoring mais calcul + simple) # (**même** ordre qu'avec la formule d'avant le refactoring mais calcul + simple)
# TODO analyser la cohérence math ET sem de cet indicateur # TODO analyser la cohérence math ET sem de cet indicateur
v.sort_values(inplace=True) #v.sort_values(inplace=True)
# [ ('biodiversité' , 0.333 ), # [ ('biodiversité' , 0.333 ),
# ('Grenelle' , 0.5 ), # ('Grenelle' , 0.5 ),
...@@ -92,10 +103,11 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None): ...@@ -92,10 +103,11 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
the_id = specnode.id the_id = specnode.id
# print(v) # print(v)
pd.options.display.float_format = '${:,.2f}'.format
data = WeightedList( data = WeightedList(
zip( v.index.tolist() zip( v.index.tolist()
, v.values.tolist() , v.values.tolist()[0]
) )
) )
data.save(the_id) data.save(the_id)
......
...@@ -8,7 +8,7 @@ FIXME: "having the same source" means we need to select inside hyperdata ...@@ -8,7 +8,7 @@ FIXME: "having the same source" means we need to select inside hyperdata
with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ... with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
""" """
from gargantext.models import Node, NodeNgram, NodeNodeNgram from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count() from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
from sqlalchemy import text # for query from raw SQL statement from sqlalchemy import text # for query from raw SQL statement
from math import log from math import log
...@@ -29,6 +29,13 @@ def compute_occs(corpus, overwrite_id = None): ...@@ -29,6 +29,13 @@ def compute_occs(corpus, overwrite_id = None):
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced) (the Node and its previous NodeNodeNgram rows will be replaced)
""" """
# 0) Get the groups
group_id = (session.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "GROUPLIST")
.first()
)
# 1) all the doc_ids of our corpus (scope of counts for filter) # 1) all the doc_ids of our corpus (scope of counts for filter)
# slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()] # slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
...@@ -45,6 +52,7 @@ def compute_occs(corpus, overwrite_id = None): ...@@ -45,6 +52,7 @@ def compute_occs(corpus, overwrite_id = None):
NodeNgram.ngram_id, NodeNgram.ngram_id,
func.sum(NodeNgram.weight) func.sum(NodeNgram.weight)
) )
#.join(NodeNgramNgram, NodeNgramNgram.node_id == group_id)
.filter(NodeNgram.node_id.in_(docids_subquery)) .filter(NodeNgram.node_id.in_(docids_subquery))
.group_by(NodeNgram.ngram_id) .group_by(NodeNgram.ngram_id)
.all() .all()
......
...@@ -84,9 +84,10 @@ class NodeListResource(APIView): ...@@ -84,9 +84,10 @@ class NodeListResource(APIView):
response = HttpResponse(content_type='text/csv') response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename="Gargantext_Corpus.csv"' response['Content-Disposition'] = 'attachment; filename="Gargantext_Corpus.csv"'
writer = csv.writer(response) writer = csv.writer(response, delimiter='\t')
keys = [ 'title' , 'journal', 'publication_date' keys = [ 'title' , 'journal'
, 'publication_year', 'publication_month', 'publication_day'
, 'abstract', 'authors'] , 'abstract', 'authors']
writer.writerow(keys) writer.writerow(keys)
......
...@@ -170,7 +170,7 @@ function toggleFavstatus (rec_id) { ...@@ -170,7 +170,7 @@ function toggleFavstatus (rec_id) {
var myHttpAction = statusBefore ? 'DELETE' : 'PUT' var myHttpAction = statusBefore ? 'DELETE' : 'PUT'
$.ajax({ $.ajax({
url: 'http://localhost:8000/api/nodes/'+corpus_id+'/favorites?docs='+doc_id, url: window.location.origin + '/api/nodes/'+corpus_id+'/favorites?docs='+doc_id,
type: myHttpAction, type: myHttpAction,
beforeSend: function(xhr) { beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken")); xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
...@@ -602,7 +602,7 @@ $.ajax({ ...@@ -602,7 +602,7 @@ $.ajax({
success: function(maindata){ success: function(maindata){
// unfortunately favorites info is in a separate request (other nodes) // unfortunately favorites info is in a separate request (other nodes)
$.ajax({ $.ajax({
url: 'http://localhost:8000/api/nodes/'+corpus_id+'/favorites', url: window.location.origin + '/api/nodes/'+corpus_id+'/favorites',
success: function(favdata){ success: function(favdata){
// initialize favs lookup // initialize favs lookup
for (var i in favdata['favdocs']) { for (var i in favdata['favdocs']) {
......
...@@ -290,7 +290,7 @@ function Main_test( data , initial) { ...@@ -290,7 +290,7 @@ function Main_test( data , initial) {
var div_table = '<p align="right">'+"\n" var div_table = '<p align="right">'+"\n"
div_table += '<table id="my-ajax-table" class="table table-bordered table-hover">'+"\n" div_table += '<table id="my-ajax-table" class="table table-bordered table-hover">'+"\n"
div_table += "\t"+'<thead>'+"\n" div_table += "\t"+'<thead>'+"\n"
div_table += "\t"+"\t"+'<th data-dynatable-column="name">Title</th>'+"\n" div_table += "\t"+"\t"+'<th data-dynatable-column="name"><span class="glyphicon glyphicon-text-size"></span> Title</th>'+"\n"
div_table += "\t"+"\t"+'<th data-dynatable-column="score" data-dynatable-sorts="score">No. Pubs</th>'+"\n" div_table += "\t"+"\t"+'<th data-dynatable-column="score" data-dynatable-sorts="score">No. Pubs</th>'+"\n"
// div_table += "\t"+"\t"+'<th id="score_column_id" data-dynatable-sorts="score" data-dynatable-column="score">Score</th>'+"\n" // div_table += "\t"+"\t"+'<th id="score_column_id" data-dynatable-sorts="score" data-dynatable-column="score">Score</th>'+"\n"
div_table += "\t"+"\t"+'</th>'+"\n" div_table += "\t"+"\t"+'</th>'+"\n"
......
...@@ -59,9 +59,12 @@ ...@@ -59,9 +59,12 @@
<div class="panel panel-default"> <div class="panel panel-default">
<div class="panel-heading"> <div class="panel-heading">
<h4 class="panel-title"> <h2 class="panel-title">
Publications by source <center>
</h4> <span class="glyphicon glyphicon-hand-down" aria-hidden="true"></span>
Publications by source
</center>
</h2>
</div> </div>
......
...@@ -55,12 +55,15 @@ ...@@ -55,12 +55,15 @@
<div class="panel panel-default"> <div class="panel panel-default">
<div class="panel-heading"> <div class="panel-heading">
<h4 class="panel-title"> <h2 class="panel-title">
<center>
<span class="glyphicon glyphicon-hand-down" aria-hidden="true"></span>
Extracted terms Extracted terms
<!-- <button title='run test function' onclick="doATest()"> <!-- <button title='run test function' onclick="doATest()">
TEST TEST
</button> --> </button> -->
</a> </center>
</h2>
<!-- see in javascript function queries.functions['my_state_filter'] --> <!-- see in javascript function queries.functions['my_state_filter'] -->
<div class="pull-left" style="margin-top:1.85em;"> <div class="pull-left" style="margin-top:1.85em;">
......
...@@ -54,9 +54,12 @@ ...@@ -54,9 +54,12 @@
<div class="jumbotron"> <div class="jumbotron">
<div class="panel panel-default"> <div class="panel panel-default">
<div class="panel-heading"> <div class="panel-heading">
<h4 class="panel-title"> <h2 class="panel-title">
Publications by title <center>
</h4> <span class="glyphicon glyphicon-hand-down" aria-hidden="true"></span>
Publications by title
</center>
</h2>
<!-- search box with custom function in Docs_dyna_chart_and_tables.js --> <!-- search box with custom function in Docs_dyna_chart_and_tables.js -->
<div class="pull-left" style="margin-top:1.85em; font-size: 16px;"> <div class="pull-left" style="margin-top:1.85em; font-size: 16px;">
<span class="glyphicon glyphicon-search" aria-hidden="true"></span> <span class="glyphicon glyphicon-search" aria-hidden="true"></span>
......
...@@ -41,7 +41,7 @@ ...@@ -41,7 +41,7 @@
<li> <li>
Version 3.0.0 Version 3.0.0
<ul> <ul>
<li>[NAME] Blue Jasmine</li> <li>[NAME] Blue Jasmin</li>
<li>[CODE] Refactored</li> <li>[CODE] Refactored</li>
<li>[DATABASE] New schema</li> <li>[DATABASE] New schema</li>
</ul> </ul>
......
...@@ -111,14 +111,26 @@ ...@@ -111,14 +111,26 @@
</a> </a>
<i class="caret"></i> <i class="caret"></i>
<ul class="dropdown-menu"> <ul class="dropdown-menu">
<li> {% if view != "graph" %}
<a tabindex="-1" <li>
data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=conditional&amp;bridgeness=5" onclick='gotoexplorer(this)' >With conditional distance </a> <a tabindex="-1"
</li> data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=conditional&amp;bridgeness=5" onclick='gotoexplorer(this)' >With conditional distance </a>
<li> </li>
<a tabindex="-1" <li>
data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=distributional&amp;bridgeness=5" onclick='gotoexplorer(this)' >With distributional distance</a> <a tabindex="-1"
</li> data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=distributional&amp;bridgeness=5" onclick='gotoexplorer(this)' >With distributional distance</a>
</li>
{% else %}
<li>
<a tabindex="-1"
href="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=conditional&amp;bridgeness=5" >With conditional distance </a>
</li>
<li>
<a tabindex="-1"
href="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=distributional&amp;bridgeness=5">With distributional distance</a>
</li>
{% endif %}
</ul> </ul>
</li> </li>
{% endif %} {% endif %}
...@@ -138,30 +150,30 @@ ...@@ -138,30 +150,30 @@
<div class="jumbotron" style="margin-bottom:0"> <div class="jumbotron" style="margin-bottom:0">
<br> <br>
<br> <br>
<!--
<a type="button" class="btn btn-default
href="/projects/{{project.id}}/corpora/{{ corpus.id }}/">Export corpus</a>
--!>
<!-- <li class="divider"></li> --!>
<div class="row"> <div class="row">
<div class="col-md-5"> <h3>
{% if project %} <a href="/projects/{{project.id}}">
<h3><a href="/projects/{{project.id}}"> <span class="glyphicon glyphicon-book" aria-hidden="true"></span>
<span class="glyphicon glyphicon-book" aria-hidden="true"></span> {{ project.name | truncatechars:50}}
{{ project.name }} </a>
</h3>
</div>
<div class="row">
<div class="col-md-1">
</div>
<div class="col-md-6">
<h3>
<span class="glyphicon glyphicon-cd" aria-hidden="true"></span>
{{ resourcename | truncatechars:20 }}
</h3>
<h3>
<span class="glyphicon glyphicon-file" aria-hidden="true"></span>
{{ corpus.name | truncatechars:20 }}
<a class="btn btn-primary" role="button" href="/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv">
<span class="glyphicon glyphicon-download" aria-hidden="true"></span>
</a> </a>
<br>
<span class="glyphicon glyphicon-cd" aria-hidden="true"></span>
{{ resourcename | truncatechars:20 }}
<br>
<span class="glyphicon glyphicon-file" aria-hidden="true"></span>
{{ corpus.name }}
<br>
<span class="glyphicon glyphicon-calendar" aria-hidden="true"></span>
{{ corpus.date }}
</h3> </h3>
{% endif %}
</div> </div>
<div class="col-md-5"> <div class="col-md-5">
<h3> <h3>
......
...@@ -73,6 +73,7 @@ ...@@ -73,6 +73,7 @@
{% for key, corpora in list_corpora.items %} {% for key, corpora in list_corpora.items %}
<h2> <h2>
<div class="row"> <div class="row">
<div class="col-md-1 content"></div>
<span class="glyphicon glyphicon-cd" aria-hidden="true"></span> <span class="glyphicon glyphicon-cd" aria-hidden="true"></span>
{{ key }} {{ key }}
</h2> </h2>
...@@ -80,6 +81,7 @@ ...@@ -80,6 +81,7 @@
<div id="corpus_{{corpus.id}}"> <div id="corpus_{{corpus.id}}">
<div class="row"> <div class="row">
<h4> <h4>
<div class="col-md-1 content"></div>
<div class="col-md-5 content"> <div class="col-md-5 content">
<a href="/projects/{{project.id}}/corpora/{{corpus.id}}"> <a href="/projects/{{project.id}}/corpora/{{corpus.id}}">
<span class="glyphicon glyphicon-file" aria-hidden="true"></span> <span class="glyphicon glyphicon-file" aria-hidden="true"></span>
...@@ -108,8 +110,7 @@ ...@@ -108,8 +110,7 @@
<span class="glyphicon glyphicon-trash" aria-hidden="true"></span> <span class="glyphicon glyphicon-trash" aria-hidden="true"></span>
</button> </button>
</div> </div>
<div class="col-md-3 content">
<div class="col-md-5 content">
{% for state in corpus.hyperdata.statuses %} {% for state in corpus.hyperdata.statuses %}
{% ifequal state.action "ngrams_extraction" %} {% ifequal state.action "ngrams_extraction" %}
{% if state.complete %} {% if state.complete %}
...@@ -169,6 +170,7 @@ ...@@ -169,6 +170,7 @@
{% endifequal %} {% endifequal %}
{% endfor %} {% endfor %}
</div> </div>
<div class="col-md-1 content"></div>
</h4> </h4>
</div> </div>
</div> </div>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment