Commit 2b82c054 authored by Romain Loth

merge in refactoring

parents 8499ab9a 5e7c5603
......@@ -195,17 +195,17 @@ RESOURCETYPES = [
]
# linguistic extraction parameters ---------------------------------------------
DEFAULT_TFIDF_CUTOFF_RATIO = .45 # MAINLIST maximum terms in %
DEFAULT_TFIDF_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
DEFAULT_TFIDF_HARD_LIMIT = 750 # MAINLIST maximum terms abs
DEFAULT_TFIDF_HARD_LIMIT = 5000 # MAINLIST maximum terms abs
# (makes COOCS larger ~ O(N²) /!\)
DEFAULT_COOC_THRESHOLD = 2 # inclusive minimum for COOCS coefs
# (makes COOCS more sparse)
DEFAULT_MAPLIST_MAX = 300 # MAPLIST maximum terms
DEFAULT_MAPLIST_MAX = 350 # MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5 # part of monograms in MAPLIST
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .15 # part of monograms in MAPLIST
DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule
# (initial ngrams number is a power law of this /!\)
......
......@@ -124,7 +124,8 @@ class CSVParser(Parser):
for columnum in range( Coords["column"],len(tokens) ):
data = tokens[columnum]
RecordDict[ Headers_Int2Str[columnum] ] = data
hyperdata_list.append( RecordDict )
if len(RecordDict.keys())>0:
hyperdata_list.append( RecordDict )
# # = = = = [ / Reading the whole CSV and saving ] = = = = #
return hyperdata_list
......@@ -7,7 +7,7 @@ from gargantext.models.ngrams import Node, Ngram, NodeNgram, \
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from sqlalchemy import desc, asc
from gargantext.constants import DEFAULT_MAPLIST_MAX,\
DEFAULT_MAPLIST_MONOGRAMS_RATIO
......@@ -52,7 +52,7 @@ def do_maplist(corpus,
primary_groupterms_subquery = (session
# we want only primary terms (ngram1)
.query(NodeNgramNgram.ngram1_id)
.query(NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == grouplist_id)
.subquery()
)
......@@ -64,13 +64,13 @@ def do_maplist(corpus,
.join(Ngram, Ngram.id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specificity_id)
.filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
.filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
.filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery))
)
# TODO: move these 2 pools up to mainlist selection
top_monograms = (query
.filter(Ngram.n == 1)
.order_by(desc(ScoreSpec.weight))
.order_by(asc(ScoreSpec.weight))
.limit(monograms_limit)
.all()
)
......@@ -81,7 +81,7 @@ def do_maplist(corpus,
.limit(multigrams_limit)
.all()
)
obtained_mono = len(top_monograms)
obtained_mono = len(top_monograms)
obtained_multi = len(top_multigrams)
obtained_total = obtained_mono + obtained_multi
# print("MAPLIST: top_monograms =", obtained_mono)
......
......@@ -27,10 +27,10 @@ def is_stop_word(ngram, stop_words=None):
# , "(.*)(\.)(.*)" trop fort (enlève les sigles !)
, "(.*)(\,)(.*)"
, "(.*)(< ?/?p ?>)(.*)" # marques de paragraphes
, "(.*)(study)(.*)"
, "(.*)(study|elsevier)(.*)"
, "(.*)\b(xx|xi|xv)\b(.*)"
, "(.*)(result)(.*)"
, "(.*)(année|nombre|moitié)(.*)"
, "(.*)(year|année|nombre|moitié)(.*)"
, "(.*)(temps)(.*)"
, "(.*)(%)(.*)"
, "(.*)(\{)(.*)"
......
......@@ -7,6 +7,7 @@ from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame
import pandas as pd
def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
'''
......@@ -33,13 +34,23 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
d = DataFrame(matrix).fillna(0)
x = DataFrame(matrix).fillna(0)
# proba (x/y) ( <= on divise chaque colonne par son total)
d = d / d.sum(axis=0)
# proba (x/y) ( <= on divise chaque ligne par son total)
x = x / x.sum(axis=1)
# vectorisation
# d:Matrix => v: Vector (len = nb_ngrams)
v = d.sum(axis=1)
# v = d.sum(axis=1) (- lui-même)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top inclus ou exclus
#n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific (asc is spec, desc is generic)
v = ( xs - ys) / ( 2 * (x.shape[0] - 1))
## d ##
#######
......@@ -66,7 +77,7 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
# pour l'instant on va utiliser les sommes en ligne comme ranking de spécificité
# (**même** ordre qu'avec la formule d'avant le refactoring mais calcul + simple)
# TODO analyser la cohérence math ET sem de cet indicateur
v.sort_values(inplace=True)
#v.sort_values(inplace=True)
# [ ('biodiversité' , 0.333 ),
# ('Grenelle' , 0.5 ),
......@@ -92,10 +103,11 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
the_id = specnode.id
# print(v)
pd.options.display.float_format = '${:,.2f}'.format
data = WeightedList(
zip( v.index.tolist()
, v.values.tolist()
, v.values.tolist()[0]
)
)
data.save(the_id)
......
......@@ -8,7 +8,7 @@ FIXME: "having the same source" means we need to select inside hyperdata
with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
from sqlalchemy import text # for query from raw SQL statement
from math import log
......@@ -29,6 +29,13 @@ def compute_occs(corpus, overwrite_id = None):
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
# 0) Get the groups
group_id = (session.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "GROUPLIST")
.first()
)
# 1) all the doc_ids of our corpus (scope of counts for filter)
# slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
......@@ -45,6 +52,7 @@ def compute_occs(corpus, overwrite_id = None):
NodeNgram.ngram_id,
func.sum(NodeNgram.weight)
)
#.join(NodeNgramNgram, NodeNgramNgram.node_id == group_id)
.filter(NodeNgram.node_id.in_(docids_subquery))
.group_by(NodeNgram.ngram_id)
.all()
......
......@@ -84,9 +84,10 @@ class NodeListResource(APIView):
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename="Gargantext_Corpus.csv"'
writer = csv.writer(response)
writer = csv.writer(response, delimiter='\t')
keys = [ 'title' , 'journal', 'publication_date'
keys = [ 'title' , 'journal'
, 'publication_year', 'publication_month', 'publication_day'
, 'abstract', 'authors']
writer.writerow(keys)
......
......@@ -170,7 +170,7 @@ function toggleFavstatus (rec_id) {
var myHttpAction = statusBefore ? 'DELETE' : 'PUT'
$.ajax({
url: 'http://localhost:8000/api/nodes/'+corpus_id+'/favorites?docs='+doc_id,
url: window.location.origin + '/api/nodes/'+corpus_id+'/favorites?docs='+doc_id,
type: myHttpAction,
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
......@@ -602,7 +602,7 @@ $.ajax({
success: function(maindata){
// unfortunately favorites info is in a separate request (other nodes)
$.ajax({
url: 'http://localhost:8000/api/nodes/'+corpus_id+'/favorites',
url: window.location.origin + '/api/nodes/'+corpus_id+'/favorites',
success: function(favdata){
// initialize favs lookup
for (var i in favdata['favdocs']) {
......
......@@ -290,7 +290,7 @@ function Main_test( data , initial) {
var div_table = '<p align="right">'+"\n"
div_table += '<table id="my-ajax-table" class="table table-bordered table-hover">'+"\n"
div_table += "\t"+'<thead>'+"\n"
div_table += "\t"+"\t"+'<th data-dynatable-column="name">Title</th>'+"\n"
div_table += "\t"+"\t"+'<th data-dynatable-column="name"><span class="glyphicon glyphicon-text-size"></span> Title</th>'+"\n"
div_table += "\t"+"\t"+'<th data-dynatable-column="score" data-dynatable-sorts="score">No. Pubs</th>'+"\n"
// div_table += "\t"+"\t"+'<th id="score_column_id" data-dynatable-sorts="score" data-dynatable-column="score">Score</th>'+"\n"
div_table += "\t"+"\t"+'</th>'+"\n"
......
......@@ -59,9 +59,12 @@
<div class="panel panel-default">
<div class="panel-heading">
<h4 class="panel-title">
Publications by source
</h4>
<h2 class="panel-title">
<center>
<span class="glyphicon glyphicon-hand-down" aria-hidden="true"></span>
Publications by source
</center>
</h2>
</div>
......
......@@ -55,12 +55,15 @@
<div class="panel panel-default">
<div class="panel-heading">
<h4 class="panel-title">
<h2 class="panel-title">
<center>
<span class="glyphicon glyphicon-hand-down" aria-hidden="true"></span>
Extracted terms
<!-- <button title='run test function' onclick="doATest()">
TEST
</button> -->
</a>
</center>
</h2>
<!-- see in javascript function queries.functions['my_state_filter'] -->
<div class="pull-left" style="margin-top:1.85em;">
......
......@@ -54,9 +54,12 @@
<div class="jumbotron">
<div class="panel panel-default">
<div class="panel-heading">
<h4 class="panel-title">
Publications by title
</h4>
<h2 class="panel-title">
<center>
<span class="glyphicon glyphicon-hand-down" aria-hidden="true"></span>
Publications by title
</center>
</h2>
<!-- search box with custom function in Docs_dyna_chart_and_tables.js -->
<div class="pull-left" style="margin-top:1.85em; font-size: 16px;">
<span class="glyphicon glyphicon-search" aria-hidden="true"></span>
......
......@@ -41,7 +41,7 @@
<li>
Version 3.0.0
<ul>
<li>[NAME] Blue Jasmine</li>
<li>[NAME] Blue Jasmin</li>
<li>[CODE] Refactored</li>
<li>[DATABASE] New schema</li>
</ul>
......
......@@ -111,14 +111,26 @@
</a>
<i class="caret"></i>
<ul class="dropdown-menu">
<li>
<a tabindex="-1"
data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=conditional&amp;bridgeness=5" onclick='gotoexplorer(this)' >With conditional distance </a>
</li>
<li>
<a tabindex="-1"
data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=distributional&amp;bridgeness=5" onclick='gotoexplorer(this)' >With distributional distance</a>
</li>
{% if view != "graph" %}
<li>
<a tabindex="-1"
data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=conditional&amp;bridgeness=5" onclick='gotoexplorer(this)' >With conditional distance </a>
</li>
<li>
<a tabindex="-1"
data-url="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=distributional&amp;bridgeness=5" onclick='gotoexplorer(this)' >With distributional distance</a>
</li>
{% else %}
<li>
<a tabindex="-1"
href="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=conditional&amp;bridgeness=5" >With conditional distance </a>
</li>
<li>
<a tabindex="-1"
href="/projects/{{project.id}}/corpora/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams&amp;distance=distributional&amp;bridgeness=5">With distributional distance</a>
</li>
{% endif %}
</ul>
</li>
{% endif %}
......@@ -138,30 +150,30 @@
<div class="jumbotron" style="margin-bottom:0">
<br>
<br>
<!--
<a type="button" class="btn btn-default
href="/projects/{{project.id}}/corpora/{{ corpus.id }}/">Export corpus</a>
--!>
<!-- <li class="divider"></li> --!>
<div class="row">
<div class="col-md-5">
{% if project %}
<h3><a href="/projects/{{project.id}}">
<span class="glyphicon glyphicon-book" aria-hidden="true"></span>
{{ project.name }}
<h3>
<a href="/projects/{{project.id}}">
<span class="glyphicon glyphicon-book" aria-hidden="true"></span>
{{ project.name | truncatechars:50}}
</a>
</h3>
</div>
<div class="row">
<div class="col-md-1">
</div>
<div class="col-md-6">
<h3>
<span class="glyphicon glyphicon-cd" aria-hidden="true"></span>
{{ resourcename | truncatechars:20 }}
</h3>
<h3>
<span class="glyphicon glyphicon-file" aria-hidden="true"></span>
{{ corpus.name | truncatechars:20 }}
<a class="btn btn-primary" role="button" href="/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv">
<span class="glyphicon glyphicon-download" aria-hidden="true"></span>
</a>
<br>
<span class="glyphicon glyphicon-cd" aria-hidden="true"></span>
{{ resourcename | truncatechars:20 }}
<br>
<span class="glyphicon glyphicon-file" aria-hidden="true"></span>
{{ corpus.name }}
<br>
<span class="glyphicon glyphicon-calendar" aria-hidden="true"></span>
{{ corpus.date }}
</h3>
{% endif %}
</div>
<div class="col-md-5">
<h3>
......
......@@ -73,6 +73,7 @@
{% for key, corpora in list_corpora.items %}
<h2>
<div class="row">
<div class="col-md-1 content"></div>
<span class="glyphicon glyphicon-cd" aria-hidden="true"></span>
{{ key }}
</h2>
......@@ -80,6 +81,7 @@
<div id="corpus_{{corpus.id}}">
<div class="row">
<h4>
<div class="col-md-1 content"></div>
<div class="col-md-5 content">
<a href="/projects/{{project.id}}/corpora/{{corpus.id}}">
<span class="glyphicon glyphicon-file" aria-hidden="true"></span>
......@@ -108,8 +110,7 @@
<span class="glyphicon glyphicon-trash" aria-hidden="true"></span>
</button>
</div>
<div class="col-md-5 content">
<div class="col-md-3 content">
{% for state in corpus.hyperdata.statuses %}
{% ifequal state.action "ngrams_extraction" %}
{% if state.complete %}
......@@ -169,6 +170,7 @@
{% endifequal %}
{% endfor %}
</div>
<div class="col-md-1 content"></div>
</h4>
</div>
</div>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment