Commit a180ff49 authored by delanoe's avatar delanoe

Merge branch 'romain-goodies' into unstable

parents 0b233ee1 9ad0d542
......@@ -86,7 +86,19 @@ class bulk_insert:
readline = read
def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None, do_stats=False):
"""
Inserts bulk data with an intermediate check on a uniquekey
(ex: Ngram.terms) to see if the row existed before.
If the row already existed we just retrieve its id.
If it didn't exist we create it and retrieve the id.
Returns a dict {uniquekey => id}
Option:
do_stats: if True, returns a couple (dict, n_new) where n_new is the number of rows that had no previous id and were inserted just now
"""
if cursor is None:
db, cursor = get_cursor()
mustcommit = True
......@@ -109,6 +121,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable = model.__tablename__,
uniquecolumn = uniquekey,
))
# insert what has not been found to the real table
cursor.execute('''
INSERT INTO {sourcetable} ({columns})
......@@ -119,6 +132,11 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable = model.__tablename__,
columns = ', '.join(fields),
))
if do_stats:
# remember how many rows we inserted just now
n_new = cursor.rowcount
# retrieve dict associating unique key to id
cursor.execute('''
SELECT source.id, source.{uniquecolumn}
......@@ -130,10 +148,15 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
columns = ', '.join(fields),
))
result = {
# term : new_id
row[1]: row[0] for row in cursor.fetchall()
}
# this is the end!
cursor.execute('DROP TABLE __tmp__')
if mustcommit:
db.commit()
return result
if do_stats:
return result, n_new
else:
return result
"""
Utilities for group management
- query_groups(groupings_id) to retrieve the couples (mainform, subform)
- query_grouped_ngrams(groupings_id) to retrieve subforms
- group_union() to join two groupings lists
"""
from gargantext.util.db import session, aliased
from gargantext.models import Ngram, NodeNgramNgram
from igraph import Graph # for group_union
def query_groups(groupings_id, details=False):
    """
    Build the query listing the couples (mainform, subform) of a grouping,
    aka (ngram1_id, ngram2_id).

    Parameters:
      - groupings_id: value matched against NodeNgramNgram.node_id
      - details: if False, rows are just the couples
                     (ngram1_id, ngram2_id)
                 if True, rows are quadruplets
                     (ngram1_id, term1, ngram2_id, term2)

    Returns the query itself: running it (ex: with .all()) is left
    to the caller.
    """
    if details:
        # ids + terms: Ngram is joined twice (once per side of the couple),
        # hence the two aliases
        main_form = aliased(Ngram)
        sub_form = aliased(Ngram)
        couples = session.query(
            NodeNgramNgram.ngram1_id,
            main_form.terms,
            NodeNgramNgram.ngram2_id,
            sub_form.terms,
        )
        couples = couples.join(main_form, NodeNgramNgram.ngram1_id == main_form.id)
        couples = couples.join(sub_form, NodeNgramNgram.ngram2_id == sub_form.id)
    else:
        # ids only
        couples = session.query(
            NodeNgramNgram.ngram1_id,
            NodeNgramNgram.ngram2_id,
        )

    # main filter: keep only the rows of the requested grouping
    return couples.filter(NodeNgramNgram.node_id == groupings_id)
def query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
    """
    Build the query listing the "hidden" ngram_ids (subforms) of the groups.

    Works only for grouplists.

    Parameters:
      - groupings_id: value matched against NodeNgramNgram.node_id
      - details: if False, rows contain just the ngram_id
                 if True, rows are couples (ngram_id, term)
      - scoring_metric_id: deprecated, accepted but not used
                 (id of a scoring metric node, TFIDF or OCCS; the score
                  join is disabled: no more OCCS counts of subforms)

    Returns the query itself: running it (ex: with .all()) is left
    to the caller.
    """
    if details:
        # ids + terms (the former NodeNodeNgram score column, its filter on
        # scoring_metric_id and its ordering are no longer part of the query)
        hidden = session.query(NodeNgramNgram.ngram2_id, Ngram.terms)
        hidden = hidden.join(Ngram, NodeNgramNgram.ngram2_id == Ngram.id)
    else:
        # ids only
        hidden = session.query(NodeNgramNgram.ngram2_id)

    # main filter: keep only the rows of the requested grouping
    return hidden.filter(NodeNgramNgram.node_id == groupings_id)
def group_union(g_a_links, g_b_links):
    """
    Synonym groups are modelled by sets of couples in the DB

    Input  : 2 arrays of links (ngramx_id, ngramy_id)
    Output : 1 array of links (ngramx_id, ngramy_id)

    Synonymity is considered transitive so in effect the groups
    can form a set (defined by the connected component of couples).

    A requested feature is also that one node dominates others
    (aka "leader effect"; leader will be in the map, the others won't)

    Summary of major union effects in various cases:

    GROUP 1     Group 2     Group 1 ∪ 2

    A -> B      A -> C      A -> B    (simple union)
                            A -> C

    D -> E      E -> F      D -> E
                            D -> F    (D "leader effect")

    G -> H      G -> I      G -> H    ( transitivity +
                H -> J      G -> I      "leader effect")
                            G -> J

    rloth: this is some slightly amended code
           from Samuel's in rest_v1_0.ngrams.Group.get

    TODO use "most frequent" score if leader candidates are ex aequo by degree.
    """
    # 1) create graph with both lists
    # -------------------------------
    # every ngram_id mentioned by one list or the other
    ngram_ids = set(
        ngram_id
        for link in g_a_links + g_b_links
        for ngram_id in link
    )

    # directed graph with one vertex per ngram_id; our ids are stored
    # (as strings) in the special "name" attribute so that the edges
    # can be created from named couples
    synonyms = Graph(len(ngram_ids), directed=True)
    synonyms.vs['name'] = [str(ngram_id) for ngram_id in ngram_ids]

    # edges of the first list, then edges of the second one
    for links in (g_a_links, g_b_links):
        synonyms.add_edges([(str(src), str(tgt)) for (src, tgt) in links])

    # 2) list resulting components
    # -----------------------------
    # ex: [[0, 3, 6], [1, 2, 8], [4, 5, 9, 11], [7, 10]]
    # (there should be no singletons by construction)
    components = synonyms.as_undirected().components()

    # outdegree of each vertex, for "leader" detection
    # (leader = term most often marked as source by the users)
    out_degrees = synonyms.outdegree()

    # 3) take main node and unnest into new links list
    # -------------------------------------------------
    merged_links = []
    for member_indices in components:
        # map back to our ids, preserving order
        member_ids = [int(name) for name in synonyms.vs[member_indices]['name']]

        # position (within this component) of the vertex with the highest
        # outdegree; when several are ex aequo the first one is kept
        leader_position = max(
            range(len(member_indices)),
            key=lambda pos: out_degrees[member_indices[pos]],
        )

        # set the leader aside and link it to every other member
        leader = member_ids.pop(leader_position)
        merged_links.extend((leader, other_id) for other_id in member_ids)

    return merged_links
This diff is collapsed.
......@@ -9,6 +9,9 @@ from re import sub
from gargantext.util.scheduling import scheduled
def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
"""
@param ngrams_data a set like {('single word', 2), ('apple', 1),...}
"""
print('INTEGRATE')
# integrate ngrams
ngrams_ids = bulk_insert_ifnotexists(
......
......@@ -8,118 +8,88 @@ API views for advanced operations on ngrams and ngramlists
"""
from gargantext.util.http import APIView, get_parameters, JsonHttpResponse,\
ValidationException, Http404
from gargantext.util.db import session, aliased, desc, bulk_insert
ValidationException, Http404, HttpResponse
from gargantext.util.db import session, aliased, bulk_insert
from gargantext.util.db_cache import cache
from sqlalchemy import tuple_
from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.lists import UnweightedList, Translations
# useful subroutines
from gargantext.util.ngramlists_tools import query_list, export_ngramlists, \
import_ngramlists, merge_ngramlists
from gargantext.util.group_tools import query_grouped_ngrams
def _query_list(list_id,
pagination_limit=None, pagination_offset=None,
details=False, scoring_metric_id=None
):
class List(APIView):
"""
see already available API query api/nodes/<list_id>?fields[]=ngrams
"""
Paginated listing of ngram_ids in a NodeNgram lists.
pass
Works for a mainlist or stoplist or maplist (not grouplists!)
Parameter:
- pagination_limit, pagination_offset
- details: if False, send just the array of ngram_ids
if True, send triples with (ngram_id, term, scoring)
^^^^^^^
- scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
(for details and sorting)
class CSVLists(APIView):
"""
if not details:
# simple contents
query = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == list_id)
else:
# detailed contents (terms and some NodeNodeNgram for score)
# NB: score can be undefined (eg ex-subform that now became free)
# ==> we need outerjoin
# and the filter needs to have scoring_metric_id so we do it before
ScoresTable = (session
.query(NodeNodeNgram.score, NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == scoring_metric_id)
.subquery()
)
query = (session
.query(
NodeNgram.ngram_id,
Ngram.terms,
ScoresTable.c.score
)
.join(Ngram, NodeNgram.ngram_id == Ngram.id)
# main filter ----------------------
.filter(NodeNgram.node_id == list_id)
# scores if possible
.outerjoin(ScoresTable,
ScoresTable.c.ngram_id == NodeNgram.ngram_id)
.order_by(desc(ScoresTable.c.score))
)
For CSV exports of all lists of a corpus
if pagination_limit:
query = query.limit(pagination_limit)
Or CSV import into existing lists as "patch"
"""
def get(self, request):
params = get_parameters(request)
corpus_id = int(params.pop("corpus"))
corpus_node = cache.Node[corpus_id]
if pagination_offset:
query = query.offset(pagination_offsets)
# response is file-like + headers
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename="corpus-%i_gargantext_term_list.csv"' % corpus_id
return query
# fill the response with the data
export_ngramlists(corpus_node, fname=response, titles=True)
return response
def post(self,request):
"""
Merge the lists of a corpus with other lists from a CSV source
or from another corpus
params in request.GET:
corpus: the corpus whose lists are getting patched
params in request.FILES:
csvsource: the csv file
def _query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
"""
Listing of "hidden" ngram_ids from the groups
or in get
dbsource: another corpus instead of the csvfile
(? this last option should perhaps not be in CSVLists ?)
Works only for grouplists
NB: not using PATCH because we'll need POST file upload
Parameter:
- details: if False, send just the array of ngram_ids
if True, send triples with (ngram_id, term, scoring)
^^^^^^^
deprecated: scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
(for details and sorting)
(no more OCCS counts of subforms)
"""
if not details:
# simple contents
query = session.query(NodeNgramNgram.ngram2_id)
else:
# detailed contents (terms and some NodeNodeNgram for score)
query = (session
.query(
NodeNgramNgram.ngram2_id,
Ngram.terms,
# NodeNodeNgram.score #
)
.join(Ngram, NodeNgramNgram.ngram2_id == Ngram.id)
# .join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
# .filter(NodeNodeNgram.node1_id == scoring_metric_id)
# .order_by(desc(NodeNodeNgram.score))
)
/!\ We assume we checked the file size client-side before upload
# main filter
# -----------
query = query.filter(NodeNgramNgram.node_id == groupings_id)
£TODO check authentication and user.id
"""
# this time the corpus param is the one with the target lists to be patched
params = get_parameters(request)
corpus_id = int(params.pop("onto_corpus"))
corpus_node = cache.Node[corpus_id]
return query
# request also contains the file
# csv_file has type django.core.files.uploadedfile.InMemoryUploadedFile
# ----------------------
csv_file = request.data['csvfile']
# import the csv
new_lists = import_ngramlists(csv_file)
del csv_file
# merge the new_lists onto those of the target corpus
log_msg = merge_ngramlists(new_lists, onto_corpus=corpus_node)
return JsonHttpResponse({
'log': log_msg,
}, 200)
class List(APIView):
    """
    Placeholder view: this class defines no handler of its own.

    See the already available API query api/nodes/<list_id>?fields[]=ngrams
    """
    pass
class GroupChange(APIView):
......@@ -441,7 +411,7 @@ class MapListGlance(APIView):
listmembers = {'maplist':[]} # ngram ids sorted per list name
# infos for all ngrams from maplist
map_ngrams = _query_list(maplist_id, details=True,
map_ngrams = query_list(maplist_id, details=True,
scoring_metric_id= scores_id).all()
# ex: [(8805, 'mean age', 4.0),
......@@ -566,25 +536,25 @@ class ListFamily(APIView):
if "head" in parameters:
# head <=> only mainlist AND only k top ngrams
glance_limit = int(parameters['head'])
mainlist_query = _query_list(mainlist_id, details=True,
mainlist_query = query_list(mainlist_id, details=True,
pagination_limit = glance_limit,
scoring_metric_id= scores_id)
else:
# infos for all ngrams from mainlist
mainlist_query = _query_list(mainlist_id, details=True,
mainlist_query = query_list(mainlist_id, details=True,
scoring_metric_id= scores_id)
# infos for grouped ngrams, absent from mainlist
hidden_ngrams_query = _query_grouped_ngrams(groups_id, details=True,
hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True,
scoring_metric_id= scores_id)
# infos for stoplist terms, absent from mainlist
stop_ngrams_query = _query_list(other_list_ids['stoplist'], details=True,
stop_ngrams_query = query_list(other_list_ids['stoplist'], details=True,
scoring_metric_id=scores_id)
# and for the other lists (stop and map)
# no details needed here, just the member ids
for li in other_list_ids:
li_elts = _query_list(other_list_ids[li], details=False
li_elts = query_list(other_list_ids[li], details=False
).all()
# simple array of ngram_ids
listmembers[li] = [ng[0] for ng in li_elts]
......
......@@ -27,6 +27,15 @@ urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view()
# \
# corpus id
, url(r'^ngramlists/export$', ngramlists.CSVLists.as_view() )
# get a CSV export of the ngramlists of a corpus
# ex: GET ngramlists/export?corpus=43
# TODO : unify to a /api/ngrams?formatted=csv
# (similar to /api/nodes?formatted=csv)
, url(r'^ngramlists/import$', ngramlists.CSVLists.as_view() )
# same handling class as export (CSVLists)
# but this route used only for POST + file
, url(r'^ngramlists/change$', ngramlists.ListChange.as_view() )
# add or remove ngram from a list
......
......@@ -33,6 +33,9 @@ def ngramtable(request, project_id, corpus_id):
'project': project,
'corpus' : corpus,
'resourcename' : resourcename(corpus),
'view': 'terms'
'view': 'terms',
# for the CSV import modal
'csvimportroute': "/api/ngramlists/import?onto_corpus=%i"% corpus.id
},
)
......@@ -11,6 +11,7 @@ django-pgfields==1.4.4
django-pgjsonb==0.0.16
djangorestframework==3.3.2
html5lib==0.9999999
python-igraph>=0.7.1
jdatetime==1.7.2
kombu==3.0.33 # messaging
nltk==3.1
......
......@@ -19,3 +19,13 @@
line-height: .85;
margin-bottom: -5px;
}
/* export button: only its top margin is set here */
.exportbtn {
/* border: 1px solid #333 ; */
margin-top:17px ; /* valigns with bootstrap h2 */
}
/* any glyphicon placed inside a .btn element */
.btn .glyphicon {
/* glyphicons are always rendered too high within bootstrap buttons */
vertical-align:middle
}
......@@ -72,6 +72,15 @@
<button id="Save_All" class="btn btn-muted" disabled style="font-size:120%">
<b>Save all changes</b>
</button>
<br/>
<br/>
<!-- import icon -->
<span class="needsaveicon glyphicon glyphicon-import"></span>
&nbsp;
<button id="ImportList" class="btn btn-warning" style="font-size:120%"
onclick="$('#csvimport').modal('show');">
<b>Import a Termlist</b>
</button>
</div>
<!-- see in javascript function queries.functions['my_state_filter'] -->
<div class="pull-right" style="margin-top:2.1em;padding-left:1em;">
......@@ -107,25 +116,110 @@
</div> <!-- /div panel -->
</div> <!-- /jumbotron -->
<!--
<button id="ImportList" onclick="GetUserPortfolio();" class="btn btn-warning">
Import a Corpus-List
</button>
-->
<!--</div> This div is closed in the menu !-->
<!--</div> This div is closed in the menu !-->
<!--
# stub to import a list (aka orange button)
<button id="ImportList" onclick="GetUserPortfolio();" class="btn btn-warning">Import a Corpus-List</button>
-->
<div class="modal" aria-hidden="true" id="csvimport">
<div class="modal-dialog">
<div class="modal-content">
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
<h3 id="myModalLabel">Import a CSV term list</h3>
</div>
<div class="modal-body" id="uploadform">
<form id="csvimportform"
onsubmit="return postCSV(event)"
enctype="multipart/form-data"
method="post">
{% csrf_token %}
<label>From your disk:</label>
<input type="file" id="csvfile" accept="text/csv">
<br/>
<label>From another corpus:</label>
<p>TODO</p>
<br/>
<input type="submit" class="btn btn-xs btn-info" id="csvsubmit" value="Submit" />
</form>
</div>
<div class="modal-footer" id="formanswer"></div>
</div>
</div>
</div>
<script type="text/javascript" src="{% static "lib/jquery/dynatable/jquery.dynatable.js" %}"></script>
<!-- custom-lib for dynatable.js and dc.js -->
<script type="text/javascript" src="{% static "lib/gargantext/NGrams_dyna_chart_and_table.js" %}"></script>
<script type="text/javascript">
/* thanks c24b !
 *
 * Submit handler of the CSV import form: posts the chosen file by ajax
 * and reports the outcome in #formanswer.
 *
 * Uses csvimportroute variable from the django template
 *   Ex: /api/ngramlists/import?onto_corpus=corpus_id
 *
 * Uses input#csvfile as source data.
 *
 * @param e  the submit event (its default page reload is cancelled)
 * @return   false in all cases, so an inline onsubmit="return postCSV(event)"
 *           never triggers the usual form submission
 */
function postCSV(e){
    // don't do page reload of usual submits
    e.preventDefault()

    // 2MB ≈ 70000 ngrams
    var max_size = 2097152

    // we take it straight from the input element
    // (declared with var: it used to leak as an implicit global)
    var theFile = $('input#csvfile')[0].files[0]

    // debug
    // console.log(theFile.name, "size", theFile.size, theFile.lastModifiedDate)

    if (! theFile) {
        console.warn('Ignoring "submit": no provided file')
        return false
    }

    if (theFile.size > max_size) {
        console.warn('Ignoring "submit": file is too big')
        $('#formanswer').html(
            'The import failed: your file is too big ('+max_size/1024+'kB max).'
        );
        return false
    }

    // normal case
    // append into an empty form (or fixme: initialize it using form element)
    var myFileFormData = new FormData();
    myFileFormData.append("csvfile", theFile)

    //postCorpusFile
    $.ajax({
        url: "{{csvimportroute | safe}}",
        type: 'POST',
        async: true,
        // both false: the FormData must be sent untouched so that the
        // browser sets the multipart boundary itself
        contentType: false,
        processData: false,
        data: myFileFormData,
        beforeSend: function(xhr) {
            xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
        },
        success: function(response) {
            // NOTE(review): response['log'] is inserted as HTML without
            // escaping -- confirm the server log can never carry markup
            // coming from the uploaded file
            var my_html = "<h2 style='color:green'>IMPORT OK ! </h2>"
            my_html += "<p class='note'>" + response['log'].replace(/\n/g, '<br/>') + "</p>"
            my_html += "<p>(this page will reload in 3s)</p>"
            $('#formanswer').html(my_html);
            console.log(response) ;

            // reload after 3s (function callback instead of a string to evaluate)
            setTimeout(function() { location.reload(true) }, 3000);
        },
        error: function(result) {
            $('#formanswer').html('Erreur');
            console.error(result);
        },
    });

    // shown right away: the request above is asynchronous
    $('#formanswer').html('CSV import in Progress');
    return false
}
</script>
{% endblock %}
......@@ -41,7 +41,7 @@
{% if corpus %}
<li><a href="/projects/{{project.id}}/corpora/{{corpus.id}}">
<span class="glyphicon glyphicon-file" aria-hidden="true"></span>
{{corpus.name | truncatechars:15}}
{{corpus.name | truncatechars:25}}
</a>
</li>
{% endif %}
......@@ -150,12 +150,32 @@
<br>
<br>
<div class="row">
<h3>
<a href="/projects/{{project.id}}">
<span class="glyphicon glyphicon-book" aria-hidden="true"></span>
{{ project.name | truncatechars:50}}
<div class="col-md-6">
<h3>
<a href="/projects/{{project.id}}">
<span class="glyphicon glyphicon-book" aria-hidden="true"></span>
{{ project.name | truncatechars:50}}
</a>
</h3>
</div>
<!-- export button -->
<div class="col-md-6">
{% if view == 'terms' %}
<a class="btn btn-primary exportbtn pull-right" role="button"
href="/api/ngramlists/export?corpus={{corpus.id}}"
title="Export terms table in CSV">
Export terms table &nbsp; <span class="glyphicon glyphicon-download" aria-hidden="true"></span>
</a>
{% elif view == 'titles' %}
<a class="btn btn-primary exportbtn pull-right" role="button"
href="/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv"
title="Export full corpus in CSV">
Export corpus &nbsp; <span class="glyphicon glyphicon-download" aria-hidden="true"></span>
</a>
</h3>
{% else %}
<!-- TODO export journal table -->
{% endif %}
</div>
</div>
<div class="row">
<div class="col-md-1">
......@@ -167,10 +187,7 @@
</h3>
<h3>
<span class="glyphicon glyphicon-file" aria-hidden="true"></span>
{{ corpus.name | truncatechars:20 }}
<a class="btn btn-primary" role="button" href="/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv">
<span class="glyphicon glyphicon-download" aria-hidden="true"></span>
</a>
{{ corpus.name | truncatechars:30 }}
</h3>
</div>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment