Commit 2e43fa9a authored by Romain Loth's avatar Romain Loth

add "export terms table" option + corresponding logic and api route (ex:...

add "export terms table" option + corresponding logic and api route (ex: http://localhost:8000/api/ngramlists/export?corpus=4543)
parent 3aa5a72c
"""
Utilities for group management
- query_groups(groupings_id) to retrieve (mainform, subform) couples
- query_grouped_ngrams(groupings_id) to retrieve subforms
- group_union() to join two groupings lists
"""
from gargantext.util.db import session, aliased
from gargantext.models import Ngram, NodeNgramNgram
from igraph import Graph # for group_union
def query_groups(groupings_id, details=False):
    """
    Build the query listing the (mainform, subform) couples of a grouping,
    aka (ngram1_id, ngram2_id).

    Parameters:
      - groupings_id: node id the NodeNgramNgram rows are filtered on
      - details: if False, rows are couples (ngram1_id, ngram2_id)
                 if True, rows are quadruplets
                 (ngram1_id, term1, ngram2_id, term2)

    Returns the query itself, not its results (the caller decides how
    to run it).
    """
    if details:
        # Ngram is joined twice (once per side of the couple),
        # hence one alias for each side
        mainform = aliased(Ngram)
        subform = aliased(Ngram)
        query = session.query(
            NodeNgramNgram.ngram1_id,
            mainform.terms,
            NodeNgramNgram.ngram2_id,
            subform.terms,
        )
        query = query.join(mainform, NodeNgramNgram.ngram1_id == mainform.id)
        query = query.join(subform, NodeNgramNgram.ngram2_id == subform.id)
    else:
        # ids only
        query = session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id)

    # main filter: keep only the couples of the requested grouping
    return query.filter(NodeNgramNgram.node_id == groupings_id)
def query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
    """
    Build the query listing the "hidden" ngrams of a grouping, i.e. the
    subform side (ngram2_id) of each couple.
    Works only for grouplists.

    Parameters:
      - groupings_id: node id the NodeNgramNgram rows are filtered on
      - details: if False, rows hold just the ngram2_id
                 if True, rows are couples (ngram2_id, terms)
      - scoring_metric_id: deprecated and not used by this function
                 (it used to select a TFIDF or OCCS scoring node for a
                 score column and sorting; subforms no longer have
                 OCCS counts)

    Returns the query itself, not its results.
    """
    if details:
        # ids + terms (the former join on NodeNodeNgram for a score
        # column and score ordering is disabled)
        query = session.query(NodeNgramNgram.ngram2_id, Ngram.terms)
        query = query.join(Ngram, NodeNgramNgram.ngram2_id == Ngram.id)
    else:
        # ids only
        query = session.query(NodeNgramNgram.ngram2_id)

    # main filter: keep only the couples of the requested grouping
    return query.filter(NodeNgramNgram.node_id == groupings_id)
def group_union(g_a_links, g_b_links):
    """
    Merge two synonym groupings into a single list of links.

    Synonym groups are modelled by sets of couples in the DB.

    Input  : 2 lists of links (ngramx_id, ngramy_id)
    Output : 1 list of links (main_id, other_id)

    Synonymity is considered transitive, so in effect each group is a
    connected component of the couples. Within each component one node
    dominates the others ("leader effect": the leader will be in the
    map, the others won't). The leader is the node with the highest
    outdegree in the merged directed graph, i.e. the term most often
    marked as source; on a tie the first such node of the component wins.

    Summary of major union effects in various cases:

      GROUP 1     Group 2     Group 1 ∪ 2

      A -> B      A -> C      A -> B   (simple union)
                              A -> C

      D -> E      E -> F      D -> E
                              D -> F   (D "leader effect")

      G -> H      G -> I      G -> H   (transitivity +
                  H -> J      G -> I    "leader effect")
                              G -> J

    rloth: this is some slightly amended code
           from Samuel's in rest_v1_0.ngrams.Group.get
    """
    # 1) one directed graph holding the links of both groupings
    # ----------------------------------------------------------
    # every ngram_id seen on either side of any couple
    vertex_ids = set(
        ngid for couple in g_a_links + g_b_links for ngid in couple
    )

    sg = Graph(len(vertex_ids), directed=True)

    # our ids become the "name" attribute, so that edges can be
    # created from couples of names
    sg.vs['name'] = [str(ngid) for ngid in vertex_ids]

    # edges of the first grouping, then edges of the second one
    for links in (g_a_links, g_b_links):
        sg.add_edges([(str(src), str(tgt)) for (src, tgt) in links])

    # 2) connected components = merged synonym groups
    # ------------------------------------------------
    # direction is ignored for membership...
    synonym_components = sg.as_undirected().components()

    # ...but kept for leader detection (outdegree of each vertex)
    odegs = sg.outdegree()

    # 3) per component: set the leader aside, link it to all the others
    # ------------------------------------------------------------------
    new_links = []
    for component in synonym_components:
        # back to our own ids, in the same order as the component
        members = [int(name) for name in sg.vs[component]['name']]

        # position (within the component) of the first vertex having
        # the highest outdegree
        leader_position = max(
            range(len(component)),
            key=lambda pos: odegs[component[pos]]
        )
        leader = members.pop(leader_position)

        new_links.extend((leader, other) for other in members)

    return new_links
"""
Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
- query_list(list_id) to retrieve ngrams
- export_ngramlists(corpus_node) to export the 3 lists as CSV
- import_ngramlists(corpus_id)
"""
from gargantext.util.group_tools import query_groups, group_union
from gargantext.util.db import session, bulk_insert, desc, func
from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, NodeNgramNgram
from sqlalchemy.sql import exists
from os import path
from csv import writer, reader, QUOTE_MINIMAL
from collections import defaultdict
from re import match
from io import StringIO # pseudo file to write CSV to memory
def query_list(list_id,
               pagination_limit=None, pagination_offset=None,
               details=False, scoring_metric_id=None
               ):
    """
    Paginated listing of the ngrams in a NodeNgram list.
    Works for a mainlist or stoplist or maplist (not grouplists!)

    Parameters:
      - list_id: node id the NodeNgram rows are filtered on
      - pagination_limit, pagination_offset: optional, applied only
          when truthy (so 0 or None means "no limit" / "no offset")
      - details: if False, rows hold just the ngram_id
                 if True and no scoring_metric_id, rows are triples
                     (ngram_id, terms, n)
                 if True and a scoring_metric_id, rows are triples
                     (ngram_id, terms, score), sorted by descending score
      - scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
                           (for details and sorting)

    Returns the query itself, not its results.
    """
    if not details:
        # simple contents (ids only)
        query = (session
                 .query(NodeNgram.ngram_id)
                 .filter(NodeNgram.node_id == list_id)
                 )

    elif not scoring_metric_id:
        # detailed contents (id + terms + n)
        query = (session
                 .query(Ngram.id, Ngram.terms, Ngram.n)
                 .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                 .filter(NodeNgram.node_id == list_id)
                 )

    else:
        # detailed contents (id + terms) + score
        # NB: score can be undefined (eg ex-subform that now became free)
        #     ==> we need an outerjoin
        #     and the filter on scoring_metric_id has to be applied
        #     beforehand, hence the subquery
        ScoresTable = (session
                       .query(NodeNodeNgram.score, NodeNodeNgram.ngram_id)
                       .filter(NodeNodeNgram.node1_id == scoring_metric_id)
                       .subquery()
                       )

        query = (session
                 .query(
                     NodeNgram.ngram_id,
                     Ngram.terms,
                     ScoresTable.c.score
                 )
                 .join(Ngram, NodeNgram.ngram_id == Ngram.id)
                 # main filter ----------------------
                 .filter(NodeNgram.node_id == list_id)
                 # scores if possible
                 .outerjoin(ScoresTable,
                            ScoresTable.c.ngram_id == NodeNgram.ngram_id)
                 .order_by(desc(ScoresTable.c.score))
                 )

    if pagination_limit:
        query = query.limit(pagination_limit)

    if pagination_offset:
        # fixed: this used the undefined name 'pagination_offsets',
        # which raised a NameError for any non-zero offset
        query = query.offset(pagination_offset)

    return query
# helper func for exports
def ngrams_to_csv_rows(ngram_objs, id_groupings=None, list_type=""):
    """
    Helper for exports: turn ngram objects into a table of CSV rows.

    @param ngram_objs
        an iterable of ngrams exposing .id, .terms and .n
        (eg: from a db query.all())

    @param id_groupings (optional)
        a dict of sets {mainform_id : {subform_idA, subform_idB, etc}}
        (when absent, no ngram is considered to have subforms)

    @param list_type
        a str ('map', 'main' or 'stop') copied as is into column 4

    Outputs a basic info row per ngram, with 5 columns:
        (ng_id, term string, term size, list_type, grouped subforms)
    where the last column joins the subform ids with "|" (ex: "4|42")
    in the iteration order of their container, or is "" if the ngram
    has no entry in id_groupings.

    Return format is a csv_rows matrix (as a list of lists)
       [
        [line1_colA, line1_colB..],
        [line2_colA, line2_colB..],
        ..
       ]
    (to be used for instance like: csv.writer.writerows(csv_rows))
    """
    # a None default avoids sharing one mutable dict between calls
    if id_groupings is None:
        id_groupings = {}

    csv_rows = []

    for ng_obj in ngram_objs:
        ng_id = ng_obj.id

        # membership test first: it never creates an entry, even when
        # id_groupings is a defaultdict
        if ng_id in id_groupings:
            this_grouped = "|".join(str(gid) for gid in id_groupings[ng_id])
        else:
            this_grouped = ""

        # ID , term , n , list_type , grouped_id|grouped_id...
        csv_rows.append(
            [ng_id, ng_obj.terms, ng_obj.n, list_type, this_grouped]
        )

    return csv_rows
def export_ngramlists(node, fname=None, delimiter="\t", titles=False):
    """
    Export of the 3 lists under a corpus node (MAP, MAIN, STOP)
    with local combination of groups.

    @param node: the corpus node

    @param fname: optional destination of the CSV
                   - None: nothing is written to disk and the CSV
                           contents are returned as a str
                   - a str: path of the file to write
                   - anything else: used directly as a writable
                           file-like handle (eg an http response)
                           and left open

    @param delimiter: optional column separator in the CSV
                      (if absent defaults to tabulation)

    @param titles: optional flag to print or not a first line with headers

    Returns the CSV contents as a str when fname is None, else None.
    Raises TypeError if node is not a corpus node.

    Rows are written in this order: map terms, then main terms that are
    not in the map, then stop terms.

    # ID , term , nwords , list_type , grouped_id|grouped_id...
    1622 textile 1 main 1623|3397
    3397 textile production 2 main
    3410 possibility 1 stop

    TODO : REFACTOR split list logic from corpus logic
           => possibility to act on one list
    """

    # the node arg has to be a corpus here
    if not hasattr(node, "typename") or node.typename != "CORPUS":
        raise TypeError("EXPORT: node argument must be a Corpus Node")

    # the nodes holding the lists
    # ---------------------------
    stoplist_node = node.children("STOPLIST").first()
    mainlist_node = node.children("MAINLIST").first()
    maplist_node = node.children("MAPLIST").first()

    # and the synonym groups
    group_node = node.children("GROUPLIST").first()

    # contents of each list
    # ---------------------
    # rows of ngram infos, ex: [(2562, "myterm", 1), ...]
    stop_ngrams = query_list(stoplist_node.id, details=True).all()
    main_ngrams = query_list(mainlist_node.id, details=True).all()
    map_ngrams = query_list(maplist_node.id, details=True).all()

    # for the grouplist we get couples of ngram_ids
    # ex: [(3544, 2353), (2787, 4032), ...]
    group_ngram_id_couples = query_groups(group_node.id).all()

    # couples regrouped as sets
    # [(x => y1), (x => y2)] >~~~~~~~> [x => {y1,y2}]
    grouped = defaultdict(set)
    for couple in group_ngram_id_couples:
        grouped[couple[0]].add(couple[1])

    # one table of csv rows per list
    # ------------------------------
    map_csv_rows = ngrams_to_csv_rows(map_ngrams,
                                      id_groupings=grouped,
                                      list_type="map")

    stop_csv_rows = ngrams_to_csv_rows(stop_ngrams,
                                       id_groupings=grouped,
                                       list_type="stop")

    # the main list contains the map terms, so they are removed here to
    # avoid exporting them twice.
    # fixed: the test used to compare whole rows against the ids
    # (never equal), so nothing was ever removed
    map_ngram_ids = {ng.id for ng in map_ngrams}
    main_without_map = [ng for ng in main_ngrams
                        if ng.id not in map_ngram_ids]
    miam_csv_rows = ngrams_to_csv_rows(main_without_map,
                                       id_groupings=grouped,
                                       list_type="main")

    # all lists together now
    this_corpus_all_rows = map_csv_rows + miam_csv_rows + stop_csv_rows

    def write_csv(out_file):
        # write the optional header line and all the rows to out_file
        csv_wr = writer(out_file,
                        delimiter=delimiter,
                        quoting=QUOTE_MINIMAL)
        if titles:
            csv_wr.writerow(["oldid", "term", "nwords", "listtype", "subforms"])
        csv_wr.writerows(this_corpus_all_rows)

    # choice of output: string, file or handle
    # ----------------------------------------
    if fname is None:
        # return output as a string
        out_file = StringIO()
        write_csv(out_file)
        print("EXPORT: wrote %i ngrams to CSV string"
              % len(this_corpus_all_rows))
        return out_file.getvalue()

    if isinstance(fname, str):
        # fixed: this branch used to crash on an unbound flag before
        # closing the file; 'with' now closes it even on error, and
        # newline='' lets the csv writer control line endings itself
        with open(fname, 'w', newline='') as out_file:
            write_csv(out_file)
        print("EXPORT: wrote %i ngrams to CSV file '%s'"
              % (len(this_corpus_all_rows), path.abspath(fname)))
    else:
        # straight to the handle we were given (the caller owns it)
        write_csv(fname)
        print("EXPORT: wrote %i ngrams to CSV response handle"
              % len(this_corpus_all_rows))
......@@ -8,118 +8,42 @@ API views for advanced operations on ngrams and ngramlists
"""
from gargantext.util.http import APIView, get_parameters, JsonHttpResponse,\
ValidationException, Http404
from gargantext.util.db import session, aliased, desc, bulk_insert
ValidationException, Http404, HttpResponse
from gargantext.util.db import session, aliased, bulk_insert
from gargantext.util.db_cache import cache
from sqlalchemy import tuple_
from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.lists import UnweightedList, Translations
def _query_list(list_id,
pagination_limit=None, pagination_offset=None,
details=False, scoring_metric_id=None
):
"""
Paginated listing of ngram_ids in a NodeNgram lists.
Works for a mainlist or stoplist or maplist (not grouplists!)
# subroutines that were previously in this module are now in util.XYZ_tools
from gargantext.util.ngramlists_tools import query_list, export_ngramlists
from gargantext.util.group_tools import query_grouped_ngrams
Parameter:
- pagination_limit, pagination_offset
- details: if False, send just the array of ngram_ids
if True, send triples with (ngram_id, term, scoring)
^^^^^^^
- scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
(for details and sorting)
class List(APIView):
"""
if not details:
# simple contents
query = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == list_id)
else:
# detailed contents (terms and some NodeNodeNgram for score)
# NB: score can be undefined (eg ex-subform that now became free)
# ==> we need outerjoin
# and the filter needs to have scoring_metric_id so we do it before
ScoresTable = (session
.query(NodeNodeNgram.score, NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == scoring_metric_id)
.subquery()
)
query = (session
.query(
NodeNgram.ngram_id,
Ngram.terms,
ScoresTable.c.score
)
.join(Ngram, NodeNgram.ngram_id == Ngram.id)
# main filter ----------------------
.filter(NodeNgram.node_id == list_id)
# scores if possible
.outerjoin(ScoresTable,
ScoresTable.c.ngram_id == NodeNgram.ngram_id)
.order_by(desc(ScoresTable.c.score))
)
if pagination_limit:
query = query.limit(pagination_limit)
if pagination_offset:
query = query.offset(pagination_offsets)
return query
def _query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
see already available API query api/nodes/<list_id>?fields[]=ngrams
"""
Listing of "hidden" ngram_ids from the groups
Works only for grouplists
pass
Parameter:
- details: if False, send just the array of ngram_ids
if True, send triples with (ngram_id, term, scoring)
^^^^^^^
deprecated: scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
(for details and sorting)
(no more OCCS counts of subforms)
class CSVLists(APIView):
"""
if not details:
# simple contents
query = session.query(NodeNgramNgram.ngram2_id)
else:
# detailed contents (terms and some NodeNodeNgram for score)
query = (session
.query(
NodeNgramNgram.ngram2_id,
Ngram.terms,
# NodeNodeNgram.score #
)
.join(Ngram, NodeNgramNgram.ngram2_id == Ngram.id)
# .join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
# .filter(NodeNodeNgram.node1_id == scoring_metric_id)
# .order_by(desc(NodeNodeNgram.score))
)
For CSV exports of all lists of a corpus
"""
def get(self, request):
params = get_parameters(request)
corpus_id = int(params.pop("corpus"))
corpus_node = cache.Node[corpus_id]
# main filter
# -----------
query = query.filter(NodeNgramNgram.node_id == groupings_id)
# response is file-like + headers
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename="corpus-%i_gargantext_term_list.csv"' % corpus_id
return query
# fill the response with the data
export_ngramlists(corpus_node, fname=response, titles=True)
return response
class List(APIView):
"""
see already available API query api/nodes/<list_id>?fields[]=ngrams
"""
pass
class GroupChange(APIView):
......@@ -441,7 +365,7 @@ class MapListGlance(APIView):
listmembers = {'maplist':[]} # ngram ids sorted per list name
# infos for all ngrams from maplist
map_ngrams = _query_list(maplist_id, details=True,
map_ngrams = query_list(maplist_id, details=True,
scoring_metric_id= scores_id).all()
# ex: [(8805, 'mean age', 4.0),
......@@ -566,25 +490,25 @@ class ListFamily(APIView):
if "head" in parameters:
# head <=> only mainlist AND only k top ngrams
glance_limit = int(parameters['head'])
mainlist_query = _query_list(mainlist_id, details=True,
mainlist_query = query_list(mainlist_id, details=True,
pagination_limit = glance_limit,
scoring_metric_id= scores_id)
else:
# infos for all ngrams from mainlist
mainlist_query = _query_list(mainlist_id, details=True,
mainlist_query = query_list(mainlist_id, details=True,
scoring_metric_id= scores_id)
# infos for grouped ngrams, absent from mainlist
hidden_ngrams_query = _query_grouped_ngrams(groups_id, details=True,
hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True,
scoring_metric_id= scores_id)
# infos for stoplist terms, absent from mainlist
stop_ngrams_query = _query_list(other_list_ids['stoplist'], details=True,
stop_ngrams_query = query_list(other_list_ids['stoplist'], details=True,
scoring_metric_id=scores_id)
# and for the other lists (stop and map)
# no details needed here, just the member ids
for li in other_list_ids:
li_elts = _query_list(other_list_ids[li], details=False
li_elts = query_list(other_list_ids[li], details=False
).all()
# simple array of ngram_ids
listmembers[li] = [ng[0] for ng in li_elts]
......
......@@ -27,6 +27,11 @@ urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view()
# \
# corpus id
, url(r'^ngramlists/export$', ngramlists.CSVLists.as_view() )
# get a CSV export of the ngramlists of a corpus
# ex: GET ngramlists/export?corpus=43
# TODO : unify to a /api/ngrams?formatted=csv
# (similar to /api/nodes?formatted=csv)
, url(r'^ngramlists/change$', ngramlists.ListChange.as_view() )
# add or remove ngram from a list
......
......@@ -19,3 +19,13 @@
line-height: .85;
margin-bottom: -5px;
}
.exportbtn {
/* border: 1px solid #333 ; */
margin-top:17px ; /* valigns with bootstrap h2 */
}
.btn .glyphicon {
/* glyphicons are always rendered too high within bootstrap buttons */
vertical-align:middle
}
......@@ -41,7 +41,7 @@
{% if corpus %}
<li><a href="/projects/{{project.id}}/corpora/{{corpus.id}}">
<span class="glyphicon glyphicon-file" aria-hidden="true"></span>
{{corpus.name | truncatechars:15}}
{{corpus.name | truncatechars:25}}
</a>
</li>
{% endif %}
......@@ -150,12 +150,32 @@
<br>
<br>
<div class="row">
<h3>
<a href="/projects/{{project.id}}">
<span class="glyphicon glyphicon-book" aria-hidden="true"></span>
{{ project.name | truncatechars:50}}
<div class="col-md-6">
<h3>
<a href="/projects/{{project.id}}">
<span class="glyphicon glyphicon-book" aria-hidden="true"></span>
{{ project.name | truncatechars:50}}
</a>
</h3>
</div>
<!-- export button -->
<div class="col-md-6">
{% if view == 'terms' %}
<a class="btn btn-primary exportbtn pull-right" role="button"
href="/api/ngramlists/export?corpus={{corpus.id}}"
title="Export terms table in CSV">
Export terms table &nbsp; <span class="glyphicon glyphicon-download" aria-hidden="true"></span>
</a>
{% elif view == 'titles' %}
<a class="btn btn-primary exportbtn pull-right" role="button"
href="/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv"
title="Export full corpus in CSV">
Export corpus &nbsp; <span class="glyphicon glyphicon-download" aria-hidden="true"></span>
</a>
</h3>
{% else %}
<!-- TODO export journal table -->
{% endif %}
</div>
</div>
<div class="row">
<div class="col-md-1">
......@@ -167,10 +187,7 @@
</h3>
<h3>
<span class="glyphicon glyphicon-file" aria-hidden="true"></span>
{{ corpus.name | truncatechars:20 }}
<a class="btn btn-primary" role="button" href="/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv">
<span class="glyphicon glyphicon-download" aria-hidden="true"></span>
</a>
{{ corpus.name | truncatechars:30 }}
</h3>
</div>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment