Commit a180ff49 authored by delanoe's avatar delanoe

Merge branch 'romain-goodies' into unstable

parents 0b233ee1 9ad0d542
......@@ -86,7 +86,19 @@ class bulk_insert:
readline = read
def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None, do_stats=False):
"""
Inserts bulk data with an intermediate check on a uniquekey
(ex: Ngram.terms) to see if the row existed before.
If the row already existed we just retrieve its id.
If it didn't exist we create it and retrieve the id.
Returns a dict {uniquekey => id}
Option:
do stats: also returns the number of those that had no previous id
"""
if cursor is None:
db, cursor = get_cursor()
mustcommit = True
......@@ -109,6 +121,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable = model.__tablename__,
uniquecolumn = uniquekey,
))
# insert what has not been found to the real table
cursor.execute('''
INSERT INTO {sourcetable} ({columns})
......@@ -119,6 +132,11 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable = model.__tablename__,
columns = ', '.join(fields),
))
if do_stats:
# remember how many rows we inserted just now
n_new = cursor.rowcount
# retrieve dict associating unique key to id
cursor.execute('''
SELECT source.id, source.{uniquecolumn}
......@@ -130,10 +148,15 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
columns = ', '.join(fields),
))
result = {
# term : new_id
row[1]: row[0] for row in cursor.fetchall()
}
# this is the end!
cursor.execute('DROP TABLE __tmp__')
if mustcommit:
db.commit()
return result
if do_stats:
return result, n_new
else:
return result
"""
Utilities for group management
- query_grouped_ngrams(group_id) to retrieve subforms
- group_union() to join two groupings lists
"""
from gargantext.util.db import session, aliased
from gargantext.models import Ngram, NodeNgramNgram
from igraph import Graph # for group_union
def query_groups(groupings_id, details=False):
    """
    Lists the (mainform, subform) couples of a grouplist,
    aka (ngram1_id, ngram2_id).

    Parameter:
      - details: if False, just send the array of couples
                 if True, send quadruplets with (ngram1_id, term1, ngram2_id, term2)
    """
    if details:
        # detailed contents: both ids plus their term strings
        MainNgram = aliased(Ngram)
        SubNgram = aliased(Ngram)
        query = (
            session.query(
                NodeNgramNgram.ngram1_id,
                MainNgram.terms,
                NodeNgramNgram.ngram2_id,
                SubNgram.terms,
            )
            .join(MainNgram, NodeNgramNgram.ngram1_id == MainNgram.id)
            .join(SubNgram, NodeNgramNgram.ngram2_id == SubNgram.id)
        )
    else:
        # simple contents: just the id couples
        query = session.query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)

    # main filter: restrict to the requested grouplist node
    return query.filter(NodeNgramNgram.node_id == groupings_id)
def query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
    """
    Lists the "hidden" ngram_ids (subforms) recorded in a grouplist.
    Works only for grouplists.

    Parameter:
      - details: if False, send just the array of ngram_ids
                 if True, send couples with (ngram_id, term)
      - scoring_metric_id: deprecated — formerly the id of a scoring metric
                           node (TFIDF or OCCS) used for details and sorting
                           (no more OCCS counts of subforms)
    """
    if details:
        # detailed contents: subform id + its term string
        # (the former NodeNodeNgram score join is deprecated)
        query = (
            session.query(
                NodeNgramNgram.ngram2_id,
                Ngram.terms,
            )
            .join(Ngram, NodeNgramNgram.ngram2_id == Ngram.id)
        )
    else:
        # simple contents: just the subform ids
        query = session.query(NodeNgramNgram.ngram2_id)

    # main filter: restrict to the requested grouplist node
    return query.filter(NodeNgramNgram.node_id == groupings_id)
def group_union(g_a_links, g_b_links):
    """
    Joins two lists of synonym links into one new list of links.

    Synonym groups are modelled by sets of couples in the DB.
      Input:  2 arrays of links (ngramx_id, ngramy_id)
      Output: 1 array of links (ngramx_id, ngramy_id)

    Synonymity is considered transitive so in effect the groups
    can form a set (defined by the connected component of couples).
    A requested feature is also that one node dominates the others
    (aka "leader effect"; the leader will be in the map, the others won't).

    Summary of major union effects in various cases:

      GROUP 1    Group 2      Group 1 ∪ 2
      A -> B     A -> C       A -> B      (simple union)
                              A -> C

      D -> E     E -> F       D -> E
                              D -> F      (D "leader effect")

      G -> H     G -> I       G -> H      (transitivity +
      H -> J                  G -> I       "leader effect")
                              G -> J

    rloth: this is some slightly amended code
           from Samuel's in rest_v1_0.ngrams.Group.get

    TODO use "most frequent" score if leader candidates are ex aequo by degree.
    """
    # output: list of links forming the new merged group
    new_links = []

    # 1) create graph with both lists
    # -------------------------------
    # the set of all our ngram_ids
    all_vertices = set(
        [ngid for couple in g_a_links+g_b_links for ngid in couple]
    )

    # initialize the synonym graph with its final vertex count
    sg = Graph(len(all_vertices), directed=True)

    # add our IDs as "name" (special igraph attribute usable in edge creation)
    sg.vs['name'] = [str(x) for x in all_vertices]

    # add the first batch of edges as named couples
    sg.add_edges([(str(x),str(y)) for (x,y) in g_a_links])
    #print('UNION A:', g_a_links)
    #print('initially %i components' % len(sg.as_undirected().components()))

    # same with the other edges
    sg.add_edges([(str(x),str(y)) for (x,y) in g_b_links])
    #print('UNION B:', g_b_links)
    #print('after union %i components' % len(sg.as_undirected().components()))

    # 2) list resulting components
    # -----------------------------
    # undirected components give the transitive synonym classes
    synonym_components = sg.as_undirected().components()
    # for example
    # cs = [[0, 3, 6], [1, 2, 8], [4, 5, 9, 11], [7,10]]
    # there should be no singletons by construction

    # list of all outdegrees for "leader" detection
    # (leader = term most often marked as source by the users)
    odegs = sg.outdegree()

    #for i, v in enumerate(sg.vs):
    #    print("%i - name:%s - odeg:%i" % (i, v['name'], odegs[i]))

    for component in synonym_components:
        # we map back to our ids (via the vertex "name"), preserving order
        our_comp = [int(our_id) for our_id in sg.vs[component]['name']]

        # 3) take main node and unnest into new links list
        # -------------------------------------------------
        # position (within this component) of the best node (by outdegree)
        max_odeg = -1
        main_node_local_index = None
        for position, vertex_id in enumerate(component):
            this_odeg = odegs[vertex_id]
            if this_odeg > max_odeg:
                main_node_local_index = position
                max_odeg = this_odeg

        # we set the leader aside in our translated version our_comp
        main_node = our_comp.pop(main_node_local_index)

        # and unnest the others as (leader, subform) couples
        for remaining_id in our_comp:
            new_links.append((main_node, remaining_id))

    return new_links
"""
Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
- query_list(list_id) to retrieve ngrams
- export_ngramlists(corpus_node)
- import_ngramlists(corpus_node)
- merge_ngramlists(new_lists, onto_corpus = corpus_node)
"""
from gargantext.util.group_tools import query_groups, group_union
from gargantext.util.db import session, desc, func, \
bulk_insert_ifnotexists
from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, \
NodeNgramNgram
from gargantext.util.lists import UnweightedList, Translations
# import will implement the same text cleaning procedures as toolchain
from gargantext.util.toolchain.parsing import normalize_chars
from gargantext.util.toolchain.ngrams_extraction import normalize_terms
from sqlalchemy.sql import exists
from os import path
from csv import writer, reader, QUOTE_MINIMAL
from collections import defaultdict
from re import match
from io import StringIO # pseudo file to write CSV to memory
def query_list(list_id,
               pagination_limit=None, pagination_offset=None,
               details=False, scoring_metric_id=None, groupings_id=None
               ):
    """
    Paginated listing of ngram_ids in a NodeNgram list.
    Works for a mainlist or stoplist or maplist (not grouplists!)

    Parameters:
      - pagination_limit, pagination_offset
      - details: if False, send just the array of ngram_ids
                 if True and no scoring, send couples with (ngram_id, term)
                 if True and a scoring_id, send triples with (ngram_id, term, scoring)
      - scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
                           (for details and sorting)
      - groupings_id: optional id of a list of grouping relations (synonyms)
                      (each synonym will be added to the list if not already in there)

    FIXME: subforms appended recently and not generalized enough
            => add a common part for all "if groupings_id"
            => provide the option also in combination with scoring
    """
    # simple contents
    if not details:
        query = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == list_id)

        if groupings_id:
            subforms = (session.query(NodeNgramNgram.ngram2_id)
                        # subform ids...
                        .filter(NodeNgramNgram.node_id == groupings_id)
                        # .. that are connected to a mainform
                        .join(NodeNgram,
                              NodeNgram.ngram_id == NodeNgramNgram.ngram1_id)
                        # .. which is in the list
                        .filter(NodeNgram.node_id == list_id)
                        )
            # union with the main query
            query = query.union(subforms)

    # detailed contents (id + terms)
    elif not scoring_metric_id:
        query = (session.query(Ngram.id, Ngram.terms, Ngram.n)
                 .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                 .filter(NodeNgram.node_id == list_id)
                 )
        if groupings_id:
            subforms = (session.query(Ngram.id, Ngram.terms, Ngram.n)
                        .join(NodeNgramNgram,
                              NodeNgramNgram.ngram2_id == Ngram.id)
                        # subform ids...
                        .filter(NodeNgramNgram.node_id == groupings_id)
                        # .. that are connected to a mainform
                        .join(NodeNgram,
                              NodeNgram.ngram_id == NodeNgramNgram.ngram1_id)
                        # .. which is in the list
                        .filter(NodeNgram.node_id == list_id)
                        )
            # union with the main query
            query = query.union(subforms)

    # detailed contents (id + terms) + score
    else:
        # NB: score can be undefined (eg ex-subform that now became free)
        #     ==> we need outerjoin
        #     and the filter needs scoring_metric_id so we do it beforehand
        ScoresTable = (session
                       .query(NodeNodeNgram.score, NodeNodeNgram.ngram_id)
                       .filter(NodeNodeNgram.node1_id == scoring_metric_id)
                       .subquery()
                       )
        query = (session
                 .query(
                     NodeNgram.ngram_id,
                     Ngram.terms,
                     ScoresTable.c.score
                 )
                 .join(Ngram, NodeNgram.ngram_id == Ngram.id)
                 # main filter ----------------------
                 .filter(NodeNgram.node_id == list_id)
                 # scores if possible
                 .outerjoin(ScoresTable,
                            ScoresTable.c.ngram_id == NodeNgram.ngram_id)
                 .order_by(desc(ScoresTable.c.score))
                 )

    if pagination_limit:
        query = query.limit(pagination_limit)
    if pagination_offset:
        # FIX: was `pagination_offsets` (undefined name => NameError
        #      whenever an offset was actually requested)
        query = query.offset(pagination_offset)
    return query
# helper func for exports
def ngrams_to_csv_rows(ngram_objs, id_groupings=None, list_type=""):
    """
    Outputs a basic info table per ngram, as a csv_rows matrix
    (a list of lists) ready for csv.writer.writerows(csv_rows):
        [
          [line1_colA, line1_colB..],
          [line2_colA, line2_colB..],
          ..
        ]
    Each row has 5 columns:
        (ng_id, term string, term size, list_type, grouped subforms "4|42")

    @param ngram_objs: an array of ngrams (eg: from a db query.all())
                       each obj exposing .id, .terms and .n
    @param id_groupings: optional dict of sets
                         {mainform_id : {subform_idA, subform_idB, etc}}
                         (FIX: default was a shared mutable `{}` literal)
    @param list_type: a str 'map', 'main' or 'stop' to fill in col 4
    """
    if id_groupings is None:
        id_groupings = {}
    # transcribe the ngram objects into an array (list of lists)
    csv_rows = []
    for ng_obj in ngram_objs:
        ng_id = ng_obj.id
        # subform ids joined with '|', or empty when the ngram has no group
        if ng_id in id_groupings:
            this_grouped = "|".join(str(gid) for gid in id_groupings[ng_id])
        else:
            this_grouped = ""
        # transcription: 5 columns
        # ID , term , n , list_type , grouped_id|grouped_id...
        csv_rows.append(
            [ng_id, ng_obj.terms, ng_obj.n, list_type, this_grouped]
        )
    return csv_rows
def export_ngramlists(node, fname=None, delimiter="\t", titles=False):
    """
    Export of the 3 lists under a corpus node (MAP, MAIN, STOP)
    with local combination of groups.

    @param node: the corpus node
    @param fname: optional output target:
                   - None       => returns a str with the CSV contents
                   - str        => filename of a CSV file to write
                   - file-like  => handle written to directly (e.g. HTTP response)
    @param delimiter: optional column separator in the CSV
                      (if absent defaults to tabulation)
    @param titles: optional flag to print or not a first line with headers

      # ID , term , nwords , list_type , grouped_id|grouped_id...
      1622   textile               1   main   1623|3397
      3397   textile production    2   main
      3410   possibility           1   stop

    TODO : REFACTOR split list logic from corpus logic
           => possibility to act on one list
    """
    # the node arg has to be a corpus here
    if not hasattr(node, "typename") or node.typename != "CORPUS":
        raise TypeError("EXPORT: node argument must be a Corpus Node")

    # the nodes covering the lists
    # -----------------------------
    stoplist_node = node.children("STOPLIST").first()
    mainlist_node = node.children("MAINLIST").first()
    maplist_node = node.children("MAPLIST").first()
    # and the synonym groups
    group_node = node.children("GROUPLIST").first()

    # corresponding lists of ngram objects, eg [(2562, "monterme", 1), ...]
    # ---------------------------------------------------------------------
    stop_ngrams = query_list(stoplist_node.id, details=True, groupings_id=group_node.id).all()
    main_ngrams = query_list(mainlist_node.id, details=True, groupings_id=group_node.id).all()
    map_ngrams = query_list(maplist_node.id, details=True, groupings_id=group_node.id).all()

    # for the group_list we have couples of ngram_ids
    # -----------------------------------------------
    # ex: [(3544, 2353), (2787, 4032), ...]
    group_ngram_id_couples = query_groups(group_node.id).all()

    # couples folded into a dict of sets
    # [(x => y1), (x => y2)] >~~~~~~~> {x: {y1, y2}}
    grouped = defaultdict(set)
    for ngram in group_ngram_id_couples:
        grouped[ngram[0]].add(ngram[1])

    # apply ngrams_to_csv_rows on each list
    # --------------------------------------
    map_csv_rows = ngrams_to_csv_rows(map_ngrams,
                                      id_groupings=grouped,
                                      list_type="map")
    stop_csv_rows = ngrams_to_csv_rows(stop_ngrams,
                                       id_groupings=grouped,
                                       list_type="stop")
    # main ("miam") contains map, so remove the map elements beforehand
    map_ngram_ids = {ng.id for ng in map_ngrams}
    main_without_map = [ng for ng in main_ngrams if ng.id not in map_ngram_ids]
    miam_csv_rows = ngrams_to_csv_rows(main_without_map,
                                       id_groupings=grouped,
                                       list_type="main")

    # all lists together now
    this_corpus_all_rows = map_csv_rows + miam_csv_rows + stop_csv_rows

    # choice of output: memory string, named file, or caller-provided handle
    # FIX: straight_to_handle must be initialized on every branch
    #      (previously unbound => NameError when fname was a filename str)
    straight_to_handle = False
    if fname is None:
        out_file = StringIO()
    elif isinstance(fname, str):
        out_file = open(fname, 'w')
    else:
        straight_to_handle = True
        out_file = fname

    # csv.writer()
    csv_wr = writer(out_file,
                    delimiter=delimiter,
                    quoting=QUOTE_MINIMAL)
    if titles:
        csv_wr.writerow(["oldid", "term", "nwords", "listtype", "subforms"])

    # write to outfile
    csv_wr.writerows(this_corpus_all_rows)

    if fname is None:
        # return output as a string
        print("EXPORT: wrote %i ngrams to CSV string"
              % len(this_corpus_all_rows))
        return out_file.getvalue()
    elif straight_to_handle:
        print("EXPORT: wrote %i ngrams to CSV response handle"
              % len(this_corpus_all_rows))
    else:
        # just close output file
        out_file.close()
        print("EXPORT: wrote %i ngrams to CSV file '%s'"
              % (len(this_corpus_all_rows), path.abspath(fname)))
def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
    '''
    This function reads a CSV of an ngrams table for a Corpus,
    then it converts old ngram_ids to those of the current DB
    (and adds to DB any unknown ngrams)
    then recreates an equivalent set of MAINLIST, MAPLIST, STOPLIST + GROUPS.

    Input example:
      oldid | term          |nwords| ltype |group_oldids
      ------+---------------+------+-------+---------------
       3842   water table      2     map    3724
       3724   water tables     2     map
       4277   water supply     2     map    190362|13415
      13415   water supplies   2     map
     190362   water-supply     1     map
      20489   wastewater       1     map

    Output: 3 x UnweightedList + 1 x Translations

    @param fname            a local filename or a filehandle-like
    @param delimiter        a character used as separator in the CSV
    @param group_delimiter  a character used as grouped subforms separator
                            (in the last column)

    The conversion of old_id to ngram_id works in 2 steps:
      => look up each term str in the DB with bulk_insert_ifnotexists
         (creates absent ngrams if necessary)
      => use the new ids to map the relations involving the old ones

    NB: the creation of MAINLIST also adds all elements from the MAPLIST
    NB: To merge the imported lists into a corpus node's lists,
        chain this function with merge_ngramlists()
    '''
    # --------------
    # main storage for the ngrams by list
    import_nodes_ngrams = {'stop': [], 'main': [], 'map': []}

    # separate storage for the terms' couples [(term str, nwords int),...]
    imported_ngrams_dbdata = []

    # and all the old ids, by term (for id lookup after dbdata bulk_insert)
    imported_ngrams_oldids = {}

    # and for the imported grouping, a list of couples [(x1,y1),(x1,y2),(x2,y3),..]
    imported_groupings = []
    # /!\ imported_groupings contains only external ids (aka oldids)
    #     (ie imported ids that will have to be translated to target db ids)

    # skipped lines can (very rarely) be used in groups => mark as ignored
    ignored_oldids = []

    # =============== READ CSV ===============
    if isinstance(fname, str):
        # FIX: open in binary mode so the .decode("UTF-8") below works
        #      uniformly for filenames and for uploaded byte handles
        #      (text-mode read() returns str, which has no .decode)
        fh = open(fname, "rb")
    elif callable(getattr(fname, "read", None)):
        fh = fname
    else:
        # FIX: was `type(fh)` — fh is unbound here, so the intended
        #      TypeError was masked by a NameError
        raise TypeError("IMPORT: fname argument has unknown type %s" % type(fname))

    # reading all directly b/c csv.reader takes only lines or a real fh in bytes
    # and we usually have a "false" fh (uploadedfile.InMemoryUploadedFile) in strings
    # (but we checked its size before!)
    raw = fh.read()
    if isinstance(raw, bytes):
        raw = raw.decode("UTF-8")
    contents = raw.split("\n")

    # end of CSV read
    fh.close()

    # <class 'django.core.files.uploadedfile.InMemoryUploadedFile'>
    ngrams_csv_rows = reader(contents,
                             delimiter=delimiter,
                             quoting=QUOTE_MINIMAL
                             )
    # for stats
    n_read_lines = 0
    n_total_ng = 0
    n_added_ng = 0
    n_group_relations = 0

    # load CSV + initial checks
    for i, csv_row in enumerate(ngrams_csv_rows):
        # fyi
        n_read_lines += 1
        # print("---------------READ LINE %i" % i)
        if not len(csv_row):
            continue

        try:
            this_ng_oldid = str(csv_row[0])
            this_ng_term = str(csv_row[1])
            this_ng_nwords = int(csv_row[2])
            this_list_type = str(csv_row[3])
            this_ng_group = str(csv_row[4])
            # string normalizations
            this_ng_term = normalize_terms(normalize_chars(this_ng_term))
        # FIX: narrowed from a bare `except:` which also swallowed
        #      KeyboardInterrupt/SystemExit
        except Exception:
            if i == 0:
                print("IMPORT WARN: (skip line) probable header line at CSV %s:l.0" % fname)
                continue
            else:
                raise ValueError("Error on CSV read line %i" % n_read_lines)

        # --- check format before any old ID retrieve
        if not match(r"\d+$", this_ng_oldid):
            print("IMPORT WARN: (skip line) bad ID at CSV %s:l.%i" % (fname, i))
            continue
        else:
            this_ng_oldid = int(this_ng_oldid)

        # --- term checking
        if not len(this_ng_term) > 0:
            print("IMPORT WARN: (skip line) empty term at CSV %s:l.%i" % (fname, i))
            ignored_oldids.append(this_ng_oldid)
            continue

        # --- check if not a duplicate string
        if this_ng_term in imported_ngrams_oldids:
            ignored_oldids.append(this_ng_oldid)
            print("IMPORT WARN: (skip line) term appears more than once (previous id: %i) at CSV %s:l.%i"
                  % (imported_ngrams_oldids[this_ng_term], fname, i))
            continue

        # --- check correct list type
        if this_list_type not in ['stop', 'main', 'map']:
            ignored_oldids.append(this_ng_oldid)
            print("IMPORT WARN: (skip line) wrong list type at CSV %s:l.%i" % (fname, i))
            continue

        # ================= Store the data ====================
        # the ngram data
        imported_ngrams_dbdata.append([this_ng_term, this_ng_nwords])
        imported_ngrams_oldids[this_ng_term] = this_ng_oldid
        # and the "list to ngram" relation
        import_nodes_ngrams[this_list_type].append(this_ng_oldid)

        # ====== Store synonyms from the import (if any) ======
        if len(this_ng_group) != 0:
            # FIX: honour the group_delimiter parameter (was hardcoded '|')
            group_as_external_ids = this_ng_group.split(group_delimiter)
            for external_subform_id in group_as_external_ids:
                external_subform_id = int(external_subform_id)
                imported_groupings.append(
                    (this_ng_oldid, external_subform_id)
                )

    # ======== ngram save + id lookup =========
    n_total_ng = len(imported_ngrams_dbdata)

    # returns a dict {term => id} and a count of inserted ones
    (new_ngrams_ids, n_added_ng) = bulk_insert_ifnotexists(
        model=Ngram,
        uniquekey='terms',
        fields=('terms', 'n'),
        data=imported_ngrams_dbdata,
        do_stats=True
    )
    del imported_ngrams_dbdata

    # loop on old ngrams and create direct mapping old_id => new_id
    old_to_new_id_map = {}
    for term, oldid in imported_ngrams_oldids.items():
        old_to_new_id_map[oldid] = new_ngrams_ids[term]
    del new_ngrams_ids
    del imported_ngrams_oldids

    # ======== Import into lists =========
    # 3 x abstract lists + 1 translations
    result = {
        'map': UnweightedList(),
        'main': UnweightedList(),
        'stop': UnweightedList(),
        'groupings': Translations()
    }
    for list_type in import_nodes_ngrams:
        for old_id in import_nodes_ngrams[list_type]:
            new_id = old_to_new_id_map[old_id]
            # add to the abstract list
            result[list_type].items.add(new_id)
        # for main also add map elements
        if list_type == 'main':
            for old_id in import_nodes_ngrams['map']:
                new_id = old_to_new_id_map[old_id]
                result['main'].items.add(new_id)

    # ======== Synonyms =========
    for (x, y) in imported_groupings:
        if (x not in ignored_oldids) and (y not in ignored_oldids):
            new_mainform_id = old_to_new_id_map[x]
            new_subform_id = old_to_new_id_map[y]
            # /!\ Translations use (subform => mainform) order
            result['groupings'].items[new_subform_id] = new_mainform_id
            n_group_relations += 1

    # ------------------------------------------------------------------
    print("IMPORT: read %i lines from the CSV" % n_read_lines)
    print("IMPORT: read %i terms (%i added and %i already existing)"
          % (n_total_ng, n_added_ng, n_total_ng - n_added_ng))
    print("IMPORT: read %i grouping relations" % n_group_relations)
    return result
def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
    """
    Integrates an external terms table to the current one:
      - merges groups (using group_union() function)
      - resolves conflicts if terms belong in different lists
          > map wins over both other types
          > main wins over stop
          > stop never wins

    @param new_lists: a dict of *new* imported lists with format:
                        {'stop': UnweightedList,
                         'main': UnweightedList,
                         'map': UnweightedList,
                         'groupings': Translations }
    @param onto_corpus: a corpus node to get the *old* lists
    @param del_originals: an array of original wordlists to ignore
                          and delete during the merge
                          possible values : ['stop','main','map']
                          for instance:
                            del_originals = ['stop','main'] => will delete the
                            stoplist and the mainlist, but not the maplist,
                            which will be merged (the maplist elements are
                            copied back into main at the end)

    NB: Uses group_tools.group_union() to merge the synonym links.

    FIXME: new terms created at import_ngramlists() can now be added to lists
           but are never added to docs
    """
    # log to send back to client-side (lines will be joined)
    my_log = []

    # the tgt node arg has to be a corpus here
    if not hasattr(onto_corpus, "typename") or onto_corpus.typename != "CORPUS":
        raise TypeError("IMPORT: 'onto_corpus' argument must be a Corpus Node")

    # for stats (NOTE(review): currently never incremented, kept as-is)
    added_nd_ng = 0  # number of added list elements

    # our list shortcuts will be 0,1,2 (aka lid)
    # by order of precedence
    linfos = [
        {'key': 'stop', 'name':"STOPLIST"},    # lid = 0
        {'key': 'main', 'name':"MAINLIST"},    # lid = 1
        {'key': 'map',  'name':"MAPLIST"}      # lid = 2
    ]

    # ======== Get the old lists =========
    old_lists = {}

    # DB nodes stored with same indices 0,1,2 (resp. stop, miam and map)
    # find target ids of the list node objects
    tgt_nodeids = [
        onto_corpus.children("STOPLIST").first().id,
        onto_corpus.children("MAINLIST").first().id,
        onto_corpus.children("MAPLIST").first().id
    ]
    old_group_id = onto_corpus.children("GROUPLIST").first().id

    # retrieve old data into old_lists[list_type]...
    # ----------------------------------------------
    for lid, linfo in enumerate(linfos):
        list_type = linfo['key']
        if list_type not in del_originals:
            # NB can't use UnweightedList(tgt_nodeids[lid])
            #    because we need to include out-of-list subforms
            list_ngrams_q = query_list(tgt_nodeids[lid],
                                       groupings_id=old_group_id)
            old_lists[list_type] = UnweightedList(list_ngrams_q.all())
        else:
            # ...or use empty objects if replacing old list
            # ----------------------------------------------
            old_lists[list_type] = UnweightedList()
            msg = "MERGE: ignoring old %s which will be overwritten" % linfo['name']
            print(msg)
            my_log.append(msg)

    # ======== Merging all involved ngrams =========
    # all memberships with resolved conflicts of interfering memberships
    resolved_memberships = {}

    for list_set in [old_lists, new_lists]:
        for lid, info in enumerate(linfos):
            list_type = info['key']
            # we use the fact that lids are ordered ints...
            for ng_id in list_set[list_type].items:
                if ng_id not in resolved_memberships:
                    resolved_memberships[ng_id] = lid
                else:
                    # ...now resolving is simply taking the max
                    #    stop < main < map
                    resolved_memberships[ng_id] = max(
                        lid,
                        resolved_memberships[ng_id]
                    )
    # now each ngram is only in its most important list
    # -------------------------------------------------
    # NB temporarily map items are not in main anymore
    #    but we'll copy it at the end
    # NB temporarily all subforms were treated separately
    #    from mainforms but we'll force them into same list
    #    after we merge the groups
    del old_lists
    del new_lists['stop']
    del new_lists['main']
    del new_lists['map']

    # ======== Merging old and new groups =========
    # get the arcs already in the target DB (directed couples)
    previous_links = session.query(
        NodeNgramNgram.ngram1_id,
        NodeNgramNgram.ngram2_id
    ).filter(
        NodeNgramNgram.node_id == old_group_id
    ).all()
    n_links_previous = len(previous_links)

    # same format for the new arcs (Translations ~~~> array of couples)
    translated_imported_links = []
    add_link = translated_imported_links.append
    n_links_added = 0
    # Translations items map subform => mainform, hence the (y,x) unpack
    for (y,x) in new_lists['groupings'].items.items():
        add_link((x,y))
        n_links_added += 1
    del new_lists

    # group_union: joins 2 different synonym-links lists into 1 new list
    new_links = group_union(previous_links, translated_imported_links)
    del previous_links
    del translated_imported_links
    n_links_after = len(new_links)

    # back to (subform => mainform) order for the Translations object
    merged_group = Translations([(y,x) for (x,y) in new_links])
    del new_links

    # ======== Overwrite old group data with new =========
    merged_group.save(old_group_id)

    msg = "MERGE: groupings %i updated (links before/added/after: %i/%i/%i)" % (old_group_id, n_links_previous, n_links_added, n_links_after)
    my_log.append(msg)
    print(msg)

    # ======== Target list(s) append data =========
    # if list 2 => write in both tgt_data_lists [1,2]
    # lists 0 or 1 => straightforward targets [0] or [1]
    merged_results = {
        'stop': UnweightedList(),
        'main': UnweightedList(),
        'map': UnweightedList()
    }
    for (ng_id, winner_lid) in resolved_memberships.items():
        ## 1) using the new groups
        # normal case if not a subform
        if ng_id not in merged_group.items:
            target_lid = winner_lid
        # inherit case if is a subform
        else:
            mainform_id = merged_group.items[ng_id]
            # inherited winner
            try:
                target_lid = resolved_memberships[mainform_id]
            except KeyError:
                target_lid = winner_lid
                print("MERGE: WARN ng_id %i has incorrect mainform %i ?" % (ng_id, mainform_id))

        ## 2) map => map + main
        if target_lid == 2:
            todo_lids = [1,2]
        else:
            todo_lids = [target_lid]

        ## 3) storage
        for lid in todo_lids:
            list_type = linfos[lid]['key']
            merged_results[list_type].items.add(ng_id)

    # print("IMPORT: added %i elements in the lists indices" % added_nd_ng)

    # ======== Overwrite old list data with new =========
    for lid, info in enumerate(linfos):
        tgt_id = tgt_nodeids[lid]
        list_type = info['key']
        result = merged_results[list_type]
        result.save(tgt_id)
        msg = "MERGE: %s %i updated (new size: %i)" % (info['name'],tgt_id, len(merged_results[list_type].items))
        my_log.append(msg)
        print(msg)

    # return a log
    return("\n".join(my_log))
......@@ -9,6 +9,9 @@ from re import sub
from gargantext.util.scheduling import scheduled
def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
"""
@param ngrams_data a set like {('single word', 2), ('apple', 1),...}
"""
print('INTEGRATE')
# integrate ngrams
ngrams_ids = bulk_insert_ifnotexists(
......
......@@ -8,118 +8,88 @@ API views for advanced operations on ngrams and ngramlists
"""
from gargantext.util.http import APIView, get_parameters, JsonHttpResponse,\
ValidationException, Http404
from gargantext.util.db import session, aliased, desc, bulk_insert
ValidationException, Http404, HttpResponse
from gargantext.util.db import session, aliased, bulk_insert
from gargantext.util.db_cache import cache
from sqlalchemy import tuple_
from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.lists import UnweightedList, Translations
# useful subroutines
from gargantext.util.ngramlists_tools import query_list, export_ngramlists, \
import_ngramlists, merge_ngramlists
from gargantext.util.group_tools import query_grouped_ngrams
def _query_list(list_id,
pagination_limit=None, pagination_offset=None,
details=False, scoring_metric_id=None
):
class List(APIView):
"""
see already available API query api/nodes/<list_id>?fields[]=ngrams
"""
Paginated listing of ngram_ids in a NodeNgram lists.
pass
Works for a mainlist or stoplist or maplist (not grouplists!)
Parameter:
- pagination_limit, pagination_offset
- details: if False, send just the array of ngram_ids
if True, send triples with (ngram_id, term, scoring)
^^^^^^^
- scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
(for details and sorting)
class CSVLists(APIView):
"""
if not details:
# simple contents
query = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == list_id)
else:
# detailed contents (terms and some NodeNodeNgram for score)
# NB: score can be undefined (eg ex-subform that now became free)
# ==> we need outerjoin
# and the filter needs to have scoring_metric_id so we do it before
ScoresTable = (session
.query(NodeNodeNgram.score, NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == scoring_metric_id)
.subquery()
)
query = (session
.query(
NodeNgram.ngram_id,
Ngram.terms,
ScoresTable.c.score
)
.join(Ngram, NodeNgram.ngram_id == Ngram.id)
# main filter ----------------------
.filter(NodeNgram.node_id == list_id)
# scores if possible
.outerjoin(ScoresTable,
ScoresTable.c.ngram_id == NodeNgram.ngram_id)
.order_by(desc(ScoresTable.c.score))
)
For CSV exports of all lists of a corpus
if pagination_limit:
query = query.limit(pagination_limit)
Or CSV import into existing lists as "patch"
"""
def get(self, request):
params = get_parameters(request)
corpus_id = int(params.pop("corpus"))
corpus_node = cache.Node[corpus_id]
if pagination_offset:
query = query.offset(pagination_offsets)
# response is file-like + headers
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename="corpus-%i_gargantext_term_list.csv"' % corpus_id
return query
# fill the response with the data
export_ngramlists(corpus_node, fname=response, titles=True)
return response
def post(self,request):
"""
Merge the lists of a corpus with other lists from a CSV source
or from another corpus
params in request.GET:
corpus: the corpus whose lists are getting patched
params in request.FILES:
csvsource: the csv file
def _query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
"""
Listing of "hidden" ngram_ids from the groups
or in get
dbsource: another corpus instead of the csvfile
(? this last option should perhaps not be in CSVLists ?)
Works only for grouplists
NB: not using PATCH because we'll need POST file upload
Parameter:
- details: if False, send just the array of ngram_ids
if True, send triples with (ngram_id, term, scoring)
^^^^^^^
deprecated: scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
(for details and sorting)
(no more OCCS counts of subforms)
"""
if not details:
# simple contents
query = session.query(NodeNgramNgram.ngram2_id)
else:
# detailed contents (terms and some NodeNodeNgram for score)
query = (session
.query(
NodeNgramNgram.ngram2_id,
Ngram.terms,
# NodeNodeNgram.score #
)
.join(Ngram, NodeNgramNgram.ngram2_id == Ngram.id)
# .join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
# .filter(NodeNodeNgram.node1_id == scoring_metric_id)
# .order_by(desc(NodeNodeNgram.score))
)
/!\ We assume we checked the file size client-side before upload
# main filter
# -----------
query = query.filter(NodeNgramNgram.node_id == groupings_id)
£TODO check authentication and user.id
"""
# this time the corpus param is the one with the target lists to be patched
params = get_parameters(request)
corpus_id = int(params.pop("onto_corpus"))
corpus_node = cache.Node[corpus_id]
return query
# request also contains the file
# csv_file has type django.core.files.uploadedfile.InMemoryUploadedFile
# ----------------------
csv_file = request.data['csvfile']
# import the csv
new_lists = import_ngramlists(csv_file)
del csv_file
# merge the new_lists onto those of the target corpus
log_msg = merge_ngramlists(new_lists, onto_corpus=corpus_node)
return JsonHttpResponse({
'log': log_msg,
}, 200)
class List(APIView):
    """
    Placeholder endpoint for a single list.

    A list's contents are already reachable through the generic
    node API: see api/nodes/<list_id>?fields[]=ngrams
    """
    pass
class GroupChange(APIView):
......@@ -441,7 +411,7 @@ class MapListGlance(APIView):
listmembers = {'maplist':[]} # ngram ids sorted per list name
# infos for all ngrams from maplist
map_ngrams = _query_list(maplist_id, details=True,
map_ngrams = query_list(maplist_id, details=True,
scoring_metric_id= scores_id).all()
# ex: [(8805, 'mean age', 4.0),
......@@ -566,25 +536,25 @@ class ListFamily(APIView):
if "head" in parameters:
# head <=> only mainlist AND only k top ngrams
glance_limit = int(parameters['head'])
mainlist_query = _query_list(mainlist_id, details=True,
mainlist_query = query_list(mainlist_id, details=True,
pagination_limit = glance_limit,
scoring_metric_id= scores_id)
else:
# infos for all ngrams from mainlist
mainlist_query = _query_list(mainlist_id, details=True,
mainlist_query = query_list(mainlist_id, details=True,
scoring_metric_id= scores_id)
# infos for grouped ngrams, absent from mainlist
hidden_ngrams_query = _query_grouped_ngrams(groups_id, details=True,
hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True,
scoring_metric_id= scores_id)
# infos for stoplist terms, absent from mainlist
stop_ngrams_query = _query_list(other_list_ids['stoplist'], details=True,
stop_ngrams_query = query_list(other_list_ids['stoplist'], details=True,
scoring_metric_id=scores_id)
# and for the other lists (stop and map)
# no details needed here, just the member ids
for li in other_list_ids:
li_elts = _query_list(other_list_ids[li], details=False
li_elts = query_list(other_list_ids[li], details=False
).all()
# simple array of ngram_ids
listmembers[li] = [ng[0] for ng in li_elts]
......
......@@ -27,6 +27,15 @@ urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view()
# \
# corpus id
, url(r'^ngramlists/export$', ngramlists.CSVLists.as_view() )
# get a CSV export of the ngramlists of a corpus
# ex: GET ngramlists/export?corpus=43
# TODO : unify to a /api/ngrams?formatted=csv
# (similar to /api/nodes?formatted=csv)
, url(r'^ngramlists/import$', ngramlists.CSVLists.as_view() )
# same handling class as export (CSVLists)
# but this route used only for POST + file
, url(r'^ngramlists/change$', ngramlists.ListChange.as_view() )
# add or remove ngram from a list
......
......@@ -33,6 +33,9 @@ def ngramtable(request, project_id, corpus_id):
'project': project,
'corpus' : corpus,
'resourcename' : resourcename(corpus),
'view': 'terms'
'view': 'terms',
# for the CSV import modal
'csvimportroute': "/api/ngramlists/import?onto_corpus=%i"% corpus.id
},
)
......@@ -11,6 +11,7 @@ django-pgfields==1.4.4
django-pgjsonb==0.0.16
djangorestframework==3.3.2
html5lib==0.9999999
python-igraph>=0.7.1
jdatetime==1.7.2
kombu==3.0.33 # messaging
nltk==3.1
......
......@@ -19,3 +19,13 @@
line-height: .85;
margin-bottom: -5px;
}
.exportbtn {
    /* border: 1px solid #333 ; */
    margin-top: 17px;   /* valigns with bootstrap h2 */
}

.btn .glyphicon {
    /* glyphicons are always rendered too high within bootstrap buttons */
    /* trailing ';' added so appending a declaration later can't silently break */
    vertical-align: middle;
}
......@@ -72,6 +72,15 @@
<button id="Save_All" class="btn btn-muted" disabled style="font-size:120%">
<b>Save all changes</b>
</button>
<br/>
<br/>
<!-- import icon -->
<span class="needsaveicon glyphicon glyphicon-import"></span>
&nbsp;
<button id="ImportList" class="btn btn-warning" style="font-size:120%"
onclick="$('#csvimport').modal('show');">
<b>Import a Termlist</b>
</button>
</div>
<!-- see in javascript function queries.functions['my_state_filter'] -->
<div class="pull-right" style="margin-top:2.1em;padding-left:1em;">
......@@ -107,25 +116,110 @@
</div> <!-- /div panel -->
</div> <!-- /jumbotron -->
<!--
<button id="ImportList" onclick="GetUserPortfolio();" class="btn btn-warning">
Import a Corpus-List
</button>
-->
<!--</div> This div is closed in the menu !-->
<!--</div> This div is closed in the menu !-->
<!--
# stub to import a list (aka orange button)
<button id="ImportList" onclick="GetUserPortfolio();" class="btn btn-warning">Import a Corpus-List</button>
-->
<div class="modal" aria-hidden="true" id="csvimport">
<div class="modal-dialog">
<div class="modal-content">
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
<h3 id="myModalLabel">Import a CSV term list</h3>
</div>
<div class="modal-body" id="uploadform">
<form id="csvimportform"
onsubmit="return postCSV(event)"
enctype="multipart/form-data"
method="post">
{% csrf_token %}
<label>From your disk:</label>
<input type="file" id="csvfile" accept="text/csv">
<br/>
<label>From another corpus:</label>
<p>TODO</p>
<br/>
<input type="submit" class="btn btn-xs btn-info" id="csvsubmit" value="Submit" />
</form>
</div>
<div class="modal-footer" id="formanswer"></div>
</div>
</div>
</div>
<script type="text/javascript" src="{% static "lib/jquery/dynatable/jquery.dynatable.js" %}"></script>
<!-- custom-lib for dynatable.js and dc.js -->
<script type="text/javascript" src="{% static "lib/gargantext/NGrams_dyna_chart_and_table.js" %}"></script>
<script type="text/javascript">
/* merci c24b !
* Uses csvimportroute variable from the django template
* Ex: /api/ngramlists/import?onto_corpus=corpus_id
*
* Uses input#csvfile as source data.
*/
/*
 * Submit handler for the CSV import form (#csvimportform).
 *
 * Uses the csvimportroute variable from the django template
 * (ex: /api/ngramlists/import?onto_corpus=corpus_id) and reads
 * the file from input#csvfile. Returns false on any early exit
 * so the browser never performs the default form submission.
 */
function postCSV(e){
    // don't do page reload of usual submits
    e.preventDefault()

    // 2MB ≈ 70000 ngrams
    var max_size = 2097152

    // we take it straight from the input element
    // ('var' added: was an accidental implicit global)
    var theFile = $('input#csvfile')[0].files[0]

    // debug
    // console.log(theFile.name, "size", theFile.size, theFile.lastModifiedDate)

    if (! theFile) {
        console.warn('Ignoring "submit": no provided file')
        return false
    }
    else if (theFile.size > max_size) {
        console.warn('Ignoring "submit": file is too big')
        $('#formanswer').html(
            'The import failed: your file is too big ('+max_size/1024+'kB max).'
        );
        return false
    }
    // normal case
    else {
        // append into an empty form (or fixme: initialize it using form element)
        var myFileFormData = new FormData();
        myFileFormData.append("csvfile", theFile)

        //postCorpusFile
        $.ajax({
            url: "{{csvimportroute | safe}}",
            type: 'POST',
            async: true,
            contentType: false,       // let the browser set multipart boundary
            processData: false,       // send the FormData as-is
            data: myFileFormData,
            beforeSend: function(xhr) {
                xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
            },
            success: function(response) {
                // ('var' added: was an accidental implicit global)
                var my_html = "<h2 color='green'>IMPORT OK ! </h2>"
                my_html += "<p class='note'>" + response['log'].replace(/\n/g, '<br/>') + "</p>"
                // fixed malformed tag (was "<p'>")
                my_html += "<p>(this page will reload in 3s)</p>"
                $('#formanswer').html(my_html);
                console.log(response) ;
                // reload after 3s
                setTimeout("location.reload(true)", 3000);
            },
            error: function(result) {
                $('#formanswer').html('Erreur');
                console.error(result);
            },
        });
        $('#formanswer').html('CSV import in Progress');
    }
};
</script>
{% endblock %}
......@@ -41,7 +41,7 @@
{% if corpus %}
<li><a href="/projects/{{project.id}}/corpora/{{corpus.id}}">
<span class="glyphicon glyphicon-file" aria-hidden="true"></span>
{{corpus.name | truncatechars:15}}
{{corpus.name | truncatechars:25}}
</a>
</li>
{% endif %}
......@@ -150,12 +150,32 @@
<br>
<br>
<div class="row">
<h3>
<a href="/projects/{{project.id}}">
<span class="glyphicon glyphicon-book" aria-hidden="true"></span>
{{ project.name | truncatechars:50}}
<div class="col-md-6">
<h3>
<a href="/projects/{{project.id}}">
<span class="glyphicon glyphicon-book" aria-hidden="true"></span>
{{ project.name | truncatechars:50}}
</a>
</h3>
</div>
<!-- export button: CSV export of the current view (terms or titles) -->
<div class="col-md-6">
  {% if view == 'terms' %}
  <a class="btn btn-primary exportbtn pull-right" role="button"
     href="/api/ngramlists/export?corpus={{corpus.id}}"
     title="Export terms table in CSV">
    Export terms table &nbsp; <span class="glyphicon glyphicon-download" aria-hidden="true"></span>
  </a>
  {% elif view == 'titles' %}
  <!-- NOTE(review): query param is spelled "formated" — confirm against the nodes API -->
  <a class="btn btn-primary exportbtn pull-right" role="button"
     href="/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv"
     title="Export full corpus in CSV">
    Export corpus &nbsp; <span class="glyphicon glyphicon-download" aria-hidden="true"></span>
  </a>
  <!-- removed a stray closing </h3> here: no matching opening tag in this branch -->
  {% else %}
  <!-- TODO export journal table -->
  {% endif %}
</div>
</div>
<div class="row">
<div class="col-md-1">
......@@ -167,10 +187,7 @@
</h3>
<h3>
<span class="glyphicon glyphicon-file" aria-hidden="true"></span>
{{ corpus.name | truncatechars:20 }}
<a class="btn btn-primary" role="button" href="/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv">
<span class="glyphicon glyphicon-download" aria-hidden="true"></span>
</a>
{{ corpus.name | truncatechars:30 }}
</h3>
</div>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment