Commit e24efe96 authored by Alexandre Delanoë

Merge remote-tracking branch 'origin/simon-unstable-lists-fix' into unstable

parents 06f55400 224eae66
@@ -7,7 +7,7 @@ from gargantext.util.db import session, aliased
 from gargantext.models import Ngram, NodeNgramNgram

 from igraph import Graph # for group_union

-def query_groups(groupings_id, details=False):
+def query_groups(groupings_id, details=False, sort=False):
     """
     Listing of couples (mainform, subform)
        aka (ngram1_id, ngram2_id)
@@ -15,24 +15,27 @@ def query_groups(groupings_id, details=False):
     Parameter:
       - details: if False, just send the array of couples
                  if True, send quadruplets with (ngram1_id, term1, ngram2_id, term2)
+      - sort: order results by terms of ngram1 then ngram2
     """
+    if details or sort:
+        Ngram1, Ngram2 = Ngram, aliased(Ngram)
+
     if not details:
         # simple contents
-        query = session.query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
+        columns = (NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
     else:
         # detailed contents (id + terms)
-        Ngram1 = aliased(Ngram)
-        Ngram2 = aliased(Ngram)
-        query = (session
-                 .query(
-                    NodeNgramNgram.ngram1_id,
-                    Ngram1.terms,
-                    NodeNgramNgram.ngram2_id,
-                    Ngram2.terms,
-                 )
-                 .join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
-                 .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id)
-                )
+        columns = (Ngram1.id, Ngram1.terms,
+                   Ngram2.id, Ngram2.terms)
+
+    query = session.query(*columns)
+
+    if details or sort:
+        query = (query.join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
+                      .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id))
+
+    if sort:
+        query = query.order_by(Ngram1.terms, Ngram2.terms)

     # main filter
     # -----------
...
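A minimal usage sketch of the reworked query_groups (GROUPLIST_ID stands in for a real grouping-node id, not a value from this commit):

    from gargantext.util.group_tools import query_groups

    # simple couples, unsorted
    couples = query_groups(GROUPLIST_ID).all()   # [(ngram1_id, ngram2_id), ...]

    # detailed quadruplets (id1, term1, id2, term2), ordered by term1 then term2
    detailed = query_groups(GROUPLIST_ID, details=True, sort=True).all()
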
@@ -50,6 +50,9 @@ class _BaseClass:
         else:
             return NotImplemented

+    def __len__(self):
+        return len(self.items)
+
     def __repr__(self):
         items = self.items
         if isinstance(items, defaultdict):
...
@@ -8,8 +8,7 @@ Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
 """
 from gargantext.util.group_tools import query_groups, group_union
-from gargantext.util.db import session, desc, func, \
-                               bulk_insert_ifnotexists
+from gargantext.util.db import session, bulk_insert_ifnotexists
 from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, \
                               NodeNgramNgram, Node
@@ -25,7 +24,6 @@ from gargantext.util.toolchain.ngrams_extraction import normalize_forms
 # merge will also index the new ngrams in the docs of the corpus
 from gargantext.util.toolchain.ngrams_addition import index_new_ngrams
-from sqlalchemy.sql import exists
 from os import path
 from csv import writer, reader, QUOTE_MINIMAL
 from collections import defaultdict
@@ -35,8 +33,8 @@ from celery import shared_task
 def query_list(list_id,
                pagination_limit=None, pagination_offset=None,
-               details=False, scoring_metric_id=None, groupings_id=None
-               ):
+               details=False, scoring_metric_id=None, groupings_id=None,
+               sort=False):
     """
     Paginated listing of ngram_ids in a NodeNgram lists.
@@ -51,6 +49,7 @@ def query_list(list_id,
                   (for details and sorting)
       - groupings_id: optional id of a list of grouping relations (synonyms)
                       (each synonym will be added to the list if not already in there)
+      - sort: order by Ngram.terms (not possible if details is False)

     FIXME: subforms appended recently and not generalized enough
            => add a common part for all "if groupings_id"
@@ -114,7 +113,10 @@ def query_list(list_id,
         query = query.limit(pagination_limit)

     if pagination_offset:
-        query = query.offset(pagination_offsets)
+        query = query.offset(pagination_offset)
+
+    if details and sort:
+        query = query.order_by(Ngram.terms)

     return query
@@ -175,9 +177,7 @@ def ngrams_to_csv_rows(ngram_objs, ngram_dico={}, group_infos={},
         # 3 columns = |status,      |     mainform,     |       forms
         #        (type_of_list)     (     term     )    ( subterm1|&|subterm2 )
-        csv_rows.append(
-              [list_type,ng_obj.terms,this_grouped_terms]
-              )
+        csv_rows.append([list_type, ng_obj.terms, this_grouped_terms])

     return csv_rows
@@ -220,9 +220,10 @@ def export_ngramlists(node,fname=None,delimiter=DEFAULT_CSV_DELIM,titles=True):
     # corresponding lists of ngram_ids
     # ------------------------------------
     # content: list of the ngram objects [(2562,"monterme",1),...]
-    stop_ngrams = query_list(stoplist_node.id, details=True, groupings_id=group_node.id).all()
-    main_ngrams = query_list(mainlist_node.id, details=True, groupings_id=group_node.id).all()
-    map_ngrams  = query_list(maplist_node.id, details=True, groupings_id=group_node.id).all()
+    stop_ngrams, main_ngrams, map_ngrams = (
+        query_list(n.id, details=True, groupings_id=group_node.id, sort=True).all()
+        for n in (stoplist_node, mainlist_node, maplist_node)
+    )

     # for debug ---------->8 --------------------
     #~ stop_ngrams = stop_ngrams[0:10]
@@ -239,7 +240,7 @@ def export_ngramlists(node,fname=None,delimiter=DEFAULT_CSV_DELIM,titles=True):
     # for the groups we got couples of ids in the DB
     # -------------------
     # ex: [(3544, 2353), (2787, 4032), ...]
-    group_ngram_id_couples = query_groups(group_node.id).all()
+    group_ngram_id_couples = query_groups(group_node.id, sort=True)

     # we expand this to a double structure for groups lookup
     # 1) g['links'] = k couples (x,y_i) as a set [x => {y1,y2}]
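Part 1) of that double structure can be sketched as follows (sample couples taken from the example comment above):

    from collections import defaultdict

    group_ngram_id_couples = [(3544, 2353), (2787, 4032)]

    g = {'links': defaultdict(set)}   # mainform id => set of subform ids
    for x, y in group_ngram_id_couples:
        g['links'][x].add(y)
    # g['links'] == {3544: {2353}, 2787: {4032}}
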
@@ -386,6 +387,9 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
     NB: To merge the imported lists into a corpus node's lists,
         chain this function with merge_ngramlists()
     '''
+    list_types = ['stop','main','map']
+
     # ---------------
     #  ngram storage
     # ---------------
@@ -450,7 +454,6 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
             # headers
             if i == 0:
-                n_cols = len(csv_row)
                 for j, colname in enumerate(csv_row):
                     if colname in ['label', 'status', 'forms']:
                         columns[colname] = j
...@@ -497,31 +500,30 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM, ...@@ -497,31 +500,30 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
continue continue
# --- check correct list type # --- check correct list type
if not this_list_type in ['stop','main','map']: if not this_list_type in list_types:
print("IMPORT WARN: (skip line) wrong list type at CSV %s:l.%i" % (fname, i)) print("IMPORT WARN: (skip line) wrong list type at CSV %s:l.%i" % (fname, i))
continue continue
# subforms can be duplicated (in forms and another label) # subforms can be duplicated (in forms and another label)
# but we must take care of unwanted other duplicates too # but we must take care of unwanted other duplicates too
if this_row_label in imported_unique_ngramstrs: if imported_unique_ngramstrs.get(this_row_label) == 1:
print("TODO IMPORT DUPL: (skip line) term appears more than once at CSV %s:l.%i" print("TODO IMPORT DUPL: (skip line) term %r appears more than once at CSV %s:l.%i"
% (fname, i)) % (this_row_label, fname, i))
# ================= Store the data ==================== # ================= Store the data ====================
# the ngram census # the ngram census
imported_unique_ngramstrs[this_row_label] = True imported_unique_ngramstrs[this_row_label] = 1
# and the "list to ngram" relation # and the "list to ngram" relation
imported_nodes_ngrams[this_list_type].append(this_row_label) imported_nodes_ngrams[this_list_type].append(this_row_label)
# ====== Store synonyms from the import (if any) ====== # ====== Store synonyms from the import (if any) ======
if len(this_row_forms) != 0: if len(this_row_forms) != 0:
other_terms = []
for raw_term_str in this_row_forms.split(group_delimiter): for raw_term_str in this_row_forms.split(group_delimiter):
# each subform is also like an ngram declaration # each subform is also like an ngram declaration
term_str = normalize_forms(normalize_chars(raw_term_str)) term_str = normalize_forms(normalize_chars(raw_term_str))
imported_unique_ngramstrs[term_str] = True imported_unique_ngramstrs[term_str] = 2
imported_nodes_ngrams[this_list_type].append(term_str) imported_nodes_ngrams[this_list_type].append(term_str)
# the optional repeated mainform doesn't interest us # the optional repeated mainform doesn't interest us
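To illustrate the census markers used above: a term read as a mainform (label column) is recorded with value 1, a term read as a subform (forms column) with value 2, so the duplicate check imported_unique_ngramstrs.get(this_row_label) == 1 skips a line only when the same string reappears as a mainform ('cat'/'cats' are hypothetical sample terms):

    imported_unique_ngramstrs = {}
    imported_unique_ngramstrs['cat'] = 1        # seen as a mainform
    imported_unique_ngramstrs['cats'] = 2       # seen as a subform of 'cat'

    imported_unique_ngramstrs.get('cats') == 1  # False: still allowed as a label
    imported_unique_ngramstrs.get('cat') == 1   # True: duplicate mainform, skipped
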
@@ -599,7 +601,10 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
                 % (n_total_ng, n_added_ng, n_total_ng-n_added_ng) )
     print("IMPORT: read %i grouping relations" % n_group_relations)

-    # print("IMPORT RESULT", result)
+    list_counts = [(typ, len(result.get(typ))) for typ in list_types]
+    list_counts.append(('total', sum(x[1] for x in list_counts)))
+    print("IMPORT: " + '; '.join('%s %s' % stats for stats in list_counts))

     return result

 def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
@@ -707,9 +712,11 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
     # ======== Merging all involved ngrams =========

-    # all memberships with resolved conflicts of interfering memberships
+    # all ngram memberships with resolved conflicts of interfering memberships
+    # (associates ngram ids with list types -- see linfos definition above)
     resolved_memberships = {}

+    # iterates over each ngram of each list type for both old and new lists
     for list_set in [old_lists, new_lists]:
         for lid, info in enumerate(linfos):
             list_type = info['key']
@@ -811,7 +818,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
                 list_type = linfos[lid]['key']
                 merged_results[list_type].items.add(ng_id)

-    # print("IMPORT: added %i elements in the lists indices" % added_nd_ng)
+    print("IMPORT: added %i elements in the lists indices" % added_nd_ng)

     # ======== Overwrite old data with new =========
     for lid, info in enumerate(linfos):
@@ -834,10 +841,14 @@ def import_and_merge_ngramlists(file_contents, onto_corpus_id, overwrite=False):
     """
     A single function to run import_ngramlists and merge_ngramlists together
     """
-    print("import list")
+    print("IMPORT CSV termlists file with %s lines in corpus %s (%s)" % (
+        len(file_contents),
+        onto_corpus_id, 'overwrite' if overwrite else 'merge'))

     new_lists = import_ngramlists(file_contents)

-    corpus_node = session.query(Node).filter(Node.id == onto_corpus_id).first()
+    corpus_node = session.query(Node).get(onto_corpus_id)

     # merge the new_lists onto those of the target corpus
     del_originals = ['stop', 'main', 'map'] if overwrite else []
...
@@ -4,128 +4,67 @@ import sys
 import csv
 csv.field_size_limit(sys.maxsize)

 import numpy as np
-import os

 class CSVParser(Parser):
+    DELIMITERS = ", \t;|:"

-    def CSVsample( self, small_contents , delim) :
-        reader = csv.reader(small_contents, delimiter=delim)
+    def detect_delimiter(self, lines, sample_size=10):
+        sample = lines[:sample_size]

-        Freqs = []
-        for row in reader:
-            Freqs.append(len(row))
+        # Compute frequency of each delimiter on each input line
+        delimiters_freqs = {
+            d: [line.count(d) for line in sample]
+            for d in self.DELIMITERS
+        }

-        return Freqs
+        # Select delimiters with a standard deviation of zero, ie. delimiters
+        # for which we have the same number of fields on each line
+        selected_delimiters = [
+            (d, np.sum(freqs))
+            for d, freqs in delimiters_freqs.items()
+            if any(freqs) and np.std(freqs) == 0
+        ]
+
+        if selected_delimiters:
+            # Choose the delimiter with highest frequency amongst selected ones
+            sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
+            return sorted_delimiters[-1][0]

     def parse(self, filebuf):
         print("CSV: parsing (assuming UTF-8 and LF line endings)")
         contents = filebuf.read().decode("UTF-8").split("\n")
-        sample_size = 10
-        sample_contents = contents[0:sample_size]

-        hyperdata_list = []
+        # Filter out empty lines
+        contents = [line for line in contents if line.strip()]

-        # # = = = = [ Getting delimiters frequency ] = = = = #
-        PossibleDelimiters = [ ',',' ','\t', ';', '|', ':' ]
-        AllDelimiters = {}
-        for delim in PossibleDelimiters:
-            AllDelimiters[delim] = self.CSVsample( sample_contents , delim )
-        # # = = = = [ / Getting delimiters frequency ] = = = = #
-        # # OUTPUT example:
-        # #  AllDelimiters = {
-        # #     '\t': [1, 1, 1, 1, 1],
-        # #     ' ': [1, 13, 261, 348, 330],
-        # #     ',': [15, 15, 15, 15, 15],
-        # #     ';': [1, 1, 1, 1, 1],
-        # #     '|': [1, 1, 1, 1, 1]
-        # #  }
+        # Delimiter auto-detection
+        delimiter = self.detect_delimiter(contents, sample_size=10)

-        # # = = = = [ Stand.Dev=0 & Sum of delimiters ] = = = = #
-        Delimiters = []
-        for d in AllDelimiters:
-            freqs = AllDelimiters[d]
-            suma = np.sum( freqs )
-            if suma >0:
-                std = np.std( freqs )
-                # print [ d , suma , len(freqs) , std]
-                if std == 0:
-                    Delimiters.append ( [ d , suma , len(freqs) , std] )
-        # # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
-        # # OUTPUT example:
-        # # Delimiters = [
-        # #   ['\t', 5, 5, 0.0],
-        # #   [',', 75, 5, 0.0],
-        # #   ['|', 5, 5, 0.0]
-        # # ]
+        if delimiter is None:
+            raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")

-        # # = = = = [ Delimiter selection ] = = = = #
-        Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
-        HighestDelim = Sorted_Delims[0][0]
-        # HighestDelim = ","
-        print("CSV selected delimiter:",[HighestDelim])
-        # # = = = = [ / Delimiter selection ] = = = = #
+        print("CSV: selected delimiter: %r" % delimiter)

-        # # = = = = [ First data coordinate ] = = = = #
-        Coords = {
-            "row": -1,
-            "column": -1
-        }
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum % 250 == 0:
-                print("CSV row: ", rownum)
-            joined_tokens = "".join (tokens)
-            if Coords["row"]<0 and len( joined_tokens )>0 :
-                Coords["row"] = rownum
-                for columnum in range(len(tokens)):
-                    t = tokens[columnum]
-                    if len(t)>0:
-                        Coords["column"] = columnum
-                        break
-        # # = = = = [ / First data coordinate ] = = = = #
+        # Parse CSV
+        reader = csv.reader(contents, delimiter=delimiter)

-        # # = = = = [ Setting Headers ] = = = = #
-        Headers_Int2Str = {}
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum>=Coords["row"]:
-                for columnum in range( Coords["column"],len(tokens) ):
-                    t = tokens[columnum]
-                    Headers_Int2Str[columnum] = t
-                break
-        # print("Headers_Int2Str")
-        # print(Headers_Int2Str)
-        # # = = = = [ / Setting Headers ] = = = = #
-        # # OUTPUT example:
-        # # Headers_Int2Str = {
-        # #   0: 'publication_date',
-        # #   1: 'publication_month',
-        # #   2: 'publication_second',
-        # #   3: 'abstract'
-        # # }
+        # Get first not empty row and its fields (ie. header row), or (0, [])
+        first_row, headers = \
+            next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
+                 (0, []))
+
+        # Get first not empty column of the first row, or 0
+        first_col = next((i for i, field in enumerate(headers) if field), 0)
+
+        # Strip out potential empty fields in headers
+        headers = headers[first_col:]

-        # # = = = = [ Reading the whole CSV and saving ] = = = = #
-        hyperdata_list = []
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum>Coords["row"]:
-                RecordDict = {}
-                for columnum in range( Coords["column"],len(tokens) ):
-                    data = tokens[columnum]
-                    RecordDict[ Headers_Int2Str[columnum] ] = data
-                if len(RecordDict.keys())>0:
-                    hyperdata_list.append( RecordDict )
-        # # = = = = [ / Reading the whole CSV and saving ] = = = = #
-
-        return hyperdata_list
+        # Return a generator of dictionaries with column labels as keys,
+        # filtering out empty rows
+        for i, fields in enumerate(reader):
+            if i % 500 == 0:
+                print("CSV: parsing row #%s..." % (i+1))
+            if any(fields):
+                yield dict(zip(headers, fields[first_col:]))
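A self-contained sketch of the detection rule (same logic as detect_delimiter above, depending only on numpy): a delimiter qualifies when its per-line counts are non-zero with zero standard deviation, i.e. it splits every sampled line into the same number of fields, and the most frequent qualifying delimiter wins:

    import numpy as np

    DELIMITERS = ", \t;|:"
    sample = ["a,b,c", "d,e,f", "g,h,i"]

    freqs = {d: [line.count(d) for line in sample] for d in DELIMITERS}
    selected = [(d, np.sum(f)) for d, f in freqs.items()
                if any(f) and np.std(f) == 0]
    best = sorted(selected, key=lambda x: x[1])[-1][0] if selected else None
    print(best)   # ',' -- two occurrences on each of the three lines
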
@@ -81,7 +81,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                 corpus.hyperdata["skipped_docs"].append(document.id)
                 corpus.save_hyperdata()
                 continue
-            else:
-                # ready !
-                tagger = tagger_bots[language_iso2]
+
+            # ready !
+            tagger = tagger_bots[language_iso2]
@@ -95,7 +95,8 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                     continue

                 # get ngrams
                 for ngram in tagger.extract(value):
-                    tokens = tuple(normalize_forms(token[0]) for token in ngram)
+                    normal_forms = (normalize_forms(t[0]) for t in ngram)
+                    tokens = tuple(nf for nf in normal_forms if nf)

                     if do_subngrams:
                         # ex tokens = ["very", "cool", "exemple"]
                         # subterms = [['very', 'cool'],...]
...
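The two-step rewrite above drops tokens whose normalized form is empty. A small illustration with a stand-in for the real normalize_forms helper (the actual implementation is imported from ngrams_extraction and behaves differently):

    def normalize_forms(form):           # stand-in for the real helper
        return form if form.isalnum() else ""

    ngram = [("very", "RB"), ("«", "PUNCT"), ("cool", "JJ")]
    normal_forms = (normalize_forms(t[0]) for t in ngram)
    tokens = tuple(nf for nf in normal_forms if nf)
    print(tokens)   # ('very', 'cool') -- the empty form is filtered out
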
@@ -440,11 +440,12 @@
         // in the form "Add a corpus"
         var type = $("#id_type").val()
+        var file = $("#id_file").val()

         // 5 booleans
         var nameField = $("#id_name").val() != ""
         var typeField = (type != "") && (type != "0")
-        var fileField = $("#id_file").val() != ""
+        var fileField = file != ""
         var wantfileField = $("#file_yes").prop("checked")
         var crawling = ((type==3)||(type==8)||(type==9)) && ! wantfileField
@@ -457,6 +458,23 @@
         if (! crawling) {
             $("#submit_thing").prop('disabled' , !(nameField && typeField && fileField))
         }
+
+        // Automatically select CSV when type is undefined
+        // and we have a .csv file
+        if (!typeField && file && file.match(/.csv$/i)) {
+            // Get CSV type id
+            var csv = $('#id_type > option')
+                .filter(function() {
+                    return $(this).text() === 'CSV'
+                })
+                .attr('value')
+
+            // Select CSV type
+            $('#id_type').val(csv)
+
+            // Focus on name field
+            setTimeout(function() {
+                $("#id_name").focus()
+            })
+        }
     }

     function bringDaNoise() {
...