Commit 63ec1b5c authored by Romain Loth

filter terms that became empty after normalization + esthetics

parent a77ea0cf
...@@ -66,6 +66,7 @@ def compute_coocs(corpus, ...@@ -66,6 +66,7 @@ def compute_coocs(corpus,
""" """
# - TODO add grouped element's values in grouping 'chief ngram'
# - TODO cvalue_id: allow a metric as additional input filter # - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram) # - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO start, end : filter on document date # - TODO start, end : filter on document date
...@@ -159,9 +160,9 @@ def compute_coocs(corpus, ...@@ -159,9 +160,9 @@ def compute_coocs(corpus,
matrix = WeightedMatrix(coocs_query.all()) matrix = WeightedMatrix(coocs_query.all())
# fyi # fyi
# shape_0 = len({pair[0] for pair in matrix.items}) shape_0 = len({pair[0] for pair in matrix.items})
# shape_1 = len({pair[1] for pair in matrix.items}) shape_1 = len({pair[1] for pair in matrix.items})
# print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1)) print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
# 5) SAVE # 5) SAVE
# -------- # --------
......
...@@ -63,8 +63,9 @@ def extract_ngrams(corpus, keys=('title', 'abstract', )): ...@@ -63,8 +63,9 @@ def extract_ngrams(corpus, keys=('title', 'abstract', )):
for ngram in ngramsextractor.extract(value): for ngram in ngramsextractor.extract(value):
tokens = tuple(token[0] for token in ngram) tokens = tuple(token[0] for token in ngram)
terms = normalize_terms(' '.join(tokens)) terms = normalize_terms(' '.join(tokens))
nodes_ngrams_count[(document.id, terms)] += 1 if len(terms) > 1:
ngrams_data.add((terms[:255], len(tokens), )) nodes_ngrams_count[(document.id, terms)] += 1
ngrams_data.add((terms[:255], len(tokens), ))
# integrate ngrams and nodes-ngrams # integrate ngrams and nodes-ngrams
if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE: if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor) _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
......
...@@ -11,7 +11,7 @@ def ngramtable(request, project_id, corpus_id): ...@@ -11,7 +11,7 @@ def ngramtable(request, project_id, corpus_id):
=> maplist and mainlist terms in a table => maplist and mainlist terms in a table
with groupings as '+' nodes with groupings as '+' nodes
=> uses API GET batch of lists => uses API GET batch of lists
=> uses API PUT/DEL for list modifications (TODO) => uses API PUT/DEL for list modifications
=> uses frontend AJAX through Ngrams_dyna_charts_and_table.js => uses frontend AJAX through Ngrams_dyna_charts_and_table.js
# TODO refactor Ngrams_dyna_charts_and_table.js # TODO refactor Ngrams_dyna_charts_and_table.js
''' '''
...@@ -21,7 +21,7 @@ def ngramtable(request, project_id, corpus_id): ...@@ -21,7 +21,7 @@ def ngramtable(request, project_id, corpus_id):
# and the project just for project.id in corpusBannerTop # and the project just for project.id in corpusBannerTop
project = cache.Node[project_id] project = cache.Node[project_id]
# rendered page : journals.html # rendered page : terms.html
return render( return render(
template_name = 'pages/corpora/terms.html', template_name = 'pages/corpora/terms.html',
request = request, request = request,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment