Commit 60ec5d88 authored by delanoe's avatar delanoe

[FIX][NGRAMS WORKFLOW] Default Map List improved in quality. Remark: map seems...

[FIX][NGRAMS WORKFLOW] Default Map List improved in quality. Remark: the map seems to be less impressive. We may need to increase the threshold of the map list selection.
parent b3faf308
......@@ -7,7 +7,7 @@ from gargantext.models.ngrams import Node, Ngram, NodeNgram, \
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from sqlalchemy import desc, asc
from gargantext.constants import DEFAULT_MAPLIST_MAX,\
DEFAULT_MAPLIST_MONOGRAMS_RATIO
......@@ -70,7 +70,7 @@ def do_maplist(corpus,
# TODO: move these 2 pools up to mainlist selection
top_monograms = (query
.filter(Ngram.n == 1)
.order_by(desc(ScoreSpec.weight))
.order_by(asc(ScoreSpec.weight))
.limit(monograms_limit)
.all()
)
......
......@@ -7,6 +7,7 @@ from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame
import pandas as pd
def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
'''
......@@ -33,13 +34,23 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
d = DataFrame(matrix).fillna(0)
x = DataFrame(matrix).fillna(0)
# proba (x/y) ( <= on divise chaque colonne par son total)
d = d / d.sum(axis=0)
# proba (x/y) ( <= on divise chaque ligne par son total)
x = x / x.sum(axis=1)
# vectorisation
# d:Matrix => v: Vector (len = nb_ngrams)
v = d.sum(axis=1)
# v = d.sum(axis=1) (- lui-même)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top inclus ou exclus
#n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific (asc is spec, desc is generic)
v = ( xs - ys) / ( 2 * (x.shape[0] - 1))
## d ##
#######
......@@ -66,7 +77,7 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
# pour l'instant on va utiliser les sommes en ligne comme ranking de spécificité
# (**même** ordre qu'avec la formule d'avant le refactoring mais calcul + simple)
# TODO analyser la cohérence math ET sem de cet indicateur
v.sort_values(inplace=True)
#v.sort_values(inplace=True)
# [ ('biodiversité' , 0.333 ),
# ('Grenelle' , 0.5 ),
......@@ -92,10 +103,11 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
the_id = specnode.id
# print(v)
pd.options.display.float_format = '${:,.2f}'.format
data = WeightedList(
zip( v.index.tolist()
, v.values.tolist()
, v.values.tolist()[0]
)
)
data.save(the_id)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment