Commit 49b9c2f0 authored by Romain Loth's avatar Romain Loth

[WIP] old specificity metric transformed into 2 better metrics: specclusion...

[WIP] old specificity metric transformed into 2 better metrics: specclusion and genclusion based on conditional probability
parent 4f676883
......@@ -12,7 +12,8 @@ LISTTYPES = {
'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList,
'SPECIFICITY' : WeightedList,
'SPECCLUSION' : WeightedList,
'GENCLUSION' : WeightedList,
'OCCURRENCES' : WeightedIndex, # could be WeightedList
'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedIndex,
......@@ -47,6 +48,7 @@ NODETYPES = [
# more scores (sorry!)
'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17
'GENCLUSION', # 18
]
INDEXED_HYPERDATA = {
......
......@@ -9,22 +9,26 @@ from gargantext.util.db_cache import cache
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc, asc
from gargantext.constants import DEFAULT_MAPLIST_MAX,\
DEFAULT_MAPLIST_GENCLUSION_RATIO,\
DEFAULT_MAPLIST_MONOGRAMS_RATIO
def do_maplist(corpus,
overwrite_id = None,
mainlist_id = None,
specificity_id = None,
specclusion_id = None,
genclusion_id = None,
grouplist_id = None,
limit=DEFAULT_MAPLIST_MAX,
genclusion_part=DEFAULT_MAPLIST_GENCLUSION_RATIO,
monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO
):
'''
According to Specificities and mainlist
According to Genericity/Specificity and mainlist
Parameters:
- mainlist_id (starting point, already cleaned of stoplist terms)
- specificity_id (ranking factor)
- specclusion_id (inclusion by cooc specificity -- ranking factor)
- genclusion_id (inclusion by cooc genericity -- ranking factor)
- grouplist_id (filtering grouped ones)
- overwrite_id: optional if preexisting MAPLIST node to overwrite
......@@ -33,8 +37,8 @@ def do_maplist(corpus,
- monograms_part: a ratio of terms with only one lexical unit to keep
'''
if not (mainlist_id and specificity_id and grouplist_id):
raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")
if not (mainlist_id and specclusion_id and genclusion_id and grouplist_id):
raise ValueError("Please provide mainlist_id, specclusion_id, genclusion_id and grouplist_id")
monograms_limit = round(limit * monograms_part)
multigrams_limit = limit - monograms_limit
......@@ -58,7 +62,7 @@ def do_maplist(corpus,
# specificity-ranked
query = (session.query(ScoreSpec.ngram_id)
.join(Ngram, Ngram.id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specificity_id)
.filter(ScoreSpec.node_id == specclusion_id)
# we want only terms within mainlist
.join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
......@@ -73,7 +77,7 @@ def do_maplist(corpus,
# TODO: move these 2 pools up to mainlist selection
top_monograms = (query
.filter(Ngram.n == 1)
.order_by(asc(ScoreSpec.weight))
.order_by(desc(ScoreSpec.weight))
.limit(monograms_limit)
.all()
)
......
......@@ -10,8 +10,8 @@ from .ngram_groups import compute_groups
from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
from .list_main import do_mainlist
from .ngram_coocs import compute_coocs
from .metric_specificity import compute_specificity
from .list_map import do_maplist # TEST
from .metric_specgen import compute_specgen
from .list_map import do_maplist
from .mail_notification import notify_owner
from gargantext.util.db import session
from gargantext.models import Node
......@@ -136,22 +136,26 @@ def parse_extract_indexhyperdata(corpus):
# => used for doc <=> ngram association
# ------------
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)*
coocs = compute_coocs(corpus,
on_list_id = mainlist_id,
groupings_id = group_id,
just_pass_result = True)
just_pass_result = True,
diagonal_filter = False) # preserving the diagonal
# (useful for spec/gen)
print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus,cooc_matrix = coocs)
# -> specclusion/genclusion: compute + write (2 Nodes + 2 lists in NodeNgram)
(spec_id, gen_id) = compute_specgen(corpus,cooc_matrix = coocs)
# no need here for subforms because cooc already counted them in mainform
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
print('CORPUS #%d: [%s] new spec-clusion node #%i' % (corpus.id, t(), spec_id))
print('CORPUS #%d: [%s] new gen-clusion node #%i' % (corpus.id, t(), gen_id))
# maplist: compute + write (to Node and NodeNgram)
map_id = do_maplist(corpus,
mainlist_id = mainlist_id,
specificity_id=spec_id,
specclusion_id=spec_id,
genclusion_id=gen_id,
grouplist_id=group_id
)
print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
......@@ -187,7 +191,7 @@ def recount(corpus):
- ndocs
- ti_rank
- coocs
- specificity
- specclusion/genclusion
- tfidf
NB: no new extraction, no list change, just the metrics
......@@ -208,10 +212,15 @@ def recount(corpus):
old_tirank_id = None
try:
old_spec_id = corpus.children("SPECIFICITY").first().id
old_spec_id = corpus.children("SPECCLUSION").first().id
except:
old_spec_id = None
try:
old_gen_id = corpus.children("GENCLUSION").first().id
except:
old_gen_id = None
try:
old_ltfidf_id = corpus.children("TFIDF-CORPUS").first().id
except:
......@@ -254,11 +263,13 @@ def recount(corpus):
just_pass_result = True)
print('RECOUNT #%d: [%s] updated mainlist coocs for specif rank' % (corpus.id, t()))
# -> specificity: compute + write (=> NodeNgram)
spec_id = compute_specificity(corpus,cooc_matrix = coocs, overwrite_id = old_spec_id)
# -> specclusion/genclusion: compute + write (=> NodeNodeNgram)
(spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs,
spec_overwrite_id = spec_id, gen_overwrite_id = gen_id)
print('RECOUNT #%d: [%s] updated specificity node #%i' % (corpus.id, t(), spec_id))
print('RECOUNT #%d: [%s] updated spec-clusion node #%i' % (corpus.id, t(), spec_id))
print('RECOUNT #%d: [%s] updated gen-clusion node #%i' % (corpus.id, t(), gen_id))
print('RECOUNT #%d: [%s] FINISHED metric recounts' % (corpus.id, t()))
......
"""
Computes a specificity metric from the ngram cooccurrence matrix.
+ SAVE => WeightedList => NodeNgram
"""
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame
from numpy import diag
def round3(floating_number):
"""
Rounds a floating number to 3 decimals
Good when we don't need so much details in the DB writen data
"""
return float("%.3f" % floating_number)
def compute_specgen(corpus, cooc_id=None, cooc_matrix=None,
spec_overwrite_id = None, gen_overwrite_id = None):
'''
Compute genericity/specificity:
P(j|i) = N(ij) / N(ii)
P(i|j) = N(ij) / N(jj)
Gen(i) = Sum{j} P(j_k|i)
Spec(i) = Sum{j} P(i|j_k)
Gen-clusion(i) = (Spec(i) + Gen(i)) / 2
Spec-clusion(i) = (Spec(i) - Gen(i)) / 2
Parameters:
- cooc_id: mandatory id of a cooccurrences node to use as base
- spec_overwrite_id: optional preexisting specificity node to overwrite
- gen_overwrite_id: optional preexisting genericity node to overwrite
'''
matrix = defaultdict(lambda : defaultdict(float))
if cooc_id == None and cooc_matrix == None:
raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
elif cooc_id:
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: cooc already filtered on mainlist_id at creation
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
# matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
elif cooc_matrix:
# copy WeightedMatrix into local matrix structure
for (ngram1_id, ngram2_id) in cooc_matrix.items:
w = cooc_matrix.items[(ngram1_id, ngram2_id)]
# ------- 8< --------------------------------------------
# tempo hack to ignore lines/columns where diagonal == 0
# £TODO find why they exist and then remove this snippet
if (((ngram1_id,ngram1_id) not in cooc_matrix.items) or
((ngram2_id,ngram2_id) not in cooc_matrix.items)):
continue
# ------- 8< --------------------------------------------
matrix[ngram1_id][ngram2_id] = w
nb_ngrams = len(matrix)
print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
# example corpus (7 docs, 8 nouns)
# --------------------------------
# "The report says that humans are animals."
# "The report says that rivers are full of water."
# "The report says that humans like to make war."
# "The report says that animals must eat food."
# "The report says that animals drink water."
# "The report says that humans like food and water."
# "The report says that grass is food for some animals."
#===========================================================================
cooc_counts = DataFrame(matrix).fillna(0)
# cooc_counts matrix
# ------------------
# animals food grass humans report rivers war water
# animals 4 2 1 1 4 0 0 1
# food 2 3 1 1 3 0 0 1
# grass 1 1 1 0 1 0 0 0
# humans 1 1 0 3 3 0 1 1
# report 4 3 1 3 7 1 1 3
# rivers 0 0 0 0 1 1 0 1
# war 0 0 0 1 1 0 1 0
# water 1 1 0 1 3 1 0 3
#===========================================================================
# conditional p(col|line)
diagonal = list(diag(cooc_counts))
# debug
# print("WARN diag: ", diagonal)
# print("WARN diag: =================== 0 in diagonal ?\n",
# 0 in diagonal ? "what ??? zeros in the diagonal :/" : "ok no zeros",
# "\n===================")
p_col_given_line = cooc_counts / list(diag(cooc_counts))
# p_col_given_line
# ----------------
# animals food grass humans report rivers war water
# animals 1.0 0.7 1.0 0.3 0.6 0.0 0.0 0.3
# food 0.5 1.0 1.0 0.3 0.4 0.0 0.0 0.3
# grass 0.2 0.3 1.0 0.0 0.1 0.0 0.0 0.0
# humans 0.2 0.3 0.0 1.0 0.4 0.0 1.0 0.3
# report 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
# rivers 0.0 0.0 0.0 0.0 0.1 1.0 0.0 0.3
# war 0.0 0.0 0.0 0.3 0.1 0.0 1.0 0.0
# water 0.2 0.3 0.0 0.3 0.4 1.0 0.0 1.0
#===========================================================================
# total per lines (<=> genericity)
Gen = p_col_given_line.sum(axis=1)
# Gen.sort_values(ascending=False)
# ---
# report 8.0
# animals 3.9
# food 3.6
# water 3.3
# humans 3.3
# grass 1.7
# war 1.5
# rivers 1.5
#===========================================================================
# total columnwise (<=> specificity)
Spec = p_col_given_line.sum(axis=0)
# Spec.sort_values(ascending=False)
# ----
# grass 4.0
# food 3.7
# water 3.3
# humans 3.3
# report 3.3
# animals 3.2
# war 3.0
# rivers 3.0
#===========================================================================
# our "inclusion by specificity" metric
Specclusion = Spec-Gen
# Specclusion.sort_values(ascending=False)
# -----------
# grass 1.1
# war 0.8
# rivers 0.8
# food 0.0
# humans -0.0
# water -0.0
# animals -0.3
# report -2.4
#===========================================================================
# our "inclusion by genericity" metric
Genclusion = Spec+Gen
# Genclusion.sort_values(ascending=False)
# -----------
# report 11.3
# food 7.3
# animals 7.2
# water 6.7
# humans 6.7
# grass 5.7
# war 4.5
# rivers 4.5
#===========================================================================
# specificity node
if spec_overwrite_id:
# overwrite pre-existing id
the_spec_id = spec_overwrite_id
session.query(NodeNgram).filter(NodeNgram.node_id==the_spec_id).delete()
session.commit()
else:
specnode = corpus.add_child(
typename = "SPECCLUSION",
name = "Specclusion (in:%s)" % corpus.id
)
session.add(specnode)
session.commit()
the_spec_id = specnode.id
# debug:
options.display.float_format = '${:,.3f}'.format
# print(Specclusion)
if not Specclusion.empty:
data = WeightedList(
zip( Specclusion.index.tolist()
, [v for v in map(round3, Specclusion.values.tolist())]
)
)
data.save(the_spec_id)
else:
print("WARNING: had no terms in COOCS => empty SPECCLUSION node")
#===========================================================================
# genclusion node
if gen_overwrite_id:
the_gen_id = gen_overwrite_id
session.query(NodeNgram).filter(NodeNgram.node_id==the_gen_id).delete()
session.commit()
else:
gennode = corpus.add_child(
typename = "GENCLUSION",
name = "Genclusion (in:%s)" % corpus.id
)
session.add(gennode)
session.commit()
the_gen_id = gennode.id
if not Genclusion.empty:
data = WeightedList(
zip( Genclusion.index.tolist()
, [v for v in map(round3, Genclusion.values.tolist())]
)
)
data.save(the_gen_id)
else:
print("WARNING: had no terms in COOCS => empty GENCLUSION node")
#===========================================================================
return(the_spec_id, the_gen_id)
"""
Computes a specificity metric from the ngram cooccurrence matrix.
+ SAVE => WeightedList => NodeNgram
"""
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame
import pandas as pd
def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id = None):
'''
Compute the specificity, simple calculus.
Parameters:
- cooc_id: mandatory id of a cooccurrences node to use as base
- overwrite_id: optional preexisting specificity node to overwrite
'''
matrix = defaultdict(lambda : defaultdict(float))
if cooc_id == None and cooc_matrix == None:
raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
elif cooc_id:
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: cooc already filtered on mainlist_id at creation
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
elif cooc_matrix:
# copy WeightedMatrix into local matrix structure
for (ngram1_id, ngram2_id) in cooc_matrix.items:
w = cooc_matrix.items[(ngram1_id, ngram2_id)]
matrix[ngram1_id][ngram2_id] = w
nb_ngrams = len(matrix)
print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
x = DataFrame(matrix).fillna(0)
# proba (x/y) ( <= on divise chaque ligne par son total)
x = x / x.sum(axis=1)
# vectorisation
# d:Matrix => v: Vector (len = nb_ngrams)
# v = d.sum(axis=1) (- lui-même)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top inclus ou exclus
#n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific (asc is spec, desc is generic)
v = ( xs - ys) / ( 2 * (x.shape[0] - 1))
## d ##
#######
# Grenelle biodiversité kilomètres site élus île
# Grenelle 0 0 4 0 0 0
# biodiversité 0 0 0 0 4 0
# kilomètres 4 0 0 0 4 0
# site 0 0 0 0 4 6
# élus 0 4 4 4 0 0
# île 0 0 0 6 0 0
## d.sum(axis=1) ##
###################
# Grenelle 4
# biodiversité 4
# kilomètres 8
# site 10
# élus 12
# île 6
# résultat temporaire
# -------------------
# pour l'instant on va utiliser les sommes en ligne comme ranking de spécificité
# (**même** ordre qu'avec la formule d'avant le refactoring mais calcul + simple)
# TODO analyser la cohérence math ET sem de cet indicateur
#v.sort_values(inplace=True)
# [ ('biodiversité' , 0.333 ),
# ('Grenelle' , 0.5 ),
# ('île' , 0.599 ),
# ('kilomètres' , 1.333 ),
# ('site' , 1.333 ),
# ('élus' , 1.899 ) ]
# ----------------
# specificity node
if overwrite_id:
# overwrite pre-existing id
the_id = overwrite_id
session.query(NodeNgram).filter(NodeNgram.node_id==the_id).delete()
session.commit()
else:
specnode = corpus.add_child(
typename = "SPECIFICITY",
name = "Specif (in:%s)" % corpus.id
)
session.add(specnode)
session.commit()
the_id = specnode.id
# print(v)
pd.options.display.float_format = '${:,.2f}'.format
if not v.empty:
data = WeightedList(
zip( v.index.tolist()
, v.values.tolist()[0]
)
)
data.save(the_id)
else:
print("WARNING: had no terms in COOCS => empty SPECIFICITY node")
return(the_id)
......@@ -18,7 +18,8 @@ def compute_coocs( corpus,
stoplist_id = None,
start = None,
end = None,
symmetry_filter = False):
symmetry_filter = False,
diagonal_filter = True):
"""
Count how often some extracted terms appear
together in a small context (document)
......@@ -55,6 +56,9 @@ def compute_coocs( corpus,
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
this convention: "2001-01-01" aka "%Y-%m-%d")
- symmetry_filter: prevent calculating where ngram1_id > ngram2_id
- diagonal_filter: prevent calculating where ngram1_id == ngram2_id
(deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
......@@ -69,7 +73,7 @@ def compute_coocs( corpus,
JOIN nodes_ngrams AS idxb
ON idxa.node_id = idxb.node_id <== that's cooc
---------------------------------
AND idxa.ngram_id <> idxb.ngram_id
AND idxa.ngram_id <> idxb.ngram_id (diagonal_filter)
AND idxa.node_id = MY_DOC ;
on entire corpus
......@@ -152,16 +156,14 @@ def compute_coocs( corpus,
ucooc
# for debug (2/4)
#, Xngram.terms.label("w_x")
#, Yngram.terms.label("w_y")
# , Xngram.terms.label("w_x")
# , Yngram.terms.label("w_y")
)
.join(Yindex, Xindex.node_id == Yindex.node_id ) # <- by definition of cooc
.join(Node, Node.id == Xindex.node_id) # <- b/c within corpus
.filter(Node.parent_id == corpus.id) # <- b/c within corpus
.filter(Node.typename == "DOCUMENT") # <- b/c within corpus
.filter(Xindex_ngform_id != Yindex_ngform_id) # <- b/c not with itself
)
# outerjoin the synonyms if needed
......@@ -179,12 +181,12 @@ def compute_coocs( corpus,
.group_by(
Xindex_ngform_id, Yindex_ngform_id # <- what we're counting
# for debug (3/4)
#,"w_x", "w_y"
# ,"w_x", "w_y"
)
# for debug (4/4)
#.join(Xngram, Xngram.id == Xindex_ngform_id)
#.join(Yngram, Yngram.id == Yindex_ngform_id)
# .join(Xngram, Xngram.id == Xindex_ngform_id)
# .join(Yngram, Yngram.id == Yindex_ngform_id)
.order_by(ucooc)
)
......@@ -192,6 +194,9 @@ def compute_coocs( corpus,
# 4) INPUT FILTERS (reduce N before O(N²))
if on_list_id:
# £TODO listes différentes ou bien une liste pour x et tous les ngrammes pour y
# car permettrait expansion de liste aux plus proches voisins (MacLachlan)
# (avec une matr rectangulaire)
m1 = aliased(NodeNgram)
m2 = aliased(NodeNgram)
......@@ -226,6 +231,10 @@ def compute_coocs( corpus,
)
if diagonal_filter:
# don't compute ngram with itself
coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)
if start or end:
Time = aliased(NodeHyperdata)
......@@ -268,6 +277,7 @@ def compute_coocs( corpus,
# threshold
# £TODO adjust COOC_THRESHOLD a posteriori:
# ex: sometimes 2 sometimes 4 depending on sparsity
print("COOCS: filtering pairs under threshold:", threshold)
coocs_query = coocs_query.having(ucooc >= threshold)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment