Commit 49b9c2f0 authored by Romain Loth

[WIP] old specificity metric transformed into 2 better metrics: specclusion...

[WIP] old specificity metric transformed into 2 better metrics: specclusion and genclusion based on conditional probability
parent 4f676883
...@@ -12,7 +12,8 @@ LISTTYPES = { ...@@ -12,7 +12,8 @@ LISTTYPES = {
'STOPLIST' : UnweightedList, 'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList, 'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList, 'MAPLIST' : UnweightedList,
'SPECIFICITY' : WeightedList, 'SPECCLUSION' : WeightedList,
'GENCLUSION' : WeightedList,
'OCCURRENCES' : WeightedIndex, # could be WeightedList 'OCCURRENCES' : WeightedIndex, # could be WeightedList
'COOCCURRENCES': WeightedMatrix, 'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedIndex, 'TFIDF-CORPUS' : WeightedIndex,
...@@ -47,6 +48,7 @@ NODETYPES = [ ...@@ -47,6 +48,7 @@ NODETYPES = [
# more scores (sorry!) # more scores (sorry!)
'TIRANK-LOCAL', # 16 'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17 'TIRANK-GLOBAL', # 17
'GENCLUSION', # 18
] ]
INDEXED_HYPERDATA = { INDEXED_HYPERDATA = {
......
...@@ -9,22 +9,26 @@ from gargantext.util.db_cache import cache ...@@ -9,22 +9,26 @@ from gargantext.util.db_cache import cache
from gargantext.util.lists import UnweightedList from gargantext.util.lists import UnweightedList
from sqlalchemy import desc, asc from sqlalchemy import desc, asc
from gargantext.constants import DEFAULT_MAPLIST_MAX,\ from gargantext.constants import DEFAULT_MAPLIST_MAX,\
DEFAULT_MAPLIST_GENCLUSION_RATIO,\
DEFAULT_MAPLIST_MONOGRAMS_RATIO DEFAULT_MAPLIST_MONOGRAMS_RATIO
def do_maplist(corpus, def do_maplist(corpus,
overwrite_id = None, overwrite_id = None,
mainlist_id = None, mainlist_id = None,
specificity_id = None, specclusion_id = None,
genclusion_id = None,
grouplist_id = None, grouplist_id = None,
limit=DEFAULT_MAPLIST_MAX, limit=DEFAULT_MAPLIST_MAX,
genclusion_part=DEFAULT_MAPLIST_GENCLUSION_RATIO,
monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO
): ):
''' '''
According to Specificities and mainlist According to Genericity/Specificity and mainlist
Parameters: Parameters:
- mainlist_id (starting point, already cleaned of stoplist terms) - mainlist_id (starting point, already cleaned of stoplist terms)
- specificity_id (ranking factor) - specclusion_id (inclusion by cooc specificity -- ranking factor)
- genclusion_id (inclusion by cooc genericity -- ranking factor)
- grouplist_id (filtering grouped ones) - grouplist_id (filtering grouped ones)
- overwrite_id: optional if preexisting MAPLIST node to overwrite - overwrite_id: optional if preexisting MAPLIST node to overwrite
...@@ -33,8 +37,8 @@ def do_maplist(corpus, ...@@ -33,8 +37,8 @@ def do_maplist(corpus,
- monograms_part: a ratio of terms with only one lexical unit to keep - monograms_part: a ratio of terms with only one lexical unit to keep
''' '''
if not (mainlist_id and specificity_id and grouplist_id): if not (mainlist_id and specclusion_id and genclusion_id and grouplist_id):
raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id") raise ValueError("Please provide mainlist_id, specclusion_id, genclusion_id and grouplist_id")
monograms_limit = round(limit * monograms_part) monograms_limit = round(limit * monograms_part)
multigrams_limit = limit - monograms_limit multigrams_limit = limit - monograms_limit
...@@ -58,7 +62,7 @@ def do_maplist(corpus, ...@@ -58,7 +62,7 @@ def do_maplist(corpus,
# specificity-ranked # specificity-ranked
query = (session.query(ScoreSpec.ngram_id) query = (session.query(ScoreSpec.ngram_id)
.join(Ngram, Ngram.id == ScoreSpec.ngram_id) .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specificity_id) .filter(ScoreSpec.node_id == specclusion_id)
# we want only terms within mainlist # we want only terms within mainlist
.join(MainlistTable, Ngram.id == MainlistTable.ngram_id) .join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
...@@ -73,7 +77,7 @@ def do_maplist(corpus, ...@@ -73,7 +77,7 @@ def do_maplist(corpus,
# TODO: move these 2 pools up to mainlist selection # TODO: move these 2 pools up to mainlist selection
top_monograms = (query top_monograms = (query
.filter(Ngram.n == 1) .filter(Ngram.n == 1)
.order_by(asc(ScoreSpec.weight)) .order_by(desc(ScoreSpec.weight))
.limit(monograms_limit) .limit(monograms_limit)
.all() .all()
) )
......
...@@ -10,8 +10,8 @@ from .ngram_groups import compute_groups ...@@ -10,8 +10,8 @@ from .ngram_groups import compute_groups
from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
from .list_main import do_mainlist from .list_main import do_mainlist
from .ngram_coocs import compute_coocs from .ngram_coocs import compute_coocs
from .metric_specificity import compute_specificity from .metric_specgen import compute_specgen
from .list_map import do_maplist # TEST from .list_map import do_maplist
from .mail_notification import notify_owner from .mail_notification import notify_owner
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.models import Node from gargantext.models import Node
...@@ -136,22 +136,26 @@ def parse_extract_indexhyperdata(corpus): ...@@ -136,22 +136,26 @@ def parse_extract_indexhyperdata(corpus):
# => used for doc <=> ngram association # => used for doc <=> ngram association
# ------------ # ------------
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram) # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)*
coocs = compute_coocs(corpus, coocs = compute_coocs(corpus,
on_list_id = mainlist_id, on_list_id = mainlist_id,
groupings_id = group_id, groupings_id = group_id,
just_pass_result = True) just_pass_result = True,
diagonal_filter = False) # preserving the diagonal
# (useful for spec/gen)
print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t())) print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))
# -> specificity: compute + write (=> NodeNodeNgram) # -> specclusion/genclusion: compute + write (2 Nodes + 2 lists in NodeNgram)
spec_id = compute_specificity(corpus,cooc_matrix = coocs) (spec_id, gen_id) = compute_specgen(corpus,cooc_matrix = coocs)
# no need here for subforms because cooc already counted them in mainform # no need here for subforms because cooc already counted them in mainform
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id)) print('CORPUS #%d: [%s] new spec-clusion node #%i' % (corpus.id, t(), spec_id))
print('CORPUS #%d: [%s] new gen-clusion node #%i' % (corpus.id, t(), gen_id))
# maplist: compute + write (to Node and NodeNgram) # maplist: compute + write (to Node and NodeNgram)
map_id = do_maplist(corpus, map_id = do_maplist(corpus,
mainlist_id = mainlist_id, mainlist_id = mainlist_id,
specificity_id=spec_id, specclusion_id=spec_id,
genclusion_id=gen_id,
grouplist_id=group_id grouplist_id=group_id
) )
print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id)) print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
...@@ -187,7 +191,7 @@ def recount(corpus): ...@@ -187,7 +191,7 @@ def recount(corpus):
- ndocs - ndocs
- ti_rank - ti_rank
- coocs - coocs
- specificity - specclusion/genclusion
- tfidf - tfidf
NB: no new extraction, no list change, just the metrics NB: no new extraction, no list change, just the metrics
...@@ -208,10 +212,15 @@ def recount(corpus): ...@@ -208,10 +212,15 @@ def recount(corpus):
old_tirank_id = None old_tirank_id = None
try: try:
old_spec_id = corpus.children("SPECIFICITY").first().id old_spec_id = corpus.children("SPECCLUSION").first().id
except: except:
old_spec_id = None old_spec_id = None
try:
old_gen_id = corpus.children("GENCLUSION").first().id
except:
old_gen_id = None
try: try:
old_ltfidf_id = corpus.children("TFIDF-CORPUS").first().id old_ltfidf_id = corpus.children("TFIDF-CORPUS").first().id
except: except:
...@@ -254,11 +263,13 @@ def recount(corpus): ...@@ -254,11 +263,13 @@ def recount(corpus):
just_pass_result = True) just_pass_result = True)
print('RECOUNT #%d: [%s] updated mainlist coocs for specif rank' % (corpus.id, t())) print('RECOUNT #%d: [%s] updated mainlist coocs for specif rank' % (corpus.id, t()))
# -> specificity: compute + write (=> NodeNgram)
spec_id = compute_specificity(corpus,cooc_matrix = coocs, overwrite_id = old_spec_id)
# -> specclusion/genclusion: compute + write (=> NodeNodeNgram)
(spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs,
spec_overwrite_id = spec_id, gen_overwrite_id = gen_id)
print('RECOUNT #%d: [%s] updated specificity node #%i' % (corpus.id, t(), spec_id)) print('RECOUNT #%d: [%s] updated spec-clusion node #%i' % (corpus.id, t(), spec_id))
print('RECOUNT #%d: [%s] updated gen-clusion node #%i' % (corpus.id, t(), gen_id))
print('RECOUNT #%d: [%s] FINISHED metric recounts' % (corpus.id, t())) print('RECOUNT #%d: [%s] FINISHED metric recounts' % (corpus.id, t()))
......
"""
Computes two metrics, specclusion and genclusion, from the ngram cooccurrence matrix.
+ SAVE => WeightedList => NodeNgram
"""
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame
from numpy import diag
def round3(floating_number):
    """
    Return the given number rounded to 3 decimals, as a float.

    Useful when the data written to the DB does not need more precision.
    """
    three_decimals_text = "%.3f" % floating_number
    return float(three_decimals_text)
def _specgen_scores(matrix):
    '''
    Turn a nested dict of cooccurrence counts into the two metric Series.

    Parameters:
        - matrix: dict of dicts of weights; DataFrame(matrix) puts the outer
                  keys on the columns and the inner keys on the lines

    Returns the tuple (Specclusion, Genclusion), two pandas Series indexed
    by the matrix labels (both empty if the matrix is empty).
    '''
    # example corpus (7 docs, 8 nouns)
    # --------------------------------
    # "The report says that humans are animals."
    # "The report says that rivers are full of water."
    # "The report says that humans like to make war."
    # "The report says that animals must eat food."
    # "The report says that animals drink water."
    # "The report says that humans like food and water."
    # "The report says that grass is food for some animals."

    #===========================================================================
    cooc_counts = DataFrame(matrix).fillna(0)

    # cooc_counts matrix
    # ------------------
    #          animals  food  grass  humans  report  rivers  war  water
    # animals        4     2      1       1       4       0    0      1
    # food           2     3      1       1       3       0    0      1
    # grass          1     1      1       0       1       0    0      0
    # humans         1     1      0       3       3       0    1      1
    # report         4     3      1       3       7       1    1      3
    # rivers         0     0      0       0       1       1    0      1
    # war            0     0      0       1       1       0    1      0
    # water          1     1      0       1       3       1    0      3

    # The diagonal is only meaningful if lines and columns carry the same
    # labels in the same order: a DataFrame built from nested dicts does not
    # guarantee that, so both axes are re-aligned on their common label set.
    labels = cooc_counts.index.union(cooc_counts.columns)
    cooc_counts = cooc_counts.reindex(index=labels, columns=labels,
                                      fill_value=0)
    diagonal = diag(cooc_counts.values)

    # Terms without their own (i,i) count can't be normalised (it would be a
    # division by zero): they are dropped, like the caller already does for
    # the cooc_matrix input.
    has_diagonal = diagonal != 0
    if not has_diagonal.all():
        kept_labels = labels[has_diagonal]
        cooc_counts = cooc_counts.loc[kept_labels, kept_labels]
        diagonal = diagonal[has_diagonal]

    #===========================================================================
    # conditional p(col|line): each column is divided by its own diagonal value
    p_col_given_line = cooc_counts.div(diagonal, axis="columns")

    # p_col_given_line
    # ----------------
    #          animals  food  grass  humans  report  rivers  war  water
    # animals      1.0   0.7    1.0     0.3     0.6     0.0  0.0    0.3
    # food         0.5   1.0    1.0     0.3     0.4     0.0  0.0    0.3
    # grass        0.2   0.3    1.0     0.0     0.1     0.0  0.0    0.0
    # humans       0.2   0.3    0.0     1.0     0.4     0.0  1.0    0.3
    # report       1.0   1.0    1.0     1.0     1.0     1.0  1.0    1.0
    # rivers       0.0   0.0    0.0     0.0     0.1     1.0  0.0    0.3
    # war          0.0   0.0    0.0     0.3     0.1     0.0  1.0    0.0
    # water        0.2   0.3    0.0     0.3     0.4     1.0  0.0    1.0

    #===========================================================================
    # total per lines (<=> genericity)
    Gen = p_col_given_line.sum(axis=1)

    # Gen.sort_values(ascending=False)
    # ---
    # report     8.0
    # animals    3.9
    # food       3.6
    # water      3.3
    # humans     3.3
    # grass      1.7
    # war        1.5
    # rivers     1.5

    #===========================================================================
    # total columnwise (<=> specificity)
    Spec = p_col_given_line.sum(axis=0)

    # Spec.sort_values(ascending=False)
    # ----
    # grass      4.0
    # food       3.7
    # water      3.3
    # humans     3.3
    # report     3.3
    # animals    3.2
    # war        3.0
    # rivers     3.0

    #===========================================================================
    # our "inclusion by specificity" metric
    Specclusion = Spec - Gen

    # Specclusion.sort_values(ascending=False)
    # -----------
    # grass      1.1
    # war        0.8
    # rivers     0.8
    # food       0.0
    # humans    -0.0
    # water     -0.0
    # animals   -0.3
    # report    -2.4

    #===========================================================================
    # our "inclusion by genericity" metric
    Genclusion = Spec + Gen

    # Genclusion.sort_values(ascending=False)
    # -----------
    # report     11.3
    # food       7.3
    # animals    7.2
    # water      6.7
    # humans     6.7
    # grass      5.7
    # war        4.5
    # rivers     4.5

    return (Specclusion, Genclusion)


def _save_scores_node(corpus, scores, typename, label, overwrite_id=None):
    '''
    Write one Series of scores as a WeightedList under a node, return its id.

    If overwrite_id is given, the NodeNgram rows of that node are deleted and
    the node is reused; otherwise a new child node of the corpus is created.
    '''
    if overwrite_id:
        # overwrite pre-existing id
        node_id = overwrite_id
        session.query(NodeNgram).filter(NodeNgram.node_id==node_id).delete()
        session.commit()
    else:
        node = corpus.add_child(
                        typename  = typename,
                        name = "%s (in:%s)" % (label, corpus.id)
                    )
        session.add(node)
        session.commit()
        node_id = node.id

    if not scores.empty:
        data = WeightedList(
            zip(  scores.index.tolist()
                , [round3(value) for value in scores.values.tolist()]
             )
        )
        data.save(node_id)
    else:
        print("WARNING: had no terms in COOCS => empty %s node" % typename)

    return node_id


def compute_specgen(corpus, cooc_id=None, cooc_matrix=None,
                    spec_overwrite_id = None, gen_overwrite_id = None):
    '''
    Compute the two cooccurrence-based inclusion metrics and save them.

    With N the cooccurrence counts laid out as a (lines x columns) table and
        p(i,j) = N(ij) / N(jj)        (each column divided by its diagonal)

        Gen(i)  = Sum{j} p(i,j)       (total of line i)
        Spec(j) = Sum{i} p(i,j)       (total of column j)

        Spec-clusion(i) = Spec(i) - Gen(i)
        Gen-clusion(i)  = Spec(i) + Gen(i)

    (the sum and difference are stored as such, they are not halved)

    Parameters:
        - cooc_id: id of a cooccurrences node to use as base
        - cooc_matrix: alternatively, an in-memory WeightedMatrix of coocs
          (at least one of cooc_id / cooc_matrix is needed, cooc_id is
           used if both are given)
        - spec_overwrite_id: optional preexisting specclusion node to overwrite
        - gen_overwrite_id: optional preexisting genclusion node to overwrite

    Returns the tuple (specclusion_node_id, genclusion_node_id).

    Raises TypeError if neither cooc_id nor cooc_matrix is provided.
    '''
    matrix = defaultdict(lambda : defaultdict(float))

    if cooc_id is None and cooc_matrix is None:
        raise TypeError("compute_specgen: needs a cooc_id or cooc_matrix param")

    elif cooc_id:
        cooccurrences = (session.query(NodeNgramNgram)
                        .filter(NodeNgramNgram.node_id==cooc_id)
                        )
        # no filtering: cooc already filtered on mainlist_id at creation

        for cooccurrence in cooccurrences:
            matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
            # matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight

    elif cooc_matrix:
        # copy WeightedMatrix into local matrix structure
        for (ngram1_id, ngram2_id) in cooc_matrix.items:
            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
            # ------- 8< --------------------------------------------
            # tempo hack to ignore lines/columns where diagonal == 0
            # £TODO find why they exist and then remove this snippet
            if (((ngram1_id,ngram1_id) not in cooc_matrix.items) or
                ((ngram2_id,ngram2_id) not in cooc_matrix.items)):
                continue
            # ------- 8< --------------------------------------------
            matrix[ngram1_id][ngram2_id] = w

    nb_ngrams = len(matrix)

    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

    (Specclusion, Genclusion) = _specgen_scores(matrix)

    # NB: the former debug line setting `options.display.float_format` was
    #     removed: `options` was never imported here, so it raised a NameError
    # print(Specclusion)

    #===========================================================================
    # specificity node
    the_spec_id = _save_scores_node(corpus, Specclusion,
                                    typename = "SPECCLUSION",
                                    label = "Specclusion",
                                    overwrite_id = spec_overwrite_id)

    #===========================================================================
    # genclusion node
    the_gen_id = _save_scores_node(corpus, Genclusion,
                                   typename = "GENCLUSION",
                                   label = "Genclusion",
                                   overwrite_id = gen_overwrite_id)

    #===========================================================================
    return(the_spec_id, the_gen_id)
"""
Computes a specificity metric from the ngram cooccurrence matrix.
+ SAVE => WeightedList => NodeNgram
"""
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame
import pandas as pd
def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id = None):
    '''
    Compute a simple specificity score per ngram and save it under a node.

    Parameters:
        - cooc_id: id of a cooccurrences node to use as base
                   (or else cooc_matrix: an in-memory matrix of coocs)
        - overwrite_id: optional preexisting specificity node to overwrite

    Returns the id of the node under which the scores were saved.
    '''
    matrix = defaultdict(lambda : defaultdict(float))

    if cooc_id == None and cooc_matrix == None:
        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")

    if cooc_id:
        cooc_rows = (session.query(NodeNgramNgram)
                            .filter(NodeNgramNgram.node_id==cooc_id)
                     )
        # no filtering: cooc already filtered on mainlist_id at creation

        # each stored pair is copied in both directions
        for row in cooc_rows:
            first_id, second_id = row.ngram1_id, row.ngram2_id
            matrix[first_id][second_id] = row.weight
            matrix[second_id][first_id] = row.weight

    elif cooc_matrix:
        # copy WeightedMatrix into local matrix structure
        for pair in cooc_matrix.items:
            first_id, second_id = pair
            matrix[first_id][second_id] = cooc_matrix.items[pair]

    nb_ngrams = len(matrix)

    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

    probas = DataFrame(matrix).fillna(0)

    # proba (x/y) ( <= each line is divided by its total)
    probas = probas / probas.sum(axis=1)

    # vectorization
    # d:Matrix => v: Vector (len = nb_ngrams)
    # v = d.sum(axis=1) (- itself)
    line_part = probas.sum(axis=1) - probas
    column_part = probas.sum(axis=0) - probas

    # top included or excluded
    #n = ( xs + ys) / (2 * (x.shape[0] - 1))

    # top generic or specific (asc is spec, desc is generic)
    v = ( line_part - column_part) / ( 2 * (probas.shape[0] - 1))

    ## d ##
    #######
    #               Grenelle  biodiversité  kilomètres  site  élus  île
    # Grenelle             0             0           4     0     0    0
    # biodiversité         0             0           0     0     4    0
    # kilomètres           4             0           0     0     4    0
    # site                 0             0           0     0     4    6
    # élus                 0             4           4     4     0    0
    # île                  0             0           0     6     0    0

    ## d.sum(axis=1) ##
    ###################
    # Grenelle         4
    # biodiversité     4
    # kilomètres       8
    # site            10
    # élus            12
    # île              6

    # temporary result
    # ----------------
    # for now the line sums are used as specificity ranking
    # (**same** order as with the pre-refactoring formula but simpler calculus)
    # TODO analyze the mathematical AND semantic consistency of this indicator
    #v.sort_values(inplace=True)

    # [ ('biodiversité' , 0.333 ),
    #   ('Grenelle'     , 0.5   ),
    #   ('île'          , 0.599 ),
    #   ('kilomètres'   , 1.333 ),
    #   ('site'         , 1.333 ),
    #   ('élus'         , 1.899 ) ]

    # ----------------
    # specificity node
    if not overwrite_id:
        # no preexisting node: create a new one under the corpus
        specnode = corpus.add_child(
                        typename  = "SPECIFICITY",
                        name = "Specif (in:%s)" % corpus.id
                    )
        session.add(specnode)
        session.commit()
        the_id = specnode.id
    else:
        # overwrite pre-existing id: its former entries are removed first
        the_id = overwrite_id
        session.query(NodeNgram).filter(NodeNgram.node_id==the_id).delete()
        session.commit()

    # print(v)
    pd.options.display.float_format = '${:,.2f}'.format

    if v.empty:
        print("WARNING: had no terms in COOCS => empty SPECIFICITY node")
    else:
        # NOTE(review): only the first element of v.values.tolist() is paired
        #               with the index here -- confirm this is intended
        weights = v.values.tolist()[0]
        data = WeightedList(
            zip(  v.index.tolist()
                , weights
             )
        )
        data.save(the_id)

    return(the_id)
...@@ -18,7 +18,8 @@ def compute_coocs( corpus, ...@@ -18,7 +18,8 @@ def compute_coocs( corpus,
stoplist_id = None, stoplist_id = None,
start = None, start = None,
end = None, end = None,
symmetry_filter = False): symmetry_filter = False,
diagonal_filter = True):
""" """
Count how often some extracted terms appear Count how often some extracted terms appear
together in a small context (document) together in a small context (document)
...@@ -55,6 +56,9 @@ def compute_coocs( corpus, ...@@ -55,6 +56,9 @@ def compute_coocs( corpus,
NB the expected type of parameter value is datetime.datetime NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow (string is also possible but format must follow
this convention: "2001-01-01" aka "%Y-%m-%d") this convention: "2001-01-01" aka "%Y-%m-%d")
- symmetry_filter: prevent calculating where ngram1_id > ngram2_id
- diagonal_filter: prevent calculating where ngram1_id == ngram2_id
(deprecated parameters) (deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
...@@ -69,7 +73,7 @@ def compute_coocs( corpus, ...@@ -69,7 +73,7 @@ def compute_coocs( corpus,
JOIN nodes_ngrams AS idxb JOIN nodes_ngrams AS idxb
ON idxa.node_id = idxb.node_id <== that's cooc ON idxa.node_id = idxb.node_id <== that's cooc
--------------------------------- ---------------------------------
AND idxa.ngram_id <> idxb.ngram_id AND idxa.ngram_id <> idxb.ngram_id (diagonal_filter)
AND idxa.node_id = MY_DOC ; AND idxa.node_id = MY_DOC ;
on entire corpus on entire corpus
...@@ -152,16 +156,14 @@ def compute_coocs( corpus, ...@@ -152,16 +156,14 @@ def compute_coocs( corpus,
ucooc ucooc
# for debug (2/4) # for debug (2/4)
#, Xngram.terms.label("w_x") # , Xngram.terms.label("w_x")
#, Yngram.terms.label("w_y") # , Yngram.terms.label("w_y")
) )
.join(Yindex, Xindex.node_id == Yindex.node_id ) # <- by definition of cooc .join(Yindex, Xindex.node_id == Yindex.node_id ) # <- by definition of cooc
.join(Node, Node.id == Xindex.node_id) # <- b/c within corpus .join(Node, Node.id == Xindex.node_id) # <- b/c within corpus
.filter(Node.parent_id == corpus.id) # <- b/c within corpus .filter(Node.parent_id == corpus.id) # <- b/c within corpus
.filter(Node.typename == "DOCUMENT") # <- b/c within corpus .filter(Node.typename == "DOCUMENT") # <- b/c within corpus
.filter(Xindex_ngform_id != Yindex_ngform_id) # <- b/c not with itself
) )
# outerjoin the synonyms if needed # outerjoin the synonyms if needed
...@@ -179,12 +181,12 @@ def compute_coocs( corpus, ...@@ -179,12 +181,12 @@ def compute_coocs( corpus,
.group_by( .group_by(
Xindex_ngform_id, Yindex_ngform_id # <- what we're counting Xindex_ngform_id, Yindex_ngform_id # <- what we're counting
# for debug (3/4) # for debug (3/4)
#,"w_x", "w_y" # ,"w_x", "w_y"
) )
# for debug (4/4) # for debug (4/4)
#.join(Xngram, Xngram.id == Xindex_ngform_id) # .join(Xngram, Xngram.id == Xindex_ngform_id)
#.join(Yngram, Yngram.id == Yindex_ngform_id) # .join(Yngram, Yngram.id == Yindex_ngform_id)
.order_by(ucooc) .order_by(ucooc)
) )
...@@ -192,6 +194,9 @@ def compute_coocs( corpus, ...@@ -192,6 +194,9 @@ def compute_coocs( corpus,
# 4) INPUT FILTERS (reduce N before O(N²)) # 4) INPUT FILTERS (reduce N before O(N²))
if on_list_id: if on_list_id:
# £TODO listes différentes ou bien une liste pour x et tous les ngrammes pour y
# car permettrait expansion de liste aux plus proches voisins (MacLachlan)
# (avec une matr rectangulaire)
m1 = aliased(NodeNgram) m1 = aliased(NodeNgram)
m2 = aliased(NodeNgram) m2 = aliased(NodeNgram)
...@@ -226,6 +231,10 @@ def compute_coocs( corpus, ...@@ -226,6 +231,10 @@ def compute_coocs( corpus,
) )
if diagonal_filter:
# don't compute ngram with itself
coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)
if start or end: if start or end:
Time = aliased(NodeHyperdata) Time = aliased(NodeHyperdata)
...@@ -268,6 +277,7 @@ def compute_coocs( corpus, ...@@ -268,6 +277,7 @@ def compute_coocs( corpus,
# threshold # threshold
# £TODO adjust COOC_THRESHOLD a posteriori: # £TODO adjust COOC_THRESHOLD a posteriori:
# ex: sometimes 2 sometimes 4 depending on sparsity # ex: sometimes 2 sometimes 4 depending on sparsity
print("COOCS: filtering pairs under threshold:", threshold)
coocs_query = coocs_query.having(ucooc >= threshold) coocs_query = coocs_query.having(ucooc >= threshold)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment