Commit 4bfc0b6c authored by delanoe

Merge branch 'romain-goodies' into unstable

parents 1925c104 f542b69e
@@ -12,14 +12,16 @@ LISTTYPES = {
'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList,
'SPECIFICITY' : WeightedList,
'SPECCLUSION' : WeightedList,
'GENCLUSION' : WeightedList,
'OCCURRENCES' : WeightedIndex, # could be WeightedList
'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedIndex,
'TFIDF-GLOBAL' : WeightedIndex,
'TIRANK-LOCAL' : WeightedIndex, # could be WeightedList
'TIRANK-GLOBAL' : WeightedIndex # could be WeightedList
'TIRANK-GLOBAL' : WeightedIndex, # could be WeightedList
}
# 'OWNLIST' : UnweightedList, # £TODO use this for any term-level tags
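Two new weighted list types are registered here; a quick illustrative lookup (not part of the commit) shows how they resolve:

```python
# sketch: the new ranking node types map to weighted containers
from gargantext.constants import LISTTYPES

LISTTYPES['SPECCLUSION']   # -> WeightedList   (ngram_id -> score)
LISTTYPES['GENCLUSION']    # -> WeightedList
LISTTYPES['MAPLIST']       # -> UnweightedList (a plain set of ngram_ids)
```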
NODETYPES = [
# TODO separate id not array index, read by models.node
@@ -37,7 +39,7 @@ NODETYPES = [
'COOCCURRENCES', # 9
# scores
'OCCURRENCES', # 10
'SPECIFICITY', # 11
'SPECCLUSION', # 11
'CVALUE', # 12
'TFIDF-CORPUS', # 13
'TFIDF-GLOBAL', # 14
@@ -47,6 +49,7 @@ NODETYPES = [
# more scores (sorry!)
'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17
'GENCLUSION', # 18
]
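Since models.node reads the array index as the type id (see the TODO above), the numbered comments must stay in sync with the list; a tiny sanity check, assumed usage rather than code from this commit:

```python
# sketch: list position doubles as the node type id
from gargantext.constants import NODETYPES

assert NODETYPES.index('SPECCLUSION') == 11   # takes over SPECIFICITY's slot
assert NODETYPES.index('GENCLUSION') == 18    # the new slot added here
```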
INDEXED_HYPERDATA = {
@@ -222,12 +225,16 @@ DEFAULT_RANK_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
DEFAULT_RANK_HARD_LIMIT = 5000 # MAINLIST maximum terms abs
# (makes COOCS larger ~ O(N²) /!\)
DEFAULT_COOC_THRESHOLD = 2 # inclusive minimum for COOCS coefs
DEFAULT_COOC_THRESHOLD = 3 # inclusive minimum for COOCS coefs
# (makes COOCS more sparse)
DEFAULT_MAPLIST_MAX = 350 # MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .15 # part of monograms in MAPLIST
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .2 # quota of monograms in MAPLIST
# (vs multigrams = 1-mono)
DEFAULT_MAPLIST_GENCLUSION_RATIO = .6 # quota of top genclusion in MAPLIST
# (vs top specclusion = 1-gen)
DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule
# (initial ngrams number is a power law of this /!\)
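With the maplist defaults above, the budget of 350 terms splits as follows (a worked sketch mirroring the quota computation in do_maplist below):

```python
limit = 350            # DEFAULT_MAPLIST_MAX
genclusion_part = .6   # DEFAULT_MAPLIST_GENCLUSION_RATIO
monograms_part = .2    # DEFAULT_MAPLIST_MONOGRAMS_RATIO

genclusion_limit = round(limit * genclusion_part)   # 210 terms picked by genericity
speclusion_limit = limit - genclusion_limit         # 140 terms picked by specificity

quotas = {'topgen': {}, 'topspec': {}}
quotas['topgen']['monograms']   = round(genclusion_limit * monograms_part)           # 42
quotas['topgen']['multigrams']  = genclusion_limit - quotas['topgen']['monograms']   # 168
quotas['topspec']['monograms']  = round(speclusion_limit * monograms_part)           # 28
quotas['topspec']['multigrams'] = speclusion_limit - quotas['topspec']['monograms']  # 112
```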
@@ -272,7 +279,7 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
# about batch processing...
BATCH_PARSING_SIZE = 256
BATCH_NGRAMSEXTRACTION_SIZE = 1024
BATCH_NGRAMSEXTRACTION_SIZE = 3000 # how many distinct ngrams before INTEGRATE
# Scrapers config
@@ -282,7 +289,7 @@ QUERY_SIZE_N_DEFAULT = 1000
# Grammar rules for chunking
RULE_JJNN = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
RULE_JJDTNN = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
RULE_NPN = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
RULE_TINA = "^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)\
+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?\
,){0,2}?(N.?.?,|\?,)+?)+?)*?$"
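For context, these grammar strings are chunking rules in nltk's tag-pattern syntax; a minimal sketch of how such a rule is typically applied (the 'NP:' label and the sample sentence are assumptions, not from this diff):

```python
import nltk
from gargantext.constants import RULE_NPN

chunker = nltk.RegexpParser('NP: ' + RULE_NPN)   # wrap the bare rule in a chunk label
tagged = nltk.pos_tag(nltk.word_tokenize("effects of climate change on small rivers"))
tree = chunker.parse(tagged)                     # NP subtrees become candidate ngrams
```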
@@ -19,7 +19,7 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
# import will implement the same text cleaning procedures as toolchain
from gargantext.util.toolchain.parsing import normalize_chars
from gargantext.util.toolchain.ngrams_extraction import normalize_terms
from gargantext.util.toolchain.ngrams_extraction import normalize_forms
from sqlalchemy.sql import exists
from os import path
......
from gargantext.util.languages import languages
from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_JJDTNN
from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_NPN
import nltk
import re
......
@@ -39,11 +39,11 @@ def do_mainlist(corpus,
# retrieve helper nodes if not provided
if not ranking_scores_id:
ranking_scores_id = session.query(Node.id).filter(
Node.typename == "TFIDF-GLOBAL",
Node.typename == "TIRANK-GLOBAL",
Node.parent_id == corpus.id
).first()
if not ranking_scores_id:
raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")
raise ValueError("MAINLIST: TIRANK node needed for mainlist creation")
if not stoplist_id:
stoplist_id = session.query(Node.id).filter(
......
@@ -9,37 +9,49 @@ from gargantext.util.db_cache import cache
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc, asc
from gargantext.constants import DEFAULT_MAPLIST_MAX,\
DEFAULT_MAPLIST_GENCLUSION_RATIO,\
DEFAULT_MAPLIST_MONOGRAMS_RATIO
def do_maplist(corpus,
overwrite_id = None,
mainlist_id = None,
specificity_id = None,
specclusion_id = None,
genclusion_id = None,
grouplist_id = None,
limit=DEFAULT_MAPLIST_MAX,
genclusion_part=DEFAULT_MAPLIST_GENCLUSION_RATIO,
monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO
):
'''
According to Specificities and mainlist
According to Genericity/Specificity and mainlist
Parameters:
- mainlist_id (starting point, already cleaned of stoplist terms)
- specificity_id (ranking factor)
- specclusion_id (ngram inclusion by cooc specificity -- ranking factor)
- genclusion_id (ngram inclusion by cooc genericity -- ranking factor)
- grouplist_id (filtering grouped ones)
- overwrite_id: optional if preexisting MAPLIST node to overwrite
+ 2 constants to modulate the terms choice
+ 3 params to modulate the terms choice
- limit for the number of terms to pick
- monograms_part: a ratio of terms with only one lexical unit to keep
(multigrams quota = limit * (1-monograms_part))
- genclusion_part: a ratio of terms to pick by top genericity ranking
(speclusion quota = limit * (1-genclusion_part))
'''
if not (mainlist_id and specificity_id and grouplist_id):
raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")
if not (mainlist_id and specclusion_id and genclusion_id and grouplist_id):
raise ValueError("Please provide mainlist_id, specclusion_id, genclusion_id and grouplist_id")
monograms_limit = round(limit * monograms_part)
multigrams_limit = limit - monograms_limit
print("MAPLIST: monograms_limit =", monograms_limit)
print("MAPLIST: multigrams_limit = ", multigrams_limit)
quotas = {'topgen':{}, 'topspec':{}}
genclusion_limit = round(limit * genclusion_part)
speclusion_limit = limit - genclusion_limit
quotas['topgen']['monograms'] = round(genclusion_limit * monograms_part)
quotas['topgen']['multigrams'] = genclusion_limit - quotas['topgen']['monograms']
quotas['topspec']['monograms'] = round(speclusion_limit * monograms_part)
quotas['topspec']['multigrams'] = speclusion_limit - quotas['topspec']['monograms']
print("MAPLIST quotas:", quotas)
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
@@ -54,11 +66,19 @@ def do_maplist(corpus,
)
ScoreSpec=aliased(NodeNgram)
# specificity-ranked
query = (session.query(ScoreSpec.ngram_id)
ScoreGen=aliased(NodeNgram)
# ngram with both ranking factors spec and gen
query = (session.query(
ScoreSpec.ngram_id,
ScoreSpec.weight,
ScoreGen.weight,
Ngram.n
)
.join(Ngram, Ngram.id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specificity_id)
.join(ScoreGen, ScoreGen.ngram_id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specclusion_id)
.filter(ScoreGen.node_id == genclusion_id)
# we want only terms within mainlist
.join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
@@ -68,36 +88,99 @@ def do_maplist(corpus,
.outerjoin(IsSubform,
IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
.filter(IsSubform.c.ngram2_id == None)
)
# TODO: move these 2 pools up to mainlist selection
top_monograms = (query
.filter(Ngram.n == 1)
.order_by(asc(ScoreSpec.weight))
.limit(monograms_limit)
.all()
)
top_multigrams = (query
.filter(Ngram.n >= 2)
# specificity-ranked
.order_by(desc(ScoreSpec.weight))
.limit(multigrams_limit)
.all()
)
obtained_mono = len(top_monograms)
obtained_multi = len(top_multigrams)
obtained_total = obtained_mono + obtained_multi
# print("MAPLIST: top_monograms =", obtained_mono)
# print("MAPLIST: top_multigrams = ", obtained_multi)
)
# format in scored_ngrams array:
# -------------------------------
# [(37723, 8.428, 14.239, 3 ), etc]
# ngramid wspec wgen nwords
scored_ngrams = query.all()
n_ngrams = len(scored_ngrams)
if n_ngrams == 0:
raise ValueError("No ngrams in cooc table ?")
# results, with same structure as quotas
chosen_ngrams = {
'topgen':{'monograms':[], 'multigrams':[]},
'topspec':{'monograms':[], 'multigrams':[]}
}
# specificity and genericity are rather inversely correlated,
# but occasionally the same ngram can rank well in both
# => we use a lookup table to avoid picking an ngram twice
already_gotten_ngramids = {}
# 2 loops to fill spec-clusion then gen-clusion quotas
# (1st loop uses order from DB, 2nd loop uses our own sort at end of 1st)
for rkr in ['topspec', 'topgen']:
got_enough_mono = False
got_enough_multi = False
all_done = False
i = -1
while((not all_done) and (not (got_enough_mono and got_enough_multi))):
# retrieve sorted ngram #i
i += 1
(ng_id, wspec, wgen, nwords) = scored_ngrams[i]
# before any continue case, check whether we just reached the last index
all_done = (i+1 >= n_ngrams)
if ng_id in already_gotten_ngramids:
continue
# NB: nwords could be replaced by a simple search on r' '
if nwords == 1:
if got_enough_mono:
continue
else:
# add ngram to results and lookup
chosen_ngrams[rkr]['monograms'].append(ng_id)
already_gotten_ngramids[ng_id] = True
# multi
else:
if got_enough_multi:
continue
else:
# add ngram to results and lookup
chosen_ngrams[rkr]['multigrams'].append(ng_id)
already_gotten_ngramids[ng_id] = True
got_enough_mono = (len(chosen_ngrams[rkr]['monograms']) >= quotas[rkr]['monograms'])
got_enough_multi = (len(chosen_ngrams[rkr]['multigrams']) >= quotas[rkr]['multigrams'])
# at the end of the first loop we just need to sort all by the second ranker (gen)
scored_ngrams = sorted(scored_ngrams, key=lambda ng_infos: ng_infos[2], reverse=True)
obtained_spec_mono = len(chosen_ngrams['topspec']['monograms'])
obtained_spec_multi = len(chosen_ngrams['topspec']['multigrams'])
obtained_gen_mono = len(chosen_ngrams['topgen']['monograms'])
obtained_gen_multi = len(chosen_ngrams['topgen']['multigrams'])
obtained_total = obtained_spec_mono \
+ obtained_spec_multi \
+ obtained_gen_mono \
+ obtained_gen_multi
print("MAPLIST: top_spec_monograms =", obtained_spec_mono)
print("MAPLIST: top_spec_multigrams =", obtained_spec_multi)
print("MAPLIST: top_gen_monograms =", obtained_gen_mono)
print("MAPLIST: top_gen_multigrams =", obtained_gen_multi)
print("MAPLIST: kept %i ngrams in total " % obtained_total)
obtained_data = chosen_ngrams['topspec']['monograms'] \
+ chosen_ngrams['topspec']['multigrams'] \
+ chosen_ngrams['topgen']['monograms'] \
+ chosen_ngrams['topgen']['multigrams']
# NEW MAPLIST NODE
# -----------------
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus': corpus.id,
'limit' : limit,
'monograms_part' : monograms_part,
'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else 0
'monograms_part' : monograms_part,
'genclusion_part' : genclusion_part,
}
if overwrite_id:
# overwrite pre-existing node
@@ -118,9 +201,7 @@ def do_maplist(corpus,
the_id = the_maplist.id
# create UnweightedList object and save (=> new NodeNgram rows)
datalist = UnweightedList(
[res.ngram_id for res in top_monograms + top_multigrams]
)
datalist = UnweightedList(obtained_data)
# save
datalist.save(the_id)
......
@@ -10,8 +10,8 @@ from .ngram_groups import compute_groups
from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
from .list_main import do_mainlist
from .ngram_coocs import compute_coocs
from .metric_specificity import compute_specificity
from .list_map import do_maplist # TEST
from .metric_specgen import compute_specgen
from .list_map import do_maplist
from .mail_notification import notify_owner
from gargantext.util.db import session
from gargantext.models import Node
@@ -136,22 +136,26 @@ def parse_extract_indexhyperdata(corpus):
# => used for doc <=> ngram association
# ------------
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)*
coocs = compute_coocs(corpus,
on_list_id = mainlist_id,
groupings_id = group_id,
just_pass_result = True)
just_pass_result = True,
diagonal_filter = False) # preserving the diagonal
# (useful for spec/gen)
print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus,cooc_matrix = coocs)
# -> specclusion/genclusion: compute + write (2 Nodes + 2 lists in NodeNgram)
(spec_id, gen_id) = compute_specgen(corpus,cooc_matrix = coocs)
# no need here for subforms because cooc already counted them in mainform
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
print('CORPUS #%d: [%s] new spec-clusion node #%i' % (corpus.id, t(), spec_id))
print('CORPUS #%d: [%s] new gen-clusion node #%i' % (corpus.id, t(), gen_id))
# maplist: compute + write (to Node and NodeNgram)
map_id = do_maplist(corpus,
mainlist_id = mainlist_id,
specificity_id=spec_id,
specclusion_id=spec_id,
genclusion_id=gen_id,
grouplist_id=group_id
)
print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
@@ -187,7 +191,7 @@ def recount(corpus):
- ndocs
- ti_rank
- coocs
- specificity
- specclusion/genclusion
- tfidf
NB: no new extraction, no list change, just the metrics
@@ -208,10 +212,15 @@ def recount(corpus):
old_tirank_id = None
try:
old_spec_id = corpus.children("SPECIFICITY").first().id
old_spec_id = corpus.children("SPECCLUSION").first().id
except:
old_spec_id = None
try:
old_gen_id = corpus.children("GENCLUSION").first().id
except:
old_gen_id = None
try:
old_ltfidf_id = corpus.children("TFIDF-CORPUS").first().id
except:
@@ -254,11 +263,13 @@ def recount(corpus):
just_pass_result = True)
print('RECOUNT #%d: [%s] updated mainlist coocs for specif rank' % (corpus.id, t()))
# -> specificity: compute + write (=> NodeNgram)
spec_id = compute_specificity(corpus,cooc_matrix = coocs, overwrite_id = old_spec_id)
# -> specclusion/genclusion: compute + write (2 Nodes + 2 lists in NodeNgram)
(spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs,
spec_overwrite_id = old_spec_id, gen_overwrite_id = old_gen_id)
print('RECOUNT #%d: [%s] updated specificity node #%i' % (corpus.id, t(), spec_id))
print('RECOUNT #%d: [%s] updated spec-clusion node #%i' % (corpus.id, t(), spec_id))
print('RECOUNT #%d: [%s] updated gen-clusion node #%i' % (corpus.id, t(), gen_id))
print('RECOUNT #%d: [%s] FINISHED metric recounts' % (corpus.id, t()))
......
"""
Computes specificity and genericity metrics from the ngram cooccurrence matrix.
+ SAVE => 2 WeightedLists => NodeNgram
"""
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame
from numpy import diag
def round3(floating_number):
"""
Rounds a floating-point number to 3 decimals
Good when we don't need so much detail in the data written to the DB
"""
return float("%.3f" % floating_number)
def compute_specgen(corpus, cooc_id=None, cooc_matrix=None,
spec_overwrite_id = None, gen_overwrite_id = None):
'''
Compute genericity/specificity:
P(j|i) = N(ij) / N(ii)
P(i|j) = N(ij) / N(jj)
Gen(i) = Sum{j} P(i|j)
Spec(i) = Sum{j} P(j|i)
Gen-clusion(i) = Spec(i) + Gen(i)
Spec-clusion(i) = Spec(i) - Gen(i)
Parameters:
- cooc_id: id of a cooccurrences node to use as base (required unless cooc_matrix is given)
- spec_overwrite_id: optional preexisting specificity node to overwrite
- gen_overwrite_id: optional preexisting genericity node to overwrite
'''
matrix = defaultdict(lambda : defaultdict(float))
if cooc_id is None and cooc_matrix is None:
raise TypeError("compute_specgen: needs a cooc_id or cooc_matrix param")
elif cooc_id:
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: cooc already filtered on mainlist_id at creation
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
# matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
elif cooc_matrix:
# copy WeightedMatrix into local matrix structure
for (ngram1_id, ngram2_id) in cooc_matrix.items:
w = cooc_matrix.items[(ngram1_id, ngram2_id)]
# ------- 8< --------------------------------------------
# temporary hack to ignore rows/columns where the diagonal == 0
# £TODO find why they exist and then remove this snippet
if (((ngram1_id,ngram1_id) not in cooc_matrix.items) or
((ngram2_id,ngram2_id) not in cooc_matrix.items)):
continue
# ------- 8< --------------------------------------------
matrix[ngram1_id][ngram2_id] = w
nb_ngrams = len(matrix)
print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
# example corpus (7 docs, 8 nouns)
# --------------------------------
# "The report says that humans are animals."
# "The report says that rivers are full of water."
# "The report says that humans like to make war."
# "The report says that animals must eat food."
# "The report says that animals drink water."
# "The report says that humans like food and water."
# "The report says that grass is food for some animals."
#===========================================================================
cooc_counts = DataFrame(matrix).fillna(0)
# cooc_counts matrix
# ------------------
# animals food grass humans report rivers war water
# animals 4 2 1 1 4 0 0 1
# food 2 3 1 1 3 0 0 1
# grass 1 1 1 0 1 0 0 0
# humans 1 1 0 3 3 0 1 1
# report 4 3 1 3 7 1 1 3
# rivers 0 0 0 0 1 1 0 1
# war 0 0 0 1 1 0 1 0
# water 1 1 0 1 3 1 0 3
#===========================================================================
# conditional probas: each cell N(ij) divided by its column's diagonal N(jj)
diagonal = list(diag(cooc_counts))
# debug
# print("WARN diag: ", diagonal)
# print("WARN diag: =================== 0 in diagonal ?\n",
# 0 in diagonal ? "what ??? zeros in the diagonal :/" : "ok no zeros",
# "\n===================")
p_col_given_line = cooc_counts / list(diag(cooc_counts))
# p_col_given_line
# ----------------
# animals food grass humans report rivers war water
# animals 1.0 0.7 1.0 0.3 0.6 0.0 0.0 0.3
# food 0.5 1.0 1.0 0.3 0.4 0.0 0.0 0.3
# grass 0.2 0.3 1.0 0.0 0.1 0.0 0.0 0.0
# humans 0.2 0.3 0.0 1.0 0.4 0.0 1.0 0.3
# report 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
# rivers 0.0 0.0 0.0 0.0 0.1 1.0 0.0 0.3
# war 0.0 0.0 0.0 0.3 0.1 0.0 1.0 0.0
# water 0.2 0.3 0.0 0.3 0.4 1.0 0.0 1.0
#===========================================================================
# total per lines (<=> genericity)
Gen = p_col_given_line.sum(axis=1)
# Gen.sort_values(ascending=False)
# ---
# report 8.0
# animals 3.9
# food 3.6
# water 3.3
# humans 3.3
# grass 1.7
# war 1.5
# rivers 1.5
#===========================================================================
# total columnwise (<=> specificity)
Spec = p_col_given_line.sum(axis=0)
# Spec.sort_values(ascending=False)
# ----
# grass 4.0
# food 3.7
# water 3.3
# humans 3.3
# report 3.3
# animals 3.2
# war 3.0
# rivers 3.0
#===========================================================================
# our "inclusion by specificity" metric
Specclusion = Spec-Gen
# Specclusion.sort_values(ascending=False)
# -----------
# grass 2.3
# war 1.5
# rivers 1.5
# food 0.1
# humans -0.0
# water -0.0
# animals -0.7
# report -4.7
#===========================================================================
# our "inclusion by genericity" metric
Genclusion = Spec+Gen
# Genclusion.sort_values(ascending=False)
# -----------
# report 11.3
# food 7.3
# animals 7.2
# water 6.7
# humans 6.7
# grass 5.7
# war 4.5
# rivers 4.5
#===========================================================================
# specclusion node
if spec_overwrite_id:
# overwrite pre-existing id
the_spec_id = spec_overwrite_id
session.query(NodeNgram).filter(NodeNgram.node_id==the_spec_id).delete()
session.commit()
else:
specnode = corpus.add_child(
typename = "SPECCLUSION",
name = "Specclusion (in:%s)" % corpus.id
)
session.add(specnode)
session.commit()
the_spec_id = specnode.id
if not Specclusion.empty:
data = WeightedList(
zip( Specclusion.index.tolist()
, [v for v in map(round3, Specclusion.values.tolist())]
)
)
data.save(the_spec_id)
else:
print("WARNING: had no terms in COOCS => empty SPECCLUSION node")
#===========================================================================
# genclusion node
if gen_overwrite_id:
the_gen_id = gen_overwrite_id
session.query(NodeNgram).filter(NodeNgram.node_id==the_gen_id).delete()
session.commit()
else:
gennode = corpus.add_child(
typename = "GENCLUSION",
name = "Genclusion (in:%s)" % corpus.id
)
session.add(gennode)
session.commit()
the_gen_id = gennode.id
if not Genclusion.empty:
data = WeightedList(
zip( Genclusion.index.tolist()
, [v for v in map(round3, Genclusion.values.tolist())]
)
)
data.save(the_gen_id)
else:
print("WARNING: had no terms in COOCS => empty GENCLUSION node")
#===========================================================================
return(the_spec_id, the_gen_id)
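A usage sketch for the new module (hypothetical driver code; my_corpus, my_cooc_id, old_spec_id and old_gen_id are placeholders):

```python
from gargantext.util.toolchain.metric_specgen import compute_specgen

# either from a stored COOCCURRENCES node...
spec_id, gen_id = compute_specgen(my_corpus, cooc_id=my_cooc_id)

# ...or from a WeightedMatrix still in memory, overwriting previous nodes
spec_id, gen_id = compute_specgen(my_corpus, cooc_matrix=coocs,
                                  spec_overwrite_id=old_spec_id,
                                  gen_overwrite_id=old_gen_id)
```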
"""
Computes a specificity metric from the ngram cooccurrence matrix.
+ SAVE => WeightedList => NodeNgram
"""
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame
import pandas as pd
def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id = None):
'''
Compute the specificity, simple calculus.
Parameters:
- cooc_id: mandatory id of a cooccurrences node to use as base
- overwrite_id: optional preexisting specificity node to overwrite
'''
matrix = defaultdict(lambda : defaultdict(float))
if cooc_id == None and cooc_matrix == None:
raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
elif cooc_id:
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: cooc already filtered on mainlist_id at creation
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
elif cooc_matrix:
# copy WeightedMatrix into local matrix structure
for (ngram1_id, ngram2_id) in cooc_matrix.items:
w = cooc_matrix.items[(ngram1_id, ngram2_id)]
matrix[ngram1_id][ngram2_id] = w
nb_ngrams = len(matrix)
print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
x = DataFrame(matrix).fillna(0)
# proba (x/y) (<= we divide each row by its total)
x = x / x.sum(axis=1)
# vectorisation
# d:Matrix => v: Vector (len = nb_ngrams)
# v = d.sum(axis=1) (- lui-même)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top included or excluded
#n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific (asc is spec, desc is generic)
v = ( xs - ys) / ( 2 * (x.shape[0] - 1))
## d ##
#######
# Grenelle biodiversité kilomètres site élus île
# Grenelle 0 0 4 0 0 0
# biodiversité 0 0 0 0 4 0
# kilomètres 4 0 0 0 4 0
# site 0 0 0 0 4 6
# élus 0 4 4 4 0 0
# île 0 0 0 6 0 0
## d.sum(axis=1) ##
###################
# Grenelle 4
# biodiversité 4
# kilomètres 8
# site 10
# élus 12
# île 6
# temporary result
# ----------------
# for now we use the row sums as the specificity ranking
# (**same** order as with the pre-refactoring formula, but a simpler computation)
# TODO check the mathematical AND semantic coherence of this indicator
#v.sort_values(inplace=True)
# [ ('biodiversité' , 0.333 ),
# ('Grenelle' , 0.5 ),
# ('île' , 0.599 ),
# ('kilomètres' , 1.333 ),
# ('site' , 1.333 ),
# ('élus' , 1.899 ) ]
# ----------------
# specificity node
if overwrite_id:
# overwrite pre-existing id
the_id = overwrite_id
session.query(NodeNgram).filter(NodeNgram.node_id==the_id).delete()
session.commit()
else:
specnode = corpus.add_child(
typename = "SPECIFICITY",
name = "Specif (in:%s)" % corpus.id
)
session.add(specnode)
session.commit()
the_id = specnode.id
# print(v)
pd.options.display.float_format = '${:,.2f}'.format
if not v.empty:
data = WeightedList(
zip( v.index.tolist()
, v.values.tolist()[0]
)
)
data.save(the_id)
else:
print("WARNING: had no terms in COOCS => empty SPECIFICITY node")
return(the_id)
@@ -18,7 +18,8 @@ def compute_coocs( corpus,
stoplist_id = None,
start = None,
end = None,
symmetry_filter = False):
symmetry_filter = False,
diagonal_filter = True):
"""
Count how often some extracted terms appear
together in a small context (document)
@@ -55,6 +56,9 @@
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
this convention: "2001-01-01" aka "%Y-%m-%d")
- symmetry_filter: prevent calculating where ngram1_id > ngram2_id
- diagonal_filter: prevent calculating where ngram1_id == ngram2_id
(deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
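The new diagonal_filter flag matters because compute_specgen normalizes each column by its diagonal N(ii); the toolchain therefore calls the function with the diagonal preserved, as a sketch (mirrors parse_extract_indexhyperdata above):

```python
coocs = compute_coocs(corpus,
                      on_list_id=mainlist_id,
                      groupings_id=group_id,
                      just_pass_result=True,
                      diagonal_filter=False)   # keep N(i,i): specgen divides by it
```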
@@ -69,7 +73,7 @@
JOIN nodes_ngrams AS idxb
ON idxa.node_id = idxb.node_id <== that's cooc
---------------------------------
AND idxa.ngram_id <> idxb.ngram_id
AND idxa.ngram_id <> idxb.ngram_id (diagonal_filter)
AND idxa.node_id = MY_DOC ;
on entire corpus
@@ -152,16 +156,14 @@
ucooc
# for debug (2/4)
#, Xngram.terms.label("w_x")
#, Yngram.terms.label("w_y")
# , Xngram.terms.label("w_x")
# , Yngram.terms.label("w_y")
)
.join(Yindex, Xindex.node_id == Yindex.node_id ) # <- by definition of cooc
.join(Node, Node.id == Xindex.node_id) # <- b/c within corpus
.filter(Node.parent_id == corpus.id) # <- b/c within corpus
.filter(Node.typename == "DOCUMENT") # <- b/c within corpus
.filter(Xindex_ngform_id != Yindex_ngform_id) # <- b/c not with itself
)
# outerjoin the synonyms if needed
@@ -179,12 +181,12 @@
.group_by(
Xindex_ngform_id, Yindex_ngform_id # <- what we're counting
# for debug (3/4)
#,"w_x", "w_y"
# ,"w_x", "w_y"
)
# for debug (4/4)
#.join(Xngram, Xngram.id == Xindex_ngform_id)
#.join(Yngram, Yngram.id == Yindex_ngform_id)
# .join(Xngram, Xngram.id == Xindex_ngform_id)
# .join(Yngram, Yngram.id == Yindex_ngform_id)
.order_by(ucooc)
)
@@ -192,6 +194,9 @@
# 4) INPUT FILTERS (reduce N before O(N²))
if on_list_id:
# £TODO different lists, or one list for x and all ngrams for y,
# which would allow expanding the list to its nearest neighbors (MacLachlan)
# (with a rectangular matrix)
m1 = aliased(NodeNgram)
m2 = aliased(NodeNgram)
@@ -226,6 +231,10 @@
)
if diagonal_filter:
# don't compute ngram with itself
coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)
if start or end:
Time = aliased(NodeHyperdata)
@@ -268,6 +277,7 @@
# threshold
# £TODO adjust COOC_THRESHOLD a posteriori:
# ex: sometimes 2 sometimes 4 depending on sparsity
print("COOCS: filtering pairs under threshold:", threshold)
coocs_query = coocs_query.having(ucooc >= threshold)
......
@@ -77,7 +77,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
continue
# get ngrams
for ngram in ngramsextractor.extract(value):
tokens = tuple(token[0] for token in ngram)
tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams:
# ex tokens = ["very", "cool", "example"]
@@ -90,7 +90,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
subterms = [tokens]
for seqterm in subterms:
ngram = normalize_terms(' '.join(seqterm))
ngram = ' '.join(seqterm)
if len(ngram) > 1:
# doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1
@@ -118,7 +118,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
raise error
def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
def normalize_forms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
"""
Removes unwanted leading and trailing punctuation
AND optionally puts everything to lowercase
@@ -127,14 +127,14 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
(benefits from normalize_chars upstream, so there are fewer cases to consider)
"""
# print('normalize_terms IN: "%s"' % term_str)
term_str = sub(r'^[-",;/%(){}\\\[\]\.\' ]+', '', term_str)
term_str = sub(r'[-",;/%(){}\\\[\]\.\' ]+$', '', term_str)
# print('normalize_forms IN: "%s"' % term_str)
term_str = sub(r'^[-\'",;/%(){}\\\[\]\. ©]+', '', term_str)
term_str = sub(r'[-\'",;/%(){}\\\[\]\. ©]+$', '', term_str)
if do_lowercase:
term_str = term_str.lower()
# print('normalize_terms OUT: "%s"' % term_str)
# print('normalize_forms OUT: "%s"' % term_str)
return term_str
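A few expected input/output pairs for the widened character class (hand-derived from the regexes above, assuming the lowercase flag defaults to on):

```python
normalize_forms("(micro-array),")   # -> "micro-array"  outer punctuation stripped,
                                    #    inner hyphen kept
normalize_forms("©Elsevier")        # -> "elsevier"     © is newly stripped here
normalize_forms("'water'")          # -> "water"        single quotes too
```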
......
@@ -57,7 +57,7 @@ class CSVLists(APIView):
params in request.GET:
onto_corpus: the corpus whose lists are getting patched
params in request.FILES:
params in request.data:
csvfile: the csv file
/!\ We assume we checked the file size client-side before upload
......