Commit ae5353f2 authored by delanoe

Merge branch 'romain-goodies' into unstable

parents 07653e7d 74693a64
@@ -12,14 +12,16 @@ LISTTYPES = {
     'STOPLIST'     : UnweightedList,
     'MAINLIST'     : UnweightedList,
     'MAPLIST'      : UnweightedList,
-    'SPECIFICITY'  : WeightedList,
+    'SPECCLUSION'  : WeightedList,
+    'GENCLUSION'   : WeightedList,
     'OCCURRENCES'  : WeightedIndex,     # could be WeightedList
     'COOCCURRENCES': WeightedMatrix,
     'TFIDF-CORPUS' : WeightedIndex,
     'TFIDF-GLOBAL' : WeightedIndex,
     'TIRANK-LOCAL' : WeightedIndex,     # could be WeightedList
-    'TIRANK-GLOBAL': WeightedIndex      # could be WeightedList
+    'TIRANK-GLOBAL': WeightedIndex,     # could be WeightedList
 }
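Each typename above maps to the in-memory class used to (de)serialize that node's ngram rows. A minimal sketch of the lookup, assuming (as seen elsewhere in this commit) that WeightedList accepts an iterable of (ngram_id, weight) pairs; the ids here are hypothetical:

```python
from gargantext.constants import LISTTYPES

list_cls = LISTTYPES['SPECCLUSION']                   # -> WeightedList
scores   = list_cls([(37723, 8.428), (4110, 2.041)])  # hypothetical ngram ids
```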
+# 'OWNLIST'     : UnweightedList,     # £TODO use this for any term-level tags

 NODETYPES = [
     # TODO separate id not array index, read by models.node
@@ -37,7 +39,7 @@ NODETYPES = [
     'COOCCURRENCES',   # 9
     # scores
     'OCCURRENCES',     # 10
-    'SPECIFICITY',     # 11
+    'SPECCLUSION',     # 11
     'CVALUE',          # 12
     'TFIDF-CORPUS',    # 13
     'TFIDF-GLOBAL',    # 14
@@ -47,6 +49,7 @@ NODETYPES = [
     # more scores (sorry!)
     'TIRANK-LOCAL',    # 16
     'TIRANK-GLOBAL',   # 17
+    'GENCLUSION',      # 18
 ]

 INDEXED_HYPERDATA = {
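As the TODO above notes, a node type's id is its index in this array, so new typenames must be appended at the end to keep ids already stored in the DB valid; hence 'GENCLUSION' lands at index 18:

```python
from gargantext.constants import NODETYPES

assert NODETYPES.index('SPECCLUSION') == 11   # replaces 'SPECIFICITY' in place
assert NODETYPES.index('GENCLUSION')  == 18   # new type appended at the end
```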
@@ -222,12 +225,16 @@ DEFAULT_RANK_CUTOFF_RATIO = .75   # MAINLIST maximum terms in %
 DEFAULT_RANK_HARD_LIMIT = 5000    # MAINLIST maximum terms abs
                                   # (makes COOCS larger ~ O(N²) /!\)
-DEFAULT_COOC_THRESHOLD = 2        # inclusive minimum for COOCS coefs
+DEFAULT_COOC_THRESHOLD = 3        # inclusive minimum for COOCS coefs
                                   # (makes COOCS more sparse)
 DEFAULT_MAPLIST_MAX = 350         # MAPLIST maximum terms
-DEFAULT_MAPLIST_MONOGRAMS_RATIO = .15   # part of monograms in MAPLIST
+DEFAULT_MAPLIST_MONOGRAMS_RATIO = .2    # quota of monograms in MAPLIST
+                                        # (vs multigrams = 1-mono)
+DEFAULT_MAPLIST_GENCLUSION_RATIO = .6   # quota of top genclusion in MAPLIST
+                                        # (vs top specclusion = 1-gen)
 DEFAULT_MAX_NGRAM_LEN = 7         # limit used after POStagging rule
                                   # (initial ngrams number is a power law of this /!\)
@@ -272,7 +279,7 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
 # about batch processing...
 BATCH_PARSING_SIZE = 256
-BATCH_NGRAMSEXTRACTION_SIZE = 1024
+BATCH_NGRAMSEXTRACTION_SIZE = 3000   # how many distinct ngrams before INTEGRATE
 # Scrapers config
@@ -282,7 +289,7 @@ QUERY_SIZE_N_DEFAULT = 1000

 # Grammar rules for chunking
 RULE_JJNN   = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
-RULE_JJDTNN = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
+RULE_NPN    = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
 RULE_TINA = "^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)\
             +?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?\
             ,){0,2}?(N.?.?,|\?,)+?)+?)*?$"
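These rule bodies are chunk patterns in NLTK's RegexpParser syntax (RULE_NPN is the renamed RULE_JJDTNN, matching noun phrases joined by prepositions). A minimal sketch of how such a rule chunks a POS-tagged sentence; the "NP:" label wrapper is an assumption, the toolchain may wrap the rule differently:

```python
import nltk

RULE_JJNN = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
parser = nltk.RegexpParser("NP: " + RULE_JJNN)   # hypothetical chunk label

tagged = [("complex", "JJ"), ("systems", "NNS"), ("analysis", "NN")]
print(parser.parse(tagged))
# -> (S (NP complex/JJ systems/NNS analysis/NN))
```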
@@ -19,7 +19,7 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
 # import will implement the same text cleaning procedures as toolchain
 from gargantext.util.toolchain.parsing import normalize_chars
-from gargantext.util.toolchain.ngrams_extraction import normalize_terms
+from gargantext.util.toolchain.ngrams_extraction import normalize_forms

 from sqlalchemy.sql import exists
 from os import path
...
 from gargantext.util.languages import languages
-from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_JJDTNN
+from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_NPN

 import nltk
 import re
...
@@ -39,11 +39,11 @@ def do_mainlist(corpus,
     # retrieve helper nodes if not provided
     if not ranking_scores_id:
         ranking_scores_id = session.query(Node.id).filter(
-            Node.typename == "TFIDF-GLOBAL",
+            Node.typename == "TIRANK-GLOBAL",
             Node.parent_id == corpus.id
         ).first()
         if not ranking_scores_id:
-            raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")
+            raise ValueError("MAINLIST: TIRANK node needed for mainlist creation")
     if not stoplist_id:
         stoplist_id = session.query(Node.id).filter(
...
@@ -9,37 +9,49 @@ from gargantext.util.db_cache import cache
 from gargantext.util.lists import UnweightedList
 from sqlalchemy import desc, asc
 from gargantext.constants import DEFAULT_MAPLIST_MAX,\
+                                 DEFAULT_MAPLIST_GENCLUSION_RATIO,\
                                  DEFAULT_MAPLIST_MONOGRAMS_RATIO

 def do_maplist(corpus,
                overwrite_id   = None,
                mainlist_id    = None,
-               specificity_id = None,
+               specclusion_id = None,
+               genclusion_id  = None,
                grouplist_id   = None,
                limit=DEFAULT_MAPLIST_MAX,
+               genclusion_part=DEFAULT_MAPLIST_GENCLUSION_RATIO,
               monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO
               ):
     '''
-    According to Specificities and mainlist
+    According to Genericity/Specificity and mainlist

     Parameters:
       - mainlist_id (starting point, already cleaned of stoplist terms)
-      - specificity_id (ranking factor)
+      - specclusion_id (ngram inclusion by cooc specificity -- ranking factor)
+      - genclusion_id  (ngram inclusion by cooc genericity  -- ranking factor)
       - grouplist_id (filtering grouped ones)
       - overwrite_id: optional if preexisting MAPLIST node to overwrite
-      + 2 constants to modulate the terms choice
+      + 3 params to modulate the terms choice
        - limit for the amount of picked terms
        - monograms_part: a ratio of terms with only one lexical unit to keep
+          (multigrams quota = limit * (1-monograms_part))
+       - genclusion_part: a ratio of terms taken from the top genclusion ranking
+          (speclusion quota = limit * (1-genclusion_part))
     '''
-    if not (mainlist_id and specificity_id and grouplist_id):
-        raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")
+    if not (mainlist_id and specclusion_id and genclusion_id and grouplist_id):
+        raise ValueError("Please provide mainlist_id, specclusion_id, genclusion_id and grouplist_id")

-    monograms_limit = round(limit * monograms_part)
-    multigrams_limit = limit - monograms_limit
-    print("MAPLIST: monograms_limit =", monograms_limit)
-    print("MAPLIST: multigrams_limit = ", multigrams_limit)
+    quotas = {'topgen':{}, 'topspec':{}}
+    genclusion_limit = round(limit * genclusion_part)
+    speclusion_limit = limit - genclusion_limit
+    quotas['topgen']['monograms']   = round(genclusion_limit * monograms_part)
+    quotas['topgen']['multigrams']  = genclusion_limit - quotas['topgen']['monograms']
+    quotas['topspec']['monograms']  = round(speclusion_limit * monograms_part)
+    quotas['topspec']['multigrams'] = speclusion_limit - quotas['topspec']['monograms']
+    print("MAPLIST quotas:", quotas)

     #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
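With the new defaults (limit=350, genclusion_part=.6, monograms_part=.2), the quota arithmetic above works out as follows; a standalone check:

```python
limit, genclusion_part, monograms_part = 350, .6, .2

genclusion_limit = round(limit * genclusion_part)   # 210 terms from the genericity ranking
speclusion_limit = limit - genclusion_limit         # 140 terms from the specificity ranking

quotas = {'topgen': {}, 'topspec': {}}
quotas['topgen']['monograms']   = round(genclusion_limit * monograms_part)           # 42
quotas['topgen']['multigrams']  = genclusion_limit - quotas['topgen']['monograms']   # 168
quotas['topspec']['monograms']  = round(speclusion_limit * monograms_part)           # 28
quotas['topspec']['multigrams'] = speclusion_limit - quotas['topspec']['monograms']  # 112
print(quotas)
```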
@@ -54,11 +66,19 @@ def do_maplist(corpus,
     )

     ScoreSpec=aliased(NodeNgram)
+    ScoreGen=aliased(NodeNgram)

-    # specificity-ranked
-    query = (session.query(ScoreSpec.ngram_id)
+    # ngram with both ranking factors spec and gen
+    query = (session.query(
+                ScoreSpec.ngram_id,
+                ScoreSpec.weight,
+                ScoreGen.weight,
+                Ngram.n
+             )
             .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
-            .filter(ScoreSpec.node_id == specificity_id)
+            .join(ScoreGen, ScoreGen.ngram_id == ScoreSpec.ngram_id)
+            .filter(ScoreSpec.node_id == specclusion_id)
+            .filter(ScoreGen.node_id == genclusion_id)

             # we want only terms within mainlist
             .join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
@@ -68,36 +88,99 @@ def do_maplist(corpus,
         .outerjoin(IsSubform,
                    IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
         .filter(IsSubform.c.ngram2_id == None)
-    )
-
-    # TODO: move these 2 pools up to mainlist selection
-    top_monograms = (query
-        .filter(Ngram.n == 1)
-        .order_by(asc(ScoreSpec.weight))
-        .limit(monograms_limit)
-        .all()
-    )
-
-    top_multigrams = (query
-        .filter(Ngram.n >= 2)
-        .order_by(desc(ScoreSpec.weight))
-        .limit(multigrams_limit)
-        .all()
-    )
-    obtained_mono = len(top_monograms)
-    obtained_multi = len(top_multigrams)
-    obtained_total = obtained_mono + obtained_multi
-    # print("MAPLIST: top_monograms =", obtained_mono)
-    # print("MAPLIST: top_multigrams = ", obtained_multi)
+
+        # specificity-ranked
+        .order_by(desc(ScoreSpec.weight))
+    )
+
+    # format in scored_ngrams array:
+    # -------------------------------
+    #  [(37723, 8.428, 14.239,   3  ), etc]
+    #   ngramid  wspec   wgen  nwords
+    scored_ngrams = query.all()
+    n_ngrams = len(scored_ngrams)
+    if n_ngrams == 0:
+        raise ValueError("No ngrams in cooc table ?")
+
+    # results, with same structure as quotas
+    chosen_ngrams = {
+        'topgen':  {'monograms':[], 'multigrams':[]},
+        'topspec': {'monograms':[], 'multigrams':[]}
+    }
+
+    # specificity and genericity are rather reverse-correlated
+    # but occasionally they can have common ngrams (same ngram well ranked in both)
+    # => we'll use a lookup table to check if we didn't already get it
+    already_gotten_ngramids = {}
+
+    # 2 loops to fill spec-clusion then gen-clusion quotas
+    # (1st loop uses order from DB, 2nd loop uses our own sort at end of 1st)
+    for rkr in ['topspec', 'topgen']:
+        got_enough_mono  = False
+        got_enough_multi = False
+        all_done = False
+        i = -1
+        while((not all_done) and (not (got_enough_mono and got_enough_multi))):
+            # retrieve sorted ngram n° i
+            i += 1
+            (ng_id, wspec, wgen, nwords) = scored_ngrams[i]
+
+            # before any continue case, we check the next i for max reached
+            all_done = (i+1 >= n_ngrams)
+
+            if ng_id in already_gotten_ngramids:
+                continue
+
+            # NB: nwords could be replaced by a simple search on r' '
+            if nwords == 1:
+                if got_enough_mono:
+                    continue
+                else:
+                    # add ngram to results and lookup
+                    chosen_ngrams[rkr]['monograms'].append(ng_id)
+                    already_gotten_ngramids[ng_id] = True
+            # multi
+            else:
+                if got_enough_multi:
+                    continue
+                else:
+                    # add ngram to results and lookup
+                    chosen_ngrams[rkr]['multigrams'].append(ng_id)
+                    already_gotten_ngramids[ng_id] = True
+
+            got_enough_mono  = (len(chosen_ngrams[rkr]['monograms'])  >= quotas[rkr]['monograms'])
+            got_enough_multi = (len(chosen_ngrams[rkr]['multigrams']) >= quotas[rkr]['multigrams'])
+
+        # at the end of the first loop we just need to sort all by the second ranker (gen)
+        scored_ngrams = sorted(scored_ngrams, key=lambda ng_infos: ng_infos[2], reverse=True)
+
+    obtained_spec_mono  = len(chosen_ngrams['topspec']['monograms'])
+    obtained_spec_multi = len(chosen_ngrams['topspec']['multigrams'])
+    obtained_gen_mono   = len(chosen_ngrams['topgen']['monograms'])
+    obtained_gen_multi  = len(chosen_ngrams['topgen']['multigrams'])
+    obtained_total = obtained_spec_mono \
+                   + obtained_spec_multi \
+                   + obtained_gen_mono \
+                   + obtained_gen_multi
+    print("MAPLIST: top_spec_monograms =",  obtained_spec_mono)
+    print("MAPLIST: top_spec_multigrams =", obtained_spec_multi)
+    print("MAPLIST: top_gen_monograms =",   obtained_gen_mono)
+    print("MAPLIST: top_gen_multigrams =",  obtained_gen_multi)
     print("MAPLIST: kept %i ngrams in total " % obtained_total)
+
+    obtained_data = chosen_ngrams['topspec']['monograms'] \
+                  + chosen_ngrams['topspec']['multigrams'] \
+                  + chosen_ngrams['topgen']['monograms'] \
+                  + chosen_ngrams['topgen']['multigrams']
     # NEW MAPLIST NODE
     # -----------------
     # saving the parameters of the analysis in the Node JSON
     new_hyperdata = { 'corpus': corpus.id,
                       'limit' : limit,
                       'monograms_part' : monograms_part,
-                      'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else 0
+                      'genclusion_part' : genclusion_part,
                     }
     if overwrite_id:
         # overwrite pre-existing node
@@ -118,9 +201,7 @@ def do_maplist(corpus,
     the_id = the_maplist.id

     # create UnweightedList object and save (=> new NodeNgram rows)
-    datalist = UnweightedList(
-        [res.ngram_id for res in top_monograms + top_multigrams]
-    )
+    datalist = UnweightedList(obtained_data)

     # save
     datalist.save(the_id)
...
@@ -10,8 +10,8 @@ from .ngram_groups import compute_groups
 from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
 from .list_main import do_mainlist
 from .ngram_coocs import compute_coocs
-from .metric_specificity import compute_specificity
-from .list_map import do_maplist # TEST
+from .metric_specgen import compute_specgen
+from .list_map import do_maplist
 from .mail_notification import notify_owner
 from gargantext.util.db import session
 from gargantext.models import Node
@@ -136,22 +136,26 @@ def parse_extract_indexhyperdata(corpus):
     # => used for doc <=> ngram association
     # ------------
-    # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
+    # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)*
     coocs = compute_coocs(corpus,
                           on_list_id = mainlist_id,
                           groupings_id = group_id,
-                          just_pass_result = True)
+                          just_pass_result = True,
+                          diagonal_filter = False)   # preserving the diagonal
+                                                     # (useful for spec/gen)
     print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))

-    # -> specificity: compute + write (=> NodeNodeNgram)
-    spec_id = compute_specificity(corpus,cooc_matrix = coocs)
+    # -> specclusion/genclusion: compute + write (2 Nodes + 2 lists in NodeNgram)
+    (spec_id, gen_id) = compute_specgen(corpus,cooc_matrix = coocs)
     # no need here for subforms because cooc already counted them in mainform
-    print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
+    print('CORPUS #%d: [%s] new spec-clusion node #%i' % (corpus.id, t(), spec_id))
+    print('CORPUS #%d: [%s] new gen-clusion node #%i' % (corpus.id, t(), gen_id))

     # maplist: compute + write (to Node and NodeNgram)
     map_id = do_maplist(corpus,
                         mainlist_id = mainlist_id,
-                        specificity_id=spec_id,
+                        specclusion_id=spec_id,
+                        genclusion_id=gen_id,
                         grouplist_id=group_id
                         )
     print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
@@ -187,7 +191,7 @@ def recount(corpus):
        - ndocs
        - ti_rank
        - coocs
-       - specificity
+       - specclusion/genclusion
        - tfidf

     NB: no new extraction, no list change, just the metrics
@@ -208,10 +212,15 @@ def recount(corpus):
         old_tirank_id = None

     try:
-        old_spec_id = corpus.children("SPECIFICITY").first().id
+        old_spec_id = corpus.children("SPECCLUSION").first().id
     except:
         old_spec_id = None

+    try:
+        old_gen_id = corpus.children("GENCLUSION").first().id
+    except:
+        old_gen_id = None
+
     try:
         old_ltfidf_id = corpus.children("TFIDF-CORPUS").first().id
     except:
@@ -254,11 +263,13 @@ def recount(corpus):
                           just_pass_result = True)
     print('RECOUNT #%d: [%s] updated mainlist coocs for specif rank' % (corpus.id, t()))
-    # -> specificity: compute + write (=> NodeNgram)
-    spec_id = compute_specificity(corpus,cooc_matrix = coocs, overwrite_id = old_spec_id)
+    # -> specclusion/genclusion: compute + write (=> NodeNodeNgram)
+    (spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs,
+                                        spec_overwrite_id = old_spec_id,
+                                        gen_overwrite_id = old_gen_id)
-    print('RECOUNT #%d: [%s] updated specificity node #%i' % (corpus.id, t(), spec_id))
+    print('RECOUNT #%d: [%s] updated spec-clusion node #%i' % (corpus.id, t(), spec_id))
+    print('RECOUNT #%d: [%s] updated gen-clusion node #%i' % (corpus.id, t(), gen_id))

     print('RECOUNT #%d: [%s] FINISHED metric recounts' % (corpus.id, t()))
...
"""
Computes a specificity metric from the ngram cooccurrence matrix.
+ SAVE => WeightedList => NodeNgram
"""
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame
from numpy import diag
def round3(floating_number):
    """
    Rounds a floating point number to 3 decimals

    Good when we don't need so much detail in the data written to the DB
    """
    return float("%.3f" % floating_number)
def compute_specgen(corpus, cooc_id=None, cooc_matrix=None,
                    spec_overwrite_id = None, gen_overwrite_id = None):
    '''
    Compute genericity/specificity:
        P(j|i) = N(ij) / N(ii)
        P(i|j) = N(ij) / N(jj)

        Gen(i)  = Sum{j} P(j|i)
        Spec(i) = Sum{j} P(i|j)

        Gen-clusion(i)  = (Spec(i) + Gen(i)) / 2
        Spec-clusion(i) = (Spec(i) - Gen(i)) / 2
        (NB the code below drops the constant 1/2, which doesn't change the ranking)

    Parameters:
        - cooc_id: id of a cooccurrences node to use as base
                   (mandatory unless cooc_matrix is given directly)
        - cooc_matrix: a WeightedMatrix to use instead of cooc_id
        - spec_overwrite_id: optional preexisting specificity node to overwrite
        - gen_overwrite_id: optional preexisting genericity node to overwrite
    '''
    matrix = defaultdict(lambda : defaultdict(float))

    if cooc_id == None and cooc_matrix == None:
        raise TypeError("compute_specgen: needs a cooc_id or cooc_matrix param")
elif cooc_id:
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: cooc already filtered on mainlist_id at creation
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
# matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
elif cooc_matrix:
# copy WeightedMatrix into local matrix structure
for (ngram1_id, ngram2_id) in cooc_matrix.items:
w = cooc_matrix.items[(ngram1_id, ngram2_id)]
# ------- 8< --------------------------------------------
# tempo hack to ignore lines/columns where diagonal == 0
# £TODO find why they exist and then remove this snippet
if (((ngram1_id,ngram1_id) not in cooc_matrix.items) or
((ngram2_id,ngram2_id) not in cooc_matrix.items)):
continue
# ------- 8< --------------------------------------------
matrix[ngram1_id][ngram2_id] = w
nb_ngrams = len(matrix)
print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
# example corpus (7 docs, 8 nouns)
# --------------------------------
# "The report says that humans are animals."
# "The report says that rivers are full of water."
# "The report says that humans like to make war."
# "The report says that animals must eat food."
# "The report says that animals drink water."
# "The report says that humans like food and water."
# "The report says that grass is food for some animals."
#===========================================================================
cooc_counts = DataFrame(matrix).fillna(0)
# cooc_counts matrix
# ------------------
# animals food grass humans report rivers war water
# animals 4 2 1 1 4 0 0 1
# food 2 3 1 1 3 0 0 1
# grass 1 1 1 0 1 0 0 0
# humans 1 1 0 3 3 0 1 1
# report 4 3 1 3 7 1 1 3
# rivers 0 0 0 0 1 1 0 1
# war 0 0 0 1 1 0 1 0
# water 1 1 0 1 3 1 0 3
#===========================================================================
# conditional p(col|line)
diagonal = list(diag(cooc_counts))
# debug
# print("WARN diag: ", diagonal)
# print("WARN diag: =================== 0 in diagonal ?\n",
# 0 in diagonal ? "what ??? zeros in the diagonal :/" : "ok no zeros",
# "\n===================")
p_col_given_line = cooc_counts / list(diag(cooc_counts))
# p_col_given_line
# ----------------
# animals food grass humans report rivers war water
# animals 1.0 0.7 1.0 0.3 0.6 0.0 0.0 0.3
# food 0.5 1.0 1.0 0.3 0.4 0.0 0.0 0.3
# grass 0.2 0.3 1.0 0.0 0.1 0.0 0.0 0.0
# humans 0.2 0.3 0.0 1.0 0.4 0.0 1.0 0.3
# report 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
# rivers 0.0 0.0 0.0 0.0 0.1 1.0 0.0 0.3
# war 0.0 0.0 0.0 0.3 0.1 0.0 1.0 0.0
# water 0.2 0.3 0.0 0.3 0.4 1.0 0.0 1.0
#===========================================================================
# total per lines (<=> genericity)
Gen = p_col_given_line.sum(axis=1)
# Gen.sort_values(ascending=False)
# ---
# report 8.0
# animals 3.9
# food 3.6
# water 3.3
# humans 3.3
# grass 1.7
# war 1.5
# rivers 1.5
#===========================================================================
# total columnwise (<=> specificity)
Spec = p_col_given_line.sum(axis=0)
# Spec.sort_values(ascending=False)
# ----
# grass 4.0
# food 3.7
# water 3.3
# humans 3.3
# report 3.3
# animals 3.2
# war 3.0
# rivers 3.0
#===========================================================================
# our "inclusion by specificity" metric
Specclusion = Spec-Gen
# Specclusion.sort_values(ascending=False)
# -----------
# grass 1.1
# war 0.8
# rivers 0.8
# food 0.0
# humans -0.0
# water -0.0
# animals -0.3
# report -2.4
#===========================================================================
# our "inclusion by genericity" metric
Genclusion = Spec+Gen
# Genclusion.sort_values(ascending=False)
# -----------
# report 11.3
# food 7.3
# animals 7.2
# water 6.7
# humans 6.7
# grass 5.7
# war 4.5
# rivers 4.5
#===========================================================================
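A minimal, self-contained sketch reproducing the toy computation above. Note that dividing a DataFrame by a list aligns the list with the columns, so each entry N(i,j) is divided by the diagonal count N(j,j):

```python
from numpy import diag
from pandas import DataFrame

terms  = ['animals', 'food', 'grass', 'humans', 'report', 'rivers', 'war', 'water']
counts = [[4,2,1,1,4,0,0,1],
          [2,3,1,1,3,0,0,1],
          [1,1,1,0,1,0,0,0],
          [1,1,0,3,3,0,1,1],
          [4,3,1,3,7,1,1,3],
          [0,0,0,0,1,1,0,1],
          [0,0,0,1,1,0,1,0],
          [1,1,0,1,3,1,0,3]]
cooc_counts = DataFrame(counts, index=terms, columns=terms)

p    = cooc_counts / list(diag(cooc_counts))   # entry (i,j) = N(ij) / N(jj)
Gen  = p.sum(axis=1)    # row sums    (report 8.0, animals 3.9, ...)
Spec = p.sum(axis=0)    # column sums (grass 4.0, food 3.7, ...)

print((Spec - Gen).sort_values(ascending=False))  # specclusion: grass, war, rivers on top
print((Spec + Gen).sort_values(ascending=False))  # genclusion: report, food, animals on top
```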
# specificity node
if spec_overwrite_id:
# overwrite pre-existing id
the_spec_id = spec_overwrite_id
session.query(NodeNgram).filter(NodeNgram.node_id==the_spec_id).delete()
session.commit()
else:
specnode = corpus.add_child(
typename = "SPECCLUSION",
name = "Specclusion (in:%s)" % corpus.id
)
session.add(specnode)
session.commit()
the_spec_id = specnode.id
if not Specclusion.empty:
data = WeightedList(
zip( Specclusion.index.tolist()
, [v for v in map(round3, Specclusion.values.tolist())]
)
)
data.save(the_spec_id)
else:
print("WARNING: had no terms in COOCS => empty SPECCLUSION node")
#===========================================================================
# genclusion node
if gen_overwrite_id:
the_gen_id = gen_overwrite_id
session.query(NodeNgram).filter(NodeNgram.node_id==the_gen_id).delete()
session.commit()
else:
gennode = corpus.add_child(
typename = "GENCLUSION",
name = "Genclusion (in:%s)" % corpus.id
)
session.add(gennode)
session.commit()
the_gen_id = gennode.id
if not Genclusion.empty:
data = WeightedList(
zip( Genclusion.index.tolist()
, [v for v in map(round3, Genclusion.values.tolist())]
)
)
data.save(the_gen_id)
else:
print("WARNING: had no terms in COOCS => empty GENCLUSION node")
#===========================================================================
return(the_spec_id, the_gen_id)
"""
Computes a specificity metric from the ngram cooccurrence matrix.
+ SAVE => WeightedList => NodeNgram
"""
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame
import pandas as pd
def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id = None):
'''
Compute the specificity, simple calculus.
Parameters:
- cooc_id: mandatory id of a cooccurrences node to use as base
- overwrite_id: optional preexisting specificity node to overwrite
'''
matrix = defaultdict(lambda : defaultdict(float))
if cooc_id == None and cooc_matrix == None:
raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
elif cooc_id:
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: cooc already filtered on mainlist_id at creation
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
elif cooc_matrix:
# copy WeightedMatrix into local matrix structure
for (ngram1_id, ngram2_id) in cooc_matrix.items:
w = cooc_matrix.items[(ngram1_id, ngram2_id)]
matrix[ngram1_id][ngram2_id] = w
nb_ngrams = len(matrix)
print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
x = DataFrame(matrix).fillna(0)
# proba (x/y) ( <= each row divided by its total)
x = x / x.sum(axis=1)
# vectorisation
# d:Matrix => v: Vector (len = nb_ngrams)
# v = d.sum(axis=1) (- itself)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top included or excluded
#n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific (asc is spec, desc is generic)
v = ( xs - ys) / ( 2 * (x.shape[0] - 1))
## d ##
#######
# Grenelle biodiversité kilomètres site élus île
# Grenelle 0 0 4 0 0 0
# biodiversité 0 0 0 0 4 0
# kilomètres 4 0 0 0 4 0
# site 0 0 0 0 4 6
# élus 0 4 4 4 0 0
# île 0 0 0 6 0 0
## d.sum(axis=1) ##
###################
# Grenelle 4
# biodiversité 4
# kilomètres 8
# site 10
# élus 12
# île 6
# temporary result
# ----------------
# for now we use the row sums as the specificity ranking
# (the **same** order as with the pre-refactoring formula, but a simpler computation)
# TODO check the mathematical AND semantic coherence of this indicator
#v.sort_values(inplace=True)
# [ ('biodiversité' , 0.333 ),
# ('Grenelle' , 0.5 ),
# ('île' , 0.599 ),
# ('kilomètres' , 1.333 ),
# ('site' , 1.333 ),
# ('élus' , 1.899 ) ]
# ----------------
# specificity node
if overwrite_id:
# overwrite pre-existing id
the_id = overwrite_id
session.query(NodeNgram).filter(NodeNgram.node_id==the_id).delete()
session.commit()
else:
specnode = corpus.add_child(
typename = "SPECIFICITY",
name = "Specif (in:%s)" % corpus.id
)
session.add(specnode)
session.commit()
the_id = specnode.id
# print(v)
pd.options.display.float_format = '${:,.2f}'.format
if not v.empty:
data = WeightedList(
zip( v.index.tolist()
, v.values.tolist()[0]
)
)
data.save(the_id)
else:
print("WARNING: had no terms in COOCS => empty SPECIFICITY node")
return(the_id)
@@ -18,7 +18,8 @@ def compute_coocs( corpus,
                    stoplist_id     = None,
                    start           = None,
                    end             = None,
-                   symmetry_filter = False):
+                   symmetry_filter = False,
+                   diagonal_filter = True):
     """
     Count how often some extracted terms appear
     together in a small context (document)
@@ -55,6 +56,9 @@ def compute_coocs( corpus,
                  NB the expected type of parameter value is datetime.datetime
                     (string is also possible but format must follow
                      this convention: "2001-01-01" aka "%Y-%m-%d")
+     - symmetry_filter: prevent calculating where ngram1_id  > ngram2_id
+     - diagonal_filter: prevent calculating where ngram1_id == ngram2_id

     (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
@@ -69,7 +73,7 @@ def compute_coocs( corpus,
            JOIN nodes_ngrams AS idxb
              ON idxa.node_id = idxb.node_id      <== that's cooc
            ---------------------------------
-           AND idxa.ngram_id <> idxb.ngram_id
+           AND idxa.ngram_id <> idxb.ngram_id    (diagonal_filter)
            AND idxa.node_id = MY_DOC ;

     on entire corpus
@@ -152,16 +156,14 @@ def compute_coocs( corpus,
             ucooc

             # for debug (2/4)
-            #, Xngram.terms.label("w_x")
-            #, Yngram.terms.label("w_y")
+            # , Xngram.terms.label("w_x")
+            # , Yngram.terms.label("w_y")
         )
         .join(Yindex, Xindex.node_id == Yindex.node_id )   # <- by definition of cooc
         .join(Node, Node.id == Xindex.node_id)             # <- b/c within corpus
         .filter(Node.parent_id == corpus.id)               # <- b/c within corpus
         .filter(Node.typename == "DOCUMENT")               # <- b/c within corpus
-        .filter(Xindex_ngform_id != Yindex_ngform_id)      # <- b/c not with itself
     )

     # outerjoin the synonyms if needed
@@ -179,12 +181,12 @@ def compute_coocs( corpus,
         .group_by(
             Xindex_ngform_id, Yindex_ngform_id   # <- what we're counting
             # for debug (3/4)
-            #,"w_x", "w_y"
+            # ,"w_x", "w_y"
         )
         # for debug (4/4)
-        #.join(Xngram, Xngram.id == Xindex_ngform_id)
-        #.join(Yngram, Yngram.id == Yindex_ngform_id)
+        # .join(Xngram, Xngram.id == Xindex_ngform_id)
+        # .join(Yngram, Yngram.id == Yindex_ngform_id)
         .order_by(ucooc)
     )
@@ -192,6 +194,9 @@ def compute_coocs( corpus,
     # 4) INPUT FILTERS (reduce N before O(N²))
     if on_list_id:
+        # £TODO separate lists, or one list for x and all ngrams for y,
+        #      which would allow expanding the list to nearest neighbors (MacLachlan)
+        #      (with a rectangular matrix)
         m1 = aliased(NodeNgram)
         m2 = aliased(NodeNgram)
@@ -226,6 +231,10 @@ def compute_coocs( corpus,
         )

+    if diagonal_filter:
+        # don't compute ngram with itself
+        coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)
+
     if start or end:
         Time = aliased(NodeHyperdata)
@@ -268,6 +277,7 @@ def compute_coocs( corpus,
     # threshold
     # £TODO adjust COOC_THRESHOLD a posteriori:
     # ex: sometimes 2 sometimes 4 depending on sparsity
+    print("COOCS: filtering pairs under threshold:", threshold)
     coocs_query = coocs_query.having(ucooc >= threshold)
...
@@ -77,7 +77,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
                 continue
             # get ngrams
             for ngram in ngramsextractor.extract(value):
-                tokens = tuple(token[0] for token in ngram)
+                tokens = tuple(normalize_forms(token[0]) for token in ngram)
                 if do_subngrams:
                     # ex tokens = ["very", "cool", "exemple"]
@@ -90,7 +90,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
                     subterms = [tokens]

                 for seqterm in subterms:
-                    ngram = normalize_terms(' '.join(seqterm))
+                    ngram = ' '.join(seqterm)
                     if len(ngram) > 1:
                         # doc <=> ngram index
                         nodes_ngrams_count[(document.id, ngram)] += 1
@@ -118,7 +118,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
     raise error

-def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
+def normalize_forms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
     """
     Removes unwanted trailing punctuation
     AND optionally puts everything to lowercase
@@ -127,14 +127,14 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
     (benefits from normalize_chars upstream so there's less cases to consider)
     """
-    # print('normalize_terms IN: "%s"' % term_str)
-    term_str = sub(r'^[-",;/%(){}\\\[\]\.\' ]+', '', term_str)
-    term_str = sub(r'[-",;/%(){}\\\[\]\.\' ]+$', '', term_str)
+    # print('normalize_forms IN: "%s"' % term_str)
+    term_str = sub(r'^[-\'",;/%(){}\\\[\]\. ©]+', '', term_str)
+    term_str = sub(r'[-\'",;/%(){}\\\[\]\. ©]+$', '', term_str)

     if do_lowercase:
         term_str = term_str.lower()

-    # print('normalize_terms OUT: "%s"' % term_str)
+    # print('normalize_forms OUT: "%s"' % term_str)
     return term_str
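For illustration, the effect of the widened character classes (they now also strip apostrophes and © signs at the edges, while inner apostrophes survive); outputs assume the default lowercase flag is on:

```python
from gargantext.util.toolchain.ngrams_extraction import normalize_forms

print(normalize_forms("(Big Data)© "))       # -> 'big data'
print(normalize_forms("'tomato's genome'"))  # -> "tomato's genome" (inner quote kept)
```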
...
@@ -57,7 +57,7 @@ class CSVLists(APIView):
         params in request.GET:
             onto_corpus: the corpus whose lists are getting patched

-        params in request.FILES:
+        params in request.data:
             csvfile: the csv file
             /!\ We assume we checked the file size client-side before upload
...