Commit 6c438c85 authored by delanoe

Merge branch 'romain-refactoring' into unstable

parents f236759c e52afd97
@@ -8,18 +8,19 @@ import re
 LISTTYPES = {
     'DOCUMENT'     : WeightedList,
-    'GROUPLIST'    : Translations,
+    'GROUPLIST'    : Translations,    # todo remove "LIST" from name
     'STOPLIST'     : UnweightedList,
     'MAINLIST'     : UnweightedList,
     'MAPLIST'      : UnweightedList,
     'SPECIFICITY'  : WeightedList,
-    'OCCURRENCES'  : WeightedContextIndex,
+    'OCCURRENCES'  : WeightedIndex,   # todo replace by WeightedList
     'COOCCURRENCES': WeightedMatrix,
-    'TFIDF-CORPUS' : WeightedContextIndex,
-    'TFIDF-GLOBAL' : WeightedContextIndex,
+    'TFIDF-CORPUS' : WeightedIndex,   # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
+    'TFIDF-GLOBAL' : WeightedIndex,   # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
 }

 NODETYPES = [
+    # TODO separate id not array index, read by models.node
     None,
     # documents hierarchy
     'USER',                  # 1
@@ -40,6 +41,7 @@ NODETYPES = [
     'TFIDF-GLOBAL',          # 14
     # docs subset
     'FAVORITES'              # 15
+    # TODO add ti RANK
 ]

 INDEXED_HYPERDATA = {
...
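For context, LISTTYPES is the dispatch table from a node's typename to its list container class. A minimal sketch of how a caller might use it (the `load_list` helper is hypothetical, not in the diff; the container classes all accept an optional `source` per their `__init__` signatures):

```python
# hypothetical helper: pick the right container for a node's typename
def load_list(node, source=None):
    container_class = LISTTYPES[node.typename]   # e.g. 'MAINLIST' -> UnweightedList
    return container_class(source) if source is not None else container_class()
```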
@@ -2,7 +2,7 @@
 """
-__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex']
+__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedIndex']

 from gargantext.util.db import session, bulk_insert
@@ -165,15 +165,18 @@ class Translations(_BaseClass):
         )

-class WeightedContextIndex(_BaseClass):
+class WeightedIndex(_BaseClass):
     """
     associated model     : NodeNodeNgram
     associated columns   : node1_id | node2_id | ngram_id | score (float)
+                             ^^^^
+                        reserved for this
+                          object's id

-    Tensor representing a contextual index or registry
-    (matrix of weighted ngrams *per* doc *per* context)
+    Matrix representing a weighted word index across docs or small context nodes
+    (matrix of weighted ngrams *per* doc)

-    Example: tfidf by corpus
+    Example: tfidf within a corpus
     """
     def __init__(self, source=None):
         self.items = defaultdict(float)
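The renamed WeightedIndex keeps the same storage shape as the old WeightedContextIndex: a defaultdict(float) of scores, persisted as NodeNodeNgram rows where node1_id is the index node itself. A toy sketch, assuming items is keyed by (doc_node_id, ngram_id) pairs as the doc-per-ngram matrix description suggests:

```python
from gargantext.util.lists import WeightedIndex

wi = WeightedIndex()
wi.items[(456, 1001)] += 2.5   # assumed keying: doc node 456, ngram 1001 -> score
wi.items[(456, 1002)] += 0.7
# persisted rows would then look like (index_node_id, doc_node_id, ngram_id, score)
```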
@@ -182,7 +185,7 @@ class WeightedContextIndex(_BaseClass):

+# ?TODO rename WeightedWordmatrix
 class WeightedMatrix(_BaseClass):

     def __init__(self, source=None):
@@ -294,7 +297,7 @@ class WeightedMatrix(_BaseClass):
             result.items[key1, key2] = value / sqrt(other.items[key1] * other.items[key2])
         return result

+# ?TODO rename Wordlist
 class UnweightedList(_BaseClass):

     def __init__(self, source=None):
@@ -399,6 +402,7 @@ class UnweightedList(_BaseClass):
         )

+# ?TODO rename WeightedWordlist
 class WeightedList(_BaseClass):

     def __init__(self, source=None):
...
@@ -111,20 +111,16 @@ def parse_extract_indexhyperdata(corpus):
     group_id = compute_groups(corpus, stoplist_id = None)
     print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))

-    # -> write occurrences to Node and NodeNodeNgram
-    occ_id = compute_occs(corpus)
-    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
+    # possible: factorize with tfidf

     # ------------
-    # -> write local tfidf similarities to Node and NodeNodeNgram
-    ltfidf_id = compute_tfidf_local(corpus)
-    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
+    # -> write occurrences to Node and NodeNodeNgram  # (todo: NodeNgram)
+    occ_id = compute_occs(corpus, groupings_id = group_id)
+    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

-    # -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
+    # -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
     tirank_id = compute_ti_ranking(corpus,
-                                   count_scope="global",
-                                   termset_scope="local")
-    print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
+                                   groupings_id = group_id,
+                                   count_scope="global")
+    print('CORPUS #%d: [%s] new ti ranking node #%i' % (corpus.id, t(), tirank_id))

     # -> mainlist: filter + write (to Node and NodeNgram)
     mainlist_id = do_mainlist(corpus,
@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
                               stoplist_id = stop_id)
     print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))

+    # -> write local tfidf similarities to Node and NodeNodeNgram
+    # TODO only on mainlist
+    ltfidf_id = compute_tfidf_local(corpus)
+    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
+    # => used for doc <=> ngram association

     # ------------
     # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
-    cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id)
+    cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
     print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))

     # -> specificity: compute + write (=> NodeNodeNgram)
-    spec_id = compute_specificity(corpus, cooc_id=cooc_id)
+    spec_id = compute_specificity(corpus, cooc_id=cooc_id
+                                  # ,groupings_id = group_id
+                                  )
     print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))

     # ?? maplist: compute + write (to Node and NodeNgram)
...
@@ -65,6 +65,9 @@ def do_mainlist(corpus,
     ordered_filtered_tfidf = (session
         .query(NodeNodeNgram.ngram_id)
        .filter(NodeNodeNgram.node1_id == ranking_scores_id)
+        # NOT IN but speed theoretically ok here
+        # see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
+        # but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
        .filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
        .order_by(desc(NodeNodeNgram.score))
     )
...
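The links in the new comments contrast NOT IN with a left anti-semi join. Both shapes as SQLAlchemy sketches, assuming stopterms_subquery exposes an ngram_id column (same results when the subquery has no NULLs):

```python
# shape used here: NOT IN
q = q.filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))

# equivalent anti-join shape (outer join + IS NULL),
# as used for the stoplist in compute_coocs below
q = (q.outerjoin(stopterms_subquery,
                 stopterms_subquery.c.ngram_id == NodeNodeNgram.ngram_id)
      .filter(stopterms_subquery.c.ngram_id == None))
```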
@@ -9,13 +9,15 @@ FIXME: "having the same source" means we need to select inside hyperdata
 """

 from gargantext.models         import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
+from gargantext.util.db_cache  import cache
 from gargantext.util.db        import session, bulk_insert, aliased, \
                                       func # = sqlalchemy.func like sum() or count()
 from sqlalchemy.sql.expression import case # for choice if ngram has mainform or not
 from sqlalchemy                import distinct # for list of unique ngram_ids within a corpus
 from math                      import log
+from re                        import match

 # £TODO
-# from gargantext.util.lists import WeightedContextIndex
+# from gargantext.util.lists import WeightedIndex

 def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
@@ -32,7 +34,7 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
     Parameters:
        - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
                     (the Node and its previous NodeNodeNgram rows will be replaced)
-       - groupings_id: optional id of a GROUPLIST node for this corpus
+       - groupings_id: optional id of a GROUPLIST node for these ngrams
                        IF absent the occurrences are the sums for each ngram
                        IF present they're the sums for each ngram's mainform
     """
@@ -115,7 +117,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
     if overwrite_id:
         # overwrite pre-existing id
         the_id = overwrite_id
-        # occnode = cache.Node[overwrite_id]
+        session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
+        session.commit()
     else:
         # create the new OCCURRENCES node
         occnode = corpus.add_child(
@@ -126,8 +129,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
         session.commit()
         the_id = occnode.id

-    # reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
-    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
+    # £TODO make it NodeNgram instead NodeNodeNgram ! and rebase :/
+    #       (idem ti_ranking)
     bulk_insert(
         NodeNodeNgram,
         ('node1_id' , 'node2_id', 'ngram_id', 'score'),
@@ -137,14 +140,26 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
     return the_id
-def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overwrite_id=None):
+def compute_ti_ranking(corpus,
+                       groupings_id = None,
+                       count_scope="local", termset_scope="local",
+                       overwrite_id=None):
     """
-    # TODO check if cumulated tfs correspond to app's use cases and intention
-
-    Calculates tfidf ranking (cumulated tfidf for each ngram) within given scope
+    Calculates tfidf ranking within given scope: the cumulated tfidf
+    per ngram ng_i (or per mainform ng_i' if groups) across some docs d_j:
+
+        ti_rank(ng_i) = Sum_j( tf_ij ) * ln( N / |{d_j : ng_i ∈ d_j}| )

     Parameters:
-      - the corpus itself
+      - the corpus itself (or corpus_id)
+      - groupings_id: optional id of a GROUPLIST node for these ngrams
+                      IF absent the ti weights are the sums for each ngram
+                      IF present they're the sums for each ngram's mainform
      - count_scope: {"local" or "global"}
         - local  <=> frequencies counted in the current corpus
         - global <=> frequencies counted in all corpora of this type
@@ -153,43 +168,94 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
      - termset_scope: {"local" or "global"}
         - local  <=> output list of terms limited to the current corpus
           (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
-        - global <=> output list of terms from all corpora of this type
+        - global <=> output list of terms found in global doc scope
                      !!!! (many more terms)
-      - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
+      - overwrite_id: optional id of a pre-existing XXXX node for this corpus
                    (the Node and its previous NodeNodeNgram rows will be replaced)
     """
+    # validate string params
+    if count_scope not in ["local","global"]:
+        raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
+    if termset_scope not in ["local","global"]:
+        raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
+    if count_scope == "local" and termset_scope == "global":
+        raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")

+    # get corpus
+    if type(corpus) == int:
+        corpus_id = corpus
+        corpus = cache.Node[corpus_id]
+    elif type(corpus) == str and match(r'\d+$', corpus):
+        corpus_id = int(corpus)
+        corpus = cache.Node[corpus_id]
+    else:
+        # assuming Node class
+        corpus_id = corpus.id

+    # prepare sqla mainform vs ngram selector
+    ngform_i = None
+    if not groupings_id:
+        ngform_i = NodeNgram.ngram_id
+    else:
+        # prepare translations
+        syno = (session.query(NodeNgramNgram.ngram1_id,
+                              NodeNgramNgram.ngram2_id)
+                 .filter(NodeNgramNgram.node_id == groupings_id)
+                 .subquery()
+               )
+        # see detailed comment in compute_occs() + todo factorize
+        ngform_i = case([
+            (syno.c.ngram1_id != None, syno.c.ngram1_id),
+            (syno.c.ngram1_id == None, NodeNgram.ngram_id)
+            #        condition               value
+        ])
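The case() above implements "count under the mainform when a synonym row exists, else under the ngram itself". The same substitution in plain Python, with hypothetical ids:

```python
from collections import defaultdict

synonyms = {102: 101, 103: 101}                      # subform id -> mainform id
raw_tf   = {101: 3.0, 102: 2.0, 103: 1.0, 200: 4.0}  # per-ngram weights

grouped_tf = defaultdict(float)
for ngram_id, tf in raw_tf.items():
    grouped_tf[synonyms.get(ngram_id, ngram_id)] += tf
# {101: 6.0, 200: 4.0}: subform counts folded into their mainform
```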
     # MAIN QUERY SKELETON
     tf_nd_query = (session
                     .query(
-                        NodeNgram.ngram_id,
+                        # NodeNgram.ngram_id
+                        # or similar if grouping ngrams under their mainform
+                        ngform_i.label("counted_ngform"),

+                        # the tfidf elements
+                        # ------------------
                         func.sum(NodeNgram.weight),   # tf: same as occurrences
                         # -----------------------
                         func.count(NodeNgram.node_id) # nd: n docs with term
                         # --------------------
                     )
-                    .group_by(NodeNgram.ngram_id)
+                    .group_by("counted_ngform")

-                    # optional *count_scope*: if we'll restrict the doc nodes
-                    # -------------
+                    # count_scope to specify in which doc nodes to count
+                    # -----------
                    # .join(countdocs_subquery,
                    #       countdocs_subquery.c.id == NodeNgram.node_id)

-                    # optional *termset_scope*: if we'll restrict the ngrams
-                    # ---------------
+                    # optional termset_scope: if we'll restrict the ngrams
+                    # -------------
                    # .join(termset_subquery,
                    #       termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)

+                    # optional translations to bring the subform's replacement
+                    # ------------
+                    # .outerjoin(syno,
+                    #            syno.c.ngram2_id == NodeNgram.ngram_id)
                   )

-    # validate string params
-    if count_scope not in ["local","global"]:
-        raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
-    if termset_scope not in ["local","global"]:
-        raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
-    if count_scope == "local" and termset_scope == "global":
-        raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
+    # TUNING THE QUERY
+    if groupings_id:
+        tf_nd_query = tf_nd_query.outerjoin(
+                        syno,
+                        syno.c.ngram2_id == NodeNgram.ngram_id
+                      )
     # local <=> within this corpus
     if count_scope == "local":
@@ -197,14 +263,14 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
         countdocs_subquery = (session
                                 .query(Node.id)
                                 .filter(Node.typename == "DOCUMENT")
-                                .filter(Node.parent_id == corpus.id)
+                                .filter(Node.parent_id == corpus_id)
                                 .subquery()
                              )

-        # both scopes are the same: no need to independently restrict the ngrams
+        # no need to independently restrict the ngrams
         tf_nd_query = tf_nd_query.join(countdocs_subquery,
                                        countdocs_subquery.c.id == NodeNgram.node_id)
+        # ---

     # global <=> within all corpora of this source
     elif count_scope == "global":
@@ -220,6 +286,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
                                 # join on parent_id with selected corpora nodes
                                 .join(CorpusNode, CorpusNode.id == Node.parent_id)
                                 .filter(CorpusNode.typename == "CORPUS")
+                                # TODO index corpus_sourcetype in DB
                                 .filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(this_source_type))
                                 .subquery()
                              )
@@ -228,15 +295,19 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw

             # both scopes are the same: no need to independently restrict the ngrams
             tf_nd_query = tf_nd_query.join(countdocs_subquery,
                                            countdocs_subquery.c.id == NodeNgram.node_id)
+            # ---

         elif termset_scope == "local":
-            # All unique terms in the original corpus
+            # All unique terms...
             termset_subquery = (session
-                                    .query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
+                                    .query(
+                                       distinct(NodeNgram.ngram_id).label("uniq_ngid")
+                                     )
+                                    # ... in the original corpus
                                    .join(Node)
                                    .filter(Node.typename == "DOCUMENT")
-                                    .filter(Node.parent_id == corpus.id)
+                                    .filter(Node.parent_id == corpus_id)
                                    .subquery()
                                )
@@ -247,42 +318,59 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
                            .join(termset_subquery,
                                  termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
                         )
+            # ---
-    # N
+    # M
     total_docs = session.query(countdocs_subquery).count()
+    log_tot_docs = log(total_docs)

     # result
     tf_nd = tf_nd_query.all()

-    # -------------------------------------------------
-    tfidfs = {}
-    log_tot_docs = log(total_docs)
-    for (ngram_id, tf, nd) in tf_nd:
-        # tfidfs[ngram_id] = tf * log(total_docs/nd)
-        tfidfs[ngram_id] = tf * (log_tot_docs-log(nd))
-    # -------------------------------------------------
+    # -------------- summation over each word i ------------
+    tfidfsum = {}
+    for (ngram_i, tf_i, nd_i) in tf_nd:
+        # tfidfsum[ngram_i] = tf_i * log(total_docs/nd_i)
+        tfidfsum[ngram_i] = tf_i * (log_tot_docs-log(nd_i))
+    # ------------------------------------------------------

+    # N for info
+    total_ngramforms = len(tfidfsum)

     if overwrite_id:
         the_id = overwrite_id
+        session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
+        session.commit()
     else:
-        # create the new TFIDF-XXXX node
-        tfidf_nd = corpus.add_child()
-        if count_scope == "local": # TODO discuss use and find new typename
-            tfidf_nd.typename  = "TFIDF-CORPUS"
-            tfidf_nd.name      = "tfidf-cumul-corpus (in:%s)" % corpus.id
+        # create the new TFIDF-XXXX node to get an id
+        tir_nd = corpus.add_child()
+        if count_scope == "local":
+            tir_nd.typename  = "TFIDF-CORPUS"
+            tir_nd.name      = "ti rank (%i ngforms in corpus:%s)" % (
+                                    total_ngramforms, corpus_id)
         elif count_scope == "global":
-            tfidf_nd.typename  = "TFIDF-GLOBAL"
-            tfidf_nd.name      = "tfidf-cumul-global (in type:%s)" % this_source_type
-        session.add(tfidf_nd)
+            tir_nd.typename  = "TFIDF-GLOBAL"
+            tir_nd.name      = "ti rank (%i ngforms %s in corpora of sourcetype:%s)" % (
+                                    total_ngramforms,
+                                    ("from corpus %i" % corpus_id) if (termset_scope == "local") else "" ,
+                                    this_source_type)
+        session.add(tir_nd)
         session.commit()
-        the_id = tfidf_nd.id
+        the_id = tir_nd.id

+    # TODO 1 discuss use and find new typename
+    # TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
+    # TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
+    # TODO 4 requalify this here as a NodeNgram
+    # then TODO 5 use WeightedList.save() !

     # reflect that in NodeNodeNgrams
-    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
     bulk_insert(
         NodeNodeNgram,
         ('node1_id', 'node2_id','ngram_id', 'score'),
-        ((the_id,    corpus.id,  ng, tfidfs[ng]) for ng in tfidfs)
+        ((the_id,    corpus_id,  ng, tfidfsum[ng]) for ng in tfidfsum)
     )

     return the_id
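For reference, this matches the call site in parse_extract_indexhyperdata earlier in this diff (termset_scope keeps its default "local"); a second call with overwrite_id would refresh an existing ranking node:

```python
tirank_id = compute_ti_ranking(corpus,
                               groupings_id = group_id,
                               count_scope  = "global")

# recompute in place later, replacing the previous NodeNodeNgram rows
tirank_id = compute_ti_ranking(corpus,
                               groupings_id = group_id,
                               count_scope  = "global",
                               overwrite_id = tirank_id)
```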
@@ -347,6 +435,8 @@ def compute_tfidf_local(corpus, overwrite_id=None):

     if overwrite_id:
         the_id = overwrite_id
+        session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
+        session.commit()
     else:
         # create the new TFIDF-CORPUS node
         tfidf_node = corpus.add_child()
@@ -357,7 +447,7 @@ def compute_tfidf_local(corpus, overwrite_id=None):
         the_id = tfidf_node.id

     # reflect that in NodeNodeNgrams
-    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
+    # £TODO replace bulk_insert by something like WeightedIndex.save()
     bulk_insert(
         NodeNodeNgram,
         ('node1_id', 'node2_id','ngram_id', 'score'),
...
 from gargantext.models         import Node, NodeNgram, NodeNgramNgram, \
-                                      NodeHyperdata
+                                      NodeHyperdata, Ngram
 from gargantext.util.lists     import WeightedMatrix
 from gargantext.util.db        import session, aliased, func
 from gargantext.util.db_cache  import cache
 from gargantext.constants      import DEFAULT_COOC_THRESHOLD
 from datetime                  import datetime
+from sqlalchemy.sql.expression import case # for choice if ngram has mainform or not

 def compute_coocs(  corpus,
                     overwrite_id    = None,
                     threshold       = DEFAULT_COOC_THRESHOLD,
+                    groupings_id    = None,
                     mainlist_id     = None,
                     stoplist_id     = None,
                     start           = None,
@@ -41,9 +44,11 @@ def compute_coocs( corpus,
      - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                      (all hyperdata and previous NodeNgramNgram rows will be replaced)
      - threshold: on output cooc count (previously called hapax)
+     - groupings_id: optional synonym relations to add all subform counts
+                     with their mainform's counts
      - mainlist_id: mainlist to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
-                    (normally unnecessary if a mainlist is provided)
+                    (normally unnecessary if a mainlist is already provided)
      - start, end: provide one or both temporal limits to filter on doc date
                    NB the expected type of parameter value is datetime.datetime
                       (string is also possible but format must follow
@@ -56,25 +61,24 @@ def compute_coocs( corpus,
     basic idea for one doc
     ======================
     each pair of ngrams sharing same doc (node_id)
-        SELECT idx1.ngram_id, idx2.ngram_id
-        FROM nodes_ngrams AS idx1, nodes_ngrams AS idx2
+        SELECT idxa.ngram_id, idxb.ngram_id
+        FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
         ---------------------------------
-        WHERE idx1.node_id = idx2.node_id      <== that's cooc
+        WHERE idxa.node_id = idxb.node_id      <== that's cooc
         ---------------------------------
-        AND idx1.ngram_id <> idx2.ngram_id
-        AND idx1.node_id = MY_DOC ;
+        AND idxa.ngram_id <> idxb.ngram_id
+        AND idxa.node_id = MY_DOC ;

     on entire corpus
     =================
     coocs for each doc :
       - each given pair like (termA, termB) will likely appear several times
-        => we do GROUP BY (x1.ngram_id, x2.ngram_id)
+        => we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
       - we count unique appearances of the pair (cooc)
     """

-    # - TODO add grouped element's values in grouping 'chief ngram'
     # - TODO cvalue_id: allow a metric as additional input filter
     # - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
     # - TODO weighted: if False normal cooc to be saved as result
@@ -85,130 +89,190 @@ def compute_coocs( corpus,
     # 1,859,408 rows for the simple cooc query
     #    71,134 rows when limited to ngrams with occ > 1 (weight)
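The GROUP BY described in the docstring amounts to counting, for each ordered ngram pair, the docs they share. A pure-Python equivalent on toy data (ids are illustrative):

```python
from collections import defaultdict
from itertools import permutations

doc_ngrams = {1: {10, 11, 12}, 2: {10, 12}}   # doc node_id -> its ngram_ids

cooc = defaultdict(int)
for ngram_ids in doc_ngrams.values():
    for x, y in permutations(ngram_ids, 2):   # ordered pairs, x != y
        cooc[(x, y)] += 1
# cooc[(10, 12)] == 2: the pair co-occurs in both docs
```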
-    # docs of our corpus
-    docids_subquery = (session
-                        .query(Node.id)
-                        .filter(Node.parent_id == corpus.id)
-                        .filter(Node.typename == "DOCUMENT")
-                        .subquery()
-                       )

     # 2 x the occurrence index table
-    x1 = aliased(NodeNgram)
-    x2 = aliased(NodeNgram)
+    Xindex = aliased(NodeNgram)
+    Yindex = aliased(NodeNgram)

-    # cooccurrences columns definition
-    ucooc = func.count(x1.ngram_id).label("ucooc")
+    # for debug (1/4)
+    # Xngram = aliased(Ngram)
+    # Yngram = aliased(Ngram)

-    # 1) MAIN DB QUERY
-    coocs_query = (
-        session.query(x1.ngram_id, x2.ngram_id, ucooc)
-        .join(Node, Node.id == x1.node_id)      # <- b/c within corpus
-        .join(x2, x1.node_id == Node.id )       # <- b/c within corpus
-        .filter(Node.parent_id == corpus.id)    # <- b/c within corpus
-        .filter(Node.typename == "DOCUMENT")    # <- b/c within corpus
-        .filter(x1.node_id == x2.node_id)       # <- by definition of cooc
-        .filter(x1.ngram_id != x2.ngram_id)     # <- b/c not with itself
-        .group_by(x1.ngram_id, x2.ngram_id)
-    )
+    # 1) prepare definition of counted forms
+    if not groupings_id:
+        # no groupings => the counted forms are the ngrams
+        Xindex_ngform_id = Xindex.ngram_id
+        Yindex_ngform_id = Yindex.ngram_id

+    # groupings: see detailed comment in compute_occs() + todo factorize
+    else:
+        # prepare translations
+        Xsyno = (session.query(NodeNgramNgram.ngram1_id,
+                               NodeNgramNgram.ngram2_id)
+                 .filter(NodeNgramNgram.node_id == groupings_id)
+                 .subquery()
+                )

+        # further use as anon tables prevents doing Ysyno = Xsyno
+        Ysyno = (session.query(NodeNgramNgram.ngram1_id,
+                               NodeNgramNgram.ngram2_id)
+                 .filter(NodeNgramNgram.node_id == groupings_id)
+                 .subquery()
+                )

+        # groupings => define the counted form depending on the existence of a synonym
+        Xindex_ngform_id = case([
+            (Xsyno.c.ngram1_id != None, Xsyno.c.ngram1_id),
+            (Xsyno.c.ngram1_id == None, Xindex.ngram_id)
+            #        condition               value
+        ])
+        Yindex_ngform_id = case([
+            (Ysyno.c.ngram1_id != None, Ysyno.c.ngram1_id),
+            (Ysyno.c.ngram1_id == None, Yindex.ngram_id)
+        ])
+    # ---

+    # 2) BASE DB QUERY

+    # cooccurrences columns definition ----------------
+    ucooc = func.count(Xindex_ngform_id).label("ucooc")
+    # NB could be X or Y in this line
+    #    (we're counting grouped rows and just happen to do it on this column)

+    base_query = (
+        session.query(
+            Xindex_ngform_id,
+            Yindex_ngform_id,
+            ucooc
+            # for debug (2/4)
+            #, Xngram.terms.label("w_x")
+            #, Yngram.terms.label("w_y")
+        )
+        .join(Yindex, Xindex.node_id == Yindex.node_id )  # <- by definition of cooc
+        .join(Node, Node.id == Xindex.node_id)            # <- b/c within corpus
+        .filter(Node.parent_id == corpus.id)              # <- b/c within corpus
+        .filter(Node.typename == "DOCUMENT")              # <- b/c within corpus
+        .filter(Xindex_ngform_id != Yindex_ngform_id)     # <- b/c not with itself
+    )

+    # outerjoin the synonyms if needed
+    if groupings_id:
+        base_query = (base_query
+            .outerjoin(Xsyno,                  # <- synonyms for Xindex.ngrams
+                       Xsyno.c.ngram2_id == Xindex.ngram_id)
+            .outerjoin(Ysyno,                  # <- synonyms for Yindex.ngrams
+                       Ysyno.c.ngram2_id == Yindex.ngram_id)
+        )

+    # 3) counting clause in any case
+    coocs_query = (base_query
+        .group_by(
+            Xindex_ngform_id, Yindex_ngform_id  # <- what we're counting
+            # for debug (3/4)
+            #,"w_x", "w_y"
+        )
+        # for debug (4/4)
+        #.join(Xngram, Xngram.id == Xindex_ngform_id)
+        #.join(Yngram, Yngram.id == Yindex_ngform_id)
+        .order_by(ucooc)
+    )
-    # 2) INPUT FILTERS (reduce N before O(N²))
+    # 4) INPUT FILTERS (reduce N before O(N²))
     if mainlist_id:
         m1 = aliased(NodeNgram)
         m2 = aliased(NodeNgram)

         coocs_query = ( coocs_query
-            .join(m1, m1.ngram_id == x1.ngram_id)
-            .join(m2, m2.ngram_id == x2.ngram_id)
+            .join(m1, m1.ngram_id == Xindex_ngform_id)
+            .join(m2, m2.ngram_id == Yindex_ngform_id)
             .filter( m1.node_id == mainlist_id )
            .filter( m2.node_id == mainlist_id )
        )

     if stoplist_id:
-        s1 = aliased(NodeNgram)
-        s2 = aliased(NodeNgram)
+        s1 = (session.query(NodeNgram.ngram_id)
+              .filter(NodeNgram.node_id == stoplist_id)
+              .subquery()
+             )

+        # further use as anon tables prevents doing s2 = s1
+        s2 = (session.query(NodeNgram.ngram_id)
+              .filter(NodeNgram.node_id == stoplist_id)
+              .subquery()
+             )

         coocs_query = ( coocs_query
-            .join(m1, s1.ngram_id == x1.ngram_id)
-            .join(m2, s2.ngram_id == x2.ngram_id)
-            .filter( s1.node_id == mainlist_id )
-            .filter( s2.node_id == mainlist_id )
+            .outerjoin(s1, s1.c.ngram_id == Xindex_ngform_id)
+            .outerjoin(s2, s2.c.ngram_id == Yindex_ngform_id)

+            # equivalent to NOT IN stoplist
+            .filter( s1.c.ngram_id == None )
+            .filter( s2.c.ngram_id == None )
        )
-    if start:
-        if isinstance(start, datetime):
-            start_str = start.strftime("%Y-%m-%d %H:%M:%S")
-        else:
-            start_str = str(start)
-
-        # doc_ids matching this limit
-        # TODO s/subqueries/inner joins/ && thanks!
-        starttime_subquery = (session
-                                .query(NodeHyperdata.node_id)
-                                .filter(NodeHyperdata.key=="publication_date")
-                                .filter(NodeHyperdata.value_str >= start_str)
-                                .subquery()
-                             )
-        # direct use of str comparison op because there is consistency b/w
-        # sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
-
-        # the filtering by start limit
-        coocs_query = coocs_query.filter(x1.node_id.in_(starttime_subquery))
-
-    if end:
-        if isinstance(end, datetime):
-            end_str = end.strftime("%Y-%m-%d %H:%M:%S")
-        else:
-            end_str = str(end)
-
-        # TODO s/subqueries/inner joins/ && thanks!
-        endtime_subquery = (session
-                                .query(NodeHyperdata.node_id)
-                                .filter(NodeHyperdata.key=="publication_date")
-                                .filter(NodeHyperdata.value_str <= end_str)
-                                .subquery()
-                           )
-        # the filtering by end limit
-        coocs_query = coocs_query.filter(x1.node_id.in_(endtime_subquery))
+    if start or end:
+        Time = aliased(NodeHyperdata)
+
+        coocs_query = (coocs_query
+            .join(Time, Time.node_id == Xindex.node_id)
+            .filter(Time.key=="publication_date")
+        )
+
+    if start:
+        if not isinstance(start, datetime):
+            try:
+                start = datetime.strptime(start, '%Y-%m-%d')
+            except:
+                raise TypeError("'start' param expects datetime object or %%Y-%%m-%%d string")
+
+        # the filtering by start limit
+        coocs_query = coocs_query.filter(Time.value_utc >= start)
+
+    if end:
+        if not isinstance(end, datetime):
+            try:
+                end = datetime.strptime(end, '%Y-%m-%d')
+            except:
+                raise TypeError("'end' param expects datetime object or %%Y-%%m-%%d string")
+
+        # the filtering by end limit
+        coocs_query = coocs_query.filter(Time.value_utc <= end)
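With this parsing, callers can pass either datetime objects or '%Y-%m-%d' strings as the temporal limits; a usage sketch (argument values are illustrative):

```python
from datetime import datetime

# restrict cooccurrence counts to docs published in 2015
cooc_id = compute_coocs(corpus,
                        mainlist_id  = mainlist_id,
                        groupings_id = group_id,
                        start = "2015-01-01",            # parsed via strptime
                        end   = datetime(2015, 12, 31))  # datetime accepted directly
```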
     if symmetry_filter:
         # 1 filter taking symmetry into account
         #    -> halves the work !!
-        #    -> but will prevent direct access to Yindex's cooccurrences
-        #    -> they'll be scattered among the Xindex rows that preceded them
-        #    -> retrieval will be more expensive, via OR queries like:
+        #    -> but retrieval will be more expensive, via OR queries like:
         #       WHERE ngram1 = mon_ngram OR ngram2 = mon_ngram
-        coocs_query = coocs_query.filter(x1.ngram_id < x2.ngram_id)
+        coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)

-    # ------------
-    # 2 possible upstream filters to reduce the combinatorics
-    #        - e.g. 929k rows => 35k rows
-    #        - here on weight, but it degrades the results
-    #          => conceivable on another metric (cvalue or tfidf?)
-    # coocs_query = coocs_query.filter(x1.weight > 1)
-    # coocs_query = coocs_query.filter(x2.weight > 1)
-    # ------------

-    # 3) OUTPUT FILTERS
+    # 5) OUTPUT FILTERS
     # ------------------
     # threshold
     # £TODO adjust COOC_THRESHOLD a posteriori:
     # ex: sometimes 2 sometimes 4 depending on sparsity
     coocs_query = coocs_query.having(ucooc >= threshold)

-    # 4) EXECUTE QUERY
+    # 6) EXECUTE QUERY
     # ----------------
     #  => storage in our matrix structure
     matrix = WeightedMatrix(coocs_query.all())
+    # -------------------

     # fyi
     shape_0 = len({pair[0] for pair in matrix.items})
...
@@ -2042,6 +2042,29 @@ function AfterAjax(sourceUrl) {
     // console.log(JSON.stringify(NGrams))
     // -------------------------------------------------------------------

+    // ----------------------------------------- MAPLIST
+    // keepstateId = 1
+    keepstateId = System[0]["statesD"]["keep"]
+    if( Object.keys(NGrams["map"]).length>0 ) {
+        for(var ngram_id in NGrams["map"]) {
+            myNgramInfo = NGrams["main"].ngrams[ngram_id]
+            // initialize state of maplist items
+            myNgramInfo["state"] = keepstateId ;
+        }
+    }

+    // ----------------------------------------- STOPLIST
+    // delstateId = 2
+    delstateId = System[0]["statesD"]["delete"]
+    if( Object.keys(NGrams["stop"]).length>0 ) {
+        for(var ngram_id in NGrams["stop"]) {
+            console.log('stopping ' + ngram_id)
+            myNgramInfo = NGrams["main"].ngrams[ngram_id]
+            // initialize state of stoplist items
+            myNgramInfo["state"] = delstateId ;
+        }
+    }

     // Deleting subforms from the ngrams-table, clean start baby!
     if( Object.keys(NGrams["group"].links).length>0 ) {
@@ -2056,10 +2079,6 @@ function AfterAjax(sourceUrl) {
         }
     }

-    // debug:
-    // console.log('~~~~~~~~~~~~~> (sub) _forms')
-    // console.log( _forms )

     // ------------------------------------------- MAINLIST
     // ngrams_data_ will update NGrams.main.ngrams (with subforms removed)
     var ngrams_data_ = {}
@@ -2093,27 +2112,6 @@ function AfterAjax(sourceUrl) {
     // console.log( NGrams["main"] )

-    // ----------------------------------------- MAPLIST
-    if( Object.keys(NGrams["map"]).length>0 ) {
-        for(var ngram_id in NGrams["main"].ngrams) {
-            myNgram = NGrams["main"].ngrams[ngram_id]
-            if(NGrams["map"][ngram_id]) {
-                // keepstateId = 1
-                keepstateId = System[0]["statesD"]["keep"]
-                // initialize state of maplist items
-                myNgram["state"] = keepstateId ;
-            }
-            else if (NGrams["stop"][ngram_id]) {
-                // delstateId = 2
-                delstateId = System[0]["statesD"]["delete"]
-                // initialize state of stoplist items
-                myNgram["state"] = delstateId ;
-            }
-        }
-    }

     // Building the Score-Selector //NGrams["scores"]
     var FirstScore = NGrams["main"].scores.initial
     // TODO scores_div
...