Commit 6c438c85 authored by delanoe

Merge branch 'romain-refactoring' into unstable

parents f236759c e52afd97
......@@ -8,18 +8,19 @@ import re
LISTTYPES = {
'DOCUMENT' : WeightedList,
'GROUPLIST' : Translations,
'GROUPLIST' : Translations, # todo remove "LIST" from name
'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList,
'SPECIFICITY' : WeightedList,
'OCCURRENCES' : WeightedContextIndex,
'OCCURRENCES' : WeightedIndex, # todo replace by WeightedList
'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedContextIndex,
'TFIDF-GLOBAL' : WeightedContextIndex,
'TFIDF-CORPUS' : WeightedIndex, # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
'TFIDF-GLOBAL' : WeightedIndex, # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
}
NODETYPES = [
# TODO use separate ids, not array indices (this list is read by models.node)
None,
# documents hierarchy
'USER', # 1
......@@ -40,6 +41,7 @@ NODETYPES = [
'TFIDF-GLOBAL', # 14
# docs subset
'FAVORITES' # 15
# TODO add ti RANK
]
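# NB (illustrative sketch only, not part of this module): LISTTYPES is meant to
# map a node's typename to its in-memory container, assuming the
# gargantext.util.lists classes used as values here accept a source node id, e.g.:
#
#   def load_container(node):
#       cls = LISTTYPES[node.typename]   # e.g. UnweightedList for 'MAINLIST'
#       return cls(node.id)              # hypothetical helper, loads that node's rows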
INDEXED_HYPERDATA = {
......
......@@ -2,7 +2,7 @@
"""
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex']
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedIndex']
from gargantext.util.db import session, bulk_insert
......@@ -165,15 +165,18 @@ class Translations(_BaseClass):
)
class WeightedContextIndex(_BaseClass):
class WeightedIndex(_BaseClass):
"""
associated model : NodeNodeNgram
associated columns : node1_id | node2_id | ngram_id | score (float)
                     ^^^^^^^^
                     reserved for this object's id
Tensor representing a contextual index or registry
(matrix of weighted ngrams *per* doc *per* context)
Matrix representing a weighted word index across docs or small context nodes
(matrix of weighted ngrams *per* doc)
Example : tfidf by corpus
Example : tfidf within a corpus
"""
def __init__(self, source=None):
self.items = defaultdict(float)
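# NB (assumed layout, for illustration): with node1_id fixed to this object's id,
# the remaining columns suggest one float weight per (doc node, ngram) pair, e.g.:
#
#   idx = WeightedIndex()
#   idx.items[(doc_id, ngram_id)] = 0.42   # hypothetical ids; e.g. tfidf of an ngram in a doc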
......@@ -182,7 +185,7 @@ class WeightedContextIndex(_BaseClass):
# ?TODO rename WeightedWordmatrix
class WeightedMatrix(_BaseClass):
def __init__(self, source=None):
......@@ -294,7 +297,7 @@ class WeightedMatrix(_BaseClass):
result.items[key1, key2] = value / sqrt(other.items[key1] * other.items[key2])
return result
# ?TODO rename Wordlist
class UnweightedList(_BaseClass):
def __init__(self, source=None):
......@@ -399,6 +402,7 @@ class UnweightedList(_BaseClass):
)
# ?TODO rename WeightedWordlist
class WeightedList(_BaseClass):
def __init__(self, source=None):
......
......@@ -111,20 +111,16 @@ def parse_extract_indexhyperdata(corpus):
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occs(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# ------------
# -> write local tfidf similarities to Node and NodeNodeNgram
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
occ_id = compute_occs(corpus, groupings_id = group_id)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
# -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
tirank_id = compute_ti_ranking(corpus,
count_scope="global",
termset_scope="local")
print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
groupings_id = group_id,
count_scope="global")
print('CORPUS #%d: [%s] new ti ranking node #%i' % (corpus.id, t(), tirank_id))
# -> mainlist: filter + write (to Node and NodeNgram)
mainlist_id = do_mainlist(corpus,
......@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
stoplist_id = stop_id)
print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
# -> write local tfidf similarities to Node and NodeNodeNgram
# TODO only on mainlist
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# => used for doc <=> ngram association
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus, cooc_id=cooc_id)
spec_id = compute_specificity(corpus, cooc_id=cooc_id
# ,groupings_id = group_id
)
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram)
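# Recap of the reworked chain above, showing where the new groupings_id flows
# (same calls as above, condensed; elided arguments left elided):
#
#   group_id    = compute_groups(corpus, stoplist_id = None)
#   occ_id      = compute_occs(corpus, groupings_id = group_id)
#   tirank_id   = compute_ti_ranking(corpus, groupings_id = group_id, count_scope = "global")
#   mainlist_id = do_mainlist(corpus, ..., stoplist_id = stop_id)
#   ltfidf_id   = compute_tfidf_local(corpus)          # doc <=> ngram association
#   cooc_id     = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
#   spec_id     = compute_specificity(corpus, cooc_id = cooc_id)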
......
......@@ -65,6 +65,9 @@ def do_mainlist(corpus,
ordered_filtered_tfidf = (session
.query(NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == ranking_scores_id)
# NOT IN, but speed is theoretically OK here
# see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
# but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
.filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
.order_by(desc(NodeNodeNgram.score))
)
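# Illustrative alternative (not used further): the LEFT JOIN anti-join mentioned
# in the links above, built on the same names; assumes stopterms_subquery
# exposes an ngram_id column.
ordered_filtered_tfidf_alt = (session
    .query(NodeNodeNgram.ngram_id)
    .filter(NodeNodeNgram.node1_id == ranking_scores_id)
    .outerjoin(stopterms_subquery,
               stopterms_subquery.c.ngram_id == NodeNodeNgram.ngram_id)
    .filter(stopterms_subquery.c.ngram_id == None)   # <- keep rows with no stopword match
    .order_by(desc(NodeNodeNgram.score))
)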
......
......@@ -9,13 +9,15 @@ FIXME: "having the same source" means we need to select inside hyperdata
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db_cache import cache
from gargantext.util.db import session, bulk_insert, aliased, \
func # = sqlalchemy.func like sum() or count()
from sqlalchemy.sql.expression import case # for choice if ngram has mainform or not
from sqlalchemy import distinct # for list of unique ngram_ids within a corpus
from math import log
from re import match
# £TODO
# from gargantext.util.lists import WeightedContextIndex
# from gargantext.util.lists import WeightedIndex
def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
......@@ -32,7 +34,7 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
Parameters:
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
- groupings_id: optional id of a GROUPLIST node for this corpus
- groupings_id: optional id of a GROUPLIST node for these ngrams
IF absent the occurrences are the sums for each ngram
IF present they're the sums for each ngram's mainform
"""
......@@ -115,7 +117,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
if overwrite_id:
# overwrite pre-existing id
the_id = overwrite_id
# occnode = cache.Node[overwrite_id]
session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
session.commit()
else:
# create the new OCCURRENCES node
occnode = corpus.add_child(
......@@ -126,8 +129,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
session.commit()
the_id = occnode.id
# reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
# £TODO replace bulk_insert by something like WeightedContextMatrix.save()
# £TODO make it NodeNgram instead of NodeNodeNgram! and rebase :/
#       (same for ti_ranking)
bulk_insert(
NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'),
......@@ -137,14 +140,26 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
return the_id
def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overwrite_id=None):
def compute_ti_ranking(corpus,
groupings_id = None,
count_scope="local", termset_scope="local",
overwrite_id=None):
"""
# TODO check if cumulated tfs correspond to app's use cases and intention
Calculates tfidf ranking (cumulated tfidf for each ngram) within given scope
Calculates tfidf ranking within given scope
                ----------
                          |
                via weighting of
                cumulated tfidf --------- Sum_j(tf_ij) * ln( N / |{docs d_j : ng_i ∈ d_j}| )
                                          per ngram ng_i
                                          (or per mainform ng_i' if groups)
                                          across some docs d_j
Parameters:
- the corpus itself
- the corpus itself (or corpus_id)
- groupings_id: optional id of a GROUPLIST node for these ngrams
IF absent the ti weights are the sums for each ngram
IF present they're the sums for each ngram's mainform
- count_scope: {"local" or "global"}
- local <=> frequencies counted in the current corpus
- global <=> frequencies counted in all corpora of this type
......@@ -153,43 +168,94 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
- termset_scope: {"local" or "global"}
- local <=> output list of terms limited to the current corpus
(SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
- global <=> output list of terms from all corpora of this type
- global <=> output list of terms found in global doc scope
!!!! (many more terms)
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
- overwrite_id: optional id of a pre-existing XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
# validate string params
if count_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
if termset_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
if count_scope == "local" and termset_scope == "global":
raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
# get corpus
if type(corpus) == int:
corpus_id = corpus
corpus = cache.Node[corpus_id]
elif type(corpus) == str and match(r'\d+$', corpus):
corpus_id = int(corpus)
corpus = cache.Node[corpus_id]
else:
# assuming Node class
corpus_id = corpus.id
# prepare sqla mainform vs ngram selector
ngform_i = None
if not groupings_id:
ngform_i = NodeNgram.ngram_id
else:
# prepare translations
syno = (session.query(NodeNgramNgram.ngram1_id,
NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == groupings_id)
.subquery()
)
# see the detailed comment in compute_occs() + todo factorize
ngform_i = case([
(syno.c.ngram1_id != None, syno.c.ngram1_id),
(syno.c.ngram1_id == None, NodeNgram.ngram_id)
# condition value
])
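# NB once the outerjoin on syno is added below, the case() above behaves like a
# plain SQL COALESCE: use the mainform if a (subform -> mainform) row matched,
# otherwise the ngram itself. An equivalent, more compact selector
# (sketch, not used further) would be:
ngform_i_alt = func.coalesce(syno.c.ngram1_id, NodeNgram.ngram_id)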
# MAIN QUERY SKELETON
tf_nd_query = (session
.query(
NodeNgram.ngram_id,
# NodeNgram.ngram_id
# or similar if grouping ngrams under their mainform
ngform_i.label("counted_ngform"),
# the tfidf elements
# ------------------
func.sum(NodeNgram.weight), # tf: same as occurrences
# -----------------------
func.count(NodeNgram.node_id) # nd: n docs with term
# --------------------
)
.group_by(NodeNgram.ngram_id)
.group_by("counted_ngform")
# optional *count_scope*: if we'll restrict the doc nodes
# -------------
# count_scope to specify in which doc nodes to count
# -----------
# .join(countdocs_subquery,
# countdocs_subquery.c.id == NodeNgram.node_id)
# optional *termset_scope*: if we'll restrict the ngrams
# ---------------
# optional termset_scope: if we'll restrict the ngrams
# -------------
# .join(termset_subquery,
# termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
# optional translations to bring in the subform's replacement (mainform)
# ------------
# .outerjoin(syno,
# syno.c.ngram2_id == NodeNgram.ngram_id)
)
# validate string params
if count_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
if termset_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
if count_scope == "local" and termset_scope == "global":
raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
# TUNING THE QUERY
if groupings_id:
tf_nd_query = tf_nd_query.outerjoin(
syno,
syno.c.ngram2_id == NodeNgram.ngram_id
)
# local <=> within this corpus
if count_scope == "local":
......@@ -197,14 +263,14 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
countdocs_subquery = (session
.query(Node.id)
.filter(Node.typename == "DOCUMENT")
.filter(Node.parent_id == corpus.id)
.filter(Node.parent_id == corpus_id)
.subquery()
)
# both scopes are the same: no need to independently restrict the ngrams
# no need to independently restrict the ngrams
tf_nd_query = tf_nd_query.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
# ---
# global <=> within all corpora of this source
elif count_scope == "global":
......@@ -220,6 +286,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
# join on parent_id with selected corpora nodes
.join(CorpusNode, CorpusNode.id == Node.parent_id)
.filter(CorpusNode.typename == "CORPUS")
# TODO index corpus_sourcetype in DB
.filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(this_source_type))
.subquery()
)
......@@ -228,15 +295,19 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
# both scopes are the same: no need to independently restrict the ngrams
tf_nd_query = tf_nd_query.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
# ---
elif termset_scope == "local":
# All unique terms in the original corpus
# All unique terms...
termset_subquery = (session
.query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
.query(
distinct(NodeNgram.ngram_id).label("uniq_ngid")
)
# ... in the original corpus
.join(Node)
.filter(Node.typename == "DOCUMENT")
.filter(Node.parent_id == corpus.id)
.filter(Node.parent_id == corpus_id)
.subquery()
)
......@@ -247,42 +318,59 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
.join(termset_subquery,
termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
)
# ---
# N
# M
total_docs = session.query(countdocs_subquery).count()
log_tot_docs = log(total_docs)
# result
tf_nd = tf_nd_query.all()
# -------------------------------------------------
tfidfs = {}
log_tot_docs = log(total_docs)
for (ngram_id, tf, nd) in tf_nd:
# tfidfs[ngram_id] = tf * log(total_docs/nd)
tfidfs[ngram_id] = tf * (log_tot_docs-log(nd))
# -------------------------------------------------
# -------------- "sommatoire" sur mot i ----------------
tfidfsum = {}
for (ngram_i, tf_i, nd_i) in tf_nd:
# tfidfsum[ngram_i] = tf_i * log(total_docs/nd_i)
tfidfsum[ngram_i] = tf_i * (log_tot_docs-log(nd_i))
# ------------------------------------------------------
# N (for info)
total_ngramforms = len(tfidfsum)
if overwrite_id:
the_id = overwrite_id
session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
session.commit()
else:
# create the new TFIDF-XXXX node
tfidf_nd = corpus.add_child()
if count_scope == "local": # TODO discuss use and find new typename
tfidf_nd.typename = "TFIDF-CORPUS"
tfidf_nd.name = "tfidf-cumul-corpus (in:%s)" % corpus.id
# create the new TFIDF-XXXX node to get an id
tir_nd = corpus.add_child()
if count_scope == "local":
tir_nd.typename = "TFIDF-CORPUS"
tir_nd.name = "ti rank (%i ngforms in corpus:%s)" % (
total_ngramforms, corpus_id)
elif count_scope == "global":
tfidf_nd.typename = "TFIDF-GLOBAL"
tfidf_nd.name = "tfidf-cumul-global (in type:%s)" % this_source_type
session.add(tfidf_nd)
tir_nd.typename = "TFIDF-GLOBAL"
tir_nd.name = "ti rank (%i ngforms %s in corpora of sourcetype:%s)" % (
total_ngramforms,
("from corpus %i" % corpus_id) if (termset_scope == "local") else "" ,
this_source_type)
session.add(tir_nd)
session.commit()
the_id = tfidf_nd.id
the_id = tir_nd.id
# TODO 1 discuss use and find new typename
# TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
# TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
# TODO 4 requalify this here as a NodeNgram
# then TODO 5 use WeightedList.save() !
# reflect that in NodeNodeNgrams
# £TODO replace bulk_insert by something like WeightedContextMatrix.save()
bulk_insert(
NodeNodeNgram,
('node1_id', 'node2_id','ngram_id', 'score'),
((the_id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum)
)
return the_id
......@@ -347,6 +435,8 @@ def compute_tfidf_local(corpus, overwrite_id=None):
if overwrite_id:
the_id = overwrite_id
session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
session.commit()
else:
# create the new TFIDF-CORPUS node
tfidf_node = corpus.add_child()
......@@ -357,7 +447,7 @@ def compute_tfidf_local(corpus, overwrite_id=None):
the_id = tfidf_node.id
# reflect that in NodeNodeNgrams
# £TODO replace bulk_insert by something like WeightedContextMatrix.save()
# £TODO replace bulk_insert by something like WeightedIndex.save()
bulk_insert(
NodeNodeNgram,
('node1_id', 'node2_id','ngram_id', 'score'),
......
from gargantext.models import Node, NodeNgram, NodeNgramNgram, \
NodeHyperdata
NodeHyperdata, Ngram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD
from datetime import datetime
from sqlalchemy.sql.expression import case # for choice if ngram has mainform or not
def compute_coocs( corpus,
overwrite_id = None,
threshold = DEFAULT_COOC_THRESHOLD,
groupings_id = None,
mainlist_id = None,
stoplist_id = None,
start = None,
......@@ -41,9 +44,11 @@ def compute_coocs( corpus,
- overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
(all hyperdata and previous NodeNgramNgram rows will be replaced)
- threshold: on output cooc count (previously called hapax)
- groupings_id: optional synonym relations used to merge each subform's counts
into its mainform's counts
- mainlist_id: mainlist to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is provided)
(normally unnecessary if a mainlist is already provided)
- start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
......@@ -56,25 +61,24 @@ def compute_coocs( corpus,
basic idea for one doc
======================
each pair of ngrams sharing same doc (node_id)
SELECT idx1.ngram_id, idx2.ngram_id
FROM nodes_ngrams AS idx1, nodes_ngrams AS idx2
SELECT idxa.ngram_id, idxb.ngram_id
FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
---------------------------------
WHERE idx1.node_id = idx2.node_id <== that's cooc
WHERE idxa.node_id = idxb.node_id <== that's cooc
---------------------------------
AND idx1.ngram_id <> idx2.ngram_id
AND idx1.node_id = MY_DOC ;
AND idxa.ngram_id <> idxb.ngram_id
AND idxa.node_id = MY_DOC ;
on entire corpus
=================
coocs for each doc :
- each given pair like (termA, termB) will likely appear several times
=> we do GROUP BY (x1.ngram_id, x2.ngram_id)
=> we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
- we count unique appearances of the pair (cooc)
"""
# - TODO add grouped element's values in grouping 'chief ngram'
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO weighted: if False normal cooc to be saved as result
......@@ -85,130 +89,190 @@ def compute_coocs( corpus,
# 1,859,408 rows for the simple cooc query
# 71,134 rows when restricting to ngrams with an occ > 1 (weight)
# docs of our corpus
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
# 2 x the occurrence index table
Xindex = aliased(NodeNgram)
Yindex = aliased(NodeNgram)
# for debug (1/4)
# Xngram = aliased(Ngram)
# Yngram = aliased(Ngram)
# 1) prepare definition of counted forms
if not groupings_id:
# no groupings => the counted forms are the ngrams
Xindex_ngform_id = Xindex.ngram_id
Yindex_ngform_id = Yindex.ngram_id
# groupings: see the detailed comment in compute_occs() + todo factorize
else:
# prepare translations
Xsyno = (session.query(NodeNgramNgram.ngram1_id,
NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == groupings_id)
.subquery()
)
# 2 x the occurrence index table
x1 = aliased(NodeNgram)
x2 = aliased(NodeNgram)
# later use as anonymous tables prevents simply doing Ysyno = Xsyno
Ysyno = (session.query(NodeNgramNgram.ngram1_id,
NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == groupings_id)
.subquery()
)
# cooccurrences columns definition
ucooc = func.count(x1.ngram_id).label("ucooc")
# groupings => define the counted form depending on the existence of a synonym
Xindex_ngform_id = case([
(Xsyno.c.ngram1_id != None, Xsyno.c.ngram1_id),
(Xsyno.c.ngram1_id == None, Xindex.ngram_id)
# condition value
])
# 1) MAIN DB QUERY
coocs_query = (
session.query(x1.ngram_id, x2.ngram_id, ucooc)
.join(Node, Node.id == x1.node_id) # <- b/c within corpus
.join(x2, x1.node_id == Node.id ) # <- b/c within corpus
Yindex_ngform_id = case([
(Ysyno.c.ngram1_id != None, Ysyno.c.ngram1_id),
(Ysyno.c.ngram1_id == None, Yindex.ngram_id)
])
# ---
# 2) BASE DB QUERY
# cooccurrences columns definition ----------------
ucooc = func.count(Xindex_ngform_id).label("ucooc")
# NB could be X or Y in this line
# (we're counting grouped rows and just happen to do it on this column)
base_query = (
session.query(
Xindex_ngform_id,
Yindex_ngform_id,
ucooc
# for debug (2/4)
#, Xngram.terms.label("w_x")
#, Yngram.terms.label("w_y")
)
.join(Yindex, Xindex.node_id == Yindex.node_id ) # <- by definition of cooc
.join(Node, Node.id == Xindex.node_id) # <- b/c within corpus
.filter(Node.parent_id == corpus.id) # <- b/c within corpus
.filter(Node.typename == "DOCUMENT") # <- b/c within corpus
.filter(Xindex_ngform_id != Yindex_ngform_id) # <- b/c not with itself
)
# outerjoin the synonyms if needed
if groupings_id:
base_query = (base_query
.outerjoin(Xsyno, # <- synonyms for Xindex.ngrams
Xsyno.c.ngram2_id == Xindex.ngram_id)
.outerjoin(Ysyno, # <- synonyms for Yindex.ngrams
Ysyno.c.ngram2_id == Yindex.ngram_id)
)
# 3) counting clause in any case
coocs_query = (base_query
.group_by(
Xindex_ngform_id, Yindex_ngform_id # <- what we're counting
# for debug (3/4)
#,"w_x", "w_y"
)
# for debug (4/4)
#.join(Xngram, Xngram.id == Xindex_ngform_id)
#.join(Yngram, Yngram.id == Yindex_ngform_id)
.filter(x1.node_id == x2.node_id) # <- by definition of cooc
.filter(x1.ngram_id != x2.ngram_id) # <- b/c not with itself
.group_by(x1.ngram_id, x2.ngram_id)
.order_by(ucooc)
)
# 2) INPUT FILTERS (reduce N before O(N²))
# 4) INPUT FILTERS (reduce N before O(N²))
if mainlist_id:
m1 = aliased(NodeNgram)
m2 = aliased(NodeNgram)
coocs_query = ( coocs_query
.join(m1, m1.ngram_id == x1.ngram_id)
.join(m2, m2.ngram_id == x2.ngram_id)
.join(m1, m1.ngram_id == Xindex_ngform_id)
.join(m2, m2.ngram_id == Yindex_ngform_id)
.filter( m1.node_id == mainlist_id )
.filter( m2.node_id == mainlist_id )
)
if stoplist_id:
s1 = aliased(NodeNgram)
s2 = aliased(NodeNgram)
s1 = (session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == stoplist_id)
.subquery()
)
# later use as anonymous tables prevents simply doing s2 = s1
s2 = (session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == stoplist_id)
.subquery()
)
coocs_query = ( coocs_query
.join(m1, s1.ngram_id == x1.ngram_id)
.join(m2, s2.ngram_id == x2.ngram_id)
.outerjoin(s1, s1.c.ngram_id == Xindex_ngform_id)
.outerjoin(s2, s2.c.ngram_id == Yindex_ngform_id)
# equivalent to NOT IN stoplist
.filter( s1.c.ngram_id == None )
.filter( s2.c.ngram_id == None )
.filter( s1.node_id == mainlist_id )
.filter( s2.node_id == mainlist_id )
)
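# Equivalent NOT IN filters (sketch only, not applied; the outerjoin + IS NULL
# above is the anti-join form of the same restriction, see also the NOT IN
# performance links in do_mainlist):
stop_ng_ids = (session.query(NodeNgram.ngram_id)
               .filter(NodeNgram.node_id == stoplist_id))
not_in_stoplist = (~ Xindex_ngform_id.in_(stop_ng_ids),
                   ~ Yindex_ngform_id.in_(stop_ng_ids))   # could be passed to .filter()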
if start:
if isinstance(start, datetime):
start_str = start.strftime("%Y-%m-%d %H:%M:%S")
else:
start_str = str(start)
# doc_ids matching this limit
# TODO s/subqueries/inner joins/ && thanks!
starttime_subquery = (session
.query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date")
.filter(NodeHyperdata.value_str >= start_str)
.subquery()
if start or end:
Time = aliased(NodeHyperdata)
coocs_query = (coocs_query
.join(Time, Time.node_id == Xindex.node_id)
.filter(Time.key=="publication_date")
)
# direct use of str comparison op because there is consistency b/w
# sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
if start:
if not isinstance(start, datetime):
try:
start = datetime.strptime(start, '%Y-%m-%d')
except:
raise TypeError("'start' param expects datetime object or %%Y-%%m-%%d string")
# the filtering by start limit
coocs_query = coocs_query.filter(x1.node_id.in_(starttime_subquery))
coocs_query = coocs_query.filter(Time.value_utc >= start)
if end:
if isinstance(end, datetime):
end_str = end.strftime("%Y-%m-%d %H:%M:%S")
else:
end_str = str(end)
if not isinstance(end, datetime):
try:
end = datetime.strptime(end, '%Y-%m-%d')
except:
raise TypeError("'end' param expects datetime object or %%Y-%%m-%%d string")
# TODO s/subqueries/inner joins/ && thanks!
endtime_subquery = (session
.query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date")
.filter(NodeHyperdata.value_str <= end_str)
.subquery()
)
# the filtering by end limit
coocs_query = coocs_query.filter(x1.node_id.in_(endtime_subquery))
# the filtering by end limit
coocs_query = coocs_query.filter(Time.value_utc <= end)
if symmetry_filter:
# 1. a filter taking the symmetry into account
#  -> halves the work !!
#  -> but will prevent direct access to the cooccurrences of x2
#  -> they will be scattered: recorded under the x1 that preceded x2
#  -> retrieval will be more costly, via OR queries like:
#  -> but retrieval will be more costly, via OR queries like:
#     WHERE ngram1 = mon_ngram OR ngram2 = mon_ngram
coocs_query = coocs_query.filter(x1.ngram_id < x2.ngram_id)
coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)
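# With this half-matrix storage, fetching all cooccurrents of one term later
# needs to look on both sides of the stored pairs, e.g. (sketch on the resulting
# NodeNgramNgram rows, with hypothetical cooc_node_id and my_ngram_id):
#
#   pairs = (session.query(NodeNgramNgram)
#            .filter(NodeNgramNgram.node_id == cooc_node_id)
#            .filter(  (NodeNgramNgram.ngram1_id == my_ngram_id)
#                    | (NodeNgramNgram.ngram2_id == my_ngram_id))
#           )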
# ------------
# 2. possible upstream filters to reduce the combinatorics
#    - for example 929k rows => 35k rows
#    - here on weight, but it degrades the results
#    => conceivable on another metric (cvalue or tfidf?)
# coocs_query = coocs_query.filter(x1.weight > 1)
# coocs_query = coocs_query.filter(x2.weight > 1)
# ------------
# 3) OUTPUT FILTERS
# 5) OUTPUT FILTERS
# ------------------
# threshold
# £TODO adjust COOC_THRESHOLD a posteriori:
# ex: sometimes 2 sometimes 4 depending on sparsity
coocs_query = coocs_query.having(ucooc >= threshold)
# 4) EXECUTE QUERY
# 6) EXECUTE QUERY
# ----------------
# => storage in our matrix structure
matrix = WeightedMatrix(coocs_query.all())
# -------------------
# fyi
shape_0 = len({pair[0] for pair in matrix.items})
......
......@@ -2042,6 +2042,29 @@ function AfterAjax(sourceUrl) {
// console.log(JSON.stringify(NGrams))
// -------------------------------------------------------------------
// ----------------------------------------- MAPLIST
// keepstateId = 1
keepstateId = System[0]["statesD"]["keep"]
if( Object.keys(NGrams["map"]).length>0 ) {
for(var ngram_id in NGrams["map"]) {
myNgramInfo = NGrams["main"].ngrams[ngram_id]
// initialize state of maplist items
myNgramInfo["state"] = keepstateId ;
}
}
// ----------------------------------------- STOPLIST
// delstateId = 2
delstateId = System[0]["statesD"]["delete"]
if( Object.keys(NGrams["stop"]).length>0 ) {
for(var ngram_id in NGrams["stop"]) {
console.log('stopping ' + ngram_id)
myNgramInfo = NGrams["main"].ngrams[ngram_id]
// initialize state of stoplist items
myNgramInfo["state"] = delstateId ;
}
}
// Deleting subforms from the ngrams-table, clean start baby!
if( Object.keys(NGrams["group"].links).length>0 ) {
......@@ -2056,10 +2079,6 @@ function AfterAjax(sourceUrl) {
}
}
// debug:
// console.log('~~~~~~~~~~~~~> (sub) _forms')
// console.log( _forms )
// ------------------------------------------- MAINLIST
// ngrams_data_ will update NGrams.main.ngrams (with subforms removed)
var ngrams_data_ = {}
......@@ -2093,27 +2112,6 @@ function AfterAjax(sourceUrl) {
// console.log( NGrams["main"] )
// ----------------------------------------- MAPLIST
if( Object.keys(NGrams["map"]).length>0 ) {
for(var ngram_id in NGrams["main"].ngrams) {
myNgram = NGrams["main"].ngrams[ngram_id]
if(NGrams["map"][ngram_id]) {
// keepstateId = 1
keepstateId = System[0]["statesD"]["keep"]
// initialize state of maplist items
myNgram["state"] = keepstateId ;
}
else if (NGrams["stop"][ngram_id]) {
// delstateId = 2
delstateId = System[0]["statesD"]["delete"]
// initialize state of stoplist items
myNgram["state"] = delstateId ;
}
}
}
// Building the Score-Selector //NGrams["scores"]
var FirstScore = NGrams["main"].scores.initial
// TODO scores_div
......