Commit 6c438c85 authored by delanoe's avatar delanoe

Merge branch 'romain-refactoring' into unstable

parents f236759c e52afd97
......@@ -8,18 +8,19 @@ import re
LISTTYPES = {
'DOCUMENT' : WeightedList,
'GROUPLIST' : Translations,
'GROUPLIST' : Translations, # todo remove "LIST" from name
'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList,
'SPECIFICITY' : WeightedList,
'OCCURRENCES' : WeightedContextIndex,
'OCCURRENCES' : WeightedIndex, # todo replace by WeightedList
'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedContextIndex,
'TFIDF-GLOBAL' : WeightedContextIndex,
'TFIDF-CORPUS' : WeightedIndex, # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
'TFIDF-GLOBAL' : WeightedIndex, # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
}
NODETYPES = [
# TODO separate id not array index, read by models.node
None,
# documents hierarchy
'USER', # 1
......@@ -40,6 +41,7 @@ NODETYPES = [
'TFIDF-GLOBAL', # 14
# docs subset
'FAVORITES' # 15
# TODO add ti RANK
]
INDEXED_HYPERDATA = {
......
......@@ -2,7 +2,7 @@
"""
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex']
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedIndex']
from gargantext.util.db import session, bulk_insert
......@@ -165,15 +165,18 @@ class Translations(_BaseClass):
)
class WeightedContextIndex(_BaseClass):
class WeightedIndex(_BaseClass):
"""
associated model : NodeNodeNgram
associated columns : node1_id | node2_id | ngram_id | score (float)
^^^^
reserved for this
object's id
Tensor representing a contextual index or registry
(matrix of weighted ngrams *per* doc *per* context)
Matrix representing a weighted word index across docs or small context nodes
(matrix of weighted ngrams *per* doc)
Exemple : tfidf by corpus
Exemple : tfidf within a corpus
"""
def __init__(self, source=None):
self.items = defaultdict(float)
......@@ -182,7 +185,7 @@ class WeightedContextIndex(_BaseClass):
# ?TODO rename WeightedWordmatrix
class WeightedMatrix(_BaseClass):
def __init__(self, source=None):
......@@ -294,7 +297,7 @@ class WeightedMatrix(_BaseClass):
result.items[key1, key2] = value / sqrt(other.items[key1] * other.items[key2])
return result
# ?TODO rename Wordlist
class UnweightedList(_BaseClass):
def __init__(self, source=None):
......@@ -399,6 +402,7 @@ class UnweightedList(_BaseClass):
)
# ?TODO rename WeightedWordlist
class WeightedList(_BaseClass):
def __init__(self, source=None):
......
......@@ -111,20 +111,16 @@ def parse_extract_indexhyperdata(corpus):
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occs(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# ------------
# -> write local tfidf similarities to Node and NodeNodeNgram
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
occ_id = compute_occs(corpus, groupings_id = group_id)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
# -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
tirank_id = compute_ti_ranking(corpus,
count_scope="global",
termset_scope="local")
print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
groupings_id = group_id,
count_scope="global")
print('CORPUS #%d: [%s] new ti ranking node #%i' % (corpus.id, t(), tirank_id))
# -> mainlist: filter + write (to Node and NodeNgram)
mainlist_id = do_mainlist(corpus,
......@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
stoplist_id = stop_id)
print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
# -> write local tfidf similarities to Node and NodeNodeNgram
# TODO only on mainlist
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# => used for doc <=> ngram association
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus, cooc_id=cooc_id)
spec_id = compute_specificity(corpus, cooc_id=cooc_id
# ,groupings_id = group_id
)
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram)
......
......@@ -65,6 +65,9 @@ def do_mainlist(corpus,
ordered_filtered_tfidf = (session
.query(NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == ranking_scores_id)
# NOT IN but speed theoretically ok here
# see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
# but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
.filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
.order_by(desc(NodeNodeNgram.score))
)
......
This diff is collapsed.
This diff is collapsed.
......@@ -2042,6 +2042,29 @@ function AfterAjax(sourceUrl) {
// console.log(JSON.stringify(NGrams))
// -------------------------------------------------------------------
// ----------------------------------------- MAPLIST
// keepstateId = 1
keepstateId = System[0]["statesD"]["keep"]
if( Object.keys(NGrams["map"]).length>0 ) {
for(var ngram_id in NGrams["map"]) {
myNgramInfo = NGrams["main"].ngrams[ngram_id]
// initialize state of maplist items
myNgramInfo["state"] = keepstateId ;
}
}
// ----------------------------------------- STOPLIST
// delstateId = 2
delstateId = System[0]["statesD"]["delete"]
if( Object.keys(NGrams["stop"]).length>0 ) {
for(var ngram_id in NGrams["stop"]) {
console.log('stopping ' + ngram_id)
myNgramInfo = NGrams["main"].ngrams[ngram_id]
// initialize state of stoplist items
myNgramInfo["state"] = delstateId ;
}
}
// Deleting subforms from the ngrams-table, clean start baby!
if( Object.keys(NGrams["group"].links).length>0 ) {
......@@ -2056,10 +2079,6 @@ function AfterAjax(sourceUrl) {
}
}
// debug:
// console.log('~~~~~~~~~~~~~> (sub) _forms')
// console.log( _forms )
// ------------------------------------------- MAINLIST
// ngrams_data_ will update NGrams.main.ngrams (with subforms removed)
var ngrams_data_ = {}
......@@ -2093,27 +2112,6 @@ function AfterAjax(sourceUrl) {
// console.log( NGrams["main"] )
// ----------------------------------------- MAPLIST
if( Object.keys(NGrams["map"]).length>0 ) {
for(var ngram_id in NGrams["main"].ngrams) {
myNgram = NGrams["main"].ngrams[ngram_id]
if(NGrams["map"][ngram_id]) {
// keepstateId = 1
keepstateId = System[0]["statesD"]["keep"]
// initialize state of maplist items
myNgram["state"] = keepstateId ;
}
else if (NGrams["stop"][ngram_id]) {
// delstateId = 2
delstateId = System[0]["statesD"]["delete"]
// initialize state of stoplist items
myNgram["state"] = delstateId ;
}
}
}
// Building the Score-Selector //NGrams["scores"]
var FirstScore = NGrams["main"].scores.initial
// TODO scores_div
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment