Commit 6c438c85 authored by delanoe

Merge branch 'romain-refactoring' into unstable

parents f236759c e52afd97
...@@ -8,18 +8,19 @@ import re ...@@ -8,18 +8,19 @@ import re
LISTTYPES = { LISTTYPES = {
'DOCUMENT' : WeightedList, 'DOCUMENT' : WeightedList,
'GROUPLIST' : Translations, 'GROUPLIST' : Translations, # todo remove "LIST" from name
'STOPLIST' : UnweightedList, 'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList, 'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList, 'MAPLIST' : UnweightedList,
'SPECIFICITY' : WeightedList, 'SPECIFICITY' : WeightedList,
'OCCURRENCES' : WeightedContextIndex, 'OCCURRENCES' : WeightedIndex, # todo replace by WeightedList
'COOCCURRENCES': WeightedMatrix, 'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedContextIndex, 'TFIDF-CORPUS' : WeightedIndex, # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
'TFIDF-GLOBAL' : WeightedContextIndex, 'TFIDF-GLOBAL' : WeightedIndex, # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
} }
NODETYPES = [ NODETYPES = [
# TODO separate id not array index, read by models.node
None, None,
# documents hierarchy # documents hierarchy
'USER', # 1 'USER', # 1
...@@ -40,6 +41,7 @@ NODETYPES = [ ...@@ -40,6 +41,7 @@ NODETYPES = [
'TFIDF-GLOBAL', # 14 'TFIDF-GLOBAL', # 14
# docs subset # docs subset
'FAVORITES' # 15 'FAVORITES' # 15
# TODO add ti RANK
] ]
INDEXED_HYPERDATA = { INDEXED_HYPERDATA = {
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
""" """
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex'] __all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedIndex']
from gargantext.util.db import session, bulk_insert from gargantext.util.db import session, bulk_insert
...@@ -165,15 +165,18 @@ class Translations(_BaseClass): ...@@ -165,15 +165,18 @@ class Translations(_BaseClass):
) )
class WeightedContextIndex(_BaseClass): class WeightedIndex(_BaseClass):
""" """
associated model : NodeNodeNgram associated model : NodeNodeNgram
associated columns : node1_id | node2_id | ngram_id | score (float) associated columns : node1_id | node2_id | ngram_id | score (float)
^^^^
reserved for this
object's id
Tensor representing a contextual index or registry Matrix representing a weighted word index across docs or small context nodes
(matrix of weighted ngrams *per* doc *per* context) (matrix of weighted ngrams *per* doc)
Exemple : tfidf by corpus Exemple : tfidf within a corpus
""" """
def __init__(self, source=None): def __init__(self, source=None):
self.items = defaultdict(float) self.items = defaultdict(float)
...@@ -182,7 +185,7 @@ class WeightedContextIndex(_BaseClass): ...@@ -182,7 +185,7 @@ class WeightedContextIndex(_BaseClass):
# ?TODO rename WeightedWordmatrix
class WeightedMatrix(_BaseClass): class WeightedMatrix(_BaseClass):
def __init__(self, source=None): def __init__(self, source=None):
...@@ -294,7 +297,7 @@ class WeightedMatrix(_BaseClass): ...@@ -294,7 +297,7 @@ class WeightedMatrix(_BaseClass):
result.items[key1, key2] = value / sqrt(other.items[key1] * other.items[key2]) result.items[key1, key2] = value / sqrt(other.items[key1] * other.items[key2])
return result return result
# ?TODO rename Wordlist
class UnweightedList(_BaseClass): class UnweightedList(_BaseClass):
def __init__(self, source=None): def __init__(self, source=None):
...@@ -399,6 +402,7 @@ class UnweightedList(_BaseClass): ...@@ -399,6 +402,7 @@ class UnweightedList(_BaseClass):
) )
# ?TODO rename WeightedWordlist
class WeightedList(_BaseClass): class WeightedList(_BaseClass):
def __init__(self, source=None): def __init__(self, source=None):
......
...@@ -111,20 +111,16 @@ def parse_extract_indexhyperdata(corpus): ...@@ -111,20 +111,16 @@ def parse_extract_indexhyperdata(corpus):
group_id = compute_groups(corpus, stoplist_id = None) group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id)) print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occs(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# ------------ # ------------
# -> write local tfidf similarities to Node and NodeNodeNgram # -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
ltfidf_id = compute_tfidf_local(corpus) occ_id = compute_occs(corpus, groupings_id = group_id)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id)) print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram # -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
tirank_id = compute_ti_ranking(corpus, tirank_id = compute_ti_ranking(corpus,
count_scope="global", groupings_id = group_id,
termset_scope="local") count_scope="global")
print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id)) print('CORPUS #%d: [%s] new ti ranking node #%i' % (corpus.id, t(), tirank_id))
# -> mainlist: filter + write (to Node and NodeNgram) # -> mainlist: filter + write (to Node and NodeNgram)
mainlist_id = do_mainlist(corpus, mainlist_id = do_mainlist(corpus,
...@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus): ...@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
stoplist_id = stop_id) stoplist_id = stop_id)
print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id)) print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
# -> write local tfidf similarities to Node and NodeNodeNgram
# TODO only on mainlist
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# => used for doc <=> ngram association
# ------------ # ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram) # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id) cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id)) print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
# -> specificity: compute + write (=> NodeNodeNgram) # -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus, cooc_id=cooc_id) spec_id = compute_specificity(corpus, cooc_id=cooc_id
# ,groupings_id = group_id
)
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id)) print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram) # ?? maplist: compute + write (to Node and NodeNgram)
......
...@@ -65,6 +65,9 @@ def do_mainlist(corpus, ...@@ -65,6 +65,9 @@ def do_mainlist(corpus,
ordered_filtered_tfidf = (session ordered_filtered_tfidf = (session
.query(NodeNodeNgram.ngram_id) .query(NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == ranking_scores_id) .filter(NodeNodeNgram.node1_id == ranking_scores_id)
# NOT IN but speed theoretically ok here
# see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
# but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
.filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery)) .filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
.order_by(desc(NodeNodeNgram.score)) .order_by(desc(NodeNodeNgram.score))
) )
......
This diff is collapsed.
This diff is collapsed.
...@@ -2042,6 +2042,29 @@ function AfterAjax(sourceUrl) { ...@@ -2042,6 +2042,29 @@ function AfterAjax(sourceUrl) {
// console.log(JSON.stringify(NGrams)) // console.log(JSON.stringify(NGrams))
// ------------------------------------------------------------------- // -------------------------------------------------------------------
// ----------------------------------------- MAPLIST
// keepstateId = 1
keepstateId = System[0]["statesD"]["keep"]
if( Object.keys(NGrams["map"]).length>0 ) {
for(var ngram_id in NGrams["map"]) {
myNgramInfo = NGrams["main"].ngrams[ngram_id]
// initialize state of maplist items
myNgramInfo["state"] = keepstateId ;
}
}
// ----------------------------------------- STOPLIST
// delstateId = 2
delstateId = System[0]["statesD"]["delete"]
if( Object.keys(NGrams["stop"]).length>0 ) {
for(var ngram_id in NGrams["stop"]) {
console.log('stopping ' + ngram_id)
myNgramInfo = NGrams["main"].ngrams[ngram_id]
// initialize state of stoplist items
myNgramInfo["state"] = delstateId ;
}
}
// Deleting subforms from the ngrams-table, clean start baby! // Deleting subforms from the ngrams-table, clean start baby!
if( Object.keys(NGrams["group"].links).length>0 ) { if( Object.keys(NGrams["group"].links).length>0 ) {
...@@ -2056,10 +2079,6 @@ function AfterAjax(sourceUrl) { ...@@ -2056,10 +2079,6 @@ function AfterAjax(sourceUrl) {
} }
} }
// debug:
// console.log('~~~~~~~~~~~~~> (sub) _forms')
// console.log( _forms )
// ------------------------------------------- MAINLIST // ------------------------------------------- MAINLIST
// ngrams_data_ will update NGrams.main.ngrams (with subforms removed) // ngrams_data_ will update NGrams.main.ngrams (with subforms removed)
var ngrams_data_ = {} var ngrams_data_ = {}
...@@ -2093,27 +2112,6 @@ function AfterAjax(sourceUrl) { ...@@ -2093,27 +2112,6 @@ function AfterAjax(sourceUrl) {
// console.log( NGrams["main"] ) // console.log( NGrams["main"] )
// ----------------------------------------- MAPLIST
if( Object.keys(NGrams["map"]).length>0 ) {
for(var ngram_id in NGrams["main"].ngrams) {
myNgram = NGrams["main"].ngrams[ngram_id]
if(NGrams["map"][ngram_id]) {
// keepstateId = 1
keepstateId = System[0]["statesD"]["keep"]
// initialize state of maplist items
myNgram["state"] = keepstateId ;
}
else if (NGrams["stop"][ngram_id]) {
// delstateId = 2
delstateId = System[0]["statesD"]["delete"]
// initialize state of stoplist items
myNgram["state"] = delstateId ;
}
}
}
// Building the Score-Selector //NGrams["scores"] // Building the Score-Selector //NGrams["scores"]
var FirstScore = NGrams["main"].scores.initial var FirstScore = NGrams["main"].scores.initial
// TODO scores_div // TODO scores_div
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment