Commit 144b774e authored by Romain Loth

tidying up

parent 58aa990d
......@@ -2,13 +2,13 @@ from .parsing import parse
from .ngrams_extraction import extract_ngrams
# in usual run order
from .list_stop import do_stoplist
from .ngram_scores import compute_occurrences_local, compute_tfidf
from .list_main import do_mainlist
from .ngram_coocs_tempo import compute_coocs
from .score_specificity import compute_specificity
from .list_map import do_maplist # TEST
from .ngram_groups import compute_groups
from .list_stop import do_stoplist
from .metric_tfidf import compute_occs, compute_tfidf
from .list_main import do_mainlist
from .ngram_coocs import compute_coocs
from .metric_specificity import compute_specificity
from .list_map import do_maplist # TEST
from .ngram_groups import compute_groups
from gargantext.util.db import session
from gargantext.models import Node
......@@ -50,7 +50,7 @@ def parse_extract(corpus):
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occurrences_local(corpus)
occ_id = compute_occs(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# ------------
......
......@@ -67,6 +67,7 @@ def do_maplist(corpus,
.filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
)
# TODO: move these 2 pools up to mainlist selection
top_monograms = (query
.filter(Ngram.n == 1)
.order_by(desc(ScoreSpec.weight))
......
......@@ -16,7 +16,7 @@ from math import log
# from gargantext.util.lists import WeightedContextIndex
def compute_occurrences_local(corpus, overwrite_id = None):
def compute_occs(corpus, overwrite_id = None):
"""
Calculates sum of occs per ngram within corpus
(used as info in the ngrams table view)
......
......@@ -103,7 +103,6 @@ def compute_coocs(corpus,
)
# 2) INPUT FILTERS (reduce N before O(N²))
# £TODO add possibility to restrict to the mainlist
if mainlist_id:
main_subquery = (
session.query(NodeNgram.ngram_id)
......@@ -150,6 +149,8 @@ def compute_coocs(corpus,
# 3) OUTPUT FILTERS
# ------------------
# threshold
# £TODO adjust COOC_THRESHOLD a posteriori:
# ex: sometimes 2 sometimes 4 depending on sparsity
coocs_query = coocs_query.having(ucooc >= threshold)
# 4) EXECUTE QUERY
......
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.models.ngrams import Ngram, NodeNgram, NodeNgramNgram
def insert_ngrams(ngrams, get='terms-id'):
    """
    Register a batch of ngrams and return the id of each one.

    insert_ngrams :: [(String, Int)] -> dict[terms] = id

    The (terms, n) pairs are staged in a temporary table, matched against
    the ngram table, inserted there when no match was found, and matched
    again so that every staged row ends up with an id.

    `get` is accepted but is not read anywhere in this function.
    """
    # NOTE: the SQL literals are kept flush-left so their text is unchanged.
    db, cursor = get_cursor()

    # 1) staging table for the incoming ngrams
    cursor.execute('''
CREATE TEMPORARY TABLE tmp__ngram (
id INT,
terms VARCHAR(255) NOT NULL,
n INT
);
''')
    bulk_insert('tmp__ngram', ['terms', 'n'], ngrams, cursor=cursor)

    ngram_table = Ngram.__table__.name

    # 2) pick up the ids of the ngrams that already exist (match on terms)
    cursor.execute('''
UPDATE
tmp__ngram
SET
id = ngram.id
FROM
%s AS ngram
WHERE
tmp__ngram.terms = ngram.terms
''' % (ngram_table,))

    # 3) create the ngrams that are still without an id
    cursor.execute('''
INSERT INTO
%s (terms, n)
SELECT
terms, n
FROM
tmp__ngram
WHERE
id IS NULL
''' % (ngram_table,))

    # 4) pick up the ids of the freshly created ngrams (match on terms and n)
    cursor.execute('''
UPDATE
tmp__ngram
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngram.terms
AND
ngram.n = tmp__ngram.n
AND
tmp__ngram.id IS NULL
''' % (ngram_table,))

    # 5) read back the terms -> id mapping
    cursor.execute('SELECT id, terms FROM tmp__ngram')
    ids_by_terms = {terms: ngram_id for ngram_id, terms in cursor.fetchall()}

    db.commit()
    return ids_by_terms
from gargantext_web.db import get_session, cache, get_cursor
from gargantext_web.db import Node, NodeNgram, NodeNodeNgram
from gargantext_web.db import get_or_create_node
#from admin.utils import DebugTime
def compute_occs(corpus, debug=True):
    '''
    compute_occs :: Corpus -> IO ()

    Recompute the occurrence scores of a corpus.

    Gets (or creates) the 'Occurrences' node of `corpus`, deletes the
    NodeNodeNgram rows attached to it, then inserts one row per ngram whose
    score is the sum of the NodeNgram weights over the nodes that have
    `corpus` as parent and the 'Document' node type.

    corpus: the corpus node; its `id` is used as parent filter and as nodey_id.
    debug:  when exactly True, the rows just written are queried and printed.

    Returns nothing.
    '''
    # The original body used `session` and `mysession` without ever binding
    # them (NameError on the first statement); the module only imports
    # get_session, so the session is obtained from it here.
    # NOTE(review): confirm get_session() is the intended session source.
    session = get_session()

    occs_node = get_or_create_node(nodetype='Occurrences', corpus=corpus, mysession=session)

    # drop the scores from any previous run before recomputing them
    (session.query(NodeNodeNgram)
            .filter(NodeNodeNgram.nodex_id==occs_node.id).delete()
    )
    session.commit()

    # the aggregation is done in one raw SQL statement
    # (literal kept flush-left so its text is unchanged;
    #  only table names and integer ids are interpolated)
    db, cursor = get_cursor()
    cursor.execute('''
INSERT INTO
%s (nodex_id, nodey_id, ngram_id, score)
SELECT
%d AS nodex_id,
%d AS nodey_id,
nodengram.ngram_id AS ngram_id,
SUM(nodengram.weight) AS score
FROM
%s AS nodengram
INNER JOIN
%s AS node ON nodengram.node_id = node.id
WHERE
node.parent_id = %d
AND
node.type_id = %d
GROUP BY
nodengram.ngram_id
''' % ( NodeNodeNgram.__table__.name
      , occs_node.id, corpus.id
      , NodeNgram.__table__.name
      , Node.__table__.name
      , corpus.id
      , cache.NodeType['Document'].id
      )
    )
    db.commit()

    if debug is True:
        data = session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==occs_node.id).all()
        print(list(data))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment