Commit 144b774e authored by Romain Loth's avatar Romain Loth

tidying up

parent 58aa990d
...@@ -2,13 +2,13 @@ from .parsing import parse ...@@ -2,13 +2,13 @@ from .parsing import parse
from .ngrams_extraction import extract_ngrams from .ngrams_extraction import extract_ngrams
# in usual run order # in usual run order
from .list_stop import do_stoplist from .list_stop import do_stoplist
from .ngram_scores import compute_occurrences_local, compute_tfidf from .metric_tfidf import compute_occs, compute_tfidf
from .list_main import do_mainlist from .list_main import do_mainlist
from .ngram_coocs_tempo import compute_coocs from .ngram_coocs import compute_coocs
from .score_specificity import compute_specificity from .metric_specificity import compute_specificity
from .list_map import do_maplist # TEST from .list_map import do_maplist # TEST
from .ngram_groups import compute_groups from .ngram_groups import compute_groups
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.models import Node from gargantext.models import Node
...@@ -50,7 +50,7 @@ def parse_extract(corpus): ...@@ -50,7 +50,7 @@ def parse_extract(corpus):
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id)) print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf # -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occurrences_local(corpus) occ_id = compute_occs(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id)) print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# ------------ # ------------
......
...@@ -67,6 +67,7 @@ def do_maplist(corpus, ...@@ -67,6 +67,7 @@ def do_maplist(corpus,
.filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery)) .filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
) )
# TODO: move these 2 pools up to mainlist selection
top_monograms = (query top_monograms = (query
.filter(Ngram.n == 1) .filter(Ngram.n == 1)
.order_by(desc(ScoreSpec.weight)) .order_by(desc(ScoreSpec.weight))
......
...@@ -16,7 +16,7 @@ from math import log ...@@ -16,7 +16,7 @@ from math import log
# from gargantext.util.lists import WeightedContextIndex # from gargantext.util.lists import WeightedContextIndex
def compute_occurrences_local(corpus, overwrite_id = None): def compute_occs(corpus, overwrite_id = None):
""" """
Calculates sum of occs per ngram within corpus Calculates sum of occs per ngram within corpus
(used as info in the ngrams table view) (used as info in the ngrams table view)
......
...@@ -103,7 +103,6 @@ def compute_coocs(corpus, ...@@ -103,7 +103,6 @@ def compute_coocs(corpus,
) )
# 2) INPUT FILTERS (reduce N before O(N²)) # 2) INPUT FILTERS (reduce N before O(N²))
# £TODO add possibility to restrict to the mainlist
if mainlist_id: if mainlist_id:
main_subquery = ( main_subquery = (
session.query(NodeNgram.ngram_id) session.query(NodeNgram.ngram_id)
...@@ -150,6 +149,8 @@ def compute_coocs(corpus, ...@@ -150,6 +149,8 @@ def compute_coocs(corpus,
# 3) OUTPUT FILTERS # 3) OUTPUT FILTERS
# ------------------ # ------------------
# threshold # threshold
# £TODO adjust COOC_THRESHOLD a posteriori:
# ex: sometimes 2 sometimes 4 depending on sparsity
coocs_query = coocs_query.having(ucooc >= threshold) coocs_query = coocs_query.having(ucooc >= threshold)
# 4) EXECUTE QUERY # 4) EXECUTE QUERY
......
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.models.ngrams import Ngram, NodeNgram, NodeNgramNgram
def insert_ngrams(ngrams, get='terms-id'):
    '''
    Bulk-insert a batch of ngrams and return a terms -> id mapping.

    insert_ngrams :: [(String, Int)] -> dict[terms] = id

    Strategy: stage the (terms, n) pairs in a temporary table, resolve
    ids for already-known terms, insert the genuinely new ones, then
    resolve the freshly assigned ids before reading the mapping back.
    '''
    connection, cur = get_cursor()

    # 1) stage the incoming (terms, n) pairs in a temp table
    cur.execute('''
        CREATE TEMPORARY TABLE tmp__ngram (
            id INT,
            terms VARCHAR(255) NOT NULL,
            n INT
        );
    ''')
    bulk_insert('tmp__ngram', ['terms', 'n'], ngrams, cursor=cur)

    # 2) pick up ids of terms already present in the ngram table
    cur.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            tmp__ngram.terms = ngram.terms
    ''' % (Ngram.__table__.name,))

    # 3) insert only the rows that did not match (id still NULL)
    cur.execute('''
        INSERT INTO
            %s (terms, n)
        SELECT
            terms, n
        FROM
            tmp__ngram
        WHERE
            id IS NULL
    ''' % (Ngram.__table__.name,))

    # 4) resolve the ids assigned by step 3 for the new rows
    cur.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            ngram.terms = tmp__ngram.terms
        AND
            ngram.n = tmp__ngram.n
        AND
            tmp__ngram.id IS NULL
    ''' % (Ngram.__table__.name,))

    # 5) read the full mapping back, commit, and return it
    cur.execute('SELECT id, terms FROM tmp__ngram')
    terms_to_id = {terms: ngram_id for ngram_id, terms in cur.fetchall()}
    connection.commit()
    return terms_to_id
from gargantext_web.db import get_session, cache, get_cursor
from gargantext_web.db import Node, NodeNgram, NodeNodeNgram
from gargantext_web.db import get_or_create_node
#from admin.utils import DebugTime
def compute_occs(corpus, debug=True):
    '''
    compute_occs :: Corpus -> IO ()

    Sums NodeNgram weights per ngram across all 'Document' children of
    *corpus* and stores one NodeNodeNgram row per ngram under the
    corpus's 'Occurrences' node.  Any rows previously attached to that
    node are deleted first, so the scores are fully recomputed.

    Parameters:
        corpus -- corpus node; its id is used both as nodey_id of the
                  written rows and as the parent_id filter on documents
        debug  -- when True, re-reads and prints the rows just written

    NOTE(review): `mysession` and `session` are used but neither is a
    parameter nor imported in this chunk — presumably module-level
    globals (e.g. from get_session()); confirm before calling this in
    isolation.  Returns None.
    '''
    #dbg = DebugTime('Corpus #%d - OCCURRENCES' % corpus.id)
    #dbg.show('Calculate occurrences')
    # target node that will carry the occurrence scores
    occs_node = get_or_create_node(nodetype='Occurrences', corpus=corpus, mysession=mysession)
    #print(occs_node.id)
    # wipe any previous scores for this node before recomputing
    (session.query(NodeNodeNgram)
        .filter(NodeNodeNgram.nodex_id==occs_node.id).delete()
    )
    session.commit()
    db, cursor = get_cursor()
    # aggregate in one SQL pass: one row per ngram, score = SUM of the
    # NodeNgram weights over the corpus's Document-typed children
    cursor.execute('''
        INSERT INTO
            %s (nodex_id, nodey_id, ngram_id, score)
        SELECT
            %d AS nodex_id,
            %d AS nodey_id,
            nodengram.ngram_id AS ngram_id,
            SUM(nodengram.weight) AS score
        FROM
            %s AS nodengram
        INNER JOIN
            %s AS node ON nodengram.node_id = node.id
        WHERE
            node.parent_id = %d
        AND
            node.type_id = %d
        GROUP BY
            nodengram.ngram_id
    ''' % ( NodeNodeNgram.__table__.name
          , occs_node.id, corpus.id
          , NodeNgram.__table__.name
          , Node.__table__.name
          , corpus.id
          , cache.NodeType['Document'].id
          )
    )
    db.commit()
    if debug is True:
        # sanity check: read back and print what was just written
        data = session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==occs_node.id).all()
        print([n for n in data])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment