Commit 025087fd authored by delanoe

[FEAT] TFIDF: document/corpus and corpus/language.

parent 10c68905
......@@ -9,7 +9,8 @@ def debug_task(request):
print('Request: {0!r}'.format(request))
from gargantext_web.db import session, Node
from ngram.tfidf import compute_tfidf
from ngram.tfidf import compute_tfidf,compute_tfidf_global
@shared_task
def apply_sum(x, y):
......@@ -43,6 +44,7 @@ def apply_workflow(corpus_id):
update_processing(corpus, 3)
compute_tfidf(corpus)
compute_tfidf_global(corpus, lang='en')
ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
update_processing(corpus, 0)
......
from collections import defaultdict
from datetime import datetime
from random import random
from hashlib import md5
from time import time
from math import log
from gargantext_web.db import *
from gargantext_web.db import get_or_create_node
from admin.utils import DebugTime
......@@ -14,6 +9,8 @@ def compute_tfidf(corpus):
dbg = DebugTime('Corpus #%d - tfidf' % corpus.id)
# compute terms frequency sum
dbg.show('calculate terms frequencies sums')
tfidf_node = get_or_create_node(nodetype='Tfidf', user_id=corpus.user_id, parent_id=corpus.id)
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__st (
......@@ -99,7 +96,7 @@ def compute_tfidf(corpus):
tmp__idf AS idf
INNER JOIN
tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
''' % (NodeNodeNgram.__table__.name, corpus.id, ))
''' % (NodeNodeNgram.__table__.name, tfidf_node.id, ))
# # show off
# cursor.execute('''
# SELECT
......@@ -121,3 +118,116 @@ def compute_tfidf(corpus):
# print(row)
# the end!
db.commit()
#http://stackoverflow.com/questions/8674718/best-way-to-select-random-rows-postgresql
def compute_tfidf_global(corpus, lang='fr'):
    """Score the corpus' ngrams against all documents of one language.

    Steps, all executed as raw SQL on the connection from ``get_cursor()``:

    1. ``tmp__tf``: for every ngram, the number of node/ngram rows attached
       to direct children of ``corpus``.
    2. ``tmp__idf``: for every ngram present in ``tmp__tf``, ``-ln(n)`` where
       ``n`` is the number of node/ngram rows on 'Document' nodes of the
       requested language. For ``lang == 'en'`` the language is read from
       the document itself; for ``lang == 'fr'`` it is read from the
       document's parent node.
    3. ``D``: the number of 'Document' nodes whose own ``language_id``
       matches ``lang``; ``ln(D)`` is added to every idf, giving
       ``ln(D / n)``.
    4. One row per ngram is inserted into the NodeNodeNgram table with
       ``nodex_id`` = the 'Tfidf (global)' node, ``nodey_id`` = the corpus
       and ``score = frequency * idf``.

    Parameters:
        corpus: object exposing ``id`` and ``user_id``.
        lang: language key looked up in ``cache.Language``. Only 'en' and
            'fr' have an idf query; any other key known to the cache
            leaves ``tmp__idf`` empty, so no scores are written.

    Returns:
        None. Scores are only written when ``D > 0``.
    """
    dbg = DebugTime('Corpus #%d - tfidf global' % corpus.id)
    dbg.show('calculate terms frequencies sums')
    tfidf_node = get_or_create_node(nodetype='Tfidf (global)', user_id=corpus.user_id, parent_id=corpus.id)

    node_table = Node.__table__.name
    node_ngram_table = Node_Ngram.__table__.name

    # compute terms frequency sum
    db, cursor = get_cursor()
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__tf (
            ngram_id INT NOT NULL,
            frequency DOUBLE PRECISION NOT NULL
        );
    ''')
    cursor.execute('''
        INSERT INTO
            tmp__tf (ngram_id, frequency)
        SELECT
            node_ngram.ngram_id AS ngram_id,
            (count(*)) AS frequency
        FROM %s AS node_ngram
        INNER JOIN
            %s AS node ON node.id = node_ngram.node_id
        WHERE
            node.parent_id = %d
        GROUP BY node_ngram.ngram_id;
    ''' % (node_ngram_table, node_table, corpus.id, ))

    dbg.show('compute idf')
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__idf (
            ngram_id INT NOT NULL,
            idf DOUBLE PRECISION NOT NULL
        )
    ''')

    # Resolve the ids once; they are reused by the idf query and by the
    # document count below.
    language_id = cache.Language[lang].id
    document_type_id = cache.NodeType['Document'].id

    if lang == 'en':
        # Language is taken from the document node itself.
        cursor.execute('''
            INSERT INTO
                tmp__idf(ngram_id, idf)
            SELECT
                node_ngram.ngram_id,
                -ln(COUNT(*))
            FROM
                %s AS node_ngram
            INNER JOIN
                tmp__tf ON tmp__tf.ngram_id = node_ngram.ngram_id
            INNER JOIN
                %s as doc ON doc.id = node_ngram.node_id
            WHERE
                doc.language_id = %d AND doc.type_id = %d
            GROUP BY
                node_ngram.ngram_id
            ;
        ''' % (node_ngram_table, node_table, language_id, document_type_id))
    elif lang == 'fr':
        # Language is taken from the document's parent node.
        cursor.execute('''
            INSERT INTO
                tmp__idf(ngram_id, idf)
            SELECT
                node_ngram.ngram_id,
                -ln(COUNT(*))
            FROM
                %s AS node_ngram
            INNER JOIN
                tmp__tf ON tmp__tf.ngram_id = node_ngram.ngram_id
            INNER JOIN
                %s as doc ON doc.id = node_ngram.node_id
            INNER JOIN
                %s as corpus ON corpus.id = doc.parent_id
            WHERE
                corpus.language_id = %d AND doc.type_id = %d
            GROUP BY
                node_ngram.ngram_id
            ;
        ''' % (node_ngram_table, node_table, node_table, language_id, document_type_id))

    # D: number of documents of that language (by the document's own language).
    cursor.execute('''
        SELECT COUNT(*) FROM %s AS doc
        WHERE doc.language_id = %d
        AND doc.type_id = %d
    ''' % (node_table, language_id, document_type_id))
    D = cursor.fetchone()[0]

    if D > 0:
        lnD = log(D)
        # '%.17g' round-trips a double exactly; '%f' would cut ln(D) to six
        # decimals before it is added to every idf value.
        cursor.execute('UPDATE tmp__idf SET idf = idf + %.17g' % (lnD, ))

        dbg.show('insert tfidf for %d documents' % D)
        cursor.execute('''
            INSERT INTO
                %s (nodex_id, nodey_id, ngram_id, score)
            SELECT
                %d AS nodex_id,
                %d AS nodey_id,
                tf.ngram_id AS ngram_id,
                (tf.frequency * idf.idf) AS score
            FROM
                tmp__idf AS idf
            INNER JOIN
                tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
        ''' % (NodeNodeNgram.__table__.name, tfidf_node.id, corpus.id, ))

    # Commit on every path so the transaction opened above is never left
    # pending, including when no document matched (D == 0).
    db.commit()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment