[FEAT] TFIDF: document/corpus and corpus/language.

025087fd · delanoe · 10c68905 · 025087fd · 025087fd
Commit 025087fd authored Sep 16, 2015 by delanoe
Hide whitespace changes
Inline Side-by-side

Showing with 120 additions and 8 deletions

celery.py gargantext_web/celery.py +3 -1

tfidf.py ngram/tfidf.py +117 -7

No files found.
--- a/gargantext_web/celery.py
+++ b/gargantext_web/celery.py
@@ -9,7 +9,8 @@ def debug_task(request):
    print('Request: {0!r}'.format(request))
 from gargantext_web.db import session, Node
-from ngram.tfidf import compute_tfidf
+from ngram.tfidf import compute_tfidf,compute_tfidf_global
 @shared_task
 def apply_sum(x, y):
@@ -43,6 +44,7 @@ def apply_workflow(corpus_id):
    update_processing(corpus, 3)
    compute_tfidf(corpus)
+    compute_tfidf_global(corpus, lang='en')
    ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
    update_processing(corpus, 0)

--- a/ngram/tfidf.py
+++ b/ngram/tfidf.py
-from collections import defaultdict
-from datetime import datetime
-from random import random
-from hashlib import md5
-from time import time
 from math import log
 from gargantext_web.db import *
+from gargantext_web.db import get_or_create_node
 from admin.utils import DebugTime
@@ -14,6 +9,8 @@ def compute_tfidf(corpus):
    dbg = DebugTime('Corpus #%d - tfidf' % corpus.id)
    # compute terms frequency sum
    dbg.show('calculate terms frequencies sums')
+    tfidf_node = get_or_create_node(nodetype='Tfidf', user_id=corpus.user_id, parent_id=corpus.id)
    db, cursor = get_cursor()
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__st (
@@ -99,7 +96,7 @@ def compute_tfidf(corpus):
                tmp__idf AS idf
            INNER JOIN
                tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
-        ''' % (NodeNodeNgram.__table__.name, corpus.id, ))
+        ''' % (NodeNodeNgram.__table__.name, tfidf_node.id, ))
        # # show off
        # cursor.execute('''
        #     SELECT
@@ -121,3 +118,116 @@ def compute_tfidf(corpus):
        #     print(row)
        # the end!
        db.commit()
+#http://stackoverflow.com/questions/8674718/best-way-to-select-random-rows-postgresql
+def compute_tfidf_global(corpus, lang='fr'):
+    """Score the ngrams of ``corpus`` against all documents of one language.
+
+    For every ngram linked to a child node of ``corpus``:
+
+    * ``tf`` is the number of ``Node_Ngram`` rows linking the ngram to a
+      child of the corpus;
+    * ``idf`` is ``ln(D) - ln(df)``, where ``df`` is the number of
+      ``Node_Ngram`` rows linking the ngram to a reference document and
+      ``D`` is the number of reference documents.
+
+    Reference documents are the nodes of type 'Document' selected by
+    language: for ``lang='fr'`` the language is read on the parent node of
+    the document, otherwise on the document itself.
+
+    ``tf * idf`` is inserted into ``NodeNodeNgram`` with ``nodex_id`` set to
+    the 'Tfidf (global)' node returned by ``get_or_create_node`` and
+    ``nodey_id`` set to ``corpus.id``. Nothing is inserted or committed when
+    there is no reference document.
+
+    Args:
+        corpus: object exposing ``id`` and ``user_id``; its child nodes are
+            the ones whose ngrams get scored.
+        lang: key looked up in ``cache.Language``. Only 'en' and 'fr'
+            produce idf rows; for any other value no score is stored.
+
+    Returns:
+        None.
+    """
+    dbg = DebugTime('Corpus #%d - tfidf global' % corpus.id)
+    dbg.show('calculate terms frequencies sums')
+    tfidf_node = get_or_create_node(nodetype='Tfidf (global)', user_id=corpus.user_id, parent_id=corpus.id)
+
+    node_ngram_table = Node_Ngram.__table__.name
+    node_table = Node.__table__.name
+
+    # term frequencies: one row per ngram found in the children of the corpus
+    db, cursor = get_cursor()
+    cursor.execute('''
+        CREATE TEMPORARY TABLE tmp__tf (
+            ngram_id INT NOT NULL,
+            frequency DOUBLE PRECISION NOT NULL
+        );
+    ''')
+    cursor.execute('''
+        INSERT INTO
+            tmp__tf (ngram_id, frequency)
+        SELECT
+            node_ngram.ngram_id AS ngram_id,
+            (count(*)) AS frequency
+        FROM
+            %s AS node_ngram
+        INNER JOIN
+            %s AS node ON node.id = node_ngram.node_id
+        WHERE
+            node.parent_id = %d
+        GROUP BY
+            node_ngram.ngram_id;
+    ''' % (node_ngram_table, node_table, corpus.id, ))
+
+    dbg.show('compute idf')
+    cursor.execute('''
+        CREATE TEMPORARY TABLE tmp__idf (
+            ngram_id INT NOT NULL,
+            idf DOUBLE PRECISION NOT NULL
+        )
+    ''')
+
+    language_id = cache.Language[lang].id
+    document_type_id = cache.NodeType['Document'].id
+
+    # Describe the reference collection once, so that the document
+    # frequencies and the total D below are counted over the same documents.
+    if lang == 'fr':
+        # the language is read on the parent (corpus) node of each document
+        reference_join = 'INNER JOIN %s AS corpus ON corpus.id = doc.parent_id' % node_table
+        language_filter = 'corpus.language_id = %d' % language_id
+    else:
+        reference_join = ''
+        language_filter = 'doc.language_id = %d' % language_id
+    reference_filter = '%s AND doc.type_id = %d' % (language_filter, document_type_id)
+
+    if lang in ('en', 'fr'):
+        # -ln(df) for now; ln(D) is added once D is known.
+        # Only ngrams present in tmp__tf are considered.
+        cursor.execute('''
+            INSERT INTO
+                tmp__idf(ngram_id, idf)
+            SELECT
+                node_ngram.ngram_id,
+                -ln(COUNT(*))
+            FROM
+                %s AS node_ngram
+            INNER JOIN
+                tmp__tf ON tmp__tf.ngram_id = node_ngram.ngram_id
+            INNER JOIN
+                %s AS doc ON doc.id = node_ngram.node_id
+            %s
+            WHERE
+                %s
+            GROUP BY
+                node_ngram.ngram_id
+            ;
+        ''' % (node_ngram_table, node_table, reference_join, reference_filter))
+
+    # D has to be counted over the very same reference collection as df:
+    # with a different filter D could be 0 (no score stored at all) or
+    # smaller than df (negative idf).
+    cursor.execute('''
+        SELECT COUNT(*)
+        FROM %s AS doc
+        %s
+        WHERE %s
+    ''' % (node_table, reference_join, reference_filter))
+    document_count = cursor.fetchone()[0]
+
+    if document_count > 0:
+        # idf = ln(D) - ln(df)
+        cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (log(document_count), ))
+        dbg.show('insert tfidf for %d documents' % document_count)
+        cursor.execute('''
+            INSERT INTO
+                %s (nodex_id, nodey_id, ngram_id, score)
+            SELECT
+                %d AS nodex_id,
+                %d AS nodey_id,
+                tf.ngram_id AS ngram_id,
+                (tf.frequency * idf.idf) AS score
+            FROM
+                tmp__idf AS idf
+            INNER JOIN
+                tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
+        ''' % (NodeNodeNgram.__table__.name, tfidf_node.id, corpus.id, ))
+        db.commit()