[FEATURE] Functions to create whitelist and cooccurrence objects.

48579d8d · Administrator · cfa0fabb · 48579d8d · 48579d8d · 48579d8d
Commit 48579d8d authored Nov 26, 2014 by Administrator
Hide whitespace changes
Inline Side-by-side

Showing with 141 additions and 4 deletions

functions.py functions.py +133 -0

urls.py gargantext_web/urls.py +0 -1

views.py gargantext_web/views.py +8 -3

No files found.
--- a/functions.py
+++ b/functions.py
+from node.models import Language, ResourceType, Resource, \
+        Node, NodeType, Node_Resource, Project, Corpus, \
+        Node_Ngram, NodeNgramNgram
+
+from collections import defaultdict
+from django.db import connection, transaction
+
+def create_blacklist(user, corpus):
+    """Placeholder for blacklist creation; currently does nothing.
+
+    Both arguments are accepted but ignored, and the function returns
+    ``None``.
+    """
+    pass
+
+def create_synonymes(user, corpus):
+    """Placeholder for synonym-list creation; currently does nothing.
+
+    Both arguments are accepted but ignored, and the function returns
+    ``None``.
+    """
+    pass
+
+def create_whitelist(user, corpus):
+    cursor = connection.cursor()
+    
+    try: 
+        whitelist_type = NodeType.objects.get(name='WhiteList')
+        blacklist_type = NodeType.objects.get(name='BlackList')
+    except:
+        whitelist_type = NodeType(name='WhiteList')
+        whitelist_type.save()
+    
+        blacklist_type = NodeType(name='BlackList')
+        blacklist_type.save()
+
+    white_list = Node.objects.create(name='WhiteList Corpus' + str(corpus.id), user=user, parent=corpus, type=whitelist_type)
+    black_list = Node.objects.create(name='BlackList Corpus' + str(corpus.id), user=user, parent=corpus, type=blacklist_type)
+
+    # delete avant pour éviter les doublons
+#    try:
+#        Node_Ngram.objects.filter(node=white_list).all().delete()
+#    except:
+#        print('First time we compute cooc')
+#
+    query_whitelist = """
+        INSERT INTO node_node_ngram (node_id, ngram_id, weight)
+        SELECT
+            %d,
+            ngX.id,
+            COUNT(*) AS occurrences
+        FROM
+            node_node AS n
+        INNER JOIN
+            node_node_ngram AS nngX ON nngX.node_id = n.id
+        INNER JOIN
+            node_ngram AS ngX ON ngX.id = nngX.ngram_id
+        WHERE
+            n.parent_id = %d
+        AND
+            n.type_id = 4
+        AND
+            ngX.n >= 1
+
+        GROUP BY
+            ngX.id
+        Having
+            COUNT(*) >= 1
+        ORDER BY
+            occurrences DESC
+        LIMIT
+            100
+        ;
+    """  % (white_list.id, corpus.id)
+    
+    cursor.execute(query_whitelist)
+
+    return white_list
+
+#def create_cooc(user, corpus, whitelist, blacklist, synonymes):
+def create_cooc(user=None, corpus=None, whitelist=None):
+    cursor = connection.cursor()
+    
+    try:
+        cooc_type  = NodeType.objects.get(name='Cooccurrence')
+    except:
+        cooc_type = NodeType(name='Cooccurrence')
+        cooc_type.save()
+    # pour les tests on supprime les cooc
+    Node.objects.filter(type=cooc_type, parent=corpus).delete()
+
+    cooc = Node.objects.create(user=user,\
+                           parent=corpus,\
+                           type=cooc_type,\
+                           name="Cooccurrences corpus " + str(corpus.pk))
+
+    query_cooc = """
+    INSERT INTO node_nodengramngram (node_id, "ngramx_id", "ngramy_id", score)
+        SELECT
+        %d as node_id,
+        ngX.id,
+        ngY.id,
+        COUNT(*) AS score
+    FROM
+        node_node AS n  -- the nodes who are direct children of the corpus
+        
+    INNER JOIN
+        node_node_ngram AS nngX ON nngX.node_id = n.id  --  list of ngrams contained in the node
+    INNER JOIN
+        node_node_ngram AS whitelistX ON whitelistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the whitelist and in the node
+    INNER JOIN
+        node_ngram AS ngX ON ngX.id = whitelistX.ngram_id -- ngrams which are in both
+        
+    INNER JOIN
+        node_node_ngram AS nngY ON nngY.node_id = n.id
+    INNER JOIN
+        node_node_ngram AS whitelistY ON whitelistY.ngram_id = nngY.ngram_id
+    INNER JOIN
+        node_ngram AS ngY ON ngY.id = whitelistY.ngram_id
+        
+    WHERE
+        n.parent_id = %s
+    AND
+        whitelistX.node_id = %s
+    AND
+        whitelistY.node_id = %s
+    AND
+        nngX.ngram_id < nngY.ngram_id   --  so we only get distinct pairs of ngrams
+        
+    GROUP BY
+        ngX.id,
+        ngX.terms,
+        ngY.id,
+        ngY.terms
+    ORDER BY
+        score DESC
+    LIMIT
+        150
+    """ % (cooc.pk, corpus.id, whitelist.id, whitelist.id)
+
+    cursor.execute(query_cooc)
+    return cooc
+
--- a/gargantext_web/urls.py
+++ b/gargantext_web/urls.py
@@ -26,7 +26,6 @@ urlpatterns = patterns('',
    
    url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
    url(r'^project/(\d+)/corpus/(\d+)/delete/$', views.delete_corpus),
-    url(r'^project/(\d+)/corpus/(\d+)/data.csv$', views.send_csv),
    
    url(r'^graph$', views.explorer_graph),
    url(r'^chart$', views.explorer_chart),

--- a/gargantext_web/views.py
+++ b/gargantext_web/views.py
@@ -7,8 +7,9 @@ from django.template import Context

 #from documents.models import Project, Corpus, Document

-from node.models import Language, ResourceType, Resource
-from node.models import Node, NodeType, Node_Resource, Project, Corpus, NodeNgramNgram
+from node.models import Language, ResourceType, Resource, \
+        Node, NodeType, Node_Resource, Project, Corpus, \
+        Node_Ngram, NodeNgramNgram
 from node.admin import CorpusForm, ProjectForm, ResourceForm

 from django.contrib.auth.models import User
@@ -20,6 +21,7 @@ from dateutil.parser import parse
 from django.db import connection
 from django import forms

+
 from collections import defaultdict

 from parsing.FileParsers import *
@@ -483,7 +485,7 @@ def json_node_link(request):

    matrix = defaultdict(lambda : defaultdict(float))
    labels = dict()
-    cooc = Node.objects.get(id=61314)
+    cooc = Node.objects.get(id=81249)

    for cooccurrence in NodeNgramNgram.objects.filter(node=cooc):
        labels[cooccurrence.ngramx.id] = cooccurrence.ngramx.terms
@@ -544,3 +546,6 @@ def graph_it(request):
        'date': date,
    }))    
    return HttpResponse(html)
+
+
+