Commit 48579d8d authored by Administrator

[FEATURE] Functions to create whitelist and cooccurrence objects.

parent cfa0fabb
from node.models import Language, ResourceType, Resource, \
Node, NodeType, Node_Resource, Project, Corpus, \
Node_Ngram, NodeNgramNgram
from collections import defaultdict
from django.db import connection, transaction
def create_blacklist(user, corpus):
    """Placeholder for black list creation; not implemented yet.

    Both arguments are accepted and ignored. Always returns ``None``.
    """
    return None
def create_synonymes(user, corpus):
    """Placeholder for synonym list creation; not implemented yet.

    Both arguments are accepted and ignored. Always returns ``None``.
    """
    return None
def create_whitelist(user, corpus):
    """Create the white list node of ``corpus`` and fill it with ngrams.

    A ``WhiteList`` node and a ``BlackList`` node are created as children of
    ``corpus`` (their node types are created first when missing).  The white
    list is then populated, through a single ``INSERT ... SELECT``, with at
    most 100 ngrams attached to the children of ``corpus``; each ngram's
    weight is the number of child-node/ngram rows it appears in, and the
    ngrams with the highest counts are kept.

    Args:
        user: owner assigned to the two created nodes.
        corpus: parent node whose children provide the ngrams.

    Returns:
        The newly created white list ``Node``.  The black list node is
        created empty and is not returned.
    """
    cursor = connection.cursor()

    # Look up each type independently: with a single try/except around both
    # lookups, a missing 'BlackList' type caused an already existing
    # 'WhiteList' type to be created a second time.
    whitelist_type, _ = NodeType.objects.get_or_create(name='WhiteList')
    blacklist_type, _ = NodeType.objects.get_or_create(name='BlackList')

    white_list = Node.objects.create(
        name='WhiteList Corpus' + str(corpus.id),
        user=user,
        parent=corpus,
        type=whitelist_type,
    )
    # Created for its database side effect only; nothing is inserted into it
    # here.
    black_list = Node.objects.create(
        name='BlackList Corpus' + str(corpus.id),
        user=user,
        parent=corpus,
        type=blacklist_type,
    )

    # NOTE: a new white list node is created on every call, so no previous
    # Node_Ngram rows have to be deleted beforehand to avoid duplicates.

    # NOTE(review): `n.type_id = 4` is a hard-coded node type id, presumably
    # the document type -- confirm against the node_nodetype table.
    query_whitelist = """
        INSERT INTO node_node_ngram (node_id, ngram_id, weight)
        SELECT
            %s,
            ngX.id,
            COUNT(*) AS occurrences
        FROM
            node_node AS n
        INNER JOIN
            node_node_ngram AS nngX ON nngX.node_id = n.id
        INNER JOIN
            node_ngram AS ngX ON ngX.id = nngX.ngram_id
        WHERE
            n.parent_id = %s
        AND
            n.type_id = 4
        AND
            ngX.n >= 1
        GROUP BY
            ngX.id
        Having
            COUNT(*) >= 1
        ORDER BY
            occurrences DESC
        LIMIT
            100
        ;
    """
    # Let the database driver bind the values instead of interpolating them
    # into the SQL text.
    cursor.execute(query_whitelist, [white_list.id, corpus.id])

    return white_list
#def create_cooc(user, corpus, whitelist, blacklist, synonymes):
def create_cooc(user=None, corpus=None, whitelist=None):
    """Create the cooccurrence node of ``corpus`` restricted to ``whitelist``.

    Any existing ``Cooccurrence`` node that is a child of ``corpus`` is
    deleted first, then a fresh one is created and filled, through a single
    ``INSERT ... SELECT``, with at most 150 ngram pairs.  A pair is counted
    each time both of its ngrams are attached to the same child of ``corpus``
    and both belong to ``whitelist``; the pairs with the highest counts are
    kept and the count is stored as the score.

    Args:
        user: owner assigned to the created node.
        corpus: parent node whose children are scanned for ngram pairs.
        whitelist: node whose ngrams restrict the pairs taken into account.

    Returns:
        The newly created cooccurrence ``Node``.

    Raises:
        ValueError: if ``corpus`` or ``whitelist`` is ``None``.
    """
    # Fail before touching the database: with corpus=None the clean-up below
    # would delete every parentless cooccurrence node and then crash on
    # `corpus.pk`.
    if corpus is None or whitelist is None:
        raise ValueError("create_cooc() requires both a corpus and a whitelist")

    cursor = connection.cursor()

    cooc_type, _ = NodeType.objects.get_or_create(name='Cooccurrence')

    # For testing purposes, previously computed cooccurrences of this corpus
    # are removed.
    Node.objects.filter(type=cooc_type, parent=corpus).delete()

    cooc = Node.objects.create(
        user=user,
        parent=corpus,
        type=cooc_type,
        name="Cooccurrences corpus " + str(corpus.pk),
    )

    query_cooc = """
        INSERT INTO node_nodengramngram (node_id, "ngramx_id", "ngramy_id", score)
        SELECT
            %s as node_id,
            ngX.id,
            ngY.id,
            COUNT(*) AS score
        FROM
            node_node AS n -- the nodes who are direct children of the corpus
        INNER JOIN
            node_node_ngram AS nngX ON nngX.node_id = n.id -- list of ngrams contained in the node
        INNER JOIN
            node_node_ngram AS whitelistX ON whitelistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the whitelist and in the node
        INNER JOIN
            node_ngram AS ngX ON ngX.id = whitelistX.ngram_id -- ngrams which are in both
        INNER JOIN
            node_node_ngram AS nngY ON nngY.node_id = n.id
        INNER JOIN
            node_node_ngram AS whitelistY ON whitelistY.ngram_id = nngY.ngram_id
        INNER JOIN
            node_ngram AS ngY ON ngY.id = whitelistY.ngram_id
        WHERE
            n.parent_id = %s
        AND
            whitelistX.node_id = %s
        AND
            whitelistY.node_id = %s
        AND
            nngX.ngram_id < nngY.ngram_id -- so we only get distinct pairs of ngrams
        GROUP BY
            ngX.id,
            ngX.terms,
            ngY.id,
            ngY.terms
        ORDER BY
            score DESC
        LIMIT
            150
    """
    # Let the database driver bind the values instead of interpolating them
    # into the SQL text.
    cursor.execute(query_cooc, [cooc.pk, corpus.id, whitelist.id, whitelist.id])

    return cooc
......@@ -26,7 +26,6 @@ urlpatterns = patterns('',
url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
url(r'^project/(\d+)/corpus/(\d+)/delete/$', views.delete_corpus),
url(r'^project/(\d+)/corpus/(\d+)/data.csv$', views.send_csv),
url(r'^graph$', views.explorer_graph),
url(r'^chart$', views.explorer_chart),
......
......@@ -7,8 +7,9 @@ from django.template import Context
#from documents.models import Project, Corpus, Document
from node.models import Language, ResourceType, Resource
from node.models import Node, NodeType, Node_Resource, Project, Corpus, NodeNgramNgram
from node.models import Language, ResourceType, Resource, \
Node, NodeType, Node_Resource, Project, Corpus, \
Node_Ngram, NodeNgramNgram
from node.admin import CorpusForm, ProjectForm, ResourceForm
from django.contrib.auth.models import User
......@@ -20,6 +21,7 @@ from dateutil.parser import parse
from django.db import connection
from django import forms
from collections import defaultdict
from parsing.FileParsers import *
......@@ -483,7 +485,7 @@ def json_node_link(request):
matrix = defaultdict(lambda : defaultdict(float))
labels = dict()
cooc = Node.objects.get(id=61314)
cooc = Node.objects.get(id=81249)
for cooccurrence in NodeNgramNgram.objects.filter(node=cooc):
labels[cooccurrence.ngramx.id] = cooccurrence.ngramx.terms
......@@ -544,3 +546,6 @@ def graph_it(request):
'date': date,
}))
return HttpResponse(html)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment