from collections import defaultdict

from django.db import connection, transaction

from node.models import (
    Language, ResourceType, Resource,
    Node, NodeType, Node_Resource, Project, Corpus,
    Node_Ngram, NodeNgramNgram,
)


def _get_or_create_node_type(name):
    """Return the NodeType called *name*, creating it when none exists.

    The lookup tolerates several rows sharing the same name (the oldest one,
    by primary key, is returned) so that a database which already contains
    duplicated types keeps working instead of raising.
    """
    matches = list(NodeType.objects.filter(name=name).order_by('pk')[:1])
    if matches:
        return matches[0]
    return NodeType.objects.create(name=name)


def _execute_sql(query, params):
    """Run *query* with driver-bound *params* and always release the cursor."""
    cursor = connection.cursor()
    try:
        cursor.execute(query, params)
    finally:
        cursor.close()


def create_blacklist(user, corpus):
    """Placeholder: not implemented yet, does nothing and returns None."""
    pass


def create_synonymes(user, corpus):
    """Placeholder: not implemented yet, does nothing and returns None."""
    pass


def create_whitelist(user, corpus):
    """Create the white list node of *corpus* and fill it with ngrams.

    Two child nodes of *corpus* are created, owned by *user*: a 'WhiteList'
    node and a 'BlackList' node (the latter is created but left empty here).
    The white list is then filled, through a single INSERT ... SELECT, with
    at most 100 ngrams attached to the children of the corpus, weighted by
    the number of matching rows and keeping the highest counts.

    Args:
        user: owner given to the created nodes.
        corpus: corpus node whose children are scanned; also the parent of
            the created nodes.

    Returns:
        The newly created white list Node.
    """
    whitelist_type = _get_or_create_node_type('WhiteList')
    blacklist_type = _get_or_create_node_type('BlackList')

    white_list = Node.objects.create(
        name='WhiteList Corpus' + str(corpus.id),
        user=user,
        parent=corpus,
        type=whitelist_type,
    )
    # Created for its side effect only: nothing is inserted into it here.
    Node.objects.create(
        name='BlackList Corpus' + str(corpus.id),
        user=user,
        parent=corpus,
        type=blacklist_type,
    )

    # The white list node was just created, so it holds no ngram yet and
    # there is nothing to delete beforehand to avoid duplicates.
    #
    # NOTE(review): `n.type_id = 4` is a hard-coded node type id, presumably
    # the document type -- confirm against the node_nodetype table.
    # NOTE(review): `ngX.n` is presumably the ngram length -- confirm.
    query_whitelist = """
        INSERT INTO node_node_ngram (node_id, ngram_id, weight)
        SELECT
            %s, ngX.id, COUNT(*) AS occurrences
        FROM
            node_node AS n
        INNER JOIN
            node_node_ngram AS nngX ON nngX.node_id = n.id
        INNER JOIN
            node_ngram AS ngX ON ngX.id = nngX.ngram_id
        WHERE
            n.parent_id = %s
        AND
            n.type_id = 4
        AND
            ngX.n >= 1
        GROUP BY
            ngX.id
        HAVING
            COUNT(*) >= 1
        ORDER BY
            occurrences DESC
        LIMIT
            100
        ;
    """
    # Values are bound by the database driver rather than formatted into
    # the SQL text.
    _execute_sql(query_whitelist, [white_list.id, corpus.id])

    return white_list


# def create_cooc(user, corpus, whitelist, blacklist, synonymes):
def create_cooc(user=None, corpus=None, whitelist=None):
    """Compute the cooccurrence node of *corpus* restricted to *whitelist*.

    Any existing 'Cooccurrence' node that is a child of *corpus* is deleted,
    a new one is created, and it is filled with at most 150 pairs of distinct
    ngrams that both belong to *whitelist* and appear together in a same
    child node of the corpus. The score of a pair is the number of matching
    rows; the highest scores are kept.

    Args:
        user: owner given to the created cooccurrence node.
        corpus: corpus node whose children are scanned (required).
        whitelist: node holding the ngrams allowed in the pairs (required).

    Returns:
        The newly created cooccurrence Node.

    Raises:
        ValueError: if *corpus* or *whitelist* is None. The check runs before
            any row is deleted or created.
    """
    if corpus is None or whitelist is None:
        raise ValueError('create_cooc() requires both a corpus and a whitelist')

    cooc_type = _get_or_create_node_type('Cooccurrence')

    # For testing purposes, previously computed cooccurrences are removed.
    Node.objects.filter(type=cooc_type, parent=corpus).delete()

    cooc = Node.objects.create(
        user=user,
        parent=corpus,
        type=cooc_type,
        name="Cooccurrences corpus " + str(corpus.pk),
    )

    query_cooc = """
        INSERT INTO node_nodengramngram (node_id, "ngramx_id", "ngramy_id", score)
        SELECT
            %s as node_id, ngX.id, ngY.id, COUNT(*) AS score
        FROM
            node_node AS n  -- the nodes who are direct children of the corpus
        INNER JOIN
            node_node_ngram AS nngX ON nngX.node_id = n.id  -- list of ngrams contained in the node
        INNER JOIN
            node_node_ngram AS whitelistX ON whitelistX.ngram_id = nngX.ngram_id  -- list of ngrams contained in the whitelist and in the node
        INNER JOIN
            node_ngram AS ngX ON ngX.id = whitelistX.ngram_id  -- ngrams which are in both
        INNER JOIN
            node_node_ngram AS nngY ON nngY.node_id = n.id
        INNER JOIN
            node_node_ngram AS whitelistY ON whitelistY.ngram_id = nngY.ngram_id
        INNER JOIN
            node_ngram AS ngY ON ngY.id = whitelistY.ngram_id
        WHERE
            n.parent_id = %s
        AND
            whitelistX.node_id = %s
        AND
            whitelistY.node_id = %s
        AND
            nngX.ngram_id < nngY.ngram_id  -- so we only get distinct pairs of ngrams
        GROUP BY
            ngX.id,
            ngX.terms,
            ngY.id,
            ngY.terms
        ORDER BY
            score DESC
        LIMIT
            150
    """
    # Values are bound by the database driver rather than formatted into
    # the SQL text.
    _execute_sql(query_cooc, [cooc.pk, corpus.id, whitelist.id, whitelist.id])

    return cooc