1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
HyperdataKey
from gargantext.util.db import session, aliased, bulk_insert, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from gargantext.util.http import JsonHttpResponse
from sqlalchemy import desc, asc, or_, and_, func
import datetime
import ast
import networkx as nx
def doc_freq(corpus_id, node_ids):
'''
doc_freq :: Corpus_id -> [(Ngram_id, Int)]
Given a corpus, compute number of documents that have the ngram in it.
'''
return ( session.query(NodeNgram.ngram_id, func.count(NodeNgram.node_id))
.join(Node, NodeNgram.node_id == Node.id)
.filter( Node.parent_id == corpus_id
, Node.typename== 'DOCUMENT')
.filter( NodeNgram.weight > 0
, NodeNgram.ngram_id.in_(node_ids) )
.group_by(NodeNgram.ngram_id)
.all()
)
def doc_ngram_representativity(corpus_id, node_ids):
'''
doc_ngram_representativity :: Corpus_ID -> Dict Ngram_id Float
Given a corpus, compute part of of documents that have the ngram it it.
'''
nodes_count = ( session.query(Node)
.filter( Node.parent_id == corpus_id
, Node.typename == 'DOCUMENT'
)
.count()
)
result = dict()
for ngram_id, somme in doc_freq(corpus_id, node_ids):
result[ngram_id] = somme / nodes_count
return result
def compare_corpora(Corpus_id_A, Corpus_id_B, node_ids):
'''
compare_corpora :: Corpus_id -> Corpus_id -> Dict Ngram_id Float
Given two corpus :
- if corpora are the same, it return :
(dict of document frequency per ngram as key)
- if corpora are different, it returns :
doc_ngram_representativit(Corpus_id_A) / doc_ngram_representativity(Corpus_id_B)
(as dict per ngram as key)
'''
result = dict()
if int(Corpus_id_A) == int(Corpus_id_B):
for ngram_id, somme in doc_freq(Corpus_id_A, node_ids):
result[ngram_id] = somme
else:
data_A = doc_ngram_representativity(Corpus_id_A, node_ids)
data_B = doc_ngram_representativity(Corpus_id_B, node_ids)
queue = list()
for k in data_A.keys():
if k not in data_B.keys():
queue.append(k)
else:
result[k] = data_B[k] / data_A[k]
maximum = max([ result[k] for k in result.keys()])
minimum = min([ result[k] for k in result.keys()])
for k in queue:
result[k] = minimum
return result
def intersection(request , corpuses_ids, measure='cooc'):
'''
intersection :: (str(Int) + "a" str(Int)) -> Dict(Ngram.id :: Int, Score :: Int)
intersection = returns as Json Http Response the intersection of two graphs
'''
if request.method == 'POST' and "nodeids" in request.POST and len(request.POST["nodeids"])>0 :
node_ids = [int(i) for i in (ast.literal_eval( request.POST["nodeids"] )) ]
# Here are the visible nodes of the initial semantic map.
corpuses_ids = corpuses_ids.split('a')
corpuses_ids = [int(i) for i in corpuses_ids]
# corpus[1] will be the corpus to compare
return JsonHttpResponse(compare_corpora(corpuses_ids[0], corpuses_ids[1], node_ids))