#from admin.env import *
from math import log

from gargantext_web.db import *
from gargantext_web.db import get_or_create_node
from admin.utils import DebugTime
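
# TF-IDF computation for Gargantext corpora: both functions below accumulate
# counts in temporary SQL tables and store the resulting scores as
# NodeNodeNgram rows attached to the corpus' 'Tfidf' / 'Tfidf (global)' node.

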
def compute_tfidf(corpus):
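    """Compute document-level TF-IDF scores for each (document, ngram) pair of `corpus`.

    For a document d and ngram t:
        tf(d, t) = weight of t in d / total ngram weight of d
        idf(t)   = ln(D) - ln(number of documents containing t), D = document count
    The tf * idf scores are written to NodeNodeNgram with nodex_id = the corpus'
    'Tfidf' node and nodey_id = the document's node id.
    """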
    # compute terms frequency sum
    dbg = DebugTime('Corpus #%d - TFIDF' % corpus.id)
    dbg.show('calculate terms frequencies sums')
    tfidf_node = get_or_create_node(nodetype='Tfidf', corpus=corpus)
    db, cursor = get_cursor()
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__st (
            node_id INT NOT NULL,
            frequency DOUBLE PRECISION NOT NULL
        )
    ''')
    cursor.execute('''
        INSERT INTO
            tmp__st (node_id, frequency)
        SELECT
            node_ngram.node_id,
            SUM(node_ngram.weight) AS frequency
        FROM
            %s AS node
        INNER JOIN
            %s AS node_ngram ON node_ngram.node_id = node.id
        WHERE
            node.parent_id = %d
        GROUP BY
            node_ngram.node_id
    ''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
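    # tmp__st now maps each document to the total weight of its ngrams, i.e. the
    # denominator used to normalize term frequencies below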
    # compute normalized terms frequencies
    dbg.show('normalize terms frequencies')
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__tf (
            node_id INT NOT NULL,
            ngram_id INT NOT NULL,
            frequency DOUBLE PRECISION NOT NULL
        )
    ''')
    cursor.execute('''
        INSERT INTO
            tmp__tf (node_id, ngram_id, frequency)
        SELECT
            node_ngram.node_id,
            node_ngram.ngram_id,
            (node_ngram.weight / node.frequency) AS frequency
        FROM
            %s AS node_ngram
        INNER JOIN
            tmp__st AS node ON node.node_id = node_ngram.node_id
    ''' % (Node_Ngram.__table__.name, ))
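    # tmp__tf now holds tf(document, ngram) = ngram weight / total document weight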
    # show off
    dbg.show('compute idf')
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__idf (
            ngram_id INT NOT NULL,
            idf DOUBLE PRECISION NOT NULL
        )
    ''')
    cursor.execute('''
        INSERT INTO
            tmp__idf (ngram_id, idf)
        SELECT
            node_ngram.ngram_id,
            -ln(COUNT(*))
        FROM
            %s AS node
        INNER JOIN
            %s AS node_ngram ON node_ngram.node_id = node.id
        WHERE
            node.parent_id = %d
        GROUP BY
            node_ngram.ngram_id
    ''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
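    # tmp__idf currently holds -ln(df); adding ln(D) below yields idf = ln(D / df)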
    cursor.execute('SELECT COUNT(*) FROM tmp__st')
    D = cursor.fetchone()[0]
    if D > 0:
        lnD = log(D)
        cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
    # show off
    dbg.show('insert tfidf for %d documents' % D)
    cursor.execute('''
        INSERT INTO
            %s (nodex_id, nodey_id, ngram_id, score)
        SELECT
            %d AS nodex_id,
            tf.node_id AS nodey_id,
            tf.ngram_id AS ngram_id,
            (tf.frequency * idf.idf) AS score
        FROM
            tmp__idf AS idf
        INNER JOIN
            tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
    ''' % (NodeNodeNgram.__table__.name, tfidf_node.id, ))
    # # show off
    # cursor.execute('''
    #     SELECT
    #         node.name,
    #         ngram.terms,
    #         node_node_ngram.score AS tfidf
    #     FROM
    #         %s AS node_node_ngram
    #     INNER JOIN
    #         %s AS node ON node.id = node_node_ngram.nodey_id
    #     INNER JOIN
    #         %s AS ngram ON ngram.id = node_node_ngram.ngram_id
    #     WHERE
    #         node_node_ngram.nodex_id = %d
    #     ORDER BY
    #         score DESC
    # ''' % (NodeNodeNgram.__table__.name, Node.__table__.name, Ngram.__table__.name, corpus.id, ))
    # for row in cursor.fetchall():
    #     print(row)
    # the end!
    db.commit()


# http://stackoverflow.com/questions/8674718/best-way-to-select-random-rows-postgresql
def compute_tfidf_global(corpus):
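    """Compute corpus-level ("global") TF-IDF scores, one per ngram of `corpus`.

    For an ngram t:
        tf(t)  = number of Node_Ngram rows for t among the documents of `corpus`
        idf(t) = ln(D) - ln(number of documents of the same language and node type
                 containing t), with D the total number of such documents
    The tf * idf scores are written to NodeNodeNgram with nodex_id = the
    'Tfidf (global)' node and nodey_id = the corpus itself.
    """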
    dbg = DebugTime('Corpus #%d - tfidf global' % corpus.id)
    dbg.show('calculate terms frequencies sums')
    tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
    session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id == tfidf_node.id).delete()
    session.commit()
    # compute terms frequency sum
    db, cursor = get_cursor()
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__tf (
            ngram_id INT NOT NULL,
            frequency DOUBLE PRECISION NOT NULL
        );
    ''')
    cursor.execute('''
        INSERT INTO
            tmp__tf (ngram_id, frequency)
        SELECT
            node_ngram.ngram_id AS ngram_id,
            (count(*)) AS frequency
        FROM
            %s AS node_ngram
        INNER JOIN
            %s AS node ON node.id = node_ngram.node_id
        WHERE
            node.parent_id = %d
        GROUP BY
            node_ngram.ngram_id;
    ''' % (Node_Ngram.__table__.name, Node.__table__.name, corpus.id, ))
    # show off
    dbg.show('compute idf')
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__idf (
            ngram_id INT NOT NULL,
            idf DOUBLE PRECISION NOT NULL
        )
    ''')
    # TODO: uniform language use in corpus
    if corpus.language_id == cache.Language['fr'].id:
        lang = 'fr'
    else:
        lang = 'en'
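    # idf is computed over every Document node of the detected language across all
    # corpora sharing this corpus' type_id, which is what makes this variant "global"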
    if lang == 'en':
        cursor.execute('''
            INSERT INTO
                tmp__idf (ngram_id, idf)
            SELECT
                node_ngram.ngram_id, -ln(COUNT(*))
            FROM
                %s AS node_ngram
            INNER JOIN
                tmp__tf ON tmp__tf.ngram_id = node_ngram.ngram_id
            INNER JOIN
                %s AS doc ON doc.id = node_ngram.node_id
            INNER JOIN
                %s AS corpus ON corpus.id = doc.parent_id
            WHERE
                doc.language_id = %d AND doc.type_id = %d AND corpus.type_id = %d
                -- AND RANDOM() < 0.01
            GROUP BY
                node_ngram.ngram_id
            -- limit 10000
            ;
        ''' % (Node_Ngram.__table__.name
              , Node.__table__.name
              , Node.__table__.name
              , cache.Language[lang].id
              , cache.NodeType['Document'].id
              , corpus.type_id
              )
        )
    elif lang == 'fr':
        cursor.execute('''
            INSERT INTO
                tmp__idf (ngram_id, idf)
            SELECT
                node_ngram.ngram_id, -ln(COUNT(*))
            FROM
                %s AS node_ngram
            INNER JOIN
                tmp__tf ON tmp__tf.ngram_id = node_ngram.ngram_id
            INNER JOIN
                %s AS doc ON doc.id = node_ngram.node_id
            INNER JOIN
                %s AS corpus ON corpus.id = doc.parent_id
            WHERE
                corpus.language_id = %d AND doc.type_id = %d AND corpus.type_id = %d
                AND RANDOM() < 0.01
            GROUP BY
                node_ngram.ngram_id
            -- limit 10000
            ;
        ''' % (Node_Ngram.__table__.name
              , Node.__table__.name
              , Node.__table__.name
              , cache.Language[lang].id
              , cache.NodeType['Document'].id
              , corpus.type_id
              )
        )
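    # As in compute_tfidf, ln(D) is added below so that idf = ln(D / df). Note the
    # branch asymmetry: the English branch filters on doc.language_id while the
    # French one filters on corpus.language_id and samples rows with
    # RANDOM() < 0.01 (cf. the Stack Overflow link above).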
    cursor.execute('''SELECT COUNT(*) FROM %s AS doc
        WHERE doc.language_id = %d
        AND doc.type_id = %d
        ''' % (Node.__table__.name, cache.Language[lang].id, cache.NodeType['Document'].id))
    D = cursor.fetchone()[0]
    if D > 0:
        lnD = log(D)
        cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
    # show off
    dbg.show('insert tfidf')
    cursor.execute('''
        INSERT INTO
            %s (nodex_id, nodey_id, ngram_id, score)
        SELECT
            %d AS nodex_id,
            %d AS nodey_id,
            tf.ngram_id AS ngram_id,
            (tf.frequency * idf.idf) AS score
        FROM
            tmp__idf AS idf
        INNER JOIN
            tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
    ''' % (NodeNodeNgram.__table__.name, tfidf_node.id, corpus.id, ))
    db.commit()
#corpus=session.query(Node).filter(Node.id==244250).first()
#compute_tfidf_global(corpus)
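
# A minimal usage sketch (assumptions: a bound SQLAlchemy `session` is available,
# as in the commented-out lines above, and `some_corpus_id` is a placeholder for
# a real corpus Node id):
#
#     corpus = session.query(Node).filter(Node.id == some_corpus_id).first()
#     compute_tfidf(corpus)          # one score per (document, ngram) pair
#     compute_tfidf_global(corpus)   # one corpus-level score per ngram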