Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
025087fd
Commit
025087fd
authored
Sep 16, 2015
by
delanoe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] TFIDF: document/corpus and corpus/language.
parent
10c68905
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
120 additions
and
8 deletions
+120
-8
celery.py
gargantext_web/celery.py
+3
-1
tfidf.py
ngram/tfidf.py
+117
-7
No files found.
gargantext_web/celery.py
View file @
025087fd
...
...
@@ -9,7 +9,8 @@ def debug_task(request):
print
(
'Request: {0!r}'
.
format
(
request
))
from
gargantext_web.db
import
session
,
Node
from
ngram.tfidf
import
compute_tfidf
from
ngram.tfidf
import
compute_tfidf
,
compute_tfidf_global
@
shared_task
def
apply_sum
(
x
,
y
):
...
...
@@ -43,6 +44,7 @@ def apply_workflow(corpus_id):
update_processing
(
corpus
,
3
)
compute_tfidf
(
corpus
)
compute_tfidf_global
(
corpus
,
lang
=
'en'
)
ngrams2miam
(
user_id
=
corpus
.
user_id
,
corpus_id
=
corpus_id
)
update_processing
(
corpus
,
0
)
...
...
ngram/tfidf.py
View file @
025087fd
from
collections
import
defaultdict
from
datetime
import
datetime
from
random
import
random
from
hashlib
import
md5
from
time
import
time
from
math
import
log
from
gargantext_web.db
import
*
from
gargantext_web.db
import
get_or_create_node
from
admin.utils
import
DebugTime
...
...
@@ -14,6 +9,8 @@ def compute_tfidf(corpus):
dbg
=
DebugTime
(
'Corpus #
%
d - tfidf'
%
corpus
.
id
)
# compute terms frequency sum
dbg
.
show
(
'calculate terms frequencies sums'
)
tfidf_node
=
get_or_create_node
(
nodetype
=
'Tfidf'
,
user_id
=
corpus
.
user_id
,
parent_id
=
corpus
.
id
)
db
,
cursor
=
get_cursor
()
cursor
.
execute
(
'''
CREATE TEMPORARY TABLE tmp__st (
...
...
@@ -99,7 +96,7 @@ def compute_tfidf(corpus):
tmp__idf AS idf
INNER JOIN
tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
'''
%
(
NodeNodeNgram
.
__table__
.
name
,
corpus
.
id
,
))
'''
%
(
NodeNodeNgram
.
__table__
.
name
,
tfidf_node
.
id
,
))
# # show off
# cursor.execute('''
# SELECT
...
...
@@ -121,3 +118,116 @@ def compute_tfidf(corpus):
# print(row)
# the end!
db
.
commit
()
#http://stackoverflow.com/questions/8674718/best-way-to-select-random-rows-postgresql
def compute_tfidf_global(corpus, lang='fr'):
    """Score the corpus' ngrams against every document of a given language.

    The computation is done entirely in SQL through two temporary tables:

    * ``tmp__tf``  -- for each ngram, the number of ``Node_Ngram`` rows
      attached to nodes whose ``parent_id`` is ``corpus.id``.
    * ``tmp__idf`` -- for each ngram present in ``tmp__tf``,
      ``ln(D) - ln(n)`` where ``n`` is the number of ``Node_Ngram`` rows
      attached to 'Document' nodes of the requested language and ``D`` is
      the number of 'Document' nodes whose ``language_id`` is that language.

    The product ``frequency * idf`` is then inserted into the
    ``NodeNodeNgram`` table with ``nodex_id`` set to the 'Tfidf (global)'
    node of the corpus (fetched or created here) and ``nodey_id`` set to
    ``corpus.id``.  The transaction is committed before returning.

    Parameters:
        corpus: object exposing ``id`` and ``user_id`` (both are read here).
        lang:   language key looked up in ``cache.Language``.  Only 'en' and
                'fr' fill ``tmp__idf``; with any other value no score is
                inserted (this silent behaviour is kept from the original).

    Returns:
        None.
    """
    dbg = DebugTime('Corpus #%d - tfidf global' % corpus.id)
    dbg.show('calculate terms frequencies sums')

    tfidf_node = get_or_create_node(nodetype='Tfidf (global)',
                                    user_id=corpus.user_id,
                                    parent_id=corpus.id)

    # Table names come from the ORM models and every interpolated value below
    # is an internal integer identifier, never user-supplied text.
    node_table = Node.__table__.name
    node_ngram_table = Node_Ngram.__table__.name

    # compute terms frequency sum
    db, cursor = get_cursor()
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__tf (
            ngram_id INT NOT NULL,
            frequency DOUBLE PRECISION NOT NULL
        );
    ''')
    cursor.execute('''
        INSERT INTO
            tmp__tf (ngram_id, frequency)
        SELECT
            node_ngram.ngram_id AS ngram_id,
            (count(*)) AS frequency
        FROM
            %s AS node_ngram
        INNER JOIN
            %s AS node ON node.id = node_ngram.node_id
        WHERE
            node.parent_id = %d
        GROUP BY node_ngram.ngram_id;
    ''' % (node_ngram_table, node_table, corpus.id, ))

    # compute idf
    dbg.show('compute idf')
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__idf (
            ngram_id INT NOT NULL,
            idf DOUBLE PRECISION NOT NULL
        )
    ''')

    language_id = cache.Language[lang].id
    document_type_id = cache.NodeType['Document'].id

    # The two supported languages only differ in where the language is read:
    # on the document itself ('en') or on the document's parent node ('fr').
    if lang == 'en':
        language_join = ''
        language_column = 'doc.language_id'
    elif lang == 'fr':
        language_join = (
            'INNER JOIN %s AS corpus ON corpus.id = doc.parent_id' % node_table
        )
        language_column = 'corpus.language_id'
    else:
        # Unsupported language: tmp__idf stays empty, so nothing is scored.
        language_column = None

    if language_column is not None:
        cursor.execute('''
            INSERT INTO
                tmp__idf(ngram_id, idf)
            SELECT
                node_ngram.ngram_id,
                -ln(COUNT(*))
            FROM
                %s AS node_ngram
            INNER JOIN
                tmp__tf ON tmp__tf.ngram_id = node_ngram.ngram_id
            INNER JOIN
                %s AS doc ON doc.id = node_ngram.node_id
            %s
            WHERE
                %s = %d AND doc.type_id = %d
            GROUP BY
                node_ngram.ngram_id
            ;
        ''' % (node_ngram_table, node_table, language_join,
               language_column, language_id, document_type_id))

    # NOTE(review): D always filters on doc.language_id, whereas the 'fr'
    # branch above filters on the parent's language_id -- confirm that both
    # populations are meant to be the same.
    cursor.execute('''
        SELECT COUNT(*) FROM %s AS doc
        WHERE doc.language_id = %d
          AND doc.type_id = %d
    ''' % (node_table, language_id, document_type_id))
    D = cursor.fetchone()[0]
    if D > 0:
        lnD = log(D)
        # %r keeps the full double precision of ln(D); the previous %f
        # silently truncated it to 6 decimal places.
        cursor.execute('UPDATE tmp__idf SET idf = idf + %r' % (lnD, ))

    # store the scores
    dbg.show('insert tfidf for %d documents' % D)
    cursor.execute('''
        INSERT INTO
            %s (nodex_id, nodey_id, ngram_id, score)
        SELECT
            %d AS nodex_id,
            %d AS nodey_id,
            tf.ngram_id AS ngram_id,
            (tf.frequency * idf.idf) AS score
        FROM
            tmp__idf AS idf
        INNER JOIN
            tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
    ''' % (NodeNodeNgram.__table__.name, tfidf_node.id, corpus.id, ))

    db.commit()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment