Commit a395f895 authored by Mathieu Rodic

[OPTI] Improved performance for TF-IDF computation

[OPTI] Further improved performance for metadata indexing
[FEAT] Introduced a tool for debugging and benchmarking
https://forge.iscpif.fr/issues/1438
parent d86ae874
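For reference, the score produced by the new compute_tfidf() below is the classic tf-idf: for a document d and an ngram t, tf(d, t) = w(d, t) / Σ w(d, ·) (the ngram's weight normalized by the document's total weight), idf(t) = ln(D / df(t)) (D documents in the corpus, df(t) of them containing t), and score(d, t) = tf(d, t) · idf(t). A minimal pure-Python sketch of the same computation, useful for checking the SQL below against (docs is a hypothetical {doc_id: {term: weight}} mapping, not project code):

    from collections import defaultdict
    from math import log

    def tfidf(docs):
        D = len(docs)              # number of documents
        df = defaultdict(int)      # document frequency of each term
        for terms in docs.values():
            for term in terms:
                df[term] += 1
        scores = {}
        for doc_id, terms in docs.items():
            total = sum(terms.values())          # like tmp__st
            for term, weight in terms.items():
                tf = weight / total              # like tmp__tf
                idf = log(D) - log(df[term])     # like tmp__idf, once ln(D) is added
                scores[(doc_id, term)] = tf * idf
        return scores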
@@ -12,7 +12,7 @@ from node.admin import CustomForm
 from gargantext_web.db import *
 from gargantext_web.settings import DEBUG, MEDIA_ROOT
-from parsing.corpustools import add_resource, parse_resources, extract_ngrams
+from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
 def project(request, project_id):
@@ -120,6 +120,7 @@ def project(request, project_id):
         try:
             parse_resources(corpus)
             extract_ngrams(corpus, ['title'])
+            compute_tfidf(corpus)
         except Exception as error:
             print('WORKFLOW ERROR')
             print(error)
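With this change, the workflow triggered from the project view runs three stages in sequence; schematically (error handling elided):

    parse_resources(corpus)            # turn raw resources into document nodes
    extract_ngrams(corpus, ['title'])  # extract and index the ngrams of each title
    compute_tfidf(corpus)              # new in this commit: score (document, ngram) pairs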
@@ -2,6 +2,8 @@ from collections import defaultdict
 from datetime import datetime
 from random import random
 from hashlib import md5
+from time import time
+from math import log
 from gargantext_web.db import *
@@ -9,6 +11,23 @@ from .FileParsers import *
+class DebugTime:
+    def __init__(self, prefix):
+        self.prefix = prefix
+        self.message = None
+        self.time = None
+    def __del__(self):
+        if self.message is not None and self.time is not None:
+            print('%s - %s: %.4f' % (self.prefix, self.message, time() - self.time))
+    def show(self, message):
+        self.__del__()
+        self.message = message
+        self.time = time()
 # keep all the parsers in a cache
 class Parsers(defaultdict):
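DebugTime, added above, is a small stopwatch for the benchmarking announced in the commit message: each call to show() first flushes the pending measurement (that is what the explicit self.__del__() call is for), prints it with the instance's prefix, then starts timing the new step; the destructor flushes the last step when the object dies. A usage sketch (the sleep calls stand in for real work):

    from time import sleep

    dbg = DebugTime('Corpus #42 - demo')
    dbg.show('step one')   # starts timing 'step one', prints nothing yet
    sleep(0.5)             # ... work ...
    dbg.show('step two')   # prints 'Corpus #42 - demo - step one: 0.5...'
    sleep(0.2)             # ... more work ...
    del dbg                # prints 'Corpus #42 - demo - step two: 0.2...'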
@@ -72,6 +91,7 @@ def add_resource(corpus, **kwargs):
 def parse_resources(corpus, user=None, user_id=None):
+    dbg = DebugTime('Corpus #%d - parsing' % corpus.id)
     session = Session()
     corpus_id = corpus.id
     type_id = cache.NodeType['Document'].id
@@ -88,7 +108,7 @@ def parse_resources(corpus, user=None, user_id=None):
         .filter(Node_Resource.parsed == False)
     )
     # make a new node for every parsed document of the corpus
-    print('PARSE RESOURCES: retrieve documents')
+    dbg.show('analyze documents')
     nodes = list()
     for resource, resourcetype in resources_query:
         parser = parsers[resourcetype.name]
@@ -115,17 +135,21 @@ def parse_resources(corpus, user=None, user_id=None):
     #
     # TODO: mark node-resources associations as parsed
     #
-    print('PARSE RESOURCES: insert documents')
+    dbg.show('insert documents')
     session.add_all(nodes)
     session.commit()
     # now, index the metadata
-    print('PARSE RESOURCES: insert metadata')
+    dbg.show('insert metadata')
     node_metadata_lists = defaultdict(list)
+    metadata_types = {
+        metadata.name: metadata
+        for metadata in session.query(Metadata)
+    }
     for node in nodes:
         node_id = node.id
         for metadata_key, metadata_value in node.metadata.items():
             try:
-                metadata = cache.Metadata[metadata_key]
+                metadata = metadata_types[metadata_key]
             except KeyError:
                 continue
             if metadata.type == 'string':
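The metadata speed-up is the metadata_types dictionary above: all Metadata rows are fetched in a single query and keyed by name, so the lookups inside the per-document loop stay in a plain dict instead of going through the cache object for every field of every document. The pattern in isolation (a sketch, not project code):

    # one SELECT up front, then O(1) dict lookups in the hot loop
    metadata_types = {m.name: m for m in session.query(Metadata)}
    metadata = metadata_types.get('title')   # no database round-trip here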
@@ -171,6 +195,7 @@ class NgramsExtractors(defaultdict):
 ngramsextractors = NgramsExtractors()
 def extract_ngrams(corpus, keys):
+    dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
     default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
     # query the metadata associated with the given keys
     columns = [Node.id, Node.language_id] + [Node.metadata[key] for key in keys]
@@ -180,7 +205,7 @@ def extract_ngrams(corpus, keys):
         .filter(Node.type_id == cache.NodeType['Document'].id)
     )
     # prepare data to be inserted
-    print('EXTRACT NGRAMS: find ngrams')
+    dbg.show('find ngrams')
     ngrams_data = set()
     node_ngram_list = defaultdict(lambda: defaultdict(int))
     for nodeinfo in metadata_query:
@@ -199,7 +224,7 @@ def extract_ngrams(corpus, keys):
                 (n, terms)
             )
     # insert ngrams to temporary table
-    print('EXTRACT NGRAMS: find ngrams ids')
+    dbg.show('find ngrams ids')
     db, cursor = get_cursor()
     cursor.execute('''
         CREATE TEMPORARY TABLE tmp__ngrams (
@@ -249,7 +274,7 @@ def extract_ngrams(corpus, keys):
     for row in cursor.fetchall():
         ngram_ids[row[1]] = row[0]
     #
-    print('EXTRACT NGRAMS: insert associations')
+    dbg.show('insert associations')
     node_ngram_data = list()
     for node_id, ngrams in node_ngram_list.items():
         for terms, weight in ngrams.items():
@@ -258,3 +283,116 @@ def extract_ngrams(corpus, keys):
     bulk_insert(Node_Ngram, ['node_id', 'ngram_id', 'weight'], node_ngram_data, cursor=cursor)
     # commit to database
     db.commit()
+# tfidf calculation
+def compute_tfidf(corpus):
+    dbg = DebugTime('Corpus #%d - tfidf' % corpus.id)
+    # compute terms frequency sum
+    dbg.show('calculate terms frequencies sums')
+    db, cursor = get_cursor()
+    cursor.execute('''
+        CREATE TEMPORARY TABLE tmp__st (
+            node_id INT NOT NULL,
+            frequency DOUBLE PRECISION NOT NULL
+        )
+    ''')
+    cursor.execute('''
+        INSERT INTO
+            tmp__st (node_id, frequency)
+        SELECT
+            node_ngram.node_id,
+            SUM(node_ngram.weight) AS frequency
+        FROM
+            %s AS node
+        INNER JOIN
+            %s AS node_ngram ON node_ngram.node_id = node.id
+        WHERE
+            node.parent_id = %d
+        GROUP BY
+            node_ngram.node_id
+    ''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
+    # compute normalized terms frequencies
+    dbg.show('normalize terms frequencies')
+    cursor.execute('''
+        CREATE TEMPORARY TABLE tmp__tf (
+            node_id INT NOT NULL,
+            ngram_id INT NOT NULL,
+            frequency DOUBLE PRECISION NOT NULL
+        )
+    ''')
+    cursor.execute('''
+        INSERT INTO
+            tmp__tf (node_id, ngram_id, frequency)
+        SELECT
+            node_ngram.node_id,
+            node_ngram.ngram_id,
+            (node_ngram.weight / node.frequency) AS frequency
+        FROM
+            %s AS node_ngram
+        INNER JOIN
+            tmp__st AS node ON node.node_id = node_ngram.node_id
+    ''' % (Node_Ngram.__table__.name, ))
+    # compute inverse document frequencies
+    dbg.show('compute idf')
+    cursor.execute('''
+        CREATE TEMPORARY TABLE tmp__idf (
+            ngram_id INT NOT NULL,
+            idf DOUBLE PRECISION NOT NULL
+        )
+    ''')
+    cursor.execute('''
+        INSERT INTO
+            tmp__idf(ngram_id, idf)
+        SELECT
+            node_ngram.ngram_id,
+            -ln(COUNT(*))
+        FROM
+            %s AS node
+        INNER JOIN
+            %s AS node_ngram ON node_ngram.node_id = node.id
+        WHERE
+            node.parent_id = %d
+        GROUP BY
+            node_ngram.ngram_id
+    ''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
+    # add ln(D) to every row, so that idf = ln(D) - ln(df) = ln(D / df)
+    cursor.execute('SELECT COUNT(*) FROM tmp__st')
+    D = cursor.fetchone()[0]
+    lnD = log(D)
+    cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
+    # combine tf and idf into the final scores
+    dbg.show('compute results, a.k.a tfidf')
+    cursor.execute('''
+        INSERT INTO
+            %s (nodex_id, nodey_id, ngram_id, score)
+        SELECT
+            %d AS nodex_id,
+            tf.node_id AS nodey_id,
+            tf.ngram_id AS ngram_id,
+            (tf.frequency * idf.idf) AS score
+        FROM
+            tmp__idf AS idf
+        INNER JOIN
+            tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
+    ''' % (NodeNodeNgram.__table__.name, corpus.id, ))
+    # benchmark query: read the scores back, best first (the rows are never fetched)
+    cursor.execute('''
+        SELECT
+            node.name,
+            ngram.terms,
+            node_node_ngram.score AS tfidf
+        FROM
+            %s AS node_node_ngram
+        INNER JOIN
+            %s AS node ON node.id = node_node_ngram.nodey_id
+        INNER JOIN
+            %s AS ngram ON ngram.id = node_node_ngram.ngram_id
+        WHERE
+            node_node_ngram.nodex_id = %d
+        ORDER BY
+            score DESC
+    ''' % (NodeNodeNgram.__table__.name, Node.__table__.name, Ngram.__table__.name, corpus.id, ))
+    # the end!
+    db.commit()
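Once committed, the scores can be read back through the ORM. A minimal sketch, assuming the usual session and the Ngram and NodeNodeNgram models from gargantext_web.db (the limit of 10 is arbitrary):

    # fetch the ten strongest (document, ngram) scores of a corpus
    tfidf_query = (session
        .query(Ngram.terms, NodeNodeNgram.score)
        .filter(NodeNodeNgram.ngram_id == Ngram.id)
        .filter(NodeNodeNgram.nodex_id == corpus.id)
        .order_by(NodeNodeNgram.score.desc())
        .limit(10)
    )
    for terms, score in tfidf_query:
        print('%-40s %.4f' % (terms, score))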
 # Without this, we couldn't use the Django environment
 import os
 os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
 os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")
-# perform a bulk insert, just to see
-from gargantext_web.db import *
+# database tools
+from gargantext_web.db import *
+from parsing.corpustools import *
 user = session.query(User).first()
 project = session.query(Node).filter(Node.name == 'A').first()
 corpus = Node(
     parent_id = project.id,
-    name = 'Test 123',
+    name = 'Test 456',
     type_id = cache.NodeType['Corpus'].id,
     user_id = user.id,
 )
@@ -23,13 +22,16 @@ corpus = Node(
 session.add(corpus)
 session.commit()
-#
-from parsing.corpus import *
 add_resource(corpus,
     file = './data_samples/pubmed_result.xml',
     type_id = cache.ResourceType['pubmed'].id,
 )
 parse_resources(corpus)
-extract_ngrams(corpus, ('abstract', ))
+extract_ngrams(corpus, ('title', ))
+# print(corpus)
+# corpus = session.query(Node).filter(Node.id == 72771).first()
+# corpus = session.query(Node).filter(Node.id == 73017).first()
+compute_tfidf(corpus)
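This script is the debugging and benchmarking tool announced in the commit message: run with the Django environment available (the os.environ.setdefault lines above bootstrap the settings), it creates a throwaway corpus, loads a PubMed sample into it, and exercises the whole workflow, each stage reporting its duration through DebugTime in the form 'Corpus #<id> - <stage> - <step>: <seconds>'. The commented-out queries hint at the intended iteration loop: once a corpus exists, a single stage can be re-run against it without re-parsing, along these lines (the node id is only an example taken from the comments above):

    # reuse an existing corpus instead of rebuilding one on every run
    corpus = session.query(Node).filter(Node.id == 73017).first()
    compute_tfidf(corpus)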