Commit c08a3b6b authored by delanoe's avatar delanoe

[FIX] 2 MAJOR BUGS on COOC SQL QUERY

    - [OLD] Performance regression
        -> lengthening and slowing the toolchain queue
        -> e.g. 2 seconds on just 21 Europresse documents is too much
    - [OLD] Some ngrams included whereas they are not in the corpus
    + [NEW] Clarity in the query
    + [NEW] Improved: 2000 ms before less than 500 ms after (factor 4
    optimization on a very small corpus); should be ok in bigger corpora

New behavior of the query tested with real corpus and this simple
example; copy paste these lines in test.sql and run it in a test
database (createdb test).

-- let be:

-- Idempotent teardown: IF EXISTS avoids errors when running against a
-- freshly created database (createdb test), as the commit message suggests.
-- Drop order respects the foreign keys (children before parents).
drop table if exists nodes_ngrams;
drop table if exists synonyms;
drop table if exists ngrams;
drop table if exists nodes;

-- Document container: each row is one document node of the corpus.
create table nodes (
    id serial PRIMARY KEY not null
);

-- The ngram (term) dictionary.
create table ngrams (
    id serial PRIMARY KEY not null,
    text varchar(50)
);

-- Synonym groupings scoped by a grouping node (node_id):
-- ngram2 is a subform whose mainform is ngram1.
-- NB: node_id now carries the same FK discipline as the other id columns.
create table synonyms (
    id serial PRIMARY KEY not null,
    node_id INTEGER not null references nodes(id),
    ngram1_id INTEGER not null references ngrams(id),
    ngram2_id INTEGER not null references ngrams(id)
);

-- Occurrence index: which ngram appears in which document node.
create table nodes_ngrams (
    id serial PRIMARY KEY not null,
    node_id INTEGER not null references nodes(id),
    ngram_id INTEGER not null references ngrams(id)
);

-- Three document nodes (explicit ids, as in the original fixture).
insert into nodes (id)
values (1), (2), (3);

-- Eight ngrams; the serial column assigns ids 1..8 in insertion order.
insert into ngrams (text)
values ('object'), ('table'), ('animal'), ('cat'),
       ('dog'), ('other'), ('abc'), ('xyz');
--select * from ngrams;
---- id |  text
--------+--------
----  1 | object
----  2 | table
----  3 | animal
----  4 | cat
----  5 | dog
----  6 | other
----  7 | abc
----  8 | xyz

-- Synonym groups under grouping node 1:
--   mainform 1 (object) <- subform 2 (table)
--   mainform 3 (animal) <- subforms 4 (cat) and 5 (dog)
insert into synonyms (node_id, ngram1_id, ngram2_id)
values (1, 1, 2), (1, 3, 4), (1, 3, 5);
--select * from synonyms;
-- id | node_id | ngram1_id | ngram2_id
------+---------+-----------+-----------
--  1 |       1 |         1 |         2
--  2 |       1 |         3 |         4
--  3 |       1 |         3 |         5

-- Document/ngram occurrences:
--   doc 1: object(1), other(6), table(2)
--   doc 2: cat(4), dog(5)
--   doc 3: cat(4), dog(5), other(6)
insert into nodes_ngrams (node_id, ngram_id)
values (1, 1), (1, 6), (1, 2),
       (2, 4), (2, 5),
       (3, 4), (3, 5), (3, 6);
--select * from nodes_ngrams;
-- id | node_id | ngram_id
------+---------+----------
--  1 |       1 |        1
--  2 |       1 |        6
--  3 |       1 |        2
--  4 |       2 |        4
--  5 |       2 |        5
--  6 |       3 |        4
--  7 |       3 |        5
--  8 |       3 |        6

-- Raw cooccurrence counts (no synonym folding): for every document node,
-- count each pair (a, b) of its ngrams, keeping only a <= b so each
-- unordered pair is counted once (diagonal included).
select n1.ngram_id, n2.ngram_id, count(*)
from nodes n
INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id
INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id
where
n1.ngram_id <= n2.ngram_id
-- NB: n1.node_id = n2.node_id is implied, both sides join on n.id
group by n1.ngram_id, n2.ngram_id
order by n1.ngram_id, n2.ngram_id
;
-- ngram_id | ngram_id | count
------------+----------+-------
--        5 |        6 |     1
--        1 |        6 |     1
--        4 |        6 |     1
--        2 |        2 |     1
--        4 |        4 |     2
--        1 |        1 |     1
--        1 |        2 |     1
--        6 |        6 |     2
--        2 |        6 |     1
--        4 |        5 |     2
--        5 |        5 |     2
--(11 lignes)

-- Cooccurrence counts WITH synonym folding: each subform is replaced by
-- its mainform (synonyms of grouping node 1) before counting, so subform
-- occurrences are added to their mainform's counts.
select coalesce(n11.id, n1.ngram_id), coalesce(n22.id, n2.ngram_id), count(*)
from nodes n
INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id
-- fold side A to its mainform when it is a subform in grouping node 1
LEFT JOIN synonyms s1 on n1.ngram_id = s1.ngram2_id AND s1.node_id = 1
LEFT JOIN ngrams n11 on s1.ngram1_id = n11.id

INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id
-- fold side B likewise
LEFT JOIN synonyms s2 on n2.ngram_id = s2.ngram2_id AND s2.node_id = 1
LEFT JOIN ngrams n22 on s2.ngram1_id = n22.id
where
n1.ngram_id <= n2.ngram_id
AND
-- redundant (both sides already join on n.id) but kept as an explicit guard
n1.node_id = n2.node_id
group by coalesce(n11.id, n1.ngram_id), coalesce(n22.id, n2.ngram_id)
order by coalesce(n11.id, n1.ngram_id), coalesce(n22.id, n2.ngram_id)
;
-- coalesce | coalesce | count
------------+----------+-------
--        1 |        6 |     2
--        3 |        3 |     6
--        1 |        1 |     3
--        3 |        6 |     2
--        6 |        6 |     2
--(5 lignes)
--> the counted sums match (totals are consistent)
parent 6d527345
...@@ -2,33 +2,30 @@ ...@@ -2,33 +2,30 @@
# WARNING: to ensure consistency and retrocompatibility, lists should keep the # WARNING: to ensure consistency and retrocompatibility, lists should keep the
# initial order (ie., new elements should be appended at the end of the lists) # initial order (ie., new elements should be appended at the end of the lists)
abstract: abstract:
--------- ---------
something between global params, constants, something between global params, constants,
configuration variables, ini file... configuration variables, ini file...
contents: contents:
--------- ---------
+ database constants/ontology
+ db constants/ontology
- nodetypes - nodetypes
(db int <=> named types <=> python code) (db int <=> named types <=> python code)
+ input low-level limits + low-level limits
- query size - query size
- max upload size - max upload size
- doc parsing batch size - doc parsing batch size
- word extraction batch size - word extraction batch size
+ process config + main process config
- resourcetypes config (~ input ontology) - resourcetypes config (~ input ontology)
- wordlist generation params - wordlist generation params
- graph creation params - graph creation params
- £TODO sequence of transformations "custom pipeline" - £TODO sequence of transformations "custom pipeline"
+ input process subclasses/subroutines + subprocess config
- crawling, import - crawling, import
- tagger services and functions - tagger services and functions
- parser services - parser services
...@@ -83,6 +80,7 @@ NODETYPES = [ ...@@ -83,6 +80,7 @@ NODETYPES = [
# docs subset # docs subset
'FAVORITES', # 15 'FAVORITES', # 15
# more scores (sorry!) # more scores (sorry!)
'TIRANK-LOCAL', # 16 'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17 'TIRANK-GLOBAL', # 17
...@@ -90,6 +88,13 @@ NODETYPES = [ ...@@ -90,6 +88,13 @@ NODETYPES = [
'RESOURCE', # 19 'RESOURCE', # 19
] ]
def get_nodetype_id_by_name(nodetype):
    '''nodetype :: name (str) => integer id, i.e. its index in NODETYPES.

    Returns None when the name is unknown.

    Fixes: the previous body referenced an undefined name (`sourcename`,
    a NameError on first use), subscripted the plain-string NODETYPES
    entries with n["name"], and returned the entry instead of the id
    promised by the function name.
    '''
    # NODETYPES is a flat list of names whose position is the db integer id
    # (cf. NODETYPES.index('DOCUMENT') usage elsewhere in this module).
    for type_id, name in enumerate(NODETYPES):
        if str(name) == str(nodetype):
            return type_id
    return None
INDEXED_HYPERDATA = { INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing # TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db) # (type, convert_to_db, convert_from_db)
...@@ -154,7 +159,6 @@ INDEXED_HYPERDATA = { ...@@ -154,7 +159,6 @@ INDEXED_HYPERDATA = {
# user parameters---------------------------------------- # user parameters----------------------------------------
USER_LANG = ["fr", "en"] USER_LANG = ["fr", "en"]
# resources --------------------------------------------- # resources ---------------------------------------------
def get_resource(sourcetype): def get_resource(sourcetype):
'''resource :: type => resource dict''' '''resource :: type => resource dict'''
......
...@@ -7,7 +7,7 @@ from sqlalchemy import create_engine ...@@ -7,7 +7,7 @@ from sqlalchemy import create_engine
from gargantext.util.lists import WeightedMatrix from gargantext.util.lists import WeightedMatrix
# from gargantext.util.db import session, aliased, func # from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD from gargantext.constants import DEFAULT_COOC_THRESHOLD, NODETYPES
from gargantext.constants import INDEXED_HYPERDATA from gargantext.constants import INDEXED_HYPERDATA
from gargantext.util.tools import datetime, convert_to_date from gargantext.util.tools import datetime, convert_to_date
...@@ -53,9 +53,9 @@ def compute_coocs( corpus, ...@@ -53,9 +53,9 @@ def compute_coocs( corpus,
- groupings_id: optional synonym relations to add all subform counts - groupings_id: optional synonym relations to add all subform counts
with their mainform's counts with their mainform's counts
- on_list_id: mainlist or maplist type, to constrain the input ngrams - on_list_id: mainlist or maplist type, to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams - TODO stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is already provided) (normally unnecessary if a mainlist is already provided)
- start, end: provide one or both temporal limits to filter on doc date - TODO start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow (string is also possible but format must follow
this convention: "2001-01-01" aka "%Y-%m-%d") this convention: "2001-01-01" aka "%Y-%m-%d")
...@@ -72,183 +72,160 @@ def compute_coocs( corpus, ...@@ -72,183 +72,160 @@ def compute_coocs( corpus,
connection = engine.connect() connection = engine.connect()
# string vars for our SQL query # string vars for our SQL query
sql_statement = "" # setting work memory high to improve cache perf.
doc_idx_statement = "" final_sql = "set work_mem='1GB'; \n"
# where
# final_sql = cooc_sql + select_cooc_sql
cooc_sql = ""
select_cooc_sql = ""
# where
# cooc_sql = cooc_sql + ngram_filter_A_sql + ngram_filter + cooc_filter_sql
cooc_filter_sql = ""
ngram_filter_A_sql = ""
ngram_filter_B_sql = ""
# 2a) prepare the document selection (normal case) # 2a) prepare the document selection (normal case)
doc_idx_statement = """ cooc_sql += """
SELECT node_id, ngram_id WITH COOC as (
FROM nodes_ngrams SELECT
JOIN nodes COALESCE(grA.ngram1_id, wlA.ngram_id) as ngA,
ON node_id = nodes.id COALESCE(grB.ngram1_id, wlB.ngram_id) as ngB,
WHERE nodes.parent_id = {corpus_id} COUNT(*) AS score
AND nodes.typename = 4 FROM
""".format(corpus_id=corpus.id) nodes AS n
-- / \
-- X Y
# 2b) same if document filters -- SQL graph for getting the cooccurrences
if start or end: """
date_type_id = INDEXED_HYPERDATA['publication_date']['id']
doc_idx_statement = """
SELECT node_id, ngram_id
FROM nodes_ngrams
JOIN nodes
ON node_id = nodes.id
-- preparing for date filter (1/2)
JOIN nodes_hyperdata
ON nodes_hyperdata.node_id = nodes_ngrams.node_id
WHERE nodes.parent_id = {corpus_id}
AND nodes.typename = 4
-- preparing for date filter (2/2)
AND nodes_hyperdata.key = {date_type_id}
""".format(corpus_id=corpus.id, date_type_id = date_type_id)
if start:
if not isinstance(start, datetime):
try:
start = datetime.strptime(start, '%Y-%m-%d')
except:
raise TypeError("'start' param expects datetime object or %%Y-%%m-%%d string")
# datetime object ~> date db formatted filter (2013-09-16 00:00:00+02)
start_filter = "AND nodes_hyperdata.value_utc >= %s::date" % start.strftime('%Y-%m-%d %H:%M:%S%z')
# the filtering by start limit
doc_idx_statement += "\n" + start_filter
if end:
if not isinstance(end, datetime):
try:
end = datetime.strptime(end, '%Y-%m-%d')
except:
raise TypeError("'end' param expects datetime object or %%Y-%%m-%%d string")
# datetime object ~> date db formatted filter
end_filter = "AND nodes_hyperdata.value_utc <= %s::date" % end.strftime('%Y-%m-%d %H:%M:%S%z')
# the filtering by end limit
doc_idx_statement += "\n" + end_filter
# 4) prepare the synonyms
if groupings_id:
syn_statement = """
SELECT * FROM nodes_ngrams_ngrams
WHERE node_id = {groupings_id}
""".format(groupings_id = groupings_id)
# 5a) MAIN DB QUERY SKELETON (no groupings) -------------------------------- # 2b) stating the filters
if not groupings_id: cooc_filter_sql = """
sql_statement = """ WHERE
SELECT cooc.* n.typename = {nodetype_id}
FROM ( AND n.parent_id = {corpus_id}
SELECT idxA.ngram_id AS ngA, GROUP BY 1,2
idxB.ngram_id AS ngB, -- ==
count((idxA.ngram_id, -- GROUP BY ngA, ngB
idxB.ngram_id)) AS cwei )
-- read doc index x 2 """.format( nodetype_id = NODETYPES.index('DOCUMENT')
FROM ({doc_idx}) AS idxA , corpus_id=corpus.id
JOIN ({doc_idx}) AS idxB )
-- cooc <=> in same doc node
ON idxA.node_id = idxB.node_id # 3) taking the cooccurrences of ngram x2
ngram_filter_A_sql += """
GROUP BY ((idxA.ngram_id,idxB.ngram_id)) -- STEP 1: X axis of the matrix
) AS cooc INNER JOIN nodes_ngrams
""".format(doc_idx = doc_idx_statement) AS ngA ON ngA.node_id = n.id
# -------------------------------------------------------------------------- -- \--> get the occurrences node/ngram of the corpus
"""
# 5b) MAIN DB QUERY SKELETON (with groupings)
# groupings: we use additional Translation (synonyms) for ngA and ngB ngram_filter_B_sql += """
else: -- STEP 2: Y axis of the matrix
sql_statement = """ INNER JOIN nodes_ngrams
SELECT cooc.* AS ngB ON ngB.node_id = n.id
FROM ( -- \--> get the occurrences node/ngram of the corpus
SELECT COALESCE(synA.ngram1_id, idxA.ngram_id) AS ngA, """
COALESCE(synB.ngram1_id, idxB.ngram_id) AS ngB,
count((COALESCE(synA.ngram1_id, idxA.ngram_id), # 3) filter with lists (white or stop)
COALESCE(synB.ngram1_id, idxB.ngram_id))) AS cwei # on whiteList
-- read doc index x 2
FROM ({doc_idx}) AS idxA
JOIN ({doc_idx}) AS idxB
-- cooc <=> in same doc node
ON idxA.node_id = idxB.node_id
-- when idxA.ngram_id is a subform
LEFT JOIN ({synonyms}) as synA
ON synA.ngram2_id = idxA.ngram_id
-- when idxB.ngram_id is a subform
LEFT JOIN ({synonyms}) as synB
ON synB.ngram2_id = idxB.ngram_id
GROUP BY (COALESCE(synA.ngram1_id, idxA.ngram_id),
COALESCE(synB.ngram1_id, idxB.ngram_id))
) AS cooc
""".format(doc_idx = doc_idx_statement,
synonyms = syn_statement)
# 6) prepare 2 x node_ngrams alias if whitelist
if on_list_id: if on_list_id:
sql_statement +=""" ngram_filter_A_sql += """
JOIN nodes_ngrams AS whitelistA INNER JOIN nodes_ngrams
ON whitelistA.ngram_id = cooc.ngA AS wlA ON ngA.ngram_id = wlA.ngram_id
AND wlA.node_id = {wla_node_id}
-- \--> filter with white/main list
""".format(wla_node_id = on_list_id)
ngram_filter_B_sql += """
INNER JOIN nodes_ngrams
AS wlB ON ngB.ngram_id = wlB.ngram_id
AND wlB.node_id = {wlb_node_id}
-- \--> filter with white/main list
""".format(wlb_node_id = on_list_id)
# on stopList
# TODO NOT TESTED
if stoplist_id:
raise("Stoplist not tested yet")
ngram_filter_A_sql += """
LEFT JOIN nodes_ngrams
AS stA ON ngA.ngram_id = stA.ngram_id
AND stA.node_id = {sta_node_id}
AND stA.ngram_id IS NULL
-- \--> filter with stop list
""".format(sta_node_id = stoplist_id)
ngram_filter_B_sql += """
LEFT JOIN nodes_ngrams
AS stB ON ngB.ngram_id = stB.ngram_id
AND stB.node_id = {stb_node_id}
AND stB.ngram_id IS NULL
-- \--> filter with white/main list
""".format(stb_node_id = stoplist_id)
JOIN nodes_ngrams AS whitelistB
ON whitelistB.ngram_id = cooc.ngB
"""
if stoplist_id: # 4) prepare the synonyms
# used for reverse join if groupings_id:
sql_statement +=""" ngram_filter_A_sql += """
LEFT JOIN ( LEFT JOIN nodes_ngrams_ngrams
SELECT * FROM nodes_ngrams AS grA ON wlA.ngram_id = grA.ngram1_id
WHERE nodes_ngrams.node_id = %i AND grA.node_id = {groupings_id}
) AS stoplistA -- \--> adding (joining) ngrams that are grouped
ON stoplistA.ngram_id = cooc.ngA LEFT JOIN nodes_ngrams
AS wlAA ON grA.ngram2_id = wlAA.id
LEFT JOIN ( AND wlA.node_id = wlA.node_id
SELECT * FROM nodes_ngrams -- \--> adding (joining) ngrams that are not grouped
WHERE nodes_ngrams.node_id = %i --LEFT JOIN ngrams AS wlAA ON grA.ngram2_id = wlAA.id
) AS stoplistB -- \--> for joining all synonyms even if they are not in the main list (white list)
ON stoplistA.ngram_id = cooc.ngA
""" % (stoplist_id, stoplist_id)
# 7) FILTERS
# the inclusive threshold filter is always here """.format(groupings_id = groupings_id)
sql_statement += "\n WHERE cooc.cwei >= %i" % threshold
ngram_filter_B_sql += """
LEFT JOIN nodes_ngrams_ngrams
AS grB ON wlB.ngram_id = grB.ngram1_id
AND grB.node_id = {groupings_id}
-- \--> adding (joining) ngrams that are grouped
LEFT JOIN nodes_ngrams
AS wlBB ON grB.ngram2_id = wlBB.id
AND wlB.node_id = wlB.node_id
-- \--> adding (joining) ngrams that are not grouped
-- LEFT JOIN ngrams AS wlBB ON grB.ngram2_id = wlBB.id
-- \--> for joining all synonyms even if they are not in the main list (white list)
""".format(groupings_id = groupings_id)
# the optional whitelist perimeters
if on_list_id:
sql_statement += "\n AND whitelistA.node_id = %i" % on_list_id
sql_statement += "\n AND whitelistB.node_id = %i" % on_list_id
if stoplist_id: # 5) Build the main COOC query
sql_statement += "\n AND stoplistA.ngram_id IS NULL" cooc_sql += ngram_filter_A_sql + ngram_filter_B_sql + cooc_filter_sql
sql_statement += "\n AND stoplistB.ngram_id IS NULL"
# 6) FILTERS
select_cooc_sql = """
SELECT ngA, ngB, score
FROM COOC --> from the query above
"""
# the inclusive threshold filter is always here
select_cooc_sql += "\n WHERE score >= %i" % threshold
# don't compute ngram with itself # don't compute ngram with itself
# NB: this option is bad for main toolchain # NB: this option is bad for main toolchain
if diagonal_filter: if diagonal_filter:
sql_statement += "\n AND ngA != ngB" select_cooc_sql += "\n AND ngA != ngB"
# 1 filtre tenant en compte de la symétrie # 1 filtre tenant en compte de la symétrie
# NB: this option is also bad for main toolchain # NB: this option is also bad for main toolchain
if symmetry_filter: if symmetry_filter:
sql_statement += "\n AND ngA <= ngB" select_cooc_sql += "\n AND ngA <= ngB"
# 7) Building the final query
final_sql += cooc_sql + select_cooc_sql
# 6) EXECUTE QUERY # 6) EXECUTE QUERY
# ---------------- # ----------------
# debug # debug
print(sql_statement) print(final_sql)
# executing the SQL statement # executing the SQL statement
results = connection.execute(sql_statement) results = connection.execute(final_sql)
# => storage in our matrix structure # => storage in our matrix structure
matrix = WeightedMatrix(results) matrix = WeightedMatrix(results)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment