Commit c08a3b6b authored by delanoe's avatar delanoe

[FIX] 2 MAJOR BUGS on COOC SQL QUERY

    - [OLD] Performance regression
        -> lengthening and slowing the toolchain queue
        -> (2 seconds on 21 Europresse documents is too much, for instance)
    - [OLD] Some ngrams were included even though they are not in the corpus
    + [NEW] Clarity in the query
    + [NEW] Improved: 2000 ms before, less than 500 ms after (a 4x speed-up
    on a very small corpus); should be fine on bigger corpora

The new behavior of the query was tested with a real corpus and with this
simple example; copy and paste these lines into test.sql and run it in a
test database (createdb test).

-- Given the following schema and data:

-- Fixture schema for the co-occurrence test queries.
-- IF EXISTS makes the script runnable both on a freshly created database
-- (where none of these tables exist yet) and on repeated runs.
-- Referencing tables are dropped before the tables they reference.
drop table if exists nodes_ngrams;
drop table if exists synonyms;
drop table if exists ngrams;
drop table if exists nodes;

-- One row per node.
create table nodes (
    id serial PRIMARY KEY not null
);

-- One row per ngram; ids are generated by the serial column.
create table ngrams (
    id serial PRIMARY KEY not null,
    text varchar(50)
);

-- Pairs of ngrams attached to a node_id: the grouped query below replaces
-- an ngram matching ngram2_id by the corresponding ngram1_id.
create table synonyms (
    id serial PRIMARY KEY not null,
    node_id INTEGER not null,
    ngram1_id INTEGER not null references ngrams(id),
    ngram2_id INTEGER not null references ngrams(id)
);

-- Association table: which ngram is attached to which node.
create table nodes_ngrams (
    id serial PRIMARY KEY not null,
    node_id INTEGER not null references nodes(id),
    ngram_id INTEGER not null references ngrams(id)
);

-- Three nodes, with explicit ids 1 to 3.
insert into nodes (id)
values
    (1),
    (2),
    (3);

-- Eight ngrams; the serial column numbers them in insertion order.
insert into ngrams (text)
values
    ('object'),
    ('table'),
    ('animal'),
    ('cat'),
    ('dog'),
    ('other'),
    ('abc'),
    ('xyz');
-- Expected content of the table afterwards:
--select * from ngrams;
---- id |  text
--------+--------
----  1 | object
----  2 | table
----  3 | animal
----  4 | cat
----  5 | dog
----  6 | other
----  7 | abc
----  8 | xyz

-- Synonym pairs, all attached to node_id 1:
-- ngram 2 is paired with ngram 1, ngrams 4 and 5 with ngram 3.
insert into synonyms (node_id, ngram1_id, ngram2_id)
values
    (1, 1, 2),
    (1, 3, 4),
    (1, 3, 5);
-- Expected content of the table afterwards:
--select * from synonyms;
-- id | node_id | ngram1_id | ngram2_id
------+---------+-----------+-----------
--  1 |       1 |         1 |         2
--  2 |       1 |         3 |         4
--  3 |       1 |         3 |         5

-- Ngrams attached to each node:
-- node 1 has {1, 6, 2}, node 2 has {4, 5}, node 3 has {4, 5, 6}.
insert into nodes_ngrams (node_id, ngram_id)
values
    (1, 1),
    (1, 6),
    (1, 2),
    (2, 4),
    (2, 5),
    (3, 4),
    (3, 5),
    (3, 6);
-- Expected content of the table afterwards:
--select * from nodes_ngrams;
-- id | node_id | ngram_id
------+---------+----------
--  1 |       1 |        1
--  2 |       1 |        6
--  3 |       1 |        2
--  4 |       2 |        4
--  5 |       2 |        5
--  6 |       3 |        4
--  7 |       3 |        5
--  8 |       3 |        6

-- Raw co-occurrences: within each node, pair every attached ngram (n1) with
-- every attached ngram (n2) and count the resulting rows per pair.
-- Both joins go through n.id, so n1 and n2 always belong to the same node.
-- The <= filter keeps a single orientation of each pair (and the diagonal).
-- Grouping and ordering name their columns explicitly, and the ordering
-- covers both columns so that the row order is deterministic.
select n1.ngram_id, n2.ngram_id, count(*)
from nodes n
INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id
INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id
where
n1.ngram_id <= n2.ngram_id
group by n1.ngram_id, n2.ngram_id
order by n1.ngram_id ASC, n2.ngram_id ASC
;
-- Expected result with the fixture data above:
-- ngram_id | ngram_id | count
------------+----------+-------
--        1 |        1 |     1
--        1 |        2 |     1
--        1 |        6 |     1
--        2 |        2 |     1
--        2 |        6 |     1
--        4 |        4 |     2
--        4 |        5 |     2
--        4 |        6 |     1
--        5 |        5 |     2
--        5 |        6 |     1
--        6 |        6 |     2
--(11 rows)

-- Co-occurrences with synonyms grouped: same pairing as the raw query, but
-- an ngram that appears as ngram2_id in a synonyms row of node_id 1 is
-- replaced by that row's ngram1_id; ngrams without such a row are kept as is.
-- synonyms.ngram1_id is NOT NULL and references ngrams(id), so it can be used
-- directly: no extra join on ngrams is needed to obtain the replacement id.
-- Both nodes_ngrams joins go through n.id, which already guarantees that
-- n1 and n2 belong to the same node.
-- NOTE(review): the <= filter compares the raw ngram ids, before replacement,
-- so a replaced pair can come out as (a, b) with a > b if a replacement id is
-- larger than the other member of the pair -- confirm whether the pairs
-- should be put in canonical order.
select coalesce(s1.ngram1_id, n1.ngram_id), coalesce(s2.ngram1_id, n2.ngram_id), count(*)
from nodes n
INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id
LEFT JOIN synonyms s1 on n1.ngram_id = s1.ngram2_id AND s1.node_id=1

INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id
LEFT JOIN synonyms s2 on n2.ngram_id = s2.ngram2_id AND s2.node_id=1
where
n1.ngram_id <= n2.ngram_id
group by
    coalesce(s1.ngram1_id, n1.ngram_id),
    coalesce(s2.ngram1_id, n2.ngram_id)
order by
    coalesce(s1.ngram1_id, n1.ngram_id) ASC,
    coalesce(s2.ngram1_id, n2.ngram_id) ASC
;
-- Expected result with the fixture data above:
-- coalesce | coalesce | count
------------+----------+-------
--        1 |        1 |     3
--        1 |        6 |     2
--        3 |        3 |     6
--        3 |        6 |     2
--        6 |        6 |     2
--(5 rows)
--> the total counts match: both queries add up to 15 pairs
parent 6d527345
......@@ -2,33 +2,30 @@
# WARNING: to ensure consistency and retrocompatibility, lists should keep the
# initial order (ie., new elements should be appended at the end of the lists)
abstract:
---------
something between global params, constants,
configuration variables, ini file...
contents:
---------
+ db constants/ontology
+ database constants/ontology
- nodetypes
(db int <=> named types <=> python code)
+ input low-level limits
+ low-level limits
- query size
- max upload size
- doc parsing batch size
- word extraction batch size
+ process config
+ main process config
- resourcetypes config (~ input ontology)
- wordlist generation params
- graph creation params
- £TODO sequence of transformations "custom pipeline"
+ input process subclasses/subroutines
+ subprocess config
- crawling, import
- tagger services and functions
- parser services
......@@ -83,6 +80,7 @@ NODETYPES = [
# docs subset
'FAVORITES', # 15
# more scores (sorry!)
'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17
......@@ -90,6 +88,13 @@ NODETYPES = [
'RESOURCE', # 19
]
def get_nodetype_id_by_name(nodetype):
    '''nodetype :: name => nodetype id (its index in NODETYPES), None if unknown'''
    # NOTE(review): assumes NODETYPES entries are plain name strings whose
    # position in the list is the id, as the visible entries and their
    # "# 15", "# 16"... comments suggest -- confirm against the full list.
    wanted = str(nodetype)
    for nodetype_id, name in enumerate(NODETYPES):
        if str(name) == wanted:
            return nodetype_id
    # Unknown name: keep the original fall-through result.
    return None
INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db)
......@@ -154,7 +159,6 @@ INDEXED_HYPERDATA = {
# user parameters----------------------------------------
USER_LANG = ["fr", "en"]
# resources ---------------------------------------------
def get_resource(sourcetype):
'''resource :: type => resource dict'''
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment