• delanoe's avatar
    [FIX] 2 MAJOR BUGS on COOC SQL QUERY · c08a3b6b
    delanoe authored
        - [OLD] Performance regression
            -> lengthening and slowing the toolchain queue
            -> 2 secondes on 21 Europresse, documents is too much for instance)
        - [OLD] Some ngrams included whereas there are not in the corpus
        + [NEW] Clarity in the query
        + [NEW] Improved: 2000 ms before less than 500 ms after (factor 4
        optimization on a very small corpus); should be ok in bigger corpora
    
    New behavior of the query tested with real corpus and this simple
    example; copy paste these lines in test.sql and run it in a test
    database (createdb test).
    
    -- let be:
    
    drop table nodes_ngrams;
    drop table synonyms;
    drop table ngrams;
    drop table nodes;
    
    create table nodes (
        id serial PRIMARY KEY not null
    );
    
    create table ngrams (
        id serial PRIMARY KEY not null,
        text varchar(50)
    );
    
    create table synonyms (
        id serial PRIMARY KEY not null,
        node_id INTEGER not null,
        ngram1_id INTEGER not null references ngrams(id),
        ngram2_id INTEGER not null references ngrams(id)
    );
    
    create table nodes_ngrams (
        id serial PRIMARY KEY not null,
        node_id INTEGER not null references nodes(id),
        ngram_id INTEGER not null references ngrams(id)
    );
    
    insert into nodes (id) values(1);
    insert into nodes (id) values(2);
    insert into nodes (id) values(3);
    
    insert into ngrams (text) values('object');
    insert into ngrams (text) values('table');
    insert into ngrams (text) values('animal');
    insert into ngrams (text) values('cat');
    insert into ngrams (text) values('dog');
    insert into ngrams (text) values('other');
    insert into ngrams (text) values('abc');
    insert into ngrams (text) values('xyz');
    --select * from ngrams;
    ---- id |  text
    --------+--------
    ----  1 | object
    ----  2 | table
    ----  3 | animal
    ----  4 | cat
    ----  5 | dog
    ----  6 | other
    ----  7 | abc
    ----  8 | xyz
    
    insert into synonyms (node_id,ngram1_id,ngram2_id) values(1,1,2);
    insert into synonyms (node_id,ngram1_id,ngram2_id) values(1,3,4);
    insert into synonyms (node_id,ngram1_id,ngram2_id) values(1,3,5);
    --select * from synonyms;
    -- id | node_id | ngram1_id | ngram2_id
    ------+---------+-----------+-----------
    --  1 |       1 |         1 |         2
    --  2 |       1 |         3 |         4
    --  3 |       1 |         3 |         5
    
    insert into nodes_ngrams (node_id, ngram_id) values(1,1);
    insert into nodes_ngrams (node_id, ngram_id) values(1,6);
    insert into nodes_ngrams (node_id, ngram_id) values(1,2);
    insert into nodes_ngrams (node_id, ngram_id) values(2,4);
    insert into nodes_ngrams (node_id, ngram_id) values(2,5);
    insert into nodes_ngrams (node_id, ngram_id) values(3,4);
    insert into nodes_ngrams (node_id, ngram_id) values(3,5);
    insert into nodes_ngrams (node_id, ngram_id) values(3,6);
    --select * from nodes_ngrams;
    -- id | node_id | ngram_id
    ------+---------+----------
    --  1 |       1 |        1
    --  2 |       1 |        6
    --  3 |       1 |        2
    --  4 |       2 |        4
    --  5 |       2 |        5
    --  6 |       3 |        4
    --  7 |       3 |        5
    --  8 |       3 |        6
    
    select n1.ngram_id, n2.ngram_id, count(*)
    from nodes n
    INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id
    INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id
    where
    n1.ngram_id <= n2.ngram_id
    --AND
    --n1.node_id = n2.node_id
    group by 1,2
    order BY n1.ngram_id ASC
    ;
    -- ngram_id | ngram_id | count
    ------------+----------+-------
    --        5 |        6 |     1
    --        1 |        6 |     1
    --        4 |        6 |     1
    --        2 |        2 |     1
    --        4 |        4 |     2
    --        1 |        1 |     1
    --        1 |        2 |     1
    --        6 |        6 |     2
    --        2 |        6 |     1
    --        4 |        5 |     2
    --        5 |        5 |     2
    --(11 lignes)
    
    select coalesce(n11.id, n1.ngram_id), coalesce(n22.id,n2.ngram_id), count(*)
    from nodes n
    INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id
    LEFT JOIN synonyms s1 on n1.ngram_id = s1.ngram2_id AND s1.node_id=1
    LEFT JOIN ngrams n11 on s1.ngram1_id = n11.id
    
    INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id
    LEFT JOIN synonyms s2 on n2.ngram_id = s2.ngram2_id AND s2.node_id=1
    LEFT JOIN ngrams n22 on s2.ngram1_id = n22.id
    where
    n1.ngram_id <= n2.ngram_id
    AND
    n1.node_id = n2.node_id
    group by 1,2
    ;
    -- coalesce | coalesce | count
    ------------+----------+-------
    --        1 |        6 |     2
    --        3 |        3 |     6
    --        1 |        1 |     3
    --        3 |        6 |     2
    --        6 |        6 |     2
    --(5 lignes)
    --> les sommes comptées correspondent
    c08a3b6b
Name
Last commit
Last update
annotations Loading commit data...
docs Loading commit data...
gargantext Loading commit data...
graph Loading commit data...
install Loading commit data...
moissonneurs Loading commit data...
static Loading commit data...
templates Loading commit data...
unittests Loading commit data...
uploads Loading commit data...
.gitignore Loading commit data...
CHANGELOG.md Loading commit data...
CODE_OF_CONDUCT.md Loading commit data...
LICENSE Loading commit data...
README.md Loading commit data...
TODO.md Loading commit data...
create_doc.sh Loading commit data...
dbmigrate.py Loading commit data...
gargantext.ini Loading commit data...
init_accounts.py Loading commit data...
manage.py Loading commit data...
mkdocs.yml Loading commit data...
start_celery Loading commit data...
start_uwsgi Loading commit data...