[FIX] 2 MAJOR BUGS on COOC SQL QUERY

- [OLD] Performance regression -> lengthening and slowing the toolchain queue (2 seconds on 21 Europresse documents is too much, for instance) - [OLD] Some ngrams were included although they are not in the corpus + [NEW] Clarity in the query + [NEW] Improved: 2000 ms before, less than 500 ms after (factor 4 optimization on a very small corpus); should be OK on bigger corpora. New behavior of the query tested with a real corpus and this simple example; copy-paste these lines into test.sql and run it in a test database (createdb test). -- let be: drop table nodes_ngrams; drop table synonyms; drop table ngrams; drop table nodes; create table nodes ( id serial PRIMARY KEY not null ); create table ngrams ( id serial PRIMARY KEY not null, text varchar(50) ); create table synonyms ( id serial PRIMARY KEY not null, node_id INTEGER not null, ngram1_id INTEGER not null references ngrams(id), ngram2_id INTEGER not null references ngrams(id) ); create table nodes_ngrams ( id serial PRIMARY KEY not null, node_id INTEGER not null references nodes(id), ngram_id INTEGER not null references ngrams(id) ); insert into nodes (id) values(1); insert into nodes (id) values(2); insert into nodes (id) values(3); insert into ngrams (text) values('object'); insert into ngrams (text) values('table'); insert into ngrams (text) values('animal'); insert into ngrams (text) values('cat'); insert into ngrams (text) values('dog'); insert into ngrams (text) values('other'); insert into ngrams (text) values('abc'); insert into ngrams (text) values('xyz'); --select * from ngrams; ---- id | text --------+-------- ---- 1 | object ---- 2 | table ---- 3 | animal ---- 4 | cat ---- 5 | dog ---- 6 | other ---- 7 | abc ---- 8 | xyz insert into synonyms (node_id,ngram1_id,ngram2_id) values(1,1,2); insert into synonyms (node_id,ngram1_id,ngram2_id) values(1,3,4); insert into synonyms (node_id,ngram1_id,ngram2_id) values(1,3,5); --select * from synonyms; -- id | node_id | ngram1_id | ngram2_id 
------+---------+-----------+----------- -- 1 | 1 | 1 | 2 -- 2 | 1 | 3 | 4 -- 3 | 1 | 3 | 5 insert into nodes_ngrams (node_id, ngram_id) values(1,1); insert into nodes_ngrams (node_id, ngram_id) values(1,6); insert into nodes_ngrams (node_id, ngram_id) values(1,2); insert into nodes_ngrams (node_id, ngram_id) values(2,4); insert into nodes_ngrams (node_id, ngram_id) values(2,5); insert into nodes_ngrams (node_id, ngram_id) values(3,4); insert into nodes_ngrams (node_id, ngram_id) values(3,5); insert into nodes_ngrams (node_id, ngram_id) values(3,6); --select * from nodes_ngrams; -- id | node_id | ngram_id ------+---------+---------- -- 1 | 1 | 1 -- 2 | 1 | 6 -- 3 | 1 | 2 -- 4 | 2 | 4 -- 5 | 2 | 5 -- 6 | 3 | 4 -- 7 | 3 | 5 -- 8 | 3 | 6 select n1.ngram_id, n2.ngram_id, count(*) from nodes n INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id where n1.ngram_id <= n2.ngram_id --AND --n1.node_id = n2.node_id group by 1,2 order BY n1.ngram_id ASC ; -- ngram_id | ngram_id | count ------------+----------+------- -- 5 | 6 | 1 -- 1 | 6 | 1 -- 4 | 6 | 1 -- 2 | 2 | 1 -- 4 | 4 | 2 -- 1 | 1 | 1 -- 1 | 2 | 1 -- 6 | 6 | 2 -- 2 | 6 | 1 -- 4 | 5 | 2 -- 5 | 5 | 2 --(11 lignes) select coalesce(n11.id, n1.ngram_id), coalesce(n22.id,n2.ngram_id), count(*) from nodes n INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id LEFT JOIN synonyms s1 on n1.ngram_id = s1.ngram2_id AND s1.node_id=1 LEFT JOIN ngrams n11 on s1.ngram1_id = n11.id INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id LEFT JOIN synonyms s2 on n2.ngram_id = s2.ngram2_id AND s2.node_id=1 LEFT JOIN ngrams n22 on s2.ngram1_id = n22.id where n1.ngram_id <= n2.ngram_id AND n1.node_id = n2.node_id group by 1,2 ; -- coalesce | coalesce | count ------------+----------+------- -- 1 | 6 | 2 -- 3 | 3 | 6 -- 1 | 1 | 3 -- 3 | 6 | 2 -- 6 | 6 | 2 --(5 lignes) --> les sommes comptées correspondent

[FIX] 2 MAJOR BUGS on COOC SQL QUERY
- [OLD] Performance regression -> lengthening and slowing the toolchain queue (2 seconds on 21 Europresse documents is too much, for instance) - [OLD] Some ngrams were included although they are not in the corpus + [NEW] Clarity in the query + [NEW] Improved: 2000 ms before, less than 500 ms after (factor 4 optimization on a very small corpus); should be OK on bigger corpora. New behavior of the query tested with a real corpus and this simple example; copy-paste these lines into test.sql and run it in a test database (createdb test). -- let be: drop table nodes_ngrams; drop table synonyms; drop table ngrams; drop table nodes; create table nodes ( id serial PRIMARY KEY not null ); create table ngrams ( id serial PRIMARY KEY not null, text varchar(50) ); create table synonyms ( id serial PRIMARY KEY not null, node_id INTEGER not null, ngram1_id INTEGER not null references ngrams(id), ngram2_id INTEGER not null references ngrams(id) ); create table nodes_ngrams ( id serial PRIMARY KEY not null, node_id INTEGER not null references nodes(id), ngram_id INTEGER not null references ngrams(id) ); insert into nodes (id) values(1); insert into nodes (id) values(2); insert into nodes (id) values(3); insert into ngrams (text) values('object'); insert into ngrams (text) values('table'); insert into ngrams (text) values('animal'); insert into ngrams (text) values('cat'); insert into ngrams (text) values('dog'); insert into ngrams (text) values('other'); insert into ngrams (text) values('abc'); insert into ngrams (text) values('xyz'); --select * from ngrams; ---- id | text --------+-------- ---- 1 | object ---- 2 | table ---- 3 | animal ---- 4 | cat ---- 5 | dog ---- 6 | other ---- 7 | abc ---- 8 | xyz insert into synonyms (node_id,ngram1_id,ngram2_id) values(1,1,2); insert into synonyms (node_id,ngram1_id,ngram2_id) values(1,3,4); insert into synonyms (node_id,ngram1_id,ngram2_id) values(1,3,5); --select * from synonyms; -- id | node_id | ngram1_id | ngram2_id 
------+---------+-----------+----------- -- 1 | 1 | 1 | 2 -- 2 | 1 | 3 | 4 -- 3 | 1 | 3 | 5 insert into nodes_ngrams (node_id, ngram_id) values(1,1); insert into nodes_ngrams (node_id, ngram_id) values(1,6); insert into nodes_ngrams (node_id, ngram_id) values(1,2); insert into nodes_ngrams (node_id, ngram_id) values(2,4); insert into nodes_ngrams (node_id, ngram_id) values(2,5); insert into nodes_ngrams (node_id, ngram_id) values(3,4); insert into nodes_ngrams (node_id, ngram_id) values(3,5); insert into nodes_ngrams (node_id, ngram_id) values(3,6); --select * from nodes_ngrams; -- id | node_id | ngram_id ------+---------+---------- -- 1 | 1 | 1 -- 2 | 1 | 6 -- 3 | 1 | 2 -- 4 | 2 | 4 -- 5 | 2 | 5 -- 6 | 3 | 4 -- 7 | 3 | 5 -- 8 | 3 | 6 select n1.ngram_id, n2.ngram_id, count(*) from nodes n INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id where n1.ngram_id <= n2.ngram_id --AND --n1.node_id = n2.node_id group by 1,2 order BY n1.ngram_id ASC ; -- ngram_id | ngram_id | count ------------+----------+------- -- 5 | 6 | 1 -- 1 | 6 | 1 -- 4 | 6 | 1 -- 2 | 2 | 1 -- 4 | 4 | 2 -- 1 | 1 | 1 -- 1 | 2 | 1 -- 6 | 6 | 2 -- 2 | 6 | 1 -- 4 | 5 | 2 -- 5 | 5 | 2 --(11 lignes) select coalesce(n11.id, n1.ngram_id), coalesce(n22.id,n2.ngram_id), count(*) from nodes n INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id LEFT JOIN synonyms s1 on n1.ngram_id = s1.ngram2_id AND s1.node_id=1 LEFT JOIN ngrams n11 on s1.ngram1_id = n11.id INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id LEFT JOIN synonyms s2 on n2.ngram_id = s2.ngram2_id AND s2.node_id=1 LEFT JOIN ngrams n22 on s2.ngram1_id = n22.id where n1.ngram_id <= n2.ngram_id AND n1.node_id = n2.node_id group by 1,2 ; -- coalesce | coalesce | count ------------+----------+------- -- 1 | 6 | 2 -- 3 | 3 | 6 -- 1 | 1 | 3 -- 3 | 6 | 2 -- 6 | 6 | 2 --(5 lignes) --> les sommes comptées correspondent
c08a3b6b · delanoe · 6d527345 · c08a3b6b · c08a3b6b
Commit c08a3b6b authored Jan 18, 2017 by delanoe
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 144 additions and 163 deletions

constants.py gargantext/constants.py +12 -8

ngram_coocs.py gargantext/util/toolchain/ngram_coocs.py +132 -155

No files found.
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -2,33 +2,30 @@
 # WARNING: to ensure consistency and retrocompatibility, lists should keep the
 #   initial order (ie., new elements should be appended at the end of the lists)
 abstract:
 ---------
         something between global params, constants,
         configuration variables, ini file...
 contents:
 ---------
+      + database constants/ontology
-      + db constants/ontology
         - nodetypes
            (db int <=> named types <=> python code)
-      + input low-level limits
+      + low-level limits
         - query size
         - max upload size
         - doc parsing batch size
         - word extraction batch size
-      + process config
+      + main process config
         - resourcetypes config (~ input ontology)
         - wordlist generation params
         - graph creation params
         - £TODO sequence of transformations "custom pipeline"
-      + input process subclasses/subroutines
+      + subprocess config
         - crawling, import
         - tagger services and functions
         - parser services
@@ -83,6 +80,7 @@ NODETYPES = [
    # docs subset
    'FAVORITES',             # 15
    # more scores (sorry!)
    'TIRANK-LOCAL',          # 16
    'TIRANK-GLOBAL',         # 17
@@ -90,6 +88,13 @@ NODETYPES = [
    'RESOURCE',              # 19
 ]
+def get_nodetype_id_by_name(nodetype):
+    '''nodetype :: name => db int id (position in NODETYPES), None if unknown'''
+    # The NODETYPES entries visible here are plain strings whose list position
+    # is their id (see the "# 15".."# 19" index comments), so look the name up
+    # by position instead of indexing each entry like a dict.
+    for nodetype_id, name in enumerate(NODETYPES):
+        # compare against the actual parameter (the previous code referenced
+        # an undefined `sourcename`, which raised NameError on every call)
+        if str(name) == str(nodetype):
+            return nodetype_id
+    return None
 INDEXED_HYPERDATA = {
    # TODO use properties during toolchain.hyperdata_indexing
    # (type, convert_to_db, convert_from_db)
@@ -154,7 +159,6 @@ INDEXED_HYPERDATA = {
 # user parameters----------------------------------------
 USER_LANG = ["fr", "en"]
 # resources ---------------------------------------------
 def get_resource(sourcetype):
    '''resource :: type => resource dict'''

--- a/gargantext/util/toolchain/ngram_coocs.py
+++ b/gargantext/util/toolchain/ngram_coocs.py