Commit c08a3b6b authored by delanoe's avatar delanoe

[FIX] 2 MAJOR BUGS on COOC SQL QUERY

    - [OLD] Performance regression
        -> lengthening and slowing the toolchain queue
        -> e.g. 2 seconds on just 21 Europresse documents is too much
    - [OLD] Some ngrams included whereas they are not in the corpus
    + [NEW] Clarity in the query
    + [NEW] Improved: 2000 ms before less than 500 ms after (factor 4
    optimization on a very small corpus); should be ok in bigger corpora

New behavior of the query tested with real corpus and this simple
example; copy paste these lines in test.sql and run it in a test
database (createdb test).

-- let be:

-- Idempotent teardown: IF EXISTS avoids errors when running against a
-- freshly created database (createdb test), as the commit message suggests.
-- Drop order respects the foreign keys (children before parents).
drop table if exists nodes_ngrams;
drop table if exists synonyms;
drop table if exists ngrams;
drop table if exists nodes;

-- Document container: each row is one document node of the corpus.
create table nodes (
    id serial PRIMARY KEY not null
);

-- The ngram (term) dictionary.
create table ngrams (
    id serial PRIMARY KEY not null,
    text varchar(50)
);

-- Synonym groupings scoped by a grouping node (node_id):
-- ngram2 is a subform whose mainform is ngram1.
-- NB: node_id now carries the same FK discipline as the other id columns.
create table synonyms (
    id serial PRIMARY KEY not null,
    node_id INTEGER not null references nodes(id),
    ngram1_id INTEGER not null references ngrams(id),
    ngram2_id INTEGER not null references ngrams(id)
);

-- Occurrence index: which ngram appears in which document node.
create table nodes_ngrams (
    id serial PRIMARY KEY not null,
    node_id INTEGER not null references nodes(id),
    ngram_id INTEGER not null references ngrams(id)
);

-- Three document nodes (explicit ids, as in the original fixture).
insert into nodes (id)
values (1), (2), (3);

-- Eight ngrams; the serial column assigns ids 1..8 in insertion order.
insert into ngrams (text)
values ('object'), ('table'), ('animal'), ('cat'),
       ('dog'), ('other'), ('abc'), ('xyz');
--select * from ngrams;
---- id |  text
--------+--------
----  1 | object
----  2 | table
----  3 | animal
----  4 | cat
----  5 | dog
----  6 | other
----  7 | abc
----  8 | xyz

-- Synonym groups under grouping node 1:
--   mainform 1 (object) <- subform 2 (table)
--   mainform 3 (animal) <- subforms 4 (cat) and 5 (dog)
insert into synonyms (node_id, ngram1_id, ngram2_id)
values (1, 1, 2), (1, 3, 4), (1, 3, 5);
--select * from synonyms;
-- id | node_id | ngram1_id | ngram2_id
------+---------+-----------+-----------
--  1 |       1 |         1 |         2
--  2 |       1 |         3 |         4
--  3 |       1 |         3 |         5

-- Document/ngram occurrences:
--   doc 1: object(1), other(6), table(2)
--   doc 2: cat(4), dog(5)
--   doc 3: cat(4), dog(5), other(6)
insert into nodes_ngrams (node_id, ngram_id)
values (1, 1), (1, 6), (1, 2),
       (2, 4), (2, 5),
       (3, 4), (3, 5), (3, 6);
--select * from nodes_ngrams;
-- id | node_id | ngram_id
------+---------+----------
--  1 |       1 |        1
--  2 |       1 |        6
--  3 |       1 |        2
--  4 |       2 |        4
--  5 |       2 |        5
--  6 |       3 |        4
--  7 |       3 |        5
--  8 |       3 |        6

-- Raw cooccurrence counts (no synonym folding): for every document node,
-- count each pair (a, b) of its ngrams, keeping only a <= b so each
-- unordered pair is counted once (diagonal included).
select n1.ngram_id, n2.ngram_id, count(*)
from nodes n
INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id
INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id
where
n1.ngram_id <= n2.ngram_id
-- NB: n1.node_id = n2.node_id is implied, both sides join on n.id
group by n1.ngram_id, n2.ngram_id
order by n1.ngram_id, n2.ngram_id
;
-- ngram_id | ngram_id | count
------------+----------+-------
--        5 |        6 |     1
--        1 |        6 |     1
--        4 |        6 |     1
--        2 |        2 |     1
--        4 |        4 |     2
--        1 |        1 |     1
--        1 |        2 |     1
--        6 |        6 |     2
--        2 |        6 |     1
--        4 |        5 |     2
--        5 |        5 |     2
--(11 lignes)

-- Cooccurrence counts WITH synonym folding: each subform is replaced by
-- its mainform (synonyms of grouping node 1) before counting, so subform
-- occurrences are added to their mainform's counts.
select coalesce(n11.id, n1.ngram_id), coalesce(n22.id, n2.ngram_id), count(*)
from nodes n
INNER JOIN nodes_ngrams n1 ON n1.node_id = n.id
-- fold side A to its mainform when it is a subform in grouping node 1
LEFT JOIN synonyms s1 on n1.ngram_id = s1.ngram2_id AND s1.node_id = 1
LEFT JOIN ngrams n11 on s1.ngram1_id = n11.id

INNER JOIN nodes_ngrams n2 ON n2.node_id = n.id
-- fold side B likewise
LEFT JOIN synonyms s2 on n2.ngram_id = s2.ngram2_id AND s2.node_id = 1
LEFT JOIN ngrams n22 on s2.ngram1_id = n22.id
where
n1.ngram_id <= n2.ngram_id
AND
-- redundant (both sides already join on n.id) but kept as an explicit guard
n1.node_id = n2.node_id
group by coalesce(n11.id, n1.ngram_id), coalesce(n22.id, n2.ngram_id)
order by coalesce(n11.id, n1.ngram_id), coalesce(n22.id, n2.ngram_id)
;
-- coalesce | coalesce | count
------------+----------+-------
--        1 |        6 |     2
--        3 |        3 |     6
--        1 |        1 |     3
--        3 |        6 |     2
--        6 |        6 |     2
--(5 lignes)
--> the counted sums match (totals are consistent)
parent 6d527345
...@@ -2,33 +2,30 @@ ...@@ -2,33 +2,30 @@
# WARNING: to ensure consistency and retrocompatibility, lists should keep the # WARNING: to ensure consistency and retrocompatibility, lists should keep the
# initial order (ie., new elements should be appended at the end of the lists) # initial order (ie., new elements should be appended at the end of the lists)
abstract: abstract:
--------- ---------
something between global params, constants, something between global params, constants,
configuration variables, ini file... configuration variables, ini file...
contents: contents:
--------- ---------
+ database constants/ontology
+ db constants/ontology
- nodetypes - nodetypes
(db int <=> named types <=> python code) (db int <=> named types <=> python code)
+ input low-level limits + low-level limits
- query size - query size
- max upload size - max upload size
- doc parsing batch size - doc parsing batch size
- word extraction batch size - word extraction batch size
+ process config + main process config
- resourcetypes config (~ input ontology) - resourcetypes config (~ input ontology)
- wordlist generation params - wordlist generation params
- graph creation params - graph creation params
- £TODO sequence of transformations "custom pipeline" - £TODO sequence of transformations "custom pipeline"
+ input process subclasses/subroutines + subprocess config
- crawling, import - crawling, import
- tagger services and functions - tagger services and functions
- parser services - parser services
...@@ -83,6 +80,7 @@ NODETYPES = [ ...@@ -83,6 +80,7 @@ NODETYPES = [
# docs subset # docs subset
'FAVORITES', # 15 'FAVORITES', # 15
# more scores (sorry!) # more scores (sorry!)
'TIRANK-LOCAL', # 16 'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17 'TIRANK-GLOBAL', # 17
...@@ -90,6 +88,13 @@ NODETYPES = [ ...@@ -90,6 +88,13 @@ NODETYPES = [
'RESOURCE', # 19 'RESOURCE', # 19
] ]
def get_nodetype_id_by_name(nodetype):
    '''nodetype :: name (str) => integer id, i.e. its index in NODETYPES.

    Returns None when the name is unknown.

    Fixes: the previous body referenced an undefined name (`sourcename`,
    a NameError on first use), subscripted the plain-string NODETYPES
    entries with n["name"], and returned the entry instead of the id
    promised by the function name.
    '''
    # NODETYPES is a flat list of names whose position is the db integer id
    # (cf. NODETYPES.index('DOCUMENT') usage elsewhere in this module).
    for type_id, name in enumerate(NODETYPES):
        if str(name) == str(nodetype):
            return type_id
    return None
INDEXED_HYPERDATA = { INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing # TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db) # (type, convert_to_db, convert_from_db)
...@@ -154,7 +159,6 @@ INDEXED_HYPERDATA = { ...@@ -154,7 +159,6 @@ INDEXED_HYPERDATA = {
# user parameters---------------------------------------- # user parameters----------------------------------------
USER_LANG = ["fr", "en"] USER_LANG = ["fr", "en"]
# resources --------------------------------------------- # resources ---------------------------------------------
def get_resource(sourcetype): def get_resource(sourcetype):
'''resource :: type => resource dict''' '''resource :: type => resource dict'''
......
...@@ -7,7 +7,7 @@ from sqlalchemy import create_engine ...@@ -7,7 +7,7 @@ from sqlalchemy import create_engine
from gargantext.util.lists import WeightedMatrix from gargantext.util.lists import WeightedMatrix
# from gargantext.util.db import session, aliased, func # from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD from gargantext.constants import DEFAULT_COOC_THRESHOLD, NODETYPES
from gargantext.constants import INDEXED_HYPERDATA from gargantext.constants import INDEXED_HYPERDATA
from gargantext.util.tools import datetime, convert_to_date from gargantext.util.tools import datetime, convert_to_date
...@@ -53,9 +53,9 @@ def compute_coocs( corpus, ...@@ -53,9 +53,9 @@ def compute_coocs( corpus,
- groupings_id: optional synonym relations to add all subform counts - groupings_id: optional synonym relations to add all subform counts
with their mainform's counts with their mainform's counts
- on_list_id: mainlist or maplist type, to constrain the input ngrams - on_list_id: mainlist or maplist type, to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams - TODO stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is already provided) (normally unnecessary if a mainlist is already provided)
- start, end: provide one or both temporal limits to filter on doc date - TODO start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow (string is also possible but format must follow
this convention: "2001-01-01" aka "%Y-%m-%d") this convention: "2001-01-01" aka "%Y-%m-%d")
...@@ -72,183 +72,160 @@ def compute_coocs( corpus, ...@@ -72,183 +72,160 @@ def compute_coocs( corpus,
connection = engine.connect() connection = engine.connect()
# string vars for our SQL query # string vars for our SQL query
sql_statement = "" # setting work memory high to improve cache perf.
doc_idx_statement = "" final_sql = "set work_mem='1GB'; \n"
# where
# final_sql = cooc_sql + select_cooc_sql
cooc_sql = ""
select_cooc_sql = ""
# where
# cooc_sql = cooc_sql + ngram_filter_A_sql + ngram_filter + cooc_filter_sql
cooc_filter_sql = ""
ngram_filter_A_sql = ""
ngram_filter_B_sql = ""
# 2a) prepare the document selection (normal case) # 2a) prepare the document selection (normal case)
doc_idx_statement = """ cooc_sql += """
SELECT node_id, ngram_id WITH COOC as (
FROM nodes_ngrams SELECT
JOIN nodes COALESCE(grA.ngram1_id, wlA.ngram_id) as ngA,
ON node_id = nodes.id COALESCE(grB.ngram1_id, wlB.ngram_id) as ngB,
WHERE nodes.parent_id = {corpus_id} COUNT(*) AS score
AND nodes.typename = 4 FROM
""".format(corpus_id=corpus.id) nodes AS n
-- / \
-- X Y
# 2b) same if document filters -- SQL graph for getting the cooccurrences
if start or end: """
date_type_id = INDEXED_HYPERDATA['publication_date']['id']
doc_idx_statement = """
SELECT node_id, ngram_id
FROM nodes_ngrams
JOIN nodes
ON node_id = nodes.id
-- preparing for date filter (1/2)
JOIN nodes_hyperdata
ON nodes_hyperdata.node_id = nodes_ngrams.node_id
WHERE nodes.parent_id = {corpus_id}
AND nodes.typename = 4
-- preparing for date filter (2/2)
AND nodes_hyperdata.key = {date_type_id}
""".format(corpus_id=corpus.id, date_type_id = date_type_id)
if start:
if not isinstance(start, datetime):
try:
start = datetime.strptime(start, '%Y-%m-%d')
except:
raise TypeError("'start' param expects datetime object or %%Y-%%m-%%d string")
# datetime object ~> date db formatted filter (2013-09-16 00:00:00+02)
start_filter = "AND nodes_hyperdata.value_utc >= %s::date" % start.strftime('%Y-%m-%d %H:%M:%S%z')
# the filtering by start limit
doc_idx_statement += "\n" + start_filter
if end:
if not isinstance(end, datetime):
try:
end = datetime.strptime(end, '%Y-%m-%d')
except:
raise TypeError("'end' param expects datetime object or %%Y-%%m-%%d string")
# datetime object ~> date db formatted filter
end_filter = "AND nodes_hyperdata.value_utc <= %s::date" % end.strftime('%Y-%m-%d %H:%M:%S%z')
# the filtering by end limit
doc_idx_statement += "\n" + end_filter
# 4) prepare the synonyms
if groupings_id:
syn_statement = """
SELECT * FROM nodes_ngrams_ngrams
WHERE node_id = {groupings_id}
""".format(groupings_id = groupings_id)
# 5a) MAIN DB QUERY SKELETON (no groupings) -------------------------------- # 2b) stating the filters
if not groupings_id: cooc_filter_sql = """
sql_statement = """ WHERE
SELECT cooc.* n.typename = {nodetype_id}
FROM ( AND n.parent_id = {corpus_id}
SELECT idxA.ngram_id AS ngA, GROUP BY 1,2
idxB.ngram_id AS ngB, -- ==
count((idxA.ngram_id, -- GROUP BY ngA, ngB
idxB.ngram_id)) AS cwei )
-- read doc index x 2 """.format( nodetype_id = NODETYPES.index('DOCUMENT')
FROM ({doc_idx}) AS idxA , corpus_id=corpus.id
JOIN ({doc_idx}) AS idxB )
-- cooc <=> in same doc node
ON idxA.node_id = idxB.node_id # 3) taking the cooccurrences of ngram x2
ngram_filter_A_sql += """
GROUP BY ((idxA.ngram_id,idxB.ngram_id)) -- STEP 1: X axis of the matrix
) AS cooc INNER JOIN nodes_ngrams
""".format(doc_idx = doc_idx_statement) AS ngA ON ngA.node_id = n.id
# -------------------------------------------------------------------------- -- \--> get the occurrences node/ngram of the corpus
"""
# 5b) MAIN DB QUERY SKELETON (with groupings)
# groupings: we use additional Translation (synonyms) for ngA and ngB ngram_filter_B_sql += """
else: -- STEP 2: Y axis of the matrix
sql_statement = """ INNER JOIN nodes_ngrams
SELECT cooc.* AS ngB ON ngB.node_id = n.id
FROM ( -- \--> get the occurrences node/ngram of the corpus
SELECT COALESCE(synA.ngram1_id, idxA.ngram_id) AS ngA, """
COALESCE(synB.ngram1_id, idxB.ngram_id) AS ngB,
count((COALESCE(synA.ngram1_id, idxA.ngram_id), # 3) filter with lists (white or stop)
COALESCE(synB.ngram1_id, idxB.ngram_id))) AS cwei # on whiteList
-- read doc index x 2
FROM ({doc_idx}) AS idxA
JOIN ({doc_idx}) AS idxB
-- cooc <=> in same doc node
ON idxA.node_id = idxB.node_id
-- when idxA.ngram_id is a subform
LEFT JOIN ({synonyms}) as synA
ON synA.ngram2_id = idxA.ngram_id
-- when idxB.ngram_id is a subform
LEFT JOIN ({synonyms}) as synB
ON synB.ngram2_id = idxB.ngram_id
GROUP BY (COALESCE(synA.ngram1_id, idxA.ngram_id),
COALESCE(synB.ngram1_id, idxB.ngram_id))
) AS cooc
""".format(doc_idx = doc_idx_statement,
synonyms = syn_statement)
# 6) prepare 2 x node_ngrams alias if whitelist
if on_list_id: if on_list_id:
sql_statement +=""" ngram_filter_A_sql += """
JOIN nodes_ngrams AS whitelistA INNER JOIN nodes_ngrams
ON whitelistA.ngram_id = cooc.ngA AS wlA ON ngA.ngram_id = wlA.ngram_id
AND wlA.node_id = {wla_node_id}
-- \--> filter with white/main list
""".format(wla_node_id = on_list_id)
ngram_filter_B_sql += """
INNER JOIN nodes_ngrams
AS wlB ON ngB.ngram_id = wlB.ngram_id
AND wlB.node_id = {wlb_node_id}
-- \--> filter with white/main list
""".format(wlb_node_id = on_list_id)
# on stopList
# TODO NOT TESTED
if stoplist_id:
raise("Stoplist not tested yet")
ngram_filter_A_sql += """
LEFT JOIN nodes_ngrams
AS stA ON ngA.ngram_id = stA.ngram_id
AND stA.node_id = {sta_node_id}
AND stA.ngram_id IS NULL
-- \--> filter with stop list
""".format(sta_node_id = stoplist_id)
ngram_filter_B_sql += """
LEFT JOIN nodes_ngrams
AS stB ON ngB.ngram_id = stB.ngram_id
AND stB.node_id = {stb_node_id}
AND stB.ngram_id IS NULL
-- \--> filter with white/main list
""".format(stb_node_id = stoplist_id)
JOIN nodes_ngrams AS whitelistB
ON whitelistB.ngram_id = cooc.ngB
"""
if stoplist_id: # 4) prepare the synonyms
# used for reverse join if groupings_id:
sql_statement +=""" ngram_filter_A_sql += """
LEFT JOIN ( LEFT JOIN nodes_ngrams_ngrams
SELECT * FROM nodes_ngrams AS grA ON wlA.ngram_id = grA.ngram1_id
WHERE nodes_ngrams.node_id = %i AND grA.node_id = {groupings_id}
) AS stoplistA -- \--> adding (joining) ngrams that are grouped
ON stoplistA.ngram_id = cooc.ngA LEFT JOIN nodes_ngrams
AS wlAA ON grA.ngram2_id = wlAA.id
LEFT JOIN ( AND wlA.node_id = wlA.node_id
SELECT * FROM nodes_ngrams -- \--> adding (joining) ngrams that are not grouped
WHERE nodes_ngrams.node_id = %i --LEFT JOIN ngrams AS wlAA ON grA.ngram2_id = wlAA.id
) AS stoplistB -- \--> for joining all synonyms even if they are not in the main list (white list)
ON stoplistA.ngram_id = cooc.ngA
""" % (stoplist_id, stoplist_id)
# 7) FILTERS
# the inclusive threshold filter is always here """.format(groupings_id = groupings_id)
sql_statement += "\n WHERE cooc.cwei >= %i" % threshold
ngram_filter_B_sql += """
LEFT JOIN nodes_ngrams_ngrams
AS grB ON wlB.ngram_id = grB.ngram1_id
AND grB.node_id = {groupings_id}
-- \--> adding (joining) ngrams that are grouped
LEFT JOIN nodes_ngrams
AS wlBB ON grB.ngram2_id = wlBB.id
AND wlB.node_id = wlB.node_id
-- \--> adding (joining) ngrams that are not grouped
-- LEFT JOIN ngrams AS wlBB ON grB.ngram2_id = wlBB.id
-- \--> for joining all synonyms even if they are not in the main list (white list)
""".format(groupings_id = groupings_id)
# the optional whitelist perimeters
if on_list_id:
sql_statement += "\n AND whitelistA.node_id = %i" % on_list_id
sql_statement += "\n AND whitelistB.node_id = %i" % on_list_id
if stoplist_id: # 5) Build the main COOC query
sql_statement += "\n AND stoplistA.ngram_id IS NULL" cooc_sql += ngram_filter_A_sql + ngram_filter_B_sql + cooc_filter_sql
sql_statement += "\n AND stoplistB.ngram_id IS NULL"
# 6) FILTERS
select_cooc_sql = """
SELECT ngA, ngB, score
FROM COOC --> from the query above
"""
# the inclusive threshold filter is always here
select_cooc_sql += "\n WHERE score >= %i" % threshold
# don't compute ngram with itself # don't compute ngram with itself
# NB: this option is bad for main toolchain # NB: this option is bad for main toolchain
if diagonal_filter: if diagonal_filter:
sql_statement += "\n AND ngA != ngB" select_cooc_sql += "\n AND ngA != ngB"
# 1 filtre tenant en compte de la symétrie # 1 filtre tenant en compte de la symétrie
# NB: this option is also bad for main toolchain # NB: this option is also bad for main toolchain
if symmetry_filter: if symmetry_filter:
sql_statement += "\n AND ngA <= ngB" select_cooc_sql += "\n AND ngA <= ngB"
# 7) Building the final query
final_sql += cooc_sql + select_cooc_sql
# 6) EXECUTE QUERY # 6) EXECUTE QUERY
# ---------------- # ----------------
# debug # debug
print(sql_statement) print(final_sql)
# executing the SQL statement # executing the SQL statement
results = connection.execute(sql_statement) results = connection.execute(final_sql)
# => storage in our matrix structure # => storage in our matrix structure
matrix = WeightedMatrix(results) matrix = WeightedMatrix(results)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment