Commit 3b2d568c authored by Romain Loth

add groups to ngram_coocs + fix date params + fix stoplist param + remove sql IN operators there

parent 92d5dfcd
@@ -111,15 +111,11 @@ def parse_extract_indexhyperdata(corpus):
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
# ------------
# -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
occ_id = compute_occs(corpus, groupings_id = group_id)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# ------------
# -> write local tfidf similarities to Node and NodeNodeNgram
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
tirank_id = compute_ti_ranking(corpus,
groupings_id = group_id,
@@ -132,13 +132,21 @@ def parse_extract_indexhyperdata(corpus):
stoplist_id = stop_id)
print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
# -> write local tfidf similarities to Node and NodeNodeNgram
# TODO only on mainlist
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# => used for doc <=> ngram association
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus, cooc_id=cooc_id)
spec_id = compute_specificity(corpus, cooc_id=cooc_id
# ,groupings_id = group_id
)
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram)
@@ -65,6 +65,9 @@ def do_mainlist(corpus,
ordered_filtered_tfidf = (session
.query(NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == ranking_scores_id)
# NOT IN but speed theoretically ok here
# see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
# but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
.filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
.order_by(desc(NodeNodeNgram.score))
)
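Side note on the anti-join alternative discussed in the comments above: a minimal SQLAlchemy sketch of the LEFT JOIN ... IS NULL form (not part of this commit; it assumes the same stoplist_id and a stop_sub subquery built from NodeNgram):

    # sketch: LEFT OUTER JOIN ... IS NULL instead of NOT IN (hypothetical)
    stop_sub = (session.query(NodeNgram.ngram_id)
                       .filter(NodeNgram.node_id == stoplist_id)
                       .subquery())
    ordered_filtered_tfidf = (session
        .query(NodeNodeNgram.ngram_id)
        .filter(NodeNodeNgram.node1_id == ranking_scores_id)
        .outerjoin(stop_sub, stop_sub.c.ngram_id == NodeNodeNgram.ngram_id)
        .filter(stop_sub.c.ngram_id == None)   # <- anti-join: keep non-stopterms
        .order_by(desc(NodeNodeNgram.score))
    )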
@@ -63,7 +63,6 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
# ------------
# (the occurrences are the sums for each ngram's mainform)
else:
print ("gtoup mode")
# sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
syn = (session.query(NodeNgramNgram.ngram1_id,
NodeNgramNgram.ngram2_id)
from gargantext.models import Node, NodeNgram, NodeNgramNgram, \
NodeHyperdata
NodeHyperdata, Ngram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD
from datetime import datetime
from sqlalchemy.sql.expression import case # choice depending on whether the ngram has a mainform or not
def compute_coocs( corpus,
overwrite_id = None,
threshold = DEFAULT_COOC_THRESHOLD,
groupings_id = None,
mainlist_id = None,
stoplist_id = None,
start = None,
@@ -41,9 +44,11 @@ def compute_coocs( corpus,
- overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
(all hyperdata and previous NodeNgramNgram rows will be replaced)
- threshold: on output cooc count (previously called hapax)
- groupings_id: optional synonym relations to add all subform counts
to their mainform's counts
- mainlist_id: mainlist to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is provided)
(normally unnecessary if a mainlist is already provided)
- start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
@@ -56,25 +61,24 @@ def compute_coocs( corpus,
basic idea for one doc
======================
each pair of ngrams sharing same doc (node_id)
SELECT idx1.ngram_id, idx2.ngram_id
FROM nodes_ngrams AS idx1, nodes_ngrams AS idx2
SELECT idxa.ngram_id, idxb.ngram_id
FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
---------------------------------
WHERE idx1.node_id = idx2.node_id <== that's cooc
WHERE idxa.node_id = idxb.node_id <== that's cooc
---------------------------------
AND idx1.ngram_id <> idx2.ngram_id
AND idx1.node_id = MY_DOC ;
AND idxa.ngram_id <> idxb.ngram_id
AND idxa.node_id = MY_DOC ;
on entire corpus
=================
coocs for each doc:
- each given pair like (termA, termB) will likely appear several times
=> we do GROUP BY (x1.ngram_id, x2.ngram_id)
=> we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
- we count unique appearances of the pair (cooc)
"""
# - TODO add grouped element's values in grouping 'chief ngram'
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO weighted: if False normal cooc to be saved as result
@@ -86,124 +90,194 @@ def compute_coocs( corpus,
# 71,134 rows when restricting to ngrams with occ > 1 (weight)
# 2 x the occurrence index table
x1 = aliased(NodeNgram)
x2 = aliased(NodeNgram)
# cooccurrences columns definition
ucooc = func.count(x1.ngram_id).label("ucooc")
# 1) MAIN DB QUERY
coocs_query = (
session.query(x1.ngram_id, x2.ngram_id, ucooc)
.join(Node, Node.id == x1.node_id) # <- b/c within corpus
.join(x2, x1.node_id == Node.id ) # <- b/c within corpus
.filter(Node.parent_id == corpus.id) # <- b/c within corpus
.filter(Node.typename == "DOCUMENT") # <- b/c within corpus
.filter(x1.node_id == x2.node_id) # <- by definition of cooc
.filter(x1.ngram_id != x2.ngram_id) # <- b/c not with itself
.group_by(x1.ngram_id, x2.ngram_id)
Xindex = aliased(NodeNgram)
Yindex = aliased(NodeNgram)
# for debug (1/4)
# Xngram = aliased(Ngram)
# Yngram = aliased(Ngram)
# 1) prepare definition of counted forms
if not groupings_id:
# no groupings => the counted forms are the ngrams
Xindex_ngform_id = Xindex.ngram_id
Yindex_ngform_id = Yindex.ngram_id
# groupings: see the detailed comment in compute_occs() + TODO factorize
else:
# prepare translations
Xsyno = (session.query(NodeNgramNgram.ngram1_id,
NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == groupings_id)
.subquery()
)
# later use as anonymous tables prevents simply doing Ysyno = Xsyno
Ysyno = (session.query(NodeNgramNgram.ngram1_id,
NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == groupings_id)
.subquery()
)
# groupings => define the counted form depending on the existence of a synonym
Xindex_ngform_id = case([
(Xsyno.c.ngram1_id != None, Xsyno.c.ngram1_id),
(Xsyno.c.ngram1_id == None, Xindex.ngram_id)
# (condition, value)
])
Yindex_ngform_id = case([
(Ysyno.c.ngram1_id != None, Ysyno.c.ngram1_id),
(Ysyno.c.ngram1_id == None, Yindex.ngram_id)
])
# ---
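The two case() expressions above implement "count the mainform whenever a synonym row matched, otherwise the ngram itself". A toy plain-Python equivalent (ids are made up):

    # toy model of the counted-form choice
    groupings = {42: 7}                  # subform id -> mainform id
    def counted_form(ngram_id):
        return groupings.get(ngram_id, ngram_id)
    assert counted_form(42) == 7         # grouped: counted under its mainform
    assert counted_form(13) == 13        # ungrouped: counted as itself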
# 2) BASE DB QUERY
# cooccurrences columns definition ----------------
ucooc = func.count(Xindex_ngform_id).label("ucooc")
# NB could be X or Y in this line
# (we're counting grouped rows and just happen to do it on this column)
base_query = (
session.query(
Xindex_ngform_id,
Yindex_ngform_id,
ucooc
# for debug (2/4)
#, Xngram.terms.label("w_x")
#, Yngram.terms.label("w_y")
)
.join(Yindex, Xindex.node_id == Yindex.node_id ) # <- by definition of cooc
.join(Node, Node.id == Xindex.node_id) # <- b/c within corpus
.filter(Node.parent_id == corpus.id) # <- b/c within corpus
.filter(Node.typename == "DOCUMENT") # <- b/c within corpus
.filter(Xindex_ngform_id != Yindex_ngform_id) # <- b/c not with itself
)
# outerjoin the synonyms if needed
if groupings_id:
base_query = (base_query
.outerjoin(Xsyno, # <- synonyms for Xindex.ngrams
Xsyno.c.ngram2_id == Xindex.ngram_id)
.outerjoin(Ysyno, # <- synonyms for Yindex.ngrams
Ysyno.c.ngram2_id == Yindex.ngram_id)
)
# 3) counting clause in any case
coocs_query = (base_query
.group_by(
Xindex_ngform_id, Yindex_ngform_id # <- what we're counting
# for debug (3/4)
#,"w_x", "w_y"
)
# for debug (4/4)
#.join(Xngram, Xngram.id == Xindex_ngform_id)
#.join(Yngram, Yngram.id == Yindex_ngform_id)
.order_by(ucooc)
)
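When checking the grouped variant, the SQL that SQLAlchemy will emit can be inspected directly (a debugging hint, not part of the commit):

    # render the generated SQL of the counting query
    print(str(coocs_query))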
# 2) INPUT FILTERS (reduce N before O(N²))
# 4) INPUT FILTERS (reduce N before O(N²))
if mainlist_id:
m1 = aliased(NodeNgram)
m2 = aliased(NodeNgram)
coocs_query = ( coocs_query
.join(m1, m1.ngram_id == x1.ngram_id)
.join(m2, m2.ngram_id == x2.ngram_id)
.join(m1, m1.ngram_id == Xindex_ngform_id)
.join(m2, m2.ngram_id == Yindex_ngform_id)
.filter( m1.node_id == mainlist_id )
.filter( m2.node_id == mainlist_id )
)
if stoplist_id:
s1 = aliased(NodeNgram)
s2 = aliased(NodeNgram)
s1 = (session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == stoplist_id)
.subquery()
)
# later use as anonymous tables prevents simply doing s2 = s1
s2 = (session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == stoplist_id)
.subquery()
)
coocs_query = ( coocs_query
.join(m1, s1.ngram_id == x1.ngram_id)
.join(m2, s2.ngram_id == x2.ngram_id)
.outerjoin(s1, s1.c.ngram_id == Xindex_ngform_id)
.outerjoin(s2, s2.c.ngram_id == Yindex_ngform_id)
# equivalent to NOT IN stoplist
.filter( s1.c.ngram_id == None )
.filter( s2.c.ngram_id == None )
.filter( s1.node_id == mainlist_id )
.filter( s2.node_id == mainlist_id )
)
if start:
if isinstance(start, datetime):
start_str = start.strftime("%Y-%m-%d %H:%M:%S")
else:
start_str = str(start)
# doc_ids matching this limit
# TODO s/subqueries/inner joins/ && thanks!
starttime_subquery = (session
.query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date")
.filter(NodeHyperdata.value_str >= start_str)
.subquery()
)
# direct use of str comparison op because there is consistency b/w
# sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
# the filtering by start limit
coocs_query = coocs_query.filter(x1.node_id.in_(starttime_subquery))
if end:
if isinstance(end, datetime):
end_str = end.strftime("%Y-%m-%d %H:%M:%S")
else:
end_str = str(end)
# TODO s/subqueries/inner joins/ && thanks!
endtime_subquery = (session
.query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date")
.filter(NodeHyperdata.value_str <= end_str)
.subquery()
)
# the filtering by end limit
coocs_query = coocs_query.filter(x1.node_id.in_(endtime_subquery))
if start or end:
Time = aliased(NodeHyperdata)
coocs_query = (coocs_query
.join(Time, Time.node_id == Xindex.node_id)
.filter(Time.key=="publication_date")
)
if start:
if not isinstance(start, datetime):
try:
start = datetime.strptime(start, '%Y-%m-%d')
except (ValueError, TypeError):
raise TypeError("'start' param expects datetime object or %Y-%m-%d string")
# the filtering by start limit
coocs_query = coocs_query.filter(Time.value_utc >= start)
if end:
if not isinstance(end, datetime):
try:
end = datetime.strptime(end, '%Y-%m-%d')
except (ValueError, TypeError):
raise TypeError("'end' param expects datetime object or %Y-%m-%d string")
# the filtering by end limit
coocs_query = coocs_query.filter(Time.value_utc <= end)
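With the new handling above, both parameter styles should work; a usage sketch (corpus, mainlist_id and group_id assumed from the pipeline):

    from datetime import datetime
    cooc_id = compute_coocs(corpus,
                            mainlist_id  = mainlist_id,
                            groupings_id = group_id,
                            start = "2012-01-01",             # %Y-%m-%d string
                            end   = datetime(2015, 12, 31))   # or datetime object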
if symmetry_filter:
# 1 filter taking the symmetry into account
# -> halves the work !!
# -> but will prevent direct access to the cooccurrences of x2
# -> they will be scattered: recorded under the x1 that preceded x2
# -> so retrieval will be more costly, via OR queries like:
# WHERE ngram1 = my_ngram OR ngram2 = my_ngram
coocs_query = coocs_query.filter(x1.ngram_id < x2.ngram_id)
# ------------
# 2 possible upstream filters to reduce the combinatorics
# - e.g. 929k rows => 35k rows
# - here on weight, but it degrades the results
# => conceivable on another metric (cvalue or tfidf?)
# coocs_query = coocs_query.filter(x1.weight > 1)
# coocs_query = coocs_query.filter(x2.weight > 1)
# ------------
coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)
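A quick toy check of the halving claim (made-up ids):

    terms = [1, 2, 3, 4]
    all_pairs  = [(x, y) for x in terms for y in terms if x != y]
    kept_pairs = [(x, y) for (x, y) in all_pairs if x < y]
    assert len(all_pairs)  == 12   # n*(n-1) ordered pairs
    assert len(kept_pairs) == 6    # n*(n-1)/2 after the symmetry filter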
# 3) OUTPUT FILTERS
# 5) OUTPUT FILTERS
# ------------------
# threshold
# £TODO adjust COOC_THRESHOLD a posteriori:
# ex: sometimes 2 sometimes 4 depending on sparsity
coocs_query = coocs_query.having(ucooc >= threshold)
# 4) EXECUTE QUERY
# 6) EXECUTE QUERY
# ----------------
# => storage in our matrix structure
matrix = WeightedMatrix(coocs_query.all())
# -------------------
# fyi
#shape_0 = len({pair[0] for pair in matrix.items})
#shape_1 = len({pair[1] for pair in matrix.items})
#print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
shape_0 = len({pair[0] for pair in matrix.items})
shape_1 = len({pair[1] for pair in matrix.items})
print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
# 5) SAVE
# --------