clarify

aa848fd9 · Romain Loth · 049dc862 · aa848fd9 · aa848fd9
Commit aa848fd9 authored May 19, 2016 by Romain Loth
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 20 deletions

__init__.py gargantext/util/toolchain/__init__.py +6 -6

ngram_coocs.py gargantext/util/toolchain/ngram_coocs.py +4 -14

No files found.
--- a/gargantext/util/toolchain/__init__.py
+++ b/gargantext/util/toolchain/__init__.py
@@ -111,8 +111,8 @@ def parse_extract_indexhyperdata(corpus):
    group_id = compute_groups(corpus, stoplist_id = None)
    print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))

-    # -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
-    occ_id = compute_occs(corpus)
+    # -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
+    occ_id = compute_occs(corpus, groupings_id = group_id)
    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

    # ------------
@@ -120,11 +120,11 @@ def parse_extract_indexhyperdata(corpus):
    ltfidf_id = compute_tfidf_local(corpus)
    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))

-    # -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
+    # -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
    tirank_id = compute_ti_ranking(corpus,
-                                   count_scope="global",
-                                   termset_scope="local")
-    print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
+                                   groupings_id = group_id,
+                                   count_scope="global")
+    print('CORPUS #%d: [%s] new ti ranking node #%i' % (corpus.id, t(), tirank_id))

    # -> mainlist: filter + write (to Node and NodeNgram)
    mainlist_id = do_mainlist(corpus,

--- a/gargantext/util/toolchain/ngram_coocs.py
+++ b/gargantext/util/toolchain/ngram_coocs.py
@@ -85,14 +85,6 @@ def compute_coocs(  corpus,
    #  1.859.408 rows for the simple cooc query
    #     71.134 rows when restricted to ngrams that have an occ > 1 (weight)

-    # docs of our corpus
-    docids_subquery = (session
-                        .query(Node.id)
-                        .filter(Node.parent_id == corpus.id)
-                        .filter(Node.typename == "DOCUMENT")
-                        .subquery()
-                       )
-
    # 2 x the occurrence index table
    x1 = aliased(NodeNgram)
    x2 = aliased(NodeNgram)
@@ -105,11 +97,9 @@ def compute_coocs(  corpus,
        session.query(x1.ngram_id, x2.ngram_id, ucooc)
               .join(Node, Node.id == x1.node_id)   # <- b/c within corpus
               .join(x2, x1.node_id == Node.id )     # <- b/c within corpus
-               
               .filter(Node.parent_id == corpus.id) # <- b/c within corpus
               .filter(Node.typename == "DOCUMENT") # <- b/c within corpus

-            
               .filter(x1.node_id  == x2.node_id)       # <- by definition of cooc
               .filter(x1.ngram_id != x2.ngram_id)      # <- b/c not with itself
               .group_by(x1.ngram_id, x2.ngram_id)
@@ -120,7 +110,7 @@ def compute_coocs(  corpus,

        m1 = aliased(NodeNgram)
        m2 = aliased(NodeNgram)
-        
+
        coocs_query = ( coocs_query
            .join(m1, m1.ngram_id == x1.ngram_id)
            .join(m2, m2.ngram_id == x2.ngram_id)
@@ -211,9 +201,9 @@ def compute_coocs(  corpus,
    matrix = WeightedMatrix(coocs_query.all())

    # fyi
-    shape_0 = len({pair[0] for pair in matrix.items})
-    shape_1 = len({pair[1] for pair in matrix.items})
-    print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
+    #shape_0 = len({pair[0] for pair in matrix.items})
+    #shape_1 = len({pair[1] for pair in matrix.items})
+    #print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

    # 5) SAVE
    # --------