Commit 75a7e329 authored by delanoe

Merge branch 'romain-refactoring' into unstable

parents 6c438c85 eee27166
@@ -6,15 +6,19 @@ digraph ngramflow {
     labelloc="t" ;
     "extracted_ngrams" -> "grouplist" ;
-    "extracted_ngrams" -> "occs+tfidfs" ;
-    "main_user_stoplist" -> "stoplist" ;
+    "extracted_ngrams" -> "occs+ti_rank" ;
+    "project stoplist (todo)" -> "stoplist" ;
     "stoplist" -> "mainlist" ;
-    "occs+tfidfs" -> "mainlist" [label=" TFIDF_LIMIT"];
+    "occs+ti_rank" -> "mainlist" [label=" TI_RANK_LIMIT"];
     "mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
     "coocs" -> "specificity" ;
     "specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
+    "mainlist" -> "tfidf" ;
+    "tfidf" -> "explore" [label="doc relations with all map and candidates"];
     "maplist" -> "explore" ;
-    "grouplist" -> "maplist" ;
+    "grouplist" -> "occs+ti_rank" ;
+    "grouplist" -> "coocs" ;
+    "grouplist" -> "tfidf" ;
 }
doc/schemas/ngram_parsing_flow.png (binary image changed: 52.5 KB → 75.9 KB)
@@ -196,10 +196,10 @@ class WeightedMatrix(_BaseClass):
             self.id = source
             from gargantext.models import NodeNgramNgram
             query = (session
-                .query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.score)
+                .query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.weight)
                 .filter(NodeNgramNgram.node_id == source)
             )
-            for key1, key2, value in self.items.items():
+            for key1, key2, value in query.all():
                 self.items[key1, key2] = value
         elif isinstance(source, WeightedMatrix):
             for key1, key2, value in source:
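The second change above fixes a real bug, not just naming: the old loop iterated over self.items, which is empty at construction time, so nothing was ever read back from the database. A minimal sketch of the corrected load path, using the session and model imports this codebase already provides (the node id is a placeholder):

    from collections import defaultdict
    from gargantext.util.db import session
    from gargantext.models import NodeNgramNgram

    source_node_id = 42                 # placeholder: id of the cooc Node to load
    items = defaultdict(float)          # stand-in for WeightedMatrix.items
    rows = (session
            .query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.weight)
            .filter(NodeNgramNgram.node_id == source_node_id))
    for key1, key2, value in rows.all():   # iterate the DB rows, not the empty dict
        items[key1, key2] = value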
@@ -225,11 +225,14 @@ class WeightedMatrix(_BaseClass):
         session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_id).delete()
         session.commit()

         # insert new data
+        print("WeightedMatrix bulk_insert start")
         bulk_insert(
             NodeNgramNgram,
             ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
             ((node_id, key1, key2, value) for key1, key2, value in self)
         )
+        print("WeightedMatrix bulk_insert stop")

     def __radd__(self, other):
         result = NotImplemented
@@ -6,12 +6,12 @@ from .hyperdata_indexing import index_hyperdata
 # in usual run order
 from .list_stop import do_stoplist
-from .ngram_groups import compute_groups
 from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
 from .list_main import do_mainlist
 from .ngram_coocs import compute_coocs
 from .metric_specificity import compute_specificity
 from .list_map import do_maplist # TEST
+from .ngram_groups import compute_groups
 from .mail_notification import notify_owner
 from gargantext.util.db import session
 from gargantext.models import Node
@@ -129,27 +129,31 @@ def parse_extract_indexhyperdata(corpus):
     print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))

     # -> write local tfidf similarities to Node and NodeNodeNgram
-    # TODO only on mainlist
-    ltfidf_id = compute_tfidf_local(corpus)
+    ltfidf_id = compute_tfidf_local(corpus,
+                                    on_list_id=mainlist_id,
+                                    groupings_id = group_id)
     print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
     # => used for doc <=> ngram association

     # ------------
-    # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
-    cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
-    print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
+    # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
+    coocs = compute_coocs(corpus,
+                          on_list_id = mainlist_id,
+                          groupings_id = group_id,
+                          just_pass_result = True)
+    print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))

     # -> specificity: compute + write (=> NodeNodeNgram)
-    spec_id = compute_specificity(corpus, cooc_id=cooc_id
-                                  # ,groupings_id = group_id
-                                  )
+    spec_id = compute_specificity(corpus, cooc_matrix = coocs)
+    # no need here for subforms because cooc already counted them in mainform
     print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))

-    # ?? maplist: compute + write (to Node and NodeNgram)
+    # maplist: compute + write (to Node and NodeNgram)
     map_id = do_maplist(corpus,
                         mainlist_id = mainlist_id,
                         specificity_id=spec_id,
-                        grouplist_id=group_id)
+                        grouplist_id=group_id
+                        )
     print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))

     print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))
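Passing just_pass_result=True means the mainlist coocs are handed to compute_specificity as an in-memory WeightedMatrix instead of being persisted to a cooc Node first, saving one Node plus its NodeNgramNgram rows on every workflow run. Both entry points remain usable; a hedged sketch (assuming the save branch of compute_coocs still returns a node id, as the old call here implies):

    # new in-memory path (what the workflow now does)
    coocs = compute_coocs(corpus, on_list_id=mainlist_id,
                          groupings_id=group_id, just_pass_result=True)
    spec_id = compute_specificity(corpus, cooc_matrix=coocs)

    # old DB path, still available
    cooc_id = compute_coocs(corpus, on_list_id=mainlist_id,
                            groupings_id=group_id, just_pass_result=False)
    spec_id = compute_specificity(corpus, cooc_id=cooc_id)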
@@ -160,7 +164,7 @@ def parse_extract_indexhyperdata(corpus):
     if DEBUG is False:
-        print('CORPUS #%d: [%s] FINISHED Sendind email notification' % (corpus.id, t()))
+        print('CORPUS #%d: [%s] FINISHED Sending email notification' % (corpus.id, t()))
         notify_owner(corpus)

     corpus.status('Workflow', progress=10, complete=True)
@@ -43,15 +43,11 @@ def do_maplist(corpus,
     #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

-    mainterms_subquery = (session
-                          # we want only terms within mainlist
-                          .query(NodeNgram.ngram_id)
-                          .filter(NodeNgram.node_id == mainlist_id)
-                          .subquery()
-                          )
+    MainlistTable = aliased(NodeNgram)

-    primary_groupterms_subquery = (session
-                                   # we want only primary terms (ngram1)
+    IsSubform = (session
+                 # we want only secondary terms (ngram2)
+                 # to be able to filter them out
                  .query(NodeNgramNgram.ngram2_id)
                  .filter(NodeNgramNgram.node_id == grouplist_id)
                  .subquery()
                  )
@@ -63,8 +59,15 @@ def do_maplist(corpus,
     query = (session.query(ScoreSpec.ngram_id)
              .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
              .filter(ScoreSpec.node_id == specificity_id)
-             .filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
-             .filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery))
+
+             # we want only terms within mainlist
+             .join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
+             .filter(MainlistTable.node_id == mainlist_id)
+
+             # we remove all ngrams matching an ngram2_id from the synonyms
+             .outerjoin(IsSubform,
+                        IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
+             .filter(IsSubform.c.ngram2_id == None)
     )

     # TODO: move these 2 pools up to mainlist selection
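The rewrite swaps the two IN/NOT IN subqueries for joins: an inner join enforces mainlist membership, and the LEFT OUTER JOIN plus IS NULL pair is the standard anti-join idiom for excluding grouped subforms, which the database usually plans better than NOT IN over a large list. Roughly, in the docstring SQL style this repo uses elsewhere (table names illustrative):

    SELECT spec.ngram_id
    FROM nodes_ngrams AS spec                    -- specificity scores
    JOIN nodes_ngrams AS mainlist
      ON mainlist.ngram_id = spec.ngram_id
     AND mainlist.node_id  = MAINLIST_ID         -- keep mainlist terms only
    LEFT JOIN nodes_ngrams_ngrams AS issubform
      ON issubform.ngram2_id = spec.ngram_id
     AND issubform.node_id   = GROUPLIST_ID      -- synonym (subform) rows
    WHERE spec.node_id = SPECIFICITY_ID
      AND issubform.ngram2_id IS NULL ;          -- anti-join: drop subforms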
@@ -94,7 +97,7 @@ def do_maplist(corpus,
     new_hyperdata = { 'corpus': corpus.id,
                       'limit' : limit,
                       'monograms_part' : monograms_part,
-                      'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else obtained_mono
+                      'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else 0
                     }
     if overwrite_id:
         # overwrite pre-existing node
@@ -9,7 +9,7 @@ from collections import defaultdict
 from pandas import DataFrame
 import pandas as pd

-def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
+def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id = None):
     '''
     Compute the specificity, simple calculus.
@@ -18,18 +18,26 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
        - overwrite_id: optional preexisting specificity node to overwrite
     '''
+    matrix = defaultdict(lambda : defaultdict(float))
+
+    if cooc_id == None and cooc_matrix == None:
+        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
+
+    elif cooc_id:
         cooccurrences = (session.query(NodeNgramNgram)
                          .filter(NodeNgramNgram.node_id==cooc_id)
                          )
-    # no filtering: new choice cooc already filtered on tfidf before creation
-    matrix = defaultdict(lambda : defaultdict(float))
+        # no filtering: cooc already filtered on mainlist_id at creation

-    # £TODO re-rename weight => score
         for cooccurrence in cooccurrences:
             matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
             matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
+
+    elif cooc_matrix:
+        # copy WeightedMatrix into local matrix structure
+        for (ngram1_id, ngram2_id) in cooc_matrix.items:
+            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
+            matrix[ngram1_id][ngram2_id] = w

     nb_ngrams = len(matrix)
     print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
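Note the asymmetry between the two branches: the cooc_id branch mirrors every pair into both matrix[a][b] and matrix[b][a], while the cooc_matrix branch copies pairs one-way. The two only produce the same matrix if the incoming WeightedMatrix already contains both orientations, which is presumably why symmetry_filter now defaults to False in compute_coocs below. A compact sketch of the copy step, given a WeightedMatrix cooc_matrix whose items maps (ngram1_id, ngram2_id) -> weight (per lists.py above):

    from collections import defaultdict

    matrix = defaultdict(lambda: defaultdict(float))
    for (ng1, ng2), w in cooc_matrix.items.items():   # same pairs as the diff's loop
        matrix[ng1][ng2] = w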
@@ -377,12 +377,18 @@ def compute_ti_ranking(corpus,
-def compute_tfidf_local(corpus, overwrite_id=None):
+def compute_tfidf_local(corpus,
+                        on_list_id=None,
+                        groupings_id=None,
+                        overwrite_id=None):
     """
     Calculates tfidf similarity of each (doc, ngram) couple, within the current corpus

     Parameters:
       - the corpus itself
+      - groupings_id: optional synonym relations to add all subform counts
+                      with their mainform's counts
+      - on_list_id: mainlist or maplist type, to constrain the input ngrams
       - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
                       (the Node and its previous NodeNodeNgram rows will be replaced)
     """
@@ -398,36 +404,94 @@ def compute_tfidf_local(corpus, overwrite_id=None):
     # N
     total_docs = session.query(docids_subquery).count()

-    # number of docs with given term (number of rows = M ngrams)
-    n_docswith_ng = (session
-                     .query(
-                        NodeNgram.ngram_id,
-                        func.count(NodeNgram.node_id).label("nd")  # nd: n docs with term
-                      )
-                     .filter(NodeNgram.node_id.in_(docids_subquery))
-                     .group_by(NodeNgram.ngram_id)
-                     .all()
-                     )

-    # { ngram_id => log(nd) }
-    log_nd_lookup = {row.ngram_id : log(row.nd) for row in n_docswith_ng}
+    # define the counted form
+    if not groupings_id:
+        ngform_id = NodeNgram.ngram_id
+    else:
+        Syno = (session.query(NodeNgramNgram.ngram1_id,
+                              NodeNgramNgram.ngram2_id)
+                .filter(NodeNgramNgram.node_id == groupings_id)
+                .subquery()
+                )
+        ngform_id = case([
+            (Syno.c.ngram1_id != None, Syno.c.ngram1_id),
+            (Syno.c.ngram1_id == None, NodeNgram.ngram_id)
+        ])

     # tf for each couple (number of rows = N docs X M ngrams)
-    tf_doc_ng = (session
+    tf_doc_query = (session
                  .query(
-                    NodeNgram.ngram_id,
+                    ngform_id,
                     NodeNgram.node_id,
                     func.sum(NodeNgram.weight).label("tf"),  # tf: occurrences
                  )
-                 .filter(NodeNgram.node_id.in_(docids_subquery))
-                 .group_by(NodeNgram.node_id, NodeNgram.ngram_id)
-                 .all()
+
+                 # select within docs of current corpus
+                 .join(docids_subquery,
+                       docids_subquery.c.id == NodeNgram.node_id)
                  )

+    if groupings_id:
+        tf_doc_query = ( tf_doc_query
+                        .outerjoin(Syno, Syno.c.ngram2_id == NodeNgram.ngram_id)
+                        )
+        # now when we'll group_by the ngram2 freqs will be added to ngram1

+    if on_list_id:
+        Miamlist = aliased(NodeNgram)
+        tf_doc_query = ( tf_doc_query
+                        .join(Miamlist, Miamlist.ngram_id == ngform_id)
+                        .filter( Miamlist.node_id == on_list_id )
+                        )

+    # execute query to do our tf sum
+    tf_per_doc = tf_doc_query.group_by(NodeNgram.node_id, ngform_id).all()

+    # ex: [(128371, 9732, 1.0),
+    #      (128383, 9740, 1.0),
+    #      (128373, 9731, 1.0),
+    #      (128376, 9734, 1.0),
+    #      (128372, 9731, 1.0),
+    #      (128383, 9733, 1.0),
+    #      (128383, 9735, 1.0),
+    #      (128389, 9734, 1.0),
+    #      (8624,   9731, 1.0),
+    #      (128382, 9740, 1.0),
+    #      (128383, 9739, 1.0),
+    #      (128383, 9736, 1.0),
+    #      (128378, 9735, 1.0),
+    #      (128375, 9733, 4.0),
+    #      (128383, 9732, 1.0)]
+    #       ^^^^^^   ^^^^  ^^^
+    #       ngram    doc   freq in this doc

+    # simultaneously count docs with given term (number of rows = M ngrams)
+    ndocswithngram = {}
+    for triple in tf_per_doc:
+        ng = triple[0]
+        doc = triple[1]
+        if ng in ndocswithngram:
+            ndocswithngram[ng] += 1
+        else:
+            ndocswithngram[ng] = 1

+    # print(ndocswithngram)

+    # store for use in formula
+    # { ngram_id => log(nd) }
+    log_nd_lookup = {ng : log(nd_count)
+                     for (ng, nd_count) in ndocswithngram.items()}

     # ---------------------------------------------------------
     tfidfs = {}
     log_tot_docs = log(total_docs)
-    for (ngram_id, node_id, tf) in tf_doc_ng:
+    for (ngram_id, node_id, tf) in tf_per_doc:
         log_nd = log_nd_lookup[ngram_id]
         # tfidfs[ngram_id] = tf * log(total_docs/nd)
         tfidfs[node_id, ngram_id] = tf * (log_tot_docs-log_nd)
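The final loop is the classic tf-idf with natural logs: tfidf(doc, ngram) = tf * (log(N) - log(nd)) = tf * log(N/nd). A worked check with illustrative numbers only:

    from math import log

    total_docs = 100   # N: docs in the corpus
    nd         = 4     # docs containing this ngram form
    tf         = 3.0   # summed occurrences within one doc

    tfidf = tf * (log(total_docs) - log(nd))   # == 3 * log(25) ≈ 9.66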
@@ -10,13 +10,15 @@ from sqlalchemy.sql.expression import case # for choice if ngram has mainform or
 def compute_coocs( corpus,
                    overwrite_id     = None,
+                   just_pass_result = True,   # just return the WeightedMatrix,
+                                              #   (don't write to DB)
                    threshold        = DEFAULT_COOC_THRESHOLD,
                    groupings_id     = None,
-                   mainlist_id      = None,
+                   on_list_id       = None,
                    stoplist_id      = None,
                    start            = None,
                    end              = None,
-                   symmetry_filter  = True):
+                   symmetry_filter  = False):
     """
     Count how often some extracted terms appear
     together in a small context (document)
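The symmetry_filter default also flips to False. When True, the flag presumably keeps only one triangle of the symmetric cooc matrix, which halves storage but would leave the one-way copy in compute_specificity above with only half its cells. A sketch of what such a triangle constraint could look like (Xindex_ngform_id and Yindex_ngform_id exist in this file; the query variable name is an assumption):

    if symmetry_filter:
        # keep a canonical orientation only: (a, b) stored, (b, a) implied
        coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)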
@@ -46,7 +48,7 @@ def compute_coocs( corpus,
      - threshold: on output cooc count (previously called hapax)
      - groupings_id: optional synonym relations to add all subform counts
                      with their mainform's counts
-     - mainlist_id: mainlist to constrain the input ngrams
+     - on_list_id: mainlist or maplist type, to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
                     (normally unnecessary if a mainlist is already provided)
      - start, end: provide one or both temporal limits to filter on doc date
@@ -62,9 +64,10 @@ def compute_coocs( corpus,
     ======================
     each pair of ngrams sharing same doc (node_id)
        SELECT idxa.ngram_id, idxb.ngram_id
-       FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
+       FROM nodes_ngrams AS idxa
        ---------------------------------
-       WHERE idxa.node_id = idxb.node_id     <== that's cooc
+       JOIN nodes_ngrams AS idxb
+         ON idxa.node_id = idxb.node_id      <== that's cooc
        ---------------------------------
        AND idxa.ngram_id <> idxb.ngram_id
        AND idxa.node_id = MY_DOC ;
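The docstring now spells the co-occurrence condition as an explicit JOIN ... ON instead of an implicit cross join plus WHERE; the two are equivalent, but the JOIN form matches how the SQLAlchemy code reads. A hedged one-to-one translation, with aliased() and NodeNgram as used elsewhere in this codebase:

    from sqlalchemy.orm import aliased

    MY_DOC = 123   # placeholder: one document's node id
    idxa = aliased(NodeNgram)
    idxb = aliased(NodeNgram)
    pairs = (session
             .query(idxa.ngram_id, idxb.ngram_id)
             .select_from(idxa)
             .join(idxb, idxa.node_id == idxb.node_id)   # <== that's cooc
             .filter(idxa.ngram_id != idxb.ngram_id)     # skip self-pairs
             .filter(idxa.node_id == MY_DOC))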
@@ -188,7 +191,7 @@ def compute_coocs( corpus,
     # 4) INPUT FILTERS (reduce N before O(N²))
-    if mainlist_id:
+    if on_list_id:
         m1 = aliased(NodeNgram)
         m2 = aliased(NodeNgram)
@@ -197,8 +200,8 @@ def compute_coocs( corpus,
                .join(m1, m1.ngram_id == Xindex_ngform_id)
                .join(m2, m2.ngram_id == Yindex_ngform_id)
-               .filter( m1.node_id == mainlist_id )
-               .filter( m2.node_id == mainlist_id )
+               .filter( m1.node_id == on_list_id )
+               .filter( m2.node_id == on_list_id )
         )

     if stoplist_id:
@@ -279,11 +282,16 @@ def compute_coocs( corpus,
     shape_1 = len({pair[1] for pair in matrix.items})
     print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

+    if just_pass_result:
+        return matrix
+    else:
     # 5) SAVE
     # --------
     # saving the parameters of the analysis in the Node JSON
     new_hyperdata = { 'corpus' : corpus.id,
                       'threshold': threshold }
     if overwrite_id:
         # overwrite pre-existing id
         the_cooc = cache.Node[overwrite_id]