occurrences with subforms counted inside mainform

1bb37aff · Romain Loth · f280e8a4 · 1bb37aff
Commit 1bb37aff authored May 18, 2016 by Romain Loth
Show whitespace changes
Inline Side-by-side

Showing with 79 additions and 31 deletions

metric_tfidf.py gargantext/util/toolchain/metric_tfidf.py +79 -31

No files found.
--- a/gargantext/util/toolchain/metric_tfidf.py
+++ b/gargantext/util/toolchain/metric_tfidf.py
@@ -11,57 +11,105 @@ FIXME: "having the same source" means we need to select inside hyperdata
 from gargantext.models   import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
 from gargantext.util.db  import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
 from sqlalchemy          import text  # for query from raw SQL statement
+from sqlalchemy.sql.expression import case # for choice if ngram has mainform or not
 from math                import log
 # £TODO
 # from gargantext.util.lists import WeightedContextIndex
-def compute_occs(corpus, overwrite_id = None):
+def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
    """
-    # TODO check if cumulated occs correspond to app's use cases and intention
+    Calculates sum of occs per ngram (or per mainform if groupings_id is given) within corpus
-    Calculates sum of occs per ngram within corpus
                 (used as info in the ngrams table view)
    ? optimize ?  OCCS here could be calculated simultaneously within TFIDF-CORPUS loop
+    ? use cases ?
+       => not the main score for users (their intuition is rather the nb of docs containing the word)
+       => but it is the main weighting value for any NLP task
    Parameters:
        - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
                     (the Node and its previous NodeNodeNgram rows will be replaced)
+        - groupings_id: optional id of a GROUPLIST node for this corpus
+                        IF absent the occurrences are the sums for each ngram
+                        IF present they're the sums for each ngram's mainform
    """
-    # 0) Get the groups
+    #  simple case : no groups
-    group_id = (session.query(Node.id)
+    #                ---------
+    #    (the occurrences are the sums for each ngram)
+    if not groupings_id:
+        # NodeNgram index
+        occs_q = (session
+                    .query(
+                        NodeNgram.ngram_id,
+                        func.sum(NodeNgram.weight)   # <== OCCURRENCES
+                     )
+                     # filter docs within corpus
+                    .join(Node)
                    .filter(Node.parent_id == corpus.id)
-                       .filter(Node.typename  == "GROUPLIST")
+                    .filter(Node.typename == "DOCUMENT")
-                       .first()
+                    # for the sum
+                    .group_by(NodeNgram.ngram_id)
                   )
-    # 1) all the doc_ids of our corpus (scope of counts for filter)
+    #   difficult case: with groups
-    # slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
+    #                   ------------
-    docids_subquery = (session
+    # (the occurrences are the sums for each ngram's mainform)
-                        .query(Node.id)
+    else:
-                        .filter(Node.parent_id == corpus.id)
+        # sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
-                        .filter(Node.typename == "DOCUMENT")
+        syn = (session.query(NodeNgramNgram.ngram1_id,
+                             NodeNgramNgram.ngram2_id)
+                .filter(NodeNgramNgram.node_id == groupings_id)
                .subquery()
               )
-    # 2) our sums per ngram_id
+        # NodeNgram index with additional subform => mainform replacement
-    occ_sums = (session
+        occs_q = (session
                    .query(
-                    NodeNgram.ngram_id,
+                        # intermediate columns for debug
-                    func.sum(NodeNgram.weight)
+                        # -------------------------------
+                        # NodeNgram.node_id,        # document
+                        # NodeNgram.ngram_id,       # <= the occurring ngram
+                        # NodeNgram.weight,         # <= its frequency in doc
+                        # syn.c.ngram1_id           # mainform
+                        # syn.c.ngram2_id,          # subform
+                        # ngram to count aka counted_form
+                        # ----------------------------------
+                        #     either NodeNgram.ngram_id as before
+                        #         or mainform if it exists
+                        case([(syn.c.ngram1_id != None, syn.c.ngram1_id)],
+                             else_=NodeNgram.ngram_id)
+                        .label("counted_form"),
+                        # the sum itself
+                        # --------------
+                        func.sum(NodeNgram.weight)   # <== OCCURRENCES
                    )
-                #.join(NodeNgramNgram, NodeNgramNgram.node_id == group_id)
+                    # this brings the mainform if NodeNgram.ngram_id has one in syn
-                .filter(NodeNgram.node_id.in_(docids_subquery))
+                    .outerjoin(syn,
-                .group_by(NodeNgram.ngram_id)
+                               syn.c.ngram2_id == NodeNgram.ngram_id)
-                .all()
+                    # filter docs within corpus
+                    .join(Node)
+                    .filter(Node.parent_id == corpus.id)
+                    .filter(Node.typename == "DOCUMENT")
+                    # for the sum
+                    .group_by("counted_form")
                 )
+    occ_sums = occs_q.all()
    # example result = [(1970, 1.0), (2024, 2.0),  (259, 2.0), (302, 1.0), ... ]
    #                    ^^^^  ^^^
    #                ngram_id   sum_wei
+    #                   OR
+    #              counted_form
    if overwrite_id:
        # overwrite pre-existing id