Commit dee88be8 authored by Romain Loth

first simple version of tfidf in ngram_scores

parent a65df75a
# Maps a list/score node typename to the in-memory container class used
# to load and save it.  (Diff residue removed: the pre-commit entry
# 'OCCURRENCES': WeightedList was replaced by WeightedContextIndex.)
LISTTYPES = {
    'STOPLIST'     : UnweightedList,
    'MAINLIST'     : UnweightedList,
    'MAPLIST'      : UnweightedList,
    'OCCURRENCES'  : WeightedContextIndex,
    'COOCCURRENCES': WeightedMatrix,
    'TFIDF-CORPUS' : WeightedContextIndex,
}
# Node typenames; a node's typename is stored as the *index* into this
# list, so order matters and None keeps index 0 unused.
# (Diff residue removed: the diff showed both the old unannotated and
# the new index-annotated entries; this is the post-commit list.)
NODETYPES = [
    None,
    # documents hierarchy
    'USER',           # 1
    'PROJECT',        # 2
    'CORPUS',         # 3
    'DOCUMENT',       # 4
    # lists
    'STOPLIST',       # 5
    'GROUPLIST',      # 6
    'MAINLIST',       # 7
    'MAPLIST',        # 8
    'COOCCURRENCES',  # 9
    # scores
    'OCCURRENCES',    # 10
    'SPECIFICITY',    # 11
    'CVALUE',         # 12
    'TFIDF-CORPUS',   # 13
    'TFIDF-GLOBAL',   # 14
]
......
......@@ -2,7 +2,7 @@
"""
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList']
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex']
from gargantext.util.db import session, bulk_insert
......@@ -165,6 +165,22 @@ class Translations(_BaseClass):
)
class WeightedContextIndex(_BaseClass):
    """
    Tensor representing a contextual index or registry
    (matrix of weighted ngrams *per* doc *per* context).

    Example: tfidf by corpus.

    Associated model   : NodeNodeNgram
    Associated columns : node1_id | node2_id | ngram_id | score (float)
    """
    def __init__(self, source=None):
        # NOTE(review): `source` is accepted (presumably for signature
        # parity with the sibling list classes) but is currently ignored
        # — confirm this is intentional.
        # items: missing keys default to a 0.0 score
        self.items = defaultdict(float)
class WeightedMatrix(_BaseClass):
def __init__(self, source=None):
......
from .parsing import parse
from .parsing import parse
from .ngrams_extraction import extract_ngrams
from .ngram_scores import compute_occurrences_local, compute_tfidf_local
from .ngram_groups import compute_groups
from gargantext.util.db import session
from gargantext.models import Node
from .group import compute_groups
from gargantext.models import Node
def parse_extract(corpus):
# retrieve corpus from database from id
......@@ -23,5 +22,15 @@ def parse_extract(corpus):
print('CORPUS #%d: extracted ngrams' % (corpus.id))
# temporary ngram lists workflow
group_id = compute_groups(corpus)
print('CORPUS #%d: new grouplist = #%i' % (corpus.id, group_id))
# write occurrences to Node and NodeNodeNgram
occnd_id = compute_occurrences_local(corpus)
print('CORPUS #%d: new occs node #%i' % (corpus.id, occnd_id))
# write local tfidf to Node and NodeNodeNgram
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: new localtfidf node #%i' % (corpus.id, ltfidf_id))
# write groups to Node and NodeNgramNgram
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: new grouplist node #%i' % (corpus.id, group_id))
from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session, bulk_insert
# £TODO
# from gargantext.util.lists import WeightedContextIndex
from gargantext.util.db import func # = sqlalchemy.func like sum() or count()
from math import log
def compute_occurrences_local(corpus):
    """
    Sums the occurrences of each ngram over the documents of `corpus`,
    records the totals as NodeNodeNgram rows under a new "OCCURRENCES"
    node, and returns that node's id.
    """
    # Scope of the counts: ids of every DOCUMENT node under this corpus.
    # (Slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()])
    doc_ids = (
        session.query(Node.id)
               .filter(Node.parent_id == corpus.id)
               .filter(Node.typename == "DOCUMENT")
               .subquery()
    )

    # One (ngram_id, summed_weight) pair per ngram occurring in those docs,
    # e.g. [(1970, 1.0), (2024, 2.0), (259, 2.0), ...]
    ngram_totals = (
        session.query(NodeNgram.ngram_id, func.sum(NodeNgram.weight))
               .filter(NodeNgram.node_id.in_(doc_ids))
               .group_by(NodeNgram.ngram_id)
               .all()
    )

    # Persist a fresh OCCURRENCES node attached to the corpus.
    occ_node = Node()
    occ_node.typename = "OCCURRENCES"
    occ_node.name = "occ_sums (in:%s)" % corpus.id
    occ_node.parent_id = corpus.id
    occ_node.user_id = corpus.user_id
    session.add(occ_node)
    session.commit()

    # Mirror the totals in NodeNodeNgram (could be NodeNgram, but this
    # keeps the storage shape consistent with the tfidf nodes).
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        ((occ_node.id, corpus.id, ngram_id, total)
         for ngram_id, total in ngram_totals)
    )
    return occ_node.id
def compute_tfidf_local(corpus):
    """
    Calculates tfidf within the current corpus:

        tfidf(ngram) = tf * log(total_docs / nd)

    where tf is the summed weight of the ngram over the corpus' documents
    and nd is the number of documents containing it.

    Creates a new "TFIDF-CORPUS" node under the corpus, stores one
    NodeNodeNgram row (tfidf_node, corpus, ngram, score) per ngram,
    and returns the new node's id.
    """
    # ?? FIXME could we keep the docids somehow from previous computations ??
    docids_subquery = (session
                        .query(Node.id)
                        .filter(Node.parent_id == corpus.id)
                        .filter(Node.typename == "DOCUMENT")
                        .subquery()
                       )
    total_docs = session.query(docids_subquery).count()

    # tf and nd for every ngram in a single aggregate query
    # (or perhaps at least do the occurrences right now at the same time)
    tf_nd = (session
             .query(
                NodeNgram.ngram_id,
                func.sum(NodeNgram.weight),    # tf: same as occnode
                func.count(NodeNgram.node_id)  # nd: n docs with term
              )
             .filter(NodeNgram.node_id.in_(docids_subquery))
             .group_by(NodeNgram.ngram_id)
             .all()
            )

    # BUGFIX: the first version computed tf / log(total_docs/nd), which is
    # not tf-idf (tf * idf) and raised ZeroDivisionError whenever a term
    # appears in every document (nd == total_docs => log(1) == 0).
    # With the multiplication, such ubiquitous terms correctly score 0.
    tfidfs = {}
    for (ngram_id, tf, nd) in tf_nd:
        tfidfs[ngram_id] = tf * log(total_docs / nd)

    # create the new TFIDF-CORPUS node
    ltfidf = Node()
    ltfidf.typename = "TFIDF-CORPUS"
    ltfidf.name = "tfidf (in:%s)" % corpus.id
    ltfidf.parent_id = corpus.id
    ltfidf.user_id = corpus.user_id
    session.add(ltfidf)
    session.commit()

    # reflect the scores in NodeNodeNgrams
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        ((ltfidf.id, corpus.id, ng, score) for ng, score in tfidfs.items())
    )
    return ltfidf.id
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment