Commit 0b94f4b0 authored by delanoe

Merge branch 'romain-refactoring' into unstable

parents 2eb7e07e 7c14a4ed
@@ -126,10 +126,27 @@ class Parser:
break
except KeyError:
language_keyerrors[key] = language_symbol
# languages can find Language objects from any code iso2 or iso3
# --------------------------------------------------------------
# > languages['fr']
# <Language iso3="fra" iso2="fr" implemented="True" name="French">
# > languages['fra']
# <Language iso3="fra" iso2="fr" implemented="True" name="French">
if language is not None:
hyperdata['language_iso2'] = language.iso2
hyperdata['language_iso3'] = language.iso3
hyperdata['language_name'] = language.name
hyperdata['language_iso3'] = language.iso3
if (language.iso2 is not None):
# NB: language can be recognized through iso3 but have no iso2!!
# because there's *more* languages in iso3 codes (iso-639-3)
# example:
# > languages['dnj']
# <Language iso3="dnj" iso2="None" implemented="False" name="Dan">
# ----
hyperdata['language_iso2'] = language.iso2
else:
# 'None' would become JSON 'null' ==> "__unknown__" is more stable
hyperdata['language_iso2'] = "__unknown__"
elif language_keyerrors:
print('Unrecognized language: %s' % ', '.join(
'%s="%s"' % (key, value) for key, value in language_keyerrors.items()
......
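The iso2 fallback above is self-contained enough to illustrate in isolation. A minimal sketch, assuming a Language object with iso2/iso3/name attributes as in the examples above (the class below is a hypothetical stand-in, not the project's own):

class Language:
    def __init__(self, iso2, iso3, name):
        self.iso2, self.iso3, self.name = iso2, iso3, name

def fill_language_hyperdata(hyperdata, language):
    # iso3 and name are always available once the language was recognized
    hyperdata['language_iso3'] = language.iso3
    hyperdata['language_name'] = language.name
    if language.iso2 is not None:
        hyperdata['language_iso2'] = language.iso2
    else:
        # 'None' would become JSON 'null' ==> "__unknown__" is more stable
        hyperdata['language_iso2'] = "__unknown__"
    return hyperdata

# iso-639-3 covers more languages than iso2 (iso-639-1), e.g. Dan:
print(fill_language_hyperdata({}, Language(None, 'dnj', 'Dan')))
# {'language_iso3': 'dnj', 'language_name': 'Dan', 'language_iso2': '__unknown__'}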
@@ -54,6 +54,10 @@ def parse_extract_indexhyperdata(corpus):
corpus.status('Workflow', progress=1)
corpus.save_hyperdata()
session.commit()
# FIXME: 'Workflow' will still be incomplete when 'Index' and 'Lists'
# get stacked into hyperdata['statuses'], because corpus.status()
# returns only the 1st incomplete action (corpus.status() doesn't
# understand "subactions")
# apply actions
print('CORPUS #%d' % (corpus.id))
......
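The FIXME above can be made concrete with a toy model of stacked statuses; the dict layout below is an assumption for illustration, not the actual hyperdata['statuses'] schema:

# assumed layout of hyperdata['statuses'], for illustration only
statuses = [
    {'action': 'Workflow', 'complete': False},
    {'action': 'Index',    'complete': False},  # stacked subaction
    {'action': 'Lists',    'complete': False},  # stacked subaction
]

def first_incomplete(statuses):
    # mimics a corpus.status() that only knows top-level actions
    return next((s for s in statuses if not s['complete']), None)

print(first_incomplete(statuses)['action'])  # always 'Workflow': the subactions stay invisible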
@@ -9,59 +9,108 @@ FIXME: "having the same source" means we need to select inside hyperdata
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
from sqlalchemy import text # for query from raw SQL statement
from gargantext.util.db import session, bulk_insert, aliased, \
func # = sqlalchemy.func like sum() or count()
from sqlalchemy.sql.expression import case # for choice if ngram has mainform or not
from sqlalchemy import distinct # for list of unique ngram_ids within a corpus
from math import log
# £TODO
# from gargantext.util.lists import WeightedContextIndex
def compute_occs(corpus, overwrite_id = None):
def compute_occs(corpus, overwrite_id = None, groupings_id = None):
"""
# TODO check if cumulated occs correspond to app's use cases and intention
Calculates sum of occs per ngram within corpus
(used as info in the ngrams table view)
Calculates sum of occs per ngram (or per mainform if groups) within corpus
(used as info in the ngrams table view)
? optimize ? OCCS here could be calculated simultaneously within TFIDF-CORPUS loop
? use cases ?
=> not the main score for users (their intuition is the nb of docs having the word)
=> but is the main weighting value for any NLP task
Parameters:
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
- groupings_id: optional id of a GROUPLIST node for this corpus
IF absent the occurrences are the sums for each ngram
IF present they're the sums for each ngram's mainform
"""
# 0) Get the groups
group_id = (session.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "GROUPLIST")
.first()
)
# simple case: no groups
# -----------
# (the occurrences are the sums for each ngram)
if not groupings_id:
# NodeNgram index
occs_q = (session
.query(
NodeNgram.ngram_id,
func.sum(NodeNgram.weight) # <== OCCURRENCES
)
# filter docs within corpus
.join(Node)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
# 1) all the doc_ids of our corpus (scope of counts for filter)
# slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
)
# for the sum
.group_by(NodeNgram.ngram_id)
)
# 2) our sums per ngram_id
occ_sums = (session
.query(
NodeNgram.ngram_id,
func.sum(NodeNgram.weight)
)
#.join(NodeNgramNgram, NodeNgramNgram.node_id == group_id)
.filter(NodeNgram.node_id.in_(docids_subquery))
.group_by(NodeNgram.ngram_id)
.all()
# difficult case: with groups
# ------------
# (the occurrences are the sums for each ngram's mainform)
else:
# sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
syn = (session.query(NodeNgramNgram.ngram1_id,
NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == groupings_id)
.subquery()
)
# NodeNgram index with additional subform => mainform replacement
occs_q = (session
.query(
# intermediate columns for debug
# -------------------------------
# NodeNgram.node_id, # document
# NodeNgram.ngram_id, # <= the occurring ngram
# NodeNgram.weight, # <= its frequency in doc
# syn.c.ngram1_id # mainform
# syn.c.ngram2_id, # subform
# ngram to count aka counted_form
# ----------------------------------
# either NodeNgram.ngram_id as before
# or mainform if it exists
case([(syn.c.ngram1_id != None, syn.c.ngram1_id)],
else_=NodeNgram.ngram_id)
.label("counted_form"),
# the sum itself
# --------------
func.sum(NodeNgram.weight) # <== OCCURRENCES
)
# this brings the mainform if NodeNgram.ngram_id has one in syn
.outerjoin(syn,
syn.c.ngram2_id == NodeNgram.ngram_id)
# filter docs within corpus
.join(Node)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
# for the sum
.group_by("counted_form")
)
occ_sums = occs_q.all()
# example result = [(1970, 1.0), (2024, 2.0), (259, 2.0), (302, 1.0), ... ]
# ^^^^ ^^^
# ngram_id sum_wei
# ngram_id sum_wei
# OR
# counted_form
if overwrite_id:
# overwrite pre-existing id
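For reference, the grouped branch of compute_occs is roughly equivalent to the raw SQL below (a sketch only: the table names nodes, nodes_ngrams and nodes_ngrams_ngrams and the literal typename comparison are assumptions, and COALESCE stands in for the SQLAlchemy case() above):

from sqlalchemy import text

grouped_occs_sql = text("""
    SELECT COALESCE(syn.ngram1_id, nn.ngram_id) AS counted_form,
           SUM(nn.weight)                       AS occurrences
    FROM nodes_ngrams nn
    JOIN nodes doc ON doc.id = nn.node_id
                  AND doc.typename = 'DOCUMENT'
                  AND doc.parent_id = :corpus_id
    LEFT JOIN nodes_ngrams_ngrams syn
                   ON syn.node_id = :groupings_id
                  AND syn.ngram2_id = nn.ngram_id
    GROUP BY counted_form
""")
# session.execute(grouped_occs_sql,
#                 {'corpus_id': corpus.id, 'groupings_id': groupings_id})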
@@ -92,7 +141,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
"""
# TODO check if cumulated tfs correspond to app's use cases and intention
Calculates tfidf ranking (cumulated tfidf) within the given scope
Calculates tfidf ranking (cumulated tfidf for each ngram) within given scope
Parameters:
- the corpus itself
@@ -100,92 +149,110 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
- local <=> frequencies counted in the current corpus
- global <=> frequencies counted in all corpora of this type
when the count_scope is global, there is another parameter:
- termset_scope: {"local" or "global"}
- local <=> output list of terms limited to the current corpus
(SELECT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
(SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
- global <=> output list of terms from all corpora of this type
!!!! (more terms)
!!!! (many more terms)
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
corpus_docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
# MAIN QUERY SKELETON
tf_nd_query = (session
.query(
NodeNgram.ngram_id,
func.sum(NodeNgram.weight), # tf: same as occurrences
# -----------------------
func.count(NodeNgram.node_id) # nd: n docs with term
# --------------------
)
.group_by(NodeNgram.ngram_id)
# optional *count_scope*: joined later if we restrict the doc nodes
# -------------
# .join(countdocs_subquery,
# countdocs_subquery.c.id == NodeNgram.node_id)
# optional *termset_scope*: joined later if we restrict the ngrams
# ---------------
# .join(termset_subquery,
# termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
)
# validate string params
if count_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
if termset_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
if count_scope == "local" and termset_scope == "global":
raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
# local <=> within this corpus
if count_scope == "local":
# All docs of this corpus
count_scope_subquery = corpus_docids_subquery
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(corpus_docids_subquery))
countdocs_subquery = (session
.query(Node.id)
.filter(Node.typename == "DOCUMENT")
.filter(Node.parent_id == corpus.id)
.subquery()
)
# both scopes are the same: no need to independently restrict the ngrams
tf_nd_query = tf_nd_query.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
# global <=> within all corpora of this source
elif count_scope == "global":
this_source_type = corpus.resources()[0]['type']
# all corpora with the same source type
# (we need raw SQL query for postgres JSON operators) (TODO test speed)
same_source_corpora_query = (session
.query(Node.id)
.from_statement(text(
"""
SELECT id FROM nodes
WHERE hyperdata->'resources' @> '[{\"type\"\:%s}]'
""" % this_source_type
))
)
CorpusNode = aliased(Node)
# All docs **in all corpora of the same source**
ressource_docids_subquery = (session
countdocs_subquery = (session
.query(Node.id)
.filter(Node.parent_id.in_(same_source_corpora_query))
.filter(Node.typename == "DOCUMENT")
# join on parent_id with selected corpora nodes
.join(CorpusNode, CorpusNode.id == Node.parent_id)
.filter(CorpusNode.typename == "CORPUS")
.filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(this_source_type))
.subquery()
)
count_scope_subquery = ressource_docids_subquery
if termset_scope == "global":
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(ressource_docids_subquery))
.subquery()
)
else:
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(corpus_docids_subquery))
# both scopes are the same: no need to independently restrict the ngrams
tf_nd_query = tf_nd_query.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
elif termset_scope == "local":
# All unique terms in the original corpus
termset_subquery = (session
.query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
.join(Node)
.filter(Node.typename == "DOCUMENT")
.filter(Node.parent_id == corpus.id)
.subquery()
)
# only case of independent restrictions on docs and terms
tf_nd_query = (tf_nd_query
.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
.join(termset_subquery,
termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
)
# N
total_docs = session.query(ressource_docids_subquery).count()
total_docs = session.query(countdocs_subquery).count()
# nb: possible to do the occurrences right now at the same time
tf_nd = (session
.query(
NodeNgram.ngram_id,
func.sum(NodeNgram.weight), # tf: same as occnode
func.count(NodeNgram.node_id) # nd: n docs with term
)
.filter(NodeNgram.node_id.in_(count_scope_subquery))
.filter(NodeNgram.ngram_id.in_(termset_scope_subquery))
.group_by(NodeNgram.ngram_id)
.all()
)
# result
tf_nd = tf_nd_query.all()
# -------------------------------------------------
tfidfs = {}
......
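Once the tf_nd rows (ngram_id, tf, nd) and total_docs are available, the cumulated ranking boils down to a per-ngram weighting; a sketch with the classic tf * log(N/nd) formula (illustrative only: the commit does not show the exact weighting used to fill tfidfs):

from math import log

total_docs = 10                            # N
tf_nd = [(1970, 12.0, 4), (2024, 3.0, 3)]  # (ngram_id, tf, nd) as above

tfidfs = {
    ngram_id: tf * log(total_docs / nd)
    for (ngram_id, tf, nd) in tf_nd
}
print(tfidfs)  # {1970: 10.99..., 2024: 3.61...}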
@@ -56,7 +56,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', )):
ngramsextractor = ngramsextractors[language_iso2]
except KeyError:
# skip document
print('Unsupported language: `%s`' % (language_iso2, ))
print('Unsupported language: `%s` (doc #%i)' % (language_iso2, document.id))
# and remember that for later processes (e.g. stemming)
document.hyperdata['__skipped__'] = 'ngrams_extraction'
document.save_hyperdata()
......
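Later stages can then honor that marker before doing language-dependent work; a minimal sketch (process_document is a hypothetical consumer, not a function from this commit):

def process_document(document):
    # skip docs whose ngrams were never extracted (unsupported language)
    if document.hyperdata.get('__skipped__') == 'ngrams_extraction':
        return
    # ... e.g. stemming, grouping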
@@ -74,8 +74,10 @@ def _query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
- details: if False, send just the array of ngram_ids
if True, send triples with (ngram_id, term, scoring)
^^^^^^^
- scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
deprecated: scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
(for details and sorting)
(no more OCCS counts of subforms)
"""
if not details:
# simple contents
@@ -86,12 +88,12 @@ def _query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
.query(
NodeNgramNgram.ngram2_id,
Ngram.terms,
NodeNodeNgram.score
# NodeNodeNgram.score #
)
.join(Ngram, NodeNgramNgram.ngram2_id == Ngram.id)
.join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == scoring_metric_id)
.order_by(desc(NodeNodeNgram.score))
# .join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
# .filter(NodeNodeNgram.node1_id == scoring_metric_id)
# .order_by(desc(NodeNodeNgram.score))
)
# main filter
......
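Typical calls for the two modes would look like this (a hedged sketch: it assumes the helper returns a SQLAlchemy query and that groupings_id points to a GROUPLIST node for the corpus):

subform_ids  = [row[0] for row in _query_grouped_ngrams(groupings_id).all()]
subform_rows = _query_grouped_ngrams(groupings_id, details=True).all()
# details=True now yields (ngram_id, term) rows; the score column is deprecated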