Commit 744ec7f1 authored by Romain Loth

mainlist creation

parent 89c8268c
......@@ -93,7 +93,9 @@ RESOURCETYPES = [
]
# linguistic extraction parameters
DEFAULT_COOC_THRESHOLD = 4
DEFAULT_TFIDF_CUTOFF_RATIO = .55 # MAINLIST maximum terms, relative to nb of distinct ngrams
DEFAULT_TFIDF_HARD_LIMIT = 1000 # MAINLIST maximum terms, absolute cap
DEFAULT_COOC_THRESHOLD = 4 # for COOCCURRENCES node
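# (the two MAINLIST caps above are combined in list_main.do_mainlist as
#  min(DEFAULT_TFIDF_HARD_LIMIT, floor(nb_ngrams * DEFAULT_TFIDF_CUTOFF_RATIO)))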
# other parameters
# default number of docs POSTed to scrappers.views.py
......
from .parsing import parse
from .ngrams_extraction import extract_ngrams
from .list_stop import compute_stop
from .list_stop import do_stoplist
from .ngram_scores import compute_occurrences_local, compute_tfidf
from .list_main import do_mainlist
from .ngram_coocs_tempo import compute_coocs
from .score_specificity import compute_specificity
from .list_map import compute_mapList # TEST
......@@ -24,6 +25,12 @@ def parse_extract(corpus):
    # apply actions
    print('CORPUS #%d' % (corpus.id))
    parse(corpus)
    # was there an error in the process?
    if corpus.status()['error']:
        print("ERROR: aborting parse_extract for corpus #%i" % corpus.id)
        return None
    print('CORPUS #%d: parsed' % (corpus.id))
    extract_ngrams(corpus)
    print('CORPUS #%d: extracted ngrams' % (corpus.id))
......@@ -45,16 +52,16 @@ def parse_extract(corpus):
    gtfidf_id = compute_tfidf(corpus, scope="global")
    print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
    # ?? mainlist: compute + write (to Node and NodeNgram)
    # mainlist_id = compute_mainlist(corpus)
    # print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
    # -> mainlist: compute + write (to Node and NodeNgram)
    mainlist_id = do_mainlist(corpus, tfidf_id = gtfidf_id, stoplist_id = stop_id)
    print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
    # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
    cooc_id = compute_coocs(corpus, stop_id = None)
    cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, stop_id = None)
    print('CORPUS #%d: [%s] new cooccs node #%i' % (corpus.id, t(), cooc_id))
    # ?? specificity: compute + write (=> NodeNodeNgram)
    spec_id = compute_specificity(cooc_id=cooc_id, corpus=corpus)
    spec_id = compute_specificity(corpus, cooc_id=cooc_id)
    print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
    # ?? maplist: compute + write (to Node and NodeNgram)
......@@ -70,5 +77,7 @@ def parse_extract(corpus):
    print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
def t():
    return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from gargantext.constants import DEFAULT_TFIDF_CUTOFF_RATIO, DEFAULT_TFIDF_HARD_LIMIT
from math import floor
def do_mainlist(corpus, tfidf_id=None, stoplist_id=None,
                hard_limit=DEFAULT_TFIDF_HARD_LIMIT,
                ratio_limit=DEFAULT_TFIDF_CUTOFF_RATIO
                ):
"""
Select terms for the mainlist according to a global tfidf and stoplist.
The number of selected terms will be:
min(hard_limit, number_of_terms * ratio_limit)
NB : We use a global tfidf node where the values are global but the ngrams
are already selected (== only within this corpus documents).
Parameters:
2 limits are useful to set a maximum amount of picked terms
- ratio_limit: relative to the number of distinct ngrams [0,1]
- hard_limit: absolute value [default: 1000]
"""
    # retrieve helper nodes if not provided
    if not tfidf_id:
        tfidf_id = session.query(Node.id).filter(
                        Node.typename == "TFIDF-GLOBAL",
                        Node.parent_id == corpus.id
                   ).first()
        if not tfidf_id:
            raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")
        # .first() on a single-column query returns a 1-tuple => unpack the id
        tfidf_id = tfidf_id[0]

    if not stoplist_id:
        stoplist_id = session.query(Node.id).filter(
                        Node.typename == "STOPLIST",
                        Node.parent_id == corpus.id
                      ).first()
        if not stoplist_id:
            raise ValueError("MAINLIST: STOPLIST node needed for mainlist creation")
        stoplist_id = stoplist_id[0]
    # the ngrams we don't want
    # NOTE: make sure this runs only once, during the initial ngram workflow
    stopterms_subquery = (session
                            .query(NodeNgram.ngram_id)
                            .filter(NodeNgram.node_id == stoplist_id)
                            .subquery()
                         )
    # tfidf-ranked query
    ordered_filtered_tfidf = (session
                                .query(NodeNodeNgram.ngram_id)
                                .filter(NodeNodeNgram.node1_id == tfidf_id)
                                .filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
                                .order_by(desc(NodeNodeNgram.score))
                             )
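    # NB: "~ NodeNodeNgram.ngram_id.in_(stopterms_subquery)" renders as a SQL
    #     NOT IN, so stopterms are excluded before the ranking and the limit.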
    # total count
    nb_ngrams = ordered_filtered_tfidf.count()

    # apply whichever of the two limits is smaller
    our_limit = min(hard_limit, floor(nb_ngrams * ratio_limit))

    # DB retrieve up to limit => MAINLIST
    # (each result row is a 1-tuple, so unpack to plain ngram ids)
    top_ngrams_ids = [ngram_id for (ngram_id,) in ordered_filtered_tfidf.limit(our_limit)]
    # now create the new MAINLIST node
    mainlist = corpus.add_child(
        typename = "MAINLIST",
        name = "Mainlist (in:%s)" % corpus.name[0:10]
    )
    session.add(mainlist)
    session.commit()
    the_id = mainlist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    UnweightedList(top_ngrams_ids).save(the_id)

    return the_id
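A minimal usage sketch (illustrative only; it assumes parse_extract has already created the global tfidf and stoplist nodes, with their ids bound to gtfidf_id and stop_id as above):

# hypothetical call, mirroring the parse_extract workflow
mainlist_id = do_mainlist(corpus,
                          tfidf_id = gtfidf_id,   # from compute_tfidf(corpus, scope="global")
                          stoplist_id = stop_id)  # from the stoplist step
print('new mainlist node #%i' % mainlist_id)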