tidying up

33ba94b8 · Romain Loth · 6260e8c1 · 33ba94b8 · 33ba94b8 · 33ba94b8
Commit 33ba94b8 authored Mar 14, 2016 by Romain Loth
7 changed files
--- a/gargantext/util/toolchain/__init__.py
+++ b/gargantext/util/toolchain/__init__.py
@@ -2,13 +2,13 @@ from .parsing           import parse
 from .ngrams_extraction import extract_ngrams

 # in usual run order
-from .list_stop         import do_stoplist
-from .ngram_scores      import compute_occurrences_local, compute_tfidf
-from .list_main         import do_mainlist
-from .ngram_coocs_tempo import compute_coocs
-from .score_specificity import compute_specificity
-from .list_map          import do_maplist     # TEST
-from .ngram_groups      import compute_groups
+from .list_stop           import do_stoplist
+from .metric_tfidf        import compute_occs, compute_tfidf
+from .list_main           import do_mainlist
+from .ngram_coocs         import compute_coocs
+from .metric_specificity  import compute_specificity
+from .list_map            import do_maplist     # TEST
+from .ngram_groups        import compute_groups

 from gargantext.util.db import session
 from gargantext.models  import Node
@@ -50,7 +50,7 @@ def parse_extract(corpus):
    print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))

    # -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
-    occ_id = compute_occurrences_local(corpus)
+    occ_id = compute_occs(corpus)
    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

    # ------------

--- a/gargantext/util/toolchain/list_map.py
+++ b/gargantext/util/toolchain/list_map.py
@@ -67,6 +67,7 @@ def do_maplist(corpus,
                .filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
            )

+    # TODO: move these 2 pools up to mainlist selection
    top_monograms = (query
                .filter(Ngram.n == 1)
                .order_by(desc(ScoreSpec.weight))

--- a/gargantext/util/toolchain/score_specificity.py
+++ b/gargantext/util/toolchain/score_specificity.py
--- a/gargantext/util/toolchain/ngram_scores.py
+++ b/gargantext/util/toolchain/ngram_scores.py
@@ -16,7 +16,7 @@ from math                import log
 # from gargantext.util.lists import WeightedContextIndex


-def compute_occurrences_local(corpus, overwrite_id = None):
+def compute_occs(corpus, overwrite_id = None):
    """
    Calculates sum of occs per ngram within corpus
    (used as info in the ngrams table view)

--- a/gargantext/util/toolchain/ngram_coocs_tempo.py
+++ b/gargantext/util/toolchain/ngram_coocs_tempo.py
@@ -103,7 +103,6 @@ def compute_coocs(corpus,
           )

    # 2) INPUT FILTERS (reduce N before O(N²))
-    #    £TODO add possibility to restrict to the mainlist
    if mainlist_id:
        main_subquery = (
            session.query(NodeNgram.ngram_id)
@@ -150,6 +149,8 @@ def compute_coocs(corpus,
    # 3) OUTPUT FILTERS
    # ------------------
    # threshold
+    # £TODO adjust COOC_THRESHOLD a posteriori, depending on sparsity
+    # (e.g. sometimes 2, sometimes 4)
    coocs_query = coocs_query.having(ucooc >= threshold)

    # 4) EXECUTE QUERY

--- a/gargantext/util/toolchain/ngrams_tools.py
+++ b/gargantext/util/toolchain/ngrams_tools.py
-from gargantext.util.db import *
-from gargantext.util.db_cache import *
-from gargantext.constants import *
-
-from gargantext.models.ngrams import Ngram, NodeNgram, NodeNgramNgram
-
-
-def insert_ngrams(ngrams,get='terms-id'):
-    '''
-    insert_ngrams :: [(String, Int)] -> dict[terms] = id
-    '''
-    db, cursor = get_cursor()
-    
-    cursor.execute('''    
-        CREATE TEMPORARY TABLE tmp__ngram (
-            id INT,
-            terms VARCHAR(255) NOT NULL,
-            n INT
-            );
-        ''')
-
-    bulk_insert('tmp__ngram', ['terms', 'n'], ngrams, cursor=cursor)
-    
-    cursor.execute('''
-        UPDATE
-            tmp__ngram
-        SET
-            id = ngram.id
-        FROM
-            %s AS ngram
-        WHERE
-            tmp__ngram.terms = ngram.terms
-            ''' % (Ngram.__table__.name,))
-    
-    cursor.execute('''
-        INSERT INTO
-            %s (terms, n)
-        SELECT
-            terms, n
-        FROM
-            tmp__ngram
-        WHERE
-            id IS NULL
-            ''' % (Ngram.__table__.name,))
-    
-    cursor.execute('''
-        UPDATE
-            tmp__ngram
-        SET
-            id = ngram.id
-        FROM
-            %s AS ngram
-        WHERE
-            ngram.terms = tmp__ngram.terms
-        AND
-            ngram.n = tmp__ngram.n
-        AND
-            tmp__ngram.id IS NULL
-            ''' % (Ngram.__table__.name,))
-    
-    ngram_ids = dict()
-    cursor.execute('SELECT id, terms FROM tmp__ngram')
-    for row in cursor.fetchall():
-        ngram_ids[row[1]] = row[0]
-
-    db.commit()
-    return(ngram_ids)
-
--- a/gargantext/util/toolchain/score_occurrences.py
+++ b/gargantext/util/toolchain/score_occurrences.py
-
-from gargantext_web.db import get_session, cache, get_cursor
-from gargantext_web.db import Node, NodeNgram, NodeNodeNgram
-from gargantext_web.db import get_or_create_node
-
-#from admin.utils import DebugTime
-
-def compute_occs(corpus, debug=True):
-    '''
-    compute_occs :: Corpus -> IO ()
-
-    '''
-    
-    #dbg = DebugTime('Corpus #%d - OCCURRENCES' % corpus.id)
-    #dbg.show('Calculate occurrences')
-    
-    occs_node = get_or_create_node(nodetype='Occurrences', corpus=corpus, mysession=mysession)
-    
-    #print(occs_node.id)
-
-    (session.query(NodeNodeNgram)
-            .filter(NodeNodeNgram.nodex_id==occs_node.id).delete()
-            )
-    session.commit()
-
-    db, cursor = get_cursor()
-    cursor.execute('''
-        INSERT INTO
-            %s (nodex_id, nodey_id, ngram_id, score)
-        SELECT
-            %d AS nodex_id,
-            %d AS nodey_id,
-            nodengram.ngram_id AS ngram_id,
-            SUM(nodengram.weight) AS score
-        FROM
-            %s AS nodengram
-        INNER JOIN
-            %s AS node     ON nodengram.node_id = node.id
-        WHERE
-            node.parent_id = %d
-            AND
-            node.type_id = %d
-        GROUP BY
-            nodengram.ngram_id
-            
-
-    ''' % (   NodeNodeNgram.__table__.name
-            , occs_node.id, corpus.id
-            , NodeNgram.__table__.name
-            , Node.__table__.name
-            , corpus.id
-            , cache.NodeType['Document'].id
-            )
-    )
-    db.commit()
-    
-    if debug is True:
-        data = session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==occs_node.id).all()
-        print([n for n in data])