Merge branch 'testing' into testing-graph-public

04afe46a · delanoe · 39ab3eaf · c3a055a7 · 04afe46a · 04afe46a
Commit 04afe46a authored Apr 04, 2017 by delanoe
7 changed files
--- a/gargantext/util/db.py
+++ b/gargantext/util/db.py
@@ -173,3 +173,4 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None, do_stat

    cursor.close()

+
--- a/gargantext/util/toolchain/ngram_coocs.py
+++ b/gargantext/util/toolchain/ngram_coocs.py
@@ -3,9 +3,9 @@ COOCS
    (this is the full SQL version, should be more reliable on outerjoin)
 """
 from gargantext                import settings
-from sqlalchemy                import create_engine, exc
+from sqlalchemy                import exc
 from gargantext.util.lists     import WeightedMatrix
-# from gargantext.util.db        import session, aliased, func
+from gargantext.util.db        import get_engine
 from gargantext.util.db_cache  import cache
 from gargantext.constants      import DEFAULT_COOC_THRESHOLD, NODETYPES
 from gargantext.constants      import INDEXED_HYPERDATA
@@ -64,12 +64,7 @@ def compute_coocs(  corpus,
    """

    # 1) prepare direct connection to the DB
-    url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format(
-            **settings.DATABASES['default']
-        )
-
-    engine = create_engine( url )
-    connection = engine.connect()
+    connection = get_engine().connect()

    # string vars for our SQL query
    # setting work memory high to improve cache perf.

--- a/graph/graph.py
+++ b/graph/graph.py
@@ -8,6 +8,7 @@ from graph.cooccurrences          import countCooccurrences
 from graph.distances              import clusterByDistances
 from graph.bridgeness             import filterByBridgeness
 from graph.mail_notification      import notify_owner
+from graph.growth                 import compute_growth

 from gargantext.util.scheduling   import scheduled
 from gargantext.constants         import graph_constraints
@@ -64,7 +65,15 @@ def compute_graph( corpus_id=None      , cooc_id=None

        print("GRAPH #%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
        data = filterByBridgeness(G,partition,ids,weight,bridgeness,"node_link",field1,field2)
-
+        
+        if start is not None and end is not None:
+            growth= dict()
+            for (ng_id, score) in compute_growth(corpus_id, groupList_id, mapList_id, start, end):
+                growth[ng_id] = float(score) + 100 # for the normalization, should not be negativ
+
+            for node in data['nodes']:
+                node['attributes']['growth'] = growth[node['id']]
+        
        print("GRAPH #%d ... Saving Graph in hyperdata as json." % cooc_id)
        node = session.query(Node).filter(Node.id == cooc_id).first()

@@ -187,7 +196,7 @@ def get_graph( request=None         , corpus=None
                                     )
                                .filter( Start.key == 'publication_date')
                                .filter( Start.value_utc >= date_start_utc)
-                      )
+                            )


    # Filter corpus by date if any end date
@@ -203,8 +212,7 @@ def get_graph( request=None         , corpus=None
                                     )
                                .filter( End.key == 'publication_date')
                                .filter( End.value_utc <= date_end_utc )
-                      )
-
+                            )

    # Finally test if the size of the corpora is big enough
    # --------------------------------
@@ -221,10 +229,11 @@ def get_graph( request=None         , corpus=None
                                   #, limit=size
                                    )

-        return {"state" : "saveOnly",
-                "target_id" : cooc_id,
-                "target_name": cooc_name,
-                "target_date": cooc_date}
+        return { "state"      : "saveOnly"
+               , "target_id"  : cooc_id
+               , "target_name": cooc_name
+               , "target_date": cooc_date
+               }

    elif corpus_size > graph_constraints['corpusMax']:
        # Then compute cooc asynchronously with celery
@@ -262,5 +271,5 @@ def get_graph( request=None         , corpus=None
    if len(data) == 0:
        print("GRAPH #   ... GET_GRAPH: 0 coocs in matrix")
        data = {'nodes':[], 'links':[]}  # empty data
-
+    
    return data
--- a/graph/growth.py
+++ b/graph/growth.py
+"""
+Computes ngram growth on periods
+"""
+
+from gargantext.models   import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
+from gargantext.util.db_cache  import cache
+from gargantext.util.db  import session, bulk_insert, aliased, \
+                                func, get_engine # = sqlalchemy.func like sum() or count()
+from datetime             import datetime
+
+
+def timeframes(start, end):
+    """
+    timeframes :: String -> String -> (UTCTime, UTCTime, UTCTime)
+
+    Parse `start` and `end` (formatted as "%Y-%m-%d", after str()) and
+    return three datetimes (date_0, date_1, date_2) where date_1 is `start`,
+    date_2 is `end`, and date_0 lies as far before `start` as `end` lies
+    after it. The two frames [date_0, date_1] and [date_1, date_2]
+    therefore have the same length.
+    """
+    date_format = "%Y-%m-%d"
+
+    period_start = datetime.strptime(str(start), date_format)
+    period_end = datetime.strptime(str(end), date_format)
+
+    # Mirror the requested period backwards to get the reference period.
+    period_length = period_end - period_start
+    previous_start = period_start - period_length
+
+    return (previous_start, period_start, period_end)
+
+
+
+def compute_growth(corpus_id, groupList_id, mapList_id, start, end):
+    """
+    compute_growth :: Int -> Int -> Int -> String -> String
+                   -> [(Int, Numeric)]
+
+    Returns the rows (ngram id, growth score) produced by the SQL function
+    OCC_HIST, defined in
+    /srv/gargantext/install/gargamelle/psqlFunctions.sql
+
+    `start` and `end` are "%Y-%m-%d" dates; timeframes() turns them into
+    three bounds (date_0, date_1, date_2). OCC_HIST computes the occurrences
+    of the mapList ngrams (with groups) on the first period (date_0..date_1),
+    then on the second one (date_1..date_2), and finally returns the growth.
+
+    Directly computed with Postgres Database (C) for optimization.
+
+    The result object is returned as is; the caller iterates over it.
+    """
+    # Local import: this module only pulls SQLAlchemy helpers through
+    # gargantext.util.db, which is not shown to export `text`.
+    from sqlalchemy import text
+
+    engine = get_engine()
+
+    (date_0, date_1, date_2) = timeframes(start, end)
+
+    # Values are sent as bound parameters instead of being formatted into
+    # the SQL string, so the driver handles quoting and typing.
+    query = text("""SELECT * FROM OCC_HIST( :corpus_id
+                                     , :groupList_id
+                                     , :mapList_id
+                                     , :date_0
+                                     , :date_1
+                                     , :date_2
+                                     )
+            """)
+
+    return engine.execute( query
+                         , { "corpus_id"    : corpus_id
+                           , "groupList_id" : groupList_id
+                           , "mapList_id"   : mapList_id
+                           , "date_0"       : date_0
+                           , "date_1"       : date_1
+                           , "date_2"       : date_2
+                           }
+                         )
+
+
--- a/graph/utils.py
+++ b/graph/utils.py
@@ -19,6 +19,8 @@ def compress_graph(graphdata):
    for node in graphdata['nodes']:
        node['lb'] = node['label']
        del node['label']
+        
+        #node['attributes']['growth'] = 0.8

        node['at'] = node['attributes']
        del node['attributes']

--- a/install/gargamelle/psqlFunctions.sql
+++ b/install/gargamelle/psqlFunctions.sql
+-- CNRS Copyrights 2017
+-- See Gargantext Licence for details
+-- Maintainers: team@gargantext.org
+
+
+-- USAGE
+-- psql gargandb < psqlFunctions.sql
+
+-- OCC_HIST :: Corpus.id -> GroupList.id -> MapList.id -> Start -> EndFirst -> EndLast
+-- EXAMPLE USAGE
+--    SELECT * FROM OCC_HIST(182856, 183859, 183866, '1800-03-15 17:00:00+01', '2000-03-15 17:00:00+01', '2017-03-15 17:00:00+01')
+
+
+-- OCC_HIST_PART :: Corpus.id -> GroupList.id -> Start -> End
+DROP FUNCTION IF EXISTS OCC_HIST_PART(integer, integer, timestamp without time zone, timestamp without time zone);
+-- DROP for tests (IF EXISTS: no error when the function is not installed yet)
+--
+-- Returns one row per ngram: the sum of its weights over the nodes that
+--   * have typename 4 and parent $1 (the corpus),
+--   * have a nodes_hyperdata.value_utc inside ]$3, $4].
+-- An ngram listed as ngram2_id in the group list $2 is counted under its
+-- ngram1_id instead.
+CREATE OR REPLACE FUNCTION OCC_HIST_PART(int, int, timestamp, timestamp) RETURNS TABLE (ng_id int, score float8)
+    AS $$
+-- EXPLAIN ANALYZE
+    SELECT
+    -- grouped ngrams are reported under ngram1_id of their group
+    COALESCE(gr.ngram1_id, ng1.ngram_id) as ng_id,
+    SUM(ng1.weight) as score
+
+    from nodes n
+
+    -- BEFORE
+    -- n1 carries the same node id as n (self-join on id)
+    INNER JOIN nodes as n1 ON n1.id = n.id
+
+    INNER JOIN nodes_ngrams ng1 ON ng1.node_id = n1.id
+
+    -- Limit with timestamps: ]start, end]
+    -- NOTE(review): there is no condition on the hyperdata key here, so any
+    -- value_utc in range matches; confirm this is intended.
+    INNER JOIN nodes_hyperdata nh1 ON nh1.node_id = n1.id
+                                  AND nh1.value_utc >  $3
+                                  AND nh1.value_utc <= $4
+
+    -- Group List
+    LEFT JOIN  nodes_ngrams_ngrams gr ON ng1.ngram_id = gr.ngram2_id
+                               AND gr.node_id = $2
+
+    WHERE
+        -- presumably 4 is the document node type; confirm against NODETYPES
+        n.typename  = 4
+    AND n.parent_id = $1
+    GROUP BY 1
+    $$
+LANGUAGE SQL;
+
+DROP FUNCTION IF EXISTS OCC_HIST(integer, integer, integer, timestamp without time zone, timestamp without time zone, timestamp without time zone);
+-- OCC_HIST :: Corpus.id -> GroupList.id -> MapList.id -> Start -> EndFirst -> EndLast
+--
+-- For every ngram of the map list $3, returns its growth between
+--   period 1 = ]$4, $5]  (score1, from OCC_HIST_PART)
+--   period 2 = ]$5, $6]  (score2, from OCC_HIST_PART)
+-- computed as 100 * (score2 - score1) / (score2 + score1), rounded to
+-- 2 decimals.
+-- The growth is 0 when it cannot be computed: the ngram is missing from one
+-- of the periods (NULL score) or score1 + score2 = 0 (NULLIF avoids the
+-- division-by-zero error).
+-- NOTE(review): an ngram present in only one period therefore gets 0, not
+-- +/-100; if +/-100 is wanted, coalesce score1/score2 to 0 in GROWTH.
+CREATE OR REPLACE FUNCTION OCC_HIST(int, int, int, timestamp, timestamp, timestamp) RETURNS TABLE (ng_id int, score numeric)
+    AS $$
+    WITH OCC1 as (SELECT * from OCC_HIST_PART($1, $2, $4, $5))
+       , OCC2 as (SELECT * from OCC_HIST_PART($1, $2, $5, $6))
+       , GROWTH as (SELECT ml.ngram_id as ngram_id
+                 , OCC1.score as score1
+                 , OCC2.score as score2
+                    FROM nodes_ngrams ml
+                        LEFT JOIN OCC1 ON OCC1.ng_id = ml.ngram_id
+                        LEFT JOIN OCC2 ON OCC2.ng_id = ml.ngram_id
+                    WHERE ml.node_id = $3
+                    ORDER by score2 DESC)
+    SELECT ngram_id, COALESCE(ROUND(CAST((100 * (score2 - score1) / NULLIF(score2 + score1, 0)) as numeric), 2), 0) from GROWTH
+    $$
+LANGUAGE SQL;
+
+
+-- BEHAVIORAL TEST (should be equal to occ in terms table)
+--    WITH OCC as (SELECT * from OCC_HIST_PART(182856, 183859, '1800-03-15 17:00:00+01', '2300-03-15 17:00:00+01'))
+--    SELECT ng_id, score from OCC 
+--            INNER JOIN nodes_ngrams ml on ml.ngram_id = ng_id
+--                                      AND ml.node_id = 183866
+--            ORDER BY score DESC;
--- a/templates/pages/menu.html
+++ b/templates/pages/menu.html
@@ -367,7 +367,7 @@
            <p>
                Gargantext
                <span class="glyphicon glyphicon-registration-mark" aria-hidden="true"></span>
-                , version 3.0.6.6,
+                , version 3.0.6.7,
                <a href="http://www.cnrs.fr" target="blank" title="Institution that enables this project.">
                    Copyrights
                    <span class="glyphicon glyphicon-copyright-mark" aria-hidden="true"></span>