Commit fb87c813 authored by delanoe's avatar delanoe

[FEAT] Growth on graph

       First select a period on Documents view.

       Then compute a graph by choosing one distance (distributional or
       conditional).

       Then in Graph Explorer view (TinaWebJS), user can get colors of
       node with growth attributes.

       Red nodes are increasing.
       Yellow nodes are decreasing.

       Growth computed with previous period which is equal to the
       selected period in duration (but before).

       TODO: run tests with users to validate that the previous period
       should be a period equal in duration to the selected one, not the
       whole corpus.
parent 0b6f1d50
......@@ -173,3 +173,4 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None, do_stat
cursor.close()
......@@ -3,9 +3,9 @@ COOCS
(this is the full SQL version, should be more reliable on outerjoin)
"""
from gargantext import settings
from sqlalchemy import create_engine, exc
from sqlalchemy import exc
from gargantext.util.lists import WeightedMatrix
# from gargantext.util.db import session, aliased, func
from gargantext.util.db import get_engine
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD, NODETYPES
from gargantext.constants import INDEXED_HYPERDATA
......@@ -64,12 +64,7 @@ def compute_coocs( corpus,
"""
# 1) prepare direct connection to the DB
url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format(
**settings.DATABASES['default']
)
engine = create_engine( url )
connection = engine.connect()
connection = get_engine().connect()
# string vars for our SQL query
# setting work memory high to improve cache perf.
......
......@@ -8,6 +8,7 @@ from graph.cooccurrences import countCooccurrences
from graph.distances import clusterByDistances
from graph.bridgeness import filterByBridgeness
from graph.mail_notification import notify_owner
from graph.growth import compute_growth
from gargantext.util.scheduling import scheduled
from gargantext.constants import graph_constraints
......@@ -64,7 +65,15 @@ def compute_graph( corpus_id=None , cooc_id=None
print("GRAPH #%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
data = filterByBridgeness(G,partition,ids,weight,bridgeness,"node_link",field1,field2)
if start is not None and end is not None:
growth= dict()
for (ng_id, score) in compute_growth(corpus_id, groupList_id, mapList_id, start, end):
growth[ng_id] = float(score) + 100 # for the normalization, should not be negativ
for node in data['nodes']:
node['attributes']['growth'] = growth[node['id']]
print("GRAPH #%d ... Saving Graph in hyperdata as json." % cooc_id)
node = session.query(Node).filter(Node.id == cooc_id).first()
......@@ -187,7 +196,7 @@ def get_graph( request=None , corpus=None
)
.filter( Start.key == 'publication_date')
.filter( Start.value_utc >= date_start_utc)
)
)
# Filter corpus by date if any end date
......@@ -203,8 +212,7 @@ def get_graph( request=None , corpus=None
)
.filter( End.key == 'publication_date')
.filter( End.value_utc <= date_end_utc )
)
)
# Finally test if the size of the corpora is big enough
# --------------------------------
......@@ -221,10 +229,11 @@ def get_graph( request=None , corpus=None
#, limit=size
)
return {"state" : "saveOnly",
"target_id" : cooc_id,
"target_name": cooc_name,
"target_date": cooc_date}
return { "state" : "saveOnly"
, "target_id" : cooc_id
, "target_name": cooc_name
, "target_date": cooc_date
}
elif corpus_size > graph_constraints['corpusMax']:
# Then compute cooc asynchronously with celery
......@@ -262,5 +271,5 @@ def get_graph( request=None , corpus=None
if len(data) == 0:
print("GRAPH # ... GET_GRAPH: 0 coocs in matrix")
data = {'nodes':[], 'links':[]} # empty data
return data
"""
Computes ngram growth on periods
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db_cache import cache
from gargantext.util.db import session, bulk_insert, aliased, \
func, get_engine # = sqlalchemy.func like sum() or count()
from datetime import datetime
def timeframes(start, end):
"""
timeframes :: String -> String -> (UTCTime, UTCTime, UTCTime)
"""
start = datetime.strptime (str(start), "%Y-%m-%d")
end = datetime.strptime (str(end), "%Y-%m-%d")
date_0 = start - (end - start)
date_1 = start
date_2 = end
return (date_0, date_1, date_2)
def compute_growth(corpus_id, groupList_id, mapList_id, start, end):
    """
    compute_graph :: Int -> UTCTime -> UTCTime -> Int -> Int
                  -> [(Int, Numeric)]

    This function uses the SQL function OCC_HIST defined in
        /srv/gargantext/install/gargamelle/sqlFunctions.sql

    First computes occurrences of the ngrams of mapList (with their
    groups) on the previous period, then on the selected period, and
    returns the growth per ngram.
    Directly computed with the Postgres database (C) for optimization.

    Parameters:
        corpus_id    -- id of the corpus Node
        groupList_id -- id of the group-list Node (synonym groups)
        mapList_id   -- id of the map-list Node (ngrams to score)
        start, end   -- "%Y-%m-%d" strings bounding the selected period

    Returns the DB result rows: an iterable of (ngram_id, score).
    """
    connection = get_engine()

    (date_0, date_1, date_2) = timeframes(start, end)

    # Parameterized query: let the driver quote/escape the values
    # instead of interpolating them into the SQL text with str.format
    # (which is fragile and open to SQL injection).
    query = "SELECT * FROM OCC_HIST(%s, %s, %s, %s, %s, %s)"

    return connection.execute(query, (corpus_id, groupList_id, mapList_id,
                                      date_0, date_1, date_2))
......@@ -19,6 +19,8 @@ def compress_graph(graphdata):
for node in graphdata['nodes']:
node['lb'] = node['label']
del node['label']
#node['attributes']['growth'] = 0.8
node['at'] = node['attributes']
del node['attributes']
......
-- CNRS Copyrights 2017
-- See Gargantext Licence for details
-- Maintainers: team@gargantext.org
-- USAGE
-- psql gargandb < occ_growth.sql
-- OCC_HIST :: Corpus.id -> GroupList.id -> MapList.id -> Start -> EndFirst -> EndLast
-- EXEMPLE USAGE
-- SELECT * FROM OCC_HIST(182856, 183859, 183866, '1800-03-15 17:00:00+01', '2000-03-15 17:00:00+01', '2017-03-15 17:00:00+01')
-- OCC_HIST_PART :: Corpus.id -> GroupList.id -> Start -> End
DROP FUNCTION OCC_HIST_PART(integer, integer, timestamp without time zone, timestamp without time zone);
-- DROP for tests
-- NOTE(review): DROP has no IF EXISTS, so it errors when the function is
-- absent (e.g. on a fresh database) — confirm this file is only replayed
-- on databases where the function already exists.
-- OCC_HIST_PART($1 corpus_id, $2 grouplist_id, $3 start, $4 end):
-- one row per (group-resolved) ngram with the sum of its weights over the
-- documents of corpus $1 published within the half-open interval ]$3, $4].
CREATE OR REPLACE FUNCTION OCC_HIST_PART(int, int, timestamp, timestamp) RETURNS TABLE (ng_id int, score float8)
AS $$
-- EXPLAIN ANALYZE
SELECT
-- group head (ngram1_id) when the ngram belongs to a group of
-- grouplist $2, else the ngram itself
COALESCE(gr.ngram1_id, ng1.ngram_id) as ng_id,
SUM(ng1.weight) as score
from nodes n
-- BEFORE
-- NOTE(review): the self-join below (n1.id = n.id) makes n1 identical to
-- n — presumably a leftover from an earlier two-period version; it does
-- not change the result.
INNER JOIN nodes as n1 ON n1.id = n.id
INNER JOIN nodes_ngrams ng1 ON ng1.node_id = n1.id
-- Limit with timestamps: ]start, end]
INNER JOIN nodes_hyperdata nh1 ON nh1.node_id = n1.id
AND nh1.value_utc > $3
AND nh1.value_utc <= $4
-- Group List
LEFT JOIN nodes_ngrams_ngrams gr ON ng1.ngram_id = gr.ngram2_id
AND gr.node_id = $2
WHERE
-- typename = 4: presumably DOCUMENT nodes — confirm against NODETYPES
n.typename = 4
AND n.parent_id = $1
GROUP BY 1
$$
LANGUAGE SQL;
DROP FUNCTION OCC_HIST(integer, integer, integer, timestamp without time zone, timestamp without time zone, timestamp without time zone);
-- OCC_HIST :: Corpus.id -> GroupList.id -> MapList.id -> Start -> EndFirst -> EndLast
-- Growth score per maplist ngram: occurrences are summed on the two
-- consecutive periods ]$4, $5] and ]$5, $6] via OCC_HIST_PART, then
-- combined into 100 * (occ2 - occ1) / (occ2 + occ1), rounded to 2
-- decimals; ngrams absent from both periods score 0.
CREATE OR REPLACE FUNCTION OCC_HIST(int, int, int, timestamp, timestamp, timestamp) RETURNS TABLE (ng_id int, score numeric)
AS $$
-- occurrences on the previous period ]$4, $5]
WITH OCC1 as (SELECT * from OCC_HIST_PART($1, $2, $4, $5))
-- occurrences on the selected period ]$5, $6]
, OCC2 as (SELECT * from OCC_HIST_PART($1, $2, $5, $6))
-- keep only the ngrams of maplist $3; a score stays NULL when the ngram
-- did not occur in that period
, GROWTH as (SELECT ml.ngram_id as ngram_id
-- NOTE(review): COALESCE(x, null) is a no-op — presumably a placeholder
-- where a default value was once intended; the NULLs are handled by the
-- outer COALESCE below.
, COALESCE(OCC1.score, null) as score1
, COALESCE(OCC2.score, null) as score2
FROM nodes_ngrams ml
LEFT JOIN OCC1 ON OCC1.ng_id = ml.ngram_id
LEFT JOIN OCC2 ON OCC2.ng_id = ml.ngram_id
WHERE ml.node_id = $3
ORDER by score2 DESC)
-- relative growth in percent; any NULL operand makes the expression NULL,
-- which the outer COALESCE maps to 0
SELECT ngram_id, COALESCE(ROUND(CAST((100 * (score2 - score1) / COALESCE((score2 + score1), 1)) as numeric), 2), 0) from GROWTH
$$
LANGUAGE SQL;
-- BEHAVIORAL TEST (should be equal to occ in terms table)
-- WITH OCC as (SELECT * from OCC_HIST(182856, 183859, '1800-03-15 17:00:00+01', '2300-03-15 17:00:00+01'))
-- SELECT ng_id, score from OCC
-- INNER JOIN nodes_ngrams ml on ml.ngram_id = ng_id
-- AND ml.node_id = 183866
-- ORDER BY score DESC;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment