# graph.py

# Gargantext lib
from gargantext.util.db           import session, aliased
from gargantext.util.lists        import WeightedMatrix, UnweightedList, Translations
from gargantext.util.http         import JsonHttpResponse
from gargantext.models            import Node, Ngram, NodeNgram, NodeNgramNgram, NodeHyperdata

from graph.cooccurrences          import countCooccurrences
from graph.distances              import clusterByDistances
from graph.bridgeness             import filterByBridgeness
from graph.mail_notification      import notify_owner
from graph.growth                 import compute_growth

from gargantext.util.scheduling   import scheduled
from gargantext.constants         import graph_constraints

from celery                       import shared_task
from datetime                     import datetime


@shared_task
def compute_graph( corpus_id=None      , cooc_id=None
                , field1='ngrams'     , field2='ngrams'
                , start=None          , end=None
                , mapList_id=None     , groupList_id=None
                , distance=None       , bridgeness=None
                , n_min=1, n_max=None , limit=1000
                , isMonopartite=True  , threshold = 3
                , save_on_db= True    , reset=True
                ) :
    '''
    Compute a graph, save it in the hyperdata of its cooccurrence node
    and return it.

    Steps:

    1) count Cooccurrences  (function countCooccurrences)
            main parameters: threshold, isMonopartite

    2) filter and cluster By Distances (function clusterByDistances)
            main parameter: distance
            TODO option clustering='louvain'
                 or 'percolation' or 'random walk' or ...

    3) filter By Bridgeness (function filterByBridgeness)
            main parameter: bridgeness
            the result is requested in "node_link" format

    4) when both start and end are given, add a 'growth' attribute to
       every node of the graph (function compute_growth)

    5) save the graph in node.hyperdata[distance][str(bridgeness)], next
       to the "nodes" and "edges" counts of the clustered graph

    Parameters:
        corpus_id, cooc_id       : ids forwarded to countCooccurrences;
                                   cooc_id is replaced by the id it returns
        field1, field2           : forwarded to countCooccurrences and
                                   filterByBridgeness
        start, end               : optional period, forwarded to
                                   countCooccurrences and compute_growth
        mapList_id, groupList_id : list ids forwarded to countCooccurrences
                                   and compute_growth
        distance, bridgeness     : also used as keys of the saved hyperdata
        threshold, reset         : forwarded to countCooccurrences
        n_min, n_max, limit      : accepted but not used by this function
        isMonopartite, save_on_db: accepted but not forwarded, the
                                   cooccurrences are always counted with
                                   isMonopartite=True and save_on_db=True

    Returns the data produced by filterByBridgeness.
    '''

    print("GRAPH # ... Computing cooccurrences.")
    (cooc_id, cooc_matrix) = countCooccurrences( corpus_id=corpus_id, cooc_id=cooc_id
                                , field1=field1, field2=field2
                                , start=start           , end =end
                                , mapList_id=mapList_id , groupList_id=groupList_id
                                , isMonopartite=True    , threshold = threshold
                                , distance=distance     , bridgeness=bridgeness
                                , save_on_db = True     , reset = reset
                                )
    print("GRAPH #%d ... Cooccurrences computed." % (cooc_id))


    print("GRAPH #%d ... Clustering with %s distance." % (cooc_id,distance))
    G, partition, ids, weight = clusterByDistances ( cooc_matrix
                                                   , field1="ngrams", field2="ngrams"
                                                   , distance=distance
                                                   )

    # %s (not %d): bridgeness defaults to None and may also be given as a string
    print("GRAPH #%d ... Filtering by bridgeness %s." % (cooc_id, bridgeness))
    data = filterByBridgeness(G,partition,ids,weight,bridgeness,"node_link",field1,field2)

    # data can be empty when no cooccurrence was observed: there is then
    # no node to decorate with a growth score
    if data and start is not None and end is not None:
        # +100 for the normalization: the score should not be negative
        growth = { ng_id: float(score) + 100
                   for (ng_id, score)
                   in compute_growth(corpus_id, groupList_id, mapList_id, start, end)
                 }

        # NOTE(review): assumes compute_growth yields a score for every
        # node id of the graph (KeyError otherwise) -- to be confirmed
        for graph_node in data['nodes']:
            graph_node['attributes']['growth'] = growth[graph_node['id']]

    print("GRAPH #%d ... Saving Graph in hyperdata as json." % cooc_id)
    cooc_node = session.query(Node).filter(Node.id == cooc_id).first()

    if cooc_node.hyperdata.get(distance, None) is None:
        print("GRAPH #%d ... Distance %s has not been computed already." % (cooc_id, distance))
        cooc_node.hyperdata[distance] = dict()

    # The key is a string, as this is how get_graph looks the graph up
    # (graph.get(str(bridgeness)))
    cooc_node.hyperdata[distance][str(bridgeness)] = data

    cooc_node.hyperdata[distance]["nodes"]    = len(G.nodes())
    cooc_node.hyperdata[distance]["edges"]    = len(G.edges())

    cooc_node.save_hyperdata()
    session.commit()

    print("GRAPH #%d ... Notify by email owner of the graph." % cooc_id)
    corpus = session.query(Node).filter(Node.id==corpus_id).first()
    # The notification itself is currently disabled:
    #notify_owner(corpus, cooc_id, distance, bridgeness)

    print("GRAPH #%d ... Returning data as json." % cooc_id)
    return data

def get_graph( request=None         , corpus=None
            , field1='ngrams'       , field2='ngrams'
            , mapList_id = None     , groupList_id = None
            , cooc_id=None          , type='node_link'
            , start=None            , end=None
            , distance='conditional', bridgeness=5
            , threshold=1           , isMonopartite=True
            , saveOnly=True
            ) :
    '''
    Get_graph : main steps:
    0) Check the parameters

    get_graph :: GraphParameters -> Either (Dic Nodes Links) (Dic State Length)
        where type Length = Int

    get_graph first checks the parameters and returns either graph data or
    a dict with a "state" key:
        - "mapListError" (+ "length"): the map list is missing or smaller
          than graph_constraints['mapList']
        - "saveOnly" (+ "target_id", "target_name", "target_date"):
          compute_graph has been scheduled, the keys describe the node
          that will hold the graph
        - "corpusMax" (+ "length"): the corpus is bigger than
          graph_constraints['corpusMax'], compute_graph has been scheduled
        - "corpusMin" (+ "length"): the corpus is not bigger than
          graph_constraints['corpusMin'], nothing is computed
    (maybe we could add a String in that step to factor and give here the
    error message)

    1) compute_graph (see function above)
    2) return graph

    When cooc_id is given and the graph for this distance and bridgeness is
    already in the node hyperdata, it is returned directly.

    NB: the asynchronous "saveOnly" answer is only given when saveOnly is
    the string "True"; any other truthy value (including the default True)
    only triggers the creation of the empty node when cooc_id is None.

    request, field1, field2, type and isMonopartite are accepted but not
    used by this function.
    '''
    overwrite_node_contents = False

    # Description of the target node, only returned in the "saveOnly" state
    cooc_name = None
    cooc_date = None

    # Case of graph has been computed already
    if cooc_id is not None:
        print("GRAPH#%d ... Loading data already computed." % int(cooc_id))
        node = session.query(Node).filter(Node.id == cooc_id).first()

        # Structure of the Node.hyperdata[distance][bridgeness]
        # All parameters (but distance and bridgeness)
        # are in Node.hyperdata["parameters"]

        # Check distance, then bridgeness of the graph
        graph = node.hyperdata.get(distance, None)
        if graph is not None:
            cached = graph.get(str(bridgeness), None)
            if cached is not None:
                return cached

        # The graph has to be (re)computed in the existing node: remember
        # its name and date, they are needed by the "saveOnly" answer below
        cooc_name = node.name
        cooc_date = node.date

    # new graph: we give it an empty node with new id and status
    elif saveOnly:
        # NB: we do creation already here (instead of same in countCooccurrences)
        #     to guarantee a unique ref id to the saveOnly graph (async generation)
        new_node = corpus.add_child(
                            typename  = "COOCCURRENCES",
                            name = "GRAPH (in corpus %s)" % corpus.id
                            )

        session.add(new_node)
        session.commit()
        cooc_id = new_node.id
        cooc_name = new_node.name
        cooc_date = new_node.date
        # and the empty content will need redoing by countCooccurrences
        overwrite_node_contents = True
        print("GRAPH #%d ... Created new empty data node for saveOnly" % int(cooc_id))


    # Case of graph has not been computed already
    # First, check the parameters

    # Case of mapList not big enough
    # ==============================

    # if we do not have any mapList_id already
    if mapList_id is None:
        # Only a map list of this corpus is relevant
        mapList_row = ( session.query(Node.id)
                               .filter(Node.typename == "MAPLIST")
                               .filter(Node.parent_id == corpus.id)
                               .first()
                      )
        if mapList_row is None:
            # No map list at all: same answer as a map list which is too small
            return {'state': "mapListError", "length" : 0}
        mapList_id = mapList_row[0]

    mapList_size = session.query(NodeNgram).filter(NodeNgram.node_id == mapList_id).count()

    if mapList_size < graph_constraints['mapList']:
        # Do not compute the graph if mapList is not big enough
        return {'state': "mapListError", "length" : mapList_size}


    # Instantiate query for case of corpus not big enough
    # ===================================================
    corpus_size_query = (session.query(Node)
                                .filter(Node.typename=="DOCUMENT")
                                .filter(Node.parent_id == corpus.id)
                        )

    # Filter corpus by date if any start date
    # ---------------------------------------
    if start is not None:
        date_start = datetime.strptime (str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start=aliased(NodeHyperdata)
        corpus_size_query = (corpus_size_query.join( Start
                                     , Start.node_id == Node.id
                                     )
                                .filter( Start.key == 'publication_date')
                                .filter( Start.value_utc >= date_start_utc)
                            )


    # Filter corpus by date if any end date
    # -------------------------------------
    if end is not None:
        date_end = datetime.strptime (str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End=aliased(NodeHyperdata)

        corpus_size_query = (corpus_size_query.join( End
                                     , End.node_id == Node.id
                                     )
                                .filter( End.key == 'publication_date')
                                .filter( End.value_utc <= date_end_utc )
                            )

    # Finally test if the size of the corpora is big enough
    # --------------------------------
    corpus_size = corpus_size_query.count()

    # Same arguments whether the graph is computed now or scheduled
    compute_args = dict( corpus_id=corpus.id, cooc_id=cooc_id
                      #, field1="ngrams", field2="ngrams"
                       , start=start           , end =end
                       , mapList_id=mapList_id , groupList_id=groupList_id
                       , isMonopartite=True    , threshold = threshold
                       , distance=distance     , bridgeness=bridgeness
                       , save_on_db = True     , reset=overwrite_node_contents
                      #, limit=size
                       )

    if saveOnly == "True":
        scheduled(compute_graph)(**compute_args)

        return { "state"      : "saveOnly"
               , "target_id"  : cooc_id
               , "target_name": cooc_name
               , "target_date": cooc_date
               }

    if corpus_size > graph_constraints['corpusMax']:
        # Then compute cooc asynchronously with celery
        scheduled(compute_graph)(**compute_args)
        # Dict to inform user that corpus maximum is reached
        # then graph is computed asynchronously
        return {"state" : "corpusMax", "length" : corpus_size}

    if corpus_size <= graph_constraints['corpusMin']:
        # Do not compute the graph if corpus is not big enough
        return {"state" : "corpusMin", "length" : corpus_size}

    # If graph_constraints are ok then compute the graph in live
    data = compute_graph(**compute_args)

    # case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
    if not data:
        print("GRAPH #   ... GET_GRAPH: 0 coocs in matrix")
        data = {'nodes':[], 'links':[]}  # empty data

    return data