graph.py
# Gargantext lib
from gargantext.util.db           import session, aliased
from gargantext.util.lists        import WeightedMatrix, UnweightedList, Translations
from gargantext.util.http         import JsonHttpResponse
from gargantext.models            import Node, Ngram, NodeNgram, NodeNgramNgram, NodeHyperdata

#from gargantext.util.toolchain.ngram_coocs import compute_coocs
from graph.cooccurrences  import countCooccurrences, filterMatrix
from graph.distances      import clusterByDistances
from graph.bridgeness     import filterByBridgeness

from gargantext.util.scheduling import scheduled
from gargantext.constants import graph_constraints

from datetime import datetime

def get_graph( request=None         , corpus=None
            , field1='ngrams'       , field2='ngrams'
            , mapList_id = None     , groupList_id = None
            , cooc_id=None          , type='node_link'
            , start=None            , end=None
            , threshold=1
            , distance='conditional'
            , isMonopartite=True                # by default, we compute a terms/terms graph
            , bridgeness=5
            , saveOnly=None
            #, size=1000
        ):
    '''
    get_graph: main steps:
    0) Check the parameters
    
    get_graph :: GraphParameters -> Either (Dic Nodes Links) (Dic State Length)
        where type Length = Int

    get_graph first checks the parameters and returns either graph data or a dict
    with a "state" key plus an integer indicating the size of the offending
    parameter (we could later add a String there to factor out the error message)
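
    For example (shapes matching the returns below), a state/error result looks
    like {'state': 'corpusMin', 'length': 150}, while graph data (node_link
    format) looks like {'nodes': [...], 'links': [...]}.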

    1) count Cooccurrences  (function countCooccurrences)
            main parameters: threshold

    2) filter and cluster By Distances (function clusterByDistances)
            main parameter: distance

    3) filter By Bridgeness (function filterByBridgeness)
            main parameter: bridgeness

    4) format the graph     (formatGraph)
            main parameter: format_

    '''


    before_cooc = datetime.now()
    # case where cooccurrences have not been computed yet
    if cooc_id is None:

        # case of mapList not big enough
        # ==============================
        # if we do not have any mapList_id yet
        if mapList_id is None:
            mapList_id = session.query(Node.id).filter(Node.typename == "MAPLIST").first()[0]
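            # NB: falls back to the first MAPLIST node found in the whole database;
            # assumes at least one exists (.first() returns None otherwise, and
            # the [0] subscript would then raise a TypeError)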

        mapList_size_query = session.query(NodeNgram).filter(NodeNgram.node_id == mapList_id)
        mapList_size = mapList_size_query.count()
        if mapList_size < graph_constraints['mapList']:
            # Do not compute the graph if mapList is not big enough
            return {'state': "mapListError", "length" : mapList_size}
        # Build the query that will test whether the corpus is big enough
        # ===================================================
        corpus_size_query = (session.query(Node)
                                    .filter(Node.typename=="DOCUMENT")
                                    .filter(Node.parent_id == corpus.id)
                            )
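        # NB: .count() is only called further below, once the optional date
        # filters have been applied to this query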

        # filter by date if any start date
        # --------------------------------
        if start is not None:
            #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
            date_start = datetime.strptime (str(start), "%Y-%m-%d")
            date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

            Start=aliased(NodeHyperdata)
            corpus_size_query = (corpus_size_query.join( Start
                                         , Start.node_id == Node.id
                                         )
                                    .filter( Start.key == 'publication_date')
                                    .filter( Start.value_utc >= date_start_utc)
                          )


        # filter by date if any end date
        # --------------------------------
        if end is not None:
            date_end = datetime.strptime (str(end), "%Y-%m-%d")
            date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

            End=aliased(NodeHyperdata)

            corpus_size_query = (corpus_size_query.join( End
                                         , End.node_id == Node.id
                                         )
                                    .filter( End.key == 'publication_date')
                                    .filter( End.value_utc <= date_end_utc )
                          )
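        # (both date filters stack onto corpus_size_query, each through its own
        #  NodeHyperdata alias, so start and end can be combined)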
        # Finally, test whether the corpus is big enough
        # --------------------------------
        corpus_size = corpus_size_query.count()

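        # "saveOnly" mode: schedule the cooccurrence count in the background
        # (with celery, via `scheduled`) and return immediately, without
        # computing the graph in this request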
        if saveOnly == "True":
            scheduled(countCooccurrences)( corpus_id=corpus.id
                                       #, field1="ngrams", field2="ngrams"
                                        , start=start           , end =end
                                        , mapList_id=mapList_id , groupList_id=groupList_id
                                        , isMonopartite=True    , threshold = threshold
                                        , save_on_db = True
                                       #, limit=size
                                        )
            return {"state" : "saveOnly"}

        if corpus_size > graph_constraints['corpusMax']:
            # Then compute cooc asynchronously with celery
            scheduled(countCooccurrences)( corpus_id=corpus.id
                                       #, field1="ngrams", field2="ngrams"
                                        , start=start           , end =end
                                        , mapList_id=mapList_id , groupList_id=groupList_id
                                        , isMonopartite=True    , threshold = threshold
                                        , save_on_db = True
                                       #, limit=size
                                        )
            # Dict informing the user that the corpus maximum is reached;
            # the graph is then computed asynchronously
            return {"state" : "corpusMax", "length" : corpus_size}
        elif corpus_size <= graph_constraints['corpusMin']:
            # Do not compute the graph if corpus is not big enough
            return {"state" : "corpusMin", "length" : corpus_size}
        else:
            # If graph_constraints are ok then compute the graph live (synchronously)
            cooc_matrix = countCooccurrences( corpus_id=corpus.id
                                       #, field1="ngrams", field2="ngrams"
                                        , start=start           , end =end
                                        , mapList_id=mapList_id , groupList_id=groupList_id
                                        , isMonopartite=True    , threshold = threshold
                                        , save_on_db = True
                                       #, limit=size
                                        )
    else:
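        # a cooccurrence matrix was already computed: reload it by id and
        # refilter it against the current map and group lists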
        print("Getting data for matrix %d", int(cooc_id))
        matrix      = WeightedMatrix(int(cooc_id))
        #print(matrix)
        cooc_matrix = filterMatrix(matrix, mapList_id, groupList_id)


    # fyi
    after_cooc = datetime.now()
    print("... Cooccurrences took %f s." % (after_cooc - before_cooc).total_seconds())


    # case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
    if len(cooc_matrix.items) == 0:
        print("GET_GRAPH: 0 coocs in matrix")
        data = {'nodes':[], 'links':[]}  # empty data
    # normal case
    else:
        G, partition, ids, weight = clusterByDistances ( cooc_matrix
                                                       , field1="ngrams", field2="ngrams"
                                                       , distance=distance
                                                       )

        after_cluster = datetime.now()
        print("... Clustering took %f s." % (after_cluster - after_cooc).total_seconds())
        data = filterByBridgeness(G, partition, ids, weight, bridgeness, type, field1, field2)
        after_filter = datetime.now()
        print("... Filtering took %f s." % (after_filter - after_cluster).total_seconds())

    return data
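
# Hypothetical usage sketch (an assumption, not part of this module): a view
# could call get_graph() and wrap its result with the JsonHttpResponse imported
# above, e.g.:
#
#     def graph_view(request, corpus):
#         data = get_graph(request=request, corpus=corpus,
#                          distance='conditional', bridgeness=5)
#         return JsonHttpResponse(data)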