from gargantext.models     import Node, Ngram, NodeNgram, NodeNgramNgram, \
                                  NodeHyperdata, HyperdataKey
from gargantext.util.db    import session, aliased, bulk_insert, func

from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations

from sqlalchemy            import desc, asc, or_, and_

#import inspect
import datetime

from celery               import shared_task

def filterMatrix(matrix, mapList_id, groupList_id):
    '''
    Restrict a cooccurrence matrix using a map list and a group list.

    matrix           :: the matrix to filter (countCooccurrences passes a
                        WeightedMatrix)
    mapList_id       :: id handed to UnweightedList to load the map list
    groupList_id     :: id handed to Translations to load the groupings

    Returns `matrix & (map list * groupings)`. The exact meaning of `*` and
    `&` is defined by the classes of gargantext.util.lists.
    '''
    map_terms = UnweightedList(mapList_id)
    groupings = Translations(groupList_id)
    return matrix & (map_terms * groupings)


# Celery task entry point.
@shared_task
def countCooccurrences(corpus_id=None, test=False,
                       field1='ngrams', field2='ngrams',
                       start=None, end=None,
                       mapList_id=None, groupList_id=None,
                       n_min=1, n_max=None, limit=1000,
                       coocNode_id=None, reset=True,
                       isMonopartite=True, threshold=3,
                       save_on_db=False,
                       ):
    '''
    Compute the cooccurrence matrix of the documents of a corpus, filter it
    with the map list and the group list, and return the filtered matrix.
    The matrix and the parameters of the computation are written to the
    cooccurrence node only when `save_on_db` is true.

    For the moment lists of parameters are not supported, because the lists
    need to be merged before.

    corpus_id        :: Int  -- id of the corpus Node
    test             :: Bool -- currently unused
    field1           :: Str  -- stored in the node parameters; in the
                                bipartite case also used as hyperdata name
    field2           :: Str  -- only stored in the node parameters
    start            :: date string in the format 'YYYY-MM-DD'; keeps the
                        documents whose publication_date is >= start
    end              :: date string in the format 'YYYY-MM-DD'; keeps the
                        documents whose publication_date is <= end
    mapList_id       :: Int
    groupList_id     :: Int
    n_min            :: Int  -- minimal size `n` of the ngrams (inclusive)
    n_max            :: Int  -- maximal size `n` of the ngrams (inclusive)
    limit            :: Int  -- currently unused
    coocNode_id      :: Int  -- node receiving the matrix; when missing, an
                                existing node is looked up or a new one is
                                created under the corpus
    reset            :: Bool -- delete the NodeNgramNgram rows already
                                attached to the cooccurrence node
    isMonopartite    :: Bool -- ngram x ngram cooccurrences when true
    threshold        :: Int  -- keep only the pairs whose score is strictly
                                greater than this value
    save_on_db       :: Bool -- when false, just return the matrix
                                (do not write it to the DB)
    '''
    # TODO : add hyperdata here

    # Security test: force the field names to plain strings
    field1, field2 = str(field1), str(field2)

    # Parameters to save in hyperdata of the Node Cooc
    parameters = {'field1': field1, 'field2': field2}

    # Get corpus as Python object
    corpus = session.query(Node).filter(Node.id == corpus_id).first()

    # Get the cooccurrence node: reuse an existing one or create it
    if not coocNode_id:
        # NOTE(review): the lookup searches for the name "GRAPH EXPLORER"
        # while new nodes are created as "GRAPH (in corpus <id>)", so a node
        # created below is not found by this lookup -- confirm the intent.
        existing = (session.query(Node.id)
                           .filter(Node.typename == "COOCCURRENCES",
                                   Node.name == "GRAPH EXPLORER",
                                   Node.parent_id == corpus.id)
                           .first()
                   )
        if existing:
            # The row holds a single column: Node.id
            coocNode_id = existing[0]
        else:
            coocNode = corpus.add_child(
                typename="COOCCURRENCES",
                name="GRAPH (in corpus %s)" % corpus.id
            )
            session.add(coocNode)
            session.commit()
            coocNode_id = coocNode.id

    # The comparison with True is kept on purpose: truthy values that are
    # not equal to True (e.g. a non-empty string) must not trigger a delete.
    if reset == True:
        (session.query(NodeNgramNgram)
                .filter(NodeNgramNgram.node_id == coocNode_id)
                .delete()
        )
        session.commit()

    NodeNgramX = aliased(NodeNgram)

    # Simple Cooccurrences
    cooc_score = func.count(NodeNgramX.node_id).label('cooc_score')

    # A kind of Euclidean distance cooccurrences
    #cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')

    NodeNgramY = aliased(NodeNgram)

    if isMonopartite:
        # Pairs of ngrams attached to the same document of the corpus
        cooc_query = (session.query(NodeNgramX.ngram_id,
                                    NodeNgramY.ngram_id,
                                    cooc_score)
                             .join(Node, Node.id == NodeNgramX.node_id)
                             .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                             .filter(Node.parent_id == corpus.id,
                                     Node.typename == "DOCUMENT")
                     )
    else:
        # NOTE(review): NodeHyperdataNgram and Hyperdata are not among the
        # imports visible at the top of this file, and NodeNgramX (used by
        # cooc_score) is not joined here -- this branch needs to be checked
        # before it is used.
        cooc_query = (session.query(NodeHyperdataNgram.ngram_id,
                                    NodeNgramY.ngram_id,
                                    cooc_score)
                             .join(Node, Node.id == NodeHyperdataNgram.node_id)
                             .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                             .join(Hyperdata,
                                   Hyperdata.id == NodeHyperdataNgram.hyperdata_id)
                             .filter(Node.parent_id == corpus.id,
                                     Node.typename == "DOCUMENT")
                             .filter(Hyperdata.name == field1)
                     )

    # Size of the ngrams between n_min and n_max: join the Ngram table only
    # when at least one of the bounds has to be checked
    if n_min is not None or n_max is not None:
        if isMonopartite:
            NgramX = aliased(Ngram)
            cooc_query = cooc_query.join(NgramX,
                                         NgramX.id == NodeNgramX.ngram_id)

        NgramY = aliased(Ngram)
        cooc_query = cooc_query.join(NgramY,
                                     NgramY.id == NodeNgramY.ngram_id)

    if n_min is not None:
        cooc_query = cooc_query.filter(NgramY.n >= n_min)
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n >= n_min)

    if n_max is not None:
        # Upper bound: must compare with n_max (not with n_min)
        cooc_query = cooc_query.filter(NgramY.n <= n_max)
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n <= n_max)

    # Cooc between the dates start and end
    if start is not None:
        #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        # TODO : more complex date format here.
        date_start = datetime.datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join(Start, Start.node_id == Node.id)
                                .filter(Start.key == 'publication_date')
                                .filter(Start.value_utc >= date_start_utc)
                     )

        parameters['start'] = date_start_utc

    if end is not None:
        # TODO : more complex date format here.
        date_end = datetime.datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join(End, End.node_id == Node.id)
                                .filter(End.key == 'publication_date')
                                .filter(End.value_utc <= date_end_utc)
                     )

        parameters['end'] = date_end_utc

    if isMonopartite:
        # Cooc is symmetric: keep each pair once (X < Y), which also drops
        # the pairs of an ngram with itself
        cooc_query = cooc_query.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)

    # Keep only the pairs whose score is strictly above the threshold
    cooc_query = cooc_query.having(cooc_score > threshold)

    if isMonopartite:
        cooc_query = cooc_query.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
    else:
        cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id)

    # Order according some scores
    # If ordering is really needed, use Ordered Index (faster)
    #cooc_query = cooc_query.order_by(desc('cooc_score'))

    matrix = WeightedMatrix(cooc_query)
    cooc = filterMatrix(matrix, mapList_id, groupList_id)

    parameters['MapList_id'] = str(mapList_id)
    parameters['GroupList_id'] = str(groupList_id)

    if save_on_db:
        # Saving cooc Matrix
        cooc.save(coocNode_id)

        # Saving the parameters
        coocNode = session.query(Node).filter(Node.id == coocNode_id).first()
        coocNode.hyperdata = parameters
        session.add(coocNode)
        session.commit()

        # Log message
        print("Cooccurrence Matrix saved")

    return cooc