intersection.py 3.53 KB
from gargantext.models     import Node, Ngram, NodeNgram, NodeNgramNgram, \
                                  HyperdataKey

from gargantext.util.db    import session, aliased, bulk_insert, func

from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from gargantext.util.http  import JsonHttpResponse
from sqlalchemy            import desc, asc, or_, and_, func

import datetime
import ast
import networkx as nx


def doc_freq(corpus_id, node_ids):
    '''
    doc_freq :: Corpus_id -> [Ngram_id] -> [(Ngram_id, Int)]
    Given a corpus, count for each requested ngram the NodeNgram rows that
    link it (with a strictly positive weight) to a DOCUMENT node whose parent
    is the corpus.

    Note: despite its name, `node_ids` is matched against NodeNgram.ngram_id,
    so it restricts the ngrams that are counted.
    Presumably there is one NodeNgram row per (document, ngram) pair, which
    makes the count a document frequency -- TODO confirm against the schema.
    '''
    occurrences = func.count(NodeNgram.node_id)

    query = session.query(NodeNgram.ngram_id, occurrences)
    query = query.join(Node, NodeNgram.node_id == Node.id)

    # Only the documents that are direct children of the corpus.
    query = query.filter(Node.parent_id == corpus_id, Node.typename == 'DOCUMENT')

    # Only actual occurrences of the requested ngrams.
    query = query.filter(NodeNgram.weight > 0, NodeNgram.ngram_id.in_(node_ids))

    query = query.group_by(NodeNgram.ngram_id)
    return query.all()

def doc_ngram_representativity(corpus_id, node_ids):
    '''
    doc_ngram_representativity :: Corpus_id -> [Ngram_id] -> Dict Ngram_id Float
    Given a corpus, compute for each requested ngram its count returned by
    doc_freq divided by the number of DOCUMENT nodes of the corpus.

    Ngrams for which doc_freq returns no row are absent from the result.
    '''
    document_query = session.query(Node).filter(
        Node.parent_id == corpus_id,
        Node.typename == 'DOCUMENT',
    )
    total_documents = document_query.count()

    return {
        ngram_id: frequency / total_documents
        for ngram_id, frequency in doc_freq(corpus_id, node_ids)
    }

def compare_corpora(Corpus_id_A, Corpus_id_B, node_ids):
    '''
    compare_corpora :: Corpus_id -> Corpus_id -> [Ngram_id] -> Dict Ngram_id Float
    Given two corpora and the ngram ids to consider:
        - if both ids designate the same corpus, it returns the result of
          doc_freq for that corpus as a dict (ngram id as key).
        - if the corpora are different, it returns, for each ngram found in
          corpus A:
            doc_ngram_representativity(Corpus_id_B) / doc_ngram_representativity(Corpus_id_A)
          Ngrams found in A but not in B receive the smallest ratio observed
          among the shared ngrams, or 0.0 when the corpora share no ngram.
          Ngrams found only in B are not part of the result.

    NOTE(review): the ratio is computed as B over A; confirm this is the
    intended direction for the consumers of this function.
    '''
    if int(Corpus_id_A) == int(Corpus_id_B):
        return { ngram_id: count
                 for ngram_id, count in doc_freq(Corpus_id_A, node_ids) }

    data_A = doc_ngram_representativity(Corpus_id_A, node_ids)
    data_B = doc_ngram_representativity(Corpus_id_B, node_ids)

    result       = dict()
    missing_in_B = list()

    for ngram_id, share_A in data_A.items():
        if ngram_id in data_B:
            result[ngram_id] = data_B[ngram_id] / share_A
        else:
            missing_in_B.append(ngram_id)

    # Without a default, min() raises ValueError when no ngram is shared
    # (or when corpus A has no ngram at all); fall back to 0.0 in that case.
    lowest_ratio = min(result.values(), default=0.0)

    for ngram_id in missing_in_B:
        result[ngram_id] = lowest_ratio

    return result

def intersection(request, corpuses_ids, measure='cooc'):
    '''
    intersection :: (str(Int) + "a" + str(Int)) -> Dict(Ngram.id :: Int, Score)
    intersection = returns as Json Http Response the comparison of two
    corpora (see compare_corpora), restricted to the given ngram ids.

    request      : must be a POST carrying a non-empty "nodeids" field, a
                   Python literal sequence parsed with ast.literal_eval and
                   whose items are converted to int.
    corpuses_ids : the two corpus ids joined by the letter "a"
                   (for instance "12a34"); the second one is the corpus to
                   compare with.
    measure      : not used by this function.

    Any other request gets an empty JSON mapping instead of an implicit None.
    Malformed "nodeids" or corpus ids propagate the parsing exception, and
    fewer than two corpus ids raise IndexError.
    '''
    if ( request.method != 'POST'
         or "nodeids" not in request.POST
         or not request.POST["nodeids"] ):
        # NOTE(review): assumes this function is routed as a view, where
        # returning None is an error; an error status may be preferable if
        # JsonHttpResponse supports one -- confirm.
        return JsonHttpResponse({})

    # Here are the visible nodes of the initial semantic map.
    node_ids = [int(i) for i in ast.literal_eval(request.POST["nodeids"])]

    # corpus_ids[1] will be the corpus to compare
    corpus_ids = [int(i) for i in corpuses_ids.split('a')]

    return JsonHttpResponse(compare_corpora(corpus_ids[0], corpus_ids[1], node_ids))