Commit b33f37eb authored by delanoe's avatar delanoe

[FEAT] Generic cooccurrence function with miam_id, stop_id, group_id.

parent 38556c56
from env import *
from admin.utils import PrintException
from gargantext_web.db import NodeNgram
from gargantext_web.db import *
from parsing.corpustools import *
import sqlalchemy
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
from sqlalchemy.sql import func
from gargantext_web.db import Node, NodeNgram, NodeNgramNgram, NodeNodeNgram, NodeHyperdata, Hyperdata
from gargantext_web.db import session, cache, get_or_create_node, bulk_insert
from analysis.lists import WeightedMatrix, UnweightedList, Translations
# from gargantext_web.db import Node, get_cursor
def cooc(corpus=None
        , miam_id=None, stop_id=None, group_id=None
        , start=None, end=None
        , limit=1000):
    '''
    Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id.

    For the moment lists of parameters are not supported because lists need to
    be merged before.

    corpus   :: Corpus
    miam_id  :: Int, Node.id of the Miam (main) list to keep
    stop_id  :: Int, Node.id of the Stop list to remove
    group_id :: Int, Node.id of the Group list (synonym pairs) to merge

    For the moment, start and end are simple; only year is implemented yet.
    start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end   :: TimeStamp
    limit :: Int, maximum number of (ngram_x, ngram_y) pairs kept
    '''
    # One Cooccurrence node per (corpus, miam list): created or reused.
    node_cooc = get_or_create_node(nodetype='Cooccurrence', corpus=corpus
        , name_str="Cooccurrences corpus " + str(corpus.id) + "list_id: " + str(miam_id)
        )

    # TODO : save parameters in Node
    # args, _, _, parameters = inspect.getargvalues(inspect.currentframe())
    # print(parameters)
    # for parameter in parameters.keys():
    #     print(parameters[parameter])
    #     node_cooc.hyperdata[parameter] = parameters[parameter]
    #
    # session.add(node_cooc)
    # session.commit()
    # print(node_cooc.hyperdata)

    # Recompute from scratch: drop any previous matrix stored on this node.
    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_cooc.id).delete()
    session.commit()

    # Self-join of NodeNgram on the same document gives cooccurring pairs.
    NodeNgramX = aliased(NodeNgram)
    NodeNgramY = aliased(NodeNgram)
    doc_id = cache.NodeType['Document'].id

    cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, func.count())
                  .join(Node, Node.id == NodeNgramX.node_id)
                  .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                  )

    # Optional lower time bound on the documents' 'datetime' hyperdata.
    if start is not None:
        Start = aliased(NodeHyperdata)
        StartFormat = aliased(Hyperdata)
        cooc_query = (cooc_query.join(Start, Start.node_id == Node.id)
                      .join(StartFormat, StartFormat.id == Start.hyperdata_id)
                      .filter(StartFormat.name == 'datetime')
                      .filter(Start.value_datetime >= start)
                      )

    # Optional upper time bound, symmetric to `start`.
    if end is not None:
        End = aliased(NodeHyperdata)
        EndFormat = aliased(Hyperdata)
        cooc_query = (cooc_query.join(End, End.node_id == Node.id)
                      .join(EndFormat, EndFormat.id == End.hyperdata_id)
                      .filter(EndFormat.name == 'datetime')
                      .filter(End.value_datetime <= end)
                      )

    # Restrict to the corpus' documents; `x < y` keeps each unordered pair once.
    cooc_query = (cooc_query.filter(Node.parent_id == corpus.id, Node.type_id == doc_id)
                  .filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
                  .group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
                  .order_by(func.count())
                  .limit(limit)
                  )

    matrix = WeightedMatrix(cooc_query)

    if miam_id is not None:
        # miam = get_or_create_node(nodetype='Cvalue', corpus=corpus)
        miam_list = UnweightedList(session.query(NodeNodeNgram.ngram_id)
                                   .filter(NodeNodeNgram.nodex_id == miam_id).all()
                                   )
    if stop_id is not None:
        # stop = get_or_create_node(nodetype='StopList', corpus=corpus)
        stop_list = UnweightedList(session.query(NodeNgram.ngram_id)
                                   .filter(NodeNgram.node_id == stop_id).all()
                                   )
    if group_id is not None:
        # group = get_or_create_node(nodetype='GroupList', corpus=corpus)
        # BUG FIX: the original filtered on stop_id here, loading the wrong node.
        group_list = UnweightedList(session.query(NodeNgramNgram.ngramx_id, NodeNgramNgram.ngramy_id)
                                    .filter(NodeNgramNgram.node_id == group_id).all()
                                    )

    # Combine the matrix with the lists that were actually requested.
    if miam_id is not None and stop_id is None and group_id is None:
        cooc = (matrix & miam_list)
    elif miam_id is not None and stop_id is not None and group_id is None:
        cooc = (matrix & miam_list) - stop_list
    elif miam_id is not None and stop_id is not None and group_id is not None:
        cooc = (matrix & miam_list & group_list) - stop_list
    else:
        # Robustness fix: the original left `cooc` unbound in this case
        # (UnboundLocalError); fall back to the raw matrix.
        cooc = matrix

    cooc.save(node_cooc.id)
    return(node_cooc.id)
......@@ -62,6 +62,7 @@ for model_name, model in models.__dict__.items():
NodeNgram = Node_Ngram
NodeResource = Node_Resource
NodeHyperdata = Node_Hyperdata
# manually declare the Node table...
from datetime import datetime
......
#from admin.env import *
import inspect
from admin.utils import PrintException,DebugTime
from django.db import connection, transaction
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
from sqlalchemy.sql import func
from gargantext_web.db import Node, NodeNgram, NodeNgramNgram, NodeNodeNgram
from gargantext_web.db import session, cache, get_or_create_node, bulk_insert
from collections import defaultdict
import numpy as np
import pandas as pd
from analysis.lists import WeightedMatrix, UnweightedList
def cooc(corpus=None, list_id=None, limit=1000):
    '''
    Compute the cooccurrence matrix of a corpus, filtered by its Cvalue list
    and its Stop list, and save it; returns the Cooccurrence Node.id.

    corpus  :: Corpus
    list_id :: Int, id of the list used to label the Cooccurrence node
    limit   :: Int, maximum number of pairs kept
    '''
    node_cooc = get_or_create_node(nodetype='Cooccurrence', corpus=corpus
        , name_str="Cooccurrences corpus " + str(corpus.id) + "for list Cvalue" + str(list_id))

    # Recompute from scratch: drop any previous matrix stored on this node.
    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_cooc.id).delete()
    session.commit()

    # Self-join of NodeNgram on the same document gives cooccurring pairs.
    NodeNgramX = aliased(NodeNgram)
    NodeNgramY = aliased(NodeNgram)
    doc_id = cache.NodeType['Document'].id

    # literal_column(str(miam_id)).label("node_id"),
    query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, func.count())
             .join(Node, Node.id == NodeNgramX.node_id)
             .join(NodeNgramY, NodeNgramY.node_id == Node.id)
             .filter(Node.parent_id == corpus.id, Node.type_id == doc_id)
             .filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
             .group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
             .order_by(func.count())
             .limit(limit)
             )

    # Keep only ngrams of the corpus' Cvalue list, minus its Stop list.
    cvalue_id = get_or_create_node(nodetype='Cvalue', corpus=corpus).id
    stop_id = get_or_create_node(nodetype='StopList', corpus=corpus).id

    cvalue_list = UnweightedList(session.query(NodeNodeNgram.ngram_id).filter(NodeNodeNgram.nodex_id == cvalue_id).all())
    stop_list = UnweightedList(session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == stop_id).all())

    matrix = WeightedMatrix(query)
    cooc = matrix & cvalue_list - stop_list
    cooc.save(node_cooc.id)
    return(node_cooc.id)
def coocOld(corpus=None, list_id=None, limit=100):
    '''
    cooc :: Corpus -> Int -> NodeNgramNgram

    Legacy raw-SQL implementation: inserts the cooccurrence pairs of the
    corpus' documents (restricted to the whitelist `list_id`) directly into
    node_nodengramngram, and returns the Cooccurrence Node.id.
    '''
    cursor = connection.cursor()

    node_cooc = get_or_create_node(nodetype='Cooccurrence', corpus=corpus
        , name_str="Cooccurrences corpus " + str(corpus.id) + "for list Cvalue" + str(list_id))

    # Recompute from scratch: drop any previous matrix stored on this node.
    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_cooc.id).delete()
    session.commit()

    # NOTE(review): values are interpolated with %-formatting, not bound
    # parameters; safe only because all of them are integer ids from the ORM.
    query_cooc = """
    INSERT INTO node_nodengramngram (node_id, "ngramx_id", "ngramy_id", score)
    SELECT
        %d as node_id,
        ngX.id,
        ngY.id,
        COUNT(*) AS score
    FROM
        node_node AS n  -- the nodes who are direct children of the corpus
    INNER JOIN
        node_node_ngram AS nngX ON nngX.node_id = n.id  -- list of ngrams contained in the node
    INNER JOIN
        node_nodenodengram AS whitelistX ON whitelistX.ngram_id = nngX.ngram_id  -- list of ngrams contained in the whitelist and in the node
    INNER JOIN
        node_ngram AS ngX ON ngX.id = whitelistX.ngram_id  -- ngrams which are in both
    INNER JOIN
        node_node_ngram AS nngY ON nngY.node_id = n.id
    INNER JOIN
        node_nodenodengram AS whitelistY ON whitelistY.ngram_id = nngY.ngram_id
    INNER JOIN
        node_ngram AS ngY ON ngY.id = whitelistY.ngram_id
    WHERE
        n.parent_id = %s
    AND
        whitelistX.nodex_id = %s
    AND
        whitelistY.nodex_id = %s
    AND
        nngX.ngram_id < nngY.ngram_id  -- so we only get distinct pairs of ngrams
    GROUP BY
        ngX.id,
        ngX.terms,
        ngY.id,
        ngY.terms
    ORDER BY
        score DESC
    LIMIT
        %d
    """ % (node_cooc.id, corpus.id, list_id, list_id, limit)

    # print(query_cooc)
    cursor.execute(query_cooc)
    return(node_cooc.id)
def specificity(cooc_id=None, corpus=None):
cooccurrences = session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).all()
matrix = defaultdict(lambda : defaultdict(float))
......@@ -149,20 +46,24 @@ def specificity(cooc_id=None, corpus=None):
bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [d for d in data])
return(node.id)
def compute_specificity(corpus, limit=100):
    '''
    Computing specificities as NodeNodeNgram.
    All workflow is the following:
    1) Compute the cooc matrix
    2) Compute the specificity score, saving it in database, return its Node
    '''
    dbg = DebugTime('Corpus #%d - specificity' % corpus.id)

    # The Cvalue list acts as the main (miam) list for the cooc matrix.
    list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus)
    cooc_id = cooc(corpus=corpus, miam_id=list_cvalue.id, limit=limit)
    specificity(cooc_id=cooc_id, corpus=corpus)

    dbg.show('specificity')
#corpus=session.query(Node).filter(Node.id==244250).first()
#cooc2(corpus)
#compute_specificity(corpus)
......@@ -13,8 +13,8 @@ def ngram_workflow(corpus):
'''
compute_tfidf(corpus)
compute_tfidf_global(corpus)
compute_cvalue(corpus,limit=1000) # size
compute_specificity(corpus,limit=800)
compute_cvalue(corpus,limit=3000) # size
compute_specificity(corpus,limit=200)
# compute_stop(corpus)
compute_groups(corpus,limit_inf=400, limit_sup=600)
# compute_miam(corpus,limit=100) # size
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment