Commit 4c9cb5ea authored by delanoe

[FEAT] Keep node.

parent e02cd586
......@@ -29,7 +29,6 @@ from sqlalchemy.orm import aliased
def diag_null(x):
return x - x * scipy.eye(x.shape[0])
def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
'''
do_distance :: Int -> (Graph, Partition, {ids}, {weight})
......@@ -75,10 +74,10 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
n = n.sort(inplace=False)
m = m.sort(inplace=False)
nodes_included = 300 #int(round(size/20,0))
nodes_included = 500 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = 300 #int(round(size/10,0))
nodes_specific = 500 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the included score for the node size
......@@ -87,6 +86,7 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
#m_index = pd.Index.intersection(x.index, n.index[:nodes_included])
x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
......@@ -113,7 +113,6 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
return(G,partition,ids,weight)
def get_cooc(request=None, corpus=None
, field1='ngrams', field2='ngrams'
, cooc_id=None, type='node_link', size=1000
......@@ -126,7 +125,7 @@ def get_cooc(request=None, corpus=None
data = {}
#if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Coocurrences do not exist yet, create it.")
miam_id = get_or_create_node(nodetype='MiamList', corpus=corpus).id
miam_id = get_or_create_node(nodetype='MapList', corpus=corpus).id
stop_id = get_or_create_node(nodetype='StopList', corpus=corpus).id
group_id = get_or_create_node(nodetype='Group', corpus=corpus).id
......@@ -141,9 +140,9 @@ def get_cooc(request=None, corpus=None
#cooc_id = get_or_create_node(nodetype='Cooccurrence', corpus=corpus).id
cooc_id = do_cooc(corpus=corpus, field1="ngrams", field2="ngrams"
, miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size
, isMonopartite=isMonopartite , start=start , end=end , apax=apax)
, isMonopartite=True, start=start , end=end , apax=apax)
G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=isMonopartite)
G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=True)
if type == "node_link":
nodesB_dict = {}
......@@ -173,8 +172,8 @@ def get_cooc(request=None, corpus=None
s = e[0]
t = e[1]
info = {
"s":ids[s][1] ,
"t":ids[t][1] ,
"s": ids[s][1] ,
"t": ids[t][1] ,
"w": G[ids[s][1]][ids[t][1]]["weight"]
}
# print(info)
......@@ -216,15 +215,13 @@ def get_cooc(request=None, corpus=None
return(data)
def get_graphA( nodeA_type , NodesB , links , corpus ):
from analysis.InterUnion import Utils
print(" = = = == = = = ")
print("In get_graphA(), corpus id:",corpus.id)
nodeA_type_id = cache.Hyperdata[nodeA_type].id
threshold_cotainf = 0.05
threshold_cotainf = 0.02
max_nodeid = -1
for nodeid in NodesB:
if nodeid > max_nodeid:
......
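A minimal sketch of the node selection tightened above (nodes_included and nodes_specific both raised from 300 to 500), assuming x is the square cooccurrence DataFrame indexed by ngram id, n and m are the per-ngram score Series used in do_distance, and n_index is built from n the same way m_index is built from m in the hunk:

import pandas as pd

def select_nodes(x, n, m, nodes_included=500, nodes_specific=500):
    # sort the score Series (modern equivalent of n.sort(inplace=False))
    n = n.sort_values()
    m = m.sort_values()
    # head of the n ranking, tail of the m ranking, restricted to ids present in x
    n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
    m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
    # keep only the selected rows and columns of the cooccurrence matrix
    x_index = pd.Index.union(n_index, m_index)
    return x.loc[list(x_index), list(x_index)]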
......@@ -91,7 +91,7 @@ print('Initialize node types...')
node_types = [
'Root', 'Trash',
'Project', 'Corpus', 'Document',
'MiamList', 'StopList', 'MainList',
'MiamList', 'StopList', 'MainList', 'MapList', # TODO MiamList -> MainList
'Stem', 'Lem', 'Group', 'Tfidf', 'Tfidf (global)', 'Cvalue', 'Specificity'
, 'Cooccurrence',
]
......
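With 'MapList' registered among the node types, the new list can be fetched and populated like the other list nodes touched in this commit; a one-line sketch using the helper already imported in these modules:

node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus)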
......@@ -7,6 +7,7 @@ from gargantext_web.db import NodeNgram,NodeNodeNgram
from gargantext_web.db import *
from gargantext_web.db import get_or_create_node
from analysis.lists import Translations, UnweightedList
from parsing.corpustools import *
import sqlalchemy as sa
......@@ -21,62 +22,7 @@ from collections import defaultdict
from math import log
from functools import reduce
def queryNodeNodeNgram(nodeMeasure_id=None, corpus_id=None, limit=None):
'''
queryNodeNodeNgram :: Int -> Int -> Int -> (Int, String, Float)
Get the list of ngrams according to a measure related to the corpus (e.g. TFIDF or C-value).
'''
query = (session.query(Ngram.id, Ngram.terms, NodeNodeNgram.score)
.join(NodeNodeNgram, NodeNodeNgram.ngram_id == Ngram.id)
.join(Node, Node.id == NodeNodeNgram.nodex_id)
.filter(NodeNodeNgram.nodex_id == nodeMeasure_id)
.filter(NodeNodeNgram.nodey_id == corpus_id)
.group_by(Ngram.id, Ngram.terms, NodeNodeNgram.score)
.order_by(desc(NodeNodeNgram.score))
)
if limit is None:
query = query.count()
elif limit == 0 :
query = query.all()
else:
query = query.limit(limit)
return(query)
def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
'''
getNgrams :: Corpus -> [(Int, String)] -> [(Int, String)]
For a corpus, gives the list of highest Cvalue ngrams and highest TFIDF (global) ngrams that have to be grouped together.
'''
#tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
cvalue_node = get_or_create_node(nodetype='Cvalue', corpus=corpus)
spec_node = get_or_create_node(nodetype='Specificity', corpus=corpus)
#tfidf_ngrams = queryNodeNodeNgram(nodeMeasure_id=tfidf_node.id, corpus_id=corpus.id)
cvalue_ngrams = queryNodeNodeNgram(nodeMeasure_id=cvalue_node.id, corpus_id=corpus.id, limit=limit_sup)
spec_ngrams = queryNodeNodeNgram(nodeMeasure_id=spec_node.id, corpus_id=corpus.id, limit=limit_inf)
#print([n for n in tfidf_ngrams])
def list2set(_list):
_set = set()
for n in _list:
_set.add((n[0],n[1]))
return(_set)
cvalue_set = set()
spec_set = set()
cvalue_set = list2set(cvalue_ngrams)
spec_set = list2set(spec_ngrams)
cvalue_setDiff = cvalue_set.difference(spec_set)
return(spec_set,cvalue_setDiff)
def getStemmer(corpus):
'''
......@@ -121,17 +67,35 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
miam_to_insert = set()
miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
#stop_list = UnweightedList(stop_node.id)
Stop = aliased(NodeNgram)
frequency = sa.func.count(NodeNgram.weight)
ngrams = (session.query(Ngram.id, Ngram.terms, frequency )
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, Node.id == NodeNgram.node_id)
#.outerjoin(Stop, Stop.ngram_id == Ngram.id)
#.filter(Stop.node_id == stop_node.id, Stop.ngram_id == None)
.filter(Node.parent_id==corpus.id, Node.type_id==cache.NodeType['Document'].id)
.group_by(Ngram.id)
.order_by(desc(frequency))
#.all()
.limit(limit_sup)
)
stops = (session.query(Ngram.id, Ngram.terms, frequency)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, Node.id == NodeNgram.node_id)
.join(Stop, Stop.ngram_id == Ngram.id)
.filter(Stop.node_id == stop_node.id)
.filter(Node.parent_id==corpus.id, Node.type_id==cache.NodeType['Document'].id)
.group_by(Ngram.id)
.all()
)
ngrams = [n for n in ngrams if n not in stops]
print(ngrams)
#group = defaultdict(lambda : defaultdict())
ids_dict = dict()
mainform_dict = dict()
......
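The stop-word exclusion above compares whole (ngram_id, terms, frequency) rows, which only drops a term when both queries return identical tuples; a hedged sketch of the same filter keyed on the ngram id alone (assumed to be the intent):

stop_ids = {row[0] for row in stops}                       # ngram ids present in the StopList
ngrams = [row for row in ngrams if row[0] not in stop_ids]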
# Without this, we couldn't use the Django environment
#from admin.env import *
from admin.env import *
#from ngram.stemLem import *
from admin.utils import PrintException,DebugTime
......@@ -15,42 +15,51 @@ from sqlalchemy.orm import aliased
from ngram.tools import insert_ngrams
import csv
def compute_miam(corpus,limit=500):
def compute_mapList(corpus,limit=500):
'''
According to the Specificity scores and the stop list, compute the MapList of the corpus.
'''
dbg = DebugTime('Corpus #%d - computing MapList' % corpus.id)
node_group = get_or_create_node(nodetype='Group', corpus=corpus)
node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
node_stop = get_or_create_node(nodetype='StopList', corpus=corpus)
node_group = get_or_create_node(nodetype='Group', corpus=corpus)
node_spec = get_or_create_node(nodetype='Specificity', corpus=corpus)
Miam=aliased(NodeNgram)
Stop=aliased(NodeNgram)
Group=aliased(NodeNgramNgram)
Spec=aliased(NodeNodeNgram)
top_miam = (session.query(Spec.ngram_id, Spec.score)
.outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
.outerjoin(Stop, Stop.ngram_id == Spec.ngram_id)
.filter(Group.node_id == node_group.id)
.filter(Stop.node_id == node_stop.id)
top_ngrams = (session.query(Spec.ngram_id, Spec.score)
.join(Miam, Spec.ngram_id == Miam.ngram_id)
#.outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
#.outerjoin(Stop, Stop.ngram_id == Spec.ngram_id)
.filter(Miam.node_id == node_miam.id)
#.filter(Group.node_id == node_group.id)
#.filter(Stop.node_id == node_stop.id)
.filter(Spec.nodex_id == node_spec.id)
.order_by(desc(Spec.score))
.limit(limit)
)
print([t for t in top_miam])
node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
session.query(NodeNgram).filter(NodeNgram.node_id==node_miam.id).delete()
#print([t for t in top_ngrams])
node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus)
session.query(NodeNgram).filter(NodeNgram.node_id==node_mapList.id).delete()
session.commit()
data = zip(
[node_miam.id for i in range(1,limit)]
, [n[0] for n in top_miam]
[node_mapList.id for i in range(1,limit)]
, [n[0] for n in top_ngrams]
, [1 for i in range(1,limit)]
)
print([d for d in data])
#print([d for d in data])
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
dbg.show('Miam computed')
dbg.show('MapList computed')
def insert_miam(corpus, ngrams=None, path_file_csv=None):
dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
......@@ -87,8 +96,41 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
file_csv.close()
dbg.show('Miam computed')
#corpus = session.query(Node).filter(Node.id==556113).first()
#corpus = session.query(Node).filter(Node.id==540420).first()
#compute_mapList(corpus)
#insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")
#def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
# '''
# getNgrams :: Corpus -> [(Int, String)] -> [(Int, String)]
# For a corpus, gives the list of highest Cvalue ngrams and highest TFIDF (global) ngrams that have to be grouped together.
# '''
# #tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
# cvalue_node = get_or_create_node(nodetype='Cvalue', corpus=corpus)
# spec_node = get_or_create_node(nodetype='Specificity', corpus=corpus)
#
#
# #tfidf_ngrams = queryNodeNodeNgram(nodeMeasure_id=tfidf_node.id, corpus_id=corpus.id)
# cvalue_ngrams = queryNodeNodeNgram(nodeMeasure_id=cvalue_node.id, corpus_id=corpus.id, limit=limit_sup)
# spec_ngrams = queryNodeNodeNgram(nodeMeasure_id=spec_node.id, corpus_id=corpus.id, limit=limit_inf)
#
# #print([n for n in tfidf_ngrams])
#
# def list2set(_list):
# _set = set()
# for n in _list:
# _set.add((n[0],n[1]))
# return(_set)
#
# cvalue_set = set()
# spec_set = set()
#
# cvalue_set = list2set(cvalue_ngrams)
# spec_set = list2set(spec_ngrams)
#
# cvalue_setDiff = cvalue_set.difference(spec_set)
#
# return(spec_set,cvalue_setDiff)
#
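A sketch of the insertion step performed by compute_mapList above, assuming top_ngrams yields (ngram_id, score) rows; note that range(1, limit) produces limit - 1 items, so the zip() keeps at most limit - 1 ngrams even when the query returns limit rows:

rows = [(node_mapList.id, ngram_id, 1) for ngram_id, score in top_ngrams]
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], rows)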
# Without this, we couldn't use the Django environment
#from admin.env import *
#from ngram.stemLem import *
import re
from admin.utils import PrintException
from gargantext_web.db import NodeNgram,NodeNodeNgram
from gargantext_web.db import cache, session, get_or_create_node
from gargantext_web.db import Node, Ngram, NodeNgram,NodeNodeNgram
from gargantext_web.db import cache, session, get_or_create_node, bulk_insert
import sqlalchemy as sa
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
......@@ -38,7 +35,6 @@ def importStopList(node,filename,language='fr'):
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
def isStopWord(ngram, stop_words=None):
'''
ngram :: (Int, String) => (ngram_id, ngram_terms)
......@@ -55,8 +51,9 @@ def isStopWord(ngram, stop_words=None):
if format_regex.match(word) :
return(True)
for regex in ["(.*)\d(.*)"
, "^.{1,2}$"
for regex in [
"^.{1,2}$"
, "(.*)\d(.*)"
, "(.*)(\.)(.*)"
, "(.*)(\,)(.*)"
, "(.*)(study)(.*)"
......@@ -73,13 +70,11 @@ def isStopWord(ngram, stop_words=None):
if test_match(word, regex) is True :
return(True)
def compute_stop(corpus,size=2000,debug=False):
def compute_stop(corpus,limit=2000,debug=False):
'''
Do some statistics on all the stop lists of the same type in the database.
'''
stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
# TODO do a function to get all stop words with social scores
root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
......@@ -90,34 +85,26 @@ def compute_stop(corpus,size=2000,debug=False):
.filter(NodeNgram.node_id == root_stop_id)
.all()
)
#print([n for n in stop_words])
frequency = sa.func.count( NodeNgram.weight )
ngrams = ( session.query( Ngram.id, Ngram.terms, frequency )
.join( NodeNgram, NodeNgram.ngram_id == Ngram.id )
.join( Node, Node.id == NodeNgram.node_id )
.filter( Node.parent_id == corpus.id,
Node.type_id == cache.NodeType['Document'].id )
.group_by( Ngram.id )
.order_by( desc( frequency ) )
.all()
#.limit(limit)
)
top_words = (session.query(Ngram.id, Ngram.terms)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.filter(NodeNgram.node_id == miam_node.id)
.order_by(desc(NodeNgram.weight))
.limit(size)
)
ngrams_to_stop = filter(lambda x: isStopWord(x,stop_words=stop_words), top_words)
ngrams_to_stop = filter(lambda x: isStopWord(x,stop_words=stop_words), ngrams)
#print([n for n in ngrams_to_stop])
stop = WeightedList({ n[0] : -1 for n in ngrams_to_stop})
stop.save(stop_node.id)
miam = UnweightedList(miam_node.id)
new_miam = miam - stop
new_miam.save(miam_node.id)
# data = zip(
# [stop_node.id for i in range(0,size)]
# , [ngram[0] for ngram in ngrams_to_stop]
# , [-1 for i in range(0,size)]
# )
# bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
#corpus=session.query(Node).filter(Node.id==545461).first()
#compute_stop(corpus)
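A minimal usage sketch (hypothetical rows) of the filtering step in compute_stop above; each candidate is an (ngram_id, terms, frequency) row as returned by the frequency query, and stop_words would normally hold the pairs read from the root StopList:

candidates = [(1, 'p53 mutation', 10), (2, 'cell cycle', 8), (3, 'a', 42)]
flagged = list(filter(lambda x: isStopWord(x, stop_words=set()), candidates))
# 'p53 mutation' matches "(.*)\d(.*)" and 'a' matches "^.{1,2}$", so both are flagged;
# 'cell cycle' matches none of the visible patterns and stays out of the StopList.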
......@@ -109,3 +109,30 @@ def insert_nodengramngram(nodengramngram):
''' % (NodeNgramNgram.__table__.name,))
db.commit()
#def queryNodeNodeNgram(nodeMeasure_id=None, corpus_id=None, limit=None):
# '''
# queryNodeNodeNgram :: Int -> Int -> Int -> (Int, String, Float)
# Get the list of ngrams according to a measure related to the corpus (e.g. TFIDF or C-value).
# '''
# query = (session.query(Ngram.id, Ngram.terms, NodeNodeNgram.score)
# .join(NodeNodeNgram, NodeNodeNgram.ngram_id == Ngram.id)
# .join(Node, Node.id == NodeNodeNgram.nodex_id)
# .filter(NodeNodeNgram.nodex_id == nodeMeasure_id)
# .filter(NodeNodeNgram.nodey_id == corpus_id)
# .group_by(Ngram.id, Ngram.terms, NodeNodeNgram.score)
# .order_by(desc(NodeNodeNgram.score))
# )
#
# if limit is None:
# query = query.count()
# elif limit == 0 :
# query = query.all()
# else:
# query = query.limit(limit)
#
# return(query)
#
......@@ -4,8 +4,9 @@ from ngram.cvalue import compute_cvalue
from ngram.specificity import compute_specificity
#from ngram.stop import compute_stop
from ngram.group import compute_groups
from ngram.miam import compute_miam
from gargantext_web.db import get_or_create_node
from ngram.mapList import compute_mapList
#from gargantext_web.celery import update_processing
......@@ -13,31 +14,32 @@ def ngram_workflow(corpus, n=5000):
'''
All the workflow to filter the ngrams.
'''
compute_tfidf_global(corpus)
#compute_tfidf_global(corpus)
part = round(n * 0.8)
part = round(n * 0.9)
compute_cvalue(corpus,limit=part) # size
#compute_cvalue(corpus,limit=part) # size
part = round(part * 0.4)
part = round(part * 0.8)
print('spec part:', part)
compute_specificity(corpus,limit=part)
#compute_specificity(corpus,limit=part)
part = round(part * 0.5)
part = round(part * 0.8)
# compute_stop(corpus)
limit_inf = round(part * 1)
limit_sup = round(part * 5)
print(limit_inf,limit_sup)
compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup)
#compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup)
# compute_miam(corpus,limit=part) # size
compute_mapList(corpus,limit=part) # size
compute_tfidf(corpus)
#compute_tfidf(corpus)
#corpus=session.query(Node).filter(Node.id==257579).first()
#corpus=session.query(Node).filter(Node.id==540420).first()
#corpus=session.query(Node).filter(Node.id==559637).first()
#ngram_workflow(corpus)
#update_processing(corpus, 0)
......
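A worked example of the new sizing in ngram_workflow with the default n = 5000 (only compute_mapList is currently active; the other steps are commented out above):

n = 5000
part = round(n * 0.9)        # 4500   (would feed compute_cvalue)
part = round(part * 0.8)     # 3600   (printed as 'spec part', would feed compute_specificity)
part = round(part * 0.8)     # 2880
limit_inf = round(part * 1)  # 2880
limit_sup = round(part * 5)  # 14400  (would feed compute_groups)
# compute_mapList(corpus, limit=2880)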