Commit 3476d4a2 authored by delanoe

[FIX SESSIONS] Local sessions for the workflow and get_or_create_node are renamed to mysession.

parent c54058e2
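
In short: helpers that used to open their own session behind a sessionToRemove flag now take a mysession=None keyword and fall back to the module-level session, while apply_workflow obtains a single session from get_session() and threads it through every call. A minimal sketch of the pattern with SQLAlchemy stand-ins for gargantext_web.db (the engine, the function bodies, and get_session below are illustrative assumptions, not the project's actual definitions):

    from sqlalchemy import create_engine
    from sqlalchemy.orm import scoped_session, sessionmaker

    engine = create_engine('sqlite:///:memory:')          # stand-in engine
    session = scoped_session(sessionmaker(bind=engine))   # module-level default session

    def get_session():
        # stand-in for gargantext_web.db.get_session: yields a thread-local
        # scoped_session, which proxies query/add/commit and supports .remove()
        return session

    def get_or_create_node(nodetype=None, corpus=None, mysession=None):
        # the renamed keyword: callers hand in their own session as mysession;
        # when omitted, fall back to the shared module-level session
        if mysession is None:
            mysession = session   # real code: from gargantext_web.db import session
        # ... all query/add/commit calls then go through mysession only

    def apply_workflow(corpus_id):
        # the workflow owns exactly one session, threads it through every
        # helper, and removes it when done (also on the error path)
        mysession = get_session()
        try:
            get_or_create_node(nodetype='MiamList', mysession=mysession)
        finally:
            mysession.remove()
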
@@ -47,13 +47,12 @@ def PrintException():
class WorkflowTracking:
def __init__( self ):
self.hola = "mundo"
def processing_(self , corpus , step):
def processing_(self , corpus_id , step):
try:
the_query = """ UPDATE node_node SET hyperdata=\'{ \"%s\" : \"%s\"}\' WHERE id=%d """ % ( "Processing", step , corpus.id )
the_query = """ UPDATE node_node SET hyperdata=\'{ \"%s\" : \"%s\"}\' WHERE id=%d """ % ( "Processing", step , corpus_id )
cursor = connection.cursor()
try:
cursor.execute(the_query)
......
@@ -18,7 +18,8 @@ def do_cooc(corpus=None
, start=None, end=None
, limit=1000
, isMonopartite=True
, hapax = 3):
, hapax = 3
, mysession=None):
'''
Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
For the moment list of paramters are not supported because, lists need to
@@ -40,13 +41,16 @@ def do_cooc(corpus=None
# Security test
field1,field2 = str(field1), str(field2)
session = get_session()
if mysession is None:
from gargantext_web.db import session
mysession = session
# Get node
node_cooc = get_or_create_node(nodetype='Cooccurrence', corpus=corpus
, name_str="Cooccurrences corpus " \
+ str(corpus.id) + "list_id: " + str(miam_id)
#, hyperdata={'field1': field1, 'field2':field2}
, session=session)
, mysession=mysession)
# BEGIN
@@ -60,12 +64,12 @@ def do_cooc(corpus=None
#
# node_cooc.hyperdata = hyperdata
#
session.add(node_cooc)
session.commit()
mysession.add(node_cooc)
mysession.commit()
# END
session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==node_cooc.id).delete()
session.commit()
mysession.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==node_cooc.id).delete()
mysession.commit()
doc_id = cache.NodeType['Document'].id
@@ -77,7 +81,7 @@ def do_cooc(corpus=None
if isMonopartite :
NodeNgramY = aliased(NodeNgram)
cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
cooc_query = (mysession.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
.join(Node, Node.id == NodeNgramX.node_id)
.join(NodeNgramY, NodeNgramY.node_id == Node.id)
.filter(Node.parent_id==corpus.id, Node.type_id==doc_id)
@@ -85,7 +89,7 @@ def do_cooc(corpus=None
else :
NodeNgramY = aliased(NodeNgram)
cooc_query = (session.query(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id, cooc_score)
cooc_query = (mysession.query(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id, cooc_score)
.join(Node, Node.id == NodeHyperdataNgram.node_id)
.join(NodeNgramY, NodeNgramY.node_id == Node.id)
.join(Hyperdata, Hyperdata.id == NodeHyperdataNgram.hyperdata_id)
@@ -169,7 +173,7 @@ def do_cooc(corpus=None
# Select according some scores
if cvalue_id is not None :
#miam = get_or_create_node(nodetype='Cvalue', corpus=corpus)
cvalue_list = UnweightedList(session.query(NodeNodeNgram.ngram_id)
cvalue_list = UnweightedList(mysession.query(NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.nodex_id == cvalue_id).all()
)
@@ -200,4 +204,3 @@ def do_cooc(corpus=None
cooc = matrix
cooc.save(node_cooc.id)
return(node_cooc.id)
session.remove()
@@ -44,9 +44,9 @@ def get_cooc(request=None, corpus=None
data = {}
#if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Cooccurrences do not exist yet, creating it.")
miam_id = get_or_create_node(nodetype='MapList', corpus=corpus, session=session).id
stop_id = get_or_create_node(nodetype='StopList', corpus=corpus, session=session).id
group_id = get_or_create_node(nodetype='Group', corpus=corpus, session=session).id
miam_id = get_or_create_node(nodetype='MapList', corpus=corpus, mysession=session).id
stop_id = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=session).id
group_id = get_or_create_node(nodetype='Group', corpus=corpus, mysession=session).id
SamuelFlag = False
# if field1 == field2 == 'ngrams' :
......
@@ -51,7 +51,7 @@ def periods(corpus, start=None, end=None):
if duration.days > 365 * 3 :
print("OK")
miam_id = get_or_create_node(nodetype='MiamList', corpus=corpus).id
miam_id = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=session).id
result_list = list()
for t in times:
@@ -86,7 +86,7 @@ def jacquard(period1, period2):
def get_partition(corpus, start=None, end=None, distance=distance):
session = get_session()
miam_id = get_or_create_node(corpus=corpus, nodetype='MapList', session=session).id
miam_id = get_or_create_node(corpus=corpus, nodetype='MapList', mysession=session).id
print("get Partition %s - %s" % (start, end))
cooc_id = do_cooc(corpus=corpus
, start=start
......
@@ -3,7 +3,7 @@
from celery import shared_task
from node import models
from django.db import transaction
from admin.utils import DebugTime
from admin.utils import DebugTime, PrintException
import cProfile
#@app.task(bind=True)
@@ -15,15 +15,8 @@ from gargantext_web.db import get_session, cache, Node
from ngram.workflow import ngram_workflow
@shared_task
def apply_sum(x, y):
print(x+y)
session = get_session()
print(session.query(Node.name).first())
session.remove()
from parsing.corpustools import parse_resources, extract_ngrams #add_resource,
from ngram.lists import ngrams2miam
#from ngram.lists import ngrams2miam
from admin.utils import WorkflowTracking
@@ -36,28 +29,29 @@ def apply_workflow(corpus_id):
update_state = WorkflowTracking()
try :
session = get_session()
corpus = session.query(Node).filter(Node.id==corpus_id).first()
mysession = get_session()
corpus = mysession.query(Node).filter(Node.id==corpus_id).first()
update_state.processing_(corpus, "Parsing")
update_state.processing_(int(corpus_id), "Parsing")
#cProfile.runctx('parse_resources(corpus)', global,locals)
parse_resources(corpus, session=session)
parse_resources(corpus, mysession=mysession)
update_state.processing_(corpus, "Terms extraction")
extract_ngrams(corpus, ['title', 'abstract'], nlp=True, session=session)
update_state.processing_(int(corpus_id), "Terms extraction")
extract_ngrams(corpus, ['title', 'abstract'], nlp=True, mysession=mysession)
# update_state.processing_(corpus, "")
ngram_workflow(corpus, session=session)
ngram_workflow(corpus, mysession=mysession)
#ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
print("End of the Workflow for corpus %d" % (corpus_id))
update_state.processing_(corpus, "0")
update_state.processing_(int(corpus_id), "0")
session.remove()
mysession.remove()
except Exception as error:
print(error)
session.remove()
PrintException()
mysession.remove()
@shared_task
def empty_trash(corpus_id):
......
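
A note on the WorkflowTracking calls above: processing_ now takes the corpus id (an int) rather than a Node, because its raw UPDATE interpolates the id with %d, which raises a TypeError for anything that is not a plain number; hence the switch to int(corpus_id) at the call sites. A condensed sketch of the updated method (django.db.connection as in the original; the try/except wrappers are elided):

    from django.db import connection

    class WorkflowTracking:
        def processing_(self, corpus_id, step):
            # corpus_id must be a plain integer: the %d placeholder below
            # rejects a Node object, hence int(corpus_id) upstream
            the_query = """ UPDATE node_node SET hyperdata='{ "%s" : "%s"}' WHERE id=%d """ \
                        % ("Processing", step, corpus_id)
            cursor = connection.cursor()
            cursor.execute(the_query)
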
@@ -242,17 +242,16 @@ class bulk_insert:
readline = read
def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hyperdata=None, session=None):
def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hyperdata=None, mysession=None):
'''
Should be a method of the object. __get_or_create__ ?
name_str :: String
hyperdata :: Dict
'''
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
if mysession is None:
from gargantext_web.db import session
mysession = session
if nodetype is None:
print("Need to give a type node")
@@ -262,13 +261,13 @@ def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hy
except KeyError:
ntype = cache.NodeType[nodetype] = NodeType()
ntype.name = nodetype
session.add(ntype)
session.commit()
mysession.add(ntype)
mysession.commit()
if corpus_id is not None and corpus is None:
corpus = session.query(Node).filter(Node.id==corpus_id).first()
corpus = mysession.query(Node).filter(Node.id==corpus_id).first()
node = (session.query(Node).filter(Node.type_id == ntype.id
node = (mysession.query(Node).filter(Node.type_id == ntype.id
, Node.parent_id == corpus.id
, Node.user_id == corpus.user_id
)
@@ -289,11 +288,9 @@ def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hy
node.name=name_str
else:
node.name=ntype.name
session.add(node)
session.commit()
mysession.add(node)
mysession.commit()
#print(parent_id, n.parent_id, n.id, n.name)
return(node)
if sessionToRemove:
session.remove()
@@ -67,7 +67,7 @@ def getNgrams(corpus=None, limit=1000):
return(terms)
session.remove()
def compute_cvalue(corpus=None, limit=1000):
def compute_cvalue(corpus=None, limit=1000, mysession=None):
'''
computeCvalue :: Corpus
frequency :: String -> Int -> Int
@@ -126,13 +126,11 @@ def compute_cvalue(corpus=None, limit=1000):
result = cvalueAll()
#print([n for n in result])
session = get_session()
session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==cvalue_node.id).delete()
session.commit()
mysession.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==cvalue_node.id).delete()
mysession.commit()
#bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [n for n in islice(result,0,100)])
bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [n for n in result])
session.remove()
# test
#corpus=session.query(Node).filter(Node.id==244250).first()
#computeCvalue(corpus)
@@ -47,14 +47,10 @@ def getStemmer(corpus):
return(stemIt)
def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=None):
def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', mysession=None):
'''
group ngrams according to a function (stemming or lemming)
'''
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
dbg = DebugTime('Corpus #%d - group' % corpus.id)
dbg.show('Group')
@@ -66,19 +62,19 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
stemIt = getStemmer(corpus)
group_to_insert = set()
node_group = get_or_create_node(nodetype='Group', corpus=corpus, session=session)
node_group = get_or_create_node(nodetype='Group', corpus=corpus, mysession=mysession)
miam_to_insert = set()
miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus, session=session)
miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=mysession)
stop_node = get_or_create_node(nodetype='StopList', corpus=corpus, session=session)
stop_node = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=mysession)
#stop_list = UnweightedList(stop_node.id)
Stop = aliased(NodeNgram)
frequency = sa.func.count(NodeNgram.weight)
ngrams = (session.query(Ngram.id, Ngram.terms, frequency )
ngrams = (mysession.query(Ngram.id, Ngram.terms, frequency )
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, Node.id == NodeNgram.node_id)
#.outerjoin(Stop, Stop.ngram_id == Ngram.id)
@@ -90,7 +86,7 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
.limit(limit_sup)
)
stops = (session.query(Ngram.id, Ngram.terms, frequency)
stops = (mysession.query(Ngram.id, Ngram.terms, frequency)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, Node.id == NodeNgram.node_id)
.join(Stop, Stop.ngram_id == Ngram.id)
@@ -131,10 +127,10 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
miam_to_insert.add((miam_node.id, group[key]['mainForm'], 1))
# # Deleting previous groups
session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_group.id).delete()
mysession.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_group.id).delete()
# # Deleting previous ngrams miam list
session.query(NodeNgram).filter(NodeNgram.node_id == miam_node.id).delete()
session.commit()
mysession.query(NodeNgram).filter(NodeNgram.node_id == miam_node.id).delete()
mysession.commit()
bulk_insert(NodeNgramNgram
, ('node_id', 'ngramx_id', 'ngramy_id', 'score')
@@ -142,4 +138,3 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=N
bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'), [data for data in list(miam_to_insert)])
if sessionToRemove: session.remove()
@@ -15,15 +15,11 @@ from sqlalchemy.orm import aliased
from ngram.tools import insert_ngrams
import csv
def compute_mapList(corpus,limit=500,n=1, session=None):
def compute_mapList(corpus,limit=500,n=1, mysession=None):
'''
According to Specificities and stoplist,
'''
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
monograms_part = 0.005
monograms_limit = round(limit * monograms_part)
@@ -31,11 +27,11 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
node_stop = get_or_create_node(nodetype='StopList', corpus=corpus)
node_group = get_or_create_node(nodetype='Group', corpus=corpus)
node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=mysession)
node_stop = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=mysession)
node_group = get_or_create_node(nodetype='Group', corpus=corpus, mysession=mysession)
node_spec = get_or_create_node(nodetype='Specificity', corpus=corpus)
node_spec = get_or_create_node(nodetype='Specificity', corpus=corpus, mysession=mysession)
Miam=aliased(NodeNgram)
Stop=aliased(NodeNgram)
@@ -43,7 +39,7 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
Spec=aliased(NodeNodeNgram)
query = (session.query(Spec.ngram_id, Spec.score)
query = (mysession.query(Spec.ngram_id, Spec.score)
.join(Miam, Spec.ngram_id == Miam.ngram_id)
.join(Ngram, Ngram.id == Spec.ngram_id)
#.outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
@@ -66,19 +62,19 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
.limit(multigrams_limit)
)
stop_ngrams = (session.query(NodeNgram.ngram_id)
stop_ngrams = (mysession.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == node_stop.id)
.all()
)
grouped_ngrams = (session.query(NodeNgramNgram.ngramy_id)
grouped_ngrams = (mysession.query(NodeNgramNgram.ngramy_id)
.filter(NodeNgramNgram.node_id == node_group.id)
.all()
)
node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus)
session.query(NodeNgram).filter(NodeNgram.node_id==node_mapList.id).delete()
session.commit()
node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus, mysession=mysession)
mysession.query(NodeNgram).filter(NodeNgram.node_id==node_mapList.id).delete()
mysession.commit()
data = zip(
[node_mapList.id for i in range(1,limit)]
@@ -91,20 +87,14 @@ def compute_mapList(corpus,limit=500,n=1, session=None):
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
dbg.show('MapList computed')
if sessionToRemove: session.remove()
def insert_miam(corpus, ngrams=None, path_file_csv=None):
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
def insert_miam(corpus, ngrams=None, path_file_csv=None, mysession=None):
dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
session.query(NodeNgram).filter(NodeNgram.node_id==node_miam.id).delete()
session.commit()
node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus, mysession=mysession)
mysession.query(NodeNgram).filter(NodeNgram.node_id==node_miam.id).delete()
mysession.commit()
stop_words = set()
miam_words = set()
@@ -133,6 +123,5 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
file_csv.close()
dbg.show('Miam computed')
if sessionToRemove: session.remove()
@@ -4,26 +4,22 @@ from gargantext_web.db import Node, NodeNgram, NodeNodeNgram
from gargantext_web.db import get_or_create_node
from admin.utils import DebugTime
def compute_occs(corpus, session=None):
def compute_occs(corpus, mysession=None):
'''
compute_occs :: Corpus -> IO ()
'''
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
dbg = DebugTime('Corpus #%d - OCCURRENCES' % corpus.id)
dbg.show('Calculate occurrences')
occs_node = get_or_create_node(nodetype='Occurrences', corpus=corpus)
occs_node = get_or_create_node(nodetype='Occurrences', corpus=corpus, mysession=mysession)
#print(occs_node.id)
(session.query(NodeNodeNgram)
(mysession.query(NodeNodeNgram)
.filter(NodeNodeNgram.nodex_id==occs_node.id).delete()
)
session.commit()
mysession.commit()
db, cursor = get_cursor()
cursor.execute('''
@@ -55,7 +51,6 @@ def compute_occs(corpus, session=None):
)
)
db.commit()
if sessionToRemove: session.remove()
#data = session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==occs_node.id).all()
......
@@ -15,16 +15,12 @@ from gargantext_web.db import NodeNgramNgram, NodeNodeNgram
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
def specificity(cooc_id=None, corpus=None, limit=100, session=None):
def specificity(cooc_id=None, corpus=None, limit=100, mysession=None):
'''
Compute the specificity, simple calculus.
'''
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
cooccurrences = (session.query(NodeNgramNgram)
cooccurrences = (mysession.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
.order_by(NodeNgramNgram.score)
.limit(limit)
@@ -45,23 +41,22 @@ def specificity(cooc_id=None, corpus=None, limit=100, session=None):
m = ( xs - ys) / (2 * (x.shape[0] - 1))
m = m.sort(inplace=False)
node = get_or_create_node(nodetype='Specificity',corpus=corpus,session=session)
node = get_or_create_node(nodetype='Specificity',corpus=corpus, mysession=mysession)
data = zip( [node.id for i in range(1,m.shape[0])]
, [corpus.id for i in range(1,m.shape[0])]
, m.index.tolist()
, m.values.tolist()
)
session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==node.id).delete()
session.commit()
mysession.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==node.id).delete()
mysession.commit()
bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [d for d in data])
return(node.id)
if sessionToRemove: session.remove()
def compute_specificity(corpus,limit=100, session=None):
def compute_specificity(corpus,limit=100, mysession=None):
'''
Computing specificities as NodeNodeNgram.
All workflow is the following:
@@ -69,19 +64,13 @@
2) Compute the specificity score, saving it in database, return its Node
'''
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
dbg = DebugTime('Corpus #%d - specificity' % corpus.id)
list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus, session=session)
cooc_id = do_cooc(corpus=corpus, cvalue_id=list_cvalue.id,limit=limit)
list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus, mysession=mysession)
cooc_id = do_cooc(corpus=corpus, cvalue_id=list_cvalue.id,limit=limit, mysession=mysession)
specificity(cooc_id=cooc_id,corpus=corpus,limit=limit)
specificity(cooc_id=cooc_id,corpus=corpus,limit=limit,mysession=mysession)
dbg.show('specificity')
if sessionToRemove: session.remove()
#corpus=session.query(Node).filter(Node.id==244250).first()
#compute_specificity(corpus)
......
@@ -75,22 +75,18 @@ def isStopWord(ngram, stop_words=None):
if test_match(word, regex) is True :
return(True)
def compute_stop(corpus,limit=2000,debug=False, session=None):
def compute_stop(corpus,limit=2000,debug=False, mysession=None):
'''
do some statitics on all stop lists of database of the same type
'''
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
stop_node_id = get_or_create_node(nodetype='StopList', corpus=corpus).id
stop_node_id = get_or_create_node(nodetype='StopList', corpus=corpus, mysession=mysession).id
# TODO do a function to get all stop words with social scores
root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
root_stop_id = get_or_create_node(nodetype='StopList', corpus=root).id
root = mysession.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
root_stop_id = get_or_create_node(nodetype='StopList', corpus=root, mysession=mysession).id
stop_words = (session.query(Ngram.terms)
stop_words = (mysession.query(Ngram.terms)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.filter(NodeNgram.node_id == root_stop_id)
.all()
@@ -99,7 +95,7 @@ def compute_stop(corpus,limit=2000,debug=False, session=None):
#print([n for n in stop_words])
frequency = sa.func.count( NodeNgram.weight )
ngrams = ( session.query( Ngram.id, Ngram.terms, frequency )
ngrams = ( mysession.query( Ngram.id, Ngram.terms, frequency )
.join( NodeNgram, NodeNgram.ngram_id == Ngram.id )
.join( Node, Node.id == NodeNgram.node_id )
.filter( Node.parent_id == corpus.id,
@@ -118,4 +114,3 @@ def compute_stop(corpus,limit=2000,debug=False, session=None):
stop = WeightedList({ n[0] : -1 for n in ngrams_to_stop})
stop.save(stop_node_id)
if sessionToRemove: session.remove()
@@ -5,17 +5,12 @@ from gargantext_web.db import get_session, get_or_create_node
from admin.utils import DebugTime
def compute_tfidf(corpus, session=None):
def compute_tfidf(corpus, mysession=None):
# compute terms frequency sum
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
dbg = DebugTime('Corpus #%d - TFIDF' % corpus.id)
dbg.show('calculate terms frequencies sums')
tfidf_node = get_or_create_node(nodetype='Tfidf', corpus=corpus, session=session)
tfidf_node = get_or_create_node(nodetype='Tfidf', corpus=corpus, mysession=mysession)
db, cursor = get_cursor()
cursor.execute('''
@@ -125,26 +120,20 @@ def compute_tfidf(corpus, session=None):
# the end!
db.commit()
if sessionToRemove: session.remove()
def compute_tfidf_global(corpus, session=None):
def compute_tfidf_global(corpus, mysession=None):
'''
Maybe improve this with:
#http://stackoverflow.com/questions/8674718/best-way-to-select-random-rows-postgresql
'''
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
dbg = DebugTime('Corpus #%d - tfidf global' % corpus.id)
dbg.show('calculate terms frequencies sums')
tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus, session=session)
tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus, mysession=mysession)
# update would be better
session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==tfidf_node.id).delete()
session.commit()
mysession.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==tfidf_node.id).delete()
mysession.commit()
# compute terms frequency sum
db, cursor = get_cursor()
@@ -271,8 +260,3 @@ def compute_tfidf_global(corpus, session=None):
db.commit()
dbg.show('insert tfidf')
if sessionToRemove: session.remove()
#corpus=session.query(Node).filter(Node.id==244250).first()
#compute_tfidf_global(corpus)
@@ -8,8 +8,8 @@ def insert_ngrams_to_list(list_of_ngrams, corpus, list_type='MapList', erase=Tru
'''
session = get_session()
list_node = get_or_create_node(corpus=corpus, nodetype=list_type, session=session)
group_node = get_or_create_node(corpus=corpus, nodetype='GroupList', session=session)
list_node = get_or_create_node(corpus=corpus, nodetype=list_type, mysession=session)
group_node = get_or_create_node(corpus=corpus, nodetype='GroupList', mysession=session)
group_list = (session.query(NodeNgramNgram.ngramy_id)
.filter(NodeNgramNgram.id==group_node.id)
.all()
......
@@ -12,17 +12,17 @@ from gargantext_web.db import Node , NodeNgram
from admin.utils import WorkflowTracking
def ngram_workflow(corpus, n=5000, session=None):
def ngram_workflow(corpus, n=5000, mysession=None):
'''
All the workflow to filter the ngrams.
'''
update_state = WorkflowTracking()
update_state.processing_(corpus, "Stop words")
compute_stop(corpus, session=session)
update_state.processing_(corpus.id, "Stop words")
compute_stop(corpus, mysession=mysession)
update_state.processing_(corpus, "TF-IDF global score")
compute_tfidf_global(corpus, session=session)
update_state.processing_(corpus.id, "TF-IDF global score")
compute_tfidf_global(corpus, mysession=mysession)
part = round(n * 0.9)
@@ -31,28 +31,28 @@ def ngram_workflow(corpus, n=5000, session=None):
# part = round(part * 0.8)
#print('spec part:', part)
update_state.processing_(corpus, "Specificity score")
compute_specificity(corpus,limit=part, session=session)
update_state.processing_(corpus.id, "Specificity score")
compute_specificity(corpus,limit=part, mysession=mysession)
part = round(part * 0.8)
limit_inf = round(part * 1)
limit_sup = round(part * 5)
#print(limit_inf,limit_sup)
update_state.processing_(corpus, "Synonyms")
update_state.processing_(corpus.id, "Synonyms")
try:
compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup, session=session)
compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup, mysession=mysession)
except Exception as error:
print("Workflow Ngram Group error", error)
pass
update_state.processing_(corpus, "Map list terms")
compute_mapList(corpus,limit=1000, session=session) # size
update_state.processing_(corpus.id, "Map list terms")
compute_mapList(corpus,limit=1000, mysession=mysession) # size
update_state.processing_(corpus, "TF-IDF local score")
compute_tfidf(corpus, session=session)
update_state.processing_(corpus.id, "TF-IDF local score")
compute_tfidf(corpus, mysession=mysession)
update_state.processing_(corpus, "Occurrences")
compute_occs(corpus, session=session)
update_state.processing_(corpus.id, "Occurrences")
compute_occs(corpus, mysession=mysession)
@@ -31,12 +31,11 @@ parsers = Parsers()
# resources management
def add_resource(corpus, session=None, **kwargs):
def add_resource(corpus, mysession=None, **kwargs):
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
if mysession is None:
from gargantext_web.db import session
mysession = session
# only for tests
resource = Resource(guid=str(random()), **kwargs )
@@ -50,7 +49,7 @@ def add_resource(corpus, session=None, **kwargs):
f.close()
resource.digest = h.hexdigest()
# check if a resource on this node already has this hash
tmp_resource = (session
tmp_resource = (mysession
.query(Resource)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
.filter(Resource.digest == resource.digest)
@@ -59,28 +58,24 @@ def add_resource(corpus, session=None, **kwargs):
if tmp_resource is not None:
return tmp_resource
else:
session.add(resource)
session.commit()
mysession.add(resource)
mysession.commit()
# link with the resource
node_resource = Node_Resource(
node_id = corpus.id,
resource_id = resource.id,
parsed = False,
)
session.add(node_resource)
session.commit()
# return result
mysession.add(node_resource)
mysession.commit()
return resource
if sessionToRemove:
session.remove()
def parse_resources(corpus, user=None, user_id=None, session=None):
def parse_resources(corpus, user=None, user_id=None, mysession=None):
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
if mysession is None:
from gargantext_web.db import session
mysession = session
dbg = DebugTime('Corpus #%d - parsing' % corpus.id)
@@ -91,7 +86,7 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
else:
user_id = corpus.user_id
# find resource of the corpus
resources_query = (session
resources_query = (mysession
.query(Resource, ResourceType)
.join(ResourceType, ResourceType.id == Resource.type_id)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
@@ -134,14 +129,14 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
# TODO: mark node-resources associations as parsed
#
dbg.show('insert %d documents' % len(nodes))
session.add_all(nodes)
session.commit()
mysession.add_all(nodes)
mysession.commit()
# now, index the hyperdata
dbg.show('insert hyperdata')
node_hyperdata_lists = defaultdict(list)
hyperdata_types = {
hyperdata.name: hyperdata
for hyperdata in session.query(Hyperdata)
for hyperdata in mysession.query(Hyperdata)
}
#print('hyperdata_types', hyperdata_types)
for node in nodes:
@@ -166,7 +161,7 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
node_hyperdata_ngrams = set()
#for field in ['source', 'authors', 'journal']:
for field in ['journal', 'authors']:
hyperdata_set.add(session.query(Hyperdata.id).filter(Hyperdata.name==field).first()[0])
hyperdata_set.add(mysession.query(Hyperdata.id).filter(Hyperdata.name==field).first()[0])
#print("hyperdata_set", hyperdata_set)
@@ -191,9 +186,6 @@ def parse_resources(corpus, user=None, user_id=None, session=None):
# mark the corpus as parsed
corpus.parsed = True
if sessionToRemove:
session.remove()
# ngrams extraction
from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
@@ -222,18 +214,17 @@ class NgramsExtractors(defaultdict):
ngramsextractors = NgramsExtractors()
def extract_ngrams(corpus, keys, nlp=True, session=None):
def extract_ngrams(corpus, keys, nlp=True, mysession=None):
sessionToRemove = False
if session is None:
session = get_session()
sessionToRemove = True
if mysession is None:
from gargantext_web.db import session
mysession = session
dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
# query the hyperdata associated with the given keys
columns = [Node.id, Node.language_id] + [Node.hyperdata[key] for key in keys]
hyperdata_query = (session
hyperdata_query = (mysession
.query(*columns)
.filter(Node.parent_id == corpus.id)
.filter(Node.type_id == cache.NodeType['Document'].id)
@@ -242,7 +233,7 @@ def extract_ngrams(corpus, keys, nlp=True, session=None):
dbg.show('find ngrams')
languages_by_id = {
language.id: language.iso2
for language in session.query(Language)
for language in mysession.query(Language)
}
ngrams_data = set()
@@ -321,9 +312,6 @@ def extract_ngrams(corpus, keys, nlp=True, session=None):
# commit to database
db.commit()
if sessionToRemove:
session.remove()
def text_prepa(my_str):
"""
......