Commit 55136392 authored by delanoe

[FACTOR] One session of one workflow.

parent 29aa56bb
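The pattern applied across every file below: each workflow step takes an optional `session`, opens its own only when called standalone, and removes only a session it created itself, so the Celery task can drive the whole workflow over one shared session. A minimal sketch of the idiom, assuming `get_session` returns a SQLAlchemy `scoped_session` (its use of `.remove()` in the diff suggests so); `compute_step` and `run_workflow` are illustrative stand-ins, not functions from this repository:

    from sqlalchemy import create_engine
    from sqlalchemy.orm import scoped_session, sessionmaker

    # Stand-in for gargantext_web.db.get_session; the real helper is
    # wired to the project's own engine, so treat this setup as an
    # assumption for the sketch.
    _engine = create_engine('sqlite://')

    def get_session():
        return scoped_session(sessionmaker(bind=_engine))

    def compute_step(corpus, session=None):
        # Open a session only when called standalone.
        sessionToRemove = False
        if session is None:
            session = get_session()
            sessionToRemove = True
        try:
            pass  # ... query and write through `session` here ...
        finally:
            # Remove only a session this step opened itself; a shared
            # session stays alive for the caller's remaining steps.
            if sessionToRemove:
                session.remove()

    def run_workflow(corpus):
        # The workflow entry point owns the single shared session.
        session = get_session()
        try:
            compute_step(corpus, session=session)
            compute_step(corpus, session=session)  # every step reuses it
        finally:
            session.remove()

Note the diff itself removes the session in the step bodies without `try`/`finally`; the sketch adds it only to make the ownership rule explicit.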
@@ -35,25 +35,28 @@ def apply_workflow(corpus_id):
     update_state = WorkflowTracking()
+    session = get_session()
+    corpus = session.query(Node).filter(Node.id==corpus_id).first()
     try :
-        session = get_session()
-        corpus = session.query(Node).filter(Node.id==corpus_id).first()
         update_state.processing_(corpus, "Parsing")
         #cProfile.runctx('parse_resources(corpus)', global,locals)
-        parse_resources(corpus)
+        parse_resources(corpus, session=session)

         update_state.processing_(corpus, "Terms extraction")
-        extract_ngrams(corpus, ['title', 'abstract'], nlp=True)
+        extract_ngrams(corpus, ['title', 'abstract'], nlp=True, session=session)

         # update_state.processing_(corpus, "")
-        ngram_workflow(corpus)
+        ngram_workflow(corpus, session=session)

         #ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)

         print("End of the Workflow for corpus %d" % (corpus_id))
         update_state.processing_(corpus, "0")
         session.remove()
     except :
         session.remove()

 @shared_task
 def empty_trash(corpus_id):
@@ -23,7 +23,6 @@ from math import log
 from functools import reduce

 def getStemmer(corpus):
     '''
     getStemmer :: Corpus -> Stemmer
@@ -48,11 +47,14 @@ def getStemmer(corpus):
     return(stemIt)

-def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
+def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=None):
     '''
     group ngrams according to a function (stemming or lemmatising)
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - group' % corpus.id)
     dbg.show('Group')
@@ -140,4 +142,4 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
     bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'), [data for data in list(miam_to_insert)])

-    session.remove()
+    if sessionToRemove: session.remove()
@@ -19,8 +19,12 @@ def compute_mapList(corpus,limit=500,n=1):
     '''
     According to Specificities and stoplist,
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     monograms_part = 0.005
     monograms_limit = round(limit * monograms_part)
     multigrams_limit = limit - monograms_limit
@@ -87,10 +91,15 @@ def compute_mapList(corpus,limit=500,n=1):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

     dbg.show('MapList computed')
-    session.remove()
+    if sessionToRemove: session.remove()

-def insert_miam(corpus, ngrams=None, path_file_csv=None):
-    session = get_session()
+def insert_miam(corpus, ngrams=None, path_file_csv=None, session=None):
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
     node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
@@ -124,6 +133,6 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

     file_csv.close()
     dbg.show('Miam computed')
-    session.remove()
+    if sessionToRemove: session.remove()
@@ -5,7 +5,14 @@ from gargantext_web.db import get_or_create_node
 from admin.utils import DebugTime

-def compute_occs(corpus):
-    session = get_session()
+def compute_occs(corpus, session=None):
+    '''
+    compute_occs :: Corpus -> IO ()
+    '''
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - OCCURRENCES' % corpus.id)
     dbg.show('Calculate occurrences')
@@ -48,7 +55,7 @@ def compute_occs(corpus):
             )
         )
     db.commit()
-    session.remove()
+    if sessionToRemove: session.remove()

 #data = session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==occs_node.id).all()
@@ -15,12 +15,15 @@ from gargantext_web.db import NodeNgramNgram, NodeNodeNgram
 from sqlalchemy import desc, asc, or_, and_, Date, cast, select

-def specificity(cooc_id=None, corpus=None, limit=100):
+def specificity(cooc_id=None, corpus=None, limit=100, session=None):
     '''
     Compute the specificity, a simple calculation.
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     cooccurrences = (session.query(NodeNgramNgram)
                      .filter(NodeNgramNgram.node_id==cooc_id)
                      .order_by(NodeNgramNgram.score)
@@ -55,17 +58,22 @@ def specificity(cooc_id=None, corpus=None, limit=100):
     bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [d for d in data])
     return(node.id)
-    session.remove()
+    if sessionToRemove: session.remove()

-def compute_specificity(corpus,limit=100):
+def compute_specificity(corpus,limit=100, session=None):
     '''
     Computing specificities as NodeNodeNgram.
     The whole workflow is the following:
     1) Compute the cooc matrix
     2) Compute the specificity score, save it in the database, and return its Node
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - specificity' % corpus.id)
-    list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus)
+    list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus, session=session)
@@ -73,7 +81,7 @@ def compute_specificity(corpus,limit=100):
     specificity(cooc_id=cooc_id,corpus=corpus,limit=limit)

     dbg.show('specificity')
-    session.remove()
+    if sessionToRemove: session.remove()

 #corpus=session.query(Node).filter(Node.id==244250).first()
 #compute_specificity(corpus)
@@ -75,11 +75,14 @@ def isStopWord(ngram, stop_words=None):
         if test_match(word, regex) is True :
             return(True)

-def compute_stop(corpus,limit=2000,debug=False):
+def compute_stop(corpus,limit=2000,debug=False, session=None):
     '''
     do some statistics on all stop lists of the database of the same type
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     stop_node_id = get_or_create_node(nodetype='StopList', corpus=corpus).id
@@ -115,4 +118,4 @@ def compute_stop(corpus,limit=2000,debug=False):
     stop = WeightedList({ n[0] : -1 for n in ngrams_to_stop})
     stop.save(stop_node_id)
-    session.remove()
+    if sessionToRemove: session.remove()
@@ -5,9 +5,13 @@ from gargantext_web.db import get_session, get_or_create_node
 from admin.utils import DebugTime

-def compute_tfidf(corpus):
+def compute_tfidf(corpus, session=None):
     # compute terms frequency sum
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - TFIDF' % corpus.id)
     dbg.show('calculate terms frequencies sums')
@@ -121,15 +125,18 @@ def compute_tfidf(corpus):
     # the end!
     db.commit()
-    session.remove()
+    if sessionToRemove: session.remove()

-def compute_tfidf_global(corpus):
+def compute_tfidf_global(corpus, session=None):
     '''
     Maybe improve this with:
     #http://stackoverflow.com/questions/8674718/best-way-to-select-random-rows-postgresql
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - tfidf global' % corpus.id)
     dbg.show('calculate terms frequencies sums')
@@ -265,7 +272,7 @@ def compute_tfidf_global(corpus):
     db.commit()

     dbg.show('insert tfidf')
-    session.remove()
+    if sessionToRemove: session.remove()

 #corpus=session.query(Node).filter(Node.id==244250).first()
 #compute_tfidf_global(corpus)
@@ -12,17 +12,17 @@ from gargantext_web.db import Node , NodeNgram
 from admin.utils import WorkflowTracking

-def ngram_workflow(corpus, n=5000):
+def ngram_workflow(corpus, n=5000, session=None):
     '''
     The whole workflow to filter the ngrams.
     '''
     update_state = WorkflowTracking()

     update_state.processing_(corpus, "Stop words")
-    compute_stop(corpus)
+    compute_stop(corpus, session=session)

     update_state.processing_(corpus, "TF-IDF global score")
-    compute_tfidf_global(corpus)
+    compute_tfidf_global(corpus, session=session)

     part = round(n * 0.9)
@@ -32,7 +32,7 @@ def ngram_workflow(corpus, n=5000):
     #print('spec part:', part)
     update_state.processing_(corpus, "Specificity score")
-    compute_specificity(corpus,limit=part)
+    compute_specificity(corpus,limit=part, session=session)

     part = round(part * 0.8)
@@ -41,18 +41,18 @@ def ngram_workflow(corpus, n=5000):
     #print(limit_inf,limit_sup)

     update_state.processing_(corpus, "Synonyms")
     try:
-        compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup)
+        compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup, session=session)
     except Exception as error:
         print("Workflow Ngram Group error", error)
         pass

     update_state.processing_(corpus, "Map list terms")
-    compute_mapList(corpus,limit=1000) # size
+    compute_mapList(corpus,limit=1000, session=session) # size

     update_state.processing_(corpus, "TF-IDF local score")
-    compute_tfidf(corpus)
+    compute_tfidf(corpus, session=session)

     update_state.processing_(corpus, "Occurrences")
-    compute_occs(corpus)
+    compute_occs(corpus, session=session)
@@ -31,8 +31,12 @@ parsers = Parsers()

 # resources management
-def add_resource(corpus, **kwargs):
-    session = get_session()
+def add_resource(corpus, session=None, **kwargs):
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     # only for tests
     resource = Resource(guid=str(random()), **kwargs )
@@ -67,10 +71,16 @@ def add_resource(corpus, **kwargs):
     session.commit()
     # return result
     return resource
-    session.remove()
+    if sessionToRemove:
+        session.remove()

-def parse_resources(corpus, user=None, user_id=None):
-    session = get_session()
+def parse_resources(corpus, user=None, user_id=None, session=None):
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - parsing' % corpus.id)
@@ -180,7 +190,9 @@ def parse_resources(corpus, user=None, user_id=None):

     # mark the corpus as parsed
     corpus.parsed = True
-    session.remove()
+    if sessionToRemove:
+        session.remove()

 # ngrams extraction
 from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
@@ -210,8 +222,12 @@ class NgramsExtractors(defaultdict):

 ngramsextractors = NgramsExtractors()

-def extract_ngrams(corpus, keys, nlp=True):
-    session = get_session()
+def extract_ngrams(corpus, keys, nlp=True, session=None):
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True

     dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
     default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
@@ -304,7 +320,7 @@ def extract_ngrams(corpus, keys, nlp=True):
     dbg.message = 'insert %d associations' % len(node_ngram_data)
     # commit to database
     db.commit()
-    session.remove()
+    if sessionToRemove:
+        session.remove()

 def text_prepa(my_str):