Commit 55136392 authored by delanoe

[FACTOR] One session of one workflow.

parent 29aa56bb
@@ -35,25 +35,28 @@ def apply_workflow(corpus_id):
     update_state = WorkflowTracking()
 
-    session = get_session()
-    corpus = session.query(Node).filter(Node.id==corpus_id).first()
-
-    update_state.processing_(corpus, "Parsing")
-    #cProfile.runctx('parse_resources(corpus)', global,locals)
-    parse_resources(corpus)
-
-    update_state.processing_(corpus, "Terms extraction")
-    extract_ngrams(corpus, ['title', 'abstract'], nlp=True)
-
-    # update_state.processing_(corpus, "")
-    ngram_workflow(corpus)
-    #ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
-
-    print("End of the Workflow for corpus %d" % (corpus_id))
-    update_state.processing_(corpus, "0")
-    session.remove()
+    try :
+        session = get_session()
+        corpus = session.query(Node).filter(Node.id==corpus_id).first()
+
+        update_state.processing_(corpus, "Parsing")
+        #cProfile.runctx('parse_resources(corpus)', global,locals)
+        parse_resources(corpus, session=session)
+
+        update_state.processing_(corpus, "Terms extraction")
+        extract_ngrams(corpus, ['title', 'abstract'], nlp=True, session=session)
+
+        # update_state.processing_(corpus, "")
+        ngram_workflow(corpus, session=session)
+        #ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
+
+        print("End of the Workflow for corpus %d" % (corpus_id))
+        update_state.processing_(corpus, "0")
+        session.remove()
+    except :
+        session.remove()
 
 @shared_task
 def empty_trash(corpus_id):
...
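The hunk above establishes the commit's core convention: every workflow step accepts an optional session, and apply_workflow owns a single session for the whole run. Note that the bare `except :` removes the session but also silences whatever error occurred; a try/finally would keep the cleanup while letting the exception propagate. Below, a minimal self-contained sketch of the convention, with make_session() and compute_step() as hypothetical stand-ins for get_session() and a real workflow step:

class FakeScopedSession:
    """Stand-in for the SQLAlchemy scoped session returned by get_session()."""
    def remove(self):
        print("session removed")

def make_session():
    # Hypothetical stand-in for gargantext_web.db.get_session().
    return FakeScopedSession()

def compute_step(corpus_id, session=None):
    # Reuse the caller's session when given; otherwise open our own and
    # remember that we are responsible for tearing it down.
    session_to_remove = False
    if session is None:
        session = make_session()
        session_to_remove = True
    try:
        print("processing corpus %d" % corpus_id)
    finally:
        # Remove only a session we created ourselves.
        if session_to_remove:
            session.remove()

# Standalone call: the step manages its own session.
compute_step(42)

# Orchestrated call: the workflow owns one session for all steps.
session = make_session()
try:
    compute_step(42, session=session)
    compute_step(42, session=session)
finally:
    session.remove()

Called standalone, a step opens and removes its own session; called from an orchestrator, it leaves the session's lifetime to the caller. That is exactly the split the sessionToRemove flag encodes in the hunks below.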
@@ -23,7 +23,6 @@ from math import log
 from functools import reduce
 
 def getStemmer(corpus):
     '''
     getStemmer :: Corpus -> Stemmer
@@ -48,11 +47,14 @@ def getStemmer(corpus):
     return(stemIt)
 
-def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
+def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem', session=None):
     '''
     group ngrams according to a function (stemming or lemmatization)
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     dbg = DebugTime('Corpus #%d - group' % corpus.id)
     dbg.show('Group')
@@ -140,4 +142,4 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
     bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'), [data for data in list(miam_to_insert)])
 
-    session.remove()
+    if sessionToRemove: session.remove()
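The four-line preamble above (sessionToRemove = False; if session is None: ...) and the matching conditional remove() now bracket every function this commit touches. A context manager could factor the idiom out; the sketch below is not part of the commit — `factory` stands in for get_session(), and the session object is assumed to expose remove() like SQLAlchemy's scoped_session:

from contextlib import contextmanager

@contextmanager
def session_scope(session=None, factory=None):
    # Yield `session` if given; otherwise create one via `factory` and
    # remove it on exit. Hypothetical helper, not part of this commit.
    created = False
    if session is None:
        session = factory()
        created = True
    try:
        yield session
    finally:
        if created:  # tear down only a session this block created
            session.remove()

Each compute_* body could then be wrapped in `with session_scope(session, get_session) as session:` instead of repeating the preamble and the conditional teardown.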
@@ -19,8 +19,12 @@ def compute_mapList(corpus,limit=500,n=1):
     '''
     According to Specificities and the stoplist, build the map list.
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     monograms_part = 0.005
     monograms_limit = round(limit * monograms_part)
     multigrams_limit = limit - monograms_limit
@@ -87,10 +91,15 @@ def compute_mapList(corpus,limit=500,n=1):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
 
     dbg.show('MapList computed')
-    session.remove()
+    if sessionToRemove: session.remove()
 
 def insert_miam(corpus, ngrams=None, path_file_csv=None):
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
 
     node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
@@ -124,6 +133,6 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
     file_csv.close()
     dbg.show('Miam computed')
-    session.remove()
+    if sessionToRemove: session.remove()
@@ -5,7 +5,14 @@ from gargantext_web.db import get_or_create_node
 from admin.utils import DebugTime
 
 def compute_occs(corpus):
-    session = get_session()
+    '''
+    compute_occs :: Corpus -> IO ()
+    '''
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     dbg = DebugTime('Corpus #%d - OCCURRENCES' % corpus.id)
     dbg.show('Calculate occurrences')
@@ -48,7 +55,7 @@ def compute_occs(corpus):
     )
     )
     db.commit()
-    session.remove()
+    if sessionToRemove: session.remove()
     #data = session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==occs_node.id).all()
...
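One inconsistency worth flagging: as rendered here, compute_mapList, insert_miam and compute_occs all test `if session is None:` without a session parameter appearing in their signatures, while ngram_workflow below calls compute_mapList and compute_occs with session=session. If the signatures really were left unchanged in this revision, those keyword calls would raise a TypeError (and the None test a NameError when the functions are called standalone), so the parameter was presumably added in a follow-up commit.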
@@ -15,12 +15,15 @@ from gargantext_web.db import NodeNgramNgram, NodeNodeNgram
 from sqlalchemy import desc, asc, or_, and_, Date, cast, select
 
-def specificity(cooc_id=None, corpus=None, limit=100):
+def specificity(cooc_id=None, corpus=None, limit=100, session=None):
     '''
     Compute the specificity, simple calculus.
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     cooccurrences = (session.query(NodeNgramNgram)
                      .filter(NodeNgramNgram.node_id==cooc_id)
                      .order_by(NodeNgramNgram.score)
@@ -55,17 +58,22 @@ def specificity(cooc_id=None, corpus=None, limit=100):
     bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [d for d in data])
     return(node.id)
-    session.remove()
+
+    if sessionToRemove: session.remove()
 
-def compute_specificity(corpus,limit=100):
+def compute_specificity(corpus,limit=100, session=None):
     '''
     Computing specificities as NodeNodeNgram.
     The whole workflow is the following:
     1) Compute the cooc matrix
     2) Compute the specificity score, save it in the database, and return its Node
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     dbg = DebugTime('Corpus #%d - specificity' % corpus.id)
     list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus, session=session)
@@ -73,7 +81,7 @@ def compute_specificity(corpus,limit=100):
     specificity(cooc_id=cooc_id,corpus=corpus,limit=limit)
     dbg.show('specificity')
-    session.remove()
+    if sessionToRemove: session.remove()
 
 #corpus=session.query(Node).filter(Node.id==244250).first()
 #compute_specificity(corpus)
...
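Also note that in specificity, both sides of the diff leave the teardown after `return(node.id)`, so it is unreachable: a session the function created for itself is never removed. A try/finally keeps the early return and still runs the cleanup; in this sketch, `work` is a hypothetical placeholder for the function body:

def specificity_sketch(work, session=None, factory=None):
    # Same optional-session convention as the commit; `factory` stands
    # in for get_session().
    sessionToRemove = False
    if session is None:
        session = factory()
        sessionToRemove = True
    try:
        return work(session)   # the early return still happens...
    finally:
        if sessionToRemove:    # ...but the teardown now runs on the way out
            session.remove()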
@@ -75,11 +75,14 @@ def isStopWord(ngram, stop_words=None):
         if test_match(word, regex) is True :
             return(True)
 
-def compute_stop(corpus,limit=2000,debug=False):
+def compute_stop(corpus,limit=2000,debug=False, session=None):
     '''
     do some statistics on all stop lists of the same type in the database
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     stop_node_id = get_or_create_node(nodetype='StopList', corpus=corpus).id
@@ -115,4 +118,4 @@ def compute_stop(corpus,limit=2000,debug=False):
     stop = WeightedList({ n[0] : -1 for n in ngrams_to_stop})
     stop.save(stop_node_id)
-    session.remove()
+    if sessionToRemove: session.remove()
@@ -5,9 +5,13 @@ from gargantext_web.db import get_session, get_or_create_node
 from admin.utils import DebugTime
 
-def compute_tfidf(corpus):
+def compute_tfidf(corpus, session=None):
     # compute terms frequency sum
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     dbg = DebugTime('Corpus #%d - TFIDF' % corpus.id)
     dbg.show('calculate terms frequencies sums')
@@ -121,15 +125,18 @@ def compute_tfidf(corpus):
     # the end!
     db.commit()
-    session.remove()
+    if sessionToRemove: session.remove()
 
-def compute_tfidf_global(corpus):
+def compute_tfidf_global(corpus, session=None):
     '''
     Maybe improve this with:
     #http://stackoverflow.com/questions/8674718/best-way-to-select-random-rows-postgresql
     '''
-    session = get_session()
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     dbg = DebugTime('Corpus #%d - tfidf global' % corpus.id)
     dbg.show('calculate terms frequencies sums')
@@ -265,7 +272,7 @@ def compute_tfidf_global(corpus):
     db.commit()
     dbg.show('insert tfidf')
-    session.remove()
+    if sessionToRemove: session.remove()
 
 #corpus=session.query(Node).filter(Node.id==244250).first()
 #compute_tfidf_global(corpus)
@@ -12,17 +12,17 @@ from gargantext_web.db import Node , NodeNgram
 from admin.utils import WorkflowTracking
 
-def ngram_workflow(corpus, n=5000):
+def ngram_workflow(corpus, n=5000, session=None):
     '''
     The whole workflow for filtering the ngrams.
     '''
     update_state = WorkflowTracking()
 
     update_state.processing_(corpus, "Stop words")
-    compute_stop(corpus)
+    compute_stop(corpus, session=session)
 
     update_state.processing_(corpus, "TF-IDF global score")
-    compute_tfidf_global(corpus)
+    compute_tfidf_global(corpus, session=session)
 
     part = round(n * 0.9)
@@ -32,7 +32,7 @@ def ngram_workflow(corpus, n=5000):
     #print('spec part:', part)
     update_state.processing_(corpus, "Specificity score")
-    compute_specificity(corpus,limit=part)
+    compute_specificity(corpus,limit=part, session=session)
 
     part = round(part * 0.8)
@@ -41,18 +41,18 @@ def ngram_workflow(corpus, n=5000):
     #print(limit_inf,limit_sup)
     update_state.processing_(corpus, "Synonyms")
     try:
-        compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup)
+        compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup, session=session)
     except Exception as error:
         print("Workflow Ngram Group error", error)
         pass
 
     update_state.processing_(corpus, "Map list terms")
-    compute_mapList(corpus,limit=1000) # size
+    compute_mapList(corpus,limit=1000, session=session) # size
 
     update_state.processing_(corpus, "TF-IDF local score")
-    compute_tfidf(corpus)
+    compute_tfidf(corpus, session=session)
 
     update_state.processing_(corpus, "Occurrences")
-    compute_occs(corpus)
+    compute_occs(corpus, session=session)
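With these signatures, the chain named in the commit message is complete: apply_workflow opens one session, ngram_workflow forwards it to every scoring step, and each step falls back to get_session() only when invoked standalone.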
@@ -31,8 +31,12 @@ parsers = Parsers()
 
 # resources management
-def add_resource(corpus, **kwargs):
-    session = get_session()
+def add_resource(corpus, session=None, **kwargs):
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     # only for tests
     resource = Resource(guid=str(random()), **kwargs )
@@ -67,10 +71,16 @@ def add_resource(corpus, **kwargs):
     session.commit()
     # return result
     return resource
-    session.remove()
+
+    if sessionToRemove:
+        session.remove()
 
-def parse_resources(corpus, user=None, user_id=None):
-    session = get_session()
+def parse_resources(corpus, user=None, user_id=None, session=None):
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     dbg = DebugTime('Corpus #%d - parsing' % corpus.id)
@@ -180,7 +190,9 @@ def parse_resources(corpus, user=None, user_id=None):
     # mark the corpus as parsed
     corpus.parsed = True
-    session.remove()
+
+    if sessionToRemove:
+        session.remove()
 
 # ngrams extraction
 from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
@@ -210,8 +222,12 @@ class NgramsExtractors(defaultdict):
 ngramsextractors = NgramsExtractors()
 
-def extract_ngrams(corpus, keys, nlp=True):
-    session = get_session()
+def extract_ngrams(corpus, keys, nlp=True, session=None):
+    sessionToRemove = False
+    if session is None:
+        session = get_session()
+        sessionToRemove = True
 
     dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
     default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
@@ -304,7 +320,9 @@ def extract_ngrams(corpus, keys, nlp=True):
     dbg.message = 'insert %d associations' % len(node_ngram_data)
     # commit to database
     db.commit()
-    session.remove()
+
+    if sessionToRemove:
+        session.remove()
 
 def text_prepa(my_str):
...
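The parsing layer follows suit: parse_resources and extract_ngrams now accept the session that apply_workflow passes in the first hunk. As with specificity above, though, add_resource's conditional remove() sits after `return resource` and never executes, so a session created inside that function is never removed.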