Commit 6bbe13d0 authored by delanoe

[PARAMETERS] of the ngram workflow.

parent 51e6eb60
......@@ -8,6 +8,8 @@ from gargantext_web.db import Node, Ngram, NodeNgram, NodeNgramNgram, \
from gargantext_web.db import session, cache, get_or_create_node, bulk_insert
from analysis.lists import WeightedMatrix, UnweightedList, Translations
# keep list
def cooc(corpus=None
, field_X=None, field_Y=None
, miam_id=None, stop_id=None, group_id=None
......@@ -104,13 +106,13 @@ def cooc(corpus=None
# Cooc is symmetric: take only the main cooccurrences and cut at the limit
cooc_query = (cooc_query.filter(Node.parent_id == corpus.id, Node.type_id == doc_id)
cooc_query = (cooc_query
.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
.order_by(desc(func.count()))
.limit(limit)
#.limit(limit)
)
matrix = WeightedMatrix(cooc_query)
......
......@@ -229,10 +229,10 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
#print(n)
#print(m)
nodes_included = 300 #int(round(size/20,0))
nodes_included = 1000 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = 300 #int(round(size/10,0))
nodes_specific = 1000 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the included score for the node size
......
......@@ -36,7 +36,7 @@ ngrams = {'adenoic cystic basal cell carcinoma' : 5
}
'''
def getNgrams(corpus=None, limit=160):
def getNgrams(corpus=None, limit=1000):
'''
getNgrams :: Corpus -> [(Int, String, String, Float)]
'''
......@@ -63,7 +63,7 @@ def getNgrams(corpus=None, limit=160):
PrintException()
return(terms)
def compute_cvalue(corpus=None, limit=160):
def compute_cvalue(corpus=None, limit=1000):
'''
computeCvalue :: Corpus
frequency :: String -> Int -> Int
......
......@@ -7,17 +7,28 @@ from ngram.group import compute_groups
from ngram.miam import compute_miam
from gargantext_web.db import get_or_create_node
def ngram_workflow(corpus):
def ngram_workflow(corpus, n=5000):
'''
All the workflow to filter the ngrams.
'''
compute_tfidf(corpus)
compute_tfidf_global(corpus)
compute_cvalue(corpus,limit=10000) # size
compute_specificity(corpus,limit=10000)
part = round(n * 0.8)
compute_cvalue(corpus,limit=part) # size
part = round(part * 0.6)
compute_specificity(corpus,limit=part)
part = round(part * 0.5)
# compute_stop(corpus)
compute_groups(corpus,limit_inf=1000, limit_sup=5000)
compute_miam(corpus,limit=3000) # size
compute_groups(corpus,limit_inf=part, limit_sup=n)
compute_miam(corpus,limit=part) # size
compute_tfidf(corpus)
#corpus=session.query(Node).filter(Node.id==244250).first()
#ngram_workflow(corpus)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment