Commit 6bbe13d0 authored by delanoe

[PARAMETERS] of the ngram workflow.

parent 51e6eb60
...@@ -8,6 +8,8 @@ from gargantext_web.db import Node, Ngram, NodeNgram, NodeNgramNgram, \ ...@@ -8,6 +8,8 @@ from gargantext_web.db import Node, Ngram, NodeNgram, NodeNgramNgram, \
from gargantext_web.db import session, cache, get_or_create_node, bulk_insert from gargantext_web.db import session, cache, get_or_create_node, bulk_insert
from analysis.lists import WeightedMatrix, UnweightedList, Translations from analysis.lists import WeightedMatrix, UnweightedList, Translations
# keep list
def cooc(corpus=None def cooc(corpus=None
, field_X=None, field_Y=None , field_X=None, field_Y=None
, miam_id=None, stop_id=None, group_id=None , miam_id=None, stop_id=None, group_id=None
...@@ -104,13 +106,13 @@ def cooc(corpus=None ...@@ -104,13 +106,13 @@ def cooc(corpus=None
# Cooc is symmetric, take only the main cooccurrences and cut at the limit # Cooc is symmetric, take only the main cooccurrences and cut at the limit
cooc_query = (cooc_query.filter(Node.parent_id == corpus.id, Node.type_id == doc_id) cooc_query = (cooc_query
.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id) .filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id) .group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
.order_by(desc(func.count())) .order_by(desc(func.count()))
.limit(limit) #.limit(limit)
) )
matrix = WeightedMatrix(cooc_query) matrix = WeightedMatrix(cooc_query)
......
...@@ -229,10 +229,10 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz ...@@ -229,10 +229,10 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
#print(n) #print(n)
#print(m) #print(m)
nodes_included = 300 #int(round(size/20,0)) nodes_included = 1000 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0)) #nodes_excluded = int(round(size/10,0))
nodes_specific = 300 #int(round(size/10,0)) nodes_specific = 1000 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0)) #nodes_generic = int(round(size/10,0))
# TODO use the included score for the node size # TODO use the included score for the node size
......
...@@ -36,7 +36,7 @@ ngrams = {'adenoic cystic basal cell carcinoma' : 5 ...@@ -36,7 +36,7 @@ ngrams = {'adenoic cystic basal cell carcinoma' : 5
} }
''' '''
def getNgrams(corpus=None, limit=160): def getNgrams(corpus=None, limit=1000):
''' '''
getNgrams :: Corpus -> [(Int, String, String, Float)] getNgrams :: Corpus -> [(Int, String, String, Float)]
''' '''
...@@ -63,7 +63,7 @@ def getNgrams(corpus=None, limit=160): ...@@ -63,7 +63,7 @@ def getNgrams(corpus=None, limit=160):
PrintException() PrintException()
return(terms) return(terms)
def compute_cvalue(corpus=None, limit=160): def compute_cvalue(corpus=None, limit=1000):
''' '''
computeCvalue :: Corpus computeCvalue :: Corpus
frequency :: String -> Int -> Int frequency :: String -> Int -> Int
......
...@@ -7,17 +7,28 @@ from ngram.group import compute_groups ...@@ -7,17 +7,28 @@ from ngram.group import compute_groups
from ngram.miam import compute_miam from ngram.miam import compute_miam
from gargantext_web.db import get_or_create_node from gargantext_web.db import get_or_create_node
def ngram_workflow(corpus): def ngram_workflow(corpus, n=5000):
''' '''
All the workflow to filter the ngrams. All the workflow to filter the ngrams.
''' '''
compute_tfidf(corpus)
compute_tfidf_global(corpus) compute_tfidf_global(corpus)
compute_cvalue(corpus,limit=10000) # size
compute_specificity(corpus,limit=10000) part = round(n * 0.8)
compute_cvalue(corpus,limit=part) # size
part = round(part * 0.6)
compute_specificity(corpus,limit=part)
part = round(part * 0.5)
# compute_stop(corpus) # compute_stop(corpus)
compute_groups(corpus,limit_inf=1000, limit_sup=5000) compute_groups(corpus,limit_inf=part, limit_sup=n)
compute_miam(corpus,limit=3000) # size
compute_miam(corpus,limit=part) # size
compute_tfidf(corpus)
#corpus=session.query(Node).filter(Node.id==244250).first() #corpus=session.query(Node).filter(Node.id==244250).first()
#ngram_workflow(corpus) #ngram_workflow(corpus)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment