Commit 4c9cb5ea authored by delanoe

[FEAT] Keep node.

parent e02cd586
@@ -29,7 +29,6 @@ from sqlalchemy.orm import aliased

def diag_null(x):
    return x - x * scipy.eye(x.shape[0])

def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
    '''
    do_distance :: Int -> (Graph, Partition, {ids}, {weight})
@@ -75,10 +74,10 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
    n = n.sort(inplace=False)
    m = m.sort(inplace=False)

-    nodes_included = 300 #int(round(size/20,0))
+    nodes_included = 500 #int(round(size/20,0))
    #nodes_excluded = int(round(size/10,0))

-    nodes_specific = 300 #int(round(size/10,0))
+    nodes_specific = 500 #int(round(size/10,0))
    #nodes_generic = int(round(size/10,0))

    # TODO use the included score for the node size
@@ -87,6 +86,7 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
    #m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
    # Specific:
    m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
+    #m_index = pd.Index.intersection(x.index, n.index[:nodes_included])

    x_index = pd.Index.union(n_index, m_index)
    xx = x[list(x_index)].T[list(x_index)]
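As a side note, a minimal pandas sketch of the node-selection step above (the names occ, spec and matrix_index are illustrative, and the slicing direction is an assumption, not taken from the repository): the graph keeps the union of the highest-ranked ngrams of each score, restricted to the index of the cooccurrence matrix.

    import pandas as pd

    # Illustrative scores; in the real code the two rankings come from the cooccurrence matrix.
    occ  = pd.Series({'graph': 12, 'node': 9, 'list': 7, 'rare': 1})
    spec = pd.Series({'node': 0.9, 'list': 0.8, 'graph': 0.2, 'rare': 0.1})

    nodes_included = 2
    nodes_specific = 2

    n = occ.sort_values()    # ascending, like the old Series.sort(inplace=False)
    m = spec.sort_values()

    matrix_index = pd.Index(['graph', 'node', 'list', 'rare'])  # index of the weight matrix x

    # Keep the last entries of each ranking (the highest scores) that appear in the matrix.
    n_index = matrix_index.intersection(n.index[-nodes_included:])
    m_index = matrix_index.intersection(m.index[-nodes_specific:])
    x_index = n_index.union(m_index)   # nodes actually kept in the graph
    print(sorted(x_index))             # ['graph', 'list', 'node']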
@@ -113,7 +113,6 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
    return(G,partition,ids,weight)

def get_cooc(request=None, corpus=None
            , field1='ngrams', field2='ngrams'
            , cooc_id=None, type='node_link', size=1000
@@ -126,7 +125,7 @@ def get_cooc(request=None, corpus=None
    data = {}
    #if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
    print("Coocurrences do not exist yet, create it.")
-    miam_id = get_or_create_node(nodetype='MiamList', corpus=corpus).id
+    miam_id = get_or_create_node(nodetype='MapList', corpus=corpus).id
    stop_id = get_or_create_node(nodetype='StopList', corpus=corpus).id
    group_id = get_or_create_node(nodetype='Group', corpus=corpus).id
@@ -141,9 +140,9 @@ def get_cooc(request=None, corpus=None
    #cooc_id = get_or_create_node(nodetype='Cooccurrence', corpus=corpus).id
    cooc_id = do_cooc(corpus=corpus, field1="ngrams", field2="ngrams"
                     , miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size
-                     , isMonopartite=isMonopartite , start=start , end=end , apax=apax)
+                     , isMonopartite=True, start=start , end=end , apax=apax)

-    G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=isMonopartite)
+    G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=True)

    if type == "node_link":
        nodesB_dict = {}
@@ -173,8 +172,8 @@ def get_cooc(request=None, corpus=None
            s = e[0]
            t = e[1]
            info = {
-                    "s":ids[s][1] ,
-                    "t":ids[t][1] ,
+                    "s": ids[s][1] ,
+                    "t": ids[t][1] ,
                    "w": G[ids[s][1]][ids[t][1]]["weight"]
                   }
            # print(info)
@@ -216,15 +215,13 @@ def get_cooc(request=None, corpus=None
    return(data)

def get_graphA( nodeA_type , NodesB , links , corpus ):
    from analysis.InterUnion import Utils
    print(" = = = == = = = ")
    print("In get_graphA(), corpus id:",corpus.id)

    nodeA_type_id = cache.Hyperdata[nodeA_type].id
-    threshold_cotainf = 0.05
+    threshold_cotainf = 0.02
    max_nodeid = -1
    for nodeid in NodesB:
        if nodeid > max_nodeid:
...
@@ -91,7 +91,7 @@ print('Initialize node types...')
node_types = [
    'Root', 'Trash',
    'Project', 'Corpus', 'Document',
-    'MiamList', 'StopList', 'MainList',
+    'MiamList', 'StopList', 'MainList', 'MapList', # TODO MiamList -> MainList
    'Stem', 'Lem', 'Group', 'Tfidf', 'Tfidf (global)', 'Cvalue', 'Specificity'
    , 'Cooccurrence',
]
...
@@ -7,6 +7,7 @@ from gargantext_web.db import NodeNgram,NodeNodeNgram
from gargantext_web.db import *
from gargantext_web.db import get_or_create_node
+from analysis.lists import Translations, UnweightedList
from parsing.corpustools import *

import sqlalchemy as sa
@@ -21,62 +22,7 @@ from collections import defaultdict
from math import log
from functools import reduce

-def queryNodeNodeNgram(nodeMeasure_id=None, corpus_id=None, limit=None):
-    '''
-    queryNodeNodeNgram :: Int -> Int -> Int -> (Int, String, Float)
-    Get list of ngrams according to a measure related to the corpus: maybe tfidf
-    cvalue.
-    '''
-    query = (session.query(Ngram.id, Ngram.terms, NodeNodeNgram.score)
-             .join(NodeNodeNgram, NodeNodeNgram.ngram_id == Ngram.id)
-             .join(Node, Node.id == NodeNodeNgram.nodex_id)
-             .filter(NodeNodeNgram.nodex_id == nodeMeasure_id)
-             .filter(NodeNodeNgram.nodey_id == corpus_id)
-             .group_by(Ngram.id, Ngram.terms, NodeNodeNgram.score)
-             .order_by(desc(NodeNodeNgram.score))
-            )
-
-    if limit is None:
-        query = query.count()
-    elif limit == 0 :
-        query = query.all()
-    else:
-        query = query.limit(limit)
-
-    return(query)
-
-def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
-    '''
-    getNgrams :: Corpus -> [(Int, String)] -> [(Int, String)]
-    For a corpus, gives list of highest Cvalue ngrams and highest TFIDF (global)
-    ngrams that have to be grouped with
-    '''
-    #tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
-    cvalue_node = get_or_create_node(nodetype='Cvalue', corpus=corpus)
-    spec_node = get_or_create_node(nodetype='Specificity', corpus=corpus)
-
-    #tfidf_ngrams = queryNodeNodeNgram(nodeMeasure_id=tfidf_node.id, corpus_id=corpus.id)
-    cvalue_ngrams = queryNodeNodeNgram(nodeMeasure_id=cvalue_node.id, corpus_id=corpus.id, limit=limit_sup)
-    spec_ngrams = queryNodeNodeNgram(nodeMeasure_id=spec_node.id, corpus_id=corpus.id, limit=limit_inf)
-
-    #print([n for n in tfidf_ngrams])
-
-    def list2set(_list):
-        _set = set()
-        for n in _list:
-            _set.add((n[0],n[1]))
-        return(_set)
-
-    cvalue_set = set()
-    spec_set = set()
-
-    cvalue_set = list2set(cvalue_ngrams)
-    spec_set = list2set(spec_ngrams)
-
-    cvalue_setDiff = cvalue_set.difference(spec_set)
-
-    return(spec_set,cvalue_setDiff)

def getStemmer(corpus):
    '''
@@ -121,17 +67,35 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
    miam_to_insert = set()
    miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)

+    stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
+    #stop_list = UnweightedList(stop_node.id)
+    Stop = aliased(NodeNgram)

    frequency = sa.func.count(NodeNgram.weight)
    ngrams = (session.query(Ngram.id, Ngram.terms, frequency )
              .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
              .join(Node, Node.id == NodeNgram.node_id)
+              #.outerjoin(Stop, Stop.ngram_id == Ngram.id)
+              #.filter(Stop.node_id == stop_node.id, Stop.ngram_id == None)
              .filter(Node.parent_id==corpus.id, Node.type_id==cache.NodeType['Document'].id)
              .group_by(Ngram.id)
              .order_by(desc(frequency))
              #.all()
              .limit(limit_sup)
             )

+    stops = (session.query(Ngram.id, Ngram.terms, frequency)
+             .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+             .join(Node, Node.id == NodeNgram.node_id)
+             .join(Stop, Stop.ngram_id == Ngram.id)
+             .filter(Stop.node_id == stop_node.id)
+             .filter(Node.parent_id==corpus.id, Node.type_id==cache.NodeType['Document'].id)
+             .group_by(Ngram.id)
+             .all()
+            )
+
+    ngrams = [n for n in ngrams if n not in stops]
+    print(ngrams)

    #group = defaultdict(lambda : defaultdict())
    ids_dict = dict()
    mainform_dict = dict()
...
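A rough standalone illustration of the stop-filtering idea added above (the sample tuples are made up): the patch compares whole (id, terms, frequency) rows, which only works when both queries return identical rows, so an id-based comparison is a common, slightly safer variant.

    # ngrams and stops stand for the rows returned by the two queries above.
    ngrams = [(1, 'graph theory', 42), (2, 'the', 530), (3, 'map list', 17)]
    stops  = [(2, 'the', 530)]

    # Direct tuple comparison, as in the patch:
    kept = [n for n in ngrams if n not in stops]

    # Id-based variant, robust even if the score columns differ between the two queries:
    stop_ids = {s[0] for s in stops}
    kept_by_id = [n for n in ngrams if n[0] not in stop_ids]

    assert kept == kept_by_id == [(1, 'graph theory', 42), (3, 'map list', 17)]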
# Without this, we couldn't use the Django environment
-#from admin.env import *
+from admin.env import *
#from ngram.stemLem import *

from admin.utils import PrintException,DebugTime
@@ -15,42 +15,51 @@ from sqlalchemy.orm import aliased
from ngram.tools import insert_ngrams
import csv

-def compute_miam(corpus,limit=500):
+def compute_mapList(corpus,limit=500):
    '''
    According to Specificities and stoplist,
    '''

    dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

-    node_group = get_or_create_node(nodetype='Group', corpus=corpus)
+    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
    node_stop = get_or_create_node(nodetype='StopList', corpus=corpus)
+    node_group = get_or_create_node(nodetype='Group', corpus=corpus)
    node_spec = get_or_create_node(nodetype='Specificity', corpus=corpus)

+    Miam=aliased(NodeNgram)
    Stop=aliased(NodeNgram)
    Group=aliased(NodeNgramNgram)
    Spec=aliased(NodeNodeNgram)

-    top_miam = (session.query(Spec.ngram_id, Spec.score)
-                .outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
-                .outerjoin(Stop, Stop.ngram_id == Spec.ngram_id)
-                .filter(Group.node_id == node_group.id)
-                .filter(Stop.node_id == node_stop.id)
+    top_ngrams = (session.query(Spec.ngram_id, Spec.score)
+                  .join(Miam, Spec.ngram_id == Miam.ngram_id)
+                  #.outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
+                  #.outerjoin(Stop, Stop.ngram_id == Spec.ngram_id)
+                  .filter(Miam.node_id == node_miam.id)
+                  #.filter(Group.node_id == node_group.id)
+                  #.filter(Stop.node_id == node_stop.id)
+                  .filter(Spec.nodex_id == node_spec.id)
                  .order_by(desc(Spec.score))
                  .limit(limit)
                 )

-    print([t for t in top_miam])
-    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
-    session.query(NodeNgram).filter(NodeNgram.node_id==node_miam.id).delete()
+    #print([t for t in top_ngrams])
+    node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus)
+    session.query(NodeNgram).filter(NodeNgram.node_id==node_mapList.id).delete()
    session.commit()

    data = zip(
-        [node_miam.id for i in range(1,limit)]
-        , [n[0] for n in top_miam]
+        [node_mapList.id for i in range(1,limit)]
+        , [n[0] for n in top_ngrams]
        , [1 for i in range(1,limit)]
        )
-    print([d for d in data])
+    #print([d for d in data])
    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

-    dbg.show('Miam computed')
+    dbg.show('MapList computed')

def insert_miam(corpus, ngrams=None, path_file_csv=None):
    dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
@@ -87,8 +96,41 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
    file_csv.close()
    dbg.show('Miam computed')

-#corpus = session.query(Node).filter(Node.id==556113).first()
+#corpus = session.query(Node).filter(Node.id==540420).first()
+#compute_mapList(corpus)
#insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")
+
+#def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
+#    '''
+#    getNgrams :: Corpus -> [(Int, String)] -> [(Int, String)]
+#    For a corpus, gives list of highest Cvalue ngrams and highest TFIDF (global)
+#    ngrams that have to be grouped with
+#    '''
+#    #tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
+#    cvalue_node = get_or_create_node(nodetype='Cvalue', corpus=corpus)
+#    spec_node = get_or_create_node(nodetype='Specificity', corpus=corpus)
+#
+#
+#    #tfidf_ngrams = queryNodeNodeNgram(nodeMeasure_id=tfidf_node.id, corpus_id=corpus.id)
+#    cvalue_ngrams = queryNodeNodeNgram(nodeMeasure_id=cvalue_node.id, corpus_id=corpus.id, limit=limit_sup)
+#    spec_ngrams = queryNodeNodeNgram(nodeMeasure_id=spec_node.id, corpus_id=corpus.id, limit=limit_inf)
+#
+#    #print([n for n in tfidf_ngrams])
+#
+#    def list2set(_list):
+#        _set = set()
+#        for n in _list:
+#            _set.add((n[0],n[1]))
+#        return(_set)
+#
+#    cvalue_set = set()
+#    spec_set = set()
+#
+#    cvalue_set = list2set(cvalue_ngrams)
+#    spec_set = list2set(spec_ngrams)
+#
+#    cvalue_setDiff = cvalue_set.difference(spec_set)
+#
+#    return(spec_set,cvalue_setDiff)
+#
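Taken together, the rewritten function keeps the ngrams of the Miam list, ranks them by their specificity score and saves the top limit of them as the corpus MapList. A schematic, framework-free sketch of that flow (the helper name build_map_list and its plain-dict inputs are illustrative, not part of the codebase):

    def build_map_list(miam_ids, specificity, limit=500):
        '''Pick the `limit` most specific ngrams among those present in the Miam list.

        miam_ids    -- set of ngram ids belonging to the Miam list
        specificity -- dict mapping ngram_id to its specificity score
        Returns (ngram_id, weight) rows ready for a bulk insert into the MapList node.
        '''
        ranked = sorted(
            (ng for ng in specificity if ng in miam_ids),
            key=lambda ng: specificity[ng],
            reverse=True,
        )
        return [(ng, 1) for ng in ranked[:limit]]

    # Tiny example: ngram 4 scores highest but is not in the Miam list, so it is skipped.
    miam_ids = {1, 2, 3}
    specificity = {1: 0.2, 2: 0.9, 3: 0.5, 4: 0.99}
    print(build_map_list(miam_ids, specificity, limit=2))   # [(2, 1), (3, 1)]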
-# Without this, we couldn't use the Django environment
-#from admin.env import *
-#from ngram.stemLem import *
import re
from admin.utils import PrintException
-from gargantext_web.db import NodeNgram,NodeNodeNgram
-from gargantext_web.db import cache, session, get_or_create_node
+from gargantext_web.db import Node, Ngram, NodeNgram,NodeNodeNgram
+from gargantext_web.db import cache, session, get_or_create_node, bulk_insert
+import sqlalchemy as sa
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
@@ -38,7 +35,6 @@ def importStopList(node,filename,language='fr'):
    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

def isStopWord(ngram, stop_words=None):
    '''
    ngram :: (Int, String) => (ngram_id, ngram_terms)
@@ -55,8 +51,9 @@ def isStopWord(ngram, stop_words=None):
        if format_regex.match(word) :
            return(True)

-        for regex in ["(.*)\d(.*)"
-                     , "^.{1,2}$"
+        for regex in [
+                     "^.{1,2}$"
+                     , "(.*)\d(.*)"
                     , "(.*)(\.)(.*)"
                     , "(.*)(\,)(.*)"
                     , "(.*)(study)(.*)"
@@ -73,13 +70,11 @@ def isStopWord(ngram, stop_words=None):
        if test_match(word, regex) is True :
            return(True)

-def compute_stop(corpus,size=2000,debug=False):
+def compute_stop(corpus,limit=2000,debug=False):
    '''
    do some statitics on all stop lists of database of the same type
    '''
    stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
-    miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)

    # TODO do a function to get all stop words with social scores
    root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
@@ -90,34 +85,26 @@ def compute_stop(corpus,size=2000,debug=False):
                  .filter(NodeNgram.node_id == root_stop_id)
                  .all()
                  )
+    #print([n for n in stop_words])

+    frequency = sa.func.count( NodeNgram.weight )
+    ngrams = ( session.query( Ngram.id, Ngram.terms, frequency )
+               .join( NodeNgram, NodeNgram.ngram_id == Ngram.id )
+               .join( Node, Node.id == NodeNgram.node_id )
+               .filter( Node.parent_id == corpus.id,
+                        Node.type_id == cache.NodeType['Document'].id )
+               .group_by( Ngram.id )
+               .order_by( desc( frequency ) )
+               .all()
+               #.limit(limit)
+             )

-    top_words = (session.query(Ngram.id, Ngram.terms)
-                 .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
-                 .filter(NodeNgram.node_id == miam_node.id)
-                 .order_by(desc(NodeNgram.weight))
-                 .limit(size)
-                )

-    ngrams_to_stop = filter(lambda x: isStopWord(x,stop_words=stop_words), top_words)
+    ngrams_to_stop = filter(lambda x: isStopWord(x,stop_words=stop_words), ngrams)
+    #print([n for n in ngrams_to_stop])

    stop = WeightedList({ n[0] : -1 for n in ngrams_to_stop})
    stop.save(stop_node.id)

-    miam = UnweightedList(miam_node.id)
-    new_miam = miam - stop
-    new_miam.save(miam_node.id)

-    # data = zip(
-    #     [stop_node.id for i in range(0,size)]
-    #     , [ngram[0] for ngram in ngrams_to_stop]
-    #     , [-1 for i in range(0,size)]
-    #     )
-    # bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

-#corpus=session.query(Node).filter(Node.id==545461).first()
-#compute_stop(corpus)
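For reference, a self-contained sketch of the regex test that the reordered pattern list above feeds into (the helper name is_stop_word is illustrative; the real isStopWord works on (id, terms) tuples and also consults a stop-word list): a term is flagged as soon as it matches one of the patterns.

    import re

    PATTERNS = [
        r"^.{1,2}$",         # one- or two-character tokens
        r"(.*)\d(.*)",       # anything containing a digit
        r"(.*)(\.)(.*)",     # anything containing a dot
        r"(.*)(\,)(.*)",     # anything containing a comma
        r"(.*)(study)(.*)",  # domain-specific noise
    ]

    def is_stop_word(term, stop_words=None):
        '''Return True if term is an explicit stop word or matches one of the patterns.'''
        if stop_words and term in stop_words:
            return True
        return any(re.match(pattern, term) for pattern in PATTERNS)

    print(is_stop_word("of"))            # True  (too short)
    print(is_stop_word("covid19"))       # True  (contains a digit)
    print(is_stop_word("graph model"))   # False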
@@ -109,3 +109,30 @@ def insert_nodengramngram(nodengramngram):
    ''' % (NodeNgramNgram.__table__.name,))
    db.commit()

+#def queryNodeNodeNgram(nodeMeasure_id=None, corpus_id=None, limit=None):
+#    '''
+#    queryNodeNodeNgram :: Int -> Int -> Int -> (Int, String, Float)
+#    Get list of ngrams according to a measure related to the corpus: maybe tfidf
+#    cvalue.
+#    '''
+#    query = (session.query(Ngram.id, Ngram.terms, NodeNodeNgram.score)
+#             .join(NodeNodeNgram, NodeNodeNgram.ngram_id == Ngram.id)
+#             .join(Node, Node.id == NodeNodeNgram.nodex_id)
+#             .filter(NodeNodeNgram.nodex_id == nodeMeasure_id)
+#             .filter(NodeNodeNgram.nodey_id == corpus_id)
+#             .group_by(Ngram.id, Ngram.terms, NodeNodeNgram.score)
+#             .order_by(desc(NodeNodeNgram.score))
+#            )
+#
+#    if limit is None:
+#        query = query.count()
+#    elif limit == 0 :
+#        query = query.all()
+#    else:
+#        query = query.limit(limit)
+#
+#    return(query)
+#
@@ -4,8 +4,9 @@ from ngram.cvalue import compute_cvalue
from ngram.specificity import compute_specificity
#from ngram.stop import compute_stop
from ngram.group import compute_groups
-from ngram.miam import compute_miam
from gargantext_web.db import get_or_create_node
+from ngram.mapList import compute_mapList

#from gargantext_web.celery import update_processing

@@ -13,31 +14,32 @@ def ngram_workflow(corpus, n=5000):
    '''
    All the workflow to filter the ngrams.
    '''
-    compute_tfidf_global(corpus)
+    #compute_tfidf_global(corpus)

-    part = round(n * 0.8)
+    part = round(n * 0.9)

-    compute_cvalue(corpus,limit=part) # size
+    #compute_cvalue(corpus,limit=part) # size

-    part = round(part * 0.4)
+    part = round(part * 0.8)
    print('spec part:', part)
-    compute_specificity(corpus,limit=part)
+    #compute_specificity(corpus,limit=part)

-    part = round(part * 0.5)
+    part = round(part * 0.8)
    # compute_stop(corpus)

    limit_inf = round(part * 1)
    limit_sup = round(part * 5)
    print(limit_inf,limit_sup)
-    compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup)
+    #compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup)

-    # compute_miam(corpus,limit=part) # size
+    compute_mapList(corpus,limit=part) # size

-    compute_tfidf(corpus)
+    #compute_tfidf(corpus)

-#corpus=session.query(Node).filter(Node.id==257579).first()
+#corpus=session.query(Node).filter(Node.id==540420).first()
+#corpus=session.query(Node).filter(Node.id==559637).first()
#ngram_workflow(corpus)
#update_processing(corpus, 0)
...
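With the new coefficients, the sizes flowing through ngram_workflow for the default n=5000 work out as follows (a quick arithmetic check; the old values are shown for comparison):

    n = 5000
    part = round(n * 0.9)        # 4500   (previously round(n * 0.8)    = 4000)
    part = round(part * 0.8)     # 3600   (previously round(4000 * 0.4) = 1600)
    part = round(part * 0.8)     # 2880   (previously round(1600 * 0.5) =  800)

    limit_inf = round(part * 1)  # 2880
    limit_sup = round(part * 5)  # 14400
    print(limit_inf, limit_sup)  # 2880 14400

    # compute_mapList(corpus, limit=part) therefore keeps at most 2880 map-list terms.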