Commit 1219970a authored by Administrator

[FEAT] Simple function to populate ngram list

	modified:         ngram/lists.py
	New simple function "ngrams2miam" to test the API

	modified:         test-list-management.py
	@Elias: see in this file how to use the ngrams2miam function
	to populate your MiamList and test it with your API.

	modified:         parsing/corpustools.py
	Only cosmetics here.
parent 256fda0e
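
For context, a minimal usage sketch of the new function (not part of the commit; it assumes `user` and `corpus` objects already fetched through the session, as done in test-list-management.py below):

from ngram.lists import ngrams2miam, listIds, listNgramIds

# fill the corpus's MiamList with every ngram found in its documents
ngrams2miam(user_id=user.id, corpus_id=corpus.id)

# read the list back through the same helpers the test script uses
miam_id = listIds(typeList='MiamList', user_id=user.id, corpus_id=corpus.id)[0][0]
print(listNgramIds(list_id=miam_id, user_id=user.id, corpus_id=corpus.id)[:3])

ngram/lists.py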
import sys
from admin.utils import PrintException
from gargantext_web.db import NodeNgram
......@@ -167,10 +166,7 @@ def ngramList(do=None, ngram_ids=None, list_id=None) :
# Some functions to manage automatically the lists
def doStopList(user_id=None, corpus_id=None,
stop_id=None,
reset=False, limit=None
):
def doStopList(user_id=None, corpus_id=None, stop_id=None, reset=False, limit=None):
'''
Compute automatically the stopList and returns its Node.id
Algo: TODO tfidf according type of corpora
......@@ -183,9 +179,37 @@ def doStopList(user_id=None, corpus_id=None,
# according to type of corpus, choose the right default stopList
def ngrams2miam(user_id=None, corpus_id=None):
'''
Create a Miam List only
'''
miam_id = listIds(typeList='MiamList', user_id=user_id, corpus_id=corpus_id)[0][0]
print(miam_id)
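# for each ngram attached to one of this corpus's documents, count its NodeNgram rows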
query = (session.query(
literal_column(str(miam_id)).label("node_id"),
Ngram.id,
func.count(),
)
.select_from(Ngram)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, NodeNgram.node_id == Node.id)
.filter(Node.parent_id == corpus_id)
.filter(Node.type_id == cache.NodeType['Document'].id)
.group_by(Ngram.id)
#.limit(10)
.all()
)
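# insert one (node_id=miam_id, ngram_id, weight=count) row per ngram into NodeNgram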
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], query)
def doList(
type_list='miam',
type_list='MiamList',
user_id=None, corpus_id=None,
miam_id=None, stop_id=None, main_id=None,
lem_id=None, stem_id=None, cvalue_id=None, group_id=None,
......@@ -209,9 +233,8 @@ def doList(
cvalue = equivalent N-Words according to C-Value (but the main form)
'''
if type_list not in ['miam', 'main']:
print('Type List supported: \'miam\' or \'main\'')
sys.exit(0)
if type_list not in ['MiamList', 'MainList']:
raise Exception("Type List (%s) not supported, try: \'MiamList\' or \'MainList\'" % type_list)
try:
list_dict = {
......@@ -246,7 +269,7 @@ def doList(
stopNgram = aliased(NodeNgram)
if 'miam' == type_list:
if type_list == 'MiamList' :
query = (session.query(
literal_column(str(list_dict['miam']['id'])).label("node_id"),
Ngram.id,
......@@ -266,7 +289,7 @@ def doList(
.group_by(Ngram.id)
)
elif 'main' == type_list:
elif type_list == 'MainList' :
# Query to get Ngrams for main list
query = (session.query(
literal_column(str(list_dict['main']['id'])).label("node_id"),
parsing/corpustools.py
......@@ -97,7 +97,7 @@ def parse_resources(corpus, user=None, user_id=None):
.filter(Node_Resource.parsed == False)
)
# make a new node for every parsed document of the corpus
print(resources_query)
# print(resources_query)
dbg.show('analyze documents')
nodes = list()
for resource, resourcetype in resources_query:
......@@ -141,7 +141,7 @@ def parse_resources(corpus, user=None, user_id=None):
hyperdata.name: hyperdata
for hyperdata in session.query(Hyperdata)
}
print('hyperdata_types', hyperdata_types)
#print('hyperdata_types', hyperdata_types)
for node in nodes:
node_id = node.id
for hyperdata_key, hyperdata_value in node.hyperdata.items():
......@@ -157,10 +157,10 @@ def parse_resources(corpus, user=None, user_id=None):
hyperdata_value,
))
print('I am here', node_hyperdata_lists.items())
#print('I am here', node_hyperdata_lists.items())
for key, values in node_hyperdata_lists.items():
print('here', key, values)
#print('here', key, values)
bulk_insert(Node_Hyperdata, ['node_id', 'hyperdata_id', 'value_'+key], values)
# mark the corpus as parsed
corpus.parsed = True
test-list-management.py
......@@ -8,7 +8,7 @@ from ngram.lists import *
#from gargantext_web.views import empty_trash
#empty_trash()
#
#user = session.query(User).all()[0]
user = session.query(User).filter(User.username=='alexandre').first()
......@@ -36,6 +36,8 @@ if project is None:
corpus = session.query(Node).filter(Node.parent_id == project.id,
Node.type_id == cache.NodeType['Corpus'].id).first()
print('Corpus is', corpus)
if corpus is None:
corpus = Node(
parent_id = project.id,
......@@ -66,14 +68,14 @@ print('Working on corpus:', corpus.id, corpus.name)
stem_id = stem_corpus(corpus_id=corpus.id)
print('Stem Node.id is', stem_id)
for typeList in ['MiamList', 'StopList', 'MainList', 'Group']:
n = listIds(user_id=user.id,
corpus_id=corpus.id,
typeList=typeList)
#print(n[0][0])
print('Test having list_id')
print(n, listNgramIds(list_id=n[0][0])[:3])
#for typeList in ['MiamList', 'StopList', 'MainList', 'Group']:
# n = listIds(user_id=user.id,
# corpus_id=corpus.id,
# typeList=typeList)
# #print(n[0][0])
# print('Test having list_id')
# print(n, listNgramIds(list_id=n[0][0])[:3])
#
stop_list_id = listIds(user_id=user.id,
corpus_id=corpus.id,
......@@ -87,30 +89,53 @@ miam_list_id = listIds(user_id=user.id,
print('Stop List', stop_list_id)
print('Miam List', miam_list_id)
ngram_id = listNgramIds(list_id=miam_list_id)[0][0]
print('ngram_id', ngram_id)
ngramList(do='add', ngram_ids=[ngram_id,], list_id=stop_list_id)
ngrams2miam(user_id=user.id, corpus_id=corpus.id)
print(listNgramIds(list_id=miam_list_id, user_id=user.id, corpus_id=corpus.id))
#type_list='MiamList'
#try:
# d = doList(type_list=type_list, user_id = user.id, corpus_id = corpus.id, limit=150)
## print('Size of the ' + type_list + ' list:',
## session.query(NodeNgram).filter(NodeNgram.node_id == d).count()
## )
#except:
# PrintException()
##
#print(listNgramIds(list_id=miam_list_id, user_id=user.id, corpus_id=corpus.id))
#
#ngram_id = listNgramIds(list_id=miam_list_id, user_id=user.id, corpus_id=corpus.id)[0][0]
#print('ngram_id', ngram_id)
#
#ngramList(do='add', ngram_ids=[ngram_id,], list_id=stop_list_id)
# print('Test having typeList and corpus.id')
# print(n, listNgramIds(typeList=typeList, corpus_id=corpus.id, user_id=user.id)[:3])
##
# print('Test having typeList and corpus.id and doc_id')
# print(n, listNgramIds(typeList=typeList, corpus_id=corpus.id, doc_id=doc_id, user_id=user.id)[:3])
#
#
#type_list='miam'
#try:
# d = doList(type_list=type_list, user_id = user.id, corpus_id = corpus.id, stem_id=stem_id, limit=150)
# print('Size of the ' + type_list + ' list:',
# session.query(NodeNgram).filter(NodeNgram.node_id == d).count()
# )
#except:
# PrintException()
#