Commit 951381a2 authored by Mathieu Rodic

Merge branch 'unstable' of ssh://delanoe.org:1979/gargantext into unstable-201511-advancedcharts

parents bdb6ac85 4d8e6141
......@@ -3,6 +3,7 @@ import linecache
from time import time
from gargantext_web.settings import MEDIA_ROOT
from django.db import connection
class DebugTime:
def __init__(self, prefix):
......@@ -19,7 +20,6 @@ class DebugTime:
self.message = message
self.time = time()
def ensure_dir(user):
'''
If user is new, folder does not exist yet, create it then
......@@ -46,3 +46,19 @@ def PrintException():
print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj))
class WorkflowTracking:
def __init__( self ):
self.hola = "mundo"
def processing_(self , corpus , step):
try:
the_query = """ UPDATE node_node SET hyperdata=\'{ \"%s\" : \"%s\"}\' WHERE id=%d """ % ( "Processing", step , corpus.id )
cursor = connection.cursor()
try:
cursor.execute(the_query)
cursor.execute("COMMIT;")
finally:
connection.close()
except :
PrintException()
\ No newline at end of file
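For orientation, a minimal usage sketch of the new WorkflowTracking helper, mirroring the call pattern used in gargantext_web/celery.py further down in this diff (corpus_id is a placeholder):

# Sketch only; corpus_id is assumed to be the id of an existing corpus node.
from gargantext_web.db import session, Node
from admin.utils import WorkflowTracking

corpus = session.query(Node).filter(Node.id == corpus_id).first()
update_state = WorkflowTracking()
update_state.processing_(corpus, "Parsing")  # writes {"Processing": "Parsing"} into node_node.hyperdata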
......@@ -14,7 +14,7 @@ def do_cooc(corpus=None
, field1='ngrams', field2='ngrams'
, miam_id=None, stop_id=None, group_id=None
, cvalue_id=None
, n_min=2, n_max=None
, n_min=1, n_max=None
, start=None, end=None
, limit=1000
, isMonopartite=True
......@@ -62,7 +62,6 @@ def do_cooc(corpus=None
session.commit()
# END
session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==node_cooc.id).delete()
session.commit()
......@@ -186,17 +185,16 @@ def do_cooc(corpus=None
cooc = matrix & miam_list
elif miam_id is not None and stop_id is not None and group_id is None :
cooc = matrix & (miam_list - stop_list)
elif miam_id is not None and stop_id is not None and group_id is not None :
print("miam_id is not None and stop_id is not None and group_id is not None")
#cooc = matrix & (miam_list * group_list - stop_list)
cooc = matrix & (miam_list - stop_list)
cooc = matrix & (miam_list * group_list - stop_list)
#cooc = matrix & (miam_list - stop_list)
elif miam_id is not None and stop_id is None and group_id is not None :
cooc = matrix & (miam_list * group_list)
else :
cooc = matrix
else:
cooc = matrix
#print(cooc)
#print(" x " * 30)
cooc.save(node_cooc.id)
return(node_cooc.id)
from admin.utils import PrintException
from gargantext_web.db import *
from collections import defaultdict
from operator import itemgetter
from django.db import connection, transaction
import math
from math import log,sqrt
import scipy
from gargantext_web.db import get_or_create_node
import pandas as pd
from copy import copy
import numpy as np
import scipy
import networkx as nx
from networkx.readwrite import json_graph
from rest_v1_0.api import JsonHttpResponse
from analysis.louvain import best_partition, generate_dendogram, partition_at_level
from ngram.lists import listIds
from sqlalchemy.orm import aliased
def diag_null(x):
return x - x * scipy.eye(x.shape[0])
def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True, distance='conditional'):
'''
do_distance :: Int -> (Graph, Partition, {ids}, {weight})
'''
authorized = ['conditional', 'distributional', 'cosine']
if distance not in authorized:
distance = 'conditional'
matrix = defaultdict(lambda : defaultdict(float))
ids = defaultdict(lambda : defaultdict(int))
labels = dict()
weight = dict()
Cooc = aliased(NodeNgramNgram)
query = session.query(Cooc).filter(Cooc.node_id==cooc_id).all()
for cooc in query:
matrix[cooc.ngramx_id][cooc.ngramy_id] = cooc.score
matrix[cooc.ngramy_id][cooc.ngramx_id] = cooc.score
ids[cooc.ngramx_id] = (field1, cooc.ngramx_id)
ids[cooc.ngramy_id] = (field2, cooc.ngramy_id)
weight[cooc.ngramx_id] = weight.get(cooc.ngramx_id, 0) + cooc.score
weight[cooc.ngramy_id] = weight.get(cooc.ngramy_id, 0) + cooc.score
x = pd.DataFrame(matrix).fillna(0)
if distance == 'conditional':
x = x / x.sum(axis=1)
#y = y / y.sum(axis=0)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top included or excluded
n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] - 1))
n = n.sort(inplace=False)
m = m.sort(inplace=False)
nodes_included = 500 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = 500 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the included score for the node size
n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
#m_index = pd.Index.intersection(x.index, n.index[:nodes_included])
x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
# Removing unconnected nodes
xxx = xx.values
threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx >= threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
G = nx.from_numpy_matrix(np.matrix(matrix_filtered))
G = nx.relabel_nodes(G, dict(enumerate([ ids[id_][1] for id_ in list(xx.columns)])))
elif distance == 'cosine':
scd = defaultdict(lambda : defaultdict(int))
for i in matrix.keys():
for j in matrix.keys():
numerator = sum(
[
matrix[i][k] * matrix[j][k]
for k in matrix.keys()
if i != j and k != i and k != j
]
)
denominator = sqrt(
sum([
matrix[i][k]
for k in matrix.keys()
if k != i and k != j #and matrix[i][k] > 0
])
*
sum([
matrix[i][k]
for k in matrix.keys()
if k != i and k != j #and matrix[i][k] > 0
])
)
try:
scd[i][j] = numerator / denominator
except Exception as error:
scd[i][j] = 0
minmax = min([ max([ scd[i][j] for i in scd.keys()]) for j in scd.keys()])
G = nx.DiGraph()
G.add_edges_from(
[
(i, j, {'weight': scd[i][j]})
for i in scd.keys() for j in scd.keys()
if i != j and scd[i][j] > minmax and scd[i][j] > scd[j][i]
]
)
elif distance == 'distributional':
mi = defaultdict(lambda : defaultdict(int))
total_cooc = x.sum().sum()
for i in matrix.keys():
si = sum([matrix[i][j] for j in matrix[i].keys() if i != j])
for j in matrix[i].keys():
sj = sum([matrix[j][k] for k in matrix[j].keys() if j != k])
if i!=j :
mi[i][j] = log( matrix[i][j] / ((si * sj) / total_cooc) )
r = defaultdict(lambda : defaultdict(int))
for i in matrix.keys():
for j in matrix.keys():
sumMin = sum(
[
min(mi[i][k], mi[j][k])
for k in matrix.keys()
if i != j and k != i and k != j and mi[i][k] > 0
]
)
sumMi = sum(
[
mi[i][k]
for k in matrix.keys()
if k != i and k != j and mi[i][k] > 0
]
)
try:
r[i][j] = sumMin / sumMi
except Exception as error:
r[i][j] = 0
# Need to filter the weak links, automatic threshold here
minmax = min([ max([ r[i][j] for i in r.keys()]) for j in r.keys()])
G = nx.DiGraph()
G.add_edges_from(
[
(i, j, {'weight': r[i][j]})
for i in r.keys() for j in r.keys()
if i != j and r[i][j] > minmax and r[i][j] > r[j][i]
]
)
# degree_max = max([(n, d) for n,d in G.degree().items()], key=itemgetter(1))[1]
# nodes_to_remove = [n for (n,d) in G.degree().items() if d <= round(degree_max/2)]
# G.remove_nodes_from(nodes_to_remove)
# Removing too connected nodes (find automatic way to do it)
#edges_to_remove = [ e for e in G.edges_iter() if
# nodes_to_remove = [n for n in degree if degree[n] <= 1]
# G.remove_nodes_from(nodes_to_remove)
def getWeight(item):
return item[1]
#
# node_degree = sorted(G.degree().items(), key=getWeight, reverse=True)
# #print(node_degree)
# nodes_too_connected = [n[0] for n in node_degree[0:(round(len(node_degree)/5))]]
#
# for n in nodes_too_connected:
# n_edges = list()
# for v in nx.neighbors(G,n):
# #print((n, v), G[n][v]['weight'], ":", (v,n), G[v][n]['weight'])
# n_edges.append(((n, v), G[n][v]['weight']))
#
# n_edges_sorted = sorted(n_edges, key=getWeight, reverse=True)
# #G.remove_edges_from([ e[0] for e in n_edges_sorted[round(len(n_edges_sorted)/2):]])
# #G.remove_edges_from([ e[0] for e in n_edges_sorted[(round(len(nx.neighbors(G,n))/3)):]])
# G.remove_edges_from([ e[0] for e in n_edges_sorted[10:]])
G.remove_nodes_from(nx.isolates(G))
partition = best_partition(G.to_undirected())
return(G,partition,ids,weight)
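As a quick numeric check of the 'distributional' branch above: mi[i][j] is a pointwise mutual information, the log of observed cooccurrences over the count expected from the marginals. A toy calculation with made-up counts:

from math import log

cooc_ij = 4.0       # hypothetical cooccurrence count of ngrams i and j
si, sj = 10.0, 8.0  # hypothetical marginal sums for i and j
total_cooc = 40.0   # hypothetical grand total of the cooccurrence matrix

mi_ij = log(cooc_ij / ((si * sj) / total_cooc))  # log(4.0 / 2.0) ≈ 0.693: i and j co-occur twice as often as expected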
......@@ -12,6 +12,7 @@ import scipy
from gargantext_web.db import get_or_create_node
from analysis.cooccurrences import do_cooc
from analysis.distance import do_distance
import pandas as pd
from copy import copy
......@@ -26,114 +27,13 @@ from analysis.louvain import best_partition, generate_dendogram, partition_at_le
from ngram.lists import listIds
from sqlalchemy.orm import aliased
def diag_null(x):
return x - x * scipy.eye(x.shape[0])
def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
'''
do_distance :: Int -> (Graph, Partition, {ids}, {weight})
'''
#print([n for n in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).all()])
matrix = defaultdict(lambda : defaultdict(float))
ids = defaultdict(lambda : defaultdict(int))
labels = dict()
weight = dict()
Cooc = aliased(NodeNgramNgram)
query = session.query(Cooc).filter(Cooc.node_id==cooc_id).all()
for cooc in query:
matrix[cooc.ngramx_id][cooc.ngramy_id] = cooc.score
matrix[cooc.ngramy_id][cooc.ngramx_id] = cooc.score
ids[cooc.ngramx_id] = (field1, cooc.ngramx_id)
ids[cooc.ngramy_id] = (field2, cooc.ngramy_id)
weight[cooc.ngramx_id] = weight.get(cooc.ngramx_id, 0) + cooc.score
weight[cooc.ngramy_id] = weight.get(cooc.ngramy_id, 0) + cooc.score
x = pd.DataFrame(matrix).fillna(0)
y = pd.DataFrame(matrix).fillna(0)
#xo = diag_null(x)
#y = diag_null(y)
x = x / x.sum(axis=1)
y = y / y.sum(axis=0)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top included or excluded
n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] - 1))
n = n.sort(inplace=False)
m = m.sort(inplace=False)
nodes_included = 500 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = 500 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the included score for the node size
n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
#m_index = pd.Index.intersection(x.index, n.index[:nodes_included])
x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
# Removing unconnected nodes
xxx = xx.values
threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx >= threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
G = nx.from_numpy_matrix(np.matrix(matrix_filtered))
#G = nx.from_numpy_matrix(matrix_filtered, create_using=nx.MultiDiGraph())
G = nx.relabel_nodes(G, dict(enumerate([ ids[id_][1] for id_ in list(xx.columns)])))
# Removing too connected nodes (find automatic way to do it)
#edges_to_remove = [ e for e in G.edges_iter() if
# nodes_to_remove = [n for n in degree if degree[n] <= 1]
# G.remove_nodes_from(nodes_to_remove)
def getWeight(item):
return item[1]
node_degree = sorted(G.degree().items(), key=getWeight, reverse=True)
#print(node_degree)
nodes_too_connected = [n[0] for n in node_degree[0:(round(len(node_degree)/5))]]
for n in nodes_too_connected:
n_edges = list()
for v in nx.neighbors(G,n):
n_edges.append(((n, v), G[n][v]['weight']))
n_edges_sorted = sorted(n_edges, key=getWeight, reverse=True)
#G.remove_edges_from([ e[0] for e in n_edges_sorted[round(len(n_edges_sorted)/2):]])
G.remove_edges_from([ e[0] for e in n_edges_sorted[(round(len(nx.neighbors(G,n))/3)):]])
G.remove_nodes_from(nx.isolates(G))
partition = best_partition(G.to_undirected())
return(G,partition,ids,weight)
def get_cooc(request=None, corpus=None
, field1='ngrams', field2='ngrams'
, cooc_id=None, type='node_link', size=1000
, start=None, end=None
, hapax=1
, distance='conditional'
):
'''
get_cooc : to compute the graph.
......@@ -158,7 +58,8 @@ def get_cooc(request=None, corpus=None
, miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size
, isMonopartite=True, start=start , end=end , hapax=hapax)
G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=True)
G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams"
, isMonopartite=True, distance=distance)
if type == "node_link":
nodesB_dict = {}
......
......@@ -111,10 +111,16 @@ class NgramEdit(APIView):
node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus )
results = session.query(NodeNgram).filter(NodeNgram.node_id==node_mapList.id ).all()
ngram_2del = [int(i) for i in ngram_ids.split('+')]
ngram_2del = session.query(NodeNgram).filter(NodeNgram.node_id==node_mapList.id , NodeNgram.ngram_id.in_(ngram_2del) ).all()
for map_node in ngram_2del:
ngram_2del_ = session.query(NodeNgram).filter(NodeNgram.node_id==node_mapList.id , NodeNgram.ngram_id.in_(ngram_2del) ).all()
for map_node in ngram_2del_:
session.delete(map_node)
session.commit()
node_stopList = get_or_create_node(nodetype='StopList', corpus=corpus )
for ngram_id in ngram_2del:
stop_node = NodeNgram( weight=1.0, ngram_id=ngram_id , node_id=node_stopList.id)
session.add(stop_node)
session.commit()
# [ = = = = / del from map-list = = = = ]
return Response(None, 204)
......@@ -193,3 +199,5 @@ class Document(APIView):
'id': node.id
}
return Response(data)
......@@ -17,11 +17,6 @@ def get_team():
'''
team = [
{ 'first_name' : 'Alexandre', 'last_name' : 'Delanoë',
'mail' : 'alexandre+gargantextATdelanoe.org',
'website' : 'http://alexandre.delanoe.org',
'picture' : 'alexandre.jpg',
'role' : 'principal investigator, developer'},
{ 'first_name' : 'David', 'last_name' : 'Chavalarias',
'mail' : 'david.chavalariasATiscpif.fr',
......@@ -46,6 +41,12 @@ def get_team():
'picture' : 'samuel.jpg',
'role' : 'developer'},
{ 'first_name' : 'Alexandre', 'last_name' : 'Delanoë',
'mail' : 'alexandre+gargantextATdelanoe.org',
'website' : 'http://alexandre.delanoe.org',
'picture' : 'alexandre.jpg',
'role' : 'principal investigator, developer'},
#{ 'first_name' : '', 'name' : '', 'mail' : '', 'website' : '', 'picture' : ''},
# copy-paste the line above and fill in your information, please
]
......@@ -53,21 +54,37 @@ def get_team():
random.shuffle(team)
return(team)
def get_sponsors():
def get_partners():
'''
Function to get the list of partners (institutions, labs, grants) as dicts of institutional information.
'''
sponsors = [
institutions = [
{ 'name' : 'Mines ParisTech', 'website' : 'http://mines-paristech.fr', 'picture' : 'mines.png', 'funds':''},
{ 'name' : 'Institut Pasteur', 'website' : 'http://www.pasteur.fr', 'picture' : 'pasteur.png', 'funds':''},
{ 'name' : 'Forccast', 'website' : 'http://forccast.hypotheses.org/', 'picture' : 'forccast.png', 'funds':''},
{ 'name' : 'ADEME', 'website' : 'http://www.ademe.fr', 'picture' : 'ademe.png', 'funds':''},
{ 'name' : 'EHESS', 'website' : 'http://www.ehess.fr', 'picture' : 'ehess.png', 'funds':''},
#{ 'name' : '', 'website' : '', 'picture' : '', 'funds':''},
# copy-paste the line above and fill in your information, please
]
labos = [
{ 'name' : 'Centre de Sociologie de l\'innovation', 'website' : 'http://www.csi.mines-paristech.fr/en/', 'picture' : 'csi.png', 'funds':''},
#{ 'name' : '', 'website' : '', 'picture' : '', 'funds':''},
# copy-paste the line above and fill in your information, please
]
grants = [
{ 'name' : 'Forccast', 'website' : 'http://forccast.hypotheses.org/', 'picture' : 'forccast.png', 'funds':''},
{ 'name' : 'Mastodons', 'website' : 'http://www.cnrs.fr/mi/spip.php?article53&lang=fr', 'picture' : 'mastodons.png', 'funds':''},
#{ 'name' : '', 'website' : '', 'picture' : '', 'funds':''},
# copy-paste the line above and fill in your information, please
]
random.shuffle(institutions)
random.shuffle(grants)
return(institutions,labos,grants)
random.shuffle(sponsors)
return(sponsors)
......@@ -23,32 +23,30 @@ def apply_sum(x, y):
from parsing.corpustools import parse_resources, extract_ngrams #add_resource,
from ngram.lists import ngrams2miam
from admin.utils import PrintException
def update_processing(corpus, step=0):
try:
corpus.hyperdata.update({'Processing' : step})
session.query(Node).filter(Node.id==corpus.id).update({'hyperdata' : corpus.hyperdata})
session.commit()
except :
PrintException()
from admin.utils import WorkflowTracking
@shared_task
def apply_workflow(corpus_id):
update_state = WorkflowTracking()
corpus = session.query(Node).filter(Node.id==corpus_id).first()
update_processing(corpus, 1)
update_state.processing_(corpus, "Parsing")
#cProfile.runctx('parse_resources(corpus)', global,locals)
parse_resources(corpus)
update_processing(corpus, 2)
update_state.processing_(corpus, "Terms extraction")
extract_ngrams(corpus, ['title', 'abstract'], nlp=True)
update_processing(corpus, 3)
# update_state.processing_(corpus, "")
ngram_workflow(corpus)
#ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
update_processing(corpus, 0)
print("End of the Workflow for corpus %d" % (corpus_id))
update_state.processing_(corpus, "0")
@shared_task
def empty_trash(corpus_id):
......@@ -63,4 +61,3 @@ def empty_trash(corpus_id):
node.delete()
print("Nodes deleted")
......@@ -276,5 +276,3 @@ def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hy
session.commit()
#print(parent_id, n.parent_id, n.id, n.name)
return(node)
......@@ -91,11 +91,10 @@ urlpatterns = patterns('',
############################################################################
url(r'^tests/', include('tests.urls')),
# TODO Samuel, the lines below were in your tests, are they still used ?
# can we delete them ?
url(r'^project/(\d+)/corpus/(\d+)/terms/ngrams.json$', samtest.get_ngrams_json),
url(r'^project/(\d+)/corpus/(\d+)/terms$', samtest.get_ngrams),
url(r'^project/(\d+)/corpus/(\d+)/stop_list.json$', samtest.get_stoplist)
url(r'^api/corpus/(\d+)$', samtest.get_corpus_state),
url(r'^test_cores$', samtest.get_cores)
)
......
......@@ -77,7 +77,8 @@ def logo(request):
if group == "cnrs":
color = "#093558"
else:
color = "#ff8080"
# color of the css adapted to the logo
color = "#AE5C5C"
svg_data = template.render(Context({\
'color': color,\
}))
......@@ -164,13 +165,15 @@ def get_about(request):
date = datetime.datetime.now()
members = about.get_team()
sponsors = about.get_sponsors()
institutions,labos,grants = about.get_partners()
html = template.render(Context({\
'user': user,\
'date': date,\
'team': members,\
'sponsors':sponsors,\
'institutions': institutions,\
'labos': labos,\
'grants': grants,\
}))
return HttpResponse(html)
......@@ -342,12 +345,14 @@ def corpus(request, project_id, corpus_id):
type_doc_id = cache.NodeType['Document'].id
number = session.query(func.count(Node.id)).filter(Node.parent_id==corpus_id, Node.type_id==type_doc_id).all()[0][0]
the_query = """ SELECT hyperdata FROM node_node WHERE id=%d """ % ( int(corpus_id) )
cursor = connection.cursor()
try:
processing = corpus.hyperdata['Processing']
except Exception as error:
print(error)
processing = 0
print('processing', processing)
cursor.execute(the_query)
processing = cursor.fetchone()[0]["Processing"]
except:
processing = "Error"
html = t.render(Context({
'debug': settings.DEBUG,
......@@ -566,13 +571,17 @@ def graph(request, project_id, corpus_id, generic=100, specific=100):
project_type_id = cache.NodeType['Project'].id
corpus_type_id = cache.NodeType['Corpus'].id
miamlist_type_id = cache.NodeType['MiamList'].id
miamlist = session.query(Node).filter(Node.user_id == request.user.id , Node.parent_id==corpus_id , Node.type_id == cache.NodeType['MiamList'].id ).first()
graphurl = "corpus/"+str(corpus_id)+"/node_link.json"
html = t.render(Context({\
'debug': settings.DEBUG,
'user' : user,\
'user': request.user,\
'date' : date,\
'corpus' : corpus,\
'list_id' : miamlist.id,\
'project' : project,\
'graphfile' : graphurl,\
}))
......
......@@ -140,7 +140,7 @@ def project(request, project_id):
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = language_id,
hyperdata = {'Processing' : 1,}
hyperdata = {'Processing' : "Parsing documents",}
)
session.add(corpus)
session.commit()
......@@ -212,7 +212,8 @@ def tfidf(request, corpus_id, ngram_ids):
.query(Node, func.sum(NodeNodeNgram.score))
.join(NodeNodeNgram, NodeNodeNgram.nodey_id == Node.id)
.filter(NodeNodeNgram.nodex_id == tfidf_id)
.filter(NodeNodeNgram.ngram_id.in_(ngram_ids))
.filter(Node.type_id == cache.NodeType['Document'].id)
.filter(or_(*[NodeNodeNgram.ngram_id==ngram_id for ngram_id in ngram_ids]))
.group_by(Node)
.order_by(func.sum(NodeNodeNgram.score).desc())
.limit(limit)
......@@ -221,8 +222,21 @@ def tfidf(request, corpus_id, ngram_ids):
# print("in TFIDF:")
# print("\tcorpus_id:",corpus_id)
# convert query result to a list of dicts
if nodes_query is None:
print("TFIDF error, juste take sums")
nodes_query = (session
.query(Node, func.sum(NodeNgram.weight))
.join(NodeNgram, NodeNgram.node_id == Node.id)
.filter(Node.parent_id == corpus_id)
.filter(Node.type_id == cache.NodeType['Document'].id)
.filter(or_(*[NodeNgram.ngram_id==ngram_id for ngram_id in ngram_ids]))
.group_by(Node)
.order_by(func.sum(NodeNgram.weight).desc())
.limit(limit)
)
for node, score in nodes_query:
# print("\t corpus:",corpus_id,"\t",node.name)
print("\t corpus:",corpus_id,"\t",node.name)
node_dict = {
'id': node.id,
'score': score,
......
......@@ -93,7 +93,7 @@ node_types = [
'Project', 'Corpus', 'Document',
'MiamList', 'StopList', 'MainList', 'MapList', # TODO MiamList -> MainList
'Stem', 'Lem', 'Group', 'Tfidf', 'Tfidf (global)', 'Cvalue', 'Specificity'
, 'Cooccurrence',
, 'Cooccurrence', 'Occurrences',
]
for node_type in node_types:
......
from admin.env import *
import sys
from node.models import User
from django.core.mail import send_mail
......@@ -53,13 +55,17 @@ def active_user(username, active=True):
user.active_user = active
user.save()
def mines_account_creation(fichier=None):
def mass_account_creation(fichier=None):
if fichier is None:
fichier = "/home/alexandre/projets/forccast/Tutorat/2014-2015/comptes_gargantext.csv"
fichier = "/tmp/comptes.csv"
accounts = open(fichier, "r")
for line in accounts.readlines():
username, email, password, fin = line.split(',')
create_user(username, email, password=password, notify=False)
create_user(username, email, password=password, active=True, notify=False)
#delete_user(username)
accounts.close()
if __name__ == "__main__":
mass_account_creation(fichier=sys.argv[1])
......@@ -15,10 +15,15 @@ from sqlalchemy.orm import aliased
from ngram.tools import insert_ngrams
import csv
def compute_mapList(corpus,limit=500):
def compute_mapList(corpus,limit=500,n=1):
'''
Compute the map list according to the specificity scores and the stop list.
'''
monograms_part = 0.005
monograms_limit = round(limit * monograms_part)
multigrams_limit = limit - monograms_limit
dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
......@@ -33,27 +38,49 @@ def compute_mapList(corpus,limit=500):
Spec=aliased(NodeNodeNgram)
top_ngrams = (session.query(Spec.ngram_id, Spec.score)
query = (session.query(Spec.ngram_id, Spec.score)
.join(Miam, Spec.ngram_id == Miam.ngram_id)
.join(Ngram, Ngram.id == Spec.ngram_id)
#.outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
#.outerjoin(Stop, Stop.ngram_id == Spec.ngram_id)
.filter(Miam.node_id == node_miam.id)
#.filter(Group.node_id == node_group.id)
#.filter(Stop.node_id == node_stop.id)
.filter(Spec.nodex_id == node_spec.id)
)
top_monograms = (query
.filter(Ngram.n == 1)
.order_by(desc(Spec.score))
.limit(limit)
.limit(monograms_limit)
)
top_multigrams = (query
.filter(Ngram.n >= 2)
.order_by(desc(Spec.score))
.limit(multigrams_limit)
)
stop_ngrams = (session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == node_stop.id)
.all()
)
grouped_ngrams = (session.query(NodeNgramNgram.ngramy_id)
.filter(NodeNgramNgram.node_id == node_group.id)
.all()
)
#print([t for t in top_ngrams])
node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus)
session.query(NodeNgram).filter(NodeNgram.node_id==node_mapList.id).delete()
session.commit()
data = zip(
[node_mapList.id for i in range(1,limit)]
, [n[0] for n in top_ngrams]
, [n[0] for n in list(top_multigrams) + list(top_monograms)
if (n[0],) not in list(stop_ngrams) + list(grouped_ngrams)
]
, [1 for i in range(1,limit)]
)
#print([d for d in data])
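With the values above, the monogram/multigram split is small by design; for the limit=1000 call made from ngram/workflow.py further down:

limit = 1000
monograms_part = 0.005
monograms_limit = round(limit * monograms_part)   # 5 monograms at most
multigrams_limit = limit - monograms_limit        # 995 multigrams at most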
......@@ -100,37 +127,3 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
#compute_mapList(corpus)
#insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")
#def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
# '''
# getNgrams :: Corpus -> [(Int, String)] -> [(Int, String)]
# For a corpus, gives list of highest Cvalue ngrams and highest TFIDF (global)
# ngrams that have to be grouped with
# '''
# #tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
# cvalue_node = get_or_create_node(nodetype='Cvalue', corpus=corpus)
# spec_node = get_or_create_node(nodetype='Specificity', corpus=corpus)
#
#
# #tfidf_ngrams = queryNodeNodeNgram(nodeMeasure_id=tfidf_node.id, corpus_id=corpus.id)
# cvalue_ngrams = queryNodeNodeNgram(nodeMeasure_id=cvalue_node.id, corpus_id=corpus.id, limit=limit_sup)
# spec_ngrams = queryNodeNodeNgram(nodeMeasure_id=spec_node.id, corpus_id=corpus.id, limit=limit_inf)
#
# #print([n for n in tfidf_ngrams])
#
# def list2set(_list):
# _set = set()
# for n in _list:
# _set.add((n[0],n[1]))
# return(_set)
#
# cvalue_set = set()
# spec_set = set()
#
# cvalue_set = list2set(cvalue_ngrams)
# spec_set = list2set(spec_ngrams)
#
# cvalue_setDiff = cvalue_set.difference(spec_set)
#
# return(spec_set,cvalue_setDiff)
#
from gargantext_web.db import session, cache, get_cursor
from gargantext_web.db import Node, NodeNgram, NodeNodeNgram
from gargantext_web.db import get_or_create_node
from admin.utils import DebugTime
def compute_occs(corpus):
dbg = DebugTime('Corpus #%d - OCCURRENCES' % corpus.id)
dbg.show('Calculate occurrences')
occs_node = get_or_create_node(nodetype='Occurrences', corpus=corpus)
#print(occs_node.id)
(session.query(NodeNodeNgram)
.filter(NodeNodeNgram.nodex_id==occs_node.id).delete()
)
session.commit()
db, cursor = get_cursor()
cursor.execute('''
INSERT INTO
%s (nodex_id, nodey_id, ngram_id, score)
SELECT
%d AS nodex_id,
%d AS nodey_id,
nodengram.ngram_id AS ngram_id,
SUM(nodengram.weight) AS score
FROM
%s AS nodengram
INNER JOIN
%s AS node ON nodengram.node_id = node.id
WHERE
node.parent_id = %d
AND
node.type_id = %d
GROUP BY
nodengram.ngram_id
''' % ( NodeNodeNgram.__table__.name
, occs_node.id, corpus.id
, NodeNgram.__table__.name
, Node.__table__.name
, corpus.id
, cache.NodeType['Document'].id
)
)
db.commit()
#data = session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==occs_node.id).all()
#print([n for n in data])
......@@ -127,6 +127,9 @@ def compute_tfidf_global(corpus):
tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==tfidf_node.id).delete()
session.commit()
# compute terms frequency sum
db, cursor = get_cursor()
......@@ -171,8 +174,7 @@ def compute_tfidf_global(corpus):
INSERT INTO
tmp__idf(ngram_id, idf)
SELECT
node_ngram.ngram_id,
-ln(COUNT(*))
node_ngram.ngram_id, -ln(COUNT(*))
FROM
%s AS node_ngram
INNER JOIN
......@@ -183,10 +185,10 @@ def compute_tfidf_global(corpus):
%s as corpus ON corpus.id = doc.parent_id
WHERE
doc.language_id = %d AND doc.type_id = %d AND corpus.type_id=%d
AND RANDOM() < 0.01
-- AND RANDOM() < 0.01
GROUP BY
node_ngram.ngram_id
limit 10000
-- limit 10000
;
''' % (Node_Ngram.__table__.name
, Node.__table__.name
......@@ -202,8 +204,7 @@ def compute_tfidf_global(corpus):
INSERT INTO
tmp__idf(ngram_id, idf)
SELECT
node_ngram.ngram_id,
-ln(COUNT(*))
node_ngram.ngram_id, -ln(COUNT(*))
FROM
%s AS node_ngram
INNER JOIN
......@@ -217,7 +218,7 @@ def compute_tfidf_global(corpus):
AND RANDOM() < 0.01
GROUP BY
node_ngram.ngram_id
limit 10000
-- limit 10000
;
''' % (Node_Ngram.__table__.name
, Node.__table__.name
......@@ -238,7 +239,6 @@ def compute_tfidf_global(corpus):
lnD = log(D)
cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
# show off
dbg.show('insert tfidf')
cursor.execute('''
INSERT INTO
%s (nodex_id, nodey_id, ngram_id, score)
......@@ -254,6 +254,7 @@ def compute_tfidf_global(corpus):
''' % (NodeNodeNgram.__table__.name, tfidf_node.id, corpus.id, ))
db.commit()
dbg.show('insert tfidf')
#corpus=session.query(Node).filter(Node.id==244250).first()
#compute_tfidf_global(corpus)
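The two IDF steps above combine into the usual log(D / document_frequency): the first pass stores -ln(COUNT(*)) per ngram, and the UPDATE then adds ln(D). A toy check with made-up counts:

from math import log

D = 1000   # hypothetical number of documents in the corpus
df = 10    # hypothetical number of documents containing a given ngram

idf = -log(df) + log(D)   # equals log(D / df) = log(100) ≈ 4.605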
from gargantext_web.db import session
from gargantext_web.db import Ngram, NodeNgram, NodeNgramNgram
from gargantext_web.db import Ngram, NodeNgramNgram
from gargantext_web.db import get_cursor, bulk_insert, get_or_create_node
from gargantext_web.db import get_cursor, bulk_insert
def insert_ngrams_to_list(list_of_ngrams, corpus, list_type='MapList', erase=True):
'''
Works only for Stop and Map
'''
list_node = get_or_create_node(corpus=corpus, nodetype=list_type)
group_node = get_or_create_node(corpus=corpus, nodetype='GroupList')
group_list = (session.query(NodeNgramNgram.ngramy_id)
.filter(NodeNgramNgram.id==group_node.id)
.all()
)
#print(list_node)
if erase == True:
session.query(NodeNgram).filter(NodeNgram.node_id==list_node.id).delete()
session.commit()
def get_id(ngram):
query = session.query(Ngram.id).filter(Ngram.terms==ngram).first()
return(query)
list_to_insert = list()
for ngram in list_of_ngrams:
ngram_candidate = get_id(ngram)
if ngram_candidate is not None:
ngram_id = ngram_candidate[0]
if ngram_id is not None and ngram_id not in group_list:
list_to_insert.append((list_node.id, ngram_id, 1))
#print(list_to_insert)
db, cursor = get_cursor()
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [n for n in list_to_insert])
def insert_ngrams(ngrams,get='terms-id'):
'''
......@@ -111,28 +142,3 @@ def insert_nodengramngram(nodengramngram):
db.commit()
#def queryNodeNodeNgram(nodeMeasure_id=None, corpus_id=None, limit=None):
# '''
# queryNodeNodeNgram :: Int -> Int -> Int -> (Int, String, Float)
# Get list of ngrams according to a measure related to the corpus: maybe tfidf
# cvalue.
# '''
# query = (session.query(Ngram.id, Ngram.terms, NodeNodeNgram.score)
# .join(NodeNodeNgram, NodeNodeNgram.ngram_id == Ngram.id)
# .join(Node, Node.id == NodeNodeNgram.nodex_id)
# .filter(NodeNodeNgram.nodex_id == nodeMeasure_id)
# .filter(NodeNodeNgram.nodey_id == corpus_id)
# .group_by(Ngram.id, Ngram.terms, NodeNodeNgram.score)
# .order_by(desc(NodeNodeNgram.score))
# )
#
# if limit is None:
# query = query.count()
# elif limit == 0 :
# query = query.all()
# else:
# query = query.limit(limit)
#
# return(query)
#
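For reference, a minimal call sketch for insert_ngrams_to_list defined above (the terms are placeholders; corpus is a Node object as elsewhere in this diff):

# Sketch only: clears the StopList node, then inserts the terms that already exist in the Ngram table.
from ngram.tools import insert_ngrams_to_list

insert_ngrams_to_list(['the', 'of', 'and'], corpus, list_type='StopList', erase=True)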
......@@ -6,58 +6,48 @@ from ngram.stop import compute_stop
from ngram.group import compute_groups
from gargantext_web.db import get_or_create_node
from ngram.mapList import compute_mapList
# from ngram.occurrences import compute_occs
from gargantext_web.db import NodeNgram
#from gargantext_web.celery import update_processing
from gargantext_web.db import session , Node , NodeNgram
from admin.utils import WorkflowTracking
def ngram_workflow(corpus, n=5000):
'''
All the workflow to filter the ngrams.
'''
update_state = WorkflowTracking()
update_state.processing_(corpus, "Stop words")
compute_stop(corpus)
update_state.processing_(corpus, "TF-IDF global score")
compute_tfidf_global(corpus)
part = round(n * 0.9)
compute_cvalue(corpus,limit=1000) # size
# compute_cvalue(corpus,limit=1000) # size
part = round(part * 0.8)
print('spec part:', part)
# part = round(part * 0.8)
#print('spec part:', part)
update_state.processing_(corpus, "Specificity score")
compute_specificity(corpus,limit=part)
part = round(part * 0.8)
limit_inf = round(part * 1)
limit_sup = round(part * 5)
print(limit_inf,limit_sup)
#print(limit_inf,limit_sup)
update_state.processing_(corpus, "Synonyms")
compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup)
update_state.processing_(corpus, "Map list terms")
compute_mapList(corpus,limit=1000) # size
update_state.processing_(corpus, "TF-IDF local score")
compute_tfidf(corpus)
#corpus=session.query(Node).filter(Node.id==540420).first()
#corpus=session.query(Node).filter(Node.id==559637).first()
#update_processing(corpus, 0)
check_stop = False
# update_state.processing_(corpus, "OCCS local score")
# compute_occs(corpus)
if check_stop:
stop = get_or_create_node(corpus=corpus,nodetype='StopList')
#session.query(NodeNgram).filter(NodeNgram.node_id==stop.id).delete()
#session.commit()
stop_ngrams = (session.query(Ngram)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.filter(NodeNgram.node_id==stop.id)
.all()
)
print([n for n in stop_ngrams])
......@@ -269,42 +269,42 @@ class Node(CTENode):
for ngram_text, weight in associations.items()
])
@current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
import time
total = 0
print("LOG::TIME: In workflow() parse_resources()")
start = time.time()
self.hyperdata['Processing'] = 1
self.save()
self.parse_resources()
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources() [s]",(end - start))
print("LOG::TIME: In workflow() / parse_resources()")
start = time.time()
print("LOG::TIME: In workflow() extract_ngrams()")
print("\n- - - - - - - - - -")
type_document = NodeType.objects.get(name='Document')
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
end = time.time()
print("- - - - - - - - - - \n")
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()")
start = time.time()
print("In workflow() do_tfidf()")
from analysis.functions import do_tfidf
do_tfidf(self)
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
print("LOG::TIME: In workflow() / do_tfidf()")
print("In workflow() END")
self.hyperdata['Processing'] = 0
self.save()
# @current_app.task(filter=task_method)
# def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
# import time
# total = 0
# print("LOG::TIME: In workflow() parse_resources()")
# start = time.time()
# self.hyperdata['Processing'] = 1
# self.save()
# self.parse_resources()
# end = time.time()
# total += (end - start)
# print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources() [s]",(end - start))
# print("LOG::TIME: In workflow() / parse_resources()")
# start = time.time()
# print("LOG::TIME: In workflow() extract_ngrams()")
# print("\n- - - - - - - - - -")
# type_document = NodeType.objects.get(name='Document')
# self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
# end = time.time()
# print("- - - - - - - - - - \n")
# total += (end - start)
# print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start))
# print("LOG::TIME: In workflow() / extract_ngrams()")
# start = time.time()
# print("In workflow() do_tfidf()")
# from analysis.functions import do_tfidf
# do_tfidf(self)
# end = time.time()
# total += (end - start)
# print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
# print("LOG::TIME: In workflow() / do_tfidf()")
# print("In workflow() END")
# self.hyperdata['Processing'] = 0
# self.save()
class Node_Hyperdata(models.Model):
node = models.ForeignKey(Node, on_delete=models.CASCADE)
......
......@@ -23,10 +23,12 @@ from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser(FileParser):
def _parse_header(self, header):
pass
def _parse(self, file):
localeEncoding = "fr_FR"
codif = "UTF-8"
format_date = re.compile('.*\d{4}.*', re.UNICODE)
if isinstance(file, str):
file = open(file, 'rb')
......@@ -71,6 +73,7 @@ class EuropressFileParser(FileParser):
# parse all the articles, one by one
try:
for html_article in html_articles:
print('article')
hyperdata = {}
......@@ -87,31 +90,12 @@ class EuropressFileParser(FileParser):
header = html_article.xpath(header_xpath)[0].text
if header is not None:
header = header.split(', ')
if format_date.match(header[0]):
date = header[0]
elif format_date.match(header[1]):
hyperdata['rubrique'] = header[0]
date = header[1]
try:
hyperdata['page'] = header[2].split(' ')[1]
except:
pass
else:
date = header[2]
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
hyperdata.update(self._parse_header(header))
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
#print(hyperdata['publication_date'])
try:
title = paragraph_list(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
......@@ -130,6 +114,60 @@ class EuropressFileParser(FileParser):
PrintException()
pass
class EuropressFileParser_fr(EuropressFileParser):
def _parse_header(self, header):
format_date = re.compile('.*\d{4}.*', re.UNICODE)
hyperdata = dict()
if header is not None:
header = header.split(', ')
if format_date.match(header[0]):
date = header[0]
elif format_date.match(header[1]):
hyperdata['rubrique'] = header[0]
date = header[1]
try:
hyperdata['page'] = header[2].split(' ')[1]
except:
pass
else:
date = header[2]
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
return(hyperdata)
#print(hyperdata['publication_date'])
class EuropressFileParser_en(EuropressFileParser):
def _parse_header(self, header):
format_date = re.compile('.*\d{4}.*', re.UNICODE)
hyperdata = dict()
if header is not None:
header = header.split(', ')
if format_date.match(header[0]):
date = header[0]
elif format_date.match(header[1]):
hyperdata['rubrique'] = header[0]
date = header[1]
try:
hyperdata['page'] = header[2].split(' ')[1]
except:
pass
else:
date = header[2]
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
return(hyperdata)
if __name__ == "__main__":
e = EuropressFileParser()
hyperdata = e.parse(str(sys.argv[1]))
......
import re
import locale
from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser
import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser_en(FileParser):
def _parse(self, file):
localeEncoding = "fr_FR"
codif = "UTF-8"
format_page = re.compile('p\. .*', re.UNICODE)
def parse_date(date, lang):
d = dateparser.parse(date.strip(), languages=[lang])
return d
if isinstance(file, str):
file = open(file, 'rb')
contents = file.read()
encoding = self.detect_encoding(contents)
if encoding != "utf-8":
try:
contents = contents.decode("latin1", errors='replace').encode(codif)
except:
PrintException()
html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
html_parser = html5parser.etree.HTMLParser(encoding=codif)
html = html5parser.etree.fromstring(contents, html_parser)
html_articles = html.xpath('//article')
name_xpath = "./header/div/span[@class = 'DocPublicationName']"
header_xpath = "./header/div/span[@class = 'DocHeader']"
title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
def paragraph_list(data_xpath):
result = list()
for elem in data_xpath:
if elem.text is not None:
if elem.text.strip() != '':
if elem.tag == 'p':
result.append(elem.text)
else:
if len(result) > 0:
result.append(result.pop() + elem.text)
else:
result.append(elem.text)
return result
# parse all the articles, one by one
try:
for html_article in html_articles:
hyperdata = {}
try:
pub_name = html_article.xpath(name_xpath)[0].text
name = pub_name.split(', ')
hyperdata['journal'] = name[0]
hyperdata['number'] = name[1]
except:
try:
hyperdata['journal'] = pub_name.strip()
except:
pass
#print(hyperdata['publication_date'])
try:
title = paragraph_list(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
except:
pass
header = html_article.xpath(header_xpath)[0].text
if header is not None:
header = header.split(', ')
header = list(filter(lambda x: format_page.match(x) is None, header))
print(header)
if parse_date(header[0], 'en') is not None:
date = ' '.join(header[0:])
elif parse_date(header[1], 'en') is not None:
date = ' '.join(header[1:])
elif parse_date(header[2], 'en') is not None:
date = ' '.join(header[2:])
elif parse_date(header[3], 'en') is not None:
date = ' '.join(header[3:])
else:
date = '2016'
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['en'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
try:
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
except:
print(hyperdata['title'])
print(date)
try:
text = paragraph_list(html_article.xpath(text_xpath))
hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
except:
pass
yield hyperdata
except :
PrintException()
pass
if __name__ == "__main__":
e = EuropressFileParser()
hyperdata = e.parse(str(sys.argv[1]))
for h in hyperdata:
try:
print(h['journal'], ":", h['publication_date'])
except:
pass
import re
import locale
from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser
import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser_fr(FileParser):
def _parse(self, file):
localeEncoding = "fr_FR"
codif = "UTF-8"
format_date = re.compile('.*\d{4}.*', re.UNICODE)
def parse_date(date, lang):
d = dateparser.parse(date.strip(), languages=[lang])
return d
if isinstance(file, str):
file = open(file, 'rb')
contents = file.read()
encoding = self.detect_encoding(contents)
if encoding != "utf-8":
try:
contents = contents.decode("latin1", errors='replace').encode(codif)
except:
PrintException()
html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
html_parser = html5parser.etree.HTMLParser(encoding=codif)
html = html5parser.etree.fromstring(contents, html_parser)
html_articles = html.xpath('//article')
name_xpath = "./header/div/span[@class = 'DocPublicationName']"
header_xpath = "./header/div/span[@class = 'DocHeader']"
title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
def paragraph_list(data_xpath):
result = list()
for elem in data_xpath:
if elem.text is not None:
if elem.text.strip() != '':
if elem.tag == 'p':
result.append(elem.text)
else:
if len(result) > 0:
result.append(result.pop() + elem.text)
else:
result.append(elem.text)
return result
# parse all the articles, one by one
try:
for html_article in html_articles:
hyperdata = {}
try:
pub_name = html_article.xpath(name_xpath)[0].text
name = pub_name.split(', ')
hyperdata['journal'] = name[0]
hyperdata['number'] = name[1]
except:
try:
hyperdata['journal'] = pub_name.strip()
except:
pass
header = html_article.xpath(header_xpath)[0].text
if header is not None:
header = header.split(', ')
if parse_date(header[0], 'fr') is not None:
date = header[0]
elif parse_date(header[1], 'fr') is not None:
hyperdata['rubrique'] = header[0]
date = header[1]
try:
hyperdata['page'] = header[2].split(' ')[1]
except:
pass
elif parse_date(header[2], 'fr') is not None:
date = header[2]
elif parse_date(header[0], 'en') is not None:
date = ' '.join(header[0:])
elif parse_date(header[1], 'en') is not None:
date = ' '.join(header[1:])
elif parse_date(header[2], 'en') is not None:
date = ' '.join(header[2:])
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
#print(hyperdata['publication_date'])
try:
title = paragraph_list(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
except:
pass
try:
text = paragraph_list(html_article.xpath(text_xpath))
hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
except:
pass
yield hyperdata
except :
PrintException()
pass
if __name__ == "__main__":
e = EuropressFileParser()
hyperdata = e.parse(str(sys.argv[1]))
for h in hyperdata:
try:
print(h['journal'], ":", h['publication_date'])
except:
pass
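For reference, the parse_date helper above leans on dateparser's language hint; a small sketch of the behaviour the header-scanning branches rely on (assuming the dateparser package installed with this project):

import dateparser

d = dateparser.parse('12 mars 2015', languages=['fr'])
# d is a datetime around 2015-03-12; dateparser.parse returns None when nothing parseable is found,
# which is exactly what the elif chain above tests for.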
......@@ -3,6 +3,7 @@ from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .EuropressFileParser_en import EuropressFileParser_en
from .EuropressFileParser_fr import EuropressFileParser_fr
from .ISTex import ISTex
from .CSVParser import CSVParser
......@@ -7,8 +7,8 @@ parsers = {
'Zotero (RIS format)' : ZoteroFileParser,
'Jstor (RIS format)' : JstorFileParser,
#'Europress' : EuropressFileParser,
'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser,
'Europress (French)' : EuropressFileParser_fr,
'Europress (English)' : EuropressFileParser_en,
'CSVParser' : CSVParser,
'ISTex' : ISTex,
}
......
......@@ -11,7 +11,7 @@ import datetime
import copy
from gargantext_web.views import move_to_trash
from gargantext_web.db import session, Node, NodeNgram, NodeNgramNgram, NodeNodeNgram, Ngram, Hyperdata, Node_Ngram\
from gargantext_web.db import session, cache, Node, NodeNgram, NodeNgramNgram, NodeNodeNgram, Ngram, Hyperdata, Node_Ngram\
, NodeType, Node_Hyperdata
from gargantext_web.validation import validate, ValidationException
from node import models
......@@ -139,6 +139,50 @@ class NodesChildrenNgrams(APIView):
],
})
class NodesChildrenNgramsIds(APIView):
def get(self, request, node_id):
# query ngrams
ParentNode = aliased(Node)
ngrams_query = (session
.query(Node.id, func.sum(Node_Ngram.weight).label('count'))
.join(Node_Ngram, Node_Ngram.node_id == Node.id)
.join(Ngram, Ngram.id == Node_Ngram.ngram_id)
.filter(Node.parent_id == node_id)
.filter(Node.type_id == cache.NodeType['Document'].id)
.group_by(Node.id)
# .group_by(Ngram)
.order_by(func.sum(Node_Ngram.weight).desc())
)
# filters
if 'startwith' in request.GET:
ngrams_query = ngrams_query.filter(Ngram.terms.startswith(request.GET['startwith']))
if 'contain' in request.GET:
ngrams_query = ngrams_query.filter(Ngram.terms.contains(request.GET['contain']))
#if 'doesnotcontain' in request.GET:
# ngrams_query = ngrams_query.filter(not_(Ngram.terms.contains(request.GET['doesnotcontain'])))
# pagination
offset = int(request.GET.get('offset', 0))
limit = int(request.GET.get('limit', 20))
total = ngrams_query.count()
# return formatted result
return JsonHttpResponse({
'pagination': {
'offset': offset,
'limit': limit,
'total': total,
},
'data': [
{
'id': node,
'count': count
}
for node, count in ngrams_query[offset : offset+limit]
],
})
from gargantext_web.db import get_or_create_node
class Ngrams(APIView):
......
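For orientation, the JSON produced by the new NodesChildrenNgramsIds view above has the following shape (ids, counts and total are hypothetical):

# Hypothetical response for a GET on the new nodes/<id>/children/ids route with limit=2
{
    'pagination': {'offset': 0, 'limit': 2, 'total': 1842},
    'data': [
        {'id': 101, 'count': 37.0},   # document node id and summed ngram weight
        {'id': 214, 'count': 29.0},
    ],
}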
......@@ -19,23 +19,27 @@ class Graph(APIView):
start = request.GET.get('start', None)
end = request.GET.get('end' , None)
format_ = request.GET.get('format', 'json')
type_ = request.GET.get('type', 'node_link')
hapax = request.GET.get('hapax', 1)
format_ = request.GET.get('format', 'json')
type_ = request.GET.get('type', 'node_link')
hapax = request.GET.get('hapax', 1)
distance = request.GET.get('distance', 'conditional')
corpus = session.query(Node).filter(Node.id==corpus_id).first()
accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
accepted_field2 = ['ngrams',]
options = ['start', 'end', 'hapax']
options = ['start', 'end', 'hapax', 'distance']
if field1 in accepted_field1 :
if field2 in accepted_field2 :
if start is not None and end is not None :
data = get_cooc(corpus=corpus,field1=field1, field2=field2, start=start, end=end, hapax=hapax)
data = get_cooc(corpus=corpus,field1=field1, field2=field2
, start=start, end=end
, hapax=hapax, distance=distance)
else:
data = get_cooc(corpus=corpus,field1=field1, field2=field2, hapax=hapax)
data = get_cooc(corpus=corpus,field1=field1, field2=field2
, hapax=hapax, distance = distance)
if format_ == 'json':
return JsonHttpResponse(data)
else:
......
......@@ -6,6 +6,8 @@ from rest_v1_0 import api, ngrams, graph
from annotations import views
import tests.ngramstable.views as samtest
urlpatterns = patterns('',
# REST URLS
# What is REST ?
......@@ -15,6 +17,7 @@ urlpatterns = patterns('',
url(r'nodes$', api.NodesList.as_view()),
url(r'nodes/(\d+)$', api.Nodes.as_view()),
url(r'nodes/(\d+)/children/ngrams$', api.NodesChildrenNgrams.as_view()), # => repeated children ?
url(r'nodes/(\d+)/children/ids$', api.NodesChildrenNgramsIds.as_view()), # => repeated children ?
# NGRAMS table & annotations
url(r'node/(\d+)/ngrams$' , ngrams.Ngrams.as_view()),
......@@ -22,7 +25,9 @@ urlpatterns = patterns('',
url(r'node/(\d+)/ngrams/keep$', ngrams.Keep.as_view()),
# url(r'node/(?P<list_id>[0-9]+)/ngrams/keep/(?P<ngram_ids>[0-9,\+]+)+$' , ngrams.Keep.as_view()),
url(r'node/(?P<list_id>[0-9]+)/ngrams/(?P<ngram_ids>[0-9,\+]+)+$', views.NgramEdit.as_view()),
url(r'node/(\d+)/ngrams/list$' , ngrams.List.as_view()),
url(r'node/(?P<corpus_id>[0-9]+)/ngrams/list/(?P<list_name>\w+)$' , ngrams.List.as_view()),
url(r'node/corpus/(?P<node_ids>[0-9,\+]+)+$' , samtest.get_corpuses),
#url(r'nodes/(\d+)/children/hyperdata$', api.NodesChildrenMetatadata.as_view()),
#url(r'nodes/(\d+)/children/hyperdata$', api.NodesChildrenMetatadata.as_view()),
......
......@@ -45,19 +45,22 @@ class MedlineFetcher:
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' %(self.pubMedEutilsURL, self.pubMedDB, query)
eSearchResult = urlopen(eSearch)
data = eSearchResult.read()
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
count = findcount(root)[0]
findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
queryKey = findquerykey(root)[0]
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
try:
eSearchResult = urlopen(eSearch)
data = eSearchResult.read()
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
count = findcount(root)[0]
findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
queryKey = findquerykey(root)[0]
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
except:
count=0
queryKey=False
webEnv=False
origQuery=False
values = { "query":origQuery , "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
return values
......@@ -173,8 +176,13 @@ class MedlineFetcher:
self.q.join()
print('time:',time.perf_counter() - start)
Total = 0
Fails = 0
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
Total += 1
if globalresults["queryKey"]==False:
Fails += 1
if globalresults["count"]>0:
N+=globalresults["count"]
queryhyperdata = {
......@@ -198,4 +206,7 @@ class MedlineFetcher:
if query["retmax"]==0: query["retmax"]+=1
print(query["string"],"\t[",k,">",query["retmax"],"]")
if ((Fails+1)/(Total+1))==1 : # for identifying the epic fail or connection error
thequeries = [False]
return thequeries
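For reference, the eSearch URL assembled near the top of this hunk renders like this once formatted (the base URL and database name below are placeholders for self.pubMedEutilsURL and self.pubMedDB, which are set elsewhere in MedlineFetcher):

pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'  # placeholder value
pubMedDB = 'Pubmed'                                            # placeholder value
query = 'climate change'.replace(' ', '%20')

eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' % (pubMedEutilsURL, pubMedDB, query)
# -> http://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=Pubmed&retmax=1&usehistory=y&term=climate%20change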
......@@ -130,7 +130,7 @@ def doTheQuery(request , project_id):
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = None,
hyperdata = {'Processing' : 1,}
hyperdata = {'Processing' : "Parsing documents",}
)
session.add(corpus)
session.commit()
......@@ -243,7 +243,7 @@ def testISTEX(request , project_id):
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = None,
hyperdata = {'Processing' : 1,}
hyperdata = {'Processing' : "Parsing documents",}
)
session.add(corpus)
session.commit()
......
#!/bin/bash
FILE=$(date +%Y%m%d-%H:%M:%S.log)
source /srv/gargantext_env/bin/activate
touch /var/log/gargantext/celery/$FILE && ./manage.py celery worker --loglevel=info >> $FILE
#!/bin/bash
FILE=$(date +%Y%m%d-%H:%M:%S.log)
touch /var/log/gargantext/uwsgi/$FILE && uwsgi gargantext.ini >> $FILE
static/img/logo.png (binary image replaced: 3.41 KB → 39.2 KB)
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="210mm"
height="297mm"
id="svg2"
version="1.1"
inkscape:version="0.48.5 r10040"
sodipodi:docname="mastodons.svg"
inkscape:export-filename="/srv/gargantext/static/img/sponsors/mastodons.png"
inkscape:export-xdpi="61.073017"
inkscape:export-ydpi="61.073017">
<defs
id="defs4">
<linearGradient
id="linearGradient3782">
<stop
style="stop-color:#09097e;stop-opacity:1;"
offset="0"
id="stop3784" />
<stop
id="stop3790"
offset="0.5"
style="stop-color:#09097e;stop-opacity:0.49803922;" />
<stop
style="stop-color:#09097e;stop-opacity:0;"
offset="1"
id="stop3786" />
</linearGradient>
</defs>
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="7.0998446"
inkscape:cx="68.070083"
inkscape:cy="793.17743"
inkscape:document-units="px"
inkscape:current-layer="layer1"
showgrid="false"
inkscape:window-width="963"
inkscape:window-height="762"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="0" />
<metadata
id="metadata7">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Calque 1"
inkscape:groupmode="layer"
id="layer1">
<path
style="fill:#04047e;fill-opacity:1;stroke:#191559;stroke-width:2.68011474999999999;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;fill-rule:nonzero;opacity:1"
d="m 67.957124,219.89012 c -21.951494,0 -40.977621,8.69677 -50.335489,21.40826 4.292013,3.79798 7.034932,9.55548 7.047344,16.00434 l 0,0.0396 c 0,7.23734 -3.440309,13.61318 -8.653863,17.32815 8.645806,13.99801 28.643543,23.80892 51.942008,23.80892 23.496654,0 43.642766,-9.96848 52.167926,-24.15469 -4.08915,-3.80631 -6.68337,-9.43968 -6.68337,-15.71784 0,-6.0475 2.40661,-11.48933 6.23781,-15.28316 -8.76845,-13.80251 -28.630779,-23.4335 -51.722366,-23.4335 z"
id="path2989"
inkscape:export-filename="/srv/gargantext/static/img/sponsors/mastodons.png"
inkscape:export-xdpi="150.35899"
inkscape:export-ydpi="150.35899"
inkscape:connector-curvature="0" />
<text
xml:space="preserve"
style="font-size:29.44009972px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#00ffff;fill-opacity:1;stroke:none;font-family:Courier;-inkscape-font-specification:Courier"
x="160.47124"
y="119.38621"
id="text2985"
sodipodi:linespacing="125%"
transform="scale(0.43195474,2.3150573)"
inkscape:export-filename="/srv/gargantext/static/img/sponsors/mastodons.png"
inkscape:export-xdpi="150.35899"
inkscape:export-ydpi="150.35899"><tspan
sodipodi:role="line"
id="tspan2987"
x="160.47124"
y="119.38621">M<tspan
style="fill:#ffffff"
id="tspan2985">a</tspan>s<tspan
style="fill:#ffffff"
id="tspan2988">t</tspan>o<tspan
style="fill:#ffffff"
id="tspan2990">n</tspan>d<tspan
style="fill:#ffffff"
id="tspan2992">o</tspan>n<tspan
style="fill:#ffffff"
id="tspan2996">s</tspan></tspan></text>
</g>
</svg>
......@@ -9,14 +9,14 @@
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="48px"
height="48px"
id="svg4362"
width="640"
height="480"
id="svg3041"
version="1.1"
inkscape:version="0.48.5 r10040"
sodipodi:docname="logo.svg">
<defs
id="defs4364" />
id="defs3043" />
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
......@@ -24,20 +24,19 @@
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="5.6897594"
inkscape:cx="-11.235831"
inkscape:cy="3.8560006"
inkscape:current-layer="layer1"
showgrid="true"
inkscape:grid-bbox="true"
inkscape:zoom="0.86750285"
inkscape:cx="574.44134"
inkscape:cy="214.55006"
inkscape:document-units="px"
inkscape:current-layer="layer1"
showgrid="false"
inkscape:window-width="1360"
inkscape:window-height="762"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="0" />
<metadata
id="metadata4367">
id="metadata3046">
<rdf:RDF>
<cc:Work
rdf:about="">
......@@ -49,49 +48,87 @@
</rdf:RDF>
</metadata>
<g
inkscape:label="Calque 1"
inkscape:groupmode="layer"
id="layer1"
inkscape:label="Layer 1"
inkscape:groupmode="layer">
transform="translate(0,-572.36218)">
<rect
style="fill:#fffcfc;fill-opacity:1;stroke:none"
id="rect3755"
width="29.70249"
height="31.108515"
x="0"
y="-0.1566938"
style="fill:#ffffff;fill-opacity:1;stroke:none"
id="rect2998"
width="410.37329"
height="315.84909"
x="102.59332"
y="641.98889"
inkscape:export-filename="/srv/gargantext/static/img/logo.png"
inkscape:export-xdpi="200"
inkscape:export-ydpi="200" />
<path
inkscape:export-ydpi="200"
inkscape:export-xdpi="200"
inkscape:export-filename="/srv/gargantext/static/img/logo.png"
id="path3950"
d="m 202.38568,655.35804 0,249.27214 130.47086,0 0,-16.6021 c -18.98946,-0.90974 -35.76303,-5.06743 -49.20478,-12.09066 -35.91671,-18.75291 -57.50584,-54.40745 -57.50584,-94.68011 0,-61.05225 40.65113,-101.66242 106.71062,-106.22939 l 0,-19.66988 -130.47086,0 z m 130.47086,19.66988 0,16.36149 c 3.35297,-0.31508 6.80528,-0.54137 10.34622,-0.54137 13.30249,0 25.59129,2.77406 37.8961,8.00029 l 4.63174,5.53403 4.99267,27.97093 18.64728,0 0,-48.84386 c -21.94909,-5.84107 -43.53556,-8.96273 -63.82184,-8.96273 -4.32186,0 -8.56494,0.19589 -12.69217,0.48122 z m 0,16.36149 c -44.82787,4.21264 -72.48382,36.2169 -72.48382,87.70241 0,53.1268 27.8084,86.66863 72.48382,92.03339 l 0,-179.7358 z m 0,179.7358 0,16.90287 c 2.62215,0.12569 5.29683,0.18036 8.00027,0.18036 19.62116,0 42.60224,-3.63639 68.87466,-10.70715 l 0,-73.50641 5.65434,-4.27083 18.28637,-1.56397 0,-12.63203 -83.19097,0 0,12.63203 23.63995,1.56397 5.65434,4.27083 0,63.94215 c -12.96993,2.76683 -24.28172,4.03022 -33.92603,4.03022 -4.47193,0 -8.80811,-0.3396 -12.99293,-0.84213 z"
style="font-size:166.11251831px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#800000;fill-opacity:1;stroke:none;font-family:Bitstream Charter;-inkscape-font-specification:Bitstream Charter"
inkscape:connector-curvature="0" />
<path
inkscape:export-ydpi="200"
inkscape:export-xdpi="200"
inkscape:export-filename="/srv/gargantext/static/img/logo.png"
transform="matrix(1.9248814,0,0,1.9248814,982.68611,-182.71269)"
sodipodi:open="true"
sodipodi:end="6.1660663"
sodipodi:start="0"
d="m -312.87112,480.17926 a 9.0149298,9.0149298 0 1 1 -0.0618,-1.05341"
sodipodi:ry="9.0149298"
sodipodi:rx="9.0149298"
sodipodi:cy="480.17926"
sodipodi:cx="-321.88605"
id="path3952"
style="fill:#ffcc00;fill-opacity:1;stroke:none"
sodipodi:type="arc" />
<flowRoot
xml:space="preserve"
id="flowRoot3130"
style="font-size:12px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#800000;fill-opacity:1;stroke:none;font-family:Sans"
transform="translate(-222.82792,732.12538)"
inkscape:export-filename="/srv/gargantext/static/img/logo.png"
inkscape:export-xdpi="200"
inkscape:export-ydpi="200"><flowRegion
id="flowRegion3132"><rect
id="rect3134"
width="1090.0853"
height="476.31992"
x="327.0256"
y="148.23489"
style="fill:#800000" /></flowRegion><flowPara
id="flowPara3136"
style="font-size:64px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;fill:#800000;font-family:Sawasdee;-inkscape-font-specification:Sawasdee">Gargan<flowSpan
style="font-size:72px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Courier;-inkscape-font-specification:Courier;fill:#800000"
id="flowSpan3019">text</flowSpan></flowPara></flowRoot> <flowRoot
xml:space="preserve"
id="flowRoot3138"
style="fill:black;stroke:none;stroke-opacity:1;stroke-width:1px;stroke-linejoin:miter;stroke-linecap:butt;fill-opacity:1;font-family:Sans;font-style:normal;font-weight:normal;font-size:12px;line-height:125%;letter-spacing:0px;word-spacing:0px"
inkscape:export-filename="/srv/gargantext/static/img/logo.png"
inkscape:export-xdpi="53"
inkscape:export-ydpi="53" />
<g
inkscape:export-ydpi="53.799999"
inkscape:export-xdpi="53.799999"
inkscape:export-xdpi="200"
inkscape:export-ydpi="200"><flowRegion
id="flowRegion3140"><rect
id="rect3142"
width="127.96654"
height="37.916012"
x="338.87436"
y="77.142372" /></flowRegion><flowPara
id="flowPara3144" /></flowRoot> <flowRoot
xml:space="preserve"
id="flowRoot3021"
style="fill:black;stroke:none;stroke-opacity:1;stroke-width:1px;stroke-linejoin:miter;stroke-linecap:butt;fill-opacity:1;font-family:Sans;font-style:normal;font-weight:normal;font-size:12px;line-height:125%;letter-spacing:0px;word-spacing:0px"
inkscape:export-filename="/srv/gargantext/static/img/logo.png"
style="fill:#ff8080;fill-opacity:0.82014388"
id="g3835"
transform="matrix(0.2422549,0,0,0.23374214,-49.789462,-7.9055988)">
<path
inkscape:export-ydpi="100"
inkscape:export-xdpi="100"
inkscape:export-filename="/home/alexandre/projets/gargantext.py/gargantext_core/shared/LogoSimple.png"
id="path3837"
d="m 206.24721,35.28586 0,129.5 67.78125,0 0,-8.625 c -9.86526,-0.47262 -18.57934,-2.63259 -25.5625,-6.28125 -18.65918,-9.74237 -29.875,-28.26535 -29.875,-49.1875 0,-31.71741 21.11877,-52.8149 55.4375,-55.1875 l 0,-10.21875 -67.78125,0 z m 67.78125,10.21875 0,8.5 c 1.74191,-0.16369 3.53543,-0.28125 5.37499,-0.28125 6.91081,0 13.295,1.44116 19.6875,4.15625 l 2.40625,2.875 2.59375,14.53125 9.6875,0 0,-25.375 c -11.40283,-3.03451 -22.61727,-4.65625 -33.15625,-4.65625 -2.24526,0 -4.44959,0.10177 -6.59374,0.25 z m 0,8.5 c -23.28864,2.18852 -37.65625,18.81513 -37.65625,45.562503 0,27.600037 14.44681,45.025437 37.65625,47.812497 l 0,-93.375 z m 0,93.375 0,8.78125 c 1.36224,0.0653 2.75177,0.0937 4.15624,0.0937 10.19344,0 22.1324,-1.88915 35.78125,-5.5625 l 0,-38.1875 2.9375,-2.21875 9.5,-0.8125 0,-6.5625 -43.21875,0 0,6.5625 12.28125,0.8125 2.9375,2.21875 0,33.21875 c -6.73804,1.4374 -12.61466,2.09375 -17.625,2.09375 -2.32322,0 -4.57592,-0.17643 -6.74999,-0.4375 z"
style="font-size:166.11251831px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#ff8080;fill-opacity:0.82014388;stroke:none;font-family:Bitstream Charter;-inkscape-font-specification:Bitstream Charter"
inkscape:connector-curvature="0" />
<path
inkscape:export-ydpi="100"
inkscape:export-xdpi="100"
transform="translate(611.62306,-400.10238)"
sodipodi:open="true"
sodipodi:end="6.1660663"
sodipodi:start="0"
d="m -312.87112,480.17926 c 0,4.97881 -4.03612,9.01493 -9.01493,9.01493 -4.97881,0 -9.01493,-4.03612 -9.01493,-9.01493 0,-4.97881 4.03612,-9.01493 9.01493,-9.01493 4.57131,0 8.41901,3.42153 8.95317,7.96152"
sodipodi:ry="9.0149298"
sodipodi:rx="9.0149298"
sodipodi:cy="480.17926"
sodipodi:cx="-321.88605"
id="path3839"
style="fill:#ff8080;fill-opacity:0.82014388;stroke:none"
sodipodi:type="arc" />
</g>
</g>
inkscape:export-xdpi="200"
inkscape:export-ydpi="200"><flowRegion
id="flowRegion3023"><rect
id="rect3025"
width="3.9310031"
height="24.568769"
x="739.02856"
y="423.98322" /></flowRegion><flowPara
id="flowPara3027" /></flowRoot> </g>
</svg>
......@@ -293,11 +293,6 @@ function Main_test( Data , SearchFilter ) {
// console.log(Data[i]["date"]+" : originalRecords["+arr_id+"] <- "+orig_id+" | "+Data[i]["name"])
}
// $("#move2trash").prop('disabled', true);
var t0 = AjaxRecords[0].date.split("-").map(Number)
var t1 = AjaxRecords.slice(-1)[0].date.split("-").map(Number)
oldest = t0;
......@@ -458,6 +453,7 @@ function Main_test( Data , SearchFilter ) {
var the_content = $("#filter_search").html();
$(""+the_content).insertAfter("#dynatable-query-search-my-ajax-table")
// .insertAfter("#dynatable-query-search-my-ajax-table")
return "OK"
......
......@@ -35,24 +35,39 @@
<div class="panel-body">
<div class="container">
<ul>
<li>Version 1.0</li>
<li>Version 2.0</li>
<ul>
<li>[Start] Beta Version </li>
<li>[Law] Licence of Gargantext is GPL v3+ </li>
<li>[NAME] Red Lemon</li>
<li>[NLP] Turbo Parser, MELT</li>
<li>[FEATURE] Ngrams Table management</li>
<li>[FEATURE] Annotation local view</li>
<li>[FEATURE] Lexical Graph with temporal filter</li>
<li>[FEATURE] Graph bi-Partite</li>
</ul>
<li>Version 1.0.5</li>
<li>Versions from 1.0 to 1.9</li>
<ul>
<li>Bug resolution: [Import] zipped XML from Mac</li>
<li>Bug resolution: [Import] French accents in filenames</li>
<li>New feature: [Advanced chart] ngrams completion</li>
<li>New feature: [Duplicates management] button to delete all duplicates</li>
<li>[NAME] Rose Bonbon</li>
<li>[Law] Licence of Gargantext is (and will be for next versions) AGPL</li>
<li>[NLP] Turbo Parser, TreeTagger</li>
<li>[FEATURE] Advanced Chart</li>
<li>[FEATURE] Remove duplicates</li>
</ul>
<li>Version 1.0.6</li>
<li>Versions from 0.1 to 0.9</li>
<ul>
<li>Bug resolution: [Advanced chart] one can make comparisons with different corpora at different scales</li>
<li>Bug resolution: [Graph] Graph link cannot be executed until the workflow is finished.</li>
<li>[NAME] Black Salade</li>
<li>[Law] Licence of Gargantext is GPL v3+ </li>
<li>[NLP] NLTK, TreeTagger</li>
<li>[FEATURE] Graph Explorer</li>
</ul>
</ul>
</ul>
</div>
</div>
</div>
......@@ -146,28 +161,52 @@
</div>
</div>
{% if sponsors %}
<div class="panel panel-default">
<div class="panel-heading">
<h2 class="panel-title">
<center>
<h2>Sponsors</h2>
<h2>Institutional, research and financial support</h2>
<h3>Host institutions</h3>
<a href="http://www.cnrs.fr" target="_blank" >
<img src="{% static "img/sponsors/cnrs.png"%}" alt="CNRS" style="height:100px">
</a>
<a href="http://www.iscpif.fr" target="_blank" >
<img src="{% static "img/sponsors/iscpif.svg"%}" style="height:100px">
</a>
{% for sponsor in sponsors %}
<a href="{{ sponsor.website }}" target="_blank" >
<img src="{% static "img/sponsors/"%}{{ sponsor.picture }}" style="height:100px">
<a href="http://cams.ehess.fr" target="_blank" >
<img src="{% static "img/sponsors/cams.jpg"%}" style="height:100px">
</a>
<h3>Institutional Partners</h3>
<p>
{% for institution in institutions %}
<a href="{{ institution.website }}" target="_blank" >
<img src="{% static "img/sponsors/"%}{{ institution.picture }}" style="height:100px">
</a>
{% endfor %}
</p>
<h4>Laboratory Partners</h4>
<p>
{% for labo in labos %}
<a href="{{ labo.website }}" target="_blank" >
<img src="{% static "img/sponsors/"%}{{ labo.picture }}" style="height:50px">
</a>
{% endfor %}
</p>
<h4>Grants</h4>
{% for grant in grants %}
<a href="{{ grant.website }}" target="_blank" >
<img src="{% static "img/sponsors/"%}{{ grant.picture }}" style="height:100px">
</a>
{% endfor %}
</center>
</div>
</div>
</div>
{% endif %}
......
......@@ -104,6 +104,10 @@ th a {
<div id="filter_search" style="visibility:hidden">
<span style="font-size:70%;">
<input title="Search in Titles" type="checkbox" checked onclick="return false">TI</input>&nbsp;
<input title="Search in Abstracts" type="checkbox">AB</input>
</span>&nbsp;&nbsp;
<select id="example-single-optgroups" onchange="SearchFilters(this);">
<!-- <optgroup label=""> -->
<option id="filter_all" value="filter_all">All</option>
......
......@@ -45,13 +45,7 @@
</ul>
'>Manage</a>
<!--
<div class="progress">
<div class="progress-bar progress-bar-striped active" role="progressbar" aria-valuenow="70" aria-valuemin="0" aria-valuemax="100" style="width: 90%">
<span class="sr-only">45% Complete</span>
</div>
</div>
-->
{% if number == 0 %}
<a class="btn btn-primary btn-lg" role="button" href="/admin/documents/corpus/{{ corpus.id }}/">Add documents</a></p>
......@@ -65,8 +59,7 @@
<center>
<a type="button" class="btn btn-default {% if view == "documents" %}active{%endif%}" href="/project/{{project.id}}/corpus/{{ corpus.id }}/documents">{{number}} Documents</a>
<a type="button" class="btn btn-default {% if view == "journals" %}active{%endif%}" href="/project/{{project.id}}/corpus/{{ corpus.id }}/journals">Journals</a>
{% if processing == 0 %}
{% if processing == 0 or processing == "0" %}
<a type="button" class="btn btn-default {% if view == "terms" %}active{%endif%}" href="/project/{{project.id}}/corpus/{{ corpus.id }}/terms">Terms (Bêta)</a>
{% endif %}
</center>
......@@ -92,21 +85,24 @@
</div>
</div>
<span style="display:none;" id="process_state">{{processing}}</span>
<span style="display:none;" id="corpus_id">{{corpus.id}}</span>
<div class="col-md-6">
<div class="jumbotron">
{% if processing > 0 %}
<h3> <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Networks (later)</h3>
{% if processing == 0 or processing == "0" %}
<h3> Networks </h3>
<ol>
<li>Terms</li>
<li>Journals and Terms</li>
<li data-url="/project/{{project.id}}/corpus/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams" onclick='gotoexplorer(this)'><a>Terms</a></li>
<li data-url="/project/{{project.id}}/corpus/{{ corpus.id }}/explorer?field1=journal&amp;field2=ngrams" onclick='gotoexplorer(this)'><a>Journals and Terms</a></li>
<li>Authors and Terms</li>
</ol>
{% else %}
<h3> Networks </h3>
<h3><img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Networks </h3>
<h6>(Updating: <i id="process_id" data-since="date" >{{processing}}</i>)</h6>
<ol>
<li data-url="/project/{{project.id}}/corpus/{{ corpus.id }}/explorer?field1=ngrams&amp;field2=ngrams" onclick='gotoexplorer(this)'><a>Terms</a></li>
<li data-url="/project/{{project.id}}/corpus/{{ corpus.id }}/explorer?field1=journal&amp;field2=ngrams" onclick='gotoexplorer(this)'><a>Journals and Terms</a></li>
<li>Terms</li>
<li>Journals and Terms</li>
<li>Authors and Terms</li>
</ol>
{% endif %}
......@@ -140,6 +136,35 @@
return window.open(url_,'_blank');
}
var refresh_time = 10000 //ms
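// Poll the corpus-state API every refresh_time ms: reload the page once "Processing" reports "0",
// otherwise display the current workflow step next to the spinner.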
function corpus_monitorer() {
var url_ = "/api/corpus/"+$("#corpus_id").text()
$.ajax({
type: "GET",
url: url_,
dataType: "json",
success : function(data, textStatus, jqXHR) {
if(data["Processing"]=="0") {
window.location.reload()
} else {
$("#process_id").html(data["Processing"]+"...")
}
},
error: function(exception) {
console.log("exception!:"+exception.status)
}
});
}
if( $("#process_state").text()=="0" ) {
// workflow : finished!
} else {
setInterval(corpus_monitorer ,refresh_time);
}
</script>
......
......@@ -43,6 +43,9 @@ tr:hover {
.table-hover tbody tr:hover td, .table-hover tbody tr:hover th {
background-color: #F5A9A9;
}*/
.normal {
color: black;
}
.delete {
color:red;
......@@ -69,15 +72,15 @@ tr:hover {
border: 1px solid yellow;
}
#group_flag {
}
.dynatable-record-count {
font-size: 0.7em;
}
.dynatable-pagination-links {
font-size: 0.7em;
}
input[type=radio] {
display:none;
}
input[type=radio] + label {
display:inline-block;
......@@ -124,6 +127,13 @@ input[type=radio]:checked + label {
{% block content %}
<div id="content_loader">
<br>
<center>
<img width="10%" src="{% static "img/ajax-loader.gif"%}"></img>
</center>
<br>
</div>
<div class="container">
<div class="container">
......@@ -179,11 +189,10 @@ input[type=radio]:checked + label {
</table>
</p> -->
<p align="right">
<button id="Clean_All" class="btn btn-warning">Clean</button>
<!-- <button id="Clean_All" class="btn btn-warning">Clean</button> -->
<button id="Save_All" class="btn btn-primary">Save</button>
</p>
</div>
</div>
</div>
......@@ -192,32 +201,65 @@ input[type=radio]:checked + label {
</div>
<div id="savemodal" class="modal fade">
<div class="modal-dialog">
<div class="modal-content">
<div id="corpuses" class="modal fade">
<div class="modal-dialog">
<div class="modal-content">
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
<h4 class="modal-title">Group NGrams</h4>
</div>
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
<h3 class="modal-title">Adding a list from another corpus</h3>
</div>
<div class="modal-body form-horizontal">
Do you want to merge these elements before continuing?
<div id="to_group"></div>
</div>
<div class="modal-body form-horizontal">
<div class="form-inline">
<label class="control-label">
Which list do you want?</label>
<label class="radio">
<input value="miam" name="whichlist" disabled type="radio">MiamList
</label>
<label class="radio">
<input value="stop" name="whichlist" checked type="radio">StopList
</label>
</div>
<div class="modal-footer">
<button id="closesavemodal" type="button" class="btn btn-default" data-dismiss="modal">Close</button>
<button type="button" class="btn btn-primary" onclick="GroupNGrams();">Save</button>
</div>
<h4>Choose one corpus:</h4>
<div style="color:red;" id="selected_corpus"></div>
<div id="user_portfolio"></div>
<div class="modal-footer">
<button id="closecorpuses" type="button" class="btn btn-default" data-dismiss="modal">Close</button>
<button id="add_corpus_tab" type="button" class="btn btn-primary" disabled onclick='printCorpuses();'>Add Tab</button>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<div id="filter_search" style="visibility:hidden">
<select id="example-single-optgroups" onchange="SearchFilters(this);">
<!-- <optgroup label=""> -->
<option id="filter_all" value="filter_all">All</option>
<!-- <option id="filter_title" value="filter_title">Title</option> -->
<!-- <option id="filter_date" value="filter_date">Date</option> -->
<!-- </optgroup> -->
<!-- <optgroup label="Duplicates"> -->
<!-- <option value="filter_doi">By DOI</option> -->
<option id="filter_map-list" value="filter_map-list">Map-List</option>
<option id="filter_stop-list" value="filter_stop-list">Stop-List</option>
<!-- </optgroup> -->
</select>
<button id="ImportList" onclick="GetUserPortfolio(); $('#corpuses').modal('show');" class="btn btn-warning">Import a Corpus-List</button>
</div>
<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
<script src="{% static "js/charts/bootstrap.min.js" %}"></script>
<script src="{% static "js/libs/jquery/jquery.ba-dotimeout.min.js" %}" type="text/javascript"></script>
......
......@@ -136,6 +136,7 @@
<li>
<a>
<div id="graphid" style="visibility: hidden;">{{graphfile}}</div>
<input type="hidden" id="list_id" value="{{ list_id }}"></input>
<div id="jquerytemplatenb" style="visibility: hidden;">{{user.id}}</div>
</a>
</li>
......@@ -181,8 +182,6 @@
</ul>
<ul class="nav navbar-nav navbar-right">
<li><a>
<input type="checkbox" id="checkboxdiv" onclick="alertCheckBox(this);">Add</input>
......@@ -194,7 +193,18 @@
</a></li>
</ul>
<ul class="nav navbar-nav navbar-right">
<li>
<a>
<img width="17%" title="Compare with other corpus!" onclick="GetUserPortfolio(); $('#corpuses').modal('show');" src="{% static "js/libs/img2/INTER.png" %}"></img>
</a>
</li>
</ul>
<div class="colorgraph_div"></div>
<div class="sizegraph_div"></div>
<!---->
......@@ -287,6 +297,7 @@
<div id="leftcolumn">
<div id="tips"></div>
<div id="names"></div>
<div id="ngrams_actions"></div>
<br>
......@@ -314,7 +325,7 @@
<ul class='etabs'>
<li id="tabmed" class='tab active'><a href="#tabs3">Medline Pubs</a></li>
<li id="tabgps" class='tab'><a onclick="$('#corpuses').modal('show');">+</a></li>
<li id="tabgps" class='tab'><a href="#tabs3"></a></li>
</ul>
<div class='panel-container'>
......
......@@ -19,14 +19,23 @@
<div class="col-md-4 content">
<h1>Gargantext</h1>
<p>A web platform to explore text-mining</p>
<a class="btn btn-primary btn-lg" href="/projects" title="Click and test by yourself">Test Gargantext</a>
</div>
<div class="col-md-2 content"></div>
<a class="btn btn-primary btn-lg" href="/projects" title="Click and test by yourself">Test Gargantext
</a>
<p>
<span class="glyphicon glyphicon-warning-sign" aria-hidden="true"></span>
<small>
<i>
Some features may not work without a JavaScript-optimized browser (Chromium, for instance).
</i>
</small>
</p>
</div>
<div class="col-md-2 content"></div>
<div class="col-md-2 content"></div>
<div class="col-md-2 content">
<p class="right">
<div style="border:15px">
<img src="{% static "img/logo.png"%}" title="Logo designed by anoe" style="100px; height:150px; border:3px solid white">
<img src="{% static "img/logo.png"%}" title="Logo designed by dacha and anoe" style="100px; height:150px; border:3px solid white">
</div>
</p>
</div>
......
......@@ -22,7 +22,6 @@
<div class="navbar-collapse collapse">
<ul class="nav navbar-nav">
<!-- <li><a href="/admin/">Admin/</a></li> --!>
<li><a href="/about/" title="More informations about the project, its sponsors and its authors.">About</a>
</li>
{% if user.is_authenticated %}
......@@ -75,7 +74,7 @@
<hr>
<footer>
<p>Gargantext, version 1.0.6, <a href="http://www.cnrs.fr" target="blank" title="Institution that enables this project.">Copyrights CNRS {{ date.year }}</a>,
<p>Gargantext, version 2.0, <a href="http://www.cnrs.fr" target="blank" title="Institution that enables this project.">Copyrights CNRS {{ date.year }}</a>,
<a href="http://www.gnu.org/licenses/agpl-3.0.html" target="blank" title="Legal instructions of the project.">Licence aGPLV3</a>.</p>
</footer>
......
......@@ -21,6 +21,13 @@
{ font-size:x-small;}
</style>
<script type="text/javascript">
</script>
{% endblock %}
......@@ -266,7 +273,10 @@
success: function(data) {
console.log("in doTheQuery() Ajax.Success:")
console.log(data)
location.reload();
setTimeout(
function() {
location.reload();
}, 3000);
},
error: function(result) {
console.log("in doTheQuery(). Data not found");
......@@ -333,6 +343,7 @@
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("SUCCESS")
console.log("in getGlobalResults")
console.log(data)
console.log("enabling "+"#"+value.id)
......@@ -349,12 +360,15 @@
$('#submit_thing').prop('disabled', false);
} else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
if(data[0]==false)
$("#theresults").html("Pubmed connection error!</i><br>")
$('#submit_thing').prop('disabled', true);
}
},
error: function(result) {
console.log("Data not found");
$("#theresults").html("Pubmed connection error!</i><br>")
$('#submit_thing').prop('disabled', true);
}
});
}
......@@ -500,8 +514,11 @@
},
success: function(data) {
console.log("ajax_success: in testISTEX()")
console.log(data)
location.reload();
console.log(data)
setTimeout(
function() {
location.reload();
}, 5000);
},
error: function(result) {
console.log("in testISTEX(). Data not found");
......
......@@ -6,6 +6,7 @@
<link rel="stylesheet" href="{% static "css/bootstrap.css" %}">
<script src="{% static "js/jquery/jquery.min.js" %}" type="text/javascript"></script>
{% endblock %}
......@@ -43,7 +44,7 @@
data-content='
<ul>
<li> Rename </li>
<li> Add new corpus </li>
<li><a href="/project/{{ project.id }}">Add new corpus</a></li>
<li><a href="/delete/{{ project.id }}">Delete</a></li>
</ul>
'>Manage</button>
......
# Without this, we couldn't use the Django environment
from admin.env import *
from ngram.stemLem import *
from ngram.lists import *
#user = session.query(User).all()[0]
user = session.query(User).filter(User.username=='alexandre').first()
print('Current user is:', user.username)
project = session.query(Node).filter(Node.name == 'Test').first()
if project is None:
project = Node(
name = 'Test',
type_id = cache.NodeType['Project'].id,
user_id = user.id
)
session.add(project)
session.commit()
#corpora = session.query(Node).filter(Node.parent_id == project.id,
# Node.type_id == cache.NodeType['Corpus'].id
# ).delete()
#
#models.Node.objects(parent_id = project.id, type_id = cache.NodeType['Corpus']).all().delete()
#
corpus = session.query(Node).filter(Node.parent_id == project.id,
Node.type_id == cache.NodeType['Corpus'].id).first()
print('Corpus is', corpus)
if corpus is None:
corpus = Node(
parent_id = project.id,
name = 'Test Corpus',
type_id = cache.NodeType['Corpus'].id,
user_id = user.id
)
session.add(corpus)
session.commit()
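# Standard ingestion workflow on the test corpus: attach the sample archive,
# parse it, extract ngrams from titles and abstracts, then compute TF-IDF.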
add_resource(corpus,
file = '/srv/gargantext_lib/data_samples/pubmed.zip',
# #file = '/srv/gargantext_lib/data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
type_id = cache.ResourceType['Pubmed (xml format)'].id,
)
parse_resources(corpus)
extract_ngrams(corpus, ('title', 'abstract'))
compute_tfidf(corpus)
doc_id = session.query(Node.id).filter(Node.parent_id == corpus.id,
Node.type_id == cache.NodeType['Document'].id).all()[2]
print('Miam list', listIds(typeList='MiamList', corpus_id=corpus.id, user_id=user.id)[0][0])
# Stemming the corpus
print('Working on corpus:', corpus.id, corpus.name)
stem_id = stem_corpus(corpus_id=corpus.id)
print('Stem Node.id is', stem_id)
#for typeList in ['MiamList', 'StopList', 'MainList', 'Group']:
# n = listIds(user_id=user.id,
# corpus_id=corpus.id,
# typeList=typeList)
# #print(n[0][0])
# print('Test having list_id')
# print(n, listNgramIds(list_id=n[0][0])[:3])
#
stop_list_id = listIds(user_id=user.id,
corpus_id=corpus.id,
typeList='StopList')[0][0]
miam_list_id = listIds(user_id=user.id,
corpus_id=corpus.id,
typeList='MiamList')[0][0]
print('StopList', stop_list_id)
print('MiamList', miam_list_id)
print(session.query(Node.id).filter(Node.parent_id==corpus.id, Node.type_id==cache.NodeType['WhiteList'].id).first())
#ngrams2miam(user_id=user.id, corpus_id=corpus.id)
doc_ngram_list = listNgramIds(corpus_id=corpus.id, doc_id=doc_id, user_id=user.id)
print(doc_ngram_list)
#print(listNgramIds(list_id=stop_list_id, user_id=user.id, corpus_id=corpus.id))
#type_list='MiamList'
#try:
# d = doList(type_list=type_list, user_id = user.id, corpus_id = corpus.id, limit=150)
## print('Size of the ' + type_list + ' list:',
## session.query(NodeNgram).filter(NodeNgram.node_id == d).count()
## )
#except:
# PrintException()
##
#print(listNgramIds(list_id=miam_list_id, user_id=user.id, corpus_id=corpus.id))
#
#ngram_id = listNgramIds(list_id=miam_list_id, user_id=user.id, corpus_id=corpus.id)[0][0]
#print('ngram_id', ngram_id)
#
#ngramList(do='add', ngram_ids=[ngram_id,], list_id=stop_list_id)
# print('Test having typeList and corpus.id')
# print(n, listNgramIds(typeList=typeList, corpus_id=corpus.id, user_id=user.id)[:3])
##
# print('Test having typeList and corpus.id and doc_id')
# print(n, listNgramIds(typeList=typeList, corpus_id=corpus.id, doc_id=doc_id, user_id=user.id)[:3])
import threading
from queue import Queue
# import time
import random
from gargantext_web.db import session, Node_Ngram
class ChunkedSELECTS:
def __init__(self):
self.q = Queue()
self.firstResults = []
self.lock = threading.Lock() # lock to serialize console output
self.ngrams_dict = {}
def worker_sql_action(self , docs_list):
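# For each document id, fetch its (ngram_id, weight) pairs and keep only the ngrams
# listed in self.ngrams_dict; returns {doc_id: [[ngram_id, weight], ...]}.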
data = {}
for d in docs_list:
# this_ngrams = session.query(Node_Ngram.ngram_id).filter( Node_Ngram.node_id==d).all()
this_ngrams = session.query(Node_Ngram.ngram_id,Node_Ngram.weight).filter( Node_Ngram.node_id==d).all()
filtered_ngrams = []
for n in this_ngrams:
if n[0] in self.ngrams_dict:
# filtered_ngrams.append( n[0] )
filtered_ngrams.append( [ n[0] , int(n[1]) ] )
data[d] = filtered_ngrams
with self.lock:
# print(threading.current_thread().name, str(len(docs_list))+" OK")
return data
def worker_sql(self):
while True:
item = self.q.get()
results = []
try:
result = self.worker_sql_action(item)
except:
result = False
self.firstResults.append(result)
self.q.task_done()
def chunks(self , l , n):
for i in range(0, len(l), n):
yield l[i:i+n]
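A minimal driver for ChunkedSELECTS, sketched here for clarity (hypothetical, not part of the commit): doc_ids, miam_ngrams, the worker count and the chunk size are illustrative, and the rest of the class is assumed to keep the queue/worker pattern shown above.
# Hypothetical usage sketch -- doc_ids and miam_ngrams are illustrative, not from the original file.
sel = ChunkedSELECTS()
sel.ngrams_dict = {ngram_id: True for ngram_id in miam_ngrams}   # ngrams worth keeping
for _ in range(4):                                               # small pool of worker threads
    t = threading.Thread(target=sel.worker_sql)
    t.daemon = True                                              # workers loop forever; let the process exit anyway
    t.start()
for chunk in sel.chunks(doc_ids, 200):                           # enqueue document ids in slices of 200
    sel.q.put(chunk)
sel.q.join()                                                     # block until every chunk is processed
doc_ngrams = {}
for partial in sel.firstResults:                                 # each item is a per-chunk dict, or False on error
    if partial:
        doc_ngrams.update(partial)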
......@@ -56,43 +56,6 @@ from rest_v1_0.api import JsonHttpResponse
from ngram.lists import listIds, listNgramIds, ngramList , doList
def test_page(request , project_id , corpus_id):
if not request.user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
try:
offset = int(project_id)
offset = int(corpus_id)
except ValueError:
raise Http404()
t = get_template('tests/test_select-boostrap.html')
user = cache.User[request.user.username].id
date = datetime.datetime.now()
project = cache.Node[int(project_id)]
corpus = cache.Node[int(corpus_id)]
type_doc_id = cache.NodeType['Document'].id
number = session.query(func.count(Node.id)).filter(Node.parent_id==corpus_id, Node.type_id==type_doc_id).all()[0][0]
try:
processing = corpus.hyperdata['Processing']
except Exception as error:
print(error)
processing = 0
html = t.render(Context({
'debug': settings.DEBUG,
'user': request.user.username,
'date': date,
'project': project,
'corpus' : corpus,
'processing' : processing,
'number' : number,
}))
return HttpResponse(html)
def get_ngrams(request , project_id , corpus_id ):
if not request.user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
......@@ -122,7 +85,7 @@ def get_ngrams(request , project_id , corpus_id ):
html = t.render(Context({
'debug': settings.DEBUG,
'user': request.user.username,
'user': request.user,
'date': date,
'project': project,
'corpus' : corpus,
......@@ -133,32 +96,6 @@ def get_ngrams(request , project_id , corpus_id ):
return HttpResponse(html)
def get_stoplist(request , corpus_id , doc_id):
"""Get All for a doc id"""
user_id = request.user.id
whitelist_type_id = cache.NodeType['WhiteList'].id
document_type_id = cache.NodeType['Document'].id
miam_id = listIds(typeList='MiamList', user_id=request.user.id, corpus_id=corpus_id)[0][0]
count_min = 2
size = 1000
corpus_id = int(corpus_id)
lists = dict()
for list_type in ['StopList']:
list_id = list()
list_id = listIds(user_id=request.user.id, corpus_id=int(corpus_id), typeList=list_type)
lists["%s" % list_id[0][0]] = list_type
doc_ngram_list = listNgramIds(corpus_id=corpus_id, list_id=list_id[0][0], doc_id=list_id[0][0], user_id=request.user.id)
StopList = {}
for n in doc_ngram_list:
StopList[ n[0] ] = True
results = StopList.keys() #[ "hola" , "mundo" ]
return JsonHttpResponse(StopList)
def get_journals(request , project_id , corpus_id ):
if not request.user.is_authenticated():
......@@ -187,7 +124,7 @@ def get_journals(request , project_id , corpus_id ):
html = t.render(Context({
'debug': settings.DEBUG,
'user': request.user.username,
'user': request.user,
'date': date,
'project': project,
'corpus' : corpus,
......@@ -216,115 +153,32 @@ def get_journals_json(request , project_id, corpus_id ):
from gargantext_web.db import session, cache, Node, NodeNgram
from sqlalchemy import or_, func
from sqlalchemy.orm import aliased
def get_ngrams_json(request , project_id, corpus_id ):
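# Build per-ngram metrics for the corpus: unique occurrence counts restricted to the MiamList,
# summed TF-IDF scores, StopList terms excluded; returned as a JSON "Metrics" payload for the terms table.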
results = ["holaaaa" , "mundo"]
user_id = request.user.id
whitelist_type_id = cache.NodeType['WhiteList'].id
document_type_id = cache.NodeType['Document'].id
miam_id = listIds(typeList='MiamList', user_id=request.user.id, corpus_id=corpus_id)[0][0]
count_min = 2
size = 1000
corpus_id = int(corpus_id)
lists = dict()
for list_type in ['StopList']:
list_id = list()
list_id = listIds(user_id=request.user.id, corpus_id=int(corpus_id), typeList=list_type)
lists["%s" % list_id[0][0]] = list_type
doc_ngram_list = listNgramIds(corpus_id=corpus_id, list_id=list_id[0][0], doc_id=list_id[0][0], user_id=request.user.id)
StopList = {}
for n in doc_ngram_list:
StopList[ n[0] ] = True
# [ Get Uniq_Occs ]
myamlist_type_id = cache.NodeType['MiamList'].id
myamlist = session.query(Node).filter(Node.user_id == user_id , Node.parent_id==corpus_id , Node.type_id == myamlist_type_id ).first()
myamlists = session.query(Node).filter(Node.user_id == user_id , Node.parent_id==corpus_id , Node.type_id == myamlist_type_id ).all()
# sql_average = """SELECT avg(weight) as Average FROM node_node_ngram WHERE node_node_ngram.node_id=%d""" % (myamlist.id)
# cursor = connection.cursor()
# cursor.execute(sql_average)
# avg_result = cursor.fetchone()[0]
# threshold = min (10 , math.sqrt(avg_result) )
# OCCs = session.query(Node_Ngram).filter( Node_Ngram.node_id==myamlist.id , Node_Ngram.weight >= threshold ).all()
# [ / Get Uniq_Occs ]
Miam = aliased(NodeNgram)
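# Total occurrences per ngram, summed over the corpus documents and restricted to ngrams of the MiamList.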
sql_average = (session.query(NodeNgram.ngram_id, func.sum(NodeNgram.weight))
.join(Node, Node.id == NodeNgram.node_id)
.join(Miam, Miam.ngram_id == NodeNgram.ngram_id)
.filter(Node.parent_id == corpus_id, Node.type_id==cache.NodeType['Document'].id)
.filter(Miam.node_id==myamlist.id)
.group_by(NodeNgram.ngram_id)
.all()
)
# print([n for n in sql_average])
OCCs = {}
for ngram in sql_average:
OCCs [ ngram[0] ] = ngram[1]
# [ Initializing Ngrams_Scores with occ_uniq ]
Ngrams_Scores = {}
for ngram in OCCs:
if ngram not in StopList:
if ngram not in Ngrams_Scores:
Ngrams_Scores[ngram] = {}
Ngrams_Scores[ngram]["scores"] = {
"occ_uniq": round(OCCs[ngram]),
"tfidf_sum": 0.0
}
# [ / Initializing Ngrams_Scores with occ_uniq ]
# [ Getting TF-IDF scores (sum per each ngram) ]
NgramTFIDF = session.query(NodeNodeNgram).filter( NodeNodeNgram.nodex_id==corpus_id ).all()
for ngram in NgramTFIDF:
if ngram.ngram_id not in StopList:
if ngram.ngram_id in Ngrams_Scores:
Ngrams_Scores[ngram.ngram_id]["scores"]["tfidf_sum"] += ngram.score
# [ / Getting TF-IDF scores ]
# [ Preparing JSON-Array full of Scores! ]
Metrics = {
"ngrams":[],
"scores": {}
}
ngrams_ids = Ngrams_Scores.keys()
query = session.query(Ngram).filter(Ngram.id.in_( ngrams_ids ))
ngrams_data = query.all()
for ngram in ngrams_data:
if ngram.id not in StopList:
occ_uniq = Ngrams_Scores[ngram.id]["scores"]["occ_uniq"]
Ngrams_Scores[ngram.id]["name"] = ngram.terms
Ngrams_Scores[ngram.id]["id"] = ngram.id
Ngrams_Scores[ngram.id]["scores"]["tfidf"] = Ngrams_Scores[ngram.id]["scores"]["tfidf_sum"] / occ_uniq
del Ngrams_Scores[ngram.id]["scores"]["tfidf_sum"]
Metrics["ngrams"].append( Ngrams_Scores[ngram.id] )
Metrics["scores"] = {
"initial":"occ_uniq",
"nb_docs":1,
"orig_nb_ngrams":1,
"nb_ngrams":len(Metrics["ngrams"]),
# "occs_threshold":threshold
}
# [ / Preparing JSON-Array full of Scores! ]
# print("miamlist:",myamlist.id)
# print("sql avg:",sql_average)
# print (avg_result)
# print ("LALALALALALALALLLALALALALA")
return JsonHttpResponse(Metrics)
def get_corpuses( request , node_ids ):
ngrams = [int(i) for i in node_ids.split("+") ]
results = session.query(Node.id,Node.hyperdata).filter(Node.id.in_(ngrams) ).all()
for r in results:
print(r)
return JsonHttpResponse( [ "tudo" , "bem" ] )
def get_cores( request ):
import multiprocessing
cpus = multiprocessing.cpu_count()
return JsonHttpResponse( {"data":cpus} )
def get_corpus_state( request , corpus_id ):
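# Return the corpus node's hyperdata (e.g. its "Processing" step) as JSON, read directly from node_node with raw SQL.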
if not request.user.is_authenticated():
return JsonHttpResponse( {"request" : "forbidden"} )
processing = ["Waiting"]
the_query = """ SELECT hyperdata FROM node_node WHERE id=%d """ % ( int(corpus_id) )
cursor = connection.cursor()
try:
cursor.execute(the_query)
processing = cursor.fetchone()[0]
finally:
connection.close()
# processing = corpus.hyperdata['Processing']
return JsonHttpResponse( processing )
\ No newline at end of file