Commit 042591de authored by Administrator

Merge branch 'stable-jsonb' into tina-jsonb

parents 2b67af45 63d15bd8
import networkx as nx
from itertools import combinations

class Utils:
    def __init__(self):
        self.G = nx.Graph()

    def unique(self, a):
        """ return the list with duplicate elements removed """
        return list(set(a))

    def intersect(self, a, b):
        """ return the intersection of two lists """
        return list(set(a) & set(b))

    def union(self, a, b):
        """ return the union of two lists """
        return list(set(a) | set(b))

    def addCompleteSubGraph(self, terms):
        G = self.G
        # <addnode> #
        for i in terms:
            G.add_node(i)
        # </addnode> #
        # <addedge> #
        for n1, n2 in combinations(terms, 2):
            if G.has_edge(n1, n2):
                G[n1][n2]['weight'] += 1.0
            else:
                G.add_edge(n1, n2, weight=1.0)
        # </addedge> #
        self.G = G
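For reference, a quick illustration of how addCompleteSubGraph accumulates co-occurrence weights (the terms below are made up, not from the diff):

# Illustrative usage: each call adds a clique over the given terms;
# repeated co-occurrences increment the edge weight.
u = Utils()
u.addCompleteSubGraph(['cell', 'gene', 'protein'])
u.addCompleteSubGraph(['cell', 'gene'])
print(u.G['cell']['gene'])     # {'weight': 2.0}
print(u.G['cell']['protein'])  # {'weight': 1.0}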
from gargantext_web.db import *

from collections import defaultdict

from django.db import connection, transaction
@@ -13,29 +11,26 @@ def create_blacklist(user, corpus):
def create_synonymes(user, corpus):
    pass
def create_whitelist(user, corpus_id, size=100):
    cursor = connection.cursor()

    whitelist_type_id = cache.NodeType['WhiteList'].id
    blacklist_type_id = cache.NodeType['BlackList'].id
    type_document_id = cache.NodeType['Document'].id

    white_list = Node(name='WhiteList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=whitelist_type_id)
    black_list = Node(name='BlackList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=blacklist_type_id)
    session.add(white_list)
    session.add(black_list)
    session.commit()
    # delete beforehand to avoid duplicates
    # try:
    #     Node_Ngram.objects.filter(node=white_list).all().delete()
    # except:
    #     print('First time we compute cooc')
    #
    query_whitelist = """
    INSERT INTO node_node_ngram (node_id, ngram_id, weight)
    SELECT
@@ -67,28 +62,29 @@ def create_whitelist(user, corpus, size=100):
    LIMIT
        %d
    ;
    """ % (white_list.id, int(corpus_id), int(type_document_id), size)
    # print("PRINTING QUERY OF WHITELIST:")
    # print(query_whitelist)
    cursor.execute(query_whitelist)
    return white_list
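A hedged usage sketch (the corpus id is made up): the returned node is the whitelist that create_cooc below expects.

# Illustrative call, not part of the diff:
# whitelist = create_whitelist(request.user, corpus_id=42, size=100)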
#def create_cooc(user, corpus, whitelist, blacklist, synonymes):
def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=None, year_end=None):
    cursor = connection.cursor()

    cooc_type_id = cache.NodeType['Cooccurrence'].id

    # for the tests, delete the old coocs
    #session.Node.objects.filter(type=cooc_type, parent=corpus).delete()

    cooc = Node(user_id=user.id,
                parent_id=corpus_id,
                type_id=cooc_type_id,
                name="Cooccurrences corpus " + str(corpus_id))

    session.add(cooc)
    session.commit()
    query_cooc = """
    INSERT INTO node_nodengramngram (node_id, "ngramx_id", "ngramy_id", score)
@@ -133,10 +129,11 @@ def create_cooc(user=None, corpus=None, whitelist=None, size=150, year_start=Non
        score DESC
    LIMIT
        %d
    """ % (cooc.id, corpus_id, whitelist.id, whitelist.id, size)
    # print(query_cooc)
    cursor.execute(query_cooc)
    return cooc.id
def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150):
    import pandas as pd
@@ -153,36 +150,37 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
    labels = dict()
    weight = dict()

    type_cooc_id = cache.NodeType['Cooccurrence'].id

    if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
        print("Cooccurrences do not exist yet; creating them.")
        whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=n)
        cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=n)
    else:
        cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()

    for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
        labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
        labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]

        ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
        ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id

        matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
        matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score

        weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
        weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
    df = pd.DataFrame(matrix).fillna(0)
    x = copy(df.values)
    x = x / x.sum(axis=1)
    # import pprint
    # pprint.pprint(ids)

    # Removing unconnected nodes
    threshold = min(x.max(axis=1))
    matrix_filtered = np.where(x >= threshold, 1, 0)
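    # Explanatory note (not in the original diff): x.max(axis=1) is each
    # node's strongest normalized link, so taking the minimum of those as
    # the threshold keeps at least one edge per node before binarizing.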
@@ -191,32 +189,41 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
    G = nx.from_numpy_matrix(matrix_filtered)
    G = nx.relabel_nodes(G, dict(enumerate([labels[label] for label in list(df.columns)])))
    #G = nx.relabel_nodes(G, dict(enumerate(df.columns)))

    # Removing too connected nodes (find automatic way to do it)
    # outdeg = G.degree()
    # to_remove = [n for n in outdeg if outdeg[n] >= 10]
    # G.remove_nodes_from(to_remove)

    partition = best_partition(G)
    if type == "node_link":
        for node in G.nodes():
            try:
                #node, type(labels[node])
                G.node[node]['pk'] = ids[node]
                G.node[node]['label'] = node
                # G.node[node]['pk'] = ids[str(node)]
                G.node[node]['size'] = weight[ids[node]]
                G.node[node]['group'] = partition[node]
                # G.add_edge(node, "cluster " + str(partition[node]), weight=3)
                # G.node[node]['color'] = '19,180,300'
            except Exception as error:
                print("error01: ", error)
        data = json_graph.node_link_data(G)
        links = []
        i = 1
        for e in G.edges_iter():
            s = e[0]
            t = e[1]
            info = {"id": i, "source": ids[s], "target": ids[t]}
            # print(info)
            links.append(info)
            i += 1
        # print(data)
        data["links"] = links
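        # Shape of the payload (note added for clarity, based on networkx's
        # json_graph.node_link_data plus the attributes set above):
        #   {"nodes": [{"label": ..., "pk": ..., "size": ..., "group": ...}, ...],
        #    "links": [{"id": ..., "source": <ngram id>, "target": <ngram id>}, ...]}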
    elif type == "adjacency":
        for node in G.nodes():
@@ -227,48 +234,26 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
                #G.node[node]['size'] = weight[node]
                G.node[node]['group'] = partition[node]
                #G.add_edge(node, partition[node], weight=3)
                # G.node[node]['color'] = '19,180,300'
            except Exception as error:
                print("error02: ", error)
        data = json_graph.node_link_data(G)
    # data = json_graph.node_link_data(G, attrs={\
    #     'source': 'source',\
    #     'target': 'target',\
    #     'weight': 'weight',\
    #     #'label': 'label',\
    #     #'color': 'color',\
    #     'id': 'id',})
    #print(data)
    return data
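A minimal usage sketch (request and corpus id are illustrative, not from the diff):

# Illustrative call: build the node-link payload for corpus 42,
# limited to the 150 heaviest ngrams.
# data = get_cooc(request=request, corpus_id=42, type='node_link', n=150)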
from analysis.tfidf import tfidf

def do_tfidf(corpus, reset=True):
    # print("=========== doing tfidf ===========")
    with transaction.atomic():
        if reset==True:
            NodeNodeNgram.objects.filter(nodex=corpus).delete()
@@ -278,8 +263,7 @@ def do_tfidf(corpus, reset=True):
        # # for i in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
        for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
            # print("the doc:", document)
            for node_ngram in Node_Ngram.objects.filter(node=document):
                # print("\tngram:", node_ngram.ngram)
                try:
                    nnn = NodeNodeNgram.objects.get(nodex=corpus, nodey=document, ngram=node_ngram.ngram)
@@ -288,7 +272,7 @@ def do_tfidf(corpus, reset=True):
                    score = tfidf(corpus, document, node_ngram.ngram)
                    nnn = NodeNodeNgram(nodex=corpus, nodey=node_ngram.node, ngram=node_ngram.ngram, score=score)
                    nnn.save()
                    # print("\t\t", node_ngram.ngram, " : ", score)
            # print("- - - - - - - - - - \n")
    else:
        print("Only corpus is implemented yet; you passed:", type(corpus))
@@ -60,6 +60,7 @@ def tfidf(corpus, document, ngram):
        .filter(NodeNgram.ngram_id == ngram.id)\
        .count()

    # print("\t\t\t", "occs:", occurrences_of_ngram, " || ngramsbydoc:", ngrams_by_document, " || TF = occ/ngramsbydoc:", term_frequency, " |||||| x:", xx, " || y:", yy, " || IDF = log(x/y):", log(xx/yy))
    inverse_document_frequency = log(xx/yy)

    # result = tf * idf
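For intuition, a small worked example of the formula above (numbers are illustrative, not from the diff):

    # A term occurring 3 times in a 100-ngram document, found in 10 of
    # 1000 corpus documents (natural log):
    #   term_frequency             = 3 / 100        = 0.03
    #   inverse_document_frequency = log(1000 / 10) = 4.605...
    #   score = term_frequency * inverse_document_frequency = 0.138...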
from django.http import HttpResponseNotFound, HttpResponse, Http404
from django.core.exceptions import PermissionDenied, SuspiciousOperation
from django.core.exceptions import ValidationError
from django.core.urlresolvers import reverse

from django.db.models import Avg, Max, Min, Count, Sum

# from node.models import Language, ResourceType, Resource
@@ -10,8 +11,9 @@ from sqlalchemy import text, distinct
from sqlalchemy.sql import func
from sqlalchemy.orm import aliased

from gargantext_web.views import move_to_trash
from .db import *
from node import models
def DebugHttpResponse(data):
    return HttpResponse('<html><body style="background:#000;color:#FFF"><pre>%s</pre></body></html>' % (str(data), ))
@@ -45,10 +47,14 @@ _ngrams_order_columns = {
}
from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from rest_framework.permissions import IsAuthenticated

from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework.exceptions import APIException as _APIException
class APIException(_APIException):
    def __init__(self, message, code=500):
        self.status_code = code
@@ -82,7 +88,7 @@ class NodesChildrenNgrams(APIView):
    def get(self, request, node_id):
        # query ngrams
        ParentNode = aliased(Node)
        ngrams_query = (session
            .query(Ngram.terms, func.count().label('count'))
            # .query(Ngram.id, Ngram.terms, func.count().label('count'))
            .join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
@@ -128,7 +134,7 @@ class NodesChildrenDuplicates(APIView):
            raise APIException('Missing GET parameter: "keys"', 400)
        keys = request.GET['keys'].split(',')
        # metadata retrieval
        metadata_query = (session
            .query(Metadata)
            .filter(Metadata.name.in_(keys))
        )
@@ -187,6 +193,7 @@ class NodesChildrenDuplicates(APIView):
        # get the minimum ID for each of the nodes sharing the same metadata
        kept_node_ids_query = self._fetch_duplicates(request, node_id, [func.min(Node.id).label('id')], 0)
        kept_node_ids = [kept_node.id for kept_node in kept_node_ids_query]
        # TODO with new orm
        duplicate_nodes = models.Node.objects.filter(parent_id=node_id).exclude(id__in=kept_node_ids)
        # # delete the stuff
        # delete_query = (session
@@ -197,7 +204,7 @@ class NodesChildrenDuplicates(APIView):
        count = len(duplicate_nodes)
        for node in duplicate_nodes:
            print("deleting node ", node.id)
            move_to_trash(node.id)
        # print(delete_query)
        # # delete_query.delete(synchronize_session=True)
        # session.flush()
@@ -213,7 +220,7 @@ class NodesChildrenMetatadata(APIView):
        # query metadata keys
        ParentNode = aliased(Node)
        metadata_query = (session
            .query(Metadata)
            .join(Node_Metadata, Node_Metadata.metadata_id == Metadata.id)
            .join(Node, Node.id == Node_Metadata.node_id)
@@ -233,7 +240,7 @@ class NodesChildrenMetatadata(APIView):
            values_to = None
            if metadata.type != 'text':
                value_column = getattr(Node_Metadata, 'value_' + metadata.type)
                node_metadata_query = (session
                    .query(value_column)
                    .join(Node, Node.id == Node_Metadata.node_id)
                    .filter(Node.parent_id == node_id)
@@ -381,9 +388,9 @@ class NodesChildrenQueries(APIView):
        for field_name in fields_names:
            split_field_name = field_name.split('.')
            if split_field_name[0] == 'metadata':
                metadata = session.query(Metadata).filter(Metadata.name == split_field_name[1]).first()
                if metadata is None:
                    metadata_query = session.query(Metadata.name).order_by(Metadata.name)
                    metadata_names = [metadata.name for metadata in metadata_query.all()]
                    raise APIException('Invalid key for "%s" in parameter "field", should be one of the following values: "%s". "%s" was found instead' % (field[0], '", "'.join(metadata_names), field[1]), 400)
                # check or create Node_Metadata alias; join if necessary
@@ -422,7 +429,7 @@ class NodesChildrenQueries(APIView):
        )
        # starting the query!
        document_type_id = cache.NodeType['Document'].id ##session.query(NodeType.id).filter(NodeType.name == 'Document').scalar()
        query = (session
            .query(*fields_list)
            .select_from(Node)
@@ -451,9 +458,9 @@ class NodesChildrenQueries(APIView):
            #
            if field[0] == 'metadata':
                # which metadata?
                metadata = session.query(Metadata).filter(Metadata.name == field[1]).first()
                if metadata is None:
                    metadata_query = session.query(Metadata.name).order_by(Metadata.name)
                    metadata_names = [metadata.name for metadata in metadata_query.all()]
                    raise APIException('Invalid key for "%s" in parameter "field", should be one of the following values: "%s". "%s" was found instead' % (field[0], '", "'.join(metadata_names), field[1]), 400)
                # check or create Node_Metadata alias; join if necessary
@@ -475,7 +482,7 @@ class NodesChildrenQueries(APIView):
                ))
            elif field[0] == 'ngrams':
                query = query.filter(
                    Node.id.in_(session
                        .query(Node_Ngram.node_id)
                        .filter(Node_Ngram.ngram_id == Ngram.id)
                        .filter(operator(
@@ -549,11 +556,13 @@ class NodesChildrenQueries(APIView):
class NodesList(APIView):
    authentication_classes = (SessionAuthentication, BasicAuthentication)

    def get(self, request):
        print("user id : " + str(request.user))
        query = (session
            .query(Node.id, Node.name, NodeType.name.label('type'))
            .filter(Node.user_id == int(request.user.id))
            .join(NodeType)
        )
        if 'type' in request.GET:
@@ -576,8 +585,11 @@ class Nodes(APIView):
        return JsonHttpResponse({
            'id': node.id,
            'name': node.name,
            'parent_id': node.parent_id,
            'type': cache.NodeType[node.type_id].name,
            # 'type': node.type__name,
            #'metadata': dict(node.metadata),
            'metadata': node.metadata,
        })
    # deleting node by id
@@ -585,13 +597,19 @@ class Nodes(APIView):
    # it should take the subnodes into account as well,
    # for better consistency...
    def delete(self, request, node_id):
        user = request.user
        node = session.query(Node).filter(Node.id == node_id).first()

        msgres = str()
        try:
            move_to_trash(node_id)
            msgres = node_id + " moved to Trash"
        except Exception as error:
            msgres = "error deleting " + node_id + ": " + str(error)
class CorpusController:
@@ -602,9 +620,9 @@ class CorpusController:
            corpus_id = int(corpus_id)
        except:
            raise ValidationError('Corpora are identified by an integer.', 400)
        corpusQuery = session.query(Node).filter(Node.id == corpus_id).first()
        # print(str(corpusQuery))
        # raise Http404("404 error.")
        if not corpusQuery:
            raise Http404("No such corpus: %d" % (corpus_id, ))
        # corpusQuery is already a Node (or None), not a query: no .first() here
        corpus = corpusQuery
@@ -623,7 +641,7 @@ class CorpusController:
        # build query
        ParentNode = aliased(Node)
        query = (session
            .query(Ngram.terms, func.count('*'))
            .join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
            .join(Node, Node.id == Node_Ngram.node_id)
from gargantext_web import settings
from node import models
__all__ = ['literalquery', 'session', 'cache', 'Session', 'bulk_insert', 'engine', 'get_cursor']

# initialize sqlalchemy
from sqlalchemy.orm import Session, mapper
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import create_engine, MetaData, Table, Column, ForeignKey
from sqlalchemy.types import Integer, String, DateTime
from sqlalchemy.dialects.postgresql import JSON

engine = create_engine('postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}/{NAME}'.format(
    **settings.DATABASES['default']
))

Base = automap_base()
Base.prepare(engine, reflect=True)
# model representation
def model_repr(modelname):
    def _repr(obj):
        result = '<' + modelname
        isfirst = True
        for key, value in obj.__dict__.items():
            if key[0] != '_':
                value = repr(value)
                if len(value) > 64:
                    value = value[:30] + '....' + value[-30:]
                if isfirst:
                    isfirst = False
                else:
                    result += ','
                result += ' ' + key + '=' + value
        result += '>'
        return result
    return _repr
# map the Django models found in node.models to SQLAlchemy models
for model_name, model in models.__dict__.items():
    if hasattr(model, '_meta'):
        table_name = model._meta.db_table
        if hasattr(Base.classes, table_name):
            sqla_model = getattr(Base.classes, table_name)
            setattr(sqla_model, '__repr__', model_repr(model_name))
            globals()[model_name] = sqla_model
            __all__.append(model_name)

NodeNgram = Node_Ngram
NodeResource = Node_Resource
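A quick note on what the loop above yields (the query below is illustrative): each exported name, e.g. Node, is the automapped SQLAlchemy class for the matching Django table, so it is used through a SQLAlchemy session rather than the Django ORM.

# Illustrative usage, not part of the diff:
# session.query(Node).filter(Node.parent_id == 42).count()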
# debugging tool, to translate SQLAlchemy queries to string
@@ -61,16 +103,17 @@ def literalquery(statement, dialect=None):
# SQLAlchemy session management

def get_engine():
    from sqlalchemy import create_engine
    url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}/{NAME}'.format(
        **settings.DATABASES['default']
    )
    return create_engine(url, use_native_hstore=True)

engine = get_engine()

def get_sessionmaker():
    from sqlalchemy.orm import sessionmaker
    return sessionmaker(bind=engine)

Session = get_sessionmaker()
@@ -84,7 +127,7 @@ from sqlalchemy import or_
class ModelCache(dict):

    def __init__(self, model, preload=False):
        self._model = globals()[model.__name__]
        self._columns_names = [column.name for column in model._meta.fields if column.unique]
        self._columns = [getattr(self._model, column_name) for column_name in self._columns_names]
        self._columns_validators = []
@@ -92,20 +135,16 @@ class ModelCache(dict):
            self.preload()

    def __missing__(self, key):
        #print(key)
        conditions = [
            (column == str(key))
            for column in self._columns
            if column.type.python_type == str or key.__class__ == column.type.python_type
        ]
        element = session.query(self._model).filter(or_(*conditions)).first()
        if element is None:
            raise KeyError
        self[key] = element
        return element
    def preload(self):
@@ -115,7 +154,7 @@ class ModelCache(dict):
                key = getattr(element, column_name)
                self[key] = element
class Cache():

    def __getattr__(self, key):
        try:
@@ -127,3 +166,50 @@ class Cache:
        return modelcache

cache = Cache()
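For reference, a sketch of how the cache is used elsewhere in this diff: each attribute is a ModelCache keyed on the model's unique columns, queried and memoized on first access.

# Illustrative usage (mirrors calls in api.py and views.py above):
# document_type_id = cache.NodeType['Document'].id
# user_id = cache.User[request.user.username].id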
# Insert many elements at once
import psycopg2

def get_cursor():
    db_settings = settings.DATABASES['default']
    db = psycopg2.connect(**{
        'database': db_settings['NAME'],
        'user': db_settings['USER'],
        'password': db_settings['PASSWORD'],
        'host': db_settings['HOST'],
    })
    return db, db.cursor()
class bulk_insert:
    def __init__(self, table, keys, data, cursor=None):
        # prepare the iterator
        self.iter = iter(data)
        # template
        self.template = '%s' + (len(keys) - 1) * '\t%s' + '\n'
        # prepare the cursor
        if cursor is None:
            db, cursor = get_cursor()
            mustcommit = True
        else:
            mustcommit = False
        # insert data
        if not isinstance(table, str):
            table = table.__table__.name
        cursor.copy_from(self, table, columns=keys)
        # commit if necessary
        if mustcommit:
            db.commit()

    def read(self, size=None):
        try:
            return self.template % tuple(
                str(x).replace('\r', '').replace('\n', '\\n').replace('\t', '\\t') for x in next(self.iter)
            )
        except StopIteration:
            return ''

    readline = read
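A hedged usage sketch: bulk_insert acts as a file-like object (read/readline) so that psycopg2's copy_from can stream the rows through PostgreSQL's COPY. The table and column names below come from the node_node_ngram inserts earlier in this diff; the values are made up.

# Illustrative usage (values are not from the diff):
# bulk_insert('node_node_ngram', ['node_id', 'ngram_id', 'weight'],
#             ((1, 10, 0.5), (1, 11, 1.0), (1, 12, 2.0)))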
import random
import random_words
from math import pi
def paragraph_lorem(size_target=450):
    '''
    Returns a paragraph of fake Latin ("lorem ipsum") text.
    size_target is the minimum length of the paragraph, in characters.
    '''
    lorem = random_words.LoremIpsum()
    sentences_list = lorem.get_sentences_list(sentences=5)
    paragraph_size = 0

    while paragraph_size < size_target:
        sentences_list.append(lorem.get_sentence())
        paragraph = ' '.join(sentences_list)
        paragraph_size = len(paragraph)

    return paragraph
def paragraph_gargantua(size_target=500):
    '''
    Returns a paragraph built from random chapter lines of Gargantua.
    size_target is the minimum length of the paragraph, in characters.
    '''
    paragraph = list()
    paragraph_size = 0
    chapter_number = 1

    while paragraph_size < size_target and chapter_number < 6:
        chapitre = open('/srv/gargantext/static/docs/gargantua_book/gargantua_chapter_' + str(chapter_number) + '.txt', 'r')
        paragraph.append(random.choice(chapitre.readlines()).strip())
        chapitre.close()
        paragraph_size = len(' '.join(paragraph))
        chapter_number += 1

    return ' '.join(paragraph)
def random_letter(mot, size_min=5):
    '''
    Shuffles the middle letters of a word longer than size_min,
    keeping the first and last letters in place.
    '''
    if len(mot) > size_min:
        size = round(len(mot) / pi)
        first_letters = mot[:size]
        last_letters = mot[-size:]

        others_letters = list(mot[size:-size])
        random.shuffle(others_letters)

        mot_list = list()
        mot_list.append(first_letters)
        for letter in others_letters:
            mot_list.append(letter)
        mot_list.append(last_letters)

        return ''.join(mot_list)
    else:
        return mot
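# English gist of the French string below (translation comment added for
# clarity; the literal itself is kept in French since it is displayed as-is):
# "Apparently the order of letters in a word does not matter, as long as the
# first and last letters are in the right place; the rest can be totally
# scrambled and we can still read without trouble. We therefore read the word
# as a whole, not letter by letter. Shifting frames, we transpose this to the
# text itself: word order matters little compared to the context of the text,
# which is what counts."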
tutoriel = """Il paraît que l'ordre des lettres dans un mot n'a pas d'importance. La première et la dernière lettre doivent être à la bonne place. Le reste peut être dans un désordre total et on peut toujours lire sans problème. On ne lit donc pas chaque lettre en elle-même, mais le mot comme un tout. Un changement de référentiel et nous transposons ce résultat au texte lui-même: l'ordre des mots est faiblement important comparé au contexte du texte qui, lui, est compté"""
def paragraph_tutoreil(tutoriel=tutoriel):
    '''
    Returns the tutorial paragraph with the middle letters
    of each word randomized.
    '''
    paragraph = ' '.join([random_letter(mot) for mot in tutoriel.split(" ")]) \
        + ": comptexter avec Gargantext."
    return paragraph
@@ -64,12 +64,11 @@ INSTALLED_APPS = (
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'django_extensions',
    'django_pg',
    'cte_tree',
    'node',
    'ngram',
    'scrap_pubmed',
    'djcelery',
    'aldjemy',
    'rest_framework',
@@ -84,6 +83,16 @@ MIDDLEWARE_CLASSES = (
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
)
REST_SESSION_LOGIN = False
REST_FRAMEWORK = {
    'DEFAULT_AUTHENTICATION_CLASSES': (
        'rest_framework.authentication.TokenAuthentication',
        'rest_framework.authentication.SessionAuthentication',
    ),
    'DEFAULT_PERMISSION_CLASSES': (
        'rest_framework.permissions.AllowAny',
    ),
}
WSGI_APPLICATION = 'wsgi.application'
@@ -3,7 +3,7 @@ from django.conf.urls import patterns, include, url
from django.contrib import admin
from django.contrib.auth.views import login

from gargantext_web import views, views_optimized
import gargantext_web.api
import scrap_pubmed.views as pubmedscrapper
@@ -20,22 +20,23 @@ urlpatterns = patterns('',
    url(r'^auth/$', views.login_user),
    url(r'^auth/logout/$', views.logout_user),

    # Dynamic CSS
    url(r'^img/logo.svg$', views.logo),
    url(r'^css/bootstrap.css$', views.css),

    # User Home view
    url(r'^$', views.home_view),
    url(r'^about/', views.get_about),
    url(r'^maintenance/', views.get_maintenance),

    # Project Management
    url(r'^projects/$', views.projects),
    url(r'^project/(\d+)/$', views_optimized.project),
    url(r'^delete/(\d+)$', views.delete_node), # => api.node('id' = id, children = 'True', copies = False)

    # Corpus management
    url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
    url(r'^project/(\d+)/corpus/(\d+)/corpus.csv$', views.corpus_csv),
    url(r'^project/(\d+)/corpus/(tests_mvc_listdocuments+)/corpus.tests_mvc_listdocuments$', views.corpus_csv),
@@ -47,16 +48,19 @@ urlpatterns = patterns('',
    url(r'^project/(\d+)/corpus/(\d+)/matrix$', views.matrix),

    # Data management
    url(r'^chart/corpus/(\d+)/data.csv$', views.send_csv), # => api.node.children('type' : 'data', 'format' : 'csv')
    url(r'^corpus/(\d+)/node_link.json$', views.node_link), # => api.analysis('type': 'node_link', 'format' : 'json')
    url(r'^corpus/(\d+)/adjacency.json$', views.adjacency), # => api.analysis('type': 'adjacency', 'format' : 'json')

    url(r'^api/tfidf/(\d+)/(\w+)$', views_optimized.tfidf),
    # url(r'^api/tfidf/(\d+)/(\w+)$', views.tfidf),
    url(r'^api/tfidf2/(\d+)/(\w+)$', views.tfidf2),

    # Data management
    #url(r'^api$', gargantext_web.api.Root), # = ?
    url(r'^api/nodes$', gargantext_web.api.NodesList.as_view()),
    url(r'^api/nodes/(\d+)$', gargantext_web.api.Nodes.as_view()),
    url(r'^api/nodes/(\d+)/children/ngrams$', gargantext_web.api.NodesChildrenNgrams.as_view()), # => repeated children ?
    url(r'^api/nodes/(\d+)/children/metadata$', gargantext_web.api.NodesChildrenMetatadata.as_view()),
    url(r'^api/nodes/(\d+)/children/queries$', gargantext_web.api.NodesChildrenQueries.as_view()),
    url(r'^api/nodes/(\d+)/children/duplicates$', gargantext_web.api.NodesChildrenDuplicates.as_view()),
@@ -66,12 +70,13 @@ urlpatterns = patterns('',
    url(r'^api/nodes/(\d+)/ngrams$', gargantext_web.api.CorpusController.ngrams),

    # Provisory tests
    url(r'^ngrams$', views.ngrams), # to be removed
    url(r'^nodeinfo/(\d+)$', views.nodeinfo), # to be removed ?
    url(r'^tests/mvc$', views.tests_mvc),
    url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),

    url(r'^tests/istextquery$', pubmedscrapper.getGlobalStatsISTEXT), # api/query?type=istext ?
    url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
    url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
    url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
@@ -90,3 +95,15 @@ if settings.DEBUG:
        }),
    )
if settings.MAINTENANCE:
    urlpatterns = patterns('',
        url(r'^img/logo.svg$', views.logo),
        url(r'^css/bootstrap.css$', views.css),
        url(r'^$', views.home_view),
        url(r'^about/', views.get_about),
        url(r'^.*', views.get_maintenance),
    )
from django.shortcuts import redirect
from django.shortcuts import render
from django.db import transaction

from django.http import Http404, HttpResponse, HttpResponseRedirect, HttpResponseForbidden
from django.template.loader import get_template
from django.template import Context

from node import models
#from node.models import Language, ResourceType, Resource, \
#    Node, NodeType, Node_Resource, Project, Corpus, \
#    Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram

from node.admin import CorpusForm, ProjectForm, ResourceForm, CustomForm
@@ -25,10 +26,12 @@ from django import forms

from collections import defaultdict

from parsing.FileParsers import *
import os

# SOME FUNCTIONS

from gargantext_web import settings

from django.http import *
from django.shortcuts import render_to_response, redirect
from django.template import RequestContext
@@ -37,6 +40,12 @@ from django.contrib.auth import authenticate, login, logout

from scrap_pubmed.admin import Logger

from gargantext_web.db import *
from sqlalchemy import or_, func

from gargantext_web import about
def login_user(request):
    logout(request)
    username = password = ''
@@ -50,9 +59,7 @@ def login_user(request):
            if user.is_active:
                login(request, user)
                return HttpResponseRedirect('/projects/')

    return render_to_response('authentication.html', context_instance=RequestContext(request))
@@ -72,7 +79,7 @@ def logo(request):
    svg_data = template.render(Context({\
        'color': color,\
    }))
    return HttpResponse(svg_data, content_type="image/svg+xml")
def css(request):
    template = get_template('bootstrap.css')
@@ -147,7 +154,6 @@ def date_range(start_dt, end_dt = None, format=None):

# SOME VIEWS

def get_about(request):
    '''
    About Gargantext, the team and sponsors
    '''
@@ -183,8 +189,8 @@ def get_maintenance(request):
    return HttpResponse(html)
from gargantext_web import home

def home_view(request):
    '''
    Home describes the platform.
    A video draws the narratives.
@@ -197,6 +203,9 @@ def home(request):
    html = t.render(Context({\
        'user': user,\
        'date': date,\
        'paragraph_gargantua': home.paragraph_gargantua(),\
        'paragraph_lorem': home.paragraph_lorem(),\
        'paragraph_tutoreil': home.paragraph_tutoreil(),\
    }))
    return HttpResponse(html)
@@ -209,16 +218,17 @@ def projects(request):
    '''
    if not request.user.is_authenticated():
        return redirect('/auth/')

    t = get_template('projects.html')

    user_id = cache.User[request.user.username].id
    project_type_id = cache.NodeType['Project'].id

    date = datetime.datetime.now()
    print(Logger.write("STATIC_ROOT"))

    projects = session.query(Node).filter(Node.user_id == user_id, Node.type_id == project_type_id).order_by(Node.date).all()
    number = len(projects)

    form = ProjectForm()
@@ -227,7 +237,9 @@ def projects(request):
        # TODO : protect from sql injection here
        name = str(request.POST['name'])
        if name != "":
            new_project = Project(name=name, type_id=project_type_id, user_id=user_id)
            session.add(new_project)
            session.commit()
            return HttpResponseRedirect('/projects/')
    else:
        form = ProjectForm()
@@ -240,190 +252,6 @@ def projects(request):
        })
def corpus(request, project_id, corpus_id):
    if not request.user.is_authenticated():
        return redirect('/login/?next=%s' % request.path)
@@ -433,77 +261,25 @@ def corpus(request, project_id, corpus_id):
        offset = str(corpus_id)
    except ValueError:
        raise Http404()

    t = get_template('corpus.html')
    user = request.user
    date = datetime.datetime.now()

    project = cache.Node[int(project_id)]
    corpus = cache.Node[int(corpus_id)]

    type_doc_id = cache.NodeType['Document'].id
    number = session.query(func.count(Node.id)).filter(Node.parent_id==corpus_id, Node.type_id==type_doc_id).all()[0][0]
    try:
        chart = dict()
        chart['first'] = parse(corpus.children.first().metadata['publication_date']).strftime("%Y, %m, %d")
        # TODO write with sqlalchemy
        #chart['first'] = parse(session.query(Node.metadata['publication_date']).filter(Node.parent_id==corpus.id, Node.type_id==type_doc_id).first()).strftime("%Y, %m, %d")
        chart['last'] = parse(corpus.children.last().metadata['publication_date']).strftime("%Y, %m, %d")

        print(chart)
    except Exception as error:
@@ -550,12 +326,12 @@ def subcorpus(request, project_id, corpus_id, start, end):
    user = request.user
    date = datetime.datetime.now()

    project = session.query(Node).filter(Node.id==project_id).first()
    corpus = session.query(Node).filter(Node.id==corpus_id).first()
    type_document_id = cache.NodeType['Document'].id

    # retrieving all the documents
    # documents = corpus.children.all()
    documents = session.query(Node).filter(Node.parent_id==corpus_id, Node.type_id == type_document_id).all()
    number = len(documents)

    filtered_docs = []
...@@ -667,25 +443,82 @@ def subcorpusJSON(request, project_id, corpus_id, start , end ): ...@@ -667,25 +443,82 @@ def subcorpusJSON(request, project_id, corpus_id, start , end ):
return HttpResponse( serializer.data , content_type='application/json') return HttpResponse( serializer.data , content_type='application/json')
def empty_trash():
nodes = models.Node.objects.filter(type_id=cache.NodeType['Trash'].id).all()
with transaction.atomic():
for node in nodes:
try:
                node.children.all().delete()
except Exception as error:
print(error)
            node.delete()

def delete_project(request, node_id):
Node.objects.filter(id=node_id).all().delete()
return HttpResponseRedirect('/projects/')
def move_to_trash(node_id):
try:
node = session.query(Node).filter(Node.id == node_id).first()
previous_type_id = node.type_id
node.type_id = cache.NodeType['Trash'].id
session.add(node)
session.commit()
return(previous_type_id)
    except Exception as error:
        print("Cannot move Node %s to trash: %s" % (node_id, error))
def delete_node(request, node_id):
# do we have a valid user?
user = request.user
node = session.query(Node).filter(Node.id == node_id).first()
if not user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
if node.user_id != user.id:
return HttpResponseForbidden()
previous_type_id = move_to_trash(node_id)
if previous_type_id == cache.NodeType['Corpus'].id:
return HttpResponseRedirect('/project/' + str(node.parent_id))
else:
return HttpResponseRedirect('/projects/')
    if settings.DEBUG:
        empty_trash()
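# Editor's note: a quick usage sketch of the soft-delete flow above (the node id
# is hypothetical). move_to_trash() only retypes the node; empty_trash() does the
# actual deletion of every node typed 'Trash':
#
#     previous_type_id = move_to_trash(42)   # node 42 becomes a 'Trash' node
#     empty_trash()                          # hard-deletes node 42 (and its children)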
def delete_corpus(request, project_id, corpus_id):
Node.objects.filter(id=corpus_id).all().delete()
return HttpResponseRedirect('/project/' + project_id)
def delete_corpus(request, project_id, node_id):
# ORM Django
with transaction.atomic():
node = models.Node.objects.get(id=node_id)
try:
            node.children.all().delete()
except Exception as error:
print(error)
node.delete()
# SQLA Django
# node = session.query(Node).filter(Node.id == node_id).first()
# session.delete(node)
# session.commit()
# session.flush()
return HttpResponseRedirect('/project/' + project_id)
def chart(request, project_id, corpus_id):
    ''' Charts to compare, filter, count'''
    if MAINTENANCE: return HttpResponseRedirect('/maintenance/')
    t = get_template('chart.html')
    user = request.user
    date = datetime.datetime.now()
    project = session.query(Node).filter(Node.id==project_id).first()
    corpus = session.query(Node).filter(Node.id==corpus_id).first()

    html = t.render(Context({
        'user' : user,
...@@ -696,13 +529,12 @@ def chart(request, project_id, corpus_id):
    return HttpResponse(html)
def matrix(request, project_id, corpus_id):
    if MAINTENANCE: return HttpResponseRedirect('/maintenance/')
    t = get_template('matrix.html')
    user = request.user
    date = datetime.datetime.now()
    project = session.query(Node).filter(Node.id==project_id).first()
    corpus = session.query(Node).filter(Node.id==corpus_id).first()

    html = t.render(Context({\
        'user' : user,\
...@@ -714,29 +546,24 @@ def matrix(request, project_id, corpus_id):
    return HttpResponse(html)
def graph(request, project_id, corpus_id):
    if MAINTENANCE: return HttpResponseRedirect('/maintenance/')
    t = get_template('explorer.html')
    user = request.user
    date = datetime.datetime.now()
    project = session.query(Node).filter(Node.id==project_id).first()
    corpus = session.query(Node).filter(Node.id==corpus_id).first()

    html = t.render(Context({\
        'user' : user,\
        'date' : date,\
        'corpus' : corpus,\
        'project' : project,\
        'graphfile' : "hola_mundo",\
    }))

    return HttpResponse(html)
def exploration(request):
    if MAINTENANCE: return HttpResponseRedirect('/maintenance/')
    t = get_template('exploration.html')
    user = request.user
    date = datetime.datetime.now()
...@@ -749,7 +576,6 @@ def exploration(request):
    return HttpResponse(html)

def explorer_chart(request):
    if MAINTENANCE: return HttpResponseRedirect('/maintenance/')
    t = get_template('chart.html')
    user = request.user
    date = datetime.datetime.now()
...@@ -773,9 +599,9 @@ def corpus_csv(request, project_id, corpus_id):
    writer = csv.writer(response)

    corpus_id = session.query(Node.id).filter(Node.id==corpus_id).scalar()
    type_document_id = cache.NodeType['Document'].id
    documents = session.query(Node).filter(Node.parent_id==corpus_id, Node.type_id==type_document_id).all()

    keys = list(documents[0].metadata.keys())
    writer.writerow(keys)
...@@ -792,8 +618,6 @@ def corpus_csv(request, project_id, corpus_id):
    return response
def send_csv(request, corpus_id):
    '''
    Create the HttpResponse object with the appropriate CSV header.
...@@ -806,9 +630,9 @@ def send_csv(request, corpus_id):
    cursor.execute("""
        SELECT
            metadata ->> 'publication_year' as year,
            metadata ->> 'publication_month' as month,
            metadata ->> 'publication_day' as day,
            COUNT(*)
        FROM
            node_node AS n
...@@ -834,23 +658,26 @@ def send_csv(request, corpus_id):
    return response
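# Editor's note on the `->` vs `->>` change above: with jsonb, `->` yields a jsonb
# value and `->>` yields text, so `->>` is the right operator when the result is
# written out as CSV. Illustrative SQL (same table and keys as the query above):
#
#     SELECT metadata ->  'publication_year' FROM node_node;  -- jsonb: "2014"
#     SELECT metadata ->> 'publication_year' FROM node_node;  -- text:  2014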
# To get the data
from gargantext_web.api import JsonHttpResponse
from analysis.functions import get_cooc
import json

def node_link(request, corpus_id):
    '''
    Create the HttpResponse object with the node_link dataset.
    '''
    data = []

    corpus = session.query(Node).filter(Node.id==corpus_id).first()
    filename = settings.MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (request.user, corpus.parent_id, corpus_id)
    print("file exists?:", os.path.isfile(filename))
    if os.path.isfile(filename):
        json_data = open(filename, "r")
        data = json.load(json_data)
        json_data.close()
    else:
        data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
    return JsonHttpResponse(data)
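# Editor's note: node_link() above only *reads* the cached JSON; nothing shown in
# this diff writes it. A minimal sketch of how such a cache file could be produced
# (hypothetical helper -- the filename scheme is copied from node_link above):
def write_node_link_cache(request, corpus_id):
    corpus = session.query(Node).filter(Node.id==corpus_id).first()
    filename = settings.MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (request.user, corpus.parent_id, corpus_id)
    data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
    with open(filename, 'w') as cache_file:
        json.dump(data, cache_file)
    return data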
def adjacency(request, corpus_id):
...@@ -926,6 +753,35 @@ def nodeinfo(request , node_id):
    return HttpResponse(html)
def tfidf2(request, corpus_id, ngram_id):
"""
    Takes IDs of corpus and ngram and returns a list of relevant documents in JSON format
according to TFIDF score (order is decreasing).
"""
#it will receive something like: api/tfidf/corpus_id/NGRAM1aNGRAM2aNGRAM3aNGRAM4...
docsids = ngram_id.split("a")
tfidf_list = []
for i in docsids:
pub = Node.objects.get(id=i)
finalpub = {}
finalpub["id"] = pub.id
pubmetadata = pub.metadata
if "title" in pubmetadata: finalpub["title"] = pubmetadata['title']
if "publication_date" in pubmetadata: finalpub["publication_date"] = pubmetadata['publication_date']
if "journal" in pubmetadata: finalpub["journal"] = pubmetadata['journal']
if "authors" in pubmetadata: finalpub["authors"] = pubmetadata['authors']
if "fields" in pubmetadata: finalpub["fields"] = pubmetadata['fields']
        tfidf_list.append(finalpub) # build a dict containing only the available attributes
if len(tfidf_list)==6: break # max 6 papers
data = json.dumps(tfidf_list)
# data = ["hola","mundo"]
return JsonHttpResponse(data)
def tfidf(request, corpus_id, ngram_id):
    """
    Takes IDs of corpus and ngram and returns a list of relevant documents in JSON format
......
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect, HttpResponseForbidden
from sqlalchemy import func, and_, or_
from sqlalchemy.orm import aliased
from collections import defaultdict
from datetime import datetime
from threading import Thread
from node.admin import CustomForm
from gargantext_web.db import *
from gargantext_web.settings import DEBUG, MEDIA_ROOT
from gargantext_web.api import JsonHttpResponse
import json
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
def project(request, project_id):
# SQLAlchemy session
session = Session()
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.type_id == cache.NodeType['Project'].id)
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
# Let's find out about the children nodes of the project
ChildrenNode = aliased(Node)
    # This query returns the wrong number of docs from the pubmed querier (x5):
    # grouping with sqlalchemy.func by Resource.type_id is the culprit.
    # ISSUE L51
corpus_query = (session
.query(Node.id, Node.name, func.count(ChildrenNode.id))
#.query(Node.id, Node.name, Resource.type_id, func.count(ChildrenNode.id))
#.join(Node_Resource, Node_Resource.node_id == Node.id)
#.join(Resource, Resource.id == Node_Resource.resource_id)
.filter(Node.parent_id == project.id)
.filter(Node.type_id == cache.NodeType['Corpus'].id)
.filter(and_(ChildrenNode.parent_id == Node.id, ChildrenNode.type_id == cache.NodeType['Document'].id))
.group_by(Node.id, Node.name)
.order_by(Node.name)
.all()
)
corpora_by_resourcetype = defaultdict(list)
documents_count_by_resourcetype = defaultdict(int)
corpora_count = 0
corpusID_dict = {}
for corpus_id, corpus_name, document_count in corpus_query:
# Not optimized GOTO ISSUE L51
resource_type_id = (session.query(Resource.type_id)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
.join(Node, Node.id == Node_Resource.node_id )
.filter(Node.id==corpus_id)
.first())[0]
        if corpus_id not in corpusID_dict:
if resource_type_id is None:
resourcetype_name = '(no resource)'
else:
resourcetype = cache.ResourceType[resource_type_id]
resourcetype_name = resourcetype.name
corpora_by_resourcetype[resourcetype_name].append({
'id': corpus_id,
'name': corpus_name,
'count': document_count,
})
documents_count_by_resourcetype[resourcetype_name] += document_count
corpora_count += 1
corpusID_dict[corpus_id]=True
# do the donut
total_documents_count = sum(documents_count_by_resourcetype.values())
donut = [
{ 'source': key,
'count': value,
'part' : round(value * 100 / total_documents_count) if total_documents_count else 0,
}
for key, value in documents_count_by_resourcetype.items()
]
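# Editor's note: a worked example of the donut computation above, with made-up
# counts -- 60 pubmed docs and 40 europress_french docs give parts of 60% and 40%:
#
#     documents_count_by_resourcetype = {'pubmed': 60, 'europress_french': 40}
#     total = sum(documents_count_by_resourcetype.values())            # 100
#     [round(v * 100 / total) for v in documents_count_by_resourcetype.values()]
#     # -> [60, 40]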
# deal with the form
if request.method == 'POST':
# form validation
form = CustomForm(request.POST, request.FILES)
if form.is_valid():
# extract information from the form
name = form.cleaned_data['name']
thefile = form.cleaned_data['file']
resourcetype = cache.ResourceType[form.cleaned_data['type']]
# which default language shall be used?
if resourcetype.name == "europress_french":
language_id = cache.Language['fr'].id
elif resourcetype.name == "europress_english":
language_id = cache.Language['en'].id
else:
language_id = None
            # corpus node instantiation (SQLAlchemy model)
corpus = Node(
name = name,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = language_id,
)
session.add(corpus)
session.commit()
# save the uploaded file
filepath = '%s/corpora/%s/%s' % (MEDIA_ROOT, request.user.username, thefile._name)
f = open(filepath, 'wb')
f.write(thefile.read())
f.close()
# add the uploaded resource to the corpus
add_resource(corpus,
user_id = request.user.id,
type_id = resourcetype.id,
file = filepath,
)
# let's start the workflow
try:
def apply_workflow(corpus):
parse_resources(corpus)
extract_ngrams(corpus, ['title'])
compute_tfidf(corpus)
if DEBUG:
apply_workflow(corpus)
else:
thread = Thread(target=apply_workflow, args=(corpus, ), daemon=True)
thread.start()
except Exception as error:
print('WORKFLOW ERROR')
print(error)
# redirect to the main project page
return HttpResponseRedirect('/project/' + str(project_id))
else:
print('ERROR: BAD FORM')
else:
form = CustomForm()
# HTML output
return render(request, 'project.html', {
'form' : form,
'user' : user,
'date' : datetime.now(),
'project' : project,
'donut' : donut,
'list_corpora' : dict(corpora_by_resourcetype),
'whitelists' : '',
'blacklists' : '',
'cooclists' : '',
'number' : corpora_count,
})
def tfidf(request, corpus_id, ngram_ids):
"""Takes IDs of corpus and ngram and returns list of relevent documents in json format
according to TFIDF score (order is decreasing).
"""
limit=6
nodes_list = []
# filter input
ngram_ids = ngram_ids.split('a')
ngram_ids = [int(i) for i in ngram_ids]
# request data
nodes_query = (session
.query(Node, func.sum(NodeNodeNgram.score))
.join(NodeNodeNgram, NodeNodeNgram.nodey_id == Node.id)
.filter(NodeNodeNgram.nodex_id == corpus_id)
.filter(NodeNodeNgram.ngram_id.in_(ngram_ids))
.group_by(Node)
.order_by(func.sum(NodeNodeNgram.score).desc())
.limit(limit)
)
# convert query result to a list of dicts
for node, score in nodes_query:
node_dict = {
'id': node.id,
'score': score,
}
for key in ('title', 'publication_date', 'journal', 'authors', 'fields'):
if key in node.metadata:
node_dict[key] = node.metadata[key]
nodes_list.append(node_dict)
data = json.dumps(nodes_list)
return JsonHttpResponse(data)
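# Editor's note: a usage sketch for tfidf() above. The ngram ids arrive as a
# single 'a'-separated path segment, e.g. /api/tfidf/<corpus_id>/105a233 (the
# ids shown are made up), so:
#
#     ngram_ids = '105a233'.split('a')         # -> ['105', '233']
#     ngram_ids = [int(i) for i in ngram_ids]  # -> [105, 233]
#
# The response is a JSON list of at most `limit` (6) documents, ordered by
# decreasing summed TFIDF score.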
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata DROP NOT NULL ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata DROP DEFAULT ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata TYPE JSONB USING hstore_to_json(metadata)::jsonb ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata SET DEFAULT '{}'::jsonb ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata SET NOT NULL ;
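-- Editor's note: hstore_to_json() requires the hstore extension; after this
-- migration, per-key access switches from hstore's `->` to jsonb's `->`/`->>`.
-- A quick sanity check that the column really is jsonb now (PostgreSQL >= 9.4):
--     SELECT jsonb_typeof(metadata) FROM node_node LIMIT 1 ;  -- expect 'object'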
...@@ -7,7 +7,7 @@
#NodeType.objects.all().delete()
from node.models import *
import pycountry
...@@ -31,14 +31,8 @@ except: ...@@ -31,14 +31,8 @@ except:
    me = User(username='pksm3')
    me.save()
for node_type in ['Trash', 'Root', ]:
    NodeType.objects.get_or_create(name=node_type)
try:
    typeProject = NodeType.objects.get(name='Project')
...@@ -141,13 +135,7 @@ except Exception as error:
#Node.objects.all().delete()
# In[9]:
try:
project = Node.objects.get(name='Bees project')
except:
project = Node(name='Bees project', type=typeProject, user=me)
project.save()
try:
    stem = Node.objects.get(name='Stem')
...@@ -158,3 +146,17 @@ except:
from gargantext_web.db import *
# Instantiante table NgramTag:
f = open("part_of_speech_labels.txt", 'r')
for line in f.readlines():
name, description = line.strip().split('\t')
_tag = Tag(name=name, description=description)
session.add(_tag)
session.commit()
f.close()
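# Editor's note: part_of_speech_labels.txt is expected to be tab-separated, one
# tag per line (the full list appears later in this commit), e.g.:
#
#     CC	Coordinating conjunction
#     CD	Cardinal number
#
# so line.strip().split('\t') yields the (name, description) pair used above.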
...@@ -4,6 +4,11 @@ psql -d gargandb -f init.sql
sleep 2
../manage.py syncdb
psql -d gargandb -f init2.sql
sleep 2
#../manage.py shell < init.py
../manage.py shell < init_gargantext.py
#psql -d gargandb -f hstore2jsonb.sql
# Without this, we couldn't use the Django environment
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")
# We're gonna use all the models!
# Django models
from node import models
# SQLA models
from gargantext_web.db import *
# Reset: all data
#
#tables_to_empty = [
# Node,
# Node_Metadata,
# Metadata,
# NodeType,
# ResourceType,
# Resource,
#]
#for table in tables_to_empty:
# print('Empty table "%s"...' % (table._meta.db_table, ))
# table.objects.all().delete()
# Integration: metadata types
print('Initialize metadata...')
metadata = {
'publication_date': 'datetime',
'authors': 'string',
'language_fullname': 'string',
'abstract': 'text',
'title': 'string',
'source': 'string',
'volume': 'string',
'text': 'text',
'page': 'string',
'doi': 'string',
'journal': 'string',
}
for name, type in metadata.items():
models.Metadata(name=name, type=type).save()
# Integration: languages
print('Initialize languages...')
import pycountry
Language.objects.all().delete()
for language in pycountry.languages:
if 'alpha2' in language.__dict__:
Language(
iso2 = language.alpha2,
iso3 = language.bibliographic,
fullname = language.name,
implemented = 1 if language.alpha2 in ['en', 'fr'] else 0,
).save()
english = Language.objects.get(iso2='en')
french = Language.objects.get(iso2='fr')
# Integration: users
print('Initialize users...')
# get_or_create() returns an (object, created) tuple, so unpack it
me, _ = models.User.objects.get_or_create(username='alexandre')
gargantua, _ = models.User.objects.get_or_create(username='gargantua')

# commit the root first, so node_root.id is populated before being used as a parent
node_root = Node(user_id=gargantua.id, type_id=cache.NodeType['Root'].id, name='Root')
session.add(node_root)
session.commit()

node_stem = Node(user_id=gargantua.id, type_id=cache.NodeType['Stem'].id, name='Stem', parent_id=node_root.id)
node_lem = Node(user_id=gargantua.id, type_id=cache.NodeType['Lem'].id, name='Lem', parent_id=node_root.id)
session.add(node_stem)
session.add(node_lem)
session.commit()
# Integration: node types
print('Initialize node types...')
node_types = [
'Root', 'Trash',
'Project', 'Corpus', 'Document',
'Stem', 'Lem', 'Tfidf',
'Synonym',
'MiamList', 'StopList',
'Cooccurrence', 'WhiteList', 'BlackList'
]
for node_type in node_types:
models.NodeType.objects.get_or_create(name=node_type)
# Integration: resource types
print('Initialize resource...')
resources = [
'pubmed', 'isi', 'ris', 'europress_french', 'europress_english']
for resource in resources:
models.ResourceType.objects.get_or_create(name=resource)
# TODO
# here some tests
# add a new project and some corpora to test it
# Integration: project
#
#print('Initialize project...')
#try:
# project = Node.objects.get(name='Bees project')
#except:
# project = Node(name='Bees project', type=typeProject, user=me)
# project.save()
#
# Integration: corpus
#print('Initialize corpus...')
#try:
# corpus_pubmed = Node.objects.get(name='PubMed corpus')
#except:
# corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)
# corpus_pubmed.save()
#
#print('Initialize resource...')
#corpus_pubmed.add_resource(
# # file='./data_samples/pubmed.zip',
# #file='./data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
# file='/srv/gargantext_lib/data_samples/pubmed.xml',
# type=typePubmed,
# user=me
#)
#
#for resource in corpus_pubmed.get_resources():
# print('Resource #%d - %s - %s' % (resource.id, resource.digest, resource.file))
#
## print('Parse corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.parse_resources(verbose=True)
# print('Extract corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.children.all().extract_ngrams(['title',])
# print('Parsed corpus #%d.' % (corpus_pubmed.id, ))
# Instantiante table NgramTag:
f = open("part_of_speech_labels.txt", 'r')
for line in f.readlines():
name, description = line.strip().split('\t')
_tag = Tag(name=name, description=description)
session.add(_tag)
session.commit()
f.close()
exit()
CC Coordinating conjunction
CD Cardinal number
DT Determiner
EX Existential there
FW Foreign word
IN Preposition or subordinating conjunction
JJ Adjective
JJR Adjective, comparative
JJS Adjective, superlative
LS List item marker
MD Modal
NN Noun, singular or mass
NNS Noun, plural
NNP Proper noun, singular
NNPS Proper noun, plural
PDT Predeterminer
POS Possessive ending
PRP Personal pronoun
PRP$ Possessive pronoun
RB Adverb
RBR Adverb, comparative
RBS Adverb, superlative
RP Particle
SYM Symbol
TO to
UH Interjection
VB Verb, base form
VBD Verb, past tense
VBG Verb, gerund or present participle
VBN Verb, past participle
VBP Verb, non-3rd person singular present
VBZ Verb, 3rd person singular present
WDT Wh-determiner
WP Wh-pronoun
WP$ Possessive wh-pronoun
WRB Wh-adverb
NGRA Ngram
Cython==0.20.2
Django==1.6.11
Jinja2==2.7.3
MarkupSafe==0.23
Pillow==2.5.3
Pygments==1.6
RandomWords==0.1.12
SQLAlchemy==0.9.8
South==1.0
aldjemy==0.3.10
amqp==1.4.6
anyjson==0.3.3
bibtexparser==0.6.0
billiard==3.3.0.18
celery==3.1.15
certifi==14.05.14
...@@ -23,15 +25,20 @@ django-cte-trees==0.9.2
django-extensions==1.4.0
django-grappelli==2.5.3
django-hstore==1.3.1
django-maintenance==0.1
django-mptt==0.6.1
django-nested-inlines==0.1
django-pgfields==1.4.4
django-pgjson==0.2.2
django-pgjsonb==0.0.10
django-treebeard==2.0
djangorestframework==3.0.0
gensim==0.10.3
graphviz==0.4
ipython==2.2.0
kombu==3.0.23
lxml==3.4.1
#matplotlib==1.4.0
networkx==1.9
#nltk==3.0a4
nose==1.3.4
...@@ -44,13 +51,16 @@ pycparser==2.10
pydot2==1.0.33
pyparsing==2.0.2
python-dateutil==2.2
python-igraph==0.7
pytz==2014.7
pyzmq==14.3.1
readline==6.2.4.1
redis==2.10.3
scikit-learn==0.15.1
scipy==0.14.0
simplerandom==0.12.1
six==1.7.3
sympy==0.7.5
tornado==4.0.1
uWSGI==2.0.7
ujson==1.33
# Without this, we couldn't use the Django environment
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")
# We're gonna use all the models!
from node.models import User, NodeType, Node
user = User.objects.get(username = 'contro2015.lait')
# Reset: all data
try:
typeDoc = NodeType.objects.get(name='Cooccurrence')
except Exception as error:
print(error)
Node.objects.filter(user=user, type=typeDoc).all().delete()
exit()
# Without this, we couldn't use the Django environment
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")
# We're gonna use all the models!
from node.models import *
# Reset: all data
tables_to_empty = [
Node,
Node_Metadata,
Metadata,
NodeType,
ResourceType,
Resource,
]
for table in tables_to_empty:
print('Empty table "%s"...' % (table._meta.db_table, ))
table.objects.all().delete()
# Integration: metadata types
print('Initialize metadata...')
metadata = {
'publication_date': 'datetime',
'authors': 'string',
'language_fullname': 'string',
'abstract': 'text',
'title': 'string',
'source': 'string',
'volume': 'string',
'text': 'text',
'page': 'string',
'doi': 'string',
'journal': 'string',
}
for name, type in metadata.items():
Metadata(name=name, type=type).save()
# Integration: languages
print('Initialize languages...')
import pycountry
Language.objects.all().delete()
for language in pycountry.languages:
if 'alpha2' in language.__dict__:
Language(
iso2 = language.alpha2,
iso3 = language.bibliographic,
fullname = language.name,
implemented = 1 if language.alpha2 in ['en', 'fr'] else 0,
).save()
english = Language.objects.get(iso2='en')
french = Language.objects.get(iso2='fr')
# Integration: users
print('Initialize users...')
try:
me = User.objects.get(username='alexandre')
except:
me = User(username='alexandre')
me.save()
# Integration: node types
print('Initialize node types...')
try:
typeProject = NodeType.objects.get(name='Root')
except Exception as error:
print(error)
typeProject = NodeType(name='Root')
typeProject.save()
try:
typeProject = NodeType.objects.get(name='Project')
except Exception as error:
print(error)
typeProject = NodeType(name='Project')
typeProject.save()
try:
typeCorpus = NodeType.objects.get(name='Corpus')
except Exception as error:
print(error)
typeCorpus = NodeType(name='Corpus')
typeCorpus.save()
try:
typeDoc = NodeType.objects.get(name='Document')
except Exception as error:
print(error)
typeDoc = NodeType(name='Document')
typeDoc.save()
try:
typeStem = NodeType.objects.get(name='Stem')
except Exception as error:
print(error)
typeStem = NodeType(name='Stem')
typeStem.save()
try:
typeTfidf = NodeType.objects.get(name='Tfidf')
except Exception as error:
print(error)
typeTfidf = NodeType(name='Tfidf')
typeTfidf.save()
try:
typeDoc = NodeType.objects.get(name='WhiteList')
except Exception as error:
print(error)
typeDoc = NodeType(name='WhiteList')
typeDoc.save()
try:
typeDoc = NodeType.objects.get(name='BlackList')
except Exception as error:
print(error)
typeDoc = NodeType(name='BlackList')
typeDoc.save()
try:
typeDoc = NodeType.objects.get(name='Synonyme')
except Exception as error:
print(error)
typeDoc = NodeType(name='Synonyme')
typeDoc.save()
try:
typeDoc = NodeType.objects.get(name='Cooccurrence')
except Exception as error:
print(error)
typeDoc = NodeType(name='Cooccurrence')
typeDoc.save()
# Integration: resource types
print('Initialize resource...')
try:
typePubmed = ResourceType.objects.get(name='pubmed')
typeIsi = ResourceType.objects.get(name='isi')
typeRis = ResourceType.objects.get(name='ris')
typePresseFr = ResourceType.objects.get(name='europress_french')
typePresseEn = ResourceType.objects.get(name='europress_english')
except Exception as error:
print(error)
typePubmed = ResourceType(name='pubmed')
typePubmed.save()
typeIsi = ResourceType(name='isi')
typeIsi.save()
typeRis = ResourceType(name='ris')
typeRis.save()
typePresseFr = ResourceType(name='europress_french')
typePresseFr.save()
typePresseEn = ResourceType(name='europress_english')
typePresseEn.save()
# Integration Node Stem
try:
stem = Node.objects.get(name='Stem')
except:
stem = Node(name='Stem', type=typeStem, user=me)
stem.save()
# Integration: project
print('Initialize project...')
try:
project = Node.objects.get(name='Bees project')
except:
project = Node(name='Bees project', type=typeProject, user=me)
project.save()
# Integration: corpus
print('Initialize corpus...')
try:
corpus_pubmed = Node.objects.get(name='PubMed corpus')
except:
corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)
corpus_pubmed.save()
print('Initialize resource...')
corpus_pubmed.add_resource(
# file='./data_samples/pubmed.zip',
#file='./data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
file='/srv/gargantext_lib/data_samples/pubmed.xml',
type=typePubmed,
user=me
)
for resource in corpus_pubmed.get_resources():
print('Resource #%d - %s - %s' % (resource.id, resource.digest, resource.file))
# print('Parse corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.parse_resources(verbose=True)
# print('Extract corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.children.all().extract_ngrams(['title',])
# print('Parsed corpus #%d.' % (corpus_pubmed.id, ))
exit()
...@@ -117,23 +117,16 @@ class CustomForm(forms.Form):
    """
    def clean_file(self):
        file_ = self.cleaned_data.get('file')
        from datetime import datetime
        file_.name = str(datetime.now().microsecond)
        # #Filename length
        # if len(file_.name)>30:
        #     from datetime import datetime
        #     file_.name = str(datetime.now().microsecond)
        #     # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
        #File size
        if len(file_)>1024 ** 3:
            raise forms.ValidationError(_('File too heavy! (>1GB).'))
        ## File type:
        # if file_.content_type == "application/zip":
        #     raise forms.ValidationError(_('We need a zip pls.'))
        return file_

class CorpusForm(ModelForm):
    #parent = ModelChoiceField(EmptyQuerySet)
    def __init__(self, *args, **kwargs):
...@@ -155,14 +148,14 @@ class CorpusAdmin(NodeAdmin):

######################################################################

#class DocumentForm(ModelForm):
#    parent = ModelChoiceField(Node.objects.filter(user_id=1, type_id=3))

class DocumentAdmin(NodeAdmin):
    _parent_nodetype_name = 'Corpus'
    _nodetype_name = 'Document'
    form = DocumentForm

#class DocumentAdmin(NodeAdmin):
#    _parent_nodetype_name = 'Corpus'
#    _nodetype_name = 'Document'
#    form = DocumentForm
#
class LanguageAdmin(admin.ModelAdmin):
    def get_queryset(self, request):
...@@ -178,7 +171,7 @@ admin.site.register(Language, LanguageAdmin)
admin.site.register(NodeType)
admin.site.register(Project, ProjectAdmin)
admin.site.register(Corpus, CorpusAdmin)
admin.site.register(Document)#, DocumentAdmin)
admin.site.register(Node_Resource)
......
from django_pg import models
from django.utils import timezone
from django.contrib.auth.models import User
from django_pgjson.fields import JsonBField
from cte_tree.models import CTENode, CTENodeManager
# from cte_tree.query import CTEQuerySet
#from cte_tree.fields import DepthField, PathField, OrderingField
...@@ -14,7 +15,9 @@ from parsing.FileParsers import *
from time import time
import datetime
from multiprocessing import Process
from math import log
import collections
from collections import defaultdict
import hashlib
...@@ -23,6 +26,9 @@ from gargantext_web.settings import MEDIA_ROOT
from celery.contrib.methods import task_method
from celery import current_app

import os
import subprocess

# Some usefull functions
# TODO: start the function name with an underscore (private)
...@@ -35,7 +41,7 @@ class Language(models.Model):
    iso2 = models.CharField(max_length=2, unique=True)
    iso3 = models.CharField(max_length=3, unique=True)
    fullname = models.CharField(max_length=255, unique=True)
    implemented = models.BooleanField(blank=True, default=True)

    def __str__(self):
        return self.fullname
...@@ -46,15 +52,35 @@ class ResourceType(models.Model):
    def __str__(self):
        return self.name

class Tag(models.Model):
    name = models.CharField(max_length=4, unique=True)
    description = models.CharField(max_length=255, unique=True)

    def __str__(self):
        return self.name

class Ngram(models.Model):
    language = models.ManyToManyField(blank=True, null=True, through='NgramLanguage', to='Language')
    n = models.IntegerField()
    terms = models.CharField(max_length=255, unique=True)
    nodes = models.ManyToManyField(through='Node_Ngram', to='Node')
    tag = models.ManyToManyField(blank=True, null=True, through='NgramTag', to='Tag')

    def __str__(self):
        return self.terms

class NgramTag(models.Model):
    ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
    tag = models.ForeignKey(Tag)

    def __str__(self):
        return "%s: %s" % (self.ngram.terms, self.tag.name)

class NgramLanguage(models.Model):
    ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
    language = models.ForeignKey(Language)

    def __str__(self):
        return "%s: %s" % (self.ngram.terms, self.language.fullname)

class Resource(models.Model):
    user = models.ForeignKey(User)
...@@ -66,7 +92,7 @@ class Resource(models.Model):
        return self.file

class NodeType(models.Model):
    name = models.CharField(max_length=255, unique=True)

    def __str__(self):
        return self.name
...@@ -88,7 +114,7 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
                if key in metadata_cache:
                    metadata = metadata_cache[key]
                    if metadata.type == 'string':
                        value = value[:200]
                    data.append(Node_Metadata(**{
                        'node_id' : node.id,
                        'metadata_id' : metadata.id,
...@@ -107,7 +133,7 @@ class NodeManager(CTENodeManager):
        return getattr(self.get_queryset(), name, *args)

class Metadata(models.Model):
    name = models.CharField(max_length=32, unique=True)
    type = models.CharField(max_length=16, db_index=True)

class Node(CTENode):
...@@ -116,12 +142,12 @@ class Node(CTENode):
    user = models.ForeignKey(User)
    type = models.ForeignKey(NodeType)
    name = models.CharField(max_length=255)

    language = models.ForeignKey(Language, blank=True, null=True, on_delete=models.SET_NULL)

    date = models.DateField(default=timezone.now, blank=True)
    metadata = JsonBField(null=False, default={})

    ngrams = models.ManyToManyField(through='Node_Ngram', to='Ngram')
...@@ -221,21 +247,16 @@ class Node(CTENode):
        associations = defaultdict(float) # float or int?
        if isinstance(keys, dict):
            for key, weight in keys.items():
                text2process = str(self.metadata[key]).replace('[','').replace(']','')
                for ngram in extractor.extract_ngrams(text2process):
                    terms = ' '.join([token for token, tag in ngram])
                    associations[ngram] += weight
        else:
            for key in keys:
                text2process = str(self.metadata[key]).replace('[','').replace(']','')
                for ngram in extractor.extract_ngrams(text2process):
                    terms = ' '.join([token for token, tag in ngram])
                    associations[terms] += 1
        Node_Ngram.objects.bulk_create([
            Node_Ngram(
                node = self,
...@@ -281,150 +302,7 @@ class Node(CTENode):
        print("In workflow() END")
        self.metadata['Processing'] = 0
        self.save()
def parse_resources__MOV(self, verbose=False):
# parse all resources into a list of metadata
metadata_list = []
print("not parsed resources:")
print(self.node_resource.filter(parsed=False))
print("= = = = = = = = = = =\n")
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
'istext' : ISText,
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
})[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
self.node_resource.update(parsed=True) #writing to DB
return metadata_list
def writeMetadata__MOV(self, metadata_list=None , verbose=False):
type_id = NodeType.objects.get(name='Document').id
user_id = self.user.id
langages_cache = LanguagesCache()
# # insert the new resources in the database!
for i, metadata_values in enumerate(metadata_list):
name = metadata_values.get('title', '')[:200]
language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
if isinstance(language, tuple):
language = language[0]
Node(
user_id = user_id,
type_id = type_id,
name = name,
parent = self,
language_id = language.id if language else None,
metadata = metadata_values
).save()
metadata_list[i]["thelang"] = language
# # make metadata filterable
self.children.all().make_metadata_filterable()
# # mark the resources as parsed for this node
self.node_resource.update(parsed=True)
def extract_ngrams__MOV(self, array , keys , ngramsextractorscache=None, ngramscaches=None):
if ngramsextractorscache is None:
ngramsextractorscache = NgramsExtractorsCache()
langages_cache = LanguagesCache()
if ngramscaches is None:
ngramscaches = NgramsCaches()
for metadata in array:
associations = defaultdict(float) # float or int?
language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
if isinstance(language, tuple):
language = language[0]
metadata["thelang"] = language
extractor = ngramsextractorscache[language]
ngrams = ngramscaches[language]
# print("\t\t number of req keys:",len(keys)," AND isdict?:",isinstance(keys, dict))
if isinstance(keys, dict):
for key, weight in keys.items():
if key in metadata:
for ngram in extractor.extract_ngrams(metadata[key]):
terms = ' '.join([token for token, tag in ngram])
associations[ngram] += weight
else:
for key in keys:
if key in metadata:
# print("the_content:[[[[[[__",metadata[key],"__]]]]]]")
for ngram in extractor.extract_ngrams(metadata[key]):
terms = ' '.join([token for token, tag in ngram])
associations[terms] += 1
if len(associations.items())>0:
Node_Ngram.objects.bulk_create([
Node_Ngram(
node = self,
ngram = ngrams[ngram_text],
weight = weight
)
for ngram_text, weight in associations.items()
])
# for ngram_text, weight in associations.items():
# print("ngram_text:",ngram_text," | weight:",weight, " | ngrams[ngram_text]:",ngrams[ngram_text])
def runInParallel(self, *fns):
proc = []
for fn in fns:
p = Process(target=fn)
p.start()
proc.append(p)
for p in proc:
p.join()
def workflow__MOV(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
import time
total = 0
self.metadata['Processing'] = 1
self.save()
print("LOG::TIME: In workflow() parse_resources__MOV()")
start = time.time()
theMetadata = self.parse_resources__MOV()
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources()__MOV [s]",(end - start))
print("LOG::TIME: In workflow() writeMetadata__MOV()")
start = time.time()
self.writeMetadata__MOV( metadata_list=theMetadata )
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))
print("LOG::TIME: In workflow() extract_ngrams__MOV()")
start = time.time()
self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] )
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams__MOV() [s]",(end - start))
# # this is not working
# self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
start = time.time()
print("LOG::TIME: In workflow() do_tfidf()")
from analysis.functions import do_tfidf
do_tfidf(self)
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
# # print("LOG::TIME: In workflow() / do_tfidf()")
print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" In workflow() END")
self.metadata['Processing'] = 0
self.save()
class Node_Metadata(models.Model):
    node = models.ForeignKey(Node, on_delete=models.CASCADE)
...@@ -437,7 +315,7 @@ class Node_Metadata(models.Model):

class Node_Resource(models.Model):
    node = models.ForeignKey(Node, related_name='node_resource', on_delete=models.CASCADE)
    resource = models.ForeignKey(Resource, on_delete=models.CASCADE)
    parsed = models.BooleanField(default=False)

class Node_Ngram(models.Model):
...@@ -495,15 +373,11 @@ class NodeNodeNgram(models.Model):
    def __str__(self):
        return "%s: %s / %s = %s" % (self.nodex.name, self.nodey.name, self.ngram.terms, self.score)
class NodeNodeNgram(models.Model):
    nodex = models.ForeignKey(Node, related_name="nodex", on_delete=models.CASCADE)
    nodey = models.ForeignKey(Node, related_name="nodey", on_delete=models.CASCADE)
    ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
    score = models.FloatField(default=0)

    def __str__(self):
        return "%s: %s / %s = %s" % (self.nodex.name, self.nodey.name, self.ngram.terms, self.score)

class NgramNgram(models.Model):
    ngram = models.ForeignKey(Ngram, related_name='ngram', on_delete=models.CASCADE)
    token = models.ForeignKey(Ngram, related_name='token', on_delete=models.CASCADE)
    index = models.IntegerField()
...@@ -41,18 +41,47 @@ class EuropressFileParser(FileParser):
            html = etree.fromstring(contents, html_parser)

            try:
                format_europresse = 50
                html_articles = html.xpath('/html/body/table/tbody')
                if len(html_articles) < 1:
                    html_articles = html.xpath('/html/body/table')
                if len(html_articles) < 1:
                    format_europresse = 1
                    html_articles = html.xpath('//div[@id="docContain"]')
            except Exception as error:
                print(error)

            if format_europresse == 50:
                name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
                header_xpath = "//span[@class = 'DocHeader']"
                title_xpath = "string(./tr/td/span[@class = 'TitreArticleVisu'])"
                text_xpath = "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
            elif format_europresse == 1:
                name_xpath = "//span[@class = 'DocPublicationName']"
                header_xpath = "//span[@class = 'DocHeader']"
                title_xpath = "string(//div[@class = 'titreArticleVisu'])"
                text_xpath = "./descendant::*[\
                    not(\
                        self::div[@class='Doc-SourceText'] \
                        or self::span[@class='DocHeader'] \
                        or self::span[@class='DocPublicationName'] \
                        or self::span[@id='docNameVisu'] \
                        or self::span[@class='DocHeader'] \
                        or self::div[@class='titreArticleVisu'] \
                        or self::span[@id='docNameContType'] \
                        or descendant-or-self::span[@id='ucPubliC_lblCertificatIssuedTo'] \
                        or descendant-or-self::span[@id='ucPubliC_lblEndDate'] \
                        or self::td[@class='txtCertificat'] \
                    )]/text()"
                doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"

        except Exception as error:
            print(error)

        # initialize the list of metadata
        metadata_list = []

        # parse all the articles, one by one
        try:
            for html_article in html_articles:
...@@ -60,19 +89,20 @@ class EuropressFileParser(FileParser):
                metadata = {}
                if len(html_article):
                    for name in html_article.xpath(name_xpath):
                        if name.text is not None:
                            format_journal = re.compile('(.*), (.*)', re.UNICODE)
                            test_journal = format_journal.match(name.text)
                            if test_journal is not None:
                                metadata['journal'] = test_journal.group(1)
                                metadata['volume'] = test_journal.group(2)
                            else:
                                metadata['journal'] = name.text.encode(codif)

                    for header in html_article.xpath(header_xpath):
                        try:
                            text = header.text
                            #print("header", text)
                        except Exception as error:
                            print(error)
...@@ -138,8 +168,8 @@ class EuropressFileParser(FileParser):
                    if test_page is not None:
                        metadata['page'] = test_page.group(1).encode(codif)

                metadata['title'] = html_article.xpath(title_xpath).encode(codif)
                metadata['abstract'] = html_article.xpath(text_xpath)

                line = 0
                br_tag = 10
...@@ -185,32 +215,36 @@ class EuropressFileParser(FileParser):
                    metadata['publication_year'] = metadata['publication_date'].strftime('%Y')
                    metadata['publication_month'] = metadata['publication_date'].strftime('%m')
                    metadata['publication_day'] = metadata['publication_date'].strftime('%d')
                metadata.pop('publication_date')

                if len(metadata['abstract'])>0 and format_europresse == 50:
                    metadata['doi'] = str(metadata['abstract'][-9])
                    metadata['abstract'].pop()
                    # Here add separator for paragraphs
                    metadata['abstract'] = str(' '.join(metadata['abstract']))
                    metadata['abstract'] = str(re.sub('Tous droits réservés.*$', '', metadata['abstract']))
                elif format_europresse == 1:
                    metadata['doi'] = ' '.join(html_article.xpath(doi_xpath))
                    metadata['abstract'] = metadata['abstract'][:-9]
                    # Here add separator for paragraphs
                    metadata['abstract'] = str(' '.join(metadata['abstract']))
                else:
                    metadata['doi'] = "not found"

                metadata['length_words'] = len(metadata['abstract'].split(' '))
                metadata['length_letters'] = len(metadata['abstract'])

                metadata['bdd'] = u'europresse'
                metadata['url'] = u''

                #metadata_str = {}
                for key, value in metadata.items():
                    metadata[key] = value.decode() if isinstance(value, bytes) else value
                yield metadata
                count += 1

        except Exception as error:
            print(error)
            pass
...@@ -103,15 +103,21 @@ class FileParser:
            zipArchive = zipfile.ZipFile(file)
            for filename in zipArchive.namelist():
                try:
                    f = zipArchive.open(filename, 'r')
                    metadata_list += self.parse(f)
                    f.close()
                except Exception as error:
                    print(error)
        # ...otherwise, let's parse it directly!
        else:
            try:
                for metadata in self._parse(file):
                    metadata_list.append(self.format_metadata(metadata))
                if hasattr(file, 'close'):
                    file.close()
            except Exception as error:
                print(error)
        # return the list of formatted metadata
        return metadata_list
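# Editor's note: a usage sketch for parse() above -- zip handling is transparent,
# since the method recurses into each archive member. The path is hypothetical
# and the parser must be a concrete subclass such as RisFileParser:
#
#     parser = RisFileParser()
#     with open('/srv/gargantext_lib/data_samples/sample.ris', 'rb') as source:
#         metadata_list = parser.parse(source)
#     for metadata in metadata_list:
#         print(metadata.get('title'))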
...@@ -25,6 +25,7 @@ class PubmedFileParser(FileParser):
        metadata_path = {
            "journal" : 'MedlineCitation/Article/Journal/Title',
            "title" : 'MedlineCitation/Article/ArticleTitle',
            "abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
            "title2" : 'MedlineCitation/Article/VernacularTitle',
            "language_iso3" : 'MedlineCitation/Article/Language',
            "doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
...@@ -101,7 +102,6 @@ class PubmedFileParser(FileParser):
            if "realdate_day_" in metadata: metadata.pop("realdate_day_")
            if "title2" in metadata: metadata.pop("title2")
            metadata_list.append(metadata)
        # return the list of metadata
        return metadata_list
...@@ -17,42 +17,34 @@ class RisFileParser(FileParser): ...@@ -17,42 +17,34 @@ class RisFileParser(FileParser):
} }
def _parse(self, file): def _parse(self, file):
metadata_list = []
metadata = {} metadata = {}
last_key = None last_key = None
last_values = [] last_values = []
# browse every line of the file
for line in file: for line in file:
if len(line) > 2: if len(line) > 2:
# extract the parameter key
parameter_key = line[:2] parameter_key = line[:2]
# print(parameter_key)
if parameter_key != b' ' and parameter_key != last_key: if parameter_key != b' ' and parameter_key != last_key:
if last_key in self._parameters: if last_key in self._parameters:
# translate the parameter key
parameter = self._parameters[last_key] parameter = self._parameters[last_key]
if parameter["type"] == "metadata": if parameter["type"] == "metadata":
separator = parameter["separator"] if "separator" in parameter else "" separator = parameter["separator"] if "separator" in parameter else ""
metadata[parameter["key"]] = separator.join(last_values) metadata[parameter["key"]] = separator.join(last_values)
elif parameter["type"] == "delimiter": elif parameter["type"] == "delimiter":
#language = self._languages_fullname[metadata["language"].lower()] if 'language_fullname' not in metadata.keys():
#print(metadata) if 'language_iso3' not in metadata.keys():
try: if 'language_iso2' not in metadata.keys():
#print("append") metadata['language_iso2'] = 'en'
if 'language_fullname' not in metadata.keys(): yield metadata
if 'language_iso3' not in metadata.keys(): metadata = {}
if 'language_iso2' not in metadata.keys():
metadata['language_iso2'] = 'en'
metadata_list.append(metadata)
metadata = {}
#print("append succeeded")
except:
pass
last_key = parameter_key last_key = parameter_key
last_values = [] last_values = []
try: try:
last_values.append(line[3:-1].decode()) last_values.append(line[3:-1].decode())
except Exception as error: except Exception as error:
print(error) print(error)
pass # if a metadata object is left in memory, yield it as well
#print(len(metadata_list)) if metadata:
#print(metadata_list) yield metadata
return metadata_list
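With this change the RIS parser becomes a generator: each document's metadata dict is yielded as soon as its delimiter line is reached, with a final yield for the record still in memory, instead of accumulating everything in metadata_list. A minimal consumer sketch — the import path and file name are illustrative, and callers normally go through the public FileParser.parse() wrapper rather than _parse():

    from parsing.FileParsers.RisFileParser import RisFileParser  # path assumed

    parser = RisFileParser()
    with open('refs.ris', 'rb') as ris_file:        # RIS tags are read as bytes
        for metadata in parser._parse(ris_file):
            # one dict per bibliographic record: title, language_iso2, ...
            print(metadata.get('title'), metadata.get('language_iso2'))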
from ..Taggers import Tagger from ..Taggers import TurboTagger
import nltk import nltk
...@@ -13,12 +13,13 @@ class NgramsExtractor: ...@@ -13,12 +13,13 @@ class NgramsExtractor:
self.start() self.start()
self._label = "NP" self._label = "NP"
self._rule = self._label + ": " + rule self._rule = self._label + ": " + rule
self._grammar = nltk.RegexpParser(self._rule)
def __del__(self): def __del__(self):
self.stop() self.stop()
def start(self): def start(self):
self.tagger = Tagger() self.tagger = TurboTagger()
def stop(self): def stop(self):
pass pass
...@@ -29,19 +30,8 @@ class NgramsExtractor: ...@@ -29,19 +30,8 @@ class NgramsExtractor:
""" """
def extract_ngrams(self, contents): def extract_ngrams(self, contents):
tagged_ngrams = self.tagger.tag_text(contents) tagged_ngrams = self.tagger.tag_text(contents)
if len(tagged_ngrams)==0: return [] if len(tagged_ngrams):
grammar_parsed = self._grammar.parse(tagged_ngrams)
grammar = nltk.RegexpParser(self._rule) for subtree in grammar_parsed.subtrees():
result = [] if subtree.label() == self._label:
# try: yield subtree.leaves()
grammar_parsed = grammar.parse(tagged_ngrams)
for subtree in grammar_parsed.subtrees():
if subtree.label() == self._label:
result.append(subtree.leaves())
# except Exception as e:
# print("Problem while parsing rule '%s'" % (self._rule, ))
# print(e)
return result
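extract_ngrams gets the same generator treatment, and the RegexpParser grammar is now compiled once in the constructor rather than on every call. A sketch of the consumer side (EnglishNgramsExtractor is the TurboTagger-backed subclass imported below; a running tagger backend is assumed):

    from parsing.NgramsExtractors import EnglishNgramsExtractor

    extractor = EnglishNgramsExtractor()
    for leaves in extractor.extract_ngrams('The quick brown fox jumps over the lazy dog.'):
        # each item is the list of (token, tag) leaves of one NP subtree
        print(' '.join(token for token, tag in leaves).lower())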
from .FrenchNgramsExtractor import FrenchNgramsExtractor from .FrenchNgramsExtractor import FrenchNgramsExtractor
from .TurboNgramsExtractor import TurboNgramsExtractor as EnglishNgramsExtractor from .TurboNgramsExtractor import TurboNgramsExtractor as EnglishNgramsExtractor
from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor # from .EnglishNgramsExtractor import EnglishNgramsExtractor
#from .NgramsExtractor import NgramsExtractor from .NgramsExtractor import NgramsExtractor
...@@ -58,9 +58,11 @@ class Tagger: ...@@ -58,9 +58,11 @@ class Tagger:
if single: if single:
self.tagging_end() self.tagging_end()
return [] return []
"""Send a text to be tagged. """Send a text to be tagged.
""" """
# Not used right now
def tag_text(self, text): def tag_text(self, text):
tokens_tags = [] tokens_tags = []
self.tagging_start() self.tagging_start()
...@@ -69,4 +71,3 @@ class Tagger: ...@@ -69,4 +71,3 @@ class Tagger:
tokens_tags += self.tag_tokens(tokens, False) tokens_tags += self.tag_tokens(tokens, False)
self.tagging_end() self.tagging_end()
return tokens_tags return tokens_tags
...@@ -9,15 +9,24 @@ from .settings import implemented_methods ...@@ -9,15 +9,24 @@ from .settings import implemented_methods
class NLPClient: class NLPClient:
def __init__(self): def __init__(self):
self._socket = socket.socket(*server_type_client) self._socket = None
self._socket.connect((server_host, server_port))
for method_name in dir(self): for method_name in dir(self):
if method_name[0] != '_': if method_name[0] != '_':
if method_name.upper() not in implemented_methods: if method_name.upper() not in implemented_methods:
setattr(self, method_name, self._notimplemented) setattr(self, method_name, self._notimplemented)
def __del__(self): def __del__(self):
self._socket.close() self._disconnect()
def _connect(self):
self._disconnect()
self._socket = socket.socket(*server_type_client)
self._socket.connect((server_host, server_port))
def _disconnect(self):
if self._socket is not None:
self._socket.close()
self._socket = None
def _notimplemented(self, *args, **kwargs): def _notimplemented(self, *args, **kwargs):
raise NotImplementedError( raise NotImplementedError(
...@@ -51,7 +60,7 @@ class NLPClient: ...@@ -51,7 +60,7 @@ class NLPClient:
data += language + '\n' data += language + '\n'
data += re.sub(r'\n+', '\n', text) data += re.sub(r'\n+', '\n', text)
data += '\n\n' data += '\n\n'
self.__init__() self._connect()
self._socket.sendall(data.encode()) self._socket.sendall(data.encode())
sentence = [] sentence = []
if keys is None: if keys is None:
...@@ -73,7 +82,6 @@ class NLPClient: ...@@ -73,7 +82,6 @@ class NLPClient:
continue continue
values = line.split('\t') values = line.split('\t')
sentence.append(dict(zip(keys, line.split('\t')))) sentence.append(dict(zip(keys, line.split('\t'))))
self.__del__()
def tokenize(self, text, language='english', asdict=False): def tokenize(self, text, language='english', asdict=False):
keys = ('token', ) if asdict else None keys = ('token', ) if asdict else None
......
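After this refactoring, NLPClient no longer opens a socket in its constructor: each request goes through _connect() and _disconnect(), so instantiating the client while the server is down is harmless and a failed request cannot leak a half-open socket. A hedged usage sketch (import path assumed; host and port come from .settings):

    from parsing.Taggers.nlpserver.client import NLPClient  # path assumed

    client = NLPClient()            # no connection opened yet
    try:
        for sentence in client.tokenize('Hello world. How are you?'):
            print(sentence)         # connects, streams the reply, disconnects
    except ConnectionRefusedError:
        print('NLP server is not listening on the configured port')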
...@@ -4,7 +4,7 @@ import socketserver ...@@ -4,7 +4,7 @@ import socketserver
# Server parameters # Server parameters
server_host = 'localhost' server_host = 'localhost'
server_port = 1234 server_port = 7777
server_type_server = socketserver.TCPServer server_type_server = socketserver.TCPServer
server_type_client = socket.AF_INET, socket.SOCK_STREAM server_type_client = socket.AF_INET, socket.SOCK_STREAM
server_timeout = 2.0 server_timeout = 2.0
......
from collections import defaultdict
from datetime import datetime
from random import random
from hashlib import md5
from time import time
from math import log
from gargantext_web.db import *
from .FileParsers import *
class DebugTime:
def __init__(self, prefix):
self.prefix = prefix
self.message = None
self.time = None
def __del__(self):
if self.message is not None and self.time is not None:
print('%s - %s: %.4f' % (self.prefix, self.message, time() - self.time))
def show(self, message):
self.__del__()
self.message = message
self.time = time()
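# Usage sketch: each show() first flushes the previous timer (via the
# explicit __del__ call), then arms a new one, so the final step is printed
# when the object is garbage-collected and no explicit stop is needed:
#
#   dbg = DebugTime('Corpus #42')
#   dbg.show('parse documents')   # arms the first timer
#   ...                           # work happens here
#   dbg.show('index metadata')    # prints "Corpus #42 - parse documents: 0.1234"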
# keep all the parsers in a cache
class Parsers(defaultdict):
_parsers = {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
}
def __missing__(self, key):
if key not in self._parsers:
raise NotImplementedError('No such parser: "%s"' % (key))
parser = self._parsers[key]()
self[key] = parser
return parser
parsers = Parsers()
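# Usage sketch: parser instances are built on first access and memoized,
# so repeated lookups return the same object; unknown keys fail loudly:
#
#   parser = parsers['pubmed']     # first access instantiates PubmedFileParser
#   parsers['pubmed'] is parser    # True: cached afterwards
#   parsers['nonexistent']         # raises NotImplementedError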
# resources management
def add_resource(corpus, **kwargs):
# only for tests
session = Session()
resource = Resource(guid=str(random()), **kwargs )
# User
if 'user_id' not in kwargs:
resource.user_id = corpus.user_id
# Compute the digest
h = md5()
with open(str(resource.file), 'rb') as f:
    h.update(f.read())
resource.digest = h.hexdigest()
# check if a resource on this node already has this hash
tmp_resource = (session
.query(Resource)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
.filter(Resource.digest == resource.digest)
.filter(Node_Resource.node_id == corpus.id)
).first()
if tmp_resource is not None:
return tmp_resource
else:
session.add(resource)
session.commit()
# link with the resource
node_resource = Node_Resource(
node_id = corpus.id,
resource_id = resource.id,
parsed = False,
)
session.add(node_resource)
session.commit()
# return result
return resource
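# Usage sketch (file path illustrative): adding the same file twice to one
# corpus is idempotent thanks to the md5 digest check above:
#
#   r1 = add_resource(corpus, type_id=resourcetype.id, file='/tmp/batch1.xml')
#   r2 = add_resource(corpus, type_id=resourcetype.id, file='/tmp/batch1.xml')
#   assert r1.id == r2.id          # same digest on the same node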
def parse_resources(corpus, user=None, user_id=None):
dbg = DebugTime('Corpus #%d - parsing' % corpus.id)
session = Session()
corpus_id = corpus.id
type_id = cache.NodeType['Document'].id
if user_id is None:
    # prefer an explicitly passed user, else fall back to the corpus owner
    # (the previous else branch discarded any user_id passed by the caller)
    user_id = user.id if user is not None else corpus.user_id
# find resource of the corpus
resources_query = (session
.query(Resource, ResourceType)
.join(ResourceType, ResourceType.id == Resource.type_id)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
.filter(Node_Resource.node_id == corpus.id)
.filter(Node_Resource.parsed == False)
)
# make a new node for every parsed document of the corpus
dbg.show('analyze documents')
nodes = list()
for resource, resourcetype in resources_query:
parser = parsers[resourcetype.name]
for metadata_dict in parser.parse(resource.file):
# retrieve language ID from metadata
if 'language_iso2' in metadata_dict:
try:
language_id = cache.Language[metadata_dict['language_iso2']].id
except KeyError:
language_id = None
else:
language_id = None
# create new node
node = Node(
name = metadata_dict.get('title', '')[:200],
parent_id = corpus_id,
user_id = user_id,
type_id = type_id,
language_id = language_id,
metadata = metadata_dict,
date = datetime.utcnow(),
)
nodes.append(node)
#
# TODO: mark node-resources associations as parsed
#
dbg.show('insert %d documents' % len(nodes))
session.add_all(nodes)
session.commit()
# now, index the metadata
dbg.show('insert metadata')
node_metadata_lists = defaultdict(list)
metadata_types = {
metadata.name: metadata
for metadata in session.query(Metadata)
}
for node in nodes:
node_id = node.id
for metadata_key, metadata_value in node.metadata.items():
try:
metadata = metadata_types[metadata_key]
except KeyError:
# this key has no Metadata row, so it cannot be indexed: skip it silently
continue
if metadata.type == 'string':
metadata_value = metadata_value[:255]
node_metadata_lists[metadata.type].append((
node_id,
metadata.id,
metadata_value,
))
for key, values in node_metadata_lists.items():
bulk_insert(Node_Metadata, ['node_id', 'metadata_id', 'value_'+key], values)
# mark the corpus as parsed
corpus.parsed = True
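# Typical call order for a freshly created corpus (the same chain that
# doTheQuery() in scrap_pubmed.views runs after downloading files):
#
#   parse_resources(corpus)             # documents -> Node rows + metadata
#   extract_ngrams(corpus, ['title'])   # defined below
#   compute_tfidf(corpus)               # defined below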
# ngrams extraction
from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
class NgramsExtractors(defaultdict):
def __init__(self):
# English
self['en'] = EnglishNgramsExtractor()
for key in ('eng', 'english'):
self[key] = self['en']
# French
self['fr'] = FrenchNgramsExtractor()
for key in ('fre', 'french'):
self[key] = self['fr']
# default
self['default'] = NgramsExtractor()
def __missing__(self, key):
formatted_key = key.strip().lower()
if formatted_key in self:
self[key] = self[formatted_key]
else:
self[key] = self['default']
# raise NotImplementedError
return self[key]
ngramsextractors = NgramsExtractors()
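# Usage sketch: keys are normalized lazily and cached, and unknown
# languages fall back to the default extractor instead of raising:
#
#   ngramsextractors['en']        # TurboTagger-backed English extractor
#   ngramsextractors['French ']   # stripped/lowercased -> same as 'fr'
#   ngramsextractors['xx']        # default NgramsExtractor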
def extract_ngrams(corpus, keys):
dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
# query the metadata associated with the given keys
session = Session()
columns = [Node.id, Node.language_id] + [Node.metadata[key] for key in keys]
metadata_query = (session
.query(*columns)
.filter(Node.parent_id == corpus.id)
.filter(Node.type_id == cache.NodeType['Document'].id)
)
# prepare data to be inserted
dbg.show('find ngrams')
languages_by_id = {
language.id: language.iso2
for language in session.query(Language)
}
ngrams_data = set()
ngrams_language_data = set()
ngrams_tag_data = set()
node_ngram_list = defaultdict(lambda: defaultdict(int))
for nodeinfo in metadata_query:
node_id = nodeinfo[0]
language_id = nodeinfo[1]
if language_id is None:
language_iso2 = default_language_iso2
else:
language_iso2 = languages_by_id.get(language_id, None)
if language_iso2 is None:
continue
ngramsextractor = ngramsextractors[language_iso2]
for text in nodeinfo[2:]:
if text is not None and len(text):
ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
for ngram in ngrams:
n = len(ngram)
terms = ' '.join([token for token, tag in ngram]).lower()
# TODO BUG (now guarded): unknown tags used to raise KeyError, and an
# empty ngram left tag_id undefined from the previous iteration
if n == 1:
    try:
        tag_id = cache.Tag[ngram[0][1]].id
    except KeyError:
        tag_id = cache.Tag['NN'].id  # fall back to the generic noun tag
elif n > 1:
    tag_id = cache.Tag['NN'].id
else:
    continue
node_ngram_list[node_id][terms] += 1
ngrams_data.add((n, terms))
ngrams_language_data.add((terms, language_id))
ngrams_tag_data.add((terms, tag_id))
# insert ngrams to temporary table
dbg.show('find ids for the %d ngrams' % len(ngrams_data))
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__ngrams (
id INT,
n INT NOT NULL,
terms VARCHAR(255) NOT NULL
)
''')
bulk_insert('tmp__ngrams', ['n', 'terms'], ngrams_data, cursor=cursor)
# resolve ids for terms already present in the ngram table
cursor.execute('''
UPDATE
tmp__ngrams
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngrams.terms
''' % (Ngram.__table__.name, ))
# insert, then get the ids back
cursor.execute('''
INSERT INTO
%s (n, terms)
SELECT
n, terms
FROM
tmp__ngrams
WHERE
id IS NULL
''' % (Ngram.__table__.name, ))
cursor.execute('''
UPDATE
tmp__ngrams
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngrams.terms
AND
tmp__ngrams.id IS NULL
''' % (Ngram.__table__.name, ))
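# Note: the three statements above emulate an upsert for ngram ids
# (PostgreSQL's ON CONFLICT did not exist yet):
#   1. the first UPDATE resolves ids for terms already in the ngram table;
#   2. the INSERT adds only the terms whose id is still NULL;
#   3. the second UPDATE picks up the ids generated by that INSERT.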
# get all ids
ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngrams')
for row in cursor.fetchall():
ngram_ids[row[1]] = row[0]
#
dbg.show('insert associations')
node_ngram_data = list()
for node_id, ngrams in node_ngram_list.items():
for terms, weight in ngrams.items():
try:
ngram_id = ngram_ids[terms]
node_ngram_data.append((node_id, ngram_id, weight, ))
except Exception as e:
print("err01:",e)
bulk_insert(Node_Ngram, ['node_id', 'ngram_id', 'weight'], node_ngram_data, cursor=cursor)
dbg.message = 'insert %d associations' % len(node_ngram_data)
# commit to database
db.commit()
# tfidf calculation
def compute_tfidf(corpus):
dbg = DebugTime('Corpus #%d - tfidf' % corpus.id)
# compute terms frequency sum
dbg.show('calculate terms frequencies sums')
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__st (
node_id INT NOT NULL,
frequency DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__st (node_id, frequency)
SELECT
node_ngram.node_id,
SUM(node_ngram.weight) AS frequency
FROM
%s AS node
INNER JOIN
%s AS node_ngram ON node_ngram.node_id = node.id
WHERE
node.parent_id = %d
GROUP BY
node_ngram.node_id
''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
# compute normalized terms frequencies
dbg.show('normalize terms frequencies')
cursor.execute('''
CREATE TEMPORARY TABLE tmp__tf (
node_id INT NOT NULL,
ngram_id INT NOT NULL,
frequency DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__tf (node_id, ngram_id, frequency)
SELECT
node_ngram.node_id,
node_ngram.ngram_id,
(node_ngram.weight / node.frequency) AS frequency
FROM
%s AS node_ngram
INNER JOIN
tmp__st AS node ON node.node_id = node_ngram.node_id
''' % (Node_Ngram.__table__.name, ))
# show off
dbg.show('compute idf')
cursor.execute('''
CREATE TEMPORARY TABLE tmp__idf (
ngram_id INT NOT NULL,
idf DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__idf(ngram_id, idf)
SELECT
node_ngram.ngram_id,
-ln(COUNT(*))
FROM
%s AS node
INNER JOIN
%s AS node_ngram ON node_ngram.node_id = node.id
WHERE
node.parent_id = %d
GROUP BY
node_ngram.ngram_id
''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
cursor.execute('SELECT COUNT(*) FROM tmp__st')
D = cursor.fetchone()[0]
if D>0:
lnD = log(D)
cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
# show off
dbg.show('insert tfidf for %d documents' % D)
cursor.execute('''
INSERT INTO
%s (nodex_id, nodey_id, ngram_id, score)
SELECT
%d AS nodex_id,
tf.node_id AS nodey_id,
tf.ngram_id AS ngram_id,
(tf.frequency * idf.idf) AS score
FROM
tmp__idf AS idf
INNER JOIN
tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
''' % (NodeNodeNgram.__table__.name, corpus.id, ))
# # show off
# cursor.execute('''
# SELECT
# node.name,
# ngram.terms,
# node_node_ngram.score AS tfidf
# FROM
# %s AS node_node_ngram
# INNER JOIN
# %s AS node ON node.id = node_node_ngram.nodey_id
# INNER JOIN
# %s AS ngram ON ngram.id = node_node_ngram.ngram_id
# WHERE
# node_node_ngram.nodex_id = %d
# ORDER BY
# score DESC
# ''' % (NodeNodeNgram.__table__.name, Node.__table__.name, Ngram.__table__.name, corpus.id, ))
# for row in cursor.fetchall():
# print(row)
# the end!
db.commit()
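For reference, the SQL above computes the classic tf-idf score: tmp__tf holds the document-normalized frequency w/Σw, and tmp__idf holds ln(D) − ln(df), built as -ln(df) and then shifted by ln(D). A self-contained sketch of the same arithmetic, with illustrative numbers:

    from math import log

    def tfidf(weight, doc_total, n_docs, doc_freq):
        # tf = w / Σw (tmp__tf);  idf = ln(D) - ln(df) (tmp__idf)
        return (weight / doc_total) * (log(n_docs) - log(doc_freq))

    # a term weighted 3 in a document of total weight 100,
    # appearing in 5 of the corpus's 1000 documents:
    print(tfidf(3, 100, 1000, 5))    # ~0.159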
...@@ -40,7 +40,7 @@ class MedlineFetcher: ...@@ -40,7 +40,7 @@ class MedlineFetcher:
"Get number of results for query 'query' in variable 'count'" "Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'" "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
print(query) # print(query)
origQuery = query origQuery = query
query = query.replace(' ', '%20') query = query.replace(' ', '%20')
...@@ -79,7 +79,7 @@ class MedlineFetcher: ...@@ -79,7 +79,7 @@ class MedlineFetcher:
queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors
print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results') # print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
retstart = 0 retstart = 0
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv) eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
...@@ -94,7 +94,7 @@ class MedlineFetcher: ...@@ -94,7 +94,7 @@ class MedlineFetcher:
def downloadFile(self, item): def downloadFile(self, item):
url = item[0] url = item[0]
filename = item[1] filename = item[1]
print("\tin test_downloadFile:") # print("\tin test_downloadFile:")
# print(url,filename) # print(url,filename)
data = urlopen(url) data = urlopen(url)
f = codecs.open(filename, "w" ,encoding='utf-8') f = codecs.open(filename, "w" ,encoding='utf-8')
...@@ -110,7 +110,7 @@ class MedlineFetcher: ...@@ -110,7 +110,7 @@ class MedlineFetcher:
def test_downloadFile(self, item): def test_downloadFile(self, item):
url = item[0] url = item[0]
filename = item[1] filename = item[1]
print("\tin downloadFile:") # print("\tin downloadFile:")
data = urlopen(url) data = urlopen(url)
return data return data
...@@ -119,7 +119,7 @@ class MedlineFetcher: ...@@ -119,7 +119,7 @@ class MedlineFetcher:
# time.sleep(1) # pretend to do some lengthy work. # time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item) returnvalue = self.medlineEsearch(item)
with self.lock: with self.lock:
print(threading.current_thread().name, item) # print(threading.current_thread().name, item)
return returnvalue return returnvalue
# The worker thread pulls an item from the queue and processes it # The worker thread pulls an item from the queue and processes it
...@@ -160,13 +160,13 @@ class MedlineFetcher: ...@@ -160,13 +160,13 @@ class MedlineFetcher:
N = 0 N = 0
print ("MedlineFetcher::serialFetcher :") # print ("MedlineFetcher::serialFetcher :")
thequeries = [] thequeries = []
globalresults = [] globalresults = []
for i in range(yearsNumber): for i in range(yearsNumber):
year = str(2015 - i) year = str(2015 - i)
print ('YEAR ' + year) # print ('YEAR ' + year)
print ('---------\n') # print ('---------\n')
pubmedquery = str(year) + '[dp] '+query pubmedquery = str(year) + '[dp] '+query
self.q.put( pubmedquery ) #put task in the queue self.q.put( pubmedquery ) #put task in the queue
...@@ -196,5 +196,6 @@ class MedlineFetcher: ...@@ -196,5 +196,6 @@ class MedlineFetcher:
retmax_forthisyear = int(round(globalLimit*proportion)) retmax_forthisyear = int(round(globalLimit*proportion))
query["retmax"] = retmax_forthisyear query["retmax"] = retmax_forthisyear
if query["retmax"]==0: query["retmax"]+=1 if query["retmax"]==0: query["retmax"]+=1
print(query["string"],"\t[",k,">",query["retmax"],"]")
return thequeries return thequeries
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect
from django.template.loader import get_template from django.template.loader import get_template
from django.template import Context from django.template import Context
from django.contrib.auth.models import User, Group from django.contrib.auth.models import User, Group
from scrap_pubmed.MedlineFetcherDavid2015 import MedlineFetcher from scrap_pubmed.MedlineFetcherDavid2015 import MedlineFetcher
from gargantext_web.api import JsonHttpResponse
from urllib.request import urlopen, urlretrieve from urllib.request import urlopen, urlretrieve
import json import json
from gargantext_web.settings import MEDIA_ROOT
# from datetime import datetime # from datetime import datetime
import time import time
import datetime import datetime
...@@ -21,9 +16,23 @@ import threading ...@@ -21,9 +16,23 @@ import threading
from django.core.files import File from django.core.files import File
from gargantext_web.settings import DEBUG from gargantext_web.settings import DEBUG
from node.models import Language, ResourceType, Resource, \
Node, NodeType, Node_Resource, Project, Corpus, \ from django.shortcuts import redirect
Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect, HttpResponseForbidden
from sqlalchemy import func
from sqlalchemy.orm import aliased
from collections import defaultdict
import threading
from node.admin import CustomForm
from gargantext_web.db import *
from gargantext_web.settings import DEBUG, MEDIA_ROOT
from gargantext_web.api import JsonHttpResponse
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
def getGlobalStats(request ): def getGlobalStats(request ):
...@@ -31,7 +40,7 @@ def getGlobalStats(request ): ...@@ -31,7 +40,7 @@ def getGlobalStats(request ):
alist = ["bar","foo"] alist = ["bar","foo"]
if request.method == "POST": if request.method == "POST":
N = 100 N = 1000
query = request.POST["query"] query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
...@@ -72,9 +81,34 @@ def getGlobalStatsISTEXT(request ): ...@@ -72,9 +81,34 @@ def getGlobalStatsISTEXT(request ):
def doTheQuery(request , project_id): def doTheQuery(request , project_id):
alist = ["hola","mundo"] alist = ["hola","mundo"]
if request.method == "POST": # SQLAlchemy session
session = Session()
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.type_id == cache.NodeType['Project'].id)
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
if request.method == "POST":
query = request.POST["query"] query = request.POST["query"]
name = request.POST["string"] name = request.POST["string"]
...@@ -86,30 +120,26 @@ def doTheQuery(request , project_id): ...@@ -86,30 +120,26 @@ def doTheQuery(request , project_id):
urlreqs.append( instancia.medlineEfetchRAW( yearquery ) ) urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
alist = ["tudo fixe" , "tudo bem"] alist = ["tudo fixe" , "tudo bem"]
""" resourcetype = cache.ResourceType["pubmed"]
urlreqs: List of urls to query.
- Then, to each url in urlreqs you do:
eFetchResult = urlopen(url)
eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
"""
thefile = "how we do this here?"
resource_type = ResourceType.objects.get(name="pubmed" )
parent = Node.objects.get(id=project_id)
node_type = NodeType.objects.get(name='Corpus')
type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id
# corpus node instanciation as a Django model
corpus = Node( corpus = Node(
user=request.user, name = name,
parent=parent, user_id = request.user.id,
type=node_type, parent_id = project_id,
name=name, type_id = cache.NodeType['Corpus'].id,
language_id = None,
) )
session.add(corpus)
session.commit()
# """
# urlreqs: List of urls to query.
# - Then, to each url in urlreqs you do:
# eFetchResult = urlopen(url)
# eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
# """
corpus.save()
tasks = MedlineFetcher() tasks = MedlineFetcher()
for i in range(8): for i in range(8):
...@@ -124,24 +154,30 @@ def doTheQuery(request , project_id): ...@@ -124,24 +154,30 @@ def doTheQuery(request , project_id):
dwnldsOK = 0 dwnldsOK = 0
for filename in tasks.firstResults: for filename in tasks.firstResults:
if filename!=False: if filename!=False:
corpus.add_resource( user=request.user, type=resource_type, file=filename ) # add the uploaded resource to the corpus
add_resource(corpus,
user_id = request.user.id,
type_id = resourcetype.id,
file = filename,
)
dwnldsOK+=1 dwnldsOK+=1
if dwnldsOK == 0: return JsonHttpResponse(["fail"]) if dwnldsOK == 0: return JsonHttpResponse(["fail"])
# do the WorkFlow
try: try:
if DEBUG is True: def apply_workflow(corpus):
corpus.workflow() parse_resources(corpus)
# corpus.workflow__MOV() extract_ngrams(corpus, ['title'])
compute_tfidf(corpus)
if DEBUG:
apply_workflow(corpus)
else: else:
corpus.workflow.apply_async((), countdown=3) thread = threading.Thread(target=apply_workflow, args=(corpus, ), daemon=True)
thread.start()
return JsonHttpResponse(["workflow","finished"])
except Exception as error: except Exception as error:
print('WORKFLOW ERROR')
print(error) print(error)
return HttpResponseRedirect('/project/' + str(project_id))
return JsonHttpResponse(["workflow","finished","outside the try-except"])
data = alist data = alist
return JsonHttpResponse(data) return JsonHttpResponse(data)
...@@ -164,59 +200,59 @@ def testISTEX(request , project_id): ...@@ -164,59 +200,59 @@ def testISTEX(request , project_id):
print(query_string , query , N) print(query_string , query , N)
urlreqs = [] # urlreqs = []
pagesize = 50 # pagesize = 50
tasks = MedlineFetcher() # tasks = MedlineFetcher()
chunks = list(tasks.chunks(range(N), pagesize)) # chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks: # for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0] # if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize)) # urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
print(urlreqs) # print(urlreqs)
urlreqs = ["http://localhost/374255" , "http://localhost/374278" ] # urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
print(urlreqs) # print(urlreqs)
resource_type = ResourceType.objects.get(name="istext" ) # resource_type = ResourceType.objects.get(name="istext" )
parent = Node.objects.get(id=project_id) # parent = Node.objects.get(id=project_id)
node_type = NodeType.objects.get(name='Corpus') # node_type = NodeType.objects.get(name='Corpus')
type_id = NodeType.objects.get(name='Document').id # type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id # user_id = User.objects.get( username=request.user ).id
corpus = Node( # corpus = Node(
user=request.user, # user=request.user,
parent=parent, # parent=parent,
type=node_type, # type=node_type,
name=query, # name=query,
) # )
corpus.save() # corpus.save()
# configuring your queue with the event # # configuring your queue with the event
for i in range(8): # for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do # t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits. # t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start() # t.start()
for url in urlreqs: # for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond)) # filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
tasks.q.put( [url , filename]) #put a task in th queue # tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished # tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults: # for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename ) # corpus.add_resource( user=request.user, type=resource_type, file=filename )
corpus.save() # corpus.save()
print("DEBUG:",DEBUG) # print("DEBUG:",DEBUG)
# do the WorkFlow # # do the WorkFlow
try: # try:
if DEBUG is True: # if DEBUG is True:
corpus.workflow() # corpus.workflow()
else: # else:
corpus.workflow.apply_async((), countdown=3) # corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"]) # return JsonHttpResponse(["workflow","finished"])
except Exception as error: # except Exception as error:
print(error) # print(error)
data = [query_string,query,N] data = [query_string,query,N]
return JsonHttpResponse(data) return JsonHttpResponse(data)
......
Project Gutenberg's Gargantua and Pantagruel, Complete., by Francois Rabelais
This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever. You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.net
Title: Gargantua and Pantagruel, Complete.
Five Books Of The Lives, Heroic Deeds And Sayings Of Gargantua And
His Son Pantagruel
Author: Francois Rabelais
Release Date: August 8, 2004 [EBook #1200]
Language: English
*** START OF THIS PROJECT GUTENBERG EBOOK GARGANTUA AND PANTAGRUEL, ***
Produced by Sue Asscher and David Widger
MASTER FRANCIS RABELAIS
FIVE BOOKS OF THE LIVES, HEROIC DEEDS AND SAYINGS OF
GARGANTUA AND HIS SON PANTAGRUEL
Translated into English by
Sir Thomas Urquhart of Cromarty
and
Peter Antony Motteux
The text of the first Two Books of Rabelais has been reprinted from the
first edition (1653) of Urquhart's translation. Footnotes initialled 'M.'
are drawn from the Maitland Club edition (1838); other footnotes are by the
translator. Urquhart's translation of Book III. appeared posthumously in
1693, with a new edition of Books I. and II., under Motteux's editorship.
Motteux's rendering of Books IV. and V. followed in 1708. Occasionally (as
the footnotes indicate) passages omitted by Motteux have been restored from
the 1738 copy edited by Ozell.
Chapter 1.I. Of the Genealogy and Antiquity of Gargantua.
Chapter 1.II. The Antidoted Fanfreluches: or, a Galimatia of extravagant Conceits found in an ancient Monument.
Chapter 1.III. How Gargantua was carried eleven months in his mother's belly.
Chapter 1.IV. How Gargamelle, being great with Gargantua, did eat a huge deal of tripes.
Chapter 1.IX. The colours and liveries of Gargantua.
Chapter 1.L. Gargantua's speech to the vanquished.
Chapter 1.LI. How the victorious Gargantuists were recompensed after the battle.
Chapter 1.LII. How Gargantua caused to be built for the Monk the Abbey of Theleme.
Chapter 1.LIII. How the abbey of the Thelemites was built and endowed.
Chapter 1.LIV. The inscription set upon the great gate of Theleme.
Chapter 1.LV. What manner of dwelling the Thelemites had.
Chapter 1.LVI. How the men and women of the religious order of Theleme were apparelled.
Chapter 1.LVII. How the Thelemites were governed, and of their manner of living.
Chapter 1.LVIII. A prophetical Riddle.
Chapter 1.V. The Discourse of the Drinkers.
Chapter 1.VI. How Gargantua was born in a strange manner.
Chapter 1.VII. After what manner Gargantua had his name given him, and how he tippled, bibbed, and curried the can.
Chapter 1.VIII. How they apparelled Gargantua.
Chapter 1.X. Of that which is signified by the colours white and blue.
Chapter 1.XI. Of the youthful age of Gargantua.
Chapter 1.XII. Of Gargantua's wooden horses.
Chapter 1.XIII. How Gargantua's wonderful understanding became known to his father Grangousier, by the invention of a torchecul or wipebreech.
Chapter 1.XIV. How Gargantua was taught Latin by a Sophister.
Chapter 1.XIX. The oration of Master Janotus de Bragmardo for recovery of the bells.
Chapter 1.XL. Why monks are the outcasts of the world; and wherefore some have bigger noses than others.
Chapter 1.XLI. How the Monk made Gargantua sleep, and of his hours and breviaries.
Chapter 1.XLII. How the Monk encouraged his fellow-champions, and how he hanged upon a tree.
Chapter 1.XLIII. How the scouts and fore-party of Picrochole were met with by Gargantua, and how the Monk slew Captain Drawforth, and then was taken prisoner by his enemies.
Chapter 1.XLIV. How the Monk rid himself of his keepers, and how Picrochole's forlorn hope was defeated.
Chapter 1.XLIX. How Picrochole in his flight fell into great misfortunes, and what Gargantua did after the battle.
Chapter 1.XLV. How the Monk carried along with him the Pilgrims, and of the good words that Grangousier gave them.
Chapter 1.XLVI. How Grangousier did very kindly entertain Touchfaucet his prisoner.
Chapter 1.XLVII. How Grangousier sent for his legions, and how Touchfaucet slew Rashcalf, and was afterwards executed by the command of Picrochole.
Chapter 1.XLVIII. How Gargantua set upon Picrochole within the rock Clermond, and utterly defeated the army of the said Picrochole.
Chapter 1.XV. How Gargantua was put under other schoolmasters.
Chapter 1.XVI. How Gargantua was sent to Paris, and of the huge great mare that he rode on; how she destroyed the oxflies of the Beauce.
Chapter 1.XVII. How Gargantua paid his welcome to the Parisians, and how he took away the great bells of Our Lady's Church.
Chapter 1.XVIII. How Janotus de Bragmardo was sent to Gargantua to recover the great bells.
Chapter 1.XX. How the Sophister carried away his cloth, and how he had a suit in law against the other masters.
Chapter 1.XXI. The study of Gargantua, according to the discipline of his schoolmasters the Sophisters.
Chapter 1.XXII. The games of Gargantua.
Chapter 1.XXIII. How Gargantua was instructed by Ponocrates, and in such sort disciplinated, that he lost not one hour of the day.
Chapter 1.XXIV. How Gargantua spent his time in rainy weather.
Chapter 1.XXIX. The tenour of the letter which Grangousier wrote to his son Gargantua.
Chapter 1.XXV. How there was great strife and debate raised betwixt the cake-bakers of Lerne, and those of Gargantua's country, whereupon were waged great wars.
Chapter 1.XXVI. How the inhabitants of Lerne, by the commandment of Picrochole their king, assaulted the shepherds of Gargantua unexpectedly and on a sudden.
Chapter 1.XXVII. How a monk of Seville saved the close of the abbey from being ransacked by the enemy.
Chapter 1.XXVIII. How Picrochole stormed and took by assault the rock Clermond, and of Grangousier's unwillingness and aversion from the undertaking of war.
Chapter 1.XXX. How Ulric Gallet was sent unto Picrochole.
Chapter 1.XXXI. The speech made by Gallet to Picrochole.
Chapter 1.XXXII. How Grangousier, to buy peace, caused the cakes to be restored.
Chapter 1.XXXIII. How some statesmen of Picrochole, by hairbrained counsel, put him in extreme danger.
Chapter 1.XXXIV. How Gargantua left the city of Paris to succour his country, and how Gymnast encountered with the enemy.
Chapter 1.XXXIX. How the Monk was feasted by Gargantua, and of the jovial discourse they had at supper.
Chapter 1.XXXV. How Gymnast very souply and cunningly killed Captain Tripet and others of Picrochole's men.
Chapter 1.XXXVI. How Gargantua demolished the castle at the ford of Vede, and how they passed the ford.
Chapter 1.XXXVII. How Gargantua, in combing his head, made the great cannon-balls fall out of his hair.
Chapter 1.XXXVIII. How Gargantua did eat up six pilgrims in a salad.
Chapter 2.I. Of the original and antiquity of the great Pantagruel.
Chapter 2.II. Of the nativity of the most dread and redoubted Pantagruel.
Chapter 2.III. Of the grief wherewith Gargantua was moved at the decease of his wife Badebec.
Chapter 2.IV. Of the infancy of Pantagruel.
Chapter 2.IX. How Pantagruel found Panurge, whom he loved all his lifetime.
Chapter 2.V. Of the acts of the noble Pantagruel in his youthful age.
Chapter 2.VI. How Pantagruel met with a Limousin, who too affectedly did counterfeit the French language.
Chapter 2.VII. How Pantagruel came to Paris, and of the choice books of the Library of St. Victor.
Chapter 2.VIII. How Pantagruel, being at Paris, received letters from his father Gargantua, and the copy of them.
Chapter 2.X. How Pantagruel judged so equitably of a controversy, which was wonderfully obscure and difficult, that, by reason of his just decree therein, he was reputed to have a most admirable judgment.
Chapter 2.XI. How the Lords of Kissbreech and Suckfist did plead before Pantagruel without an attorney.
Chapter 2.XII. How the Lord of Suckfist pleaded before Pantagruel.
Chapter 2.XIII. How Pantagruel gave judgment upon the difference of the two lords.
Chapter 2.XIV. How Panurge related the manner how he escaped out of the hands of the Turks.
Chapter 2.XIX. How Panurge put to a nonplus the Englishman that argued by signs.
Chapter 2.XV. How Panurge showed a very new way to build the walls of Paris.
Chapter 2.XVI. Of the qualities and conditions of Panurge.
Chapter 2.XVII. How Panurge gained the pardons, and married the old women, and of the suit in law which he had at Paris.
Chapter 2.XVIII. How a great scholar of England would have argued against Pantagruel, and was overcome by Panurge.
Chapter 2.XX. How Thaumast relateth the virtues and knowledge of Panurge.
Chapter 2.XXI. How Panurge was in love with a lady of Paris.
Chapter 2.XXII. How Panurge served a Parisian lady a trick that pleased her not very well.
Chapter 2.XXIII. How Pantagruel departed from Paris, hearing news that the Dipsodes had invaded the land of the Amaurots; and the cause wherefore the leagues are so short in France.
Chapter 2.XXIV. A letter which a messenger brought to Pantagruel from a lady of Paris, together with the exposition of a posy written in a gold ring.
Chapter 2.XXIX. How Pantagruel discomfited the three hundred giants armed.
Chapter 2.XXV. How Panurge, Carpalin, Eusthenes, and Epistemon, the gentlemen attendants of Pantagruel, vanquished and discomfited six hundred and threescore horsemen very cunningly.
Chapter 2.XXVI. How Pantagruel and his company were weary in eating still salt meats; and how Carpalin went a-hunting to have some venison.
Chapter 2.XXVII. How Pantagruel set up one trophy in memorial of their valour, and Panurge another in remembrance of the hares. How Pantagruel likewise with his farts begat little men, and with his fisgs little women; and how Panurge broke a great staff over two glasses.
Chapter 2.XXVIII. How Pantagruel got the victory very strangely over the Dipsodes and the Giants.
Chapter 2.XXX. How Epistemon, who had his head cut off, was finely healed by Panurge, and of the news which he brought from the devils, and of the damned people in hell.
Chapter 2.XXXI. How Pantagruel entered into the city of the Amaurots, and how Panurge married King Anarchus to an old lantern-carrying hag, and made him a crier of green sauce.
Chapter 2.XXXII. How Pantagruel with his tongue covered a whole army, and what the author saw in his mouth.
Chapter 2.XXXIII. How Pantagruel became sick, and the manner how he was recovered.
Chapter 2.XXXIV. The conclusion of this present book, and the excuse of the author.
Chapter 3.I. How Pantagruel transported a colony of Utopians into Dipsody.
Chapter 3.II. How Panurge was made Laird of Salmigondin in Dipsody, and did waste his revenue before it came in.
Chapter 3.III. How Panurge praiseth the debtors and borrowers.
Chapter 3.IV. Panurge continueth his discourse in the praise of borrowers and lenders.
Chapter 3.IX. How Panurge asketh counsel of Pantagruel whether he should marry, yea, or no.
Chapter 3.L. How the famous Pantagruelion ought to be prepared and wrought.
Chapter 3.LI. Why it is called Pantagruelion, and of the admirable virtues.
Chapter 3.LII. How a certain kind of Pantagruelion is of that nature that the fire is not able to consume it.
Chapter 3.V. How Pantagruel altogether abhorreth the debtors and borrowers.
Chapter 3.VI. Why new married men were privileged from going to the wars.
Chapter 3.VII. How Panurge had a flea in his ear, and forbore to wear any longer his magnificent codpiece.
Chapter 3.VIII. Why the codpiece is held to be the chief piece of armour amongst warriors.
Chapter 3.X. How Pantagruel representeth unto Panurge the difficulty of giving advice in the matter of marriage; and to that purpose mentioneth somewhat of the Homeric and Virgilian lotteries.
Chapter 3.XI. How Pantagruel showeth the trial of one's fortune by the throwing of dice to be unlawful.
Chapter 3.XII. How Pantagruel doth explore by the Virgilian lottery what fortune Panurge shall have in his marriage.
Chapter 3.XIII. How Pantagruel adviseth Panurge to try the future good or bad luck of his marriage by dreams.
Chapter 3.XIV. Panurge's dream, with the interpretation thereof.
Chapter 3.XIX. How Pantagruel praiseth the counsel of dumb men.
Chapter 3.XL. How Bridlegoose giveth reasons why he looked upon those law- actions which he decided by the chance of the dice.
Chapter 3.XLI. How Bridlegoose relateth the history of the reconcilers of parties at variance in matters of law.
Chapter 3.XLII. How suits at law are bred at first, and how they come afterwards to their perfect growth.
Chapter 3.XLIII. How Pantagruel excuseth Bridlegoose in the matter of sentencing actions at law by the chance of the dice.
Chapter 3.XLIV. How Pantagruel relateth a strange history of the perplexity of human judgment.
Chapter 3.XLIX. How Pantagruel did put himself in a readiness to go to sea; and of the herb named Pantagruelion.
Chapter 3.XLV. How Panurge taketh advice of Triboulet.
Chapter 3.XLVI. How Pantagruel and Panurge diversely interpret the words of Triboulet.
Chapter 3.XLVII. How Pantagruel and Panurge resolved to make a visit to the Oracle of the Holy Bottle.
Chapter 3.XLVIII. How Gargantua showeth that the children ought not to marry without the special knowledge and advice of their fathers and mothers.
Chapter 3.XV. Panurge's excuse and exposition of the monastic mystery concerning powdered beef.
Chapter 3.XVI. How Pantagruel adviseth Panurge to consult with the Sibyl of Panzoust.
Chapter 3.XVII. How Panurge spoke to the Sibyl of Panzoust.
Chapter 3.XVIII. How Pantagruel and Panurge did diversely expound the verses of the Sibyl of Panzoust.
Chapter 3.XX. How Goatsnose by signs maketh answer to Panurge.
Chapter 3.XXI. How Panurge consulteth with an old French poet, named Raminagrobis.
Chapter 3.XXII. How Panurge patrocinates and defendeth the Order of the Begging Friars.
Chapter 3.XXIII. How Panurge maketh the motion of a return to Raminagrobis.
Chapter 3.XXIV. How Panurge consulteth with Epistemon.
Chapter 3.XXIX. How Pantagruel convocated together a theologian, physician, lawyer, and philosopher, for extricating Panurge out of the perplexity wherein he was.
Chapter 3.XXV. How Panurge consulteth with Herr Trippa.
Chapter 3.XXVI. How Panurge consulteth with Friar John of the Funnels.
Chapter 3.XXVII. How Friar John merrily and sportingly counselleth Panurge.
Chapter 3.XXVIII. How Friar John comforteth Panurge in the doubtful matter of cuckoldry.
Chapter 3.XXX. How the theologue, Hippothadee, giveth counsel to Panurge in the matter and business of his nuptial enterprise.
Chapter 3.XXXI. How the physician Rondibilis counselleth Panurge.
Chapter 3.XXXII. How Rondibilis declareth cuckoldry to be naturally one of the appendances of marriage.
Chapter 3.XXXIII. Rondibilis the physician's cure of cuckoldry.
Chapter 3.XXXIV. How women ordinarily have the greatest longing after things prohibited.
Chapter 3.XXXIX. How Pantagruel was present at the trial of Judge Bridlegoose, who decided causes and controversies in law by the chance and fortune of the dice.
Chapter 3.XXXV. How the philosopher Trouillogan handleth the difficulty of marriage.
Chapter 3.XXXVI. A continuation of the answer of the Ephectic and Pyrrhonian philosopher Trouillogan.
Chapter 3.XXXVII. How Pantagruel persuaded Panurge to take counsel of a fool.
Chapter 3.XXXVIII. How Triboulet is set forth and blazed by Pantagruel and Panurge.
Chapter 4.I. How Pantagruel went to sea to visit the oracle of Bacbuc, alias the Holy Bottle.
Chapter 4.II. How Pantagruel bought many rarities in the island of Medamothy.
Chapter 4.III. How Pantagruel received a letter from his father Gargantua, and of the strange way to have speedy news from far distant places.
Chapter 4.IV. How Pantagruel writ to his father Gargantua, and sent him several curiosities.
Chapter 4.IX. How Pantagruel arrived at the island of Ennasin, and of the strange ways of being akin in that country.
Chapter 4.L. How Homenas showed us the archetype, or representation of a pope.
Chapter 4.LI. Table-talk in praise of the decretals.
Chapter 4.LII. A continuation of the miracles caused by the decretals.
Chapter 4.LIII. How, by the virtue of the decretals, gold is subtilely drawn out of France to Rome.
Chapter 4.LIV. How Homenas gave Pantagruel some bon-Christian pears.
Chapter 4.LIX. Of the ridiculous statue Manduce; and how and what the Gastrolaters sacrifice to their ventripotent god.
Chapter 4.LV. How Pantagruel, being at sea, heard various unfrozen words.
Chapter 4.LVI. How among the frozen words Pantagruel found some odd ones.
Chapter 4.LVII. How Pantagruel went ashore at the dwelling of Gaster, the first master of arts in the world.
Chapter 4.LVIII. How, at the court of the master of ingenuity, Pantagruel detested the Engastrimythes and the Gastrolaters.
Chapter 4.LX. What the Gastrolaters sacrificed to their god on interlarded fish-days.
Chapter 4.LXI. How Gaster invented means to get and preserve corn.
Chapter 4.LXII. How Gaster invented an art to avoid being hurt or touched by cannon-balls.
Chapter 4.LXIII. How Pantagruel fell asleep near the island of Chaneph, and of the problems proposed to be solved when he waked.
Chapter 4.LXIV. How Pantagruel gave no answer to the problems.
Chapter 4.LXV. How Pantagruel passed the time with his servants.
Chapter 4.LXVI. How, by Pantagruel's order, the Muses were saluted near the isle of Ganabim.
Chapter 4.LXVII. How Panurge berayed himself for fear; and of the huge cat Rodilardus, which he took for a puny devil.
Chapter 4.V. How Pantagruel met a ship with passengers returning from Lantern-land.
Chapter 4.VI. How, the fray being over, Panurge cheapened one of Dingdong's sheep.
Chapter 4.VII. Which if you read you'll find how Panurge bargained with Dingdong.
Chapter 4.VIII. How Panurge caused Dingdong and his sheep to be drowned in the sea.
Chapter 4.X. How Pantagruel went ashore at the island of Chely, where he saw King St. Panigon.
Chapter 4.XI. Why monks love to be in kitchens.
Chapter 4.XII. How Pantagruel passed by the land of Pettifogging, and of the strange way of living among the Catchpoles.
Chapter 4.XIII. How, like Master Francis Villon, the Lord of Basche commended his servants.
Chapter 4.XIV. A further account of catchpoles who were drubbed at Basche's house.
Chapter 4.XIX. What countenances Panurge and Friar John kept during the.
Chapter 4.XL. How Friar John fitted up the sow; and of the valiant cooks that went into it.
Chapter 4.XLI. How Pantagruel broke the Chitterlings at the knees.
Chapter 4.XLII. How Pantagruel held a treaty with Niphleseth, Queen of the Chitterlings.
Chapter 4.XLIII. How Pantagruel went into the island of Ruach.
Chapter 4.XLIV. How small rain lays a high wind.
Chapter 4.XLIX. How Homenas, Bishop of Papimany, showed us the Uranopet decretals .
Chapter 4.XLV. How Pantagruel went ashore in the island of Pope-Figland.
Chapter 4.XLVI. How a junior devil was fooled by a husbandman of Pope- Figland.
Chapter 4.XLVII. How the devil was deceived by an old woman of Pope- Figland.
Chapter 4.XLVIII. How Pantagruel went ashore at the island of Papimany.
Chapter 4.XV. How the ancient custom at nuptials is renewed by the catchpole.
Chapter 4.XVI. How Friar John made trial of the nature of the catchpoles.
Chapter 4.XVII. How Pantagruel came to the islands of Tohu and Bohu; and of the strange death of Wide-nostrils, the swallower of windmills.
Chapter 4.XVIII. How Pantagruel met with a great storm at sea.
Chapter 4.XX. How the pilots were forsaking their ships in the greatest stress of weather.
Chapter 4.XXI. A continuation of the storm, with a short discourse on the subject of making testaments at sea.
Chapter 4.XXII. An end of the storm.
Chapter 4.XXIII. How Panurge played the good fellow when the storm was over.
Chapter 4.XXIV. How Panurge was said to have been afraid without reason during the storm.
Chapter 4.XXIX. How Pantagruel sailed by the Sneaking Island, where Shrovetide reigned.
Chapter 4.XXV. How, after the storm, Pantagruel went on shore in the islands of the Macreons.
Chapter 4.XXVI. How the good Macrobius gave us an account of the mansion and decease of the heroes.
Chapter 4.XXVII. Pantagruel's discourse of the decease of heroic souls; and of the dreadful prodigies that happened before the death of the late Lord de Langey.
Chapter 4.XXVIII. How Pantagruel related a very sad story of the death of the heroes.
Chapter 4.XXX. How Shrovetide is anatomized and described by Xenomanes.
Chapter 4.XXXI. Shrovetide's outward parts anatomized.
Chapter 4.XXXII. A continuation of Shrovetide's countenance.
Chapter 4.XXXIII. How Pantagruel discovered a monstrous physeter, or whirlpool, near the Wild Island.
Chapter 4.XXXIV. How the monstrous physeter was slain by Pantagruel.
Chapter 4.XXXIX. How Friar John joined with the cooks to fight the Chitterlings.
Chapter 4.XXXV. How Pantagruel went on shore in the Wild Island, the ancient abode of the Chitterlings.
Chapter 4.XXXVI. How the wild Chitterlings laid an ambuscado for Pantagruel.
Chapter 4.XXXVII. How Pantagruel sent for Colonel Maul-chitterling and Colonel Cut-pudding; with a discourse well worth your hearing about the names of places and persons.
Chapter 4.XXXVIII. How Chitterlings are not to be slighted by men.
Chapter 5.I. How Pantagruel arrived at the Ringing Island, and of the noise that we heard.
Chapter 5.II. How the Ringing Island had been inhabited by the Siticines, who were become birds.
Chapter 5.III. How there is but one pope-hawk in the Ringing Island.
Chapter 5.IV. How the birds of the Ringing Island were all passengers.
Chapter 5.IX. How we arrived at the island of Tools.
Chapter 5.V. Of the dumb Knight-hawks of the Ringing Island.
Chapter 5.VI. How the birds are crammed in the Ringing Island.
Chapter 5.VII. How Panurge related to Master Aedituus the fable of the horse and the ass.
Chapter 5.VIII. How with much ado we got a sight of the pope-hawk.
Chapter 5.X. How Pantagruel arrived at the island of Sharping.
Chapter 5.XI. How we passed through the wicket inhabited by Gripe-men-all, Archduke of the Furred Law-cats.
Chapter 5.XII. How Gripe-men-all propounded a riddle to us.
Chapter 5.XIII. How Panurge solved Gripe-men-all's riddle.
Chapter 5.XIV. How the Furred Law-cats live on corruption.
Chapter 5.XIX. How we arrived at the queendom of Whims or Entelechy.
Chapter 5.XL. How the battle in which the good Bacchus overthrew the Indians was represented in mosaic work.
Chapter 5.XLI. How the temple was illuminated with a wonderful lamp.
Chapter 5.XLII. How the Priestess Bacbuc showed us a fantastic fountain in the temple, and how the fountain-water had the taste of wine, according to the imagination of those who drank of it.
Chapter 5.XLIII. How the Priestess Bacbuc equipped Panurge in order to have the word of the Bottle.
Chapter 5.XLIV. How Bacbuc, the high-priestess, brought Panurge before the Holy Bottle.
Chapter 5.XLV. How Bacbuc explained the word of the Goddess-Bottle.
Chapter 5.XLVI. How Panurge and the rest rhymed with poetic fury.
Chapter 5.XLVII. How we took our leave of Bacbuc, and left the Oracle of the Holy Bottle.
Chapter 5.XV. How Friar John talks of rooting out the Furred Law-cats.
Chapter 5.XVI. How Pantagruel came to the island of the Apedefers, or Ignoramuses, with long claws and crooked paws, and of terrible adventures and monsters there.
Chapter 5.XVII. How we went forwards, and how Panurge had like to have been killed.
Chapter 5.XVIII. How our ships were stranded, and we were relieved by some people that were subject to Queen Whims (qui tenoient de la Quinte).
Chapter 5.XX. How the Quintessence cured the sick with a song.
Chapter 5.XXI. How the Queen passed her time after dinner.
Chapter 5.XXII. How Queen Whims' officers were employed; and how the said lady retained us among her abstractors.
Chapter 5.XXIII. How the Queen was served at dinner, and of her way of eating.
Chapter 5.XXIV. How there was a ball in the manner of a tournament, at which Queen Whims was present.
Chapter 5.XXIX. How Epistemon disliked the institution of Lent.
Chapter 5.XXV. How the thirty-two persons at the ball fought.
Chapter 5.XXVI. How we came to the island of Odes, where the ways go up and down.
Chapter 5.XXVII. How we came to the island of Sandals; and of the order of Semiquaver Friars.
Chapter 5.XXVIII. How Panurge asked a Semiquaver Friar many questions, and was only answered in monosyllables.
Chapter 5.XXX. How we came to the land of Satin.
Chapter 5.XXXI. How in the land of Satin we saw Hearsay, who kept a school of vouching.
Chapter 5.XXXII. How we came in sight of Lantern-land.
Chapter 5.XXXIII. How we landed at the port of the Lychnobii, and came to Lantern-land.
Chapter 5.XXXIV. How we arrived at the Oracle of the Bottle.
Chapter 5.XXXIX. How we saw Bacchus's army drawn up in battalia in mosaic work.
Chapter 5.XXXV. How we went underground to come to the Temple of the Holy Bottle, and how Chinon is the oldest city in the world.
Chapter 5.XXXVI. How we went down the tetradic steps, and of Panurge's fear.
Chapter 5.XXXVII. How the temple gates in a wonderful manner opened of themselves.
Chapter 5.XXXVIII. Of the temple's admirable pavement.
...@@ -42,12 +42,12 @@ ...@@ -42,12 +42,12 @@
</p> </p>
{% endif %} {% endif %}
<!-- <a class="btn btn-primary btn-lg" role="button" href="/admin/documents/corpus/{{ corpus.pk }}/">Add file</a> --> <!-- <a class="btn btn-primary btn-lg" role="button" href="/admin/documents/corpus/{{ corpus.id }}/">Add file</a> -->
<a class="btn btn-primary btn-lg" role="button" href="/project/{{project.pk}}/corpus/{{ corpus.pk }}/corpus.csv">Save as</a> <a class="btn btn-primary btn-lg" role="button" href="/project/{{project.id}}/corpus/{{ corpus.id }}/corpus.csv">Save as</a>
<a class="btn btn-primary btn-lg" role="button" href="/project/{{project.pk}}/corpus/{{ corpus.pk }}/delete">Delete</a></p> <a class="btn btn-primary btn-lg" role="button" href="/delete/{{ corpus.id }}">Delete</a></p>
{% if number == 0 %} {% if number == 0 %}
<a class="btn btn-primary btn-lg" role="button" href="/admin/documents/corpus/{{ corpus.pk }}/">Add documents</a></p> <a class="btn btn-primary btn-lg" role="button" href="/admin/documents/corpus/{{ corpus.id }}/">Add documents</a></p>
{% endif %} {% endif %}
</div> </div>
......
...@@ -260,8 +260,29 @@ ...@@ -260,8 +260,29 @@
</div> </div>
<div id="topPapers"></div> <div id="topPapers"></div>
<!--
<div id="tab-container-top" class='tab-container'>
<ul class='etabs'>
<li id="tabmed" class='tab active'><a href="#tabs3">Medline Pubs</a></li>
<li id="tabgps" class='tab'><a href="#tabs4">+</a></li>
</ul>
<div class='panel-container'>
<div id="tabs3">
<div id="topPapers"></div>
</div>
<div id="tabs4">
<div id="topProposals"></div>
</div>
</div>
</div>
-->
<div id="information"></div> <div id="information"></div>
</div> </div>
......
...@@ -19,18 +19,16 @@ ...@@ -19,18 +19,16 @@
<div class="col-md-4 content"> <div class="col-md-4 content">
<h1>Gargantext</h1> <h1>Gargantext</h1>
<p>A web platform to explore text-mining</p> <p>A web platform to explore text-mining</p>
<a class="btn btn-primary btn-lg" href="/projects">Test Gargantext</a> <a class="btn btn-primary btn-lg" href="/projects" title="Click and test by yourself">Test Gargantext</a>
</div> </div>
<div class="col-md-3 content"> <div class="col-md-2 content"></div>
</div> <div class="col-md-2 content"></div>
<div class="col-md-5 content"> <div class="col-md-2 content">
<!-- <p class="right">
<h3>Project Manager:</h3> <div style="border:15px">
<h4><a href="http://alexandre.delanoe.org" target="blank">Alexandre Delanoë</a></h4> <img src="{% static "img/logo.png"%}" title="Logo designed by anoe" style="100px; height:150px; border:3px solid white">
<h3>Scientific board:</h3> </div>
<h4><a href="http://chavalarias.com" target="blank">David Chavalarias</a> and <a href="http://alexandre.delanoe.org" target="blank">Alexandre Delanoë</a></h4> </p>
<h3><a href="/about/#collapseTeam" target="blank">Thanks to all the team</a></h3>
--!>
</div> </div>
</div> </div>
</div> </div>
@@ -39,7 +37,7 @@
 <div class="row">
 <div class="content">
 <center>
-<img src="{% static "img/logo.png"%}" alt="Logo Gargantext" style="100px; height:150px">
+<img src="{% static "img/Gargantextuel-212x300.jpg"%}" title="Gargantextuel drawn by Cecile Meadel" style="border:2px solid black">
 <!--
 <h2>Introduction Video</h2>
@@ -63,57 +61,23 @@
 <div class="row">
 <div class="col-md-4 content">
-<h3><a href="#">Historic</a></h3>
-<p>
-Chapter 1.VI. -- How Gargantua was born in a strange manner.
-Chapter 2.XXIII. -- How Pantagruel departed from Paris, hearing
-news that the Dipsodes had invaded the land of the Amaurots; and
-the cause wherefore the leagues are so short in France. Chapter
-3.XLVI. -- How Pantagruel and Panurge diversely interpret the
-words of Triboulet. Chapter 4.LV. -- How Pantagruel, being at sea,
-heard various unfrozen words. Chapter 5.IX. -- How we arrived at
-the island of Tools.
-</p>
+<h3><a href="#" title="Random sentences in Gargantua's Books chapters, historically true">Historic</a></h3>
+<p> {{ paragraph_gargantua }}</p>
 </div>
 <div class="col-md-4 content">
-<h3><a href="#">Presentation</a></h3>
-<p>
-Lorem ipsum dolor sit amet, consectetur adipiscing elit,
-sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris
-nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in
-reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla
-pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
-culpa qui officia deserunt mollit anim id est laborum.
+<h3><a href="#" title="Randomized words, semantically and syntactically false.">Presentation</a></h3>
+<p> {{ paragraph_lorem }}
 </p>
 </div>
 <div class="col-md-4 content">
-<h3><a href="#">Tutoreil</a></h3>
+<h3><a href="#" title="Randomized letters, true or false ?">Tutoreil</a></h3>
 <p>
+{{ paragraph_tutoreil }}
 <!-- Why not French ? -->
 <!-- find Cambridge source which inspired this --!>
-Il praaît que l'odrre des ltetres dnas un mot n'a pas
-d'iprnorotncae. La pmeirère et la drenèire letrte diovent
-êrte à la bnnoe pclae. Le rsete peut êrte dnas un dsérorde
-ttoal et on puet tujoruos lrie snas poribême. On ne lit
-donc pas chuaqe ltetre en elle-mmêe, mias le mot cmome un
-tuot. Un chnagmnet de réfretniel et nuos tarnsposns ce
-rselutat au txete lui-mmêe: l'odrre des mtos est faiblement
-imoprtnat copmraé au cnotxete du txete qui, lui, est copmté:
-comptexter avec Gargantext.
 </p>
 </div>
...
@@ -17,16 +17,16 @@
 <span class="icon-bar"></span>
 <span class="icon-bar"></span>
 </button>
-<a class="navbar-brand" style="line-height:15px; height:10px; padding: 10px 10px;" href="/"><img src="/img/logo.svg"></a>
+<a class="navbar-brand" style="line-height:15px; height:10px; padding: 10px 10px;" href="/"><img src="/img/logo.svg" title="Back to home."></a>
 </div>
 <div class="navbar-collapse collapse">
 <ul class="nav navbar-nav">
 <!-- <li><a href="/admin/">Admin/</a></li> --!>
-<li><a href="/about/">About</a>
+<li><a href="/about/" title="More information about the project, its sponsors and its authors.">About</a>
 </li>
 {% if user.is_authenticated %}
-<li><a href="/projects/">Projects</a></li>
+<li><a href="/projects/" title="All your projects are here.">Projects</a></li>
 {% endif %}
 {% if project %}
 <li><a href="/project/{{project.id}}">{{project.name}}</a></li>
@@ -40,14 +40,14 @@
 <ul class="nav pull-right">
 <li class="dropdown">
-<a href="#" role="button" class="dropdown-toggle" data-toggle="dropdown"><i class="icon-user"></i> {{ user }}<i class="caret"></i>
+<a href="#" role="button" class="dropdown-toggle" data-toggle="dropdown" title="This is your login"><i class="icon-user"></i> {{ user }}<i class="caret"></i>
 </a>
 <ul class="dropdown-menu">
-<li><a tabindex="-1" href="http://www.iscpif.fr/tiki-index.php?page=gargantext_feedback" target="blank" >Report Feedback</a></li>
+<li><a tabindex="-1" href="http://www.iscpif.fr/tiki-index.php?page=gargantext_feedback" title="Send us a message (bug, thanks, congrats...)">Report Feedback</a></li>
 <li class="divider"></li>
 {% if user.is_authenticated %}
-<li><a tabindex="-1" href="/auth/logout">Logout</a></li>
+<li><a tabindex="-1" href="/auth/logout" title="Click here to log out, especially on public devices">Logout</a></li>
 {% else %}
 <li><a tabindex="-1" href="/auth/">Login</a></li>
 {% endif %}
@@ -66,8 +66,8 @@
 <hr>
 <footer>
-<p>Gargantext, version 1.0.6, <a href="http://www.cnrs.fr" target="blank">Copyrights CNRS {{ date.year }}</a>,
-<a href="http://www.gnu.org/licenses/agpl-3.0.html" target="blank">Licence aGPLV3</a>.</p>
+<p>Gargantext, version 1.0.6, <a href="http://www.cnrs.fr" target="blank" title="Institution that enables this project.">Copyright CNRS {{ date.year }}</a>,
+<a href="http://www.gnu.org/licenses/agpl-3.0.html" target="blank" title="Legal terms of the project.">Licence aGPLV3</a>.</p>
 </footer>
...
@@ -84,19 +84,16 @@
 <ul>
 {% for corpus in corpora %}
 <li> {% ifnotequal corpus.count 0 %}
-<a href="/project/{{project.id}}/corpus/{{corpus.id}}">
-{{corpus.name}}
-</a>
-, {{ corpus.count }} Documents
-{% else %}
-{{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Processing, drink a cup of tea, and refresh the page :)
-{% endifnotequal %}
+<a href="/project/{{project.id}}/corpus/{{corpus.id}}"> {{corpus.name}} </a> , {{ corpus.count }} Documents
+{% else %}
+{{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Processing, drink a cup of tea, and refresh the page :)
+{% endifnotequal %}
 <button type="button" class="btn btn-xs btn-default" data-container="body" data-toggle="popover" data-placement="bottom"
 data-content='
 <ul>
 <li> Rename </li>
 <li> Add new documents </li>
-<li><a href="/project/{{ project.id }}/corpus/{{ corpus.id}}/delete">Delete</a></li>
+<li><a href="/delete/{{corpus.id}}">Delete</a></li>
 </ul>
 '>Manage</button>
 </li>
@@ -330,7 +327,7 @@
 console.log("enabling "+"#"+value.id)
 $("#"+value.id).attr('onclick','getGlobalResults(this);');
 // $("#submit_thing").prop('disabled' , false)
-$("#submit_thing").html("Process a 100 sample!")
+$("#submit_thing").html("Process a 1000 sample!")
 thequeries = data
 var N=0,k=0;
@@ -427,8 +424,8 @@
 //CSS events for changing the Select element
 function CustomForSelect( selected ) {
 // show Radio-Inputs and trigger FileOrNotFile>@upload-file events
-//if(selected=="pubmed" || selected=="istext") {
-if(selected=="pubmed") {
+if(selected=="pubmed" || selected=="istext") {
+// if(selected=="pubmed") {
 console.log("show the button for: "+selected)
 $("#pubmedcrawl").css("visibility", "visible");
 $("#pubmedcrawl").show();
...
@@ -44,7 +44,7 @@
 <ul>
 <li> Rename </li>
 <li> Add new corpus </li>
-<li><a href="/project/{{ project.id }}/delete">Delete</a></li>
+<li><a href="/delete/{{ project.id }}">Delete</a></li>
 </ul>
 '>Manage</button>
...
@@ -19,18 +19,21 @@
 {% if documents %}
-<div id="delAll" style="visibility: hidden;">
-<button onclick="deleteDuplicates(theurl);">Delete Duplicates</button>
-</div>
 <ul>
 {% for doc in documents %}
 {% if doc.date %}
 <li><div id="doc_{{doc.id}}"> <b>{{ doc.date }}</b>: <a target="_blank" href="/nodeinfo/{{doc.id}}">{{ doc.name}}</a> , @ {{ doc.metadata.source}}</div></li>
 {% endif %}
 {% endfor %}
+<div id="delAll" style="visibility: hidden;">
+<center>
+<button onclick="deleteDuplicates(theurl);">Delete all Duplicates in one click</button>
+</center>
+</div>
 </ul>
 <script>
...
# Without this, we couldn't use the Django environment
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")

# database tools (importing node.models registers the Django models)
from node import models
from gargantext_web.db import *
from parsing.corpustools import *

# pick a user and a parent project for the test corpus
user = session.query(User).first()
project = session.query(Node).filter(Node.name == 'A').first()

# create the corpus node under that project
corpus = Node(
    parent_id = project.id,
    name = 'Test 456',
    type_id = cache.NodeType['Corpus'].id,
    user_id = user.id,
)
session.add(corpus)
session.commit()

# attach a sample PubMed export to the corpus
add_resource(corpus,
    # file = './data_samples/pubmed_result.xml',
    file = '/srv/gargantext_lib/data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
    type_id = cache.ResourceType['pubmed'].id,
)

# build the document nodes, then extract ngrams from their titles
parse_resources(corpus)
extract_ngrams(corpus, ('title', ))

# to re-run on an existing corpus, fetch it by id instead:
# print(corpus)
# corpus = session.query(Node).filter(Node.id == 72771).first()
# corpus = session.query(Node).filter(Node.id == 73017).first()

compute_tfidf(corpus)
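
# A quick sanity check could be appended after compute_tfidf. This is only a
# sketch: it reuses the names the script already imports (session, Node, cache)
# and assumes a 'Document' entry is registered in cache.NodeType, as the other
# node types above are; the check itself is not part of the commit.

# count the Document nodes now attached to the new corpus
type_document_id = cache.NodeType['Document'].id
document_count = (session
    .query(Node)
    .filter(Node.parent_id == corpus.id)
    .filter(Node.type_id == type_document_id)
    .count())
print('corpus %d contains %d parsed documents' % (corpus.id, document_count))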