Commit 1676bf93 authored by sim's avatar sim

Remove old REST-API

parent 8d42b26a
"""URL Configuration of GarganText """URL Configuration of GarganText
Views are shared between these modules: Views are shared between these modules:
- `api`, for JSON and CSV interaction with data
- `contents`, for Python-generated contents - `contents`, for Python-generated contents
""" """
...@@ -10,11 +9,8 @@ from django.contrib import admin ...@@ -10,11 +9,8 @@ from django.contrib import admin
from django.views.generic.base import RedirectView as Redirect from django.views.generic.base import RedirectView as Redirect
from django.contrib.staticfiles.storage import staticfiles_storage as static from django.contrib.staticfiles.storage import staticfiles_storage as static
import gargantext.views.api.urls
urlpatterns = [ url(r'^admin/' , admin.site.urls ) urlpatterns = [ url(r'^admin/' , admin.site.urls )
, url(r'^api/' , include( gargantext.views.api.urls ) )
, url(r'^favicon.ico$', Redirect.as_view( url=static.url('favicon.ico') , url(r'^favicon.ico$', Redirect.as_view( url=static.url('favicon.ico')
, permanent=False), name="favicon" ) , permanent=False), name="favicon" )
] ]
from gargantext.util.http import ValidationException, APIView \
, get_parameters, JsonHttpResponse, Http404\
, HttpResponse
from gargantext.util.db import session, delete, func, bulk_insert
from gargantext.models import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNode, NodeHyperdata, HyperdataKey
from gargantext.constants import INDEXED_HYPERDATA
from django.core.exceptions import PermissionDenied, SuspiciousOperation
from sqlalchemy import or_, not_
from sqlalchemy.sql import func
from sqlalchemy.orm import aliased
import datetime
import collections
from gargantext.util.db import *
from gargantext.util.validation import validate
from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework.exceptions import APIException as _APIException
def DebugHttpResponse(data):
    """Render *data* inside a black <pre> page, for quick visual debugging."""
    html = '<html><body style="background:#000;color:#FFF"><pre>%s</pre></body></html>' % (str(data), )
    return HttpResponse(html)
import json
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that also serializes datetimes.

    Datetimes are emitted as second-precision ISO-8601 strings with a
    trailing 'Z'; everything else falls back to the stock encoder.
    """

    def default(self, obj):
        if isinstance(obj, datetime.datetime):
            # truncate to seconds (drop microseconds), mark as UTC
            return obj.isoformat()[:19] + 'Z'
        # Fix: use zero-argument super() instead of super(self.__class__, ...),
        # which recurses infinitely if this class is ever subclassed.
        return super().default(obj)
json_encoder = JSONEncoder(indent=4)
def JsonHttpResponse(data, status=200):
    """Serialize *data* with the shared JSON encoder into an HTTP response."""
    body = json_encoder.encode(data)
    return HttpResponse(
        content=body,
        content_type='application/json; charset=utf-8',
        status=status,
    )
# aliases: Django middleware turns these exceptions into 400 / 403 responses
Http400 = SuspiciousOperation
Http403 = PermissionDenied
import csv
def CsvHttpResponse(data, headers=None, status=200):
    """Write *data* rows (plus an optional header row) into a CSV response."""
    response = HttpResponse(content_type="text/csv", status=status)
    writer = csv.writer(response, delimiter=',')
    if headers:
        writer.writerow(headers)
    writer.writerows(data)
    return response
class APIException(_APIException):
    """DRF exception carrying a caller-supplied message and HTTP status code.

    Sets the response fields directly; the base-class constructor is not
    invoked (same behavior as before).
    """

    def __init__(self, message, code=500):
        self.detail = message
        self.status_code = code
class NodeNgramsQueries(APIView):
    """Date histogram of ngram / document counts within a project.

    POST selects a time axis (a truncated hyperdata date), a measured
    value (documents_count / ngrams_count), optional filters (corpora,
    dates, ngrams, hyperdata) and an optional normalization, then returns
    the (date, value) series as JSON or CSV.
    """

    # step functions: given a bucket start date, return the next bucket start
    _resolutions = {
        'second': lambda d: d + datetime.timedelta(seconds=1),
        'minute': lambda d: d + datetime.timedelta(minutes=1),
        'hour': lambda d: d + datetime.timedelta(hours=1),
        'day': lambda d: d + datetime.timedelta(days=1),
        'week': lambda d: d + datetime.timedelta(days=7),
        'month': lambda d: (d + datetime.timedelta(days=32)).replace(day=1),
        'year': lambda d: (d + datetime.timedelta(days=367)).replace(day=1, month=1),
        'decade': lambda d: (d + datetime.timedelta(days=3660)).replace(day=1, month=1),
        'century': lambda d: (d + datetime.timedelta(days=36600)).replace(day=1, month=1),
    }
    # textual operator name -> SQLAlchemy filter expression builder
    _operators = {
        '=': lambda field, value: (field == value),
        '!=': lambda field, value: (field != value),
        '<': lambda field, value: (field < value),
        '>': lambda field, value: (field > value),
        '<=': lambda field, value: (field <= value),
        '>=': lambda field, value: (field >= value),
        'in': lambda field, value: (or_(*tuple(field == x for x in value))),
        'contains': lambda field, value: (field.contains(value)),
        'doesnotcontain': lambda field, value: (not_(field.contains(value))),
        'startswith': lambda field, value: (field.startswith(value)),
        'endswith': lambda field, value: (field.endswith(value)),
    }
    # hyperdata type name -> converter applied to the raw (string) filter value
    # NOTE(review): type2string() returns "integer" for int, which has no
    # entry here ('int' does) -- an int-typed hyperdata filter would raise
    # KeyError; confirm against INDEXED_HYPERDATA contents.
    _converters = {
        'float': float,
        'int': int,
        # pad a partial ISO date with the missing tail of a full timestamp
        'datetime': lambda x: x + '2000-01-01 00:00:00Z'[len(x):],
        'text': str,
        'string': str,
    }

    def post(self, request, project_id):
        """Build and run the histogram query for project *project_id*."""
        # example only
        input = request.data or {
            'x': {
                'with_empty': True,
                'resolution': 'decade',
                'value': 'publication_date',
            },
            'y': {
                # 'divided_by': 'total_ngrams_count',
                # 'divided_by': 'total_documents_count',
            },
            'filter': {
                # 'ngrams': ['bees', 'bee', 'honeybee', 'honeybees', 'honey bee', 'honey bees'],
                # 'ngrams': ['insecticide', 'pesticide'],
                # 'corpora': [52633],
                # 'date': {'min': '1995-12-31'}
            },
            # 'format': 'csv',
        }
        print(input)
        # input validation
        input = validate(input, {'type': dict, 'default': {}, 'items': {
            'x': {'type': dict, 'default': {}, 'items': {
                # which hyperdata to choose for the date
                'value': {'type': str, 'default': 'publication_date', 'range': {'publication_date', }},
                # time resolution
                'resolution': {'type': str, 'range': self._resolutions.keys(), 'default': 'month'},
                # should we add zeroes for empty values?
                'with_empty': {'type': bool, 'default': False},
            }},
            'y': {'type': dict, 'default': {}, 'items': {
                # mesured value
                'value': {'type': str, 'default': 'ngrams_count', 'range': {'ngrams_count', 'documents_count', 'ngrams_tfidf'}},
                # value by which we should normalize
                'divided_by': {'type': str, 'range': {'total_documents_count', 'documents_count', 'total_ngrams_count'}},
            }},
            # filtering
            'filter': {'type': dict, 'default': {}, 'items': {
                # filter by metadata
                # NOTE(review): 'key' is validated against the *operator*
                # names while 'operator' has no range at all -- these two
                # constraints look swapped; confirm intended schema.
                'hyperdata': {'type': list, 'default': [], 'items': {'type': dict, 'items': {
                    'key': {'type': str, 'range': self._operators.keys()},
                    'operator': {'type': str},
                    'value': {'type': str},
                }}},
                # filter by date
                'date': {'type': dict, 'items': {
                    'min': {'type': datetime.datetime},
                    'max': {'type': datetime.datetime},
                }, 'default': {}},
                # filter by corpora
                'corpora' : {'type': list, 'default': [], 'items': {'type': int}},
                # filter by ngrams
                'ngrams' : {'type': list, 'default': [], 'items': {'type': str}},
            }},
            # output format
            'format': {'type': str, 'default': 'json', 'range': {'json', 'csv'}},
        }})
        # build query: prepare columns
        X = aliased(NodeHyperdata)
        column_x = func.date_trunc(input['x']['resolution'], X.value_utc)
        # NOTE(review): 'ngrams_tfidf' passes validation above but has no
        # entry below -- selecting it raises KeyError here.
        column_y = {
            'documents_count': func.count(Node.id.distinct()),
            'ngrams_count': func.sum(NodeNgram.weight),
            # 'ngrams_tfidf': func.sum(NodeNodeNgram.weight),
        }[input['y']['value']]
        # build query: base
        print(input)
        query_base = (session
            .query(column_x)
            .select_from(Node)
            .join(NodeNgram , NodeNgram.node_id == Node.id)
            .join(X , X.node_id == NodeNgram.node_id)
            #.filter(X.key == input['x']['value'])
            .group_by(column_x)
            .order_by(column_x)
        )
        # build query: base, filter by corpora or project
        if 'corpora' in input['filter'] and input['filter']['corpora']:
            query_base = (query_base
                .filter(Node.parent_id.in_(input['filter']['corpora']))
            )
        else:
            # no explicit corpora: take every corpus under the project
            ParentNode = aliased(Node)
            query_base = (query_base
                .join(ParentNode, ParentNode.id == Node.parent_id)
                .filter(ParentNode.parent_id == project_id)
            )
        # build query: base, filter by date
        if 'date' in input['filter']:
            if 'min' in input['filter']['date']:
                query_base = query_base.filter(X.value >= input['filter']['date']['min'])
            if 'max' in input['filter']['date']:
                query_base = query_base.filter(X.value <= input['filter']['date']['max'])
        # build query: filter by ngrams
        query_result = query_base.add_columns(column_y)
        if 'ngrams' in input['filter'] and input['filter']['ngrams']:
            query_result = (query_result
                .join(Ngram, Ngram.id == NodeNgram.ngram_id)
                .filter(Ngram.terms.in_(input['filter']['ngrams']))
            )
        # build query: filter by metadata
        if 'hyperdata' in input['filter']:
            for h, hyperdata in enumerate(input['filter']['hyperdata']):
                print(h,hyperdata)
                # get hyperdata in database
                #if hyperdata_model is None:
                #    continue
                #hyperdata_id, hyperdata_type = hyperdata_model
                # create alias and query it
                operator = self._operators[hyperdata['operator']]
                type_string = type2string(INDEXED_HYPERDATA[hyperdata['key']]['type'])
                value = self._converters[type_string](hyperdata['value'])
                query_result = (query_result
                    .join(NodeHyperdata , NodeHyperdata.node_id == NodeNgram.node_id)
                    .filter(NodeHyperdata.key == hyperdata['key'])
                    .filter(operator(NodeHyperdata.value, value))
                )
        # build result: prepare data
        date_value_list = query_result.all()
        #print(date_value_list)
        if date_value_list:
            date_min = date_value_list[0][0].replace(tzinfo=None)
            # NOTE(review): [-2] takes the *second-to-last* row as the max
            # date, and the [0:-1] slices below also drop the last row --
            # confirm whether the last row is deliberately excluded or this
            # is an off-by-one ([-1] / full slice).
            date_max = date_value_list[-2][0].replace(tzinfo=None)
        # build result: prepare interval (zero-filled buckets when requested)
        result = collections.OrderedDict()
        if input['x']['with_empty'] and date_value_list:
            compute_next_date = self._resolutions[input['x']['resolution']]
            date = date_min
            while date <= date_max:
                result[date] = 0.0
                date = compute_next_date(date)
        # build result: integrate
        for date, value in date_value_list[0:-1]:
            result[date.replace(tzinfo=None)] = value
        # build result: normalize
        query_normalize = None
        if date_value_list and 'divided_by' in input['y'] and input['y']['divided_by']:
            if input['y']['divided_by'] == 'total_documents_count':
                query_normalize = query_base.add_column(func.count(Node.id.distinct()))
            elif input['y']['divided_by'] == 'total_ngrams_count':
                query_normalize = query_base.add_column(func.sum(NodeNgram.weight))
        if query_normalize is not None:
            for date, value in query_normalize[0:-1]:
                date = date.replace(tzinfo=None)
                if date in result:
                    result[date] /= value
        # return result with proper formatting
        if input['format'] == 'json':
            return JsonHttpResponse({
                'query': input,
                'result': sorted(result.items()),
            }, 201)
        elif input['format'] == 'csv':
            return CsvHttpResponse(sorted(result.items()), ('date', 'value'), 201)
_operators_dict = {
"=": lambda field, value: (field == value),
"!=": lambda field, value: (field != value),
"<": lambda field, value: (field < value),
">": lambda field, value: (field > value),
"<=": lambda field, value: (field <= value),
">=": lambda field, value: (field >= value),
"in": lambda field, value: (or_(*tuple(field == x for x in value))),
"contains": lambda field, value: (field.contains(value)),
"doesnotcontain": lambda field, value: (not_(field.contains(value))),
"startswith": lambda field, value: (field.startswith(value)),
}
# INDEXED_HYPERDATA entries in key order, minus the bulky 'abstract' field
od = collections.OrderedDict(sorted(INDEXED_HYPERDATA.items()))
_hyperdata_list = [ { key : value }
                    for key, value in od.items()
                    if key != 'abstract'
                  ]
def type2string(given_type):
    """Map a Python type to its textual name (None for unknown types)."""
    type_names = {
        int: "integer",
        str: "string",
        datetime.datetime: "datetime",
    }
    return type_names.get(given_type)
def get_metadata(corpus_id_list):
    """Describe every indexed hyperdata field for the given corpora.

    Returns one dict per key of INDEXED_HYPERDATA with its name, textual
    type and (currently always empty) value statistics.
    """
    # query hyperdata keys
    ParentNode = aliased(Node)
    hyperdata_query = (session
        .query(NodeHyperdata.key)
        .join(Node, Node.id == NodeHyperdata.node_id)
        .filter(Node.parent_id.in_(corpus_id_list))
        .group_by(NodeHyperdata.key)
    )
    # NOTE(review): ParentNode and hyperdata_query are built but never used
    # below -- presumably leftovers; confirm before removing.
    # build a collection with the hyperdata keys
    collection = []
    for hyperdata in INDEXED_HYPERDATA.keys():
        valuesCount = 0
        values = None
        # count values and determine their span
        values_count = None
        values_from = None
        values_to = None
        # if hyperdata == 'text':
        #     node_hyperdata_query = (session
        #         .query(NodeHyperdata.key)
        #         .join(Node, Node.id == NodeHyperdata.node_id)
        #         .filter(Node.parent_id.in_(corpus_id_list))
        #         .filter(NodeHyperdata.key == hyperdata)
        #         .group_by(NodeHyperdata.key)
        #         .order_by(NodeHyperdata.key)
        #     )
        #     values_count = node_hyperdata_query.count()
        #     # values_count, values_from, values_to = node_hyperdata_query.first()
        # if there is less than 32 values, retrieve them
        values = None
        # NOTE(review): values_count is always None here (the query above is
        # commented out), so this branch is dead; if re-enabled it would also
        # raise NameError on node_hyperdata_query.
        if isinstance(values_count, int) and values_count <= 48:
            if hyperdata == 'datetime':
                values = [row.isoformat() for row in node_hyperdata_query.all()]
            else:
                values = [row for row in node_hyperdata_query.all()]
        # adding this hyperdata to the collection
        collection.append({
            'key': str(hyperdata),
            'type': type2string(INDEXED_HYPERDATA[hyperdata]['type']),
            'values': values,
            'valuesFrom': values_from,
            'valuesTo': values_to,
            'valuesCount': values_count,
        })
    # give the result back
    return collection
class ApiHyperdata(APIView):
    """Expose the hyperdata metadata of one or several corpora."""

    def get(self, request):
        # ?corpus_id=1,2,3 -> [1, 2, 3]
        corpus_ids = [int(cid) for cid in request.GET['corpus_id'].split(',')]
        return JsonHttpResponse({'data': get_metadata(corpus_ids)})
from rest_framework.status import *
from rest_framework.exceptions import APIException
from rest_framework.response import Response
from rest_framework.renderers import JSONRenderer, BrowsableAPIRenderer
from rest_framework.views import APIView
from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from rest_framework.permissions import IsAuthenticated
from gargantext.constants import RESOURCETYPES, NODETYPES, get_resource
from gargantext.models import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNode
from gargantext.util.db import session, delete, func, bulk_insert
from gargantext.util.db_cache import cache, or_
from gargantext.util.files import upload
from gargantext.util.http import ValidationException, APIView, JsonHttpResponse, get_parameters
from gargantext.util.scheduling import scheduled
from gargantext.util.validation import validate
#import
#NODES format
#NODES format
# field sets serialized for each node kind by the format_* helpers below
_user_default_fields =["is_staff","is_superuser","is_active", "username", "email", "first_name", "last_name", "id"]
_api_default_fields = ['id', 'parent_id', 'name', 'typename', 'date']
_doc_default_fields = ['id', 'parent_id', 'name', 'typename', 'date', "hyperdata"]
#_resource_default_fields = [['id', 'parent_id', 'name', 'typename', "hyperdata.method"]
#_corpus_default_fields = ['id', 'parent_id', 'name', 'typename', 'date', "hyperdata","resource"]
def format_parent(node):
    '''Format a parent node as a plain dict of its default fields.

    USER objects are detected by the presence of a non-empty ``username``
    attribute; other nodes are dispatched on their ``typename``.
    '''
    try:
        #USER -- any node without a username attribute raises AttributeError
        # here and falls through to the typename dispatch below.
        if node.username != "":
            return {field: getattr(node, field) for field in _user_default_fields}
    except AttributeError:
        #DOC
        if node.typename == "DOCUMENT":
            return {field: getattr(node, field) for field in _doc_default_fields}
        elif node.typename == "CORPUS":
            parent = {field: getattr(node, field) for field in _doc_default_fields}
            # Fix: the original referenced an undefined ``status_message``
            # (NameError at runtime); build it from node.status(), mirroring
            # the logic used in format_records().
            parent["status_msg"] = _corpus_status_message(node.status())
            return parent
        #PROJECT, RESOURCES?
        else:
            return {field: getattr(node, field) for field in _api_default_fields}


def _corpus_status_message(status):
    '''Human-readable progress string for a corpus status dict ('' if done).'''
    if status is None or status['complete']:
        return ''
    if status['error']:
        return '(aborted: "%s" after %i docs)' % (
            status['error'][-1],
            status['progress']
        )
    return '(in progress: %s, %d complete)' % (
        status['action'].replace('_', ' '),
        status['progress'],
    )
def format_records(node_list):
    '''Format a list of sibling nodes into plain dicts, keyed on typename.'''
    if not node_list:
        return []
    typename = node_list[0].typename
    #USER
    if typename == "USER":
        return [{field: getattr(n, field) for field in _user_default_fields} for n in node_list]
    #DOCUMENT
    if typename == "DOCUMENT":
        return [{field: getattr(n, field) for field in _doc_default_fields} for n in node_list]
    #CORPUS, PROJECT, RESOURCES?
    if typename == "CORPUS":
        formatted = []
        for corpus in node_list:
            #PROJECTS VIEW SHOULD NOT BE SO DETAILED
            record = {field: getattr(corpus, field) for field in _doc_default_fields}
            record["resources"] = [child.id for child in corpus.children("RESOURCE")]
            record["documents"] = [child.id for child in corpus.children("DOCUMENT")]
            #record["resources"] = format_records([n for n in node.children("RESOURCE")])
            #record["documents"] = format_records([n for n in node.children("DOCUMENT")])
            status = corpus.status()
            if status is None or status['complete']:
                status_message = ''
            elif status['error']:
                status_message = '(aborted: "%s" after %i docs)' % (
                    status['error'][-1],
                    status['progress']
                )
            else:
                status_message = '(in progress: %s, %d complete)' % (
                    status['action'].replace('_', ' '),
                    status['progress'],
                )
            record["status"] = status_message
            formatted.append(record)
        return formatted
    return [{field: getattr(n, field) for field in _api_default_fields} for n in node_list]
def check_rights(request, node_id):
    '''Return the node if it exists and belongs to the requesting user.

    Raises APIException("403 Unauthorized") both for a missing node and
    for a node owned by someone else (same behavior as before).
    '''
    node = session.query(Node).filter(Node.id == node_id).first()
    # return Response({'detail' : "Node #%s not found" %(node_id) },
    #                 status = status.HTTP_404_NOT_FOUND)
    if node is None or node.user_id != request.user.id:
        raise APIException("403 Unauthorized")
    return node
def format_response(parent, records):
    '''Bundle a formatted parent node with its formatted child records.'''
    formatted_records = format_records(records)
    return {
        "parent": format_parent(parent),
        "records": formatted_records,
        "count": len(records),
    }
from django.core.exceptions import *
from .api import * #APIView, APIException entre autres
from gargantext.util.db import session
from gargantext.models import Node
from gargantext.util.http import *
class CorpusView(APIView):
    '''API endpoint that represents a corpus.

    GET    -> corpus detail (children of the requested view type)
    DELETE -> remove the corpus and all of its children
    PUT    -> update simple corpus attributes
    POST   -> add a new RESOURCE to the corpus (checks only, see note)
    '''

    # child node types a client may request through ?view=
    _ALLOWED_VIEWS = ["DOCUMENT", "JOURNAL", "TITLE", "ANALYTICS", "RESSOURCE"]

    def get(self, request, project_id, corpus_id, view="DOCUMENT"):
        '''GET corpus detail; the default view lists full documents.'''
        params = get_parameters(request)
        if "view" in params.keys():
            filter_view = params["view"].upper()
            # Fix: validate the *requested* view; the original tested the
            # default `view` variable, which made the whitelist a no-op.
            if filter_view in self._ALLOWED_VIEWS:
                view = filter_view
        project = session.query(Node).filter(Node.id == project_id, Node.typename == "PROJECT").first()
        # Fix: test for existence before dereferencing project.id
        if project is None:
            return Response({'detail' : "PROJECT Node #%s not found" %(project_id) },
                            status = HTTP_404_NOT_FOUND)
        check_rights(request, project.id)
        corpus = session.query(Node).filter(Node.id == corpus_id, Node.typename == "CORPUS").first()
        if corpus is None:
            return Response({'detail' : "CORPUS Node #%s not found" %(corpus_id) },
                            status = HTTP_404_NOT_FOUND)
        documents = session.query(Node).filter(Node.parent_id == corpus_id, Node.typename == view).all()
        context = format_response(corpus, documents)
        return Response(context)

    def delete(self, request, project_id, corpus_id):
        '''DELETE corpus and all of its children.'''
        corpus = session.query(Node).filter(Node.id == corpus_id, Node.typename == "CORPUS").first()
        if corpus is None:
            return Response({'detail' : "CORPUS Node #%s not found" %(corpus_id) },
                            status = HTTP_404_NOT_FOUND)
        # Fix: session.delete() takes a single instance; the original passed
        # the whole result list, which raises. Delete children one by one.
        for document in session.query(Node).filter(Node.parent_id == corpus_id).all():
            session.delete(document)
        session.delete(corpus)
        session.commit()
        # Fix: Response has no `detail` keyword; wrap the message in a dict.
        return Response({"detail": "Deleted corpus #%s" % str(corpus_id)},
                        status=HTTP_204_NO_CONTENT)

    def put(self, request, project_id, corpus_id, view="DOCUMENT"):
        '''UPDATE simple corpus attributes (name, date).

        username / hyperdata updates are accepted but not implemented yet.
        '''
        project = session.query(Node).filter(Node.id == project_id, Node.typename == "PROJECT").first()
        # Fix: test for existence before dereferencing project.id
        if project is None:
            return Response({'detail' : "PROJECT Node #%s not found" %(project_id) },
                            status = HTTP_404_NOT_FOUND)
        project = check_rights(request, project.id)
        corpus = session.query(Node).filter(Node.id == corpus_id, Node.typename == "CORPUS").first()
        if corpus is None:
            return Response({'detail' : "CORPUS Node #%s not found" %(corpus_id) },
                            status = HTTP_404_NOT_FOUND)
        for key, val in request.data.items():
            if key in ["name", "date", "username", "hyperdata"]:
                if key == "username":
                    # change of owner -- not implemented yet
                    pass
                elif key == "hyperdata":
                    # updating some contextual values of the corpus -- TODO
                    pass
                else:
                    # Fix: the original wrote to an undefined `node`
                    setattr(corpus, key, val)
        session.add(corpus)
        session.commit()
        # Fix: the original format string "Updated corpus #" had no %s
        # placeholder, so the % operation raised TypeError.
        return Response({"detail": "Updated corpus #%s" % str(corpus.id)},
                        status=HTTP_202_ACCEPTED)

    def post(self, request, project_id, corpus_id):
        '''ADD a new RESOURCE to CORPUS.'''
        project = session.query(Node).filter(Node.id == project_id, Node.typename == "PROJECT").first()
        # Fix: test for existence before dereferencing project.id
        if project is None:
            return Response({'detail' : "PROJECT Node #%s not found" %(project_id) },
                            status = HTTP_404_NOT_FOUND)
        check_rights(request, project.id)
        corpus = session.query(Node).filter(Node.id == corpus_id, Node.typename == "CORPUS").first()
        if corpus is None:
            return Response({'detail' : "CORPUS Node #%s not found" %(corpus_id) },
                            status = HTTP_404_NOT_FOUND)
        # NOTE(review): the original method ends here -- the actual resource
        # creation is missing and the view implicitly returns None.
        # TODO implement or remove the route.
from gargantext.util.db_cache import cache
from gargantext.util.http import ValidationException, APIView \
, HttpResponse, JsonHttpResponse
from gargantext.util.toolchain.main import recount
from gargantext.util.scheduling import scheduled
from datetime import datetime
class CorpusMetrics(APIView):
    """Trigger background recomputation of a corpus' metrics."""

    def patch(self, request, corpusnode_id):
        """
        PATCH triggers recount of metrics for the specified corpus.
        ex PATCH http://localhost:8000/api/metrics/14072
                                                  -----
                                                  corpus_id
        """
        print("==> update metrics request on ", corpusnode_id)
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)
        try:
            corpus = cache.Node[int(corpusnode_id)]
        except:
            corpus = None
        if corpus is None:
            raise ValidationException("%s is not a valid corpus node id."
                                      % corpusnode_id)
        started_at = datetime.now()
        # =============
        scheduled(recount)(corpus.id)
        # =============
        elapsed = (datetime.now() - started_at).total_seconds()
        return JsonHttpResponse({
            'corpus_id' : corpusnode_id,
            'took': "%f s." % elapsed
        })
"""
API views for advanced operations on ngrams and ngramlists
-----------------------------------------------------------
- retrieve several lists together ("family")
- retrieve detailed list infos (ngram_id, term strings, scores...)
- modify NodeNgram lists (PUT/DEL an ngram to a MAINLIST OR MAPLIST...)
- modify NodeNgramNgram groups (PUT/DEL a list of groupings like {"767[]":[209,640],"779[]":[436,265,385]}")
"""
from gargantext.util.http import APIView, get_parameters, JsonHttpResponse,\
ValidationException, Http404, HttpResponse
from gargantext.util.db import session, aliased, bulk_insert
from gargantext.util.db_cache import cache
from sqlalchemy import tuple_
from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, NodeNgramNgram, Node
from gargantext.util.lists import UnweightedList, Translations
from gargantext.util.scheduling import scheduled
# useful subroutines
from gargantext.util.ngramlists_tools import query_list, export_ngramlists, \
import_ngramlists, merge_ngramlists, \
import_and_merge_ngramlists
from gargantext.util.group_tools import query_grouped_ngrams
class List(APIView):
    """
    Placeholder endpoint for a single ngram list (no methods yet).

    see already available API query api/nodes/<list_id>?fields[]=ngrams
    """
    pass
class CSVLists(APIView):
    """
    GET => CSV exports of all lists of a corpus

    POST => CSV import into existing lists as "post"

    PATCH => internal import into existing lists (?POSSIBILITY put it in another class ?)
    """
    def get(self, request):
        # target corpus comes from ?corpus=<id>
        params = get_parameters(request)
        corpus_id = int(params.pop("corpus"))
        corpus_node = cache.Node[corpus_id]
        # response is file-like + headers
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="corpus-%i_gargantext_term_list.csv"' % corpus_id
        # fill the response with the data
        export_ngramlists(corpus_node, fname=response, titles=True)
        return response

    def post(self,request):
        """
        Merge the lists of a corpus with other lists from a CSV source
        or from another corpus

        params in request.GET:
            onto_corpus: the corpus whose lists are getting patched

        params in request.data:
            csvfile: the csv file

        /!\ We assume we checked the file size client-side before upload
        """
        if not request.user.is_authenticated():
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res
        # the corpus with the target lists to be patched
        params = get_parameters(request)
        corpus_id = int(params.pop("onto_corpus"))
        corpus_node = cache.Node[corpus_id]
        # only the owner of the target corpus may patch its lists
        if request.user.id != corpus_node.user_id:
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res
        # request also contains the file
        # csv_file has type django.core.files.uploadedfile.InMemoryUploadedFile
        # ----------------------
        csv_file = request.data['csvfile']
        csv_contents = csv_file.read().decode("UTF-8").split("\n")
        csv_file.close()
        del csv_file
        # import the csv
        # try:
        log_msg = "Async generation"
        corpus_node_id = corpus_node.id
        # the merge itself runs in the background scheduler
        scheduled(import_and_merge_ngramlists)(csv_contents, corpus_node_id,
                                               overwrite=bool(params.get('overwrite')))
        return JsonHttpResponse({
            'log': log_msg,
        }, 200)
        # except Exception as e:
        #     return JsonHttpResponse({
        #         'err': str(e),
        #     }, 400)

    def patch(self,request):
        """
        A copy of POST (merging list) but with the source == just an internal corpus_id

        params in request.GET:
            onto_corpus: the corpus whose lists are getting patched
            from: the corpus from which we take the source lists to merge in
            todo: an array of the list types ("map", "main", "stop") to merge in

        NOTE(review): the docstring says "from" but the actual GET param
        read below is "from_corpus" -- confirm which one the client sends.
        """
        if not request.user.is_authenticated():
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res
        params = get_parameters(request)
        print(params)
        # the corpus with the target lists to be patched
        corpus_id = int(params.pop("onto_corpus"))
        corpus_node = cache.Node[corpus_id]
        print(params)
        # only the owner of the target corpus may patch its lists
        if request.user.id != corpus_node.user_id:
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res
        # short list-type names (as used in the "todo" param) -> node typenames
        list_types = {'map':'MAPLIST', 'main':'MAINLIST', 'stop':'STOPLIST'}
        # internal DB retrieve source_lists
        source_corpus_id = int(params.pop("from_corpus"))
        source_node = cache.Node[source_corpus_id]
        todo_lists = params.pop("todo").split(',')   # ex: ['map', 'stop']
        source_lists = {}
        for key in todo_lists:
            source_lists[key] = UnweightedList(
                source_node.children(list_types[key]).first().id
            )
        # add the groupings too
        source_lists['groupings'] = Translations(
            source_node.children("GROUPLIST").first().id
        )
        # attempt to merge and send response
        try:
            # merge the source_lists onto those of the target corpus
            delete = todo_lists if bool(params.get('overwrite')) else []
            if len(delete) == len(list_types):
                delete.append('groupings')
            log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node, del_originals=delete)
            return JsonHttpResponse({
                'log': log_msg,
            }, 200)
        except Exception as e:
            return JsonHttpResponse({
                'err': str(e),
            }, 400)
class GroupChange(APIView):
    """
    Modification of some groups
    (typically new subform nodes under a mainform)

    USAGE EXEMPLE:
    HOST/api/ngramlists/groups?node=43
                               vvvvvv
                               group node
                               to modify

    We use PUT HTTP method to send group data to DB and DELETE to remove them.
    They both use same data format in the url (see links_to_couples).

    No chained effects : simply adds or deletes rows of couples

    NB: request.user is also checked for current authentication status
    """
    def initial(self, request):
        """
        Before dispatching to post() or delete()

        Checks current user authentication to prevent remote DB manipulation
        """
        if not request.user.is_authenticated():
            raise Http404()
            # can't use return in initial() (although 401 maybe better than 404)
            # can't use @requires_auth because of positional 'self' within class

    def links_to_couples(self,params):
        """
        Flatten url group params into (mainform_id, subform_id) pairs.

        IN (dict from url params)
        ---
        params = {
            "mainform_A": ["subform_A1"]
            "mainform_B": ["subform_B1,subform_B2,subform_B3"]
            ...
        }

        OUT (for DB rows)
        ----
        couples = [
            (mainform_A , subform_A1),
            (mainform_B , subform_B1),
            (mainform_B , subform_B2),
            (mainform_B , subform_B3),
            ...
        ]
        """
        couples = []
        for (mainform_id, subforms_ids) in params.items():
            # each param value is a one-element list holding a comma-joined string
            for subform_id in subforms_ids[0].split(','):
                # append the couple
                couples.append((int(mainform_id),int(subform_id)))
        return couples

    def put(self, request):
        """
        Add some group elements to a group node
          => adds new couples from GroupsBuffer._to_add of terms view

        TODO see use of util.lists.Translations

        Parameters are all in the url (for symmetry with DELETE method)
           api/ngramlists/groups?node=783&1228[]=891,1639
                                   => creates 1228 - 891
                                          and 1228 - 1639

        general format is:   mainform_id[]=subform_id1,subform_id2 etc
                                   => creates mainform_id - subform_id1
                                          and mainform_id - subform_id2

        NB: also checks if the couples exist before because the ngram table
            will send the entire group (old existing links + new links)
        """
        # from the url
        params = get_parameters(request)
        # the node param is unique
        group_node = params.pop('node')
        # the others params are links to change
        couples = self.links_to_couples(params)
        # debug
        # print("==couples from url =================================++++=")
        # print(couples)
        # local version of "insert if not exists" -------------------->8--------
        # (1) check already existing elements
        check_query = (session.query(NodeNgramNgram)
            .filter(NodeNgramNgram.node_id == group_node)
            .filter(
                tuple_(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
                .in_(couples)
            )
        )
        existing = {}
        for synonyms in check_query.all():
            existing[(synonyms.ngram1_id,synonyms.ngram2_id)] = True
        # debug
        #print("==existing")
        #print(existing)
        # (2) compute difference locally
        couples_to_add = [(mform,sform) for (mform,sform)
                          in couples
                          if (mform,sform) not in existing]
        # debug
        # print("== couples_to_add =================================++++=")
        # print(couples_to_add)
        # (3) add new groupings
        bulk_insert(
            NodeNgramNgram,
            ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
            ((group_node, mainform, subform, 1.0) for (mainform,subform)
             in couples_to_add)
        )
        # ------------------------------------------------------------>8--------
        return JsonHttpResponse({
            'count_added': len(couples_to_add),
        }, 200)

    def delete(self, request):
        """
        Within a groupnode, deletes some group elements from some groups

        Data format just like in POST, everything in the url
        """
        # from the url
        params = get_parameters(request)
        # the node param is unique
        group_node = params.pop('node')
        # the others params are links to change
        couples_to_remove = self.links_to_couples(params)
        # debug
        # print("==couples_to_remove=================================dd=")
        # print(couples_to_remove)
        # remove selectively group_couples
        # using IN is correct in this case: list of ids is short and external
        # see stackoverflow.com/questions/444475/
        db_rows = (session.query(NodeNgramNgram)
            .filter(NodeNgramNgram.node_id == group_node)
            .filter(
                tuple_(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
                .in_(couples_to_remove)
            )
        )
        n_removed = db_rows.delete(synchronize_session=False)
        session.commit()
        return JsonHttpResponse({
            'count_removed': n_removed
        }, 200)
class ListChange(APIView):
    """
    Any ngram action on standard NodeNgram lists (MAIN, MAP, STOP)

    USAGE EXAMPLE:
       HOST/api/ngramlists/change?list=42&ngrams=1,2,3,4,5
                                  vvvvvv        ||||||
                                 old list       vvvvvv
                                to modify    new list items
                                    |            |
                                    v            v
            2 x UnweightedLists:  self.base_list  self.change_list

    We use DEL/PUT HTTP methods to differentiate the 2 basic rm/add actions
    They rely only on inline parameters (no need for payload data)

    No chained effects:  eg removing from MAPLIST will not remove
                         automatically from associated MAINLIST

    NB: request.user is also checked for current authentication status
    """

    def initial(self, request):
        """
        Before dispatching to put(), delete()...

        1) Checks current user authentication to prevent remote DB manipulation
        2) Prepares self.base_list and self.change_list from params
           (change_list items may come inline or in the request payload)
        """
        if not request.user.is_authenticated():
            raise Http404()
            # can't use return in initial() (although 401 maybe better than 404)
            # can't use @requires_auth because of positional 'self' within class

        # get validated params
        self.params = get_parameters(request)
        (self.base_list, self.change_list) = ListChange._validate(self.params)

        if not len(self.change_list.items):
            # change_list can be in payload too
            payload_ngrams = request.data['ngrams']
            change_ngram_ids = [int(n) for n in payload_ngrams.split(',')]
            if not len(change_ngram_ids):
                raise ValidationException('The "ngrams" parameter requires one or more ngram_ids separated by comma')
            else:
                self.change_list = UnweightedList(change_ngram_ids)

    def put(self, request):
        """
        Adds one or more ngrams to a list.
        NB: we assume ngram_ids don't contain subforms !!
           (this assumption is not checked here because it would be
            slow: if you want to add a subform, send the mainform's id)
        """
        # union of items ----------------------------
        new_list = self.base_list + self.change_list
        # -------------------------------------------
        # save
        new_list.save(self.base_list.id)
        return JsonHttpResponse({
            'parameters': self.params,
            'count_added': len(new_list.items) - len(self.base_list.items),
        }, 201)

    def delete(self, request):
        """
        Removes one or more ngrams from a list.
        """
        # removal (set difference) ------------------
        new_list = self.base_list - self.change_list
        # -------------------------------------------
        # save
        new_list.save(self.base_list.id)
        return JsonHttpResponse({
            'parameters': self.params,
            'count_removed': len(self.base_list.items) - len(new_list.items),
        }, 200)

    @staticmethod
    def _validate(params):
        """
        Checks "list" and "ngrams" parameters for their:
          - presence
          - type
        The "list" parameter is mandatory for any ListChange method;
        ngrams are also converted to an UnweightedList object for easy add/remove

        Returns a (base_list, change_list) pair of UnweightedLists.
        Raises ValidationException on missing/invalid "list" param.
        """
        if 'list' not in params:
            # FIX: example used to mention a wrong '?list_id=42' parameter name
            raise ValidationException('The route /api/ngramlists/change requires a "list" '
                                      'parameter, for instance /api/ngramlists/change?list=42')

        # 2 x retrieval => 2 x UnweightedLists
        # ------------------------------------
        try:
            base_list_id = int(params['list'])
        except (TypeError, ValueError):
            # FIX: was a bare `except:` (would swallow SystemExit etc.)
            raise ValidationException('The "list" parameter requires an existing list id.')
        # UnweightedList retrieved by id
        base_list = UnweightedList(base_list_id)

        try:
            change_ngram_ids = [int(n) for n in params['ngrams'].split(',')]
        except (KeyError, AttributeError, ValueError):
            # ngrams no longer mandatory inline, see payload check afterwards
            change_ngram_ids = []
        # UnweightedList created from items
        change_list = UnweightedList(change_ngram_ids)

        return (base_list, change_list)
class MapListGlance(APIView):
    """
    Fast infos about the maplist only

    HOST/api/ngramlists/glance?corpus=2
    HOST/api/ngramlists/glance?maplist=92&scoring=91

    REST Parameters:
    "maplist=92"
        the maplist to retrieve
    "corpus=ID"
        alternatively, the corpus to which the maplist belongs
    "scoring=ID"
        the occurrences-count node (optional with "corpus", required with "maplist")
    """
    def get(self, request):
        parameters = get_parameters(request)

        maplist_id = None
        scores_id = None

        if "corpus" in parameters:
            corpus_id = parameters['corpus']
            corpus = cache.Node[corpus_id]
            maplist_id = corpus.children('MAPLIST').first().id
            # with a corpus_id, the explicit scoring pointer is optional
            if "scoring" in parameters:
                scores_id = parameters['scoring']
            else:
                scores_id = corpus.children('OCCURRENCES').first().id
        elif "maplist" in parameters and "scoring" in parameters:
            # FIX: used to read the non-existent 'mainlist' key => KeyError
            maplist_id = int(parameters['maplist'])
            scores_id = int(parameters['scoring'])
        else:
            raise ValidationException("A 'corpus' id or 'maplist' id is required, and a 'scoring' for occurences counts")

        ngraminfo = {}               # ngram details sorted per ngram id
        listmembers = {'maplist': []}  # ngram ids sorted per list name

        # infos for all ngrams from maplist
        map_ngrams = query_list(maplist_id, details=True,
                                scoring_metric_id=scores_id).all()
        # ex: [(8805, 'mean age', 4.0),
        #      (1632, 'activity', 4.0),
        #      (8423, 'present', 2.0),
        #      (2928, 'objective', 2.0)]

        # shortcut to useful function during loop
        add_to_members = listmembers['maplist'].append

        for ng in map_ngrams:
            ng_id = ng[0]
            ngraminfo[ng_id] = ng[1:]
            # maplist ngrams will already be <=> ngraminfos
            # but the client side expects a membership lookup
            # as when there are multiple lists or some groupings
            add_to_members(ng_id)

        return JsonHttpResponse({
            'ngraminfos': ngraminfo,
            'listmembers': listmembers,
            'links': {},    # no grouping links sent during glance (for speed)
            'nodeids': {
                'mainlist': None,
                'maplist': maplist_id,
                'stoplist': None,
                'groups': None,
                'scores': None,
            }
        })
class ListFamily(APIView):
    """
    Compact combination of *multiple* list info
       custom made for the "terms" view
    ---
    Sends all JSON info of a collection of the 4 list types of a corpus
    (or for any combination of lists that go together):
      - a mainlist
      - an optional stoplist
      - an optional maplist
      - an optional grouplist

    USAGE EXAMPLES
    HOST/api/ngramlists/family?corpus=2
    HOST/api/ngramlists/family?corpus=2&head=10
    HOST/api/ngramlists/family?mainlist=91&scoring=94
    HOST/api/ngramlists/family?mainlist=91&scoring=94&head=10
    HOST/api/ngramlists/family?mainlist=91&stoplist=90&scoring=94
    etc.

    REST Parameters:
    "head=20"
        use pagination to only load the k top ngrams of the mainlist
        (useful for fast loading of terms view) [CURRENTLY NOT USED]
    "corpus=ID"
        the corpus id to retrieve all 4 lists
    "scoring=ID"
        the scoring node (defaults to the OCCURRENCES child of the corpus)
    "mainlist=ID&scoring=ID[&stoplist=ID&groups=ID&maplist=ID]"
        alternative call syntax without specifying a corpus
        (uses all explicit IDs of the lists => gives the possibility for custom term views)
    """
    def get(self, request):
        parameters = get_parameters(request)
        glance_limit = None
        mainlist_id = None
        scores_id = None
        groups_id = None
        other_list_ids = {'maplist': None, 'stoplist': None}

        # 1) retrieve a mainlist_id and other lists
        ##########################################

        # simple request: just refers to the parent corpus
        # ------------------------------------------------
        if "corpus" in parameters:
            corpus_id = parameters['corpus']
            corpus = cache.Node[corpus_id]
            # with a corpus_id, the explicit scoring pointer is optional
            if "scoring" in parameters:
                scores_id = parameters['scoring']
            else:
                scores_id = corpus.children('OCCURRENCES').first().id
            # retrieve the family of lists that have corpus as parent
            mainlist_id = corpus.children('MAINLIST').first().id
            groups_id = corpus.children('GROUPLIST').first().id
            other_list_ids['stoplist'] = corpus.children('STOPLIST').first().id
            other_list_ids['maplist'] = corpus.children('MAPLIST').first().id

        # custom request: refers to each list individually
        # -------------------------------------------------
        elif "mainlist" in parameters and "scoring" in parameters:
            mainlist_id = parameters['mainlist']
            scores_id = parameters['scoring']
            groups_id = None
            if 'groups' in parameters:
                # FIX: used to read parameters['scoring'] by mistake,
                # which silently ignored the explicit 'groups' id
                groups_id = parameters['groups']
            for k in ['stoplist', 'maplist']:
                if k in parameters:
                    other_list_ids[k] = parameters[k]

        # or request has an error
        # -----------------------
        else:
            raise ValidationException(
                "Either a 'corpus' parameter or 'mainlist' & 'scoring' params are required"
            )

        # 2) get the infos for each list
        ################################
        ngraminfo = {}     # ngram details sorted per ngram id
        linkinfo = {}      # ngram groups sorted per ngram id
        listmembers = {}   # ngram ids sorted per list name
        if "head" in parameters:
            # head <=> only mainlist AND only k top ngrams
            glance_limit = int(parameters['head'])
            mainlist_query = query_list(mainlist_id, details=True,
                                        pagination_limit=glance_limit,
                                        scoring_metric_id=scores_id)
        else:
            # infos for all ngrams from mainlist
            mainlist_query = query_list(mainlist_id, details=True,
                                        scoring_metric_id=scores_id)
            # infos for grouped ngrams, absent from mainlist
            hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True)

            # infos for stoplist terms, absent from mainlist
            stop_ngrams_query = query_list(other_list_ids['stoplist'], details=True,
                                           scoring_metric_id=scores_id)

            # and for the other lists (stop and map)
            # no details needed here, just the member ids
            for li in other_list_ids:
                li_elts = query_list(other_list_ids[li], details=False).all()
                # simple array of ngram_ids
                listmembers[li] = [ng[0] for ng in li_elts]

            # and the groupings
            if groups_id:
                links = Translations(groups_id)
                linkinfo = links.groups

        # list of ngrams sent with their details
        ngrams_which_need_detailed_info = []
        if "head" in parameters:
            # head triggered simplified form: just the top of the mainlist
            # TODO add maplist membership
            ngrams_which_need_detailed_info = mainlist_query.all()
        else:
            ngrams_which_need_detailed_info = mainlist_query.all() + hidden_ngrams_query.all() + stop_ngrams_query.all()

        # the output form of details is:
        # ngraminfo[id] => [term, weight]
        for ng in ngrams_which_need_detailed_info:
            ng_id = ng[0]
            ngraminfo[ng_id] = ng[1:]
            # NB the client js will sort mainlist ngs from hidden ngs after ajax
            #    using linkinfo (otherwise needs redundant listmembers for main)

        return JsonHttpResponse({
            'ngraminfos': ngraminfo,
            'listmembers': listmembers,
            'links': linkinfo,
            'nodeids': {
                'mainlist': mainlist_id,
                'maplist': other_list_ids['maplist'],
                'stoplist': other_list_ids['stoplist'],
                'groups': groups_id,
                'scores': scores_id,
            }
        })
from gargantext.util.http import ValidationException, APIView \
, get_parameters, JsonHttpResponse\
, HttpResponse
from gargantext.util.db import session, func
from gargantext.util.db_cache import cache
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from sqlalchemy.orm import aliased
from re import findall
# ngrams put() will implement same text cleaning procedures as toolchain
from gargantext.util.toolchain.parsing import normalize_chars
from gargantext.util.toolchain.ngrams_extraction import normalize_forms
# for indexing
from gargantext.util.toolchain.ngrams_addition import index_new_ngrams
class ApiNgrams(APIView):
    """Listing (GET) and creation (PUT) of ngrams."""

    def get(self, request):
        """
        Used for analytics
        ------------------
        Get ngram listing + counts in a given scope

        Optional GET filters: startwith, contain, corpus_id, ngram_id
        Pagination params: offset (default 0), limit (default 20)
        """
        # query ngrams
        # FIX: removed two dead locals (an escaped-but-unused copy of
        # 'startwith' and an unused aliased(Node)) and debug prints
        ngrams_query = (session
            .query(Ngram.id, Ngram.terms, func.sum(NodeNgram.weight).label('count'))
            .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
            .join(Node, Node.id == NodeNgram.node_id)
            .group_by(Ngram.id, Ngram.terms)
            .order_by(func.sum(NodeNgram.weight).desc(), Ngram.terms)
        )
        # filters
        if 'startwith' in request.GET:
            ngrams_query = ngrams_query.filter(Ngram.terms.startswith(request.GET['startwith']))
        if 'contain' in request.GET:
            ngrams_query = ngrams_query.filter(Ngram.terms.contains(request.GET['contain']))
        if 'corpus_id' in request.GET:
            corpus_id_list = list(map(int, request.GET.get('corpus_id', '').split(',')))
            if corpus_id_list and corpus_id_list[0]:
                ngrams_query = ngrams_query.filter(Node.parent_id.in_(corpus_id_list))
        if 'ngram_id' in request.GET:
            ngram_id_list = list(map(int, request.GET.get('ngram_id', '').split(',')))
            if ngram_id_list and ngram_id_list[0]:
                ngrams_query = ngrams_query.filter(Ngram.id.in_(ngram_id_list))

        # pagination
        offset = int(request.GET.get('offset', 0))
        limit = int(request.GET.get('limit', 20))
        total = ngrams_query.count()

        # return formatted result
        return JsonHttpResponse({
            'pagination': {
                'offset': offset,
                'limit': limit,
                'total': total,
            },
            'data': [
                {
                    'id': ngram.id,
                    'terms': ngram.terms,
                    'count': ngram.count,
                }
                for ngram in ngrams_query[offset : offset + limit]
            ],
        })

    def put(self, request):
        """
        Basic external access for *creating an ngram*
        ---------------------------------------------
        1 - checks user authentication before any changes
        2 - checks if ngram to Ngram table in DB
             if yes returns ngram_id and optionally mainform_id
             otherwise continues
        3 - adds the ngram to Ngram table in DB
        4 - (if corpus param is present)
            adds the ngram doc counts to NodeNgram table in DB
            (aka "index the ngram" throught the docs of the corpus)
        5 - returns json with:
             'msg'   => a success msg
             'text'  => the initial text content
             'term'  => the normalized text content
             'id'    => the new ngram_id
             'count' => the number of docs with the ngram in the corpus
                        (if corpus param is present)
             'group' => the mainform_id if applicable

        possible inline parameters
        --------------------------
        @param text=<ngram_string>  [required]
        @param corpus=<CORPUS_ID>   [optional]
        @param testgroup (true if present) [optional, requires corpus]
        """
        # 1 - check user authentication
        if not request.user.is_authenticated():
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res

        # the params
        params = get_parameters(request)
        if 'text' in params:
            original_text = str(params.pop('text'))
            ngram_str = normalize_forms(normalize_chars(original_text))
        else:
            raise ValidationException('The route PUT /api/ngrams/ is used to create a new ngram\
                                       It requires a "text" parameter,\
                                       for instance /api/ngrams?text=hydrometallurgy')

        if ('testgroup' in params) and (not ('corpus' in params)):
            raise ValidationException("'testgroup' param requires 'corpus' param")

        # if we have a 'corpus' param (to do the indexing)...
        do_indexation = False
        if 'corpus' in params:
            # we retrieve the corpus...
            corpus_id = int(params.pop('corpus'))
            corpus_node = cache.Node[corpus_id]
            # and the user must also have rights on the corpus
            if request.user.id == corpus_node.user_id:
                do_indexation = True
            else:
                res = HttpResponse("Unauthorized")
                res.status_code = 401
                return res

        # number of "words" in the ngram
        ngram_size = len(findall(r' +', ngram_str)) + 1

        # do the additions
        try:
            log_msg = ""
            ngram_id = None
            mainform_id = None

            preexisting = session.query(Ngram).filter(Ngram.terms == ngram_str).first()

            if preexisting is not None:
                ngram_id = preexisting.id
                log_msg += "ngram already existed (id %i)\n" % ngram_id

                # in the context of a corpus we can also check if has mainform
                if 'testgroup' in params:
                    groupings_id = (session.query(Node.id)
                                    .filter(Node.parent_id == corpus_id)
                                    .filter(Node.typename == 'GROUPLIST')
                                    .first()
                                    )
                    had_mainform = (session.query(NodeNgramNgram.ngram1_id)
                                    .filter(NodeNgramNgram.node_id == groupings_id)
                                    .filter(NodeNgramNgram.ngram2_id == preexisting.id)
                                    .first()
                                    )
                    if had_mainform:
                        mainform_id = had_mainform[0]
                        log_msg += "ngram had mainform (id %i) in this corpus" % mainform_id
                    else:
                        log_msg += "ngram was not in any group for this corpus"
            else:
                # 2 - insert into Ngrams
                new_ngram = Ngram(terms=ngram_str, n=ngram_size)
                session.add(new_ngram)
                session.commit()
                ngram_id = new_ngram.id
                log_msg += "ngram was added with new id %i\n" % ngram_id

            # 3 - index the term
            if do_indexation:
                n_added = index_new_ngrams([ngram_id], corpus_node)
                log_msg += 'ngram indexed in corpus %i\n' % corpus_id

            return JsonHttpResponse({
                'msg': log_msg,
                'text': original_text,
                'term': ngram_str,
                'id': ngram_id,
                'group': mainform_id,
                'count': n_added if do_indexation else 'no corpus provided for indexation'
            }, 200)

        # just in case
        except Exception as e:
            return JsonHttpResponse({
                'msg': str(e),
                'text': original_text
            }, 400)
from gargantext.models import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNode
from gargantext.constants import NODETYPES, DEFAULT_N_DOCS_HAVING_NGRAM
from gargantext.util.db import session, delete, func, bulk_insert
from gargantext.util.db_cache import cache, or_
from gargantext.util.validation import validate
from gargantext.util.http import ValidationException, APIView \
, get_parameters, JsonHttpResponse, Http404\
, HttpResponse
from .api import *
from collections import defaultdict
import csv
# Node fields a client may request through ?fields[]=...
_node_available_fields = ['id', 'parent_id', 'name', 'typename', 'hyperdata', 'ngrams', 'date']
# Node fields returned when no ?fields[]=... parameter is given
_node_default_fields = ['id', 'parent_id', 'name', 'typename']
# Node typenames accepted by the ?types[]=... filter (see gargantext.constants)
_node_available_types = NODETYPES
# hyperdata subfields a client may select through ?hyperdata_filter[]=...
_hyperdata_available_fields = ['title', 'source', 'abstract', 'statuses',
    'language_name', 'language_iso3','language_iso2','language_id',
    'publication_date',
    'publication_year','publication_month', 'publication_day',
    'publication_hour','publication_minute','publication_second']
#_node_available_formats = ['json', 'csv', 'bibex']
def _query_nodes(request, node_id=None):
    """
    Build a filtered, ordered and paginated query over the current user's nodes.

    @param request: an authenticated Django request (GET params are validated)
    @param node_id: optional id to restrict the query to a single node

    Returns (parameters, query, count):
      - parameters: the validated GET parameters
      - query: the sliced node query (list-like after slicing)
      - count: total number of matching nodes BEFORE pagination
    Raises TypeError if the request comes from an anonymous user.
    """
    if request.user.id is None:
        raise TypeError("This API request must come from an authenticated user.")
    else:
        # we query among the nodes that belong to this user
        user = cache.User[request.user.id]

    # parameters validation
    # fixme: this validation does not allow custom keys in url (eg '?name=' for rename action)
    parameters = get_parameters(request)
    parameters = validate(parameters, {'type': dict, 'items': {
        'formated': {'type': str, 'required': False, 'default': 'json'},
        'pagination_limit': {'type': int, 'default': 10},
        'pagination_offset': {'type': int, 'default': 0},
        'fields': {'type': list, 'default': _node_default_fields, 'items': {
            'type': str, 'range': _node_available_fields,
        }},
        # choice of hyperdata fields
        'hyperdata_filter': {'type': list, 'required': False,
                             'items': {
                                 'type': str, 'range': _hyperdata_available_fields,
                             }},
        # optional filtering parameters
        'types': {'type': list, 'required': False, 'items': {
            'type': str, 'range': _node_available_types,
        }},
        'parent_id': {'type': int, 'required': False},
    }})

    # additional validation for hyperdata_filter
    if (('hyperdata_filter' in parameters)
            and (not ('hyperdata' in parameters['fields']))):
        raise ValidationException("Using the hyperdata_filter filter requires fields[]=hyperdata")

    # start the query
    query = user.nodes()

    # filter by id
    if node_id is not None:
        query = query.filter(Node.id == node_id)
    # filter by type
    if 'types' in parameters:
        query = query.filter(Node.typename.in_(parameters['types']))
    # filter by parent
    if 'parent_id' in parameters:
        query = query.filter(Node.parent_id == parameters['parent_id'])
    # count (before pagination)
    count = query.count()
    # order
    query = query.order_by(Node.hyperdata['publication_date'], Node.id)

    # paginate the query
    if parameters['pagination_limit'] == -1:
        # -1 means "no limit": everything from the offset on
        query = query[parameters['pagination_offset']:]
    else:
        # FIX: slice end used to be just `pagination_limit`, which made the
        # limit behave as an absolute end index instead of a page size
        # (cf. the `[offset : offset+limit]` pagination used elsewhere)
        query = query[parameters['pagination_offset']:
                      parameters['pagination_offset'] + parameters['pagination_limit']]

    # return the result!
    # (the receiver function does the filtering of fields and hyperdata_filter)
    return parameters, query, count
def _filter_node_fields(node, parameters):
"""
Filters the properties of a Node object before sending them to response
@parameters: a dict comming from get_parameters
that must only contain a 'fields' key
Usually the dict looks like this :
{'fields': ['parent_id', 'id', 'name', 'typename', 'hyperdata'],
'hyperdata_filter': ['title'], 'parent_id': '55054',
'types': ['DOCUMENT'], 'pagination_limit': '15'}
History:
1) this used to be single line:
res = {field: getattr(node, field) for field in parameters['fields']}
2) it was in both NodeResource.get() and NodeListResource.get()
3) it's now expanded to add support for parameters['hyperdata_filter']
- if absent, entire hyperdata is considered as one field
(as before)
- if present, the hyperdata subfields are picked
(new)
"""
# FIXME all this filtering
# could be done in rawsql
# (in _query_nodes)
result = {}
for field in parameters['fields']:
# normal field or entire hyperdata
if field != 'hyperdata' or (not 'hyperdata_filter' in parameters):
result[field] = getattr(node,field)
# hyperdata if needs to be filtered
else:
this_filtered_hyp = {}
for hfield in parameters['hyperdata_filter']:
if hfield in node.hyperdata:
this_filtered_hyp[hfield] = node.hyperdata[hfield]
result['hyperdata'] = this_filtered_hyp
return result
class Status(APIView):
    '''API endpoint that represent the current status of the node'''
    renderer_classes = (JSONRenderer, BrowsableAPIRenderer)

    def get(self, request, node_id):
        """Return the 'statuses' hyperdata of one of the user's nodes."""
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)
        user = cache.User[request.user.id]
        # the filter on user_id below does the job of check_rights
        node = session.query(Node).filter(Node.id == node_id, Node.user_id == user.id).first()
        if node is None:
            return Response({"detail": "Node not Found for this user"}, status=HTTP_404_NOT_FOUND)
        else:
            # FIXME using the more generic strategy ---------------------------
            # context = format_response(node, [n for n in node.children()])
            # or perhaps ? context = format_response(None, [node])
            # -----------------------------------------------------------------
            # using a more direct strategy
            context = {}
            try:
                context["statuses"] = node.hyperdata["statuses"]
            except KeyError:
                context["statuses"] = None
            return Response(context)

    def post(self, request, node_id):
        '''create a new status for node (not implemented)'''
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)
        raise NotImplementedError

    def put(self, request, node_id):
        '''update status for node (not implemented)'''
        # FIX: signature used to be (self, request, data) while the body
        # referenced an undefined `node_id` => NameError before the intended
        # NotImplementedError
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)
        user = cache.User[request.user.id]
        node = session.query(Node).filter(Node.id == node_id, Node.user_id == user.id).first()
        raise NotImplementedError
        #return Response({"detail":"Udpated status for NODE #%i " %node.id}, status=HTTP_202_ACCEPTED)

    def delete(self, request, node_id):
        '''delete status for node'''
        # FIX: signature used to be (self, request) while the body referenced
        # an undefined `node_id` => NameError at runtime
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)
        user = cache.User[request.user.id]
        node = session.query(Node).filter(Node.id == node_id, Node.user_id == user.id).first()
        if node is None:
            return Response({"detail": "Node not Found"}, status=HTTP_404_NOT_FOUND)
        node.hyperdata["status"] = []
        session.add(node)
        session.commit()
        return Response({"detail": "Deleted status for NODE #%i " % node.id}, status=HTTP_204_NO_CONTENT)
class NodeListResource(APIView):
    """Collection endpoint over the user's nodes (list / delete)."""

    def get(self, request):
        """Displays the list of nodes corresponding to the query,
        either as JSON records or as a tab-separated CSV export
        (depending on the 'formated' parameter).
        """
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)
        parameters, query, count = _query_nodes(request)

        if parameters['formated'] == 'json':
            records_array = []
            add_record = records_array.append
            # FIXME filter in rawsql in _query_nodes
            for node in query:
                add_record(_filter_node_fields(node, parameters))
            return JsonHttpResponse({
                'parameters': parameters,
                'count': count,
                'records': records_array
            })

        elif parameters['formated'] == 'csv':
            # TODO add support for fields and hyperdata_filter
            response = HttpResponse(content_type='text/csv')
            response['Content-Disposition'] = 'attachment; filename="Gargantext_Corpus.csv"'

            writer = csv.writer(response, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
            keys = ['title', 'source',
                    'publication_year', 'publication_month', 'publication_day',
                    'abstract', 'authors']
            writer.writerow(keys)
            for node in query:
                data = list()
                for key in keys:
                    try:
                        data.append(node.hyperdata[key])
                    except (KeyError, TypeError):
                        # FIX: was a bare `except:`; missing hyperdata keys
                        # become empty CSV cells
                        data.append("")
                writer.writerow(data)
            return response

    def post(self, request):
        """Create a new node.
        NOT IMPLEMENTED
        """

    def delete(self, request):
        """Removes the list of nodes corresponding to the query.
        TODO : Should be a delete method!
        """
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)
        parameters = get_parameters(request)
        parameters = validate(parameters, {'ids': list})
        try:
            node_ids = [int(n) for n in parameters['ids'].split(',')]
        except (KeyError, AttributeError, ValueError):
            # FIX: was a bare `except:`
            raise ValidationException('"ids" needs integers separated by comma.')

        try:
            result = session.execute(
                delete(Node).where(Node.id.in_(node_ids))
            )
            session.commit()
        finally:
            session.close()

        return JsonHttpResponse({'deleted': result.rowcount})
class NodeListHaving(APIView):
    '''
    Gives a list of nodes according to its score which is related
    to some specific ngrams.
    TODO: implement other options (offset)

    Simple implementation:
    Takes IDs of corpus and ngram and returns list of relevent documents in json format
    according to TFIDF score (order is decreasing).

    2016-09: add total counts to output json
    '''
    def get(self, request, corpus_id):
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)
        parameters = get_parameters(request)
        parameters = validate(parameters, {'score': str, 'ngram_ids': list})
        try:
            ngram_ids = [int(n) for n in parameters['ngram_ids'].split(',')]
        except (KeyError, AttributeError, ValueError):
            # FIX: was a bare `except :`
            raise ValidationException('"ngram_ids" needs integers separated by comma.')

        limit = DEFAULT_N_DOCS_HAVING_NGRAM
        nodes_list = []

        corpus = session.query(Node).filter(Node.id == corpus_id).first()

        tfidf_id = (session.query(Node.id)
                           .filter(Node.typename == "TFIDF-CORPUS",
                                   Node.parent_id == corpus.id)
                           .first()
                    )
        # FIX: guard against a missing TFIDF node (tfidf_id[0] used to raise
        # TypeError => 500 when scores were not computed yet)
        if tfidf_id is None:
            raise ValidationException(
                'No TFIDF-CORPUS node found for corpus %s' % str(corpus_id))
        tfidf_id = tfidf_id[0]

        # request data
        nodes_query = (session
            .query(Node, func.sum(NodeNodeNgram.score))
            .join(NodeNodeNgram, NodeNodeNgram.node2_id == Node.id)
            .filter(NodeNodeNgram.node1_id == tfidf_id)
            .filter(Node.typename == 'DOCUMENT', Node.parent_id == corpus.id)
            # equivalent to the previous or_() chain over each ngram_id
            .filter(NodeNodeNgram.ngram_id.in_(ngram_ids))
            .group_by(Node)
        )

        # get the total count before applying limit
        nodes_count = nodes_query.count()

        # now the query with the limit
        nodes_results_query = (nodes_query
                               .order_by(func.sum(NodeNodeNgram.score).desc())
                               .limit(limit)
                               )

        for node, score in nodes_results_query:
            node_dict = {
                'id': node.id,
                'score': score,
            }
            for key in ('title', 'publication_date', 'source', 'authors', 'fields'):
                if key in node.hyperdata:
                    node_dict[key] = node.hyperdata[key]
            nodes_list.append(node_dict)

        return JsonHttpResponse({
            'count': nodes_count,
            'records': nodes_list
        })
class NodeResource(APIView):
    """Single-node endpoint: retrieve, delete or rename one node."""

    # contains a check on user.id (within _query_nodes)
    def get(self, request, node_id):
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)
        parameters, query, count = _query_nodes(request, node_id)
        if not len(query):
            raise Http404()
        node = query[0]
        return JsonHttpResponse(_filter_node_fields(node, parameters))

    # contains a check on user.id (within _query_nodes)
    def delete(self, request, node_id):
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)
        parameters, query, count = _query_nodes(request, node_id)
        if not len(query):
            raise Http404()
        try:
            result = session.execute(
                delete(Node).where(Node.id == node_id)
            )
            session.commit()
        finally:
            session.close()
        return JsonHttpResponse({'deleted': result.rowcount})

    def post(self, request, node_id):
        """
        For the moment, only used to rename a node

        params in request.GET:
            none (not allowed by _query_nodes validation)
        params in request.DATA:
            ["name": the_new_name_str]

        TODO 1 factorize with .projects.ProjectView.put and .post (thx c24b)
        TODO 2 allow other changes than name
        """
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        # contains a check on user.id (within _query_nodes)
        parameters, query, count = _query_nodes(request, node_id)
        # FIX: guard against missing node (query.pop() on an empty result
        # used to raise IndexError => 500 instead of a 404)
        if not len(query):
            raise Http404()
        the_node = query.pop()

        # retrieve the name
        if 'name' in request.data:
            new_name = request.data['name']
        else:
            return JsonHttpResponse({
                "detail": "A 'name' parameter is required in data payload"
            }, 400)

        # check for conflicts
        other = session.query(Node).filter(Node.name == new_name).count()
        if other > 0:
            return JsonHttpResponse({
                "detail": "A node with this name already exists"
            }, 409)

        # normal case: do the renaming
        else:
            setattr(the_node, 'name', new_name)
            session.commit()
            return JsonHttpResponse({
                'renamed': new_name
            }, 200)
class CorpusFavorites(APIView):
    """Retrieve/update/delete one or several docs from a corpus associated favs
    (url: GET  /api/nodes/<corpus_id>/favorites)
         => lists all favorites
    (url: GET  /api/nodes/<corpus_id>/favorites?docs[]=doc1,doc2)
         => checks for each doc if it is in favorites
    (url: DEL  /api/nodes/<corpus_id>/favorites?docs[]=doc1,doc2)
         => removes each doc from favorites
    (url: PUT  /api/nodes/<corpus_id>/favorites?docs[]=doc1,doc2)
         => add each doc to favorites
    """

    def _get_fav_node(self, corpus_id):
        """
        Retrieve the FAVORITES child node of a corpus (also sets self.corpus).

        NB: fav_node can be None if no node is defined

        this query could be faster if we didn't check that corpus_id is a CORPUS
            ie: session.query(Node)
                       .filter(Node.parent_id==corpus_id)
                       .filter(Node.typename =='FAVORITES')
        """
        corpus = cache.Node[corpus_id]
        if corpus.typename != 'CORPUS':
            raise ValidationException(
                "Only nodes of type CORPUS can accept favorites queries" +
                " (but this node has type %s)..." % corpus.typename)
        else:
            self.corpus = corpus
        fav_node = self.corpus.children('FAVORITES').first()
        return fav_node

    def get(self, request, corpus_id):
        """
        2 possibilities with/without param

        1) GET http://localhost:8000/api/nodes/2/favorites
        (returns the full list of fav docs within corpus 2)

        2) GET http://localhost:8000/api/nodes/2/favorites?docs=53,54
        (will test if docs 53 and 54 are among the favorites of corpus 2)
        (returns the intersection of fav docs with [53,54])
        """
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        fav_node = self._get_fav_node(corpus_id)

        req_params = validate(
            get_parameters(request),
            {'docs': list, 'default': ""}
        )

        response = {}

        # FIX (idiom): `fav_node == None` => `fav_node is None` (here and below)
        if fav_node is None:
            response = {
                'warning': 'No favorites node is defined for this corpus (\'%s\')'
                           % self.corpus.name,
                'favdocs': []
            }
        elif 'docs' not in req_params:
            # each docnode associated to the favnode of this corpusnode
            q = (session
                 .query(NodeNode.node2_id)
                 .filter(NodeNode.node1_id == fav_node.id))
            all_doc_ids = [row.node2_id for row in q.all()]
            response = {
                'favdocs': all_doc_ids
            }
        else:
            nodeids_to_check = [int(did) for did in req_params['docs'].split(',')]
            # each docnode from the input list, if it is associated to the favnode
            q = (session
                 .query(NodeNode.node2_id)
                 .filter(NodeNode.node1_id == fav_node.id)
                 .filter(NodeNode.node2_id.in_(nodeids_to_check)))
            present_doc_ids = [row.node2_id for row in q.all()]
            absent_doc_ids = [did for did in nodeids_to_check if did not in present_doc_ids]
            response = {
                'favdocs': present_doc_ids,
                'missing': absent_doc_ids
            }

        return JsonHttpResponse(response)

    def delete(self, request, corpus_id):
        """
        DELETE http://localhost:8000/api/nodes/2/favorites?docs=53,54
        (will delete docs 53 and 54 from the favorites of corpus 2)
        """
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        # user is ok
        fav_node = self._get_fav_node(corpus_id)

        response = {}

        if fav_node is None:
            response = {
                'warning': 'No favorites node is defined for this corpus (\'%s\')'
                           % self.corpus.name,
                'count_removed': 0
            }
        else:
            req_params = validate(
                get_parameters(request),
                {'docs': list, 'default': ""}
            )
            nodeids_to_delete = [int(did) for did in req_params['docs'].split(',')]

            try:
                # it deletes from favourites but not from DB
                result = session.execute(
                    delete(NodeNode)
                    .where(NodeNode.node1_id == fav_node.id)
                    .where(NodeNode.node2_id.in_(nodeids_to_delete))
                )
                session.commit()
                response = {'count_removed': result.rowcount}
            finally:
                session.close()

        return JsonHttpResponse(response)

    def put(self, request, corpus_id, check_each_doc=True):
        """
        PUT http://localhost:8000/api/nodes/2/favorites?docs=53,54
        (adds docs 53 and 54 to the favorites of corpus 2)

        @param check_each_doc: when True, verify each doc really is a
               DOCUMENT child of this corpus before adding it
        """
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        # user is ok
        fav_node = self._get_fav_node(corpus_id)

        response = {}

        if fav_node is None:
            response = {
                'warning': 'No favorites node is defined for this corpus (\'%s\')'
                           % self.corpus.name,
                'count_added': 0
            }
        else:
            req_params = validate(
                get_parameters(request),
                {'docs': list, 'default': ""}
            )
            nodeids_to_add = [int(did) for did in req_params['docs'].split(',')]

            if check_each_doc:
                # check that these really are documents of this corpus
                # (a bit slow => disable by default?)
                known_docs_q = (session
                                .query(Node.id)
                                .filter(Node.parent_id == corpus_id)
                                .filter(Node.typename == 'DOCUMENT')
                                )
                lookup = {known_doc.id: True for known_doc in known_docs_q.all()}
                rejected_list = []
                for doc_node_id in nodeids_to_add:
                    if (doc_node_id not in lookup):
                        rejected_list.append(doc_node_id)
                if len(rejected_list):
                    raise ValidationException(
                        "Error on some requested docs: %s (Only nodes of type 'doc' AND belonging to corpus %i can be added to favorites.)"
                        % (str(rejected_list), int(corpus_id)))

            # add them
            bulk_insert(
                NodeNode,
                ('node1_id', 'node2_id', 'score'),
                ((fav_node.id, doc_node_id, 1.0) for doc_node_id in nodeids_to_add)
            )

            # todo count really added (here: counts input param not result)
            response = {'count_added': len(nodeids_to_add)}

        return JsonHttpResponse(response)
class CorpusFacet(APIView):
    """Aggregate a corpus node's documents by one hyperdata field.

    (url: /api/nodes/<node_id>/facets?hyperfield=<source>)
    """
    # - old url: '^project/(\d+)/corpus/(\d+)/source/sources.json$',
    # - old view: tests.ngramstable.views.get_sourcess_json()
    # - now generalized for various hyperdata fields:
    #   source, publication_year, rubrique, language...

    def get(self, request, node_id):
        """Return {'doc_count': N, 'by': {<hyperfield>: {value: count}}}."""
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        # check that the node is a corpus
        # ? faster from cache than: corpus = session.query(Node)...
        corpus = cache.Node[node_id]
        if corpus.typename != 'CORPUS':
            raise ValidationException(
                "Only nodes of type CORPUS can accept facet queries" +
                " (but this node has type %s)..." % corpus.typename
            )
        self.corpus = corpus

        # subfields accepted for faceting
        allowed_subfields = [
            'source', 'publication_year', 'rubrique',
            'language_iso2', 'language_iso3', 'language_name',
            'authors'
        ]
        # validate() triggers an info message if subfield not in range
        checked_params = validate(get_parameters(request), {'type': dict, 'items': {
            'hyperfield': {'type': str, 'range': allowed_subfields}
        }})
        subfield = checked_params['hyperfield']

        # do the aggregated sum
        (xcounts, total) = self._ndocs_by_facet(subfield)

        # response
        return JsonHttpResponse({
            'doc_count' : total,
            'by': { subfield: xcounts }
        })

    def _ndocs_by_facet(self, subfield='source'):
        """Count docs per value of `subfield`; docs lacking the field are
        tallied under "_NA_". For example on 'source':
        {'j good sci' : 25, 'nature' : 32, 'j bla bla' : 1 ...}"""
        facet_counts = defaultdict(int)
        n_docs = 0
        for doc in self.corpus.children(typename='DOCUMENT'):
            key = doc.hyperdata[subfield] if subfield in doc.hyperdata else "_NA_"
            facet_counts[key] += 1
            n_docs += 1
        # the counts below could also be memoized
        # // if subfield not in corpus.aggs:
        # //    corpus.aggs[subfield] = facet_counts
        return (facet_counts, n_docs)
from .api import * #notamment APIView, check_rights, format_response
from gargantext.util.http import *
from django.core.exceptions import *
from collections import defaultdict
from gargantext.util.toolchain import *
import copy
from gargantext.util.db import session
class ProjectList(APIView):
    '''API endpoint that represents the list of projects owned by a user'''
    renderer_classes = (JSONRenderer, BrowsableAPIRenderer)

    def get(self, request):
        '''GET the projects of a given user'''
        user = cache.User[request.user.id]
        projects = session.query(Node).filter(Node.typename=="PROJECT", Node.user_id== user.id).all()
        if not projects:   # idiomatic truthiness instead of len(...) == 0
            return Response({"detail":"No projects Found for this user"}, status=HTTP_404_NOT_FOUND)
        context = format_response(user, projects)
        return Response(context)

    def post(self, request):
        '''CREATE a new project for a given user'''
        user = cache.User[request.user.id]
        try:
            # corpus name
            name = request.data["name"]
        except AttributeError:
            return Response({"detail":"Invalid POST method: \"name\" field is required "}, status = HTTP_406_NOT_ACCEPTABLE)
        if name == "":
            return Response({"detail":"Invalid POST method: \"name\" field is empty "}, status = HTTP_406_NOT_ACCEPTABLE)
        else:
            # NOTE(review): unicity is checked globally, not per user -- confirm intent
            project = session.query(Node).filter(Node.typename=="PROJECT", Node.name==name).first()
            if project is not None:
                return Response({"detail":"Project with this name already exists", "url":"/projects/%s" %str(project.id)}, status = HTTP_409_CONFLICT)
            else:
                new_project = Node(
                    user_id = request.user.id,
                    typename = 'PROJECT',
                    name = name,
                )
                session.add(new_project)
                session.commit()
                return Response({"detail": "Created", "url":"/projects/%s" %str(new_project.id)}, status= HTTP_201_CREATED)

    def delete(self, request):
        '''DELETE the projects of a given user'''
        user = cache.User[request.user.id]
        projects = session.query(Node).filter(Node.typename=="PROJECT", Node.user_id== user.id).all()
        uids = []
        for node in projects:
            session.delete(node)
            session.commit()   # NOTE(review): one commit per node; a single commit after the loop would suffice
            uids.append(node.id)
        return Response({"detail":"Deleted %i projects" %len(uids)}, status=HTTP_204_NO_CONTENT)

    def put(self, request):
        '''UPDATE EVERY project of a given user'''
        user = cache.User[request.user.id]
        query = session.query(Node).filter(Node.typename=="PROJECT", Node.user_id== request.user.id).all()
        uids = []
        for node in query:
            for key, val in request.data.items():
                # only a whitelist of keys may be modified
                if key in ["name", "date", "username"]:
                    if key == "username":
                        # change of ownership
                        # FIX: original read `Node.username == username` with
                        # `username` undefined (NameError) and then called
                        # `set(node, user_id, user.id)` -- wrong builtin plus
                        # another undefined name. Use the submitted value and
                        # setattr instead.
                        # NOTE(review): looking up a PROJECT node by username
                        # looks wrong (probably should be a USER node) -- confirm
                        owner = session.query(Node).filter(Node.typename=="PROJECT", Node.username == val).first()
                        setattr(node, "user_id", owner.id)
                    else:
                        setattr(node, key, val)
            session.add(node)
            session.commit()
            uids.append(node.id)
        return Response({"detail":"Updated %s projects" %len(uids)}, status=HTTP_202_ACCEPTED)
class ProjectView(APIView):
    '''API endpoint that represents a project's detail.

    GET/PUT/DELETE operate on one PROJECT node; POST creates a CORPUS
    inside it (methods: parse / scan / copy).  The `old_post` path and the
    `_sample`/`_fetch`/`_copy`/`_scan` helpers are legacy code kept for
    reference; several of them reference undefined names (see NOTE(review)
    flags below) and cannot run as-is.
    '''
    renderer_classes = (JSONRenderer, BrowsableAPIRenderer)

    def get(self, request, project_id):
        ''' GET /api/projects/<project_id> the list of corpora given a project '''
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            return Response({'detail' : "PROJECT Node #%s not found" %(project_id) },
                            status = HTTP_404_NOT_FOUND)
        # raises if the authenticated user may not access this project
        check_rights(request, project_id)
        corpus_list = project.children('CORPUS', order=True).all()
        if len(corpus_list) == 0:
            return Response({'detail' : "No corpora found for Project Node #%s" %(project_id) },
                            status = HTTP_404_NOT_FOUND)
        # resource_list = [(n["name"], n["type"], n["id"]) for n in corpus_list[0].children('RESOURCE', order=True).all()]
        # print(resource_list)
        context = format_response(project, corpus_list)
        return Response(context)

    def delete(self, request, project_id):
        '''DELETE project'''
        node = session.query(Node).filter(Node.id == project_id).first()
        if node is None:
            return Response({'detail' : "PROJECT Node #%s not found" %(project_id) },
                            status = HTTP_404_NOT_FOUND)
        else:
            try:
                check_rights(request, project_id)
            except Exception as e:
                # NOTE(review): '"Unauthorized" %(project_id)' applies %-formatting
                # to a string with no placeholder -> TypeError at runtime; the
                # message was probably meant to interpolate project_id.
                return Response({'detail' : "Unauthorized" %(project_id) },
                                status= 403)
            session.delete(node)
            session.commit()
            return Response({"detail": "Successfully deleted Node #%s" %project_id}, status= 204)

    def put(self, request, project_id):
        '''UPDATE project '''
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            return Response({'detail' : "PROJECT Node #%s not found" %(project_id) },
                            status = HTTP_404_NOT_FOUND)
        check_rights(request, project_id)
        params = get_parameters(request)
        # print(params)
        #u_project = deepcopy(project)
        for key, val in params.items():
            if len(val) == 0:
                return Response({"detail":"Invalid POST method: \"%s\" field is empty " %key}, status = HTTP_406_NOT_ACCEPTABLE)
            # only a whitelist of fields may be modified
            if key in ["name", "date", "username"]:
                if key == "username":
                    # change ownership -- NOTE(review): unimplemented, silently ignored
                    #find user
                    #user = session.query(Node).filter(Node.username == username, Node.typename="USER").first()
                    #if user.id
                    pass
                elif key == "name":
                    # NOTE(review): name unicity checked across ALL nodes, not per user/type
                    other = session.query(Node).filter(Node.name == val ).count()
                    if other == 0:
                        setattr(project, key, val)
                    else:
                        return Response({"detail":"Project with this name already exists"}, status = HTTP_409_CONFLICT)
                else:
                    setattr(project, key, val)
        session.add(project)
        session.commit()
        return Response({"detail":"Updated PROJECT #%s" %str(project_id)}, status=HTTP_206_PARTIAL_CONTENT)

    def post(self, request, project_id):
        '''CREATE corpus

        Expects form fields "name" and "source", plus a URL parameter
        "method" in {parse, scan, copy}.
        '''
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            return Response({'detail' : "PROJECT Node #%s not found" %(project_id) },
                            status = HTTP_404_NOT_FOUND)
        project = check_rights(request, project_id)
        # controlling form data
        if not "name" in request.data.keys():
            return Response({'detail' : "CORPUS Node: field name is mandatory" },
                            status = HTTP_406_NOT_ACCEPTABLE)
        if not "source" in request.data.keys():
            return Response({'detail' : "CORPUS Node: field source is mandatory"},
                            status = HTTP_406_NOT_ACCEPTABLE)
        corpus_name = request.data["name"]
        corpus_source = request.data["source"]
        if corpus_name == "":
            return Response({'detail' : "CORPUS Node name can't be empty" },
                            status = HTTP_406_NOT_ACCEPTABLE)
        corpus = session.query(Node).filter(Node.name == corpus_name, Node.typename == "CORPUS").first()
        if corpus is not None:
            return Response({'detail' : "CORPUS Node with name '%s' already exists" %(corpus_name) },
                            status = HTTP_409_CONFLICT)
        if corpus_source == "" or corpus_source == 0 or corpus_source == None:
            return Response({'detail' : "CORPUS Node source can't be empty"},status=HTTP_406_NOT_ACCEPTABLE)
        params = get_parameters(request)
        if "method" not in params.keys():
            #if "method" not in request.data.keys():
            return Response({'detail' : "CORPUS Node has not 'method' parameter"},
                            status = HTTP_405_METHOD_NOT_ALLOWED)
        #method = request.data["method"]
        method = params["method"]
        if method not in ["parse", "scan", "copy"]:
            return Response({'detail' : "CORPUS Node only parse, scan and copy 'method' are allowed" },
                            status = HTTP_405_METHOD_NOT_ALLOWED)
        if method == "copy":
            # here corpus_source is expected to be the id of the corpus to clone
            corpus = session.query(Node).filter(Node.id == corpus_source, Node.typename == "CORPUS").first()
            if corpus is None:
                return Response({'detail' : "CORPUS Node #%s doesn't exist. Fail to copy" %(str(corpus_source)) },
                                status = HTTP_404_NOT_FOUND)
            else:
                #cloned_corpus = {k:v for k,v in corpus if k not in ["user_id", "id", "parent_id"]}
                cloned_corpus = copy.deepcopy(corpus)
                del cloned_corpus.id
                cloned_corpus.parent_id = project_id
                cloned_corpus.user_id = request.user.id
                for child in corpus.get_children():
                    #{k:getattr(corpus, k) for k in ["name", "date", "source", "hyperdata"] }
                    cloned_child = copy.deepcopy(child)
                    # NOTE(review): item access (child["id"]) on Node objects and the
                    # undefined name `new_corpus` below make this loop raise at
                    # runtime (TypeError / NameError) -- broken code.
                    del cloned_child["id"]
                    cloned_child["parent_id"] = new_corpus.id
                    cloned_corpus["user_id"] = request.user.id
                    cloned_corpus.add_child(cloned_child)
                session.add(cloned_corpus)
                session.commit()
        # RESOURCE
        # NOTE(review): for method == "copy", corpus_source holds a node id, not a
        # resource type, so get_resource() likely returns None here -- confirm.
        source = get_resource(int(corpus_source))
        if source is None:
            return Response({'detail' : "CORPUS Node sourcetype unknown"},
                            status = HTTP_406_NOT_ACCEPTABLE)
        if method == "parse":
            print('PARSING')
            if not "file" in request.FILES.keys():
                return Response({'detail' : "CORPUS Node need a file to parse" },
                                status = HTTP_405_METHOD_NOT_ALLOWED)
            corpus_file = request.FILES['file']
            if "parser" in source.keys():
                corpus = project.add_child(
                    name = request.data["name"],
                    typename = 'CORPUS',
                    #path = corpus_file,
                )
                print("CORPUS #", corpus.id)
                session.add(corpus)
                session.commit()
                # the RESOURCE child records how the corpus content was obtained
                resource = Node(
                    name = source["name"],
                    typename = 'RESOURCE',
                    parent_id = corpus.id,
                    hyperdata = {"type": source["type"],
                                 "method": method,
                                 "file": upload(corpus_file),
                                 "query": None}
                )
                session.add(resource)
                session.commit()
                return Response({"detail":"Parsing corpus #%s of type #%s" %(str(corpus.id), resource.name)}, 200)
            else:
                return Response({"detail":"No Parser found for this corpus #%s of type %s" %(str(corpus.id), resource.name)}, 405)
        elif method =="scan":
            if "crawler" in source.keys():
                if not "query" in request.data.keys():
                    #corpus_file = request.FILES['file']
                    return Response({'detail' : "CORPUS Node need a query to scan" },
                                    status = HTTP_405_METHOD_NOT_ALLOWED)
                query = request.data['query']
                corpus = project.add_child(
                    name = request.data["name"],
                    typename = 'CORPUS',
                )
                # NOTE(review): `request.user_id` below looks wrong -- Django requests
                # expose request.user.id, not request.user_id (AttributeError) -- confirm.
                resource = Node(
                    name = source["name"],
                    typename = 'RESOURCE',
                    parent_id = corpus.id,
                    user_id = request.user_id,
                    hyperdata = {"type": source["type"],
                                 "method": method,
                                 "file": None,
                                 "query": query}
                )
                session.add(resource)
                session.commit()
                return Response({'detail': "CORPUS #%s created" %corpus.id}, status = HTTP_201_CREATED)
        else:
            # NOTE(review): method was already validated above, so this branch is
            # reached only for "copy" -- the message duplicates the earlier check.
            return Response({'detail' : "CORPUS Node only parse, scan and copy 'method' are allowed" },
                            status = HTTP_405_METHOD_NOT_ALLOWED)

    def old_post(self, request, project_id):
        '''Legacy corpus creation path.

        NOTE(review): references the undefined names `project`, `user` and
        `resource`; kept only for reference, cannot run as-is.
        '''
        form = self._validate_form(request)
        # get params
        method = form["method"]
        if method in ["parse", "scan", "copy"]:
            # the corpus and the resource do not exist yet
            # [HACK]
            # create a corpus
            corpus = Node( typename = 'CORPUS',
                           user_id = request.user_id,
                           parent_id = project.id,
                           name = form["name"],
                         )
            session.add(corpus)
            session.commit()
            # create a resource
            try:
                if method == "parse":
                    form["file"] = request.FILES['file']
                # dynamic dispatch to self._parse / self._scan / self._copy
                action = getattr(self, "_"+method)
                # all actions except scan delete the resource?
                # and push the info up into the corpus
                if action(corpus, form):
                    # transfer the resource info into the corpus
                    documents = session.query(Node).filter(Node.typename=="DOCUMENT", Node.user_id== user.id, Node.parent_id==corpus.id).all()
                    response_data = {
                        "records": format_records(documents),
                        "resource": format_records([resource]),
                        "parent": format_parent(project),
                        "count":len(documents)
                    }
                    return Response(response_data, 200)
                else:
                    raise APIException("Error with ", method)
            except Exception as e:
                raise APIException(e)
        else:
            # the corpus exists and the resource must be updated
            corpus = session.query(Node).filter(Node.typename=="CORPUS", Node.parent_id== project.id, Node.name == form["corpus_name"]).first()
            source = get_resource(form["source"])
            if corpus is None:
                return Response("CORPUS not found", 404)
            # [HACK] one corpus one resource by Resourcetype_name
            resource = session.query(Node).filter(Node.typename=="RESOURCE",
                                                  Node.parent_id== corpus.id,
                                                  Node.corpus_name == form["corpus_name"],
                                                  Node.name == source["name"]
                                                  ).first()
            action = getattr(self, "_"+method)
            if action(resource):
                # transfer the resource info into the corpus
                if method == "fetch":
                    corpus.sources[resource["name"]].append(resource)
                    session.delete(resource)
                    session.add(corpus)
                    session.commit()
                else:
                    session.add(resource)
                    session.commit()
                return Response({"log": "Created", "uids":[corpus.id]}, 200)
            else:
                # action failed: roll back by deleting both nodes
                session.delete(resource)
                session.delete(corpus)
                session.commit()
                return Response({"log": method+": Error"}, 500)

    def _check_method(self, request):
        '''Extract the "method" URL parameter and check it against METHODS.

        Raises APIException (412 missing / 405 not allowed); returns the
        method string otherwise.
        '''
        METHODS = ["scan", "parse", "sample", "fetch", "copy"]
        try:
            method = get_parameters(request)["method"]
        except AttributeError:
            raise APIException("Precondition failed : You must specify a method", 412)
        if method not in METHODS:
            raise APIException("Method not allowed", 405)
        else:
            return method

    def _validate_form(self, request):
        '''basic validation of the step given each method

        Returns {field: value, ..., "method": method} for the fields
        required by the chosen method.
        NOTE(review): for methods "sample"/"fetch" (accepted by
        _check_method) `fields` is never assigned -> NameError below.
        '''
        params = {}
        method = self._check_method(request)
        # parsing a file
        if method == "parse":
            fields = ['source', 'name', "file"]
        # scanning a query => results_nb
        elif method == "scan":
            fields = ['source', 'name', "query"]
        # sampling checking results_nb => ids
        #~ elif method == "sample":
        #~     fields = ['source', 'name', "results_nb"]
        #~ # fetching ids => NewParser
        #~ elif method == "fetch":
        #~     fields = ['source', 'name', "ids"]
        # cloning a corpus_id => Corpus
        elif method == "copy":
            fields = ['source', 'name', "corpus_id"]
        for k in fields:
            try:
                # NOTE(review): with `or`, this is True even for "" (since
                # "" is not None) -- empty values slip through; `and` was
                # probably intended.
                if request.data[k] != "" or request.data[k] is not None:
                    params[k] = request.data[k]
                else:
                    raise APIException("Mandatory value %s can't be empty "%str(k), 400)
            except AttributeError:
                raise APIException("Value %s is mandatory" %str(k), 400)
        if len(params) > 0:
            params["method"] = method
            return params
        else:
            raise APIException("Form is empty: %s" %str(k), 404)

    def _sample(self, resource):
        '''Legacy sampling step.

        NOTE(review): `corpus` and `form` are undefined here (NameError) --
        the signature and body are inconsistent; dead/broken code.
        '''
        resource = self._find_resource_hyperdata(corpus, form)
        # NOTE(review): eval() on a stored crawler name -- avoid eval on data.
        crawlbot = eval(resource.crawler)(resource)
        records = crawlbot.sample()
        #resource.status.insert(0,"sampled")
        resource.ids = records
        corpus.status(action="sample", progress=1, complete=True)
        session.add(corpus)
        session.commit()
        return Response({"uids": [corpus.id]}, status= HTTP_200_OK)

    def _fetch(self, resource):
        '''internal method to fetch from a corpus the resource.urls >>> resource._parser(urls)

        NOTE(review): `corpus` and `form` are undefined here (NameError) --
        dead/broken code, same as _sample.
        '''
        resource = self._find_resource_hyperdata(corpus, form)
        resource.status(action="fetch", progress=1, complete=False)
        # NOTE(review): eval() on a stored crawler name -- avoid eval on data.
        crawlbot = eval(resource.typecrawler)(resource)
        # send job to celery
        scheduled(crawlbot.fetch())
        corpus.status(action="fetch", progress=1, complete=True)
        session.add(corpus)
        session.commit()
        return Response({"uids": [corpus.id]}, 200)

    def _copy(self, corpus, form):
        '''Legacy corpus-cloning step.

        NOTE(review): `new_resouce` (typo) and `new_ngrams` are undefined
        names (NameError); also reassigns new_corpus.id -- broken code.
        '''
        # find the target corpus
        new_corpus = session.query(Node).filter(Node.typename=="CORPUS", Node.corpus_id == form["corpus_id"]).first()
        # get the resource of this corpus and copy it too
        new_resource = self._find_resource_hyperdata(new_corpus, form)
        # copy new_corpus to previously created corpus
        new_resouce.method = "cloned CORPUS #%i" %(new_corpus.id)
        new_corpus.id = corpus.id
        # change new_corpus ownership
        new_corpus.parent_id = corpus.parent_id
        new_corpus.user_id = corpus.user_id
        # get the documents of the existing corpus
        for doc in new_corpus.get_children():
            doc.parent_id = new_corpus.parent_id
            # NOTE(review): assigns a corpus id into user_id -- looks wrong
            doc.user_id = new_corpus.id
            # store it into corpus
            new_doc = corpus.add_child(doc)
            for ngrams in doc.get_children():
                new_ngrams.parent_id = new_doc.id
                new_ngrams.user_id = new_corpus.user_id
                # store it into corpus
                new_doc.add_child(new_ngrams)
        # save the corpus
        corpus.status(action="copy", progress=1, complete=True)
        session.add(corpus)
        session.commit()
        return Response({"log": "Corpus created", "uids":[corpus.id]}, 202)

    def _scan(self, corpus, form):
        '''internal method to scan a query >> add results_nb to resource as a corpus hyperdata'''
        resource = self._find_resource_hyperdata(corpus, form)
        #corpus_query = check_query(form["query")
        # NOTE(review): `ressource` (typo) is undefined -> NameError; should be `resource`
        ressource.query = form["query"]
        corpus.status(action="scan", progress=1, complete=False)
        session.add(corpus)
        session.commit()
        # NOTE(review): eval() on a stored crawler name -- avoid eval on data.
        crawlbot = eval(resource.crawler)(corpus.id)
        corpus.status(action="scan", progress=2, complete=False)
        session.add(corpus)
        session.commit()
        results_nb = crawlbot.scan_results()
        resource.results_nb = results_nb
        corpus.status(action="scan", progress=2, complete=True)
        code = 200  # NOTE(review): unused local
        session.add(corpus)
        session.commit()
        return Response({"log": "Corpus created", "uids":[corpus.id]}, 200)

    def _parse(self, corpus, form):
        '''internal method to parse a corpus >> resource >> corpus >> docs
        corpus >> resource (method + file params + parser )
          ^     >> docs   (resource.defaultlang <--------|  )
          |     >> ngrams
          |------- everything brought back into the corpus

        NOTE(review): always returns True, even when workflow() raised --
        the caller cannot tell success from failure.
        '''
        # 1. creating a resource
        resource = {}  # NOTE(review): immediately overwritten below -- dead assignment
        resource = Node(
            user_id = corpus.user_id,
            parent_id = corpus.id,
            typename = "RESOURCE",
            #corpus_name = form["name"],
        )
        resource.method = form["method"]
        resource.path = upload(form['file'])
        # mapping the default attributes of a given source from constant RESOURCETYPE
        for k, v in get_resource(int(form["source"])).items():
            setattr(resource, k, v)
        resource.status(action="parse", progress=1, complete=False)
        session.add(resource)
        session.commit()
        try:
            workflow(resource)
        except Exception as e:
            print("=======except dans _parse===========")
            print(e)
            from traceback import print_tb
            print_tb(e.__traceback__)
            print("====================================")
        return True
from django.conf.urls import url
from rest_framework_jwt.views import obtain_jwt_token
from . import nodes
from . import projects
from . import corpora
from . import users
from . import ngrams
from . import metrics
from . import ngramlists
from . import analytics
# Routing table of the JSON/CSV API (mounted under /api/ by the root URLconf).
# NB: ordering matters -- Django dispatches to the first regex that matches.
urlpatterns = [ url(r'^nodes$'                  , nodes.NodeListResource.as_view())
              , url(r'^nodes/(\d+)$'            , nodes.NodeResource.as_view())
              , url(r'^nodes/(\d+)/having$'     , nodes.NodeListHaving.as_view())
              , url(r'^nodes/(\d+)/status$'     , nodes.Status.as_view())

              # Projects
              , url(r'^projects$'               , projects.ProjectList.as_view())
              , url(r'^projects/(\d+)$'         , projects.ProjectView.as_view())

              # Corpora
              , url(r'^projects/(\d+)/corpora/(\d+)$', corpora.CorpusView.as_view())

              # Sources
              #, url(r'^projects/(\d+)/corpora/(\d+)/sources$', corpora.CorpusSources.as_view())
              #, url(r'^projects/(\d+)/corpora/(\d+)/sources/(\d+)$ , corpora.CorpusSourceView.as_view())

              # Facets
              , url(r'^projects/(\d+)/corpora/(\d+)/facets$', nodes.CorpusFacet.as_view())

              # Favorites
              , url(r'^projects/(\d+)/corpora/(\d+)/favorites$', nodes.CorpusFavorites.as_view())

              # Metrics
              , url(r'^projects/(\d+)/corpora/(\d+)/metrics$', metrics.CorpusMetrics.as_view())

              # Ngrams
              , url(r'^ngrams/?$'               , ngrams.ApiNgrams.as_view())

              # Analytics
              , url(r'^nodes/(\d+)/histories$', analytics.NodeNgramsQueries.as_view())
              , url(r'hyperdata$'             , analytics.ApiHyperdata.as_view())

              # get a list of ngram_ids or ngram_infos by list_id
              # url(r'^ngramlists/(\d+)$', ngramlists.List.as_view()),

              , url(r'^nodes/(\d+)/facets$'   , nodes.CorpusFacet.as_view())
              , url(r'^nodes/(\d+)/favorites$', nodes.CorpusFavorites.as_view())
              # in these two routes the node is supposed to be a *corpus* node

              , url(r'^metrics/(\d+)$'        , metrics.CorpusMetrics.as_view())
              # update all metrics for a corpus
              # ex: PUT metrics/123
              #     \
              #      corpus id

              , url(r'^ngramlists/export$', ngramlists.CSVLists.as_view())
              # get a CSV export of the ngramlists of a corpus
              # ex: GET ngramlists/export?corpus=43
              # TODO : unify to a /api/ngrams?formatted=csv
              #        (similar to /api/nodes?formatted=csv)

              , url(r'^ngramlists/import$', ngramlists.CSVLists.as_view())
              # same handling class as export (CSVLists)
              # but this route used only for POST + file
              #                       or PATCH + other corpus id

              , url(r'^ngramlists/change$', ngramlists.ListChange.as_view())
              # add or remove ngram from a list
              # ex: add <=> PUT ngramlists/change?list=42&ngrams=1,2
              #     rm  <=> DEL ngramlists/change?list=42&ngrams=1,2

              , url(r'^ngramlists/groups$', ngramlists.GroupChange.as_view())
              # modify grouping couples of a group node
              # ex: PUT/DEL ngramlists/groups?node=43
              #     & group data also in url: 767[]=209,640 & 779[]=436,265,385

              , url(r'^ngramlists/family$', ngramlists.ListFamily.as_view())
              # entire combination of lists from a corpus, dedicated to termtable
              # (or any combination of lists that go together :
              #  - a mainlist
              #  - an optional stoplist
              #  - an optional maplist
              #  - an optional grouplist

              , url(r'^ngramlists/maplist$', ngramlists.MapListGlance.as_view())
              # fast access to maplist, similarly formatted for termtable

              , url(r'^user/parameters/$', users.UserParameters.as_view())

              # JWT token endpoint (rest_framework_jwt)
              , url('^auth/token$', obtain_jwt_token)
              ]
from .api import * #notamment APIView, check_rights, format_response
from gargantext.util.http import *
from django.core.exceptions import *
from collections import defaultdict
from gargantext.util.toolchain import *
import copy
from gargantext.util.db import session
class UserParameters(APIView):
    '''API endpoint that represents the parameters of the user
    (stored in the hyperdata of their USER node)'''

    def get(self, request):
        '''GET the hyperdata of the authenticated user's USER node'''
        node_user = session.query(Node).filter(Node.user_id == request.user.id, Node.typename== "USER").first()
        if node_user is None:
            # FIX: was `status=HTTP_404`, an undefined name -> NameError at
            # runtime; the DRF constant is HTTP_404_NOT_FOUND.
            return Response({"detail":"Not Found"}, status=HTTP_404_NOT_FOUND)
        else:
            #context = format_response(node_user, )
            return Response(node_user.hyperdata)

    def put(self, request):
        '''UPDATE the user's parameters: merge request.data key by key
        into the USER node's hyperdata and persist it'''
        if request.user.id is None:
            raise TypeError("This API request must come from an authenticated user.")
        else:
            # we query among the nodes that belong to this user
            user = cache.User[request.user.id]
            node_user = session.query(Node).filter(Node.user_id == user.id, Node.typename== "USER").first()
            if node_user is None:
                return Response({"detail":"Not Allowed"}, status=HTTP_401_UNAUTHORIZED)
            for k, v in request.data.items():
                node_user.hyperdata[k] = v
            # hyperdata is a JSON column: save_hyperdata() marks it dirty
            node_user.save_hyperdata()
            session.add(node_user)
            session.commit()
            # re-read so the response reflects the persisted state
            node_user = session.query(Node).filter(Node.user_id == user.id, Node.typename== "USER").first()
            return Response({"detail":"Updated user parameters", "hyperdata": node_user.hyperdata}, status=HTTP_202_ACCEPTED)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment