Commit d5652f48 authored by sim's avatar sim

[REFACT] Clean file utilities API

parent 2bbecc32
from gargantext.util.db import session
from gargantext.util.files import upload
from datetime import datetime
......
......@@ -10,16 +10,16 @@ from ._Crawler import *
import json
from gargantext.constants import UPLOAD_DIRECTORY
from math import trunc
from gargantext.util.files import save
from gargantext.util.files import file_save
class HalCrawler(Crawler):
''' HAL API CLIENT'''
def __init__(self):
    """Configure the HAL API endpoint URLs."""
    # Endpoint pieces
    self.BASE_URL = "https://api.archives-ouvertes.fr"
    self.API_URL = "search"
    # Final endpoint
    # TODO: change endpoint according to the type of database
    self.URL = "/".join((self.BASE_URL, self.API_URL))
......@@ -49,7 +49,7 @@ class HalCrawler(Crawler):
"""
#, authUrl_s
#, type_s
wt = "json"
querystring = { "q" : query
......@@ -58,18 +58,18 @@ class HalCrawler(Crawler):
, "fl" : fl
, "wt" : wt
}
# Specify Headers
headers = { "cache-control" : "no-cache" }
# Do Request and get response
response = requests.request( "GET"
, self.URL
, headers = headers
, params = querystring
)
#print(querystring)
# Validation : 200 if ok else raise Value
if response.status_code == 200:
......@@ -80,27 +80,27 @@ class HalCrawler(Crawler):
return (json.loads(response.content.decode(charset)))
else:
raise ValueError(response.status_code, response.reason)
def scan_results(self, query):
    """Store and return the number of hits for *query*.

    Query String -> Int
    """
    self.results_nb = 0
    # "response.numFound" carries the total hit count in HAL's reply
    response = self._get(query).get("response", {})
    self.results_nb = response.get("numFound", 0)
    return self.results_nb
def download(self, query):
downloaded = False
self.status.append("fetching results")
corpus = []
......@@ -114,7 +114,7 @@ class HalCrawler(Crawler):
)
print("ERROR (scrap: Multivac d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
#for page in range(1, trunc(self.query_max / 100) + 2):
for page in range(0, self.query_max, paging):
print("Downloading page %s to %s results" % (page, paging))
......@@ -126,10 +126,10 @@ class HalCrawler(Crawler):
for doc in docs:
corpus.append(doc)
self.path = save( json.dumps(corpus).encode("utf-8")
, name='HAL.json'
, basedir=UPLOAD_DIRECTORY
)
self.path = file_save( json.dumps(corpus).encode("utf-8")
, name='HAL.json'
, basedir=UPLOAD_DIRECTORY
)
downloaded = True
return downloaded
......@@ -10,18 +10,18 @@ from ._Crawler import *
import json
from gargantext.constants import UPLOAD_DIRECTORY
from math import trunc
from gargantext.util.files import save
from gargantext.util.files import file_save
from gargantext.util.crawlers.sparql.bool2sparql import bool2sparql, isidore
class IsidoreCrawler(Crawler):
''' ISIDORE SPARQL API CLIENT'''
def __init__(self):
    """Configure the ISIDORE SPARQL endpoint URLs."""
    # Endpoint pieces
    self.BASE_URL = "https://www.rechercheisidore.fr"
    self.API_URL = "sparql"
    # Final endpoint
    # TODO: change endpoint according to the type of database
    self.URL = "/".join((self.BASE_URL, self.API_URL))
......@@ -35,7 +35,7 @@ class IsidoreCrawler(Crawler):
def _get(self, query, offset=0, limit=None, lang=None):
    """Fetch one page of ISIDORE results for *query*.

    NOTE(review): the original body called isidore(...) and discarded the
    result, so _get always returned None — unlike the other crawlers'
    _get, which return the parsed response. Returning the call result
    here; confirm against scan_results/download callers.
    NOTE(review): *lang* is accepted but never forwarded — presumably it
    should be passed to isidore(); verify against bool2sparql's API.
    """
    return isidore(query, count=False, offset=offset, limit=limit)
def scan_results(self, query):
......@@ -47,9 +47,9 @@ class IsidoreCrawler(Crawler):
return self.results_nb
def download(self, query):
downloaded = False
self.status.append("fetching results")
corpus = []
......@@ -63,17 +63,17 @@ class IsidoreCrawler(Crawler):
)
print("WARNING (scrap: ISIDORE d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
for offset in range(0, self.query_max, limit):
print("Downloading result %s to %s" % (offset, self.query_max))
for doc in isidore(query, offset=offset, limit=limit) :
corpus.append(doc)
self.path = save( json.dumps(corpus).encode("utf-8")
, name='ISIDORE.json'
, basedir=UPLOAD_DIRECTORY
)
self.path = file_save( json.dumps(corpus).encode("utf-8")
, name='ISIDORE.json'
, basedir=UPLOAD_DIRECTORY
)
downloaded = True
return downloaded
......@@ -11,18 +11,18 @@ import json
from gargantext.settings import API_TOKENS
from gargantext.constants import UPLOAD_DIRECTORY
from math import trunc
from gargantext.util.files import save
from gargantext.util.files import file_save
class MultivacCrawler(Crawler):
''' Multivac API CLIENT'''
def __init__(self):
    """Configure the Multivac API endpoint URLs and API key."""
    self.apikey = API_TOKENS["MULTIVAC"]
    # Endpoint pieces
    self.BASE_URL = "https://api.iscpif.fr/v2"
    self.API_URL = "pvt/economy/repec/search"
    # Final endpoint
    # TODO: change endpoint according to the type of database
    self.URL = "/".join((self.BASE_URL, self.API_URL))
......@@ -39,21 +39,21 @@ class MultivacCrawler(Crawler):
, "from" : fromPage
, "api_key" : API_TOKENS["MULTIVAC"]["APIKEY"]
}
if lang is not None:
querystring["lang"] = lang
# Specify Headers
headers = { "cache-control" : "no-cache" }
# Do Request and get response
response = requests.request( "GET"
, self.URL
, headers = headers
, params = querystring
)
#print(querystring)
# Validation : 200 if ok else raise Value
if response.status_code == 200:
......@@ -64,27 +64,27 @@ class MultivacCrawler(Crawler):
return (json.loads(response.content.decode(charset)))
else:
raise ValueError(response.status_code, response.reason)
def scan_results(self, query):
    """Store and return the number of hits for *query*.

    Query String -> Int
    """
    self.results_nb = 0
    # "results.total" carries the total hit count in Multivac's reply
    results = self._get(query).get("results", {})
    self.results_nb = results.get("total", 0)
    return self.results_nb
def download(self, query):
downloaded = False
self.status.append("fetching results")
corpus = []
......@@ -98,7 +98,7 @@ class MultivacCrawler(Crawler):
)
print("ERROR (scrap: Multivac d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
for page in range(1, trunc(self.query_max / 100) + 2):
print("Downloading page %s to %s results" % (page, paging))
docs = (self._get(query, fromPage=page, count=paging)
......@@ -109,10 +109,10 @@ class MultivacCrawler(Crawler):
for doc in docs:
corpus.append(doc)
self.path = save( json.dumps(corpus).encode("utf-8")
, name='Multivac.json'
, basedir=UPLOAD_DIRECTORY
)
self.path = file_save( json.dumps(corpus).encode("utf-8")
, name='Multivac.json'
, basedir=UPLOAD_DIRECTORY
)
downloaded = True
return downloaded
from gargantext.constants import *
import os
from gargantext.constants import DOWNLOAD_DIRECTORY, UPLOAD_LIMIT, UPLOAD_DIRECTORY
from gargantext.util.digest import str_digest
from gargantext.util import http
def save(contents, name='', basedir=''):
def file_save(contents, name='', basedir=''):
digest = str_digest(contents[:4096] + contents[-4096:])
path = basedir
for i in range(2, 8, 2):
......@@ -16,29 +18,22 @@ def save(contents, name='', basedir=''):
return path
def download(url, name=''):
return save(
def file_download(url, name=''):
    """Fetch *url* and persist its contents under DOWNLOAD_DIRECTORY.

    Returns the path produced by file_save().
    """
    contents = http.get(url)
    return file_save(contents=contents,
                     name=name,
                     basedir=DOWNLOAD_DIRECTORY)
def check_format(corpus_type, name):
    """Validate *name*'s file extension against *corpus_type*'s accepted formats.

    Accepted extensions come from RESOURCETYPES[corpus_type]["accepted_formats"];
    the check is case-insensitive on the last dot-separated suffix.
    Raises TypeError when the extension is not accepted.
    """
    acc_formats = RESOURCETYPES[corpus_type]["accepted_formats"]
    if name.split(".")[-1].lower() not in acc_formats:
        # Fixed typo in the user-facing message ("Uncorrect" -> "Incorrect")
        raise TypeError('Incorrect format of file. File must be a %s file'
                        % " or ".join(acc_formats))
def upload(uploaded):
def file_upload(uploaded):
if uploaded.size > UPLOAD_LIMIT:
raise IOError('Uploaded file is bigger than allowed: %d > %d' % (
uploaded.size,
UPLOAD_LIMIT,
))
return save(
return file_save(
contents = uploaded.file.read(),
name = uploaded.name,
basedir = UPLOAD_DIRECTORY,
......
......@@ -10,7 +10,6 @@ from gargantext.constants import RESOURCETYPES, NODETYPES, get_resource
from gargantext.models import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNode
from gargantext.util.db import session, delete, func, bulk_insert
from gargantext.util.db_cache import cache, or_
from gargantext.util.files import upload
from gargantext.util.http import ValidationException, APIView, JsonHttpResponse, get_parameters
from gargantext.util.scheduling import scheduled
from gargantext.util.validation import validate
......
......@@ -5,6 +5,7 @@ from collections import defaultdict
from gargantext.util.toolchain import *
import copy
from gargantext.util.db import session
from gargantext.util.files import file_upload
class ProjectList(APIView):
'''API endpoint that represent a list of projects owned by a user'''
......@@ -237,7 +238,7 @@ class ProjectView(APIView):
parent_id = corpus.id,
hyperdata = {"type": source["type"],
"method": method,
"file": upload(corpus_file),
"file": file_upload(corpus_file),
"query": None}
)
session.add(resource)
......@@ -485,7 +486,7 @@ class ProjectView(APIView):
#corpus_name = form["name"],
)
resource.method = form["method"]
resource.path = upload(form['file'])
resource.path = file_upload(form['file'])
#mapping the default attribute of a given source from constant RESOURCETYPE
for k, v in get_resource(int(form["source"])).items():
setattr(resource, k, v)
......
from gargantext.util.http import *
from gargantext.util.db import *
from gargantext.util.db_cache import cache
from gargantext.util.files import upload
from gargantext.util.files import file_upload
from gargantext.models import *
from gargantext.constants import *
from .main import get_user_params
......@@ -124,7 +124,7 @@ def project(request, project_id):
)
corpus.add_resource(
type = int(request.POST['type']),
path = upload(request.FILES['file']),
path = file_upload(request.FILES['file']),
)
session.add(corpus)
session.commit()
......
from gargantext.util.files import download
from gargantext.util.files import file_download
import sys
import time
......@@ -98,7 +98,7 @@ class Scraper :
# generic!
# generic!
def download(self, url):
    """Download *url* via gargantext.util.files.file_download and return
    the local filename it produced.

    The print under self.lock serializes progress output across the
    scraper's worker threads.
    """
    print(url)
    filename = file_download(url)
    with self.lock:
        print(threading.current_thread().name, filename+" OK")
    return filename
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment