Commit c12a0dae authored by Alexandre Delanoë

Merge branch 'unstable' into testing

parents bfcabfad ef842d62
......@@ -14,12 +14,12 @@ from gargantext.util.files import save
class HalCrawler(Crawler):
def __init__(self):
# Main EndPoints
self.BASE_URL = ""
self.API_URL = "search"
# Final EndPoints
# TODO : Change endpoint according type of database
self.URL = self.BASE_URL + "/" + self.API_URL
......@@ -38,7 +38,9 @@ class HalCrawler(Crawler):
def _get(self, query, fromPage=1, count=10, lang=None):
# Parameters
fl = """ en_title_s
fl = """ docid
, title_s
, abstract_s
, en_title_s
, en_abstract_s
, submittedDate_s
......@@ -59,7 +61,7 @@ class HalCrawler(Crawler):
#, authUrl_s
#, type_s
wt = "json"
querystring = { "q" : query
......@@ -68,18 +70,18 @@ class HalCrawler(Crawler):
, "fl" : fl
, "wt" : wt
# Specify Headers
headers = { "cache-control" : "no-cache" }
# Do Request and get response
response = requests.request( "GET"
, self.URL
, headers = headers
, params = querystring
# Validation : 200 if ok else raise Value
if response.status_code == 200:
......@@ -90,27 +92,27 @@ class HalCrawler(Crawler):
return (json.loads(response.content.decode(charset)))
raise ValueError(response.status_code, response.reason)
def scan_results(self, query):
scan_results : Returns the number of results
Query String -> Int
self.results_nb = 0
total = ( self._get(query)
.get("response", {})
.get("numFound" , 0)
self.results_nb = total
return self.results_nb
def download(self, query):
downloaded = False
self.status.append("fetching results")
corpus = []
......@@ -124,7 +126,7 @@ class HalCrawler(Crawler):
print("ERROR (scrap: HAL d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
#for page in range(1, trunc(self.query_max / 100) + 2):
for page in range(0, self.query_max, paging):
print("Downloading page %s to %s results" % (page, paging))
......@@ -141,5 +143,5 @@ class HalCrawler(Crawler):
downloaded = True
return downloaded
......@@ -12,12 +12,12 @@ import json
class HalParser(Parser):
def _parse(self, json_docs):
hyperdata_list = []
hyperdata_path = { "id" : "isbn_s"
, "title" : "en_title_s"
, "abstract" : "en_abstract_s"
hyperdata_path = { "id" : "docid"
, "title" : ["en_title_s", "title_s"]
, "abstract" : ["en_abstract_s", "abstract_s"]
, "source" : "journalTitle_s"
, "url" : "uri_s"
, "authors" : "authFullName_s"
......@@ -29,8 +29,8 @@ class HalParser(Parser):
, "instStructId_i" : "instStructId_i"
, "deptStructId_i" : "deptStructId_i"
, "labStructId_i" : "labStructId_i"
, "rteamStructId_i" : "rteamStructId_i"
, "docType_s" : "docType_s"
, "rteamStructId_i" : "rteamStructId_i"
, "docType_s" : "docType_s"
uris = set()
......@@ -38,29 +38,32 @@ class HalParser(Parser):
for doc in json_docs:
hyperdata = {}
for key, path in hyperdata_path.items():
field = doc.get(path, "NOT FOUND")
if isinstance(field, list):
hyperdata[key] = ", ".join(map(lambda x: str(x), field))
hyperdata[key] = str(field)
# A path can be a field name or a sequence of field names
if isinstance(path, (list, tuple)):
# Get first non-empty value of fields in path sequence, or None
field = next((x for x in (doc.get(p) for p in path) if x), None)
# Get field value
field = doc.get(path)
if field is None:
field = "NOT FOUND"
if isinstance(field, list):
hyperdata[key] = ", ".join(map(str, field))
hyperdata[key] = str(field)
if hyperdata["url"] in uris:
print("Document already parsed")
# hyperdata["authors"] = ", ".join(
# [ p.get("person", {})
# .get("name" , "")
# for p in doc.get("hasauthor", [])
# ]
# )
maybeDate = doc.get("submittedDate_s", None)
maybeDate = doc.get("submittedDate_s", None)
if maybeDate is not None:
date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
......@@ -70,9 +73,9 @@ class HalParser(Parser):
hyperdata["publication_year"] = str(date.year)
hyperdata["publication_month"] = str(date.month)
hyperdata["publication_day"] = str(
return hyperdata_list
def parse(self, filebuf):
......@@ -15,12 +15,16 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from gargantext.models import ProjectNode, DocumentNode, UserNode, User
from gargantext.models import ProjectNode, DocumentNode
from gargantext.util.db import session, get_engine
from collections import Counter
import importlib
from django.http import Http404
# Import those to be available by notebook user
from langdetect import detect as detect_lang
from gargantext.models import UserNode, User
class NotebookError(Exception):
......@@ -203,6 +203,7 @@
// do something…
return false;
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment