Commit 2d703388 authored by sim

Trailing spaces

parent 7e1e26a2
...@@ -14,12 +14,12 @@ from gargantext.util.files import save ...@@ -14,12 +14,12 @@ from gargantext.util.files import save
class HalCrawler(Crawler): class HalCrawler(Crawler):
''' HAL API CLIENT''' ''' HAL API CLIENT'''
def __init__(self): def __init__(self):
# Main EndPoints # Main EndPoints
self.BASE_URL = "https://api.archives-ouvertes.fr" self.BASE_URL = "https://api.archives-ouvertes.fr"
self.API_URL = "search" self.API_URL = "search"
# Final EndPoints # Final EndPoints
# TODO : Change endpoint according type of database # TODO : Change endpoint according type of database
self.URL = self.BASE_URL + "/" + self.API_URL self.URL = self.BASE_URL + "/" + self.API_URL
...@@ -59,7 +59,7 @@ class HalCrawler(Crawler): ...@@ -59,7 +59,7 @@ class HalCrawler(Crawler):
""" """
#, authUrl_s #, authUrl_s
#, type_s #, type_s
wt = "json" wt = "json"
querystring = { "q" : query querystring = { "q" : query
...@@ -68,18 +68,18 @@ class HalCrawler(Crawler): ...@@ -68,18 +68,18 @@ class HalCrawler(Crawler):
, "fl" : fl , "fl" : fl
, "wt" : wt , "wt" : wt
} }
# Specify Headers # Specify Headers
headers = { "cache-control" : "no-cache" } headers = { "cache-control" : "no-cache" }
# Do Request and get response # Do Request and get response
response = requests.request( "GET" response = requests.request( "GET"
, self.URL , self.URL
, headers = headers , headers = headers
, params = querystring , params = querystring
) )
#print(querystring) #print(querystring)
# Validation : 200 if ok else raise Value # Validation : 200 if ok else raise Value
if response.status_code == 200: if response.status_code == 200:
...@@ -90,27 +90,27 @@ class HalCrawler(Crawler): ...@@ -90,27 +90,27 @@ class HalCrawler(Crawler):
return (json.loads(response.content.decode(charset))) return (json.loads(response.content.decode(charset)))
else: else:
raise ValueError(response.status_code, response.reason) raise ValueError(response.status_code, response.reason)
def scan_results(self, query): def scan_results(self, query):
''' '''
scan_results : Returns the number of results scan_results : Returns the number of results
Query String -> Int Query String -> Int
''' '''
self.results_nb = 0 self.results_nb = 0
total = ( self._get(query) total = ( self._get(query)
.get("response", {}) .get("response", {})
.get("numFound" , 0) .get("numFound" , 0)
) )
self.results_nb = total self.results_nb = total
return self.results_nb return self.results_nb
def download(self, query): def download(self, query):
downloaded = False downloaded = False
self.status.append("fetching results") self.status.append("fetching results")
corpus = [] corpus = []
...@@ -124,7 +124,7 @@ class HalCrawler(Crawler): ...@@ -124,7 +124,7 @@ class HalCrawler(Crawler):
) )
print("ERROR (scrap: HAL d/l ): " , msg) print("ERROR (scrap: HAL d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX self.query_max = QUERY_SIZE_N_MAX
#for page in range(1, trunc(self.query_max / 100) + 2): #for page in range(1, trunc(self.query_max / 100) + 2):
for page in range(0, self.query_max, paging): for page in range(0, self.query_max, paging):
print("Downloading page %s to %s results" % (page, paging)) print("Downloading page %s to %s results" % (page, paging))
...@@ -141,5 +141,5 @@ class HalCrawler(Crawler): ...@@ -141,5 +141,5 @@ class HalCrawler(Crawler):
, basedir=UPLOAD_DIRECTORY , basedir=UPLOAD_DIRECTORY
) )
downloaded = True downloaded = True
return downloaded return downloaded
...@@ -12,9 +12,9 @@ import json ...@@ -12,9 +12,9 @@ import json
class HalParser(Parser): class HalParser(Parser):
def _parse(self, json_docs): def _parse(self, json_docs):
hyperdata_list = [] hyperdata_list = []
hyperdata_path = { "id" : "isbn_s" hyperdata_path = { "id" : "isbn_s"
, "title" : "en_title_s" , "title" : "en_title_s"
, "abstract" : "en_abstract_s" , "abstract" : "en_abstract_s"
...@@ -29,8 +29,8 @@ class HalParser(Parser): ...@@ -29,8 +29,8 @@ class HalParser(Parser):
, "instStructId_i" : "instStructId_i" , "instStructId_i" : "instStructId_i"
, "deptStructId_i" : "deptStructId_i" , "deptStructId_i" : "deptStructId_i"
, "labStructId_i" : "labStructId_i" , "labStructId_i" : "labStructId_i"
, "rteamStructId_i" : "rteamStructId_i" , "rteamStructId_i" : "rteamStructId_i"
, "docType_s" : "docType_s" , "docType_s" : "docType_s"
} }
uris = set() uris = set()
...@@ -38,15 +38,15 @@ class HalParser(Parser): ...@@ -38,15 +38,15 @@ class HalParser(Parser):
for doc in json_docs: for doc in json_docs:
hyperdata = {} hyperdata = {}
for key, path in hyperdata_path.items(): for key, path in hyperdata_path.items():
field = doc.get(path, "NOT FOUND") field = doc.get(path, "NOT FOUND")
if isinstance(field, list): if isinstance(field, list):
hyperdata[key] = ", ".join(map(lambda x: str(x), field)) hyperdata[key] = ", ".join(map(lambda x: str(x), field))
else: else:
hyperdata[key] = str(field) hyperdata[key] = str(field)
if hyperdata["url"] in uris: if hyperdata["url"] in uris:
print("Document already parsed") print("Document already parsed")
else: else:
...@@ -54,11 +54,11 @@ class HalParser(Parser): ...@@ -54,11 +54,11 @@ class HalParser(Parser):
# hyperdata["authors"] = ", ".join( # hyperdata["authors"] = ", ".join(
# [ p.get("person", {}) # [ p.get("person", {})
# .get("name" , "") # .get("name" , "")
# #
# for p in doc.get("hasauthor", []) # for p in doc.get("hasauthor", [])
# ] # ]
# ) # )
# #
maybeDate = doc.get("submittedDate_s", None) maybeDate = doc.get("submittedDate_s", None)
if maybeDate is not None: if maybeDate is not None:
...@@ -70,9 +70,9 @@ class HalParser(Parser): ...@@ -70,9 +70,9 @@ class HalParser(Parser):
hyperdata["publication_year"] = str(date.year) hyperdata["publication_year"] = str(date.year)
hyperdata["publication_month"] = str(date.month) hyperdata["publication_month"] = str(date.month)
hyperdata["publication_day"] = str(date.day) hyperdata["publication_day"] = str(date.day)
hyperdata_list.append(hyperdata) hyperdata_list.append(hyperdata)
return hyperdata_list return hyperdata_list
def parse(self, filebuf): def parse(self, filebuf):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment