Commit 1b34fc89 authored by sim

Add HAL scraper

parent 40620d80
from .pubmed import PubmedScraper
from .hal import HALScraper
from .ris import RISScraper
from gargantext.datasource import Scraper
from gargantext.datasource.items import DocumentLoader
from gargantext.datasource.responses import JsonResponse
# Public API of this module: only the scraper class is exported by
# ``from <module> import *``; HALDocumentLoader stays an internal helper.
__all__ = ['HALScraper']
class HALDocumentLoader(DocumentLoader):
    """Loader that maps one HAL result document onto Gargantext fields."""

    def parse(self, doc):
        """Populate the loader from a single HAL document.

        For every Gargantext field declared in ``HALScraper.fields``, the
        candidate HAL field names are tried in order and the first truthy
        value found in ``doc`` is kept. When no candidate yields a truthy
        value, ``None`` is still passed to ``add_value``.

        :param doc: mapping-like HAL document (only ``.get`` is used).
        """
        for target_field, candidates in HALScraper.fields.items():
            chosen = None
            for hal_name in candidates:
                found = doc.get(hal_name)
                if found:
                    # First non-empty candidate wins; skip the remaining ones.
                    chosen = found
                    break
            self.add_value(target_field, chosen)
class HALScraper(Scraper):
    """HAL scraper, open archive which stores mainly French academic articles.

    `API Documentation <https://api.archives-ouvertes.fr/docs/search>`_

    `Hyper Articles en Ligne
    <https://en.wikipedia.org/wiki/Hyper_Articles_en_Ligne>`_, generally
    shortened to HAL, is an open archive where authors can deposit scholarly
    documents from all academic fields. It has a good position in the
    international web repository ranking.

    The scraper works in two phases: when the total number of results is not
    known yet, a zero-row request is sent to read ``response.numFound``;
    paginated requests of ``BATCH_SIZE`` rows are then emitted.

    NOTE(review): ``total``, ``count``, ``count_only``, ``query``,
    ``BATCH_SIZE``, ``request`` and ``reach_limit`` are not defined in this
    class; they are presumably provided by ``Scraper`` -- confirm there.
    """

    name = 'hal'
    expects = JsonResponse
    base_url = 'https://api.archives-ouvertes.fr/search'

    # Gargantext field name -> HAL field name. A value may also be a list of
    # HAL field names; every value is normalised to a list just below.
    fields = {
        "id": "docid",
        "title": "title_s",
        "abstract": "abstract_s",
        "source": "journalTitle_s",
        "url": "uri_s",
        "publication": "submittedDate_tdate",
        "authors": "authFullName_s",
        "isbn": "isbn_s",
        "issue": "issue_s",
        "lang": "language_s",
        "doi": "doiId_s",
        "type": "docType_s",
        "struct_inst": "instStructId_i",
        "struct_dept": "deptStructId_i",
        "struct_labs": "labStructId_i",
        "struct_team": "rteamStructId_i",
    }

    # Normalise every mapping value to a list of candidate HAL field names.
    fields = {k: v if isinstance(v, list) else [v] for k, v in fields.items()}

    # Flat list of every HAL field name, sent as the ``fl`` request parameter.
    local_fields = [
        hal_field
        for hal_fields in fields.values()
        for hal_field in hal_fields
    ]

    def dispatch(self):
        """Yield the initial request(s) for the current query.

        When ``self.total`` is still unknown, a zero-row request is emitted
        first so that ``parse`` can read the number of matches. Result
        requests are emitted right away only if ``request_results`` allows it
        (i.e. the total is already known and non-zero).
        """
        if self.total is None:
            yield self.request(self.base_url, params={
                'q': self.query,
                'rows': 0,
                # Ask for JSON explicitly, consistently with the results
                # request and with ``expects = JsonResponse``.
                'wt': 'json',
            })

        yield from self.request_results()

    def request_results(self):
        """Yield one paginated request per batch of ``BATCH_SIZE`` documents.

        Nothing is yielded when only the count was asked for
        (``self.count_only``) or when ``self.total`` is falsy (unknown or
        zero).
        """
        if not self.count_only and self.total:
            for start in range(0, self.count, self.BATCH_SIZE):
                yield self.request(self.base_url, params={
                    'q': self.query,
                    'start': start,
                    'rows': self.BATCH_SIZE,
                    'fl': ','.join(self.local_fields),
                    'wt': 'json',
                })

    def parse(self, response):
        """Handle a HAL response: record the total, then load documents.

        On the first response (``self.total`` still ``None``) the total is
        read from ``response.numFound`` and the paginated result requests are
        scheduled. Every document found under ``response.docs`` is then
        loaded through ``HALDocumentLoader`` until ``reach_limit()`` is true.

        :param response: response object exposing ``jmes(path)``.
        """
        if self.total is None:
            self.total = response.jmes('response.numFound')
            yield from self.request_results()

        # ``jmes`` presumably returns None when the path is absent (e.g. an
        # error payload without ``response.docs``) -- TODO confirm; fall back
        # to an empty sequence instead of raising TypeError on iteration.
        for doc in response.jmes('response.docs') or ():
            yield HALDocumentLoader(doc).load()
            if self.reach_limit():
                return
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment