Commit 68aef175 authored by sim

Brand new scrapers infrastructure using scrapy

parent b68c1ae0
@@ -26,6 +26,9 @@ djangorestframework-jwt = "*"
django-celery-beat = "*"
python-decouple = "*"
alembic = "*"
scrapy = "*"
jmespath = "*"
risparser = "*"
[requires]
import logging
from pathlib import Path
from datetime import datetime
from urllib.parse import urlencode
from scrapy.spiders import Spider
from scrapy.signals import response_received
from scrapy.http.request import Request as BaseRequest
from .responses import TextResponse, HtmlResponse, XmlResponse, JsonResponse, \
RISResponse
__all__ = ['Scraper', 'Request', 'TextResponse', 'HtmlResponse', 'XmlResponse',
'JsonResponse', 'RISResponse']
class Request(BaseRequest):
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, flags=None, params=None):
if params:
url += '?' + urlencode(params)
super().__init__(url, callback, method, headers, body, cookies, meta,
encoding, priority, dont_filter, errback, flags)
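# Usage sketch (illustrative, not part of this commit): the extra `params`
# keyword is simply urlencoded and appended to the URL, so with a hypothetical
# endpoint:
#
#   Request('https://api.example.org/search', params={'q': 'brain', 'page': 2})
#
# is equivalent to:
#
#   Request('https://api.example.org/search?q=brain&page=2')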
class Scraper(Spider):
MAX_COUNT = None
BATCH_SIZE = 100
DEBUG_DIR = '/tmp'
ARGUMENTS = ['url', 'count', 'query', 'count_only']
url = None
count = None
query = ''
count_only = False
def __init__(self, *args, **kwargs):
# The default __init__ method will take any spider arguments and copy
# them to the spider as attributes: filter arguments for security
# purposes.
spider_args = {k: v for k, v in kwargs.items() if k in self.ARGUMENTS}
super().__init__(*args, **spider_args)
        default_parser = getattr(self, 'default_parser', None)
        if default_parser and not hasattr(self, 'parse'):
            # XXX Use setattr to bypass pylint warning...
            setattr(self, 'parse', getattr(self, default_parser))
def start_requests(self):
if self.url: # and self.url.startswith('file://'):
yield Request(self.url)
else:
yield from self.dispatch()
@property
def logger_name(self):
return 'scrapers.%s' % self.name
@property
def logger(self):
logger = logging.getLogger(self.logger_name)
return logging.LoggerAdapter(logger, {'spider': self})
@property
def limit(self):
if self.MAX_COUNT is None:
return self.count or 0
if self.count is None:
return self.MAX_COUNT
return min(self.count, self.MAX_COUNT)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super().from_crawler(crawler, *args, **kwargs)
crawler.signals.connect(spider.trace, signal=response_received)
return spider
def trace(self, response, request, spider):
content_type = response.headers.get('content-type', b'').decode()
self.logger.info('Content-Type=%s; type(response)=%s;',
content_type, type(response).__name__)
path = Path(self.DEBUG_DIR).absolute()
        date = datetime.now().strftime("%Y%m%d_%H%M_%S.%f")
ext = '.html' if isinstance(response, HtmlResponse) else \
'.xml' if isinstance(response, XmlResponse) else \
'.json' if isinstance(response, JsonResponse) else \
'.txt' if isinstance(response, TextResponse) else \
''
filename = '%s-%s%s' % (spider.logger_name, date, ext)
filepath = str(path / filename)
with open(filepath, 'wb') as f:
f.write(response.body)
class ExpectsMiddleware(object):
def process_response(self, request, response, spider):
expects = getattr(spider, 'expects', None)
if expects is not None and not isinstance(response, expects):
expected = ' or '.join(cls.__name__ for cls in expects) \
if type(expects) is tuple else expects.__name__
raise TypeError("%s: %s expected, got %s instead." % (
spider.name, expected, response.__class__.__name__))
return response
from w3lib.url import file_uri_to_path
from scrapy.utils.decorators import defers
from .responses import responsetypes, TextResponse
class FileDownloadHandler(object):
CHUNK_SIZE = 5000
def __init__(self, settings):
pass
@defers
def download_request(self, request, spider):
filepath = file_uri_to_path(request.url)
with open(filepath, 'rb') as fo:
body_chunk = fo.read(self.CHUNK_SIZE)
# Detect response type only from data, don't trust filename extension
respcls = getattr(spider, 'expects', None) or \
responsetypes.from_args(body=body_chunk)
stream = open(filepath) if issubclass(respcls, TextResponse) else \
open(filepath, 'rb')
return respcls(url=request.url, stream=stream)
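# Usage note (hedged, the spider name below is hypothetical): once this handler
# is registered for the 'file' scheme (see DOWNLOAD_HANDLERS in the settings
# further down), starting a scraper with a file:// URL as spider argument
# yields a response built from the detected type and backed by a stream,
# instead of loading the whole file into memory:
#
#   scrapy crawl example -a url=file:///tmp/export.ris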
from datetime import datetime
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Compose, MapCompose, Identity
from .processors import filter_empty
DateTime = Field(serialize=str)
String = Field()
class Document(Item):
id = String
title = String
abstract = String
source = String
url = String
lang = String
authors = String
publication = DateTime
creation = DateTime
class DocumentLoader(ItemLoader):
default_item_class = Document
default_output_processor = TakeFirst()
to_datetime = Compose(MapCompose(str.strip, int), filter_empty, lambda args: datetime(*args))
publication_out = to_datetime
creation_out = to_datetime
authors_out = Identity()
def __init__(self, selector, *args, **kwargs):
kwargs['selector'] = selector
super().__init__(*args, **kwargs)
def add_xpaths_text(self, xpaths):
for field_name, xpath in xpaths.items():
self.add_xpath(field_name, '%s/text()' % xpath)
def add_values(self, values):
for field_name, value in values.items():
self.add_value(field_name, value)
    def parse(self, obj):
        raise NotImplementedError("don't use DocumentLoader directly.")
def load(self):
self.parse(self.selector)
return self.load_item()
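# Illustrative sketch only (the subclass name and xpaths are made up): a
# concrete loader overrides parse() to map selector nodes onto Document fields;
# load() then runs parse() against the stored selector and returns the item.
#
#   class ExampleDocumentLoader(DocumentLoader):
#       def parse(self, article):
#           self.add_xpaths_text({
#               'title': './/ArticleTitle',
#               'abstract': './/AbstractText',
#           })
#           self.add_values({'source': 'example'})
#
#   document = ExampleDocumentLoader(selector).load()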
__all__ = ['filter_empty']
def filter_empty(iterable):
return list(filter(None, iterable))
import logging
import jmespath
from abc import ABC
from weakref import WeakValueDictionary
from scrapy.http import \
TextResponse as BaseText, HtmlResponse as BaseHtml, XmlResponse as BaseXml
from scrapy import responsetypes as _responsetypes
from scrapy.responsetypes import ResponseTypes as BaseResponseTypes
from gargantext.utils.json import json_loads
from RISparser.parser import Ris
from RISparser.config import TAG_KEY_MAPPING
logger = logging.getLogger('scrapers')
# To be used in conjunction with gargantext.datasource.file.FileDownloadHandler
class StreamableMixin(object):
def __init__(self, *args, **kwargs):
self.stream = kwargs.pop('stream', None)
self._cached_stream_data = None
super().__init__(*args, **kwargs)
def readlines(self):
return iter(self.stream)
def _get_body(self):
if self.stream is not None:
if self._cached_stream_data is None:
self._cached_stream_data = self.stream.read()
return self._cached_stream_data
return super()._get_body()
class TextResponse(StreamableMixin, BaseText):
pass
class HtmlResponse(StreamableMixin, BaseHtml, ABC): pass
HtmlResponse.register(TextResponse)
class XmlResponse(StreamableMixin, BaseXml, ABC): pass
XmlResponse.register(TextResponse)
class ParseableResponse(TextResponse):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._cached_data = None
self._jmes_cache = WeakValueDictionary()
def parse(self):
raise NotImplementedError("don't use ParseableResponse directly")
@property
def data(self):
if self._cached_data is None:
self._cached_data = self.parse()
return self._cached_data
def jmes(self, path):
jp = self._jmes_cache.get(path)
if jp is None:
jp = self._jmes_cache[path] = jmespath.compile(path)
return jp.search(self.data)
class JsonResponse(ParseableResponse):
def parse(self):
return json_loads(self.text)
class RISResponse(ParseableResponse):
class RIS(Ris):
PATTERN = '^[A-Z][A-Z0-9] -'
def __init__(self, lines):
super().__init__(lines, TAG_KEY_MAPPING)
def parse(self):
return self.RIS(self.readlines()).parse()
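# Consumption sketch (hedged, the JMESPath expression is hypothetical): `data`
# parses the body lazily and caches the result, while `jmes()` compiles and
# caches the path expression before searching the parsed data, e.g. in a
# scraper callback receiving a JsonResponse:
#
#   def parse(self, response):
#       for doc in response.jmes('hits.hits[*]._source') or []:
#           yield doc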
class ResponseTypes(BaseResponseTypes):
CLASSES = {
'text/html': 'gargantext.datasource.responses.HtmlResponse',
'application/atom+xml': 'gargantext.datasource.responses.XmlResponse',
'application/rdf+xml': 'gargantext.datasource.responses.XmlResponse',
'application/rss+xml': 'gargantext.datasource.responses.XmlResponse',
'application/xhtml+xml': 'gargantext.datasource.responses.HtmlResponse',
'application/vnd.wap.xhtml+xml': 'gargantext.datasource.responses.HtmlResponse',
'application/xml': 'gargantext.datasource.responses.XmlResponse',
'application/json': 'gargantext.datasource.responses.JsonResponse',
'application/x-json': 'gargantext.datasource.responses.JsonResponse',
'application/openapi+json': 'gargantext.datasource.responses.JsonResponse',
'application/json-amazonui-streaming': 'gargantext.datasource.responses.TextResponse',
'application/javascript': 'gargantext.datasource.responses.TextResponse',
'application/x-javascript': 'gargantext.datasource.responses.TextResponse',
'text/xml': 'gargantext.datasource.responses.XmlResponse',
'text/*': 'gargantext.datasource.responses.TextResponse',
}
_responsetypes.responsetypes = responsetypes = ResponseTypes()
@@ -264,3 +264,21 @@ API_TOKENS = {
# BOOL Interpreter
BOOL_TOOLS_PATH = "gargantext/util/crawlers/sparql"
# Scrapy settings
BOT_NAME = 'gargantext'
SPIDER_MODULES = ['gargantext.scrapers']
DOWNLOADER_MIDDLEWARES = {
# Will check HTTP responses according to 'expects' attribute of scrapers
'gargantext.datasource.downloadermiddlewares.ExpectsMiddleware': 1,
}
DOWNLOAD_HANDLERS = {
# Enable streamed file processing to handle large files
'file': 'gargantext.datasource.file.FileDownloadHandler',
# Disable s3 handler
's3': None,
}
DOWNLOAD_DELAY = 0.6
CONCURRENT_REQUESTS_PER_IP = 8
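# End-to-end sketch (illustrative only, not part of this commit, assuming the
# Scraper and Request definitions above are exposed by the gargantext.datasource
# package): a scraper module under gargantext.scrapers would subclass Scraper,
# declare the response type it expects (enforced by ExpectsMiddleware), and
# emit Requests from dispatch() when no explicit `url` argument is passed.
#
#   from gargantext.datasource import Scraper, Request, JsonResponse
#
#   class ExampleScraper(Scraper):
#       name = 'example'
#       expects = JsonResponse
#
#       def dispatch(self):
#           yield Request('https://api.example.org/search',
#                         params={'q': self.query, 'rows': self.BATCH_SIZE})
#
#       def parse(self, response):
#           for doc in response.jmes('docs[*]') or []:
#               yield doc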
@@ -12,11 +12,18 @@ read -r -d '' DJANGO_VAR <<EOF
DJANGO_SETTINGS_MODULE=$DSM
EOF
read -r -d '' SCRAPY_VAR <<EOF
# Scrapy settings module; it is unlikely that you'll need to change this.
# WARNING: It will be overwritten!
SCRAPY_SETTINGS_MODULE=$DSM
EOF
build_env () {
cat << EOF > $ENV_FILE
# ENVIR can be dev or prod
ENVIR=$ENVIR
$DJANGO_VAR
$SCRAPY_VAR
# Paths of configuration files, you're welcome to change that; when a simple
# filename is given, it'll be searched in current directory.
GARGANTEXT_CONF=$GARGANTEXT_CONF
@@ -28,6 +35,9 @@ update_env () {
grep -Eq '^\s*DJANGO_SETTINGS_MODULE=' "$ENV_FILE" \
&& sed -E -i "s/^(\\s*DJANGO_SETTINGS_MODULE=).*/\\1$DSM/g" $ENV_FILE \
|| echo "$DJANGO_VAR" >> "$ENV_FILE"
grep -Eq '^\s*SCRAPY_SETTINGS_MODULE=' "$ENV_FILE" \
&& sed -E -i "s/^(\\s*SCRAPY_SETTINGS_MODULE=).*/\\1$DSM/g" $ENV_FILE \
|| echo "$SCRAPY_VAR" >> "$ENV_FILE"
}
[ -f "$ENV_FILE" ] && update_env || build_env