Commit 68aef175 authored by sim

Brand new scrapers infrastructure using scrapy

parent b68c1ae0
@@ -26,6 +26,9 @@ djangorestframework-jwt = "*"
django-celery-beat = "*"
python-decouple = "*"
alembic = "*"
scrapy = "*"
jmespath = "*"
risparser = "*"
[requires]
import logging
from pathlib import Path
from datetime import datetime
from urllib.parse import urlencode
from scrapy.spiders import Spider
from scrapy.signals import response_received
from scrapy.http.request import Request as BaseRequest
from .responses import TextResponse, HtmlResponse, XmlResponse, JsonResponse, \
RISResponse
__all__ = ['Scraper', 'Request', 'TextResponse', 'HtmlResponse', 'XmlResponse',
'JsonResponse', 'RISResponse']
class Request(BaseRequest):
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, flags=None, params=None):
if params:
url += '?' + urlencode(params)
super().__init__(url, callback, method, headers, body, cookies, meta,
encoding, priority, dont_filter, errback, flags)
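# Usage sketch (illustrative, not part of this commit): the extra `params`
# keyword is simply urlencoded and appended to the URL, so with a hypothetical
# endpoint:
#
#   Request('https://api.example.org/search', params={'q': 'brain', 'page': 2})
#
# is equivalent to:
#
#   Request('https://api.example.org/search?q=brain&page=2')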
class Scraper(Spider):
MAX_COUNT = None
BATCH_SIZE = 100
DEBUG_DIR = '/tmp'
ARGUMENTS = ['url', 'count', 'query', 'count_only']
url = None
count = None
query = ''
count_only = False
def __init__(self, *args, **kwargs):
# The default __init__ method will take any spider arguments and copy
# them to the spider as attributes: filter arguments for security
# purposes.
spider_args = {k: v for k, v in kwargs.items() if k in self.ARGUMENTS}
super().__init__(*args, **spider_args)
        default_parser = getattr(self, 'default_parser', None)
        if default_parser and not hasattr(self, 'parse'):
            # XXX Use setattr to bypass pylint warning...
            setattr(self, 'parse', getattr(self, default_parser))
def start_requests(self):
if self.url: # and self.url.startswith('file://'):
yield Request(self.url)
else:
yield from self.dispatch()
@property
def logger_name(self):
return 'scrapers.%s' % self.name
@property
def logger(self):
logger = logging.getLogger(self.logger_name)
return logging.LoggerAdapter(logger, {'spider': self})
@property
def limit(self):
if self.MAX_COUNT is None:
return self.count or 0
if self.count is None:
return self.MAX_COUNT
return min(self.count, self.MAX_COUNT)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super().from_crawler(crawler, *args, **kwargs)
crawler.signals.connect(spider.trace, signal=response_received)
return spider
def trace(self, response, request, spider):
content_type = response.headers.get('content-type', b'').decode()
self.logger.info('Content-Type=%s; type(response)=%s;',
content_type, type(response).__name__)
path = Path(self.DEBUG_DIR).absolute()
        date = datetime.now().strftime("%Y%m%d_%H%M_%S.%f")
ext = '.html' if isinstance(response, HtmlResponse) else \
'.xml' if isinstance(response, XmlResponse) else \
'.json' if isinstance(response, JsonResponse) else \
'.txt' if isinstance(response, TextResponse) else \
''
filename = '%s-%s%s' % (spider.logger_name, date, ext)
filepath = str(path / filename)
with open(filepath, 'wb') as f:
f.write(response.body)
class ExpectsMiddleware(object):
def process_response(self, request, response, spider):
expects = getattr(spider, 'expects', None)
if expects is not None and not isinstance(response, expects):
expected = ' or '.join(cls.__name__ for cls in expects) \
if type(expects) is tuple else expects.__name__
raise TypeError("%s: %s expected, got %s instead." % (
spider.name, expected, response.__class__.__name__))
return response
from w3lib.url import file_uri_to_path
from scrapy.utils.decorators import defers
from .responses import responsetypes, TextResponse
class FileDownloadHandler(object):
CHUNK_SIZE = 5000
def __init__(self, settings):
pass
@defers
def download_request(self, request, spider):
filepath = file_uri_to_path(request.url)
with open(filepath, 'rb') as fo:
body_chunk = fo.read(self.CHUNK_SIZE)
# Detect response type only from data, don't trust filename extension
respcls = getattr(spider, 'expects', None) or \
responsetypes.from_args(body=body_chunk)
stream = open(filepath) if issubclass(respcls, TextResponse) else \
open(filepath, 'rb')
return respcls(url=request.url, stream=stream)
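# Usage note (hedged, the spider name below is hypothetical): once this handler
# is registered for the 'file' scheme (see DOWNLOAD_HANDLERS in the settings
# further down), starting a scraper with a file:// URL as spider argument
# yields a response built from the detected type and backed by a stream,
# instead of loading the whole file into memory:
#
#   scrapy crawl example -a url=file:///tmp/export.ris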
from datetime import datetime
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Compose, MapCompose, Identity
from .processors import filter_empty
DateTime = Field(serialize=str)
String = Field()
class Document(Item):
id = String
title = String
abstract = String
source = String
url = String
lang = String
authors = String
publication = DateTime
creation = DateTime
class DocumentLoader(ItemLoader):
default_item_class = Document
default_output_processor = TakeFirst()
to_datetime = Compose(MapCompose(str.strip, int), filter_empty, lambda args: datetime(*args))
publication_out = to_datetime
creation_out = to_datetime
authors_out = Identity()
def __init__(self, selector, *args, **kwargs):
kwargs['selector'] = selector
super().__init__(*args, **kwargs)
def add_xpaths_text(self, xpaths):
for field_name, xpath in xpaths.items():
self.add_xpath(field_name, '%s/text()' % xpath)
def add_values(self, values):
for field_name, value in values.items():
self.add_value(field_name, value)
    def parse(self, obj):
        raise NotImplementedError("don't use DocumentLoader directly.")
def load(self):
self.parse(self.selector)
return self.load_item()
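# Illustrative sketch only (the subclass name and xpaths are made up): a
# concrete loader overrides parse() to map selector nodes onto Document fields;
# load() then runs parse() against the stored selector and returns the item.
#
#   class ExampleDocumentLoader(DocumentLoader):
#       def parse(self, article):
#           self.add_xpaths_text({
#               'title': './/ArticleTitle',
#               'abstract': './/AbstractText',
#           })
#           self.add_values({'source': 'example'})
#
#   document = ExampleDocumentLoader(selector).load()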
__all__ = ['filter_empty']
def filter_empty(iterable):
return list(filter(None, iterable))
import logging
import jmespath
from abc import ABC
from weakref import WeakValueDictionary
from scrapy.http import \
TextResponse as BaseText, HtmlResponse as BaseHtml, XmlResponse as BaseXml
from scrapy import responsetypes as _responsetypes
from scrapy.responsetypes import ResponseTypes as BaseResponseTypes
from gargantext.utils.json import json_loads
from RISparser.parser import Ris
from RISparser.config import TAG_KEY_MAPPING
logger = logging.getLogger('scrapers')
# To be used in conjunction with gargantext.datasource.file.FileDownloadHandler
class StreamableMixin(object):
def __init__(self, *args, **kwargs):
self.stream = kwargs.pop('stream', None)
self._cached_stream_data = None
super().__init__(*args, **kwargs)
def readlines(self):
return iter(self.stream)
def _get_body(self):
if self.stream is not None:
if self._cached_stream_data is None:
self._cached_stream_data = self.stream.read()
return self._cached_stream_data
return super()._get_body()
class TextResponse(StreamableMixin, BaseText):
pass
class HtmlResponse(StreamableMixin, BaseHtml, ABC): pass
HtmlResponse.register(TextResponse)
class XmlResponse(StreamableMixin, BaseXml, ABC): pass
XmlResponse.register(TextResponse)
class ParseableResponse(TextResponse):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._cached_data = None
self._jmes_cache = WeakValueDictionary()
def parse(self):
raise NotImplementedError("don't use ParseableResponse directly")
@property
def data(self):
if self._cached_data is None:
self._cached_data = self.parse()
return self._cached_data
def jmes(self, path):
jp = self._jmes_cache.get(path)
if jp is None:
jp = self._jmes_cache[path] = jmespath.compile(path)
return jp.search(self.data)
class JsonResponse(ParseableResponse):
def parse(self):
return json_loads(self.text)
class RISResponse(ParseableResponse):
class RIS(Ris):
PATTERN = '^[A-Z][A-Z0-9] -'
def __init__(self, lines):
super().__init__(lines, TAG_KEY_MAPPING)
def parse(self):
return self.RIS(self.readlines()).parse()
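# Consumption sketch (hedged, the JMESPath expression is hypothetical): `data`
# parses the body lazily and caches the result, while `jmes()` compiles and
# caches the path expression before searching the parsed data, e.g. in a
# scraper callback receiving a JsonResponse:
#
#   def parse(self, response):
#       for doc in response.jmes('hits.hits[*]._source') or []:
#           yield doc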
class ResponseTypes(BaseResponseTypes):
CLASSES = {
'text/html': 'gargantext.datasource.responses.HtmlResponse',
'application/atom+xml': 'gargantext.datasource.responses.XmlResponse',
'application/rdf+xml': 'gargantext.datasource.responses.XmlResponse',
'application/rss+xml': 'gargantext.datasource.responses.XmlResponse',
'application/xhtml+xml': 'gargantext.datasource.responses.HtmlResponse',
'application/vnd.wap.xhtml+xml': 'gargantext.datasource.responses.HtmlResponse',
'application/xml': 'gargantext.datasource.responses.XmlResponse',
'application/json': 'gargantext.datasource.responses.JsonResponse',
'application/x-json': 'gargantext.datasource.responses.JsonResponse',
'application/openapi+json': 'gargantext.datasource.responses.JsonResponse',
'application/json-amazonui-streaming': 'gargantext.datasource.responses.TextResponse',
'application/javascript': 'gargantext.datasource.responses.TextResponse',
'application/x-javascript': 'gargantext.datasource.responses.TextResponse',
'text/xml': 'gargantext.datasource.responses.XmlResponse',
'text/*': 'gargantext.datasource.responses.TextResponse',
}
_responsetypes.responsetypes = responsetypes = ResponseTypes()
@@ -264,3 +264,21 @@ API_TOKENS = {
# BOOL Interpreter
BOOL_TOOLS_PATH = "gargantext/util/crawlers/sparql"
# Scrapy settings
BOT_NAME = 'gargantext'
SPIDER_MODULES = ['gargantext.scrapers']
DOWNLOADER_MIDDLEWARES = {
# Will check HTTP responses according to 'expects' attribute of scrapers
'gargantext.datasource.downloadermiddlewares.ExpectsMiddleware': 1,
}
DOWNLOAD_HANDLERS = {
# Enable streamed file processing to handle large files
'file': 'gargantext.datasource.file.FileDownloadHandler',
# Disable s3 handler
's3': None,
}
DOWNLOAD_DELAY = 0.6
CONCURRENT_REQUESTS_PER_IP = 8
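# End-to-end sketch (illustrative only, not part of this commit, assuming the
# Scraper and Request definitions above are exposed by the gargantext.datasource
# package): a scraper module under gargantext.scrapers would subclass Scraper,
# declare the response type it expects (enforced by ExpectsMiddleware), and
# emit Requests from dispatch() when no explicit `url` argument is passed.
#
#   from gargantext.datasource import Scraper, Request, JsonResponse
#
#   class ExampleScraper(Scraper):
#       name = 'example'
#       expects = JsonResponse
#
#       def dispatch(self):
#           yield Request('https://api.example.org/search',
#                         params={'q': self.query, 'rows': self.BATCH_SIZE})
#
#       def parse(self, response):
#           for doc in response.jmes('docs[*]') or []:
#               yield doc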
@@ -12,11 +12,18 @@ read -r -d '' DJANGO_VAR <<EOF
DJANGO_SETTINGS_MODULE=$DSM
EOF
read -r -d '' SCRAPY_VAR <<EOF
# Scrapy settings module; it is unlikely that you'll need to change this.
# WARNING: It will be overwritten!
SCRAPY_SETTINGS_MODULE=$DSM
EOF
build_env () {
cat << EOF > $ENV_FILE
# ENVIR can be dev or prod
ENVIR=$ENVIR
$DJANGO_VAR
$SCRAPY_VAR
# Paths of configuration files, you're welcome to change that; when a simple
# filename is given, it'll be searched in current directory.
GARGANTEXT_CONF=$GARGANTEXT_CONF
@@ -28,6 +35,9 @@ update_env () {
grep -Eq '^\s*DJANGO_SETTINGS_MODULE=' "$ENV_FILE" \
&& sed -E -i "s/^(\\s*DJANGO_SETTINGS_MODULE=).*/\\1$DSM/g" $ENV_FILE \
|| echo "$DJANGO_VAR" >> "$ENV_FILE"
grep -Eq '^\s*SCRAPY_SETTINGS_MODULE=' "$ENV_FILE" \
&& sed -E -i "s/^(\\s*SCRAPY_SETTINGS_MODULE=).*/\\1$DSM/g" $ENV_FILE \
|| echo "$SCRAPY_VAR" >> "$ENV_FILE"
}
[ -f "$ENV_FILE" ] && update_env || build_env