Commit 0be8f66f authored by sim

Keep raw files processed by scrapers in UPLOAD_DIRECTORY

parent ef08aad6
......@@ -8,6 +8,7 @@ from scrapy.spiders import Spider
from scrapy.signals import response_received, spider_error, item_dropped
from scrapy.http.request import Request as BaseRequest
from gargantext.constants import UPLOAD_DIRECTORY
from gargantext.utils.json import json_dumps
from gargantext.utils.dates import datetime
from gargantext.utils.convert import to_int, to_bool, to_str
......@@ -35,7 +36,7 @@ class Request(BaseRequest):
class Scraper(Spider):
MAX_COUNT = 1000
BATCH_SIZE = 100
DEBUG_DIR = '/tmp'
DEBUG_DIR = UPLOAD_DIRECTORY
ARGUMENTS = {
'user': (to_str, None),
'corpus': (to_int, None),
......@@ -70,6 +71,7 @@ class Scraper(Spider):
self.status = {"succeeded": 0, "failed": 0, "remaining": 0}
self.events = []
self.events_history = []
self.files = []
# For errors/events reporting
self.http = urllib3.PoolManager()
......@@ -217,5 +219,7 @@ class Scraper(Spider):
filename = '%s-%s%s' % (spider.logger_name, date, ext)
filepath = str(path / filename)
self.files.append(filepath)
with open(filepath, 'wb') as f:
f.write(response.body)
......@@ -72,8 +72,7 @@ class DatabasePipeline(object):
resources = self.corpus.data.get('resources', [])
resources.append({
"date": datetime.now(),
# TODO Raw files storage and listing in paths
"paths": None,
"paths": scraper.files,
"scraper": scraper.name,
"query": scraper.query,
"status": scraper.status,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment