Commit 40620d80 authored by sim

Store scraping 'resources' in CorpusNode

parent ce2c9c78
...@@ -67,7 +67,9 @@ class Scraper(Spider): ...@@ -67,7 +67,9 @@ class Scraper(Spider):
self._total = None self._total = None
self.counter = 0 self.counter = 0
self.status = {"succeeded": 0, "failed": 0, "remaining": 0}
self.events = [] self.events = []
self.events_history = []
# For errors/events reporting # For errors/events reporting
self.http = urllib3.PoolManager() self.http = urllib3.PoolManager()
...@@ -149,6 +151,7 @@ class Scraper(Spider): ...@@ -149,6 +151,7 @@ class Scraper(Spider):
"level": level, "level": level,
"message": message, "message": message,
}) })
self.events_history.extend(self.events)
def report(self, path, succeeded=0, failed=0, remaining=None, events=None): def report(self, path, succeeded=0, failed=0, remaining=None, events=None):
# Silently fail if not bounded to a crawler or callback is undefined # Silently fail if not bounded to a crawler or callback is undefined
...@@ -157,14 +160,15 @@ class Scraper(Spider): ...@@ -157,14 +160,15 @@ class Scraper(Spider):
assert path in ('output', 'error', 'event') assert path in ('output', 'error', 'event')
self.status.update(succeeded=succeeded,
failed=failed,
remaining=remaining)
events = self.events + (events or []) events = self.events + (events or [])
self.events = [] self.events = []
url = '%s/%s' % (self.callback, path) url = '%s/%s' % (self.callback, path)
status = dict(succeeded=succeeded, status = dict(events=events, **self.status)
failed=failed,
remaining=remaining,
events=events)
data = { k: v for k, v in status.items() if v is not None } data = { k: v for k, v in status.items() if v is not None }
try: try:
......
...@@ -9,6 +9,7 @@ import logging ...@@ -9,6 +9,7 @@ import logging
from gargantext.models.nodes import DocumentNode, CorpusNode from gargantext.models.nodes import DocumentNode, CorpusNode
from gargantext.core.db import Session from gargantext.core.db import Session
from gargantext.utils.dates import datetime
from gargantext.utils.lang import lang_detect from gargantext.utils.lang import lang_detect
...@@ -68,8 +69,20 @@ class DatabasePipeline(object): ...@@ -68,8 +69,20 @@ class DatabasePipeline(object):
self.corpus = self.db.query(CorpusNode).filter_by(id=scraper.corpus).one_or_none() self.corpus = self.db.query(CorpusNode).filter_by(id=scraper.corpus).one_or_none()
def close_spider(self, scraper): def close_spider(self, scraper):
self.corpus["resources"] = [{"type": 0}] resources = self.corpus.hyperdata.get('resources', [])
resources.append({
"date": datetime.now(),
# TODO Raw files storage and listing in paths
"paths": None,
"scraper": scraper.name,
"query": scraper.query,
"status": scraper.status,
"events": scraper.events_history,
})
self.corpus['resources'] = resources
self.corpus.save_hyperdata() self.corpus.save_hyperdata()
self.db.commit() self.db.commit()
def process_item(self, item, scraper): def process_item(self, item, scraper):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment