Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
0be8f66f
Commit
0be8f66f
authored
Mar 29, 2018
by
sim
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Keep raw files processed by scrapers in UPLOAD_DIRECTORY
parent
ef08aad6
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
6 additions
and
3 deletions
+6
-3
base.py
gargantext/datasource/base.py
+5
-1
pipelines.py
gargantext/datasource/pipelines.py
+1
-2
No files found.
gargantext/datasource/base.py
View file @
0be8f66f
...
@@ -8,6 +8,7 @@ from scrapy.spiders import Spider
...
@@ -8,6 +8,7 @@ from scrapy.spiders import Spider
from
scrapy.signals
import
response_received
,
spider_error
,
item_dropped
from
scrapy.signals
import
response_received
,
spider_error
,
item_dropped
from
scrapy.http.request
import
Request
as
BaseRequest
from
scrapy.http.request
import
Request
as
BaseRequest
from
gargantext.constants
import
UPLOAD_DIRECTORY
from
gargantext.utils.json
import
json_dumps
from
gargantext.utils.json
import
json_dumps
from
gargantext.utils.dates
import
datetime
from
gargantext.utils.dates
import
datetime
from
gargantext.utils.convert
import
to_int
,
to_bool
,
to_str
from
gargantext.utils.convert
import
to_int
,
to_bool
,
to_str
...
@@ -35,7 +36,7 @@ class Request(BaseRequest):
...
@@ -35,7 +36,7 @@ class Request(BaseRequest):
class
Scraper
(
Spider
):
class
Scraper
(
Spider
):
MAX_COUNT
=
1000
MAX_COUNT
=
1000
BATCH_SIZE
=
100
BATCH_SIZE
=
100
DEBUG_DIR
=
'/tmp'
DEBUG_DIR
=
UPLOAD_DIRECTORY
ARGUMENTS
=
{
ARGUMENTS
=
{
'user'
:
(
to_str
,
None
),
'user'
:
(
to_str
,
None
),
'corpus'
:
(
to_int
,
None
),
'corpus'
:
(
to_int
,
None
),
...
@@ -70,6 +71,7 @@ class Scraper(Spider):
...
@@ -70,6 +71,7 @@ class Scraper(Spider):
self
.
status
=
{
"succeeded"
:
0
,
"failed"
:
0
,
"remaining"
:
0
}
self
.
status
=
{
"succeeded"
:
0
,
"failed"
:
0
,
"remaining"
:
0
}
self
.
events
=
[]
self
.
events
=
[]
self
.
events_history
=
[]
self
.
events_history
=
[]
self
.
files
=
[]
# For errors/events reporting
# For errors/events reporting
self
.
http
=
urllib3
.
PoolManager
()
self
.
http
=
urllib3
.
PoolManager
()
...
@@ -217,5 +219,7 @@ class Scraper(Spider):
...
@@ -217,5 +219,7 @@ class Scraper(Spider):
filename
=
'
%
s-
%
s
%
s'
%
(
spider
.
logger_name
,
date
,
ext
)
filename
=
'
%
s-
%
s
%
s'
%
(
spider
.
logger_name
,
date
,
ext
)
filepath
=
str
(
path
/
filename
)
filepath
=
str
(
path
/
filename
)
self
.
files
.
append
(
filepath
)
with
open
(
filepath
,
'wb'
)
as
f
:
with
open
(
filepath
,
'wb'
)
as
f
:
f
.
write
(
response
.
body
)
f
.
write
(
response
.
body
)
gargantext/datasource/pipelines.py
View file @
0be8f66f
...
@@ -72,8 +72,7 @@ class DatabasePipeline(object):
...
@@ -72,8 +72,7 @@ class DatabasePipeline(object):
resources
=
self
.
corpus
.
data
.
get
(
'resources'
,
[])
resources
=
self
.
corpus
.
data
.
get
(
'resources'
,
[])
resources
.
append
({
resources
.
append
({
"date"
:
datetime
.
now
(),
"date"
:
datetime
.
now
(),
# TODO Raw files storage and listing in paths
"paths"
:
scraper
.
files
,
"paths"
:
None
,
"scraper"
:
scraper
.
name
,
"scraper"
:
scraper
.
name
,
"query"
:
scraper
.
query
,
"query"
:
scraper
.
query
,
"status"
:
scraper
.
status
,
"status"
:
scraper
.
status
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment