humanities / gargantext
Commit 68aef175, authored Mar 14, 2018 by sim

Brand new scrapers infrastructure using scrapy

Parent: b68c1ae0
11 changed files with 630 additions and 4 deletions (+630 / -4)
Pipfile                                           +3    -0
Pipfile.lock                                      +287  -4
gargantext/datasource/__init__.py                 +1    -0
gargantext/datasource/base.py                     +102  -0
gargantext/datasource/downloadermiddlewares.py    +12   -0
gargantext/datasource/file.py                     +27   -0
gargantext/datasource/items.py                    +53   -0
gargantext/datasource/processors.py               +5    -0
gargantext/datasource/responses.py                +112  -0
gargantext/settings.py                            +18   -0
tools/mkenvs.sh                                   +10   -0
Pipfile

@@ -26,6 +26,9 @@ djangorestframework-jwt = "*"
 django-celery-beat = "*"
 python-decouple = "*"
 alembic = "*"
+scrapy = "*"
+jmespath = "*"
+risparser = "*"
 [requires]
...
Pipfile.lock

(diff collapsed: +287 -4)
gargantext/datasource/__init__.py
0 → 100644
View file @
68aef175
from
.base
import
*
gargantext/datasource/base.py  (new file, mode 100644)

import logging

from pathlib import Path
from datetime import datetime
from urllib.parse import urlencode

from scrapy.spiders import Spider
from scrapy.signals import response_received
from scrapy.http.request import Request as BaseRequest

from .responses import TextResponse, HtmlResponse, XmlResponse, JsonResponse, \
    RISResponse


__all__ = ['Scraper', 'Request', 'TextResponse', 'HtmlResponse', 'XmlResponse',
           'JsonResponse', 'RISResponse']


class Request(BaseRequest):
    def __init__(self, url, callback=None, method='GET', headers=None,
                 body=None, cookies=None, meta=None, encoding='utf-8',
                 priority=0, dont_filter=False, errback=None, flags=None,
                 params=None):
        if params:
            url += '?' + urlencode(params)
        super().__init__(url, callback, method, headers, body, cookies, meta,
                         encoding, priority, dont_filter, errback, flags)


class Scraper(Spider):
    MAX_COUNT = None
    BATCH_SIZE = 100
    DEBUG_DIR = '/tmp'
    ARGUMENTS = ['url', 'count', 'query', 'count_only']

    url = None
    count = None
    query = ''
    count_only = False

    def __init__(self, *args, **kwargs):
        # The default __init__ method will take any spider arguments and copy
        # them to the spider as attributes: filter arguments for security
        # purposes.
        spider_args = {k: v for k, v in kwargs.items() if k in self.ARGUMENTS}
        super().__init__(*args, **spider_args)

        default_parser = getattr(self, 'default_parser', None)
        if default_parser and not hasattr(self, 'parse'):
            # XXX Use setattr to bypass pylint warning...
            setattr(self, 'parser', getattr(self, default_parser))

    def start_requests(self):
        if self.url:  # and self.url.startswith('file://'):
            yield Request(self.url)
        else:
            yield from self.dispatch()

    @property
    def logger_name(self):
        return 'scrapers.%s' % self.name

    @property
    def logger(self):
        logger = logging.getLogger(self.logger_name)
        return logging.LoggerAdapter(logger, {'spider': self})

    @property
    def limit(self):
        if self.MAX_COUNT is None:
            return self.count or 0
        if self.count is None:
            return self.MAX_COUNT
        return min(self.count, self.MAX_COUNT)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.trace, signal=response_received)
        return spider

    def trace(self, response, request, spider):
        content_type = response.headers.get('content-type', b'').decode()

        self.logger.info('Content-Type=%s; type(response)=%s;',
                         content_type, type(response).__name__)

        path = Path(self.DEBUG_DIR).absolute()
        date = datetime.now().strftime("%Y%m%d_%H%m_%s.%f")
        ext = '.html' if isinstance(response, HtmlResponse) else \
              '.xml' if isinstance(response, XmlResponse) else \
              '.json' if isinstance(response, JsonResponse) else \
              '.txt' if isinstance(response, TextResponse) else \
              ''
        filename = '%s-%s%s' % (spider.logger_name, date, ext)
        filepath = str(path / filename)

        with open(filepath, 'wb') as f:
            f.write(response.body)
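For orientation, here is a minimal sketch of how a concrete scraper might build on this base class. It is not part of this commit: the spider name, endpoint URL, parameters and JMESPath expression are invented for illustration.

# Hypothetical example (not part of this commit): a concrete scraper built on
# the Scraper base class above.
from gargantext.datasource import Scraper, Request, JsonResponse


class ExampleScraper(Scraper):
    name = 'example'          # log messages go to the 'scrapers.example' logger
    expects = JsonResponse    # response type enforced by ExpectsMiddleware (below)

    def dispatch(self):
        # Called by start_requests() when no explicit url= argument was given;
        # 'params' is URL-encoded and appended to the URL by Request.__init__.
        yield Request('https://api.example.org/search',
                      params={'q': self.query, 'rows': self.BATCH_SIZE},
                      callback=self.parse)

    def parse(self, response):
        # 'response' is a JsonResponse, so JMESPath queries are available.
        for record in response.jmes('records') or []:
            yield record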
gargantext/datasource/downloadermiddlewares.py  (new file, mode 100644)

class ExpectsMiddleware(object):
    def process_response(self, request, response, spider):
        expects = getattr(spider, 'expects', None)

        if expects is not None and not isinstance(response, expects):
            expected = ' or '.join(cls.__name__ for cls in expects) \
                       if type(expects) is tuple else expects.__name__
            raise TypeError("%s: %s expected, got %s instead." % (
                spider.name, expected, response.__class__.__name__))

        return response
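The `expects` attribute may be a single response class or a tuple of classes; a short hypothetical sketch (spider name invented, not part of this commit):

# Hypothetical sketch: a scraper that accepts either XML or JSON replies.
# Any other response type makes ExpectsMiddleware raise
# TypeError("feed: XmlResponse or JsonResponse expected, got ... instead.").
from gargantext.datasource import Scraper, XmlResponse, JsonResponse


class FeedScraper(Scraper):
    name = 'feed'
    expects = (XmlResponse, JsonResponse)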
gargantext/datasource/file.py  (new file, mode 100644)

from w3lib.url import file_uri_to_path
from scrapy.utils.decorators import defers

from .responses import responsetypes, TextResponse


class FileDownloadHandler(object):
    CHUNK_SIZE = 5000

    def __init__(self, settings):
        pass

    @defers
    def download_request(self, request, spider):
        filepath = file_uri_to_path(request.url)

        with open(filepath, 'rb') as fo:
            body_chunk = fo.read(self.CHUNK_SIZE)

        # Detect response type only from data, don't trust filename extension
        respcls = getattr(spider, 'expects', None) or \
            responsetypes.from_args(body=body_chunk)

        stream = open(filepath) if issubclass(respcls, TextResponse) else \
            open(filepath, 'rb')

        return respcls(url=request.url, stream=stream)
gargantext/datasource/items.py  (new file, mode 100644)

from datetime import datetime

from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Compose, MapCompose, Identity

from .processors import filter_empty


DateTime = Field(serialize=str)
String = Field()


class Document(Item):
    id = String
    title = String
    abstract = String
    source = String
    url = String
    lang = String
    authors = String
    publication = DateTime
    creation = DateTime


class DocumentLoader(ItemLoader):
    default_item_class = Document
    default_output_processor = TakeFirst()

    to_datetime = Compose(MapCompose(str.strip, int), filter_empty,
                          lambda args: datetime(*args))

    publication_out = to_datetime
    creation_out = to_datetime
    authors_out = Identity()

    def __init__(self, selector, *args, **kwargs):
        kwargs['selector'] = selector
        super().__init__(*args, **kwargs)

    def add_xpaths_text(self, xpaths):
        for field_name, xpath in xpaths.items():
            self.add_xpath(field_name, '%s/text()' % xpath)

    def add_values(self, values):
        for field_name, value in values.items():
            self.add_value(field_name, value)

    def parse(self, obj):
        return NotImplementedError("don't use DocumentLoader directly.")

    def load(self):
        self.parse(self.selector)
        return self.load_item()
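A hedged sketch of how a concrete loader might fill a Document from an XML record; the XPaths, field values and record layout are hypothetical, not part of this commit.

# Hypothetical example: mapping an XML record onto the Document item with the
# add_xpaths_text/add_values helpers defined above.
from gargantext.datasource.items import DocumentLoader


class ExampleDocumentLoader(DocumentLoader):
    def parse(self, selector):
        self.add_xpaths_text({
            'title':    './title',
            'abstract': './abstract',
            'authors':  './authors/author/name',
        })
        self.add_values({
            'source': 'example',
            'lang':   'en',
        })
        # to_datetime strips each piece, converts it with int(), drops empties
        # and builds a datetime, so publication expects year/month/day pieces.
        self.add_xpath('publication', './published/*/text()')


# Typical use inside a scraper callback (hypothetical):
#     for record in response.xpath('//record'):
#         yield ExampleDocumentLoader(record).load()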
gargantext/datasource/processors.py  (new file, mode 100644)

__all__ = ['filter_empty']


def filter_empty(iterable):
    return list(filter(None, iterable))
gargantext/datasource/responses.py  (new file, mode 100644)

import logging
import jmespath

from abc import ABC
from weakref import WeakValueDictionary
from scrapy.http import \
    TextResponse as BaseText, HtmlResponse as BaseHtml, XmlResponse as BaseXml
from scrapy import responsetypes as _responsetypes
from scrapy.responsetypes import ResponseTypes as BaseResponseTypes

from gargantext.utils.json import json_loads

from RISparser.parser import Ris
from RISparser.config import TAG_KEY_MAPPING


logger = logging.getLogger('scrapers')


# To be used in conjunction with gargantext.datasource.file.FileDownloadHandler
class StreamableMixin(object):
    def __init__(self, *args, **kwargs):
        self.stream = kwargs.pop('stream', None)
        self._cached_stream_data = None
        super().__init__(*args, **kwargs)

    def readlines(self):
        return iter(self.stream)

    def _get_body(self):
        if self.stream is not None:
            if self._cached_stream_data is None:
                self._cached_stream_data = self.stream.read()
            return self._cached_stream_data
        return super()._get_body()


class TextResponse(StreamableMixin, BaseText):
    pass


class HtmlResponse(StreamableMixin, BaseHtml, ABC):
    pass

HtmlResponse.register(TextResponse)


class XmlResponse(StreamableMixin, BaseXml, ABC):
    pass

XmlResponse.register(TextResponse)


class ParseableResponse(TextResponse):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._cached_data = None
        self._jmes_cache = WeakValueDictionary()

    def parse(self):
        raise NotImplementedError("don't use ParseableResponse directly")

    @property
    def data(self):
        if self._cached_data is None:
            self._cached_data = self.parse()
        return self._cached_data

    def jmes(self, path):
        jp = self._jmes_cache.get(path)
        if jp is None:
            jp = self._jmes_cache[path] = jmespath.compile(path)
        return jp.search(self.data)


class JsonResponse(ParseableResponse):
    def parse(self):
        return json_loads(self.text)


class RISResponse(ParseableResponse):
    class RIS(Ris):
        PATTERN = '^[A-Z][A-Z0-9] -'

        def __init__(self, lines):
            super().__init__(lines, TAG_KEY_MAPPING)

    def parse(self):
        return self.RIS(self.readlines()).parse()


class ResponseTypes(BaseResponseTypes):
    CLASSES = {
        'text/html': 'gargantext.datasource.responses.HtmlResponse',
        'application/atom+xml': 'gargantext.datasource.responses.XmlResponse',
        'application/rdf+xml': 'gargantext.datasource.responses.XmlResponse',
        'application/rss+xml': 'gargantext.datasource.responses.XmlResponse',
        'application/xhtml+xml': 'gargantext.datasource.responses.HtmlResponse',
        'application/vnd.wap.xhtml+xml': 'gargantext.datasource.responses.HtmlResponse',
        'application/xml': 'gargantext.datasource.responses.XmlResponse',
        'application/json': 'gargantext.datasource.responses.JsonResponse',
        'application/x-json': 'gargantext.datasource.responses.JsonResponse',
        'application/openapi+json': 'gargantext.datasource.responses.JsonResponse',
        'application/json-amazonui-streaming': 'gargantext.datasource.responses.TextResponse',
        'application/javascript': 'gargantext.datasource.responses.TextResponse',
        'application/x-javascript': 'gargantext.datasource.responses.TextResponse',
        'text/xml': 'gargantext.datasource.responses.XmlResponse',
        'text/*': 'gargantext.datasource.responses.TextResponse',
    }

_responsetypes.responsetypes = responsetypes = ResponseTypes()
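A short sketch of how JsonResponse behaves: parsing happens lazily on first access to .data and compiled JMESPath expressions are cached per response. The URL and payload are invented, and the sketch assumes gargantext.utils.json.json_loads parses standard JSON like json.loads.

# Hypothetical usage sketch (not part of this commit).
from gargantext.datasource.responses import JsonResponse

r = JsonResponse(url='http://example.org/api',
                 body=b'{"records": [{"title": "A"}, {"title": "B"}]}',
                 encoding='utf-8')
assert r.jmes('records[].title') == ['A', 'B']   # compiled path is cached
assert r.data['records'][0]['title'] == 'A'      # parsed JSON is cached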
gargantext/settings.py

@@ -264,3 +264,21 @@ API_TOKENS = {
 # BOOL Interpreter
 BOOL_TOOLS_PATH = "gargantext/util/crawlers/sparql"
+
+# Scrapy settings
+BOT_NAME = 'gargantext'
+SPIDER_MODULES = ['gargantext.scrapers']
+
+DOWNLOADER_MIDDLEWARES = {
+    # Will check HTTP responses according to 'expects' attribute of scrapers
+    'gargantext.datasource.downloadermiddlewares.ExpectsMiddleware': 1,
+}
+DOWNLOAD_HANDLERS = {
+    # Enable streamed file processing to handle large files
+    'file': 'gargantext.datasource.file.FileDownloadHandler',
+    # Disable s3 handler
+    's3': None,
+}
+
+DOWNLOAD_DELAY = 0.6
+CONCURRENT_REQUESTS_PER_IP = 8
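With SPIDER_MODULES and the download handlers registered here, a crawl can also be launched programmatically; a minimal sketch, assuming a spider registered under the name 'example' and SCRAPY_SETTINGS_MODULE pointing at gargantext.settings (as set up by tools/mkenvs.sh below):

# Hypothetical launch sketch (spider name and arguments invented).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())   # reads the settings above
process.crawl('example', query='climate', count=200)
process.start()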
tools/mkenvs.sh

@@ -12,11 +12,18 @@ read -r -d '' DJANGO_VAR <<EOF
 DJANGO_SETTINGS_MODULE=$DSM
 EOF
 
+read -r -d '' SCRAPY_VAR <<EOF
+# Scrapy settings module, it is unlikely that you'll need to change that.
+# WARNING: It will be overwritten!
+SCRAPY_SETTINGS_MODULE=$DSM
+EOF
+
 build_env () {
   cat <<EOF > $ENV_FILE
 # ENVIR can be dev or prod
 ENVIR=$ENVIR
 $DJANGO_VAR
+$SCRAPY_VAR
 # Paths of configuration files, you're welcome to change that; when a simple
 # filename is given, it'll be searched in current directory.
 GARGANTEXT_CONF=$GARGANTEXT_CONF
@@ -28,6 +35,9 @@ update_env () {
   grep -Eq '^\s*DJANGO_SETTINGS_MODULE=' "$ENV_FILE" \
     && sed -E -i "s/^(\\s*DJANGO_SETTINGS_MODULE=).*/\\1$DSM/g" $ENV_FILE \
     || echo "$DJANGO_VAR" >> "$ENV_FILE"
+  grep -Eq '^\s*SCRAPY_SETTINGS_MODULE=' "$ENV_FILE" \
+    && sed -E -i "s/^(\\s*SCRAPY_SETTINGS_MODULE=).*/\\1$DSM/g" $ENV_FILE \
+    || echo "$SCRAPY_VAR" >> "$ENV_FILE"
 }
 
 [ -f "$ENV_FILE" ] && update_env || build_env