humanities / gargantext · Commits

Commit a9731641
Authored Apr 08, 2016 by delanoe
Parent: c00da2bc

[FACTO] Scrapers tools/methods factorization. (TODO: FACTO again).

Showing 8 changed files with 227 additions and 213 deletions.
Changed files:

  gargantext/constants.py                  +6    -0
  gargantext/urls.py                       +18   -16
  gargantext/util/scrappers/pubmed.py      +0    -10
  scrapers/istex.py                        +164  -0
  scrapers/pubmed.py                       +13   -161
  scrapers/urls.py                         +16   -6
  scrapers/util.py                         +8    -18
  templates/pages/projects/project.html    +2    -2
gargantext/constants.py

```diff
@@ -208,3 +208,9 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
 # about batch processing...
 BATCH_PARSING_SIZE          = 256
 BATCH_NGRAMSEXTRACTION_SIZE = 1024
+
+# Scrapers config
+
+QUERY_SIZE_N_MAX     = 1000
+QUERY_SIZE_N_DEFAULT = 1000
+
```
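The two new constants centralize a limit that was previously hard-coded inside scrapers/pubmed.py (see the `QUERY_SIZE_N_MAX = 1000` line removed below). A minimal sketch of the guard scraper views are expected to apply, mirroring the check in the new scrapers/istex.py; `clamp_sample_size` is a hypothetical helper name for illustration, not part of this commit:

```python
# Hypothetical helper showing how the new constants are meant to be
# consumed by scraper views (the same guard appears in scrapers/istex.py).
from gargantext.constants import QUERY_SIZE_N_MAX, QUERY_SIZE_N_DEFAULT

def clamp_sample_size(n=QUERY_SIZE_N_DEFAULT):
    # refuse oversized download requests instead of silently truncating them
    if n > QUERY_SIZE_N_MAX:
        raise ValueError("Invalid sample size N = %i (max = %i)" % (n, QUERY_SIZE_N_MAX))
    return n
```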
gargantext/urls.py

```diff
@@ -8,9 +8,9 @@ Views are shared between these modules:
 - `graph explorer`, to explore graphs
 """
 
 from django.conf.urls import include, url
 from django.contrib   import admin
 
 import gargantext.views.api.urls
 import gargantext.views.generated.urls
@@ -18,32 +18,34 @@ import gargantext.views.pages.urls
 
 # Module Annotation
 ## tempo: unchanged doc-annotations --
 from annotations import urls as annotations_urls
 from annotations.views import main as annotations_main_view
 
 # Module "Graph Explorer"
 #from graphExplorer import urls as graphExplorer_urls
 from graphExplorer.rest  import Graph
 from graphExplorer.views import explorer
-from scrapers import urls as scrapers_urls
+
+# Module Scrapers
+from scrapers import urls as scrapers_urls
 
 urlpatterns = [ url(r'^admin/', admin.site.urls)
               , url(r'^generated/', include(gargantext.views.generated.urls))
               , url(r'^api/', include(gargantext.views.api.urls))
               , url(r'^', include(gargantext.views.pages.urls))
 
               # Module Annotation
               # tempo: unchanged doc-annotations routes --
               , url(r'^annotations/', include(annotations_urls))
               , url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view)
 
               # Module "Graph Explorer"
               , url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer)
               , url(r'^projects/(\d+)/corpora/(\d+)/graph$', Graph.as_view())
 
               # to be removed:
               , url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
               #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
-              #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))
-              , url(r'^scrapers/', include(scrapers_urls))
+
+              # Scrapers module
+              , url(r'^scrapers/', include(scrapers_urls))
 ]
```
gargantext/util/scrappers/pubmed.py  (deleted, 100644 → 0)

```python
def suggest(keywords):
    return ['Suggestion #1', 'Suggestion #2', 'Suggestion #3', 'Suggestion #4', 'Suggestion #5']

def count(keywords):
    return 42

def query_save(keywords):
    return 'path/to/query.xml'
```
scrapers/istex.py  (new file)

```python
# from datetime import datetime
from time import sleep
import datetime
import threading
#from gargantext.settings import MEDIA_ROOT, BASE_DIR

from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.tools import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

from scrapers.util import Scraper


def query(request):
    """
    ISTEX simply the total of hits for a query
    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar", "foo"]

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)

        query_string = query.replace(" ", "+")
        url = "http://api.istex.fr/document/?q=" + query_string + "&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = Scraper()
        try:
            thedata_path = tasks.download(url)
            thedata = open(thedata_path, "rb")
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)


def save(request, project_id):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    if request.method == "POST":
        query = "-"
        query_string = "-"
        N = 0

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q

        if "N" in request.POST:
            N = int(request.POST["N"])  # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ", msg)
                raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                           + "&output=*&" + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instantiation as a Django model
        corpus = Node(
            name=query,
            user_id=request.user.id,
            parent_id=project_id,
            typename='CORPUS',
            hyperdata={"action": "Scraping data", "language_id": None}
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)

        ensure_dir(request.user)
        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(type=3, path=filename)
                dwnldsOK += 1

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = [query_string, query, N]
    return JsonHttpResponse(data)
```
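`save()` drives the downloads with a simple producer/consumer setup: eight daemon threads consume URLs from `tasks.q`, and `tasks.q.join()` blocks until every queued page has been fetched. Below is a self-contained sketch of that pattern using only the standard library; the `worker` body is a placeholder, since in the real code `Scraper.worker2` performs the download:

```python
# Standalone sketch of the queue/worker pattern used by save() above,
# assuming Scraper.q is a queue.Queue drained by Scraper.worker2.
import threading
from queue import Queue

q = Queue()

def worker():
    while True:
        url = q.get()      # blocks until a URL is queued
        try:
            pass           # the real worker downloads `url` here
        finally:
            q.task_done()  # required so q.join() can unblock

for _ in range(8):
    t = threading.Thread(target=worker)
    t.daemon = True        # dies when the main thread exits
    t.start()

for url in ["http://api.istex.fr/document/?q=test&from=0&size=50"]:
    q.put(url)             # enqueue one request per result page

q.join()                   # wait until every page has been processed
```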
scrapers/pubmed.py

```diff
+# ****************************
+# *****  Medline Scraper *****
+# ****************************
-from scrapers.MedlineFetcher import MedlineFetcher
+# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
+# between 9 pm and 5 am Eastern Time weekdays
 # from datetime import datetime
@@ -13,7 +17,7 @@ import threading
 from django.shortcuts import redirect
 from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
-from gargantext.constants import RESOURCETYPES
+from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX
 from gargantext.models.nodes import Node
 from gargantext.util.db import session
 from gargantext.util.http import JsonHttpResponse
@@ -21,22 +25,11 @@ from gargantext.util.tools import ensure_dir
 from gargantext.util.scheduling import scheduled
 from gargantext.util.toolchain import parse_extract_indexhyperdata
+from scrapers.util import Scraper
 
-# to read the [scrapers] section of gargantext.ini
-#from configparser import ConfigParser
-
-# --------------------------------------------------------------------
-# importing constants from config file
-#CONF = ConfigParser()
-#with open(path.join(BASE_DIR, 'gargantext.ini')) as inifile:
-#    CONF.read_file(inifile)
-QUERY_SIZE_N_MAX = 1000 # int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
-# QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
-# --------------------------------------------------------------------
-
-def getGlobalStats( request ):
+def query( request ):
     """
     Pubmed year by year results
@@ -47,7 +40,7 @@ def getGlobalStats( request ):
     # 'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
     #  ... ]
-    (reused as thequeries in doTheQuery)
+    (reused as thequeries in query_save)
     """
     print(request.method)
     alist = []
@@ -63,7 +56,7 @@ def getGlobalStats( request ):
     print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
     print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
-    instancia = MedlineFetcher()
+    instancia = Scraper()
     # serialFetcher (n_last_years, query, query_size)
     alist = instancia.serialFetcher(5, query, N)
@@ -72,7 +65,7 @@ def getGlobalStats( request ):
     return JsonHttpResponse(data)
 
-def doTheQuery( request , project_id ) :
+def save( request , project_id ) :
     # implicit global session
     # do we have a valid project id?
     try:
@@ -103,7 +96,7 @@ def doTheQuery( request , project_id ) :
     # here we just realize queries already prepared by getGlobalStats
     # ===> no need to repeat N parameter like in testISTEX <===
-    instancia = MedlineFetcher()
+    instancia = Scraper()
     thequeries = json.loads(queries)
 
     # fyi the sum of our prepared yearly proportional quotas
@@ -138,7 +131,7 @@ def doTheQuery( request , project_id ) :
     ensure_dir(request.user)
-    tasks = MedlineFetcher()
+    tasks = Scraper()
     for i in range(8):
         t = threading.Thread(target=tasks.worker2)  # thing to do
@@ -173,144 +166,3 @@ def doTheQuery( request , project_id ) :
     return JsonHttpResponse(data)
-
-def getGlobalStatsISTEXT( request ):
-    """
-    ISTEX simply the total of hits for a query
-    (not reused in testISTEX)
-    """
-    ...
-
-def testISTEX( request , project_id ):
-    print("testISTEX:")
-    ...
```

The 144 removed lines (elided above) are, name changes aside, the same code that reappears as `query()` and `save()` in the new scrapers/istex.py: the old `getGlobalStatsISTEXT()` and `testISTEX()` bodies differ only in instantiating `MedlineFetcher()` where the new module instantiates `Scraper()`.
scrapers/urls.py

```diff
 from django.conf.urls import url
 
 import scrapers.pubmed as pubmed
-# import scrapers.istex as istex
+import scrapers.istex as istex
 #import scrapers.cern as cern
 #import scrapers.hal as hal
@@ -11,11 +11,21 @@ import scrapers.pubmed as pubmed
 # Available databases : Pubmed, IsTex, (next: CERN)
 # /!\ urls patterns here are *without* the trailing slash
-urlpatterns = [ url(r'^pubmed/query$'      , pubmed.getGlobalStats      )
+urlpatterns = [ url(r'^pubmed/query$'      , pubmed.query               )
-              , url(r'^pubmed/search/(\d+)', pubmed.doTheQuery          )
+              , url(r'^pubmed/save/(\d+)'  , pubmed.save                )
-              , url(r'^istex/query$'       , pubmed.getGlobalStatsISTEXT)
+              , url(r'^istex/query$'       , istex.query                )
-              , url(r'^istex/search/(\d+)' , pubmed.testISTEX           )
+              , url(r'^istex/save/(\d+)'   , istex.save                 )
-              #, url(r'^scraping$' , scraping.Target.as_view() )
+              # TODO REST API for the scrapers
+              #, url(r'^rest$' , scraping.Target.as_view() )
               ,
 ]
+
+#def count(keywords):
+#    return 42
+#
+#def query_save(keywords):
+#    return 'path/to/query.xml'
+#
```
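Mounted under `^scrapers/` by gargantext/urls.py above, these patterns resolve to paths such as `/scrapers/pubmed/query` and `/scrapers/istex/save/<project_id>` (no trailing slash). An illustrative call with Django's test client; the project id and query payload here are made up:

```python
# Illustrative only: the paths follow from gargantext/urls.py plus this file,
# and "query"/"N" are the keys the views read from request.POST.
from django.test import Client

client = Client()
stats = client.post("/scrapers/pubmed/query", {"query": "migraine", "N": 100})
saved = client.post("/scrapers/istex/save/1", {"query": "migraine", "N": 100})
```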
scrapers/MedlineFetcher.py → scrapers/util.py  (renamed)

```diff
-# ****************************
-# *****  Medline Fetcher *****
-# ****************************
-
-# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
-# between 9 pm and 5 am Eastern Time weekdays
-
 from gargantext.util.files import download
 import sys
-if sys.version_info >= (3, 0):
-    from urllib.request import urlopen
-else:
-    from urllib import urlopen
-import os
 import time
-# import libxml2
-from lxml import etree
-import datetime
-from django.core.files import File
-import codecs
 import threading
 from queue import Queue
-# import time
 
-class MedlineFetcher:
+from lxml import etree
+
+if sys.version_info >= (3, 0):
+    from urllib.request import urlopen
+else:
+    from urllib import urlopen
+
+class Scraper:
 
     def __init__(self):
         self.queue_size = 8
 ...
```
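The diff view truncates the class body, but the call sites in scrapers/istex.py and scrapers/pubmed.py pin down its surface: a `q` queue, a `worker2` consumer, `chunks()`, `download()`, `serialFetcher()`, and a `firstResults` list. A hedged reconstruction of that interface follows; the method bodies are assumptions for illustration, not the committed code:

```python
# Interface implied by the call sites above; bodies are illustrative guesses.
from queue import Queue

class Scraper:

    def __init__(self):
        self.queue_size   = 8        # shown in the diff
        self.q            = Queue()  # callers enqueue page URLs here
        self.firstResults = []       # callers scan this after q.join()

    def chunks(self, iterable, n):
        # n-sized slices; callers do list(tasks.chunks(range(N), pagesize))
        items = list(iterable)
        for i in range(0, len(items), n):
            yield items[i:i + n]

    def download(self, url):
        # fetch `url` and return a local file path (or False on failure)
        raise NotImplementedError

    def worker2(self):
        # queue consumer run by the eight daemon threads in save()
        while True:
            url = self.q.get()
            try:
                self.firstResults.append(self.download(url))
            except Exception:
                self.firstResults.append(False)
            finally:
                self.q.task_done()
```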
templates/pages/projects/project.html

```diff
@@ -260,7 +260,7 @@
             $.ajax({
                 // contentType: "application/json",
-                url: window.location.origin + "/scrapers/pubmed/search/" + projectid,
+                url: window.location.origin + "/scrapers/pubmed/save/" + projectid,
                 data: pubmedifiedQuery,
                 type: 'POST',
                 beforeSend: function(xhr) {
@@ -504,7 +504,7 @@
             $.ajax({
                 // contentType: "application/json",
-                url: window.location.origin + "/scrapers/istex/search/" + projectid,
+                url: window.location.origin + "/scrapers/istex/save/" + projectid,
                 data: postQuery,
                 type: 'POST',
                 beforeSend: function(xhr) {
```