Commit 2bbecc32 authored Jul 17, 2017 by sim
[REFACT] Remove unreachable code in PUBMED crawler
parent 098ec535
Showing 1 changed file with 0 additions and 137 deletions
gargantext/util/crawlers/PUBMED.py
@@ -196,140 +196,3 @@ class PubmedCrawler(Crawler):
            downloaded = False
            self.status.insert(0, "error fetching PUBMED " + r.status)
        return downloaded


def query(request):
    """
    Pubmed year by year results

    # alist = [
    #     {'string': '2011[dp] serendipity', 'queryKey': '1',
    #      'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
    #     {'string': '2012[dp] serendipity', 'queryKey': '1',
    #      'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
    #     ... ]
    (reused as thequeries in query_save)
    """
    print(request.method)
    alist = []

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])

        if N > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
            print("ERROR(scrap: pubmed stats): ", msg)
            raise ValueError(msg)

        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
        instancia = Scraper()

        # serialFetcher (n_last_years, query, query_size)
        alist = instancia.serialFetcher(5, query, N)

    data = alist
    return JsonHttpResponse(data)


def save(request, project_id):
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()

    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()

    if request.method == "POST":
        queries = request.POST["query"]
        name = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        # ===> no need to repeat N parameter like in testISTEX <===
        instancia = Scraper()
        thequeries = json.loads(queries)

        # fyi the sum of our prepared yearly proportional quotas
        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
        print("Scrapping Pubmed: '%s' (N=%i)" % (name, sampled_sum))

        urlreqs = []
        for yearquery in thequeries:
            urlreqs.append(instancia.medlineEfetchRAW(yearquery))
        alist = ["tudo fixe", "tudo bem"]

        # corpus node instanciation as a Django model
        corpus = project.add_child(name=name, typename="CORPUS")

        # """
        # urlreqs: List of urls to query.
        # - Then, to each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()  # this will output the XML... normally you write this to a XML-file.
        # """
        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            print(filename)
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(type=resourcetype('Pubmed (XML format)'),
                                    path=filename,
                                    url=None)
                print("Adding the resource")
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)
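
A note for readers of this diff: the docstring of the removed query() shows serialFetcher returning a proportional 'retmax' quota per year (6 and 4 for counts 475 and 345, which is consistent with a sample size N of 10). A hypothetical helper reproducing that proportional split, purely for illustration (yearly_quotas and its arguments are not part of gargantext, and serialFetcher's actual rounding may differ):

# Hypothetical helper, not part of gargantext: splits a total sample size N
# across years in proportion to each year's hit count, like the 'retmax'
# values shown in the removed docstring.
def yearly_quotas(counts_by_year, N):
    total = sum(counts_by_year.values())
    return {year: round(N * count / total)
            for year, count in counts_by_year.items()}

# yearly_quotas({'2011': 475, '2012': 345}, 10) == {'2011': 6, '2012': 4}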
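
The removed save() view also illustrates the download fan-out it relied on: eight daemon threads draining a shared queue of eFetch URLs, with q.join() as the barrier. Below is a minimal self-contained sketch of that producer/consumer pattern, assuming nothing from gargantext (Scraper.worker2, Scraper.q and Scraper.firstResults are replaced by a local worker, queue and results list; the URL is a placeholder):

import queue
import threading
import urllib.request

q = queue.Queue()
results = []
results_lock = threading.Lock()

def worker():
    # Block on the queue, fetch a URL, record the payload (or False on
    # failure, mirroring the `filename != False` check in the removed code),
    # then mark the task done so q.join() can eventually return.
    while True:
        url = q.get()
        try:
            with urllib.request.urlopen(url) as resp:
                payload = resp.read()  # the XML the removed comment mentions
            with results_lock:
                results.append(payload)
        except Exception:
            with results_lock:
                results.append(False)
        finally:
            q.task_done()

for _ in range(8):  # same fan-out as the removed code
    t = threading.Thread(target=worker)
    t.daemon = True  # thread dies when the main (only non-daemon) thread exits
    t.start()

urlreqs = ["https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed"]  # placeholder
for url in urlreqs:
    q.put(url)  # put a task in the queue
q.join()  # wait until every queued URL has been processed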