humanities / gargantext · Commits

Commit b94e4312, authored Feb 06, 2018 by sim
Revert "Remove moissonneurs module"
This reverts commit fde04dab.

Parent: 5df80fbb
Showing 9 changed files with 1099 additions and 0 deletions (+1099 / -0)
__init__.py    gargantext/moissonneurs/__init__.py    +0    -0
cern.py        gargantext/moissonneurs/cern.py        +119  -0
hal.py         gargantext/moissonneurs/hal.py         +122  -0
isidore.py     gargantext/moissonneurs/isidore.py     +118  -0
istex.py       gargantext/moissonneurs/istex.py       +189  -0
multivac.py    gargantext/moissonneurs/multivac.py    +123  -0
pubmed.py      gargantext/moissonneurs/pubmed.py      +174  -0
urls.py        gargantext/moissonneurs/urls.py        +44   -0
util.py        gargantext/moissonneurs/util.py        +210  -0
gargantext/moissonneurs/__init__.py  0 → 100644  (new empty file)
gargantext/moissonneurs/cern.py  0 → 100644

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# *****   CERN Crawler   *****
# ****************************
RESOURCE_TYPE_SCOAP = 9

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata


def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
        return JsonHttpResponse({"results_nb": crawlerbot.results_nb})


def save(request, project_id, return_corpus=False):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instanciation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": "en"})

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"]
                            #, name = source["name"]
                            , path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
gargantext/moissonneurs/hal.py  0 → 100644

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# *****    HAL Crawler   *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
RESOURCE_TYPE_HAL = 11

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect \
                      , HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata


def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_HAL)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
            print(results)
        return JsonHttpResponse({"results_nb": crawlerbot.results_nb})


def save(request, project_id, return_corpus=False):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_HAL)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instanciation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data"})

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"]
                            #, name = source["name"]
                            , path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
gargantext/moissonneurs/isidore.py  0 → 100644

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# *****  ISIDORE Crawler *****
# ****************************
RESOURCE_TYPE_ISIDORE = 12

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata


def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_ISIDORE)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
        return JsonHttpResponse({"results_nb": crawlerbot.results_nb})


def save(request, project_id, return_corpus=False):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_ISIDORE)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instanciation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": "fr"})

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"]
                            #, name = source["name"]
                            , path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
gargantext/moissonneurs/istex.py  0 → 100644

from datetime import datetime
from time import sleep
import datetime
import threading
from traceback import print_tb
#from gargantext.settings import MEDIA_ROOT, BASE_DIR

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

from .util import Scraper

RESOURCE_TYPE_ISTEX = 8


def query(request):
    """
    ISTEX simply the total of hits for a query

    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar", "foo"]

    if request.method == "POST":
        query = request.POST["query"]
        if request.POST["N"] == "NaN":
            N = QUERY_SIZE_N_MAX
        else:
            N = int(request.POST["N"])

        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
        query_string = query.replace(" ", "+")
        url = "http://api.istex.fr/document/?q=" + query_string + "&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = Scraper()

        try:
            thedata_path = tasks.download(url)
            thedata = open(thedata_path, "rb")
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)


def save(request, project_id, return_corpus=False):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = (session.query(Node)
                      .filter(Node.id == project_id)
                      .filter(Node.typename == 'PROJECT')
              ).first()
    if project is None:
        raise Http404()
    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    query_string = ""
    if request.method == "POST":
        query = "-"
        query_string = "-"
        #N = QUERY_SIZE_N_MAX

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q

        if "N" in request.POST:
            if request.POST["N"] == "NaN":
                N = QUERY_SIZE_N_MAX
            else:
                N = int(request.POST["N"])  # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                N = QUERY_SIZE_N_MAX
                #msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                #print("ERROR (scrap: istex d/l ): ",msg)
                #raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                           + "&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&"
                           + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instanciation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": None})

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in th queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(type=get_resource(RESOURCE_TYPE_ISTEX)["type"]
                                    , path=filename)
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])
        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
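The paging loop in save() above slices the requested sample into ISTEX API pages of at most 50 records. A minimal sketch of the resulting from/size parameters, with a made-up N (illustration only, not part of the commit):

# Illustration only: how N requested documents map to ISTEX page requests of at
# most 50 records, mirroring Scraper.chunks(range(N), 50) plus the last-page
# adjustment in save() above.
N = 120
pagesize = 50
for start in range(0, N, pagesize):
    size = min(pagesize, N - start)
    print("from=%d&size=%d" % (start, size))   # from=0&size=50, from=50&size=50, from=100&size=20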
gargantext/moissonneurs/multivac.py  0 → 100644

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** MULTIVAC Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
RESOURCE_TYPE_MULTIVAC = 10

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata


def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_MULTIVAC)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
            print(results)
        return JsonHttpResponse({"results_nb": crawlerbot.results_nb})


def save(request, project_id, return_corpus=False):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_MULTIVAC)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instanciation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": "en"})

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"]
                            #, name = source["name"]
                            , path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
gargantext/moissonneurs/pubmed.py  0 → 100644

# ****************************
# *****  Medline Scraper *****
# ****************************
# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
#  between 9 pm and 5 am Eastern Time weekdays

# from datetime import datetime
from time import sleep
import json
import datetime
from os import path
import threading
from traceback import print_tb
#from gargantext.settings import MEDIA_ROOT, BASE_DIR

from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource_by_name, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

from .util import Scraper


def query(request):
    """
    Pubmed year by year results

    # alist = [
    #     {'string': '2011[dp] serendipity', 'queryKey': '1',
    #      'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
    #     {'string': '2012[dp] serendipity', 'queryKey': '1',
    #      'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
    #     ... ]

    (reused as thequeries in query_save)
    """
    print(request.method)
    alist = []

    if request.method == "POST":
        query = request.POST["query"]
        if request.POST["N"] == "NaN":
            N = QUERY_SIZE_N_MAX
        else:
            N = int(request.POST["N"])

        if N > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
            print("ERROR(scrap: pubmed stats): ", msg)
            raise ValueError(msg)

        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
        instancia = Scraper()

        # serialFetcher (n_last_years, query, query_size)
        alist = instancia.serialFetcher(5, query, N)

    data = alist
    return JsonHttpResponse(data)


def save(request, project_id, return_corpus=False):
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()
    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()

    if request.method == "POST":
        queries = request.POST["query"]
        name = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        #    ===> no need to repeat N parameter like in testISTEX <===

        instancia = Scraper()
        thequeries = json.loads(queries)

        # fyi the sum of our prepared yearly proportional quotas
        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
        print("Scrapping Pubmed: '%s' (N=%i)" % (name, sampled_sum))

        urlreqs = []
        for yearquery in thequeries:
            urlreqs.append(instancia.medlineEfetchRAW(yearquery))
        alist = ["tudo fixe", "tudo bem"]

        # corpus node instanciation as a Django model
        corpus = project.add_child(name=name, typename="CORPUS")

        # """
        # urlreqs: List of urls to query.
        # - Then, to each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()  # this will output the XML... normally you write this to a XML-file.
        # """

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            print(filename)
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(type=get_resource_by_name('Pubmed [XML]')["type"]
                                    , path=filename
                                    , url=None)
                print("Adding the resource")
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])
        try:
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        sleep(1)
        if return_corpus:
            return corpus
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)
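For reference, save() expects request.POST["query"] to carry the JSON list that query() returned via serialFetcher. A minimal sketch of that round trip, reusing the values from the docstring example above (illustration only, not part of the commit):

# Illustration only: the yearly queries prepared by query()/serialFetcher,
# posted back to save() as a JSON string (values taken from the docstring example).
import json

queries = json.dumps([
    {"string": "2011[dp] serendipity", "queryKey": "1",
     "webEnv": "NCID_1_11...._F_1", "count": 475, "retmax": 6},
    {"string": "2012[dp] serendipity", "queryKey": "1",
     "webEnv": "NCID_1_14..._F_1", "count": 345, "retmax": 4},
])

thequeries = json.loads(queries)
sampled_sum = sum(year_q['retmax'] for year_q in thequeries)
print(sampled_sum)   # 10 records would be fetched across the prepared years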
gargantext/moissonneurs/urls.py  0 → 100644

#  ____  ____ ____    _  _  ____  _____ ____   _
# / ___|/ ___|  _ \  | || || _ \___ /| _ \   | |
# \___ \| |  | |_) | | || |_| |_) ||_ \| |_) / __)
#  ___) | |__|  _ <  |__   _|  __/___) | _ <\__ \
# |____/ \____|_| \_\   |_| |_|  |____/|_| \_( /
#                                            |_|
#
# moissonneurs == getting data from external databases

from django.conf.urls import url

# Available databases :
import gargantext.moissonneurs.pubmed   as pubmed
import gargantext.moissonneurs.istex    as istex
import gargantext.moissonneurs.cern     as cern
import gargantext.moissonneurs.multivac as multivac
import gargantext.moissonneurs.hal      as hal
import gargantext.moissonneurs.isidore  as isidore

# TODO : ISIDORE

# /!\ urls patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$'       , pubmed.query  )
              , url(r'^pubmed/save/(\d+)'   , pubmed.save   )
              , url(r'^istex/query$'        , istex.query   )
              , url(r'^istex/save/(\d+)'    , istex.save    )
              , url(r'^cern/query$'         , cern.query    )
              , url(r'^cern/save/(\d+)'     , cern.save     )
              , url(r'^multivac/query$'     , multivac.query)
              , url(r'^multivac/save/(\d+)' , multivac.save )
              , url(r'^hal/query$'          , hal.query     )
              , url(r'^hal/save/(\d+)'      , hal.save      )
              , url(r'^isidore/query$'      , isidore.query )
              , url(r'^isidore/save/(\d+)'  , isidore.save  )
              ]
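The root URLconf that mounts these patterns is not part of this commit; a minimal sketch, assuming they are included under a hypothetical moissonneurs/ prefix with Django's include(), could look like this:

# Hypothetical root urls.py excerpt (not part of this commit): mounting the
# moissonneurs URL patterns under a "moissonneurs/" prefix.
from django.conf.urls import include, url

import gargantext.moissonneurs.urls as moissonneurs

urlpatterns = [
    # e.g. /moissonneurs/pubmed/query and /moissonneurs/pubmed/save/<project_id>
    url(r'^moissonneurs/', include(moissonneurs.urlpatterns)),
]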
gargantext/moissonneurs/util.py  0 → 100644

from gargantext.util.files import download

import sys
import time
import threading
from queue import Queue
from lxml import etree

if sys.version_info >= (3, 0):
    from urllib.request import urlopen
else:
    from urllib import urlopen


class Scraper:

    def __init__(self):
        self.queue_size = 8
        self.q = Queue()
        self.firstResults = []
        self.lock = threading.Lock()  # lock to serialize console output
        self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
        self.pubMedDB = 'Pubmed'
        self.reportType = 'medline'

    # Return the globalResults!:
    # - count =
    # - queryKey =
    # - webEnv =
    def medlineEsearch(self, query):
        # print ("MedlineFetcher::medlineEsearch :")

        "Get number of results for query 'query' in variable 'count'"
        "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"

        # print(query)
        origQuery = query
        query = query.replace(' ', '%20')

        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
                  % (self.pubMedEutilsURL, self.pubMedDB, query)

        try:
            eSearchResult = urlopen(eSearch)
            data = eSearchResult.read()
            root = etree.XML(data)
            findcount = etree.XPath("/eSearchResult/Count/text()")
            count = findcount(root)[0]
            findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
            queryKey = findquerykey(root)[0]
            findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
            webEnv = findwebenv(root)[0]
        except Exception as Error:
            print(Error)
            count = 0
            queryKey = False
            webEnv = False
            origQuery = False

        values = {"query": origQuery, "count": int(count),
                  "queryKey": queryKey, "webEnv": webEnv}
        return values

    # RETMAX:
    # Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
    # maximum of 100,000 records
    def medlineEfetchRAW(self, fullquery):
        query = fullquery["string"]
        retmax = fullquery["retmax"]
        count = fullquery["count"]
        queryKey = fullquery["queryKey"]
        webEnv = fullquery["webEnv"]

        "Fetch medline result for query 'query', saving results to file every 'retmax' articles"
        queryNoSpace = query.replace(' ', '')  # No space in directory and file names, avoids stupid errors

        # print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')

        retstart = 0
        eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' % (self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
        return eFetch

    # generic!
    def download(self, url):
        print(url)
        filename = download(url)
        with self.lock:
            print(threading.current_thread().name, filename + " OK")
        return filename

    # generic!
    def do_work(self, item):
        # time.sleep(1) # pretend to do some lengthy work.
        returnvalue = self.medlineEsearch(item)
        with self.lock:
            # print(threading.current_thread().name, item)
            return returnvalue

    # The worker thread pulls an item from the queue and processes it
    def worker(self):
        while True:
            item = self.q.get()
            self.firstResults.append(self.do_work(item))
            self.q.task_done()

    def worker2(self):
        while True:
            item = self.q.get()
            results = []
            try:
                result = self.download(item)
            except Exception as error:
                print(error)
                result = False
            self.firstResults.append(result)
            self.q.task_done()

    def chunks(self, l, n):
        print("chunks:")
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # GLOBALLIMIT:
    # I will retrieve this exact amount of publications.
    # The publications per year i'll retrieve per year will be :
    #        (k/N)*GlobalLimit
    #                  \_ this is used as RETMAX
    # - k : Number of publications of x year (according to pubmed)
    # - N : Sum of every k belonging to {X} (total number of pubs according to pubmed)
    # - GlobalLimit : Number of publications i want.
    def serialFetcher(self, yearsNumber, query, globalLimit):
        # Create the queue and thread pool.
        for i in range(self.queue_size):
            t = threading.Thread(target=self.worker)
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        start = time.perf_counter()
        N = 0
        # print ("MedlineFetcher::serialFetcher :")
        thequeries = []
        globalresults = []
        for i in range(yearsNumber):
            year = str(2015 - i)
            # print ('YEAR ' + year)
            # print ('---------\n')
            pubmedquery = str(year) + '[dp] ' + query
            self.q.put(pubmedquery)  # put task in the queue

        self.q.join()
        print('time:', time.perf_counter() - start)

        Total = 0
        Fails = 0
        for globalresults in self.firstResults:
            # globalresults = self.medlineEsearch(pubmedquery)
            Total += 1
            if globalresults["queryKey"] == False:
                Fails += 1
            if globalresults["count"] > 0:
                N += globalresults["count"]
                queryhyperdata = {"string": globalresults["query"],
                                  "count": globalresults["count"],
                                  "queryKey": globalresults["queryKey"],
                                  "webEnv": globalresults["webEnv"],
                                  "retmax": 0}
                thequeries.append(queryhyperdata)

        print("Total Number:", N, "publications")
        print("And i want just:", globalLimit, "publications")
        print("---------------------------------------\n")

        for i, query in enumerate(thequeries):
            k = query["count"]
            proportion = k / float(N)
            retmax_forthisyear = int(round(globalLimit * proportion))
            query["retmax"] = retmax_forthisyear
            if query["retmax"] == 0:
                query["retmax"] += 1
            print(query["string"], "\t[", k, ">", query["retmax"], "]")

        if ((Fails + 1) / (Total + 1)) == 1:  # for identifying the epic fail or connection error
            thequeries = [False]

        return thequeries
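The GLOBALLIMIT comment above describes the per-year quota rule, retmax = round((k / N) * globalLimit). A small worked example with made-up yearly counts (illustration only, not part of the commit):

# Illustration only: proportional retmax allocation as described in serialFetcher.
yearly_counts = {"2015[dp] serendipity": 475,
                 "2014[dp] serendipity": 345,
                 "2013[dp] serendipity": 180}
global_limit = 100
N = sum(yearly_counts.values())          # 1000 publications reported in total

for query_string, k in yearly_counts.items():
    retmax = int(round(global_limit * (k / float(N))))
    retmax = max(retmax, 1)              # serialFetcher bumps a zero quota to 1
    print(query_string, k, "->", retmax)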