humanities / gargantext

Commit 3aae3103, authored Apr 08, 2016 by delanoe
[FACTO] Scrapers tools/methods factorization. (TODO: FACTO again).
parent 4a6171e7
Showing 8 changed files with 227 additions and 213 deletions (+227 / -213)
    gargantext/constants.py                  +6    -0
    gargantext/urls.py                       +18   -16
    gargantext/util/scrappers/pubmed.py      +0    -10
    scrapers/istex.py                        +164  -0
    scrapers/pubmed.py                       +13   -161
    scrapers/urls.py                         +16   -6
    scrapers/util.py                         +8    -18
    templates/pages/projects/project.html    +2    -2
gargantext/constants.py

@@ -208,3 +208,9 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
 # about batch processing...
 BATCH_PARSING_SIZE          = 256
 BATCH_NGRAMSEXTRACTION_SIZE = 1024
+
+
+# Scrapers config
+QUERY_SIZE_N_MAX     = 1000
+QUERY_SIZE_N_DEFAULT = 1000
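
These two constants centralize the sample-size limits that were previously hard-coded inside the pubmed scraper (see the QUERY_SIZE_N_MAX = 1000 line deleted from scrapers/pubmed.py below). A minimal sketch of how a scraper view can consume them, mirroring the N check that istex.save performs; the clamp_sample_size helper is illustrative, not part of the commit:

    from gargantext.constants import QUERY_SIZE_N_MAX, QUERY_SIZE_N_DEFAULT

    def clamp_sample_size(n=None):
        # Illustrative helper: fall back to the configured default and
        # refuse anything above the configured maximum.
        if n is None:
            return QUERY_SIZE_N_DEFAULT
        n = int(n)
        if n > QUERY_SIZE_N_MAX:
            raise ValueError("Invalid sample size N = %i (max = %i)"
                             % (n, QUERY_SIZE_N_MAX))
        return n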
gargantext/urls.py

@@ -8,9 +8,9 @@ Views are shared between these modules:
 - `graph explorer`, to explore graphs
 """

 from django.conf.urls import include, url
 from django.contrib   import admin

 import gargantext.views.api.urls
 import gargantext.views.generated.urls
@@ -18,32 +18,34 @@ import gargantext.views.pages.urls

 # Module Annotation
 ## tempo: unchanged doc-annotations --
 from annotations       import urls as annotations_urls
 from annotations.views import main as annotations_main_view

 # Module "Graph Explorer"
 #from graphExplorer import urls as graphExplorer_urls
 from graphExplorer.rest  import Graph
 from graphExplorer.views import explorer

-from scrapers import urls as scrapers_urls
+# Module Scrapers
+from scrapers import urls as scrapers_urls

 urlpatterns = [ url(r'^admin/'     , admin.site.urls)
               , url(r'^generated/' , include(gargantext.views.generated.urls))
               , url(r'^api/'       , include(gargantext.views.api.urls))
               , url(r'^'           , include(gargantext.views.pages.urls))

               # Module Annotation
               # tempo: unchanged doc-annotations routes --
               , url(r'^annotations/', include(annotations_urls))
               , url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view)

               # Module "Graph Explorer"
               , url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer)
               , url(r'^projects/(\d+)/corpora/(\d+)/graph$'   , Graph.as_view())
               # to be removed:
               , url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
-              #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
+              #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))

-              , url(r'^scrapers/', include(scrapers_urls))
+              # Scrapers module
+              , url(r'^scrapers/', include(scrapers_urls))
              ]
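
Because the scraper routes are mounted with the r'^scrapers/' prefix, the patterns defined in scrapers/urls.py (further down) resolve to paths such as /scrapers/pubmed/query and /scrapers/istex/save/<project_id>. A self-contained sketch of how that two-level matching composes, using plain re instead of Django (illustrative only, not part of the commit):

    import re

    # Sub-patterns as declared in scrapers/urls.py after this commit.
    SCRAPER_PATTERNS = [
        (r'^pubmed/query$'    , 'pubmed.query'),
        (r'^pubmed/save/(\d+)', 'pubmed.save'),
        (r'^istex/query$'     , 'istex.query'),
        (r'^istex/save/(\d+)' , 'istex.save'),
    ]

    def resolve(path, prefix=r'^scrapers/'):
        # Mimic Django's include(): strip the matched prefix,
        # then try each sub-pattern against the remainder.
        m = re.match(prefix, path)
        if not m:
            return None
        rest = path[m.end():]
        for pattern, view in SCRAPER_PATTERNS:
            sub = re.match(pattern, rest)
            if sub:
                return view, sub.groups()
        return None

    print(resolve('scrapers/istex/save/123'))  # ('istex.save', ('123',))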
gargantext/util/scrappers/pubmed.py  (deleted, 100644 → 0)

-def suggest(keywords):
-    return ['Suggestion #1', 'Suggestion #2', 'Suggestion #3',
-            'Suggestion #4', 'Suggestion #5']
-
-def count(keywords):
-    return 42
-
-def query_save(keywords):
-    return 'path/to/query.xml'
scrapers/istex.py  (new file, +164)

# from datetime import datetime
from time import sleep
import datetime
import threading
#from gargantext.settings import MEDIA_ROOT, BASE_DIR

from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.tools import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

from scrapers.util import Scraper


def query(request):
    """
    ISTEX: simply the total of hits for a query
    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar", "foo"]

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)

        query_string = query.replace(" ", "+")
        url = ("http://api.istex.fr/document/?q=" + query_string
               + "&output=id,title,abstract,pubdate,corpusName,authors,language")

        tasks = Scraper()

        try:
            thedata_path = tasks.download(url)
            thedata = open(thedata_path, "rb")
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)


def save(request, project_id):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]
    # implicit global session

    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    if request.method == "POST":
        query = "-"
        query_string = "-"
        N = 0

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url-encoded q

        if "N" in request.POST:
            N = int(request.POST["N"])  # query_size from views_opti

        if N > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
            print("ERROR (scrap: istex d/l ): ", msg)
            raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                           + "&output=*&" + "from=" + str(k[0])
                           + "&size=" + str(pagesize))

        # corpus node instantiation as a Django model
        corpus = Node(
            name=query,
            user_id=request.user.id,
            parent_id=project_id,
            typename='CORPUS',
            hyperdata={
                "action": "Scraping data",
                "language_id": None,
            }
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)

        ensure_dir(request.user)
        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()        # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the downloaded resource to the corpus
                corpus.add_resource(type=3, path=filename)
                dwnldsOK += 1

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        ###########################
        ###########################

        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)

        return HttpResponseRedirect('/projects/' + str(project_id))

    data = [query_string, query, N]
    return JsonHttpResponse(data)
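
For a concrete feel of the pagination arithmetic above: with N = 120 and pagesize = 50 (illustrative values), tasks.chunks(range(120), 50) starts chunks at 0, 50 and 100, and the last request is clipped to size 20, giving from=0&size=50, from=50&size=50, from=100&size=20. (The loop permanently shrinks pagesize on the last partial chunk, which is harmless only because that chunk is always the final one.) The download fan-out is the classic queue.Queue worker pattern; a self-contained sketch of that pattern, with a stand-in job instead of Scraper.download (illustrative, not the commit's code):

    import threading
    from queue import Queue

    q = Queue()
    results = []

    def worker():
        # drain the queue forever; daemon threads die with the main thread
        while True:
            url = q.get()
            results.append(len(url))  # stand-in for downloading `url`
            q.task_done()

    for _ in range(8):                # same fan-out as the view above
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    for start, size in [(0, 50), (50, 50), (100, 20)]:
        q.put("http://api.istex.fr/document/?q=x&from=%i&size=%i" % (start, size))

    q.join()                          # block until every URL is processed
    print(results)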
scrapers/pubmed.py

+# ****************************
+# *****  Medline Scraper *****
+# ****************************
+
-from scrapers.MedlineFetcher import MedlineFetcher

 # MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
 # between 9 pm and 5 am Eastern Time weekdays

 # from datetime import datetime
@@ -13,7 +17,7 @@ import threading
 from django.shortcuts import redirect
 from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

-from gargantext.constants import RESOURCETYPES
+from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX
 from gargantext.models.nodes import Node
 from gargantext.util.db import session
 from gargantext.util.http import JsonHttpResponse
@@ -21,22 +25,11 @@ from gargantext.util.tools import ensure_dir
 from gargantext.util.scheduling import scheduled
 from gargantext.util.toolchain import parse_extract_indexhyperdata

+from scrapers.util import Scraper

-# to read the [scrapers] section of gargantext.ini
-#from configparser import ConfigParser
-# --------------------------------------------------------------------
-# importing constants from config file
-#CONF = ConfigParser()
-#with open(path.join(BASE_DIR, 'gargantext.ini')) as inifile:
-#    CONF.read_file(inifile)
-QUERY_SIZE_N_MAX = 1000  # int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
-# QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
-# --------------------------------------------------------------------

-def getGlobalStats( request ):
+def query( request ):
     """
     Pubmed year by year results
@@ -47,7 +40,7 @@ def getGlobalStats( request ):
     # 'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
     #  ... ]
-        (reused as thequeries in doTheQuery)
+        (reused as thequeries in query_save)
     """
     print(request.method)
     alist = []
@@ -63,7 +56,7 @@ def getGlobalStats( request ):
         print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
         print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
-        instancia = MedlineFetcher()
+        instancia = Scraper()
         # serialFetcher (n_last_years, query, query_size)
         alist = instancia.serialFetcher(5, query, N)
@@ -72,7 +65,7 @@ def getGlobalStats( request ):
     return JsonHttpResponse(data)

-def doTheQuery( request , project_id ) :
+def save( request , project_id ) :
     # implicit global session
     # do we have a valid project id?
     try:
@@ -103,7 +96,7 @@ def doTheQuery( request , project_id ) :
     # here we just realize queries already prepared by getGlobalStats
     # ===> no need to repeat N parameter like in testISTEX <===
-    instancia = MedlineFetcher()
+    instancia = Scraper()
     thequeries = json.loads(queries)
     # fyi the sum of our prepared yearly proportional quotas
@@ -138,7 +131,7 @@ def doTheQuery( request , project_id ) :
     ensure_dir(request.user)
-    tasks = MedlineFetcher()
+    tasks = Scraper()
     for i in range(8):
         t = threading.Thread(target=tasks.worker2)  # thing to do
@@ -173,144 +166,3 @@ def doTheQuery( request , project_id ) :
     return JsonHttpResponse(data)
-
-
-def getGlobalStatsISTEXT( request ):
-    """
-    ISTEX: simply the total of hits for a query
-    (not reused in testISTEX)
-    """
-    print(request.method)
-    alist = ["bar", "foo"]
-
-    if request.method == "POST":
-        query = request.POST["query"]
-        N = int(request.POST["N"])
-        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
-        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
-        query_string = query.replace(" ", "+")
-        url = ("http://api.istex.fr/document/?q=" + query_string
-               + "&output=id,title,abstract,pubdate,corpusName,authors,language")
-
-        tasks = MedlineFetcher()
-        try:
-            thedata_path = tasks.download(url)
-            thedata = open(thedata_path, "rb")
-            alist = thedata.read().decode('utf-8')
-        except Exception as error:
-            alist = [str(error)]
-
-    data = alist
-    return JsonHttpResponse(data)
-
-
-def testISTEX( request , project_id ):
-    print("testISTEX:")
-    print(request.method)
-    alist = ["bar", "foo"]
-    # implicit global session
-
-    # do we have a valid project id?
-    try:
-        project_id = int(project_id)
-    except ValueError:
-        raise Http404()
-
-    # do we have a valid project?
-    project = (session
-        .query(Node)
-        .filter(Node.id == project_id)
-        .filter(Node.typename == 'PROJECT')
-    ).first()
-    if project is None:
-        raise Http404()
-
-    # do we have a valid user?
-    user = request.user
-    if not user.is_authenticated():
-        return redirect('/auth/?next=%s' % request.path)
-    if project.user_id != user.id:
-        return HttpResponseForbidden()
-
-    if request.method == "POST":
-        query = "-"
-        query_string = "-"
-        N = 0
-
-        if "query" in request.POST:
-            query = request.POST["query"]
-            query_string = query.replace(" ", "+")  # url-encoded q
-
-        if "N" in request.POST:
-            N = int(request.POST["N"])  # query_size from views_opti
-
-        if N > QUERY_SIZE_N_MAX:
-            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
-            print("ERROR (scrap: istex d/l ): ", msg)
-            raise ValueError(msg)
-
-        print("Scrapping Istex: '%s' (%i)" % (query_string, N))
-
-        urlreqs = []
-        pagesize = 50
-        tasks = MedlineFetcher()
-        chunks = list(tasks.chunks(range(N), pagesize))
-        for k in chunks:
-            if (k[0] + pagesize) > N:
-                pagesize = N - k[0]
-            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
-                           + "&output=*&" + "from=" + str(k[0])
-                           + "&size=" + str(pagesize))
-
-        # corpus node instantiation as a Django model
-        corpus = Node(
-            name=query,
-            user_id=request.user.id,
-            parent_id=project_id,
-            typename='CORPUS',
-            hyperdata={
-                "action": "Scraping data",
-                "language_id": None,
-            }
-        )
-        session.add(corpus)
-        session.commit()
-        corpus_id = corpus.id
-        print("NEW CORPUS", corpus_id)
-
-        ensure_dir(request.user)
-        tasks = MedlineFetcher()
-
-        for i in range(8):
-            t = threading.Thread(target=tasks.worker2)  # thing to do
-            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits
-            t.start()
-
-        for url in urlreqs:
-            tasks.q.put(url)  # put a task in the queue
-        tasks.q.join()        # wait until everything is finished
-
-        dwnldsOK = 0
-        for filename in tasks.firstResults:
-            if filename != False:
-                # add the downloaded resource to the corpus
-                corpus.add_resource(type=3, path=filename)
-                dwnldsOK += 1
-
-        if dwnldsOK == 0:
-            return JsonHttpResponse(["fail"])
-
-        try:
-            scheduled(parse_extract_indexhyperdata(corpus_id,))
-        except Exception as error:
-            print('WORKFLOW ERROR')
-            print(error)
-        sleep(1)
-
-        return HttpResponseRedirect('/projects/' + str(project_id))
-
-    data = [query_string, query, N]
-    return JsonHttpResponse(data)
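
Net effect in this file: the Medline views are renamed to match the new routes, the shared fetcher class is used through its new name, and the two ISTEX views move out to the new scrapers/istex.py. As a summary mapping (editorial summary, not code from the commit):

    # old name                 -> new home after this commit
    RENAMES = {
        "getGlobalStats":        "scrapers.pubmed.query",
        "doTheQuery":            "scrapers.pubmed.save",
        "getGlobalStatsISTEXT":  "scrapers.istex.query",
        "testISTEX":             "scrapers.istex.save",
        "MedlineFetcher":        "scrapers.util.Scraper",
    }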
scrapers/urls.py

 from django.conf.urls import url

 import scrapers.pubmed as pubmed
-#import scrapers.istex as istex
+import scrapers.istex  as istex
 #import scrapers.cern as cern
 #import scrapers.hal as hal
@@ -11,11 +11,21 @@ import scrapers.pubmed as pubmed

 # Available databases : Pubmed, IsTex, (next: CERN)
 # /!\ urls patterns here are *without* the trailing slash

-urlpatterns = [ url(r'^pubmed/query$'      , pubmed.getGlobalStats)
-              , url(r'^pubmed/search/(\d+)', pubmed.doTheQuery)
-              , url(r'^istex/query$'       , pubmed.getGlobalStatsISTEXT)
-              , url(r'^istex/search/(\d+)' , pubmed.testISTEX)
-              #, url(r'^scraping$' , scraping.Target.as_view() )
+urlpatterns = [ url(r'^pubmed/query$'    , pubmed.query)
+              , url(r'^pubmed/save/(\d+)', pubmed.save)
+              , url(r'^istex/query$'     , istex.query)
+              , url(r'^istex/save/(\d+)' , istex.save)
+              # TODO REST API for the scrapers
+              #, url(r'^rest$' , scraping.Target.as_view() )
               ,
              ]
+
+#def count(keywords):
+#    return 42
+#
+#def query_save(keywords):
+#    return 'path/to/query.xml'
+#
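
With these routes in place, the client-side flow is two POSTs: first to /scrapers/<db>/query to preview hit counts, then to /scrapers/<db>/save/<project_id> to create the corpus and launch the downloads (this is exactly what the project.html change below switches to). A sketch with the requests library, using the ISTEX pair since both of its views read "query" and "N" from request.POST; the host, project id and values are hypothetical, and save/ additionally requires an authenticated session:

    import requests

    base = "http://localhost:8000/scrapers"

    # 1) how many hits does the query get?
    r = requests.post(base + "/istex/query",
                      data={"query": "climate change", "N": 100})
    print(r.json())

    # 2) build a corpus under project 42 from the first 100 hits
    #    (would redirect to /auth/ without a logged-in session cookie)
    requests.post(base + "/istex/save/42",
                  data={"query": "climate change", "N": 100})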
scrapers/MedlineFetcher.py → scrapers/util.py

 # ****************************
 # *****  Medline Fetcher *****
 # ****************************

 # MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
 # between 9 pm and 5 am Eastern Time weekdays

+from gargantext.util.files import download
 import sys
-if sys.version_info >= (3, 0):
-    from urllib.request import urlopen
-else:
-    from urllib import urlopen
 import os
 import time
 # import libxml2
-from lxml import etree
 import datetime
 from django.core.files import File
 import codecs
 import threading
 from queue import Queue
 # import time

-class MedlineFetcher:
+from lxml import etree
+
+if sys.version_info >= (3, 0):
+    from urllib.request import urlopen
+else:
+    from urllib import urlopen
+
+
+class Scraper:

     def __init__(self):
         self.queue_size = 8
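
Only the head of the renamed file is shown; the fetching logic is unchanged from MedlineFetcher. The surface that the call sites in scrapers/pubmed.py and scrapers/istex.py rely on can be read off the diffs above; a hedged skeleton of that surface (method bodies here are illustrative stand-ins, not the real implementation):

    import threading
    from queue import Queue

    class Scraper:
        # Skeleton inferred from call sites: q, firstResults, chunks(),
        # download(), worker2(), serialFetcher() -- bodies are stand-ins.

        def __init__(self):
            self.queue_size = 8       # as in the diff above
            self.q = Queue()          # URL work queue (q.put / q.join)
            self.firstResults = []    # file paths, or False on failure

        def chunks(self, iterable, size):
            # successive size-sized slices, used for ISTEX pagination
            items = list(iterable)
            for i in range(0, len(items), size):
                yield items[i:i + size]

        def download(self, url):
            raise NotImplementedError  # real method saves `url` to a file

        def worker2(self):
            # consume URLs from the queue, recording results
            while True:
                url = self.q.get()
                try:
                    self.firstResults.append(self.download(url))
                except Exception:
                    self.firstResults.append(False)
                self.q.task_done()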
templates/pages/projects/project.html

@@ -260,7 +260,7 @@
             $.ajax({
                 // contentType: "application/json",
-                url: window.location.origin + "/scrapers/pubmed/search/" + projectid,
+                url: window.location.origin + "/scrapers/pubmed/save/" + projectid,
                 data: pubmedifiedQuery,
                 type: 'POST',
                 beforeSend: function(xhr) {
@@ -504,7 +504,7 @@
             $.ajax({
                 // contentType: "application/json",
-                url: window.location.origin + "/scrapers/istex/search/" + projectid,
+                url: window.location.origin + "/scrapers/istex/save/" + projectid,
                 data: postQuery,
                 type: 'POST',
                 beforeSend: function(xhr) {