humanities / gargantext / Commits

Commit 9eead9fa
authored Apr 07, 2016 by delanoe

[FEAT] Scrapper pubmed: ok

parent 88036658

Showing 9 changed files with 269 additions and 262 deletions (+269 -262)
constants.py         gargantext/constants.py                  +10   -10
files.py             gargantext/util/files.py                 +2    -2
http.py              gargantext/util/http.py                  +1    -1
projects.py          gargantext/views/pages/projects.py       +1    -1
MedlineFetcher.py    scrapers/MedlineFetcher.py               +80   -79
istex.py             scrapers/istex.py                        +142  -0
pubmed.py            scrapers/pubmed.py                       +27   -167
urls.py              scrapers/urls.py                         +5    -1
project.html         templates/pages/projects/project.html    +1    -1
gargantext/constants.py

...
@@ -47,59 +47,59 @@ def convert_to_date(date):
    return dateutil.parser.parse(date)

INDEXED_HYPERDATA = {
    # TODO use properties during toolchain.hyperdata_indexing
    # (type, convert_to_db, convert_from_db)
    'count':
        { 'id'              : 1
        , 'type'            : int
        , 'convert_to_db'   : int
        , 'convert_from_db' : int
        },
    'publication_date':
        { 'id'              : 2
        , 'type'            : datetime.datetime
        , 'convert_to_db'   : convert_to_date
        , 'convert_from_db' : datetime.datetime.fromtimestamp
        },
    'title':
        { 'id'              : 3
        , 'type'            : str
        , 'convert_to_db'   : str
        , 'convert_from_db' : str
        },
    'authors':
        { 'id'              : 4
        , 'type'            : str
        , 'convert_to_db'   : str
        , 'convert_from_db' : str
        },
    'journal':
        { 'id'              : 5
        , 'type'            : str
        , 'convert_to_db'   : str
        , 'convert_from_db' : str
        },
    'abstract':
        { 'id'              : 6
        , 'type'            : str
        , 'convert_to_db'   : str
        , 'convert_from_db' : str
        },
    'text':
        { 'id'              : 7
        , 'type'            : str
        , 'convert_to_db'   : str
        , 'convert_from_db' : str
        },
    'page':
        { 'id'              : 8
        , 'type'            : int
...
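Each INDEXED_HYPERDATA entry pairs a Python type with a convert_to_db and a convert_from_db function. As a rough, standalone sketch of the round trip for 'publication_date' (an illustration only; it assumes the database layer keeps a POSIX timestamp, which is what the fromtimestamp converter implies, and it requires python-dateutil):

    import datetime
    import dateutil.parser

    def convert_to_date(date):
        return dateutil.parser.parse(date)

    raw = "2016-04-07"                                  # value as parsed from a document
    as_datetime = convert_to_date(raw)                  # convert_to_db direction
    stored = as_datetime.timestamp()                    # hypothetical stored representation
    restored = datetime.datetime.fromtimestamp(stored)  # convert_from_db direction
    assert restored.date() == as_datetime.date()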
gargantext/util/files.py

from gargantext.constants   import *
from gargantext.util.digest import str_digest
from gargantext.util        import http


def save(contents, name='', basedir=''):
...
gargantext/util/http.py

...
@@ -29,7 +29,7 @@ import urllib.request

def get(url):
    response = urllib.request.urlopen(url)
-   html = response.read()
+   return response.read()

# retrieve GET parameters from a request
...
gargantext/views/pages/projects.py

...
@@ -94,7 +94,7 @@ def project(request, project_id):
        )
        session.add(corpus)
        session.commit()

        # parse_extract: fileparsing -> ngram extraction -> lists
        scheduled(parse_extract_indexhyperdata)(corpus.id)
...
scrapers/MedlineFetcher.py

...
@@ -2,10 +2,15 @@
# ****************************
# ***** Medline Fetcher *****
# ****************************
-# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays
+# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
+#                            between 9 pm and 5 am Eastern Time weekdays

from gargantext.util.files import download

import sys
if sys.version_info >= (3, 0):
    from urllib.request import urlopen
else:
    from urllib import urlopen
import os
import time
# import libxml2
...
@@ -21,48 +26,60 @@ from queue import Queue

class MedlineFetcher:

    def __init__(self):
        self.queue_size   = 8
        self.q            = Queue()
        self.firstResults = []
        self.lock         = threading.Lock()  # lock to serialize console output

        self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
        self.pubMedDB        = 'Pubmed'
        self.reportType      = 'medline'

    # Return the globalResults!:
    # - count    =
    # - queryKey =
    # - webEnv   =
    def medlineEsearch(self, query):
        # print ("MedlineFetcher::medlineEsearch :")
        "Get number of results for query 'query' in variable 'count'"
        "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
        # print(query)

        origQuery = query
        query     = query.replace(' ', '%20')

        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
                  % (self.pubMedEutilsURL, self.pubMedDB, query)

        try:
            eSearchResult = urlopen(eSearch)
            data = eSearchResult.read()
            root = etree.XML(data)

            findcount = etree.XPath("/eSearchResult/Count/text()")
            count     = findcount(root)[0]

            findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
            queryKey     = findquerykey(root)[0]

            findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
            webEnv     = findwebenv(root)[0]

-       except:
+       except Exception as Error:
+           print(Error)
            count     = 0
            queryKey  = False
            webEnv    = False
            origQuery = False

-       values = { "query": origQuery, "count": int(str(count)), "queryKey": queryKey, "webEnv": webEnv }
+       values = { "query"    : origQuery
+                , "count"    : int(count)
+                , "queryKey" : queryKey
+                , "webEnv"   : webEnv
+                }

        return values
...
@@ -70,52 +87,32 @@ class MedlineFetcher:
    # Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
    # maximum of 100,000 records
    def medlineEfetchRAW(self, fullquery):

        query    = fullquery["string"]
        retmax   = fullquery["retmax"]
        count    = fullquery["count"]
        queryKey = fullquery["queryKey"]
        webEnv   = fullquery["webEnv"]

        "Fetch medline result for query 'query', saving results to file every 'retmax' articles"

        queryNoSpace = query.replace(' ', '')  # No space in directory and file names, avoids stupid errors

        # print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')

        retstart = 0
        eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' \
                 % (self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
        return eFetch

    def ensure_dir(self, f):
        d = os.path.dirname(f)
        if not os.path.exists(d):
            os.makedirs(d)

    # generic!
    def downloadFile(self, item):
        url      = item[0]
        filename = item[1]
        # print("\tin test_downloadFile:")
        # print(url,filename)
        data = urlopen(url)
        f = codecs.open(filename, "w", encoding='utf-8')
        myfile = File(f)
        myfile.write(data.read().decode('utf-8'))
        myfile.close()
        f.close()

    def download(self, url):
        print(url)
        filename = download(url)
        with self.lock:
            print(threading.current_thread().name, filename + " OK")
        return filename

    # generic!
    def test_downloadFile(self, item):
        url      = item[0]
        filename = item[1]
        # print("\tin downloadFile:")
        data = urlopen(url)
        return data

    # generic!
    def do_work(self, item):
...
@@ -132,23 +129,24 @@ class MedlineFetcher:
            self.firstResults.append(self.do_work(item))
            self.q.task_done()

    def worker2(self):
        while True:
            item = self.q.get()
            results = []
            try:
-               result = self.downloadFile(item)
-           except:
-               result = False
+               result = self.download(item)
+           except Exception as error:
+               print(error)
+               result = False
            self.firstResults.append(result)
            self.q.task_done()

    def chunks(self, l, n):
        print("chunks:")
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # GLOBALLIMIT:
    # I will retrieve this exact amount of publications.
-   # The publications per year i'll retrieve per year will be = (k/N)*GlobalLimit <- i'll use this as RETMAX
+   # The publications per year i'll retrieve per year will be :
+   #       (k/N)*GlobalLimit
+   #             \_ this is used as RETMAX
    # - k : Number of publications of x year (according to pubmed)
    # - N : Sum of every k belonging to {X} (total number of pubs according to pubmed)
    # - GlobalLimit : Number of publications i want.
...
@@ -172,7 +170,7 @@ class MedlineFetcher:
            # print ('---------\n')
            pubmedquery = str(year) + '[dp] ' + query
            self.q.put(pubmedquery)  # put task in the queue
        self.q.join()
        print('time:', time.perf_counter() - start)
...
@@ -183,15 +181,16 @@ class MedlineFetcher:
            Total += 1
            if globalresults["queryKey"] == False:
                Fails += 1
            if globalresults["count"] > 0:
                N += globalresults["count"]
                queryhyperdata = { "string"   : globalresults["query"]
                                 , "count"    : globalresults["count"]
                                 , "queryKey" : globalresults["queryKey"]
                                 , "webEnv"   : globalresults["webEnv"]
                                 , "retmax"   : 0
                                 }
                thequeries.append(queryhyperdata)

        print("Total Number:", N, "publications")
...
@@ -199,14 +198,16 @@ class MedlineFetcher:
        print("---------------------------------------\n")

        for i, query in enumerate(thequeries):
            k = query["count"]
            proportion = k / float(N)
            retmax_forthisyear = int(round(globalLimit * proportion))
            query["retmax"] = retmax_forthisyear
            if query["retmax"] == 0:
                query["retmax"] += 1
            print(query["string"], "\t[", k, ">", query["retmax"], "]")

        if ((Fails+1)/(Total+1)) == 1:  # for identifying the epic fail or connection error
            thequeries = [False]

        return thequeries
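The GLOBALLIMIT comment above describes how retmax is apportioned: each year receives (k/N) * GlobalLimit records, rounded and bumped to at least 1. A minimal standalone sketch of that allocation, using made-up per-year counts purely for illustration:

    # Hypothetical year -> count data (the k values); N is their sum.
    counts = {2013: 120, 2014: 300, 2015: 80}
    N = sum(counts.values())
    global_limit = 100   # total number of publications wanted

    retmax_per_year = {}
    for year, k in counts.items():
        retmax = int(round(global_limit * (k / float(N))))
        if retmax == 0:
            retmax = 1   # always fetch at least one record per year
        retmax_per_year[year] = retmax

    print(retmax_per_year)   # -> {2013: 24, 2014: 60, 2015: 16}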
scrapers/istex.py  (new file, 0 → 100644)

def getGlobalStatsISTEXT(request ):
    """
    ISTEX simply the total of hits for a query
    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar", "foo"]

    if request.method == "POST":
        query = request.POST["query"]
        N     = int(request.POST["N"])
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)

        query_string = query.replace(" ", "+")
        url = "http://api.istex.fr/document/?q=" + query_string + "&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = MedlineFetcher()
        try:
            thedata_path = tasks.download(url)
            thedata = open(thedata_path, "rb")
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)
def testISTEX(request, project_id):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()

    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    if request.method == "POST":
        query = "-"
        query_string = "-"
        N = 0

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q

        if "N" in request.POST:
            N = int(request.POST["N"])  # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ", msg)
                raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = MedlineFetcher()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                           + "&output=*&" + "from=" + str(k[0]) + "&size=" + str(pagesize))

        resourcetype = RESOURCETYPES["name"]["ISTex"]

        # corpus node instanciation as a Django model
        corpus = Node(
            name        = query,
            user_id     = request.user.id,
            parent_id   = project_id,
            typename    = 'CORPUS',
            language_id = None,
            hyperdata   = {'Processing': "Parsing documents",}
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)

        ensure_dir(request.user)
        tasks = MedlineFetcher()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
            tasks.q.put([url, filename])  # put a task in th queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(corpus,
                    user_id = request.user.id,
                    type_id = resourcetype.id,
                    file    = filename,
                )
                dwnldsOK += 1

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/project/' + str(project_id))

    data = [query_string, query, N]
    return JsonHttpResponse(data)
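testISTEX pages through the ISTEX API with from/size parameters in chunks of 50, shrinking the final page so it does not overshoot N. A small standalone sketch of that URL construction, with a hypothetical query and N chosen only for illustration:

    # Hypothetical inputs, for illustration only.
    query_string = "climate+change"
    N = 120
    pagesize = 50

    def chunks(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    urlreqs = []
    for k in chunks(range(N), pagesize):
        if (k[0] + pagesize) > N:
            pagesize = N - k[0]          # last page only covers the remainder
        urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                       + "&output=*&from=" + str(k[0]) + "&size=" + str(pagesize))

    print(urlreqs)   # -> pages from=0&size=50, from=50&size=50, from=100&size=20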
scrapers/pubmed.py

...
@@ -8,19 +8,18 @@ import json
import datetime
from os import path
import threading
-from gargantext.settings import MEDIA_ROOT, BASE_DIR
+# from gargantext.settings import MEDIA_ROOT, BASE_DIR

from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants       import RESOURCETYPES
from gargantext.models.nodes    import Node
from gargantext.util.db         import session
from gargantext.util.http       import JsonHttpResponse
from gargantext.util.tools      import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain  import parse_extract_indexhyperdata
...
@@ -37,7 +36,7 @@ QUERY_SIZE_N_MAX = 1000 # int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
# QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
# --------------------------------------------------------------------

def getGlobalStats(request):
    """
    Pubmed year by year results
...
@@ -73,37 +72,8 @@ def getGlobalStats(request ):
    return JsonHttpResponse(data)

-def getGlobalStatsISTEXT(request ):
-    """
-    ISTEX simply the total of hits for a query
-    (not reused in testISTEX)
-    """
-    print(request.method)
-    alist = ["bar", "foo"]
-    if request.method == "POST":
-        query = request.POST["query"]
-        N     = int(request.POST["N"])
-        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
-        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
-        query_string = query.replace(" ", "+")
-        url = "http://api.istex.fr/document/?q=" + query_string + "&output=id,title,abstract,pubdate,corpusName,authors,language"
-        tasks = MedlineFetcher()
-        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-        try:
-            thedata = tasks.test_downloadFile([url, filename])
-            alist = thedata.read().decode('utf-8')
-        except Exception as error:
-            alist = [str(error)]
-    data = alist
-    return JsonHttpResponse(data)

def doTheQuery(request, project_id):
    # implicit global session
    # do we have a valid project id?
    try:
...
@@ -111,11 +81,10 @@ def doTheQuery(request , project_id):
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()

    if project is None:
        raise Http404()
...
@@ -130,7 +99,7 @@ def doTheQuery(request , project_id):
    if request.method == "POST":
        queries = request.POST["query"]
        name    = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        # ===> no need to repeat N parameter like in testISTEX <===
...
@@ -147,7 +116,6 @@ def doTheQuery(request , project_id):
            urlreqs.append(instancia.medlineEfetchRAW(yearquery))
        alist = ["tudo fixe", "tudo bem"]

        resourcetype = RESOURCETYPES['name']['Pubmed (xml format)']

        # corpus node instanciation as a Django model
        corpus = Node(
...
@@ -155,8 +123,9 @@ def doTheQuery(request , project_id):
            user_id     = request.user.id,
            parent_id   = project_id,
            typename    = 'CORPUS',
            language_id = None,
-           hyperdata   = {'Processing': "Parsing documents",}
+           hyperdata   = {"action": "Scraping data", "language_id": None}
        )
        session.add(corpus)
        session.commit()
...
@@ -177,22 +146,21 @@ def doTheQuery(request , project_id):
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
-           filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-           tasks.q.put([url, filename])  # put a task in th queue
+           tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
+           print(filename)
            if filename != False:
                # add the uploaded resource to the corpus
-               add_resource(corpus,
-                            user_id = request.user.id,
-                            type_id = resourcetype.id,
-                            file    = filename,
-                            )
+               corpus.add_resource(type = 3, path = filename)
                dwnldsOK += 1

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
...
@@ -200,118 +168,10 @@ def doTheQuery(request , project_id):
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
-       return HttpResponseRedirect('/project/' + str(project_id))
+       return HttpResponseRedirect('/projects/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)

-def testISTEX(request, project_id):
-    print("testISTEX:")
-    ...
-    # (body removed here; the same testISTEX view is added in scrapers/istex.py above)
scrapers/urls.py

...
@@ -10,7 +10,11 @@ import scrapers.pubmed as pubmed
# Available databases : Pubmed, IsTex, (next: CERN)

# /!\ urls patterns here are *without* the trailing slash

-urlpatterns = [ url(r'^pubmed/query$' , pubmed.getGlobalStats )
+urlpatterns = [ url(r'^pubmed/query$'        , pubmed.getGlobalStats )
+              , url(r'^pubmed/search/(\d+)'  , pubmed.doTheQuery )
+              # , url(r'^istex/query$'       , pubmed.getGlobalStatsISTEXT )
+              # , url(r'^istex/search/(\d+)' , pubmed.testISTEX )
+              #, url(r'^scraping$'           , scraping.Target.as_view() )
               ,
              ]
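With the new route in place, the front-end posts the prepared query to /scrapers/pubmed/search/<project_id> (see the project.html change below). A rough sketch of the same call made from Python with the requests library; the host, project id, and payload values are placeholders, since the real payload is whatever getGlobalStats prepared (not shown in this diff), and authentication/CSRF handling is omitted:

    import requests

    BASE_URL = "http://localhost:8000"   # hypothetical dev server
    project_id = 42                      # hypothetical project

    payload = {
        "query": "<queries prepared by getGlobalStats>",  # read as request.POST["query"]
        "string": "my pubmed corpus",                     # read as request.POST["string"] (corpus name)
    }

    response = requests.post("%s/scrapers/pubmed/search/%d" % (BASE_URL, project_id),
                             data=payload)
    print(response.status_code)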
templates/pages/projects/project.html

...
@@ -260,7 +260,7 @@
            $.ajax({
                // contentType: "application/json",
-               url: window.location.origin + "/tests/project/" + projectid + "/pubmedquery/go",
+               url: window.location.origin + "/scrapers/pubmed/search/" + projectid,
                data: pubmedifiedQuery,
                type: 'POST',
                beforeSend: function(xhr) {
...