Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
f2f0ce75
Commit
f2f0ce75
authored
Apr 07, 2016
by
delanoe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] Istex scraper ok, need parser now.
parent
9eead9fa
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
163 additions
and
157 deletions
+163
-157
constants.py
gargantext/constants.py
+4
-4
ISTex.py
gargantext/util/parsers/ISTex.py
+4
-4
__init__.py
gargantext/util/parsers/__init__.py
+1
-1
MedlineFetcher.py
scrapers/MedlineFetcher.py
+7
-0
istex.py
scrapers/istex.py
+0
-141
pubmed.py
scrapers/pubmed.py
+140
-1
urls.py
scrapers/urls.py
+4
-3
project.html
templates/pages/projects/project.html
+3
-3
No files found.
gargantext/constants.py
View file @
f2f0ce75
...
...
@@ -160,10 +160,10 @@ RESOURCETYPES = [
'parser'
:
CSVParser
,
'default_language'
:
'en'
,
},
#
{ 'name': 'ISTex',
# #
'parser': ISTexParser,
#
'default_language': 'en',
#
},
{
'name'
:
'ISTex'
,
'parser'
:
ISTexParser
,
'default_language'
:
'en'
,
},
]
# linguistic extraction parameters ---------------------------------------------
...
...
gargantext/util/parsers/ISTex.py
View file @
f2f0ce75
...
...
@@ -4,7 +4,7 @@ from datetime import datetime
from
io
import
BytesIO
import
json
class
ISTex
(
Parser
):
class
ISTex
Parser
(
Parser
):
def
parse
(
self
,
thefile
):
json_data
=
open
(
thefile
,
"r"
)
...
...
@@ -84,16 +84,16 @@ class ISTex(Parser):
# ---------------------------------------------------
if
len
(
hyperdata
[
"language_iso3"
])
>
0
and
hyperdata
[
"language_iso3"
][
0
]
!=
"unknown"
:
hyperdata
[
"language_iso3"
]
=
hyperdata
[
"language_iso3"
][
0
]
# default value = eng
# possible even better: langid.classify(abstract)
else
:
# NB 97% des docs istex sont eng donc par défaut
# ----------------------------------------------
hyperdata
[
"language_iso3"
]
=
"eng"
# (cf. api.istex.fr/document/?q=*&facet=language
# (cf. api.istex.fr/document/?q=*&facet=language
# et tests langid sur les language=["unknown"])
if
"publication_date"
in
hyperdata
:
RealDate
=
hyperdata
[
"publication_date"
]
...
...
gargantext/util/parsers/__init__.py
View file @
f2f0ce75
...
...
@@ -7,5 +7,5 @@ from .Pubmed import PubmedParser
# # 2015-12-08: parser 2 en 1
from
.Europress
import
EuropressParser
#
from .ISTex import ISTexParser
from
.ISTex
import
ISTexParser
from
.CSV
import
CSVParser
scrapers/MedlineFetcher.py
View file @
f2f0ce75
...
...
@@ -142,6 +142,13 @@ class MedlineFetcher:
self
.
firstResults
.
append
(
result
)
self
.
q
.
task_done
()
def chunks(self, l, n):
    """
    Generate consecutive slices of ``l`` holding at most ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    ``self`` is not used.
    """
    print("chunks:")
    offsets = range(0, len(l), n)
    for start in offsets:
        stop = start + n
        yield l[start:stop]
# GLOBALLIMIT:
# I will retrieve this exact amount of publications.
# The number of publications I'll retrieve per year will be:
...
...
scrapers/istex.py
View file @
f2f0ce75
def getGlobalStatsISTEXT(request):
    """
    Forward a search to the ISTEX API and return the raw answer.

    For a POST request the ``query`` and ``N`` form fields are read, the
    query is sent to ``api.istex.fr`` through ``MedlineFetcher.download``
    and the content of the downloaded file is returned, decoded as UTF-8,
    inside a ``JsonHttpResponse``.  If the download or the read fails, the
    response holds a one-item list with the error message instead.  For
    any other method the placeholder list ``["bar", "foo"]`` is returned.

    ``N`` is only parsed and logged here; ``int()`` raises ``ValueError``
    when it is not a number.

    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar", "foo"]

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)

        # NOTE(review): only spaces are escaped; other reserved characters
        # in the query ('&', '#', ...) reach the URL unchanged -- confirm
        # whether full URL-encoding is wanted here.
        query_string = query.replace(" ", "+")
        url = "http://api.istex.fr/document/?q=" + query_string + "&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = MedlineFetcher()

        try:
            thedata_path = tasks.download(url)
            # the context manager closes the file even if read/decode fails
            with open(thedata_path, "rb") as thedata:
                alist = thedata.read().decode('utf-8')
        except Exception as error:
            # best effort: the error text is sent back to the client
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)
def testISTEX(request, project_id):
    """
    Download ISTEX results for a query into a new corpus of a project.

    Checks, in order: ``project_id`` is an integer and designates a node
    of type 'PROJECT' (otherwise ``Http404``), the user is authenticated
    (otherwise redirect to the login page) and owns the project
    (otherwise ``HttpResponseForbidden``).

    On POST, the ``query`` and ``N`` form fields are read, one ISTEX API
    URL is built per page of at most 50 results, a 'CORPUS' node is
    created under the project, the URLs are fetched by 8 worker threads
    and every result that is not ``False`` is added to the corpus as a
    resource.  The parsing workflow is then started and the client is
    redirected to the project page.  If no download succeeded,
    ``["fail"]`` is returned as JSON.

    For any other method, ``[query_string, query, N]`` is returned as
    JSON with the default values ``"-"``, ``"-"`` and ``0``.

    Raises ``ValueError`` when ``N`` is not an integer or exceeds
    ``QUERY_SIZE_N_MAX``.
    """
    print("testISTEX:")
    print(request.method)

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (
        session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    # Defaults are set before the method check so that the fallback
    # response at the end of the function can be built for non-POST
    # requests too (these names used to be unbound there).
    query = "-"
    query_string = "-"
    N = 0

    if request.method == "POST":
        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")   # url encoded q

        if "N" in request.POST:
            N = int(request.POST["N"])     # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ", msg)
                raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        # One request per page; the size of each page is the length of its
        # chunk, so the last page is shorter when N is not a multiple of
        # the page size.
        pagesize = 50
        tasks = MedlineFetcher()
        urlreqs = [
            "http://api.istex.fr/document/?q=" + query_string + "&output=*&"
            + "from=" + str(chunk[0]) + "&size=" + str(len(chunk))
            for chunk in tasks.chunks(range(N), pagesize)
        ]

        # NOTE(review): RESOURCETYPES is written as a list literal in
        # gargantext/constants.py; indexing a list with a string raises
        # TypeError -- confirm how the ISTex entry should be looked up.
        resourcetype = RESOURCETYPES["name"]["ISTex"]

        # corpus node instantiation as a Django model
        corpus = Node(
            name=query,
            user_id=request.user.id,
            parent_id=project_id,
            typename='CORPUS',
            language_id=None,
            hyperdata={'Processing': "Parsing documents", }
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id

        print("NEW CORPUS", corpus_id)
        ensure_dir(request.user)

        for _ in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
            tasks.q.put([url, filename])  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(
                    corpus,
                    user_id=request.user.id,
                    type_id=resourcetype.id,
                    file=filename,
                )
                dwnldsOK += 1

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            # NOTE(review): as written, parse_extract_indexhyperdata is
            # called right here and its return value is what scheduled()
            # receives -- confirm whether
            # scheduled(parse_extract_indexhyperdata)(corpus_id) was meant.
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/project/' + str(project_id))

    data = [query_string, query, N]
    return JsonHttpResponse(data)
scrapers/pubmed.py
View file @
f2f0ce75
...
...
@@ -72,7 +72,6 @@ def getGlobalStats( request ):
return
JsonHttpResponse
(
data
)
def
doTheQuery
(
request
,
project_id
)
:
# implicit global session
# do we have a valid project id?
...
...
@@ -174,4 +173,144 @@ def doTheQuery( request , project_id ) :
return
JsonHttpResponse
(
data
)
def getGlobalStatsISTEXT(request):
    """
    Forward a search to the ISTEX API and return the raw answer.

    For a POST request the ``query`` and ``N`` form fields are read, the
    query is sent to ``api.istex.fr`` through ``MedlineFetcher.download``
    and the content of the downloaded file is returned, decoded as UTF-8,
    inside a ``JsonHttpResponse``.  If the download or the read fails, the
    response holds a one-item list with the error message instead.  For
    any other method the placeholder list ``["bar", "foo"]`` is returned.

    ``N`` is only parsed and logged here; ``int()`` raises ``ValueError``
    when it is not a number.

    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar", "foo"]

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)

        # NOTE(review): only spaces are escaped; other reserved characters
        # in the query ('&', '#', ...) reach the URL unchanged -- confirm
        # whether full URL-encoding is wanted here.
        query_string = query.replace(" ", "+")
        url = "http://api.istex.fr/document/?q=" + query_string + "&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = MedlineFetcher()

        try:
            thedata_path = tasks.download(url)
            # the context manager closes the file even if read/decode fails
            with open(thedata_path, "rb") as thedata:
                alist = thedata.read().decode('utf-8')
        except Exception as error:
            # best effort: the error text is sent back to the client
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)
def testISTEX(request, project_id):
    """
    Download ISTEX results for a query into a new corpus of a project.

    Checks, in order: ``project_id`` is an integer and designates a node
    of type 'PROJECT' (otherwise ``Http404``), the user is authenticated
    (otherwise redirect to the login page) and owns the project
    (otherwise ``HttpResponseForbidden``).

    On POST, the ``query`` and ``N`` form fields are read, one ISTEX API
    URL is built per page of at most 50 results, a 'CORPUS' node is
    created under the project, the URLs are fetched by 8 worker threads
    and every result that is not ``False`` is added to the corpus as a
    resource.  The parsing workflow is then started and the client is
    redirected to the project page.  If no download succeeded,
    ``["fail"]`` is returned as JSON.

    For any other method, ``[query_string, query, N]`` is returned as
    JSON with the default values ``"-"``, ``"-"`` and ``0``.

    Raises ``ValueError`` when ``N`` is not an integer or exceeds
    ``QUERY_SIZE_N_MAX``.
    """
    print("testISTEX:")
    print(request.method)

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (
        session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    # Defaults are set before the method check so that the fallback
    # response at the end of the function can be built for non-POST
    # requests too (these names used to be unbound there).
    query = "-"
    query_string = "-"
    N = 0

    if request.method == "POST":
        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")   # url encoded q

        if "N" in request.POST:
            N = int(request.POST["N"])     # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ", msg)
                raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        # One request per page; the size of each page is the length of its
        # chunk, so the last page is shorter when N is not a multiple of
        # the page size.
        pagesize = 50
        tasks = MedlineFetcher()
        urlreqs = [
            "http://api.istex.fr/document/?q=" + query_string + "&output=*&"
            + "from=" + str(chunk[0]) + "&size=" + str(len(chunk))
            for chunk in tasks.chunks(range(N), pagesize)
        ]

        # corpus node instantiation as a Django model
        corpus = Node(
            name=query,
            user_id=request.user.id,
            parent_id=project_id,
            typename='CORPUS',
            hyperdata={"action": "Scraping data",
                       "language_id": None}
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id

        print("NEW CORPUS", corpus_id)
        ensure_dir(request.user)

        for _ in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the uploaded resource to the corpus
                # NOTE(review): 3 is a hard-coded resource type -- confirm
                # that it designates the ISTex entry and not another
                # source.
                corpus.add_resource(type=3, path=filename)
                dwnldsOK += 1

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            # NOTE(review): as written, parse_extract_indexhyperdata is
            # called right here and its return value is what scheduled()
            # receives -- confirm whether
            # scheduled(parse_extract_indexhyperdata)(corpus_id) was meant.
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = [query_string, query, N]
    return JsonHttpResponse(data)
scrapers/urls.py
View file @
f2f0ce75
from
django.conf.urls
import
url
import
scrapers.pubmed
as
pubmed
#import scrapers.istex as istex
#import scrapers.istex as istex
#import scrapers.cern as cern
#import scrapers.hal as hal
...
...
@@ -13,8 +14,8 @@ import scrapers.pubmed as pubmed
urlpatterns
=
[
url
(
r'^pubmed/query$'
,
pubmed
.
getGlobalStats
)
,
url
(
r'^pubmed/search/(\d+)'
,
pubmed
.
doTheQuery
)
# , url(r'^istex/query$' , pubmed.getGlobalStatsISTEXT
)
# , url(r'^istex/search/(\d+)' , pubmed.testISTEX
)
,
url
(
r'^istex/query$'
,
pubmed
.
getGlobalStatsISTEXT
)
,
url
(
r'^istex/search/(\d+)'
,
pubmed
.
testISTEX
)
#, url(r'^scraping$' , scraping.Target.as_view() )
,
]
templates/pages/projects/project.html
View file @
f2f0ce75
...
...
@@ -370,10 +370,10 @@
}
if
(
theType
==
"ISTex"
)
{
console
.
log
(
window
.
location
.
origin
+
"
tests/istext
query"
)
console
.
log
(
window
.
location
.
origin
+
"
scrapers/istex/
query"
)
$
.
ajax
({
// contentType: "application/json",
url
:
window
.
location
.
origin
+
"/
tests/istext
query"
,
url
:
window
.
location
.
origin
+
"/
scrapers/istex/
query"
,
data
:
formData
,
type
:
'POST'
,
beforeSend
:
function
(
xhr
)
{
...
...
@@ -504,7 +504,7 @@
$
.
ajax
({
// contentType: "application/json",
url
:
window
.
location
.
origin
+
"/
tests/project/"
+
projectid
+
"/ISTEXquery/go"
,
url
:
window
.
location
.
origin
+
"/
scrapers/istex/search/"
+
projectid
,
data
:
postQuery
,
type
:
'POST'
,
beforeSend
:
function
(
xhr
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment