Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
45a1fa9d
Commit
45a1fa9d
authored
Feb 16, 2015
by
PkSM3
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[UPDATE] istex|pubmed scrapper
parent
119671ab
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
142 additions
and
59 deletions
+142
-59
urls.py
gargantext_web/urls.py
+1
-0
admin.py
node/admin.py
+2
-2
models.py
node/models.py
+4
-4
PubmedFileParser.py
parsing/FileParsers/PubmedFileParser.py
+20
-13
MedlineFetcherDavid2015.py
scrap_pubmed/MedlineFetcherDavid2015.py
+11
-1
views.py
scrap_pubmed/views.py
+34
-2
project.html
templates/project.html
+70
-37
No files found.
gargantext_web/urls.py
View file @
45a1fa9d
...
...
@@ -70,6 +70,7 @@ urlpatterns = patterns('',
url
(
r'^tests/mvc$'
,
views
.
tests_mvc
),
url
(
r'^tests/mvc-listdocuments$'
,
views
.
tests_mvc_listdocuments
),
url
(
r'^tests/istextquery$'
,
pubmedscrapper
.
getGlobalStatsISTEXT
),
url
(
r'^tests/pubmedquery$'
,
pubmedscrapper
.
getGlobalStats
),
url
(
r'^tests/project/(\d+)/pubmedquery/go$'
,
pubmedscrapper
.
doTheQuery
),
url
(
r'^tests/project/(\d+)/ISTEXquery/go$'
,
pubmedscrapper
.
testISTEX
)
...
...
node/admin.py
View file @
45a1fa9d
...
...
@@ -125,8 +125,8 @@ class CustomForm(forms.Form):
# file_.name = str(datetime.now().microsecond)
# # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
# #File size
if
len
(
file_
)
>
104857600
:
raise
forms
.
ValidationError
(
_
(
'File to heavy! (<100MB).'
))
#
if len(file_)>104857600:
#
raise forms.ValidationError(_('File to heavy! (<100MB).'))
## File type:
# if file_.content_type == "application/zip":
# raise forms.ValidationError(_('We need a zip pls.'))
...
...
node/models.py
View file @
45a1fa9d
...
...
@@ -81,7 +81,6 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
metadata_cache
=
{
metadata
.
name
:
metadata
for
metadata
in
Metadata
.
objects
.
all
()}
data
=
[]
for
node
in
self
:
print
(
node
.
id
)
for
key
,
value
in
node
.
metadata
.
items
():
if
key
in
metadata_cache
:
metadata
=
metadata_cache
[
key
]
...
...
@@ -249,13 +248,14 @@ class Node(CTENode):
@
current_app
.
task
(
filter
=
task_method
)
def
workflow
(
self
,
keys
=
None
,
ngramsextractorscache
=
None
,
ngramscaches
=
None
,
verbose
=
False
):
import
time
total
=
0
print
(
"LOG::TIME: In workflow() parse_resources()"
)
start
=
time
.
time
()
self
.
metadata
[
'Processing'
]
=
1
self
.
save
()
self
.
parse_resources
()
end
=
time
.
time
()
total
+=
(
end
-
start
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" parse_resources() [s]"
,(
end
-
start
))
print
(
"LOG::TIME: In workflow() / parse_resources()"
)
...
...
@@ -266,7 +266,7 @@ class Node(CTENode):
self
.
children
.
filter
(
type_id
=
type_document
.
pk
)
.
extract_ngrams
(
keys
=
[
'title'
,])
end
=
time
.
time
()
print
(
"- - - - - - - - - -
\n
"
)
print
(
"LOG::TIME: "
,(
end
-
start
)
)
total
+=
(
end
-
start
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" extract_ngrams() [s]"
,(
end
-
start
))
print
(
"LOG::TIME: In workflow() / extract_ngrams()"
)
...
...
@@ -275,9 +275,9 @@ class Node(CTENode):
from
analysis.functions
import
do_tfidf
do_tfidf
(
self
)
end
=
time
.
time
()
total
+=
(
end
-
start
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" do_tfidf() [s]"
,(
end
-
start
))
print
(
"LOG::TIME: In workflow() / do_tfidf()"
)
print
(
"In workflow() END"
)
self
.
metadata
[
'Processing'
]
=
0
self
.
save
()
...
...
parsing/FileParsers/PubmedFileParser.py
View file @
45a1fa9d
...
...
@@ -80,12 +80,19 @@ class PubmedFileParser(FileParser):
if
len
(
RealDate
)
>
4
:
if
len
(
RealDate
)
>
8
:
try
:
Decision
=
datetime
.
strptime
(
RealDate
,
'
%
Y
%
b
%
d'
)
.
date
()
except
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
except
:
try
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
except
:
Decision
=
False
else
:
try
:
Decision
=
datetime
.
strptime
(
RealDate
,
'
%
Y
%
b'
)
.
date
()
except
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
else
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
except
:
try
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
except
:
Decision
=
False
else
:
try
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
except
:
Decision
=
False
if
Decision
!=
False
:
if
"publication_year"
in
metadata
:
metadata
[
"publication_year"
]
=
str
(
Decision
.
year
)
if
"publication_month"
in
metadata
:
metadata
[
"publication_month"
]
=
str
(
Decision
.
month
)
if
"publication_day"
in
metadata
:
metadata
[
"publication_day"
]
=
str
(
Decision
.
day
)
...
...
scrap_pubmed/MedlineFetcherDavid2015.py
View file @
45a1fa9d
...
...
@@ -105,6 +105,13 @@ class MedlineFetcher:
print
(
threading
.
current_thread
()
.
name
,
filename
+
" OK"
)
return
filename
# generic!
def test_downloadFile(self, item):
    """Open the URL held in ``item`` and return the response object.

    Parameters:
        item: an indexable whose first element is the URL to fetch. Callers
            in this project pass ``[url, filename]``; the second element is
            not used here, so it is no longer read.

    Returns:
        Whatever ``urlopen(url)`` returns. The response is left open so the
        caller can ``read()`` it; the caller is responsible for closing it.

    Raises:
        Any exception raised by ``urlopen`` is propagated unchanged.
    """
    url = item[0]
    print("\tin downloadFile:")
    # NOTE(review): no timeout is passed, so this presumably blocks for as
    # long as the remote end keeps the connection pending -- confirm whether
    # a timeout should be configured.
    data = urlopen(url)
    return data
# generic!
def
do_work
(
self
,
item
):
...
...
@@ -124,7 +131,10 @@ class MedlineFetcher:
def worker2(self):
    """Consume download jobs from ``self.q`` forever, recording each outcome.

    For every item taken from the queue, ``self.downloadFile(item)`` is
    called exactly once and its return value is appended to
    ``self.firstResults``. When the download raises, ``False`` is appended
    instead, so one failing item does not kill the worker and consumers can
    skip failed entries. ``self.q.task_done()`` is called once per item so
    that ``self.q.join()`` can return.

    This method never returns on its own (``while True``).
    """
    while True:
        item = self.q.get()
        try:
            result = self.downloadFile(item)
        except Exception:
            # A failed download is recorded as False rather than propagated.
            # Only Exception is caught so interpreter-level signals such as
            # KeyboardInterrupt/SystemExit are not swallowed.
            result = False
        self.firstResults.append(result)
        self.q.task_done()
def
chunks
(
self
,
l
,
n
):
...
...
scrap_pubmed/views.py
View file @
45a1fa9d
...
...
@@ -43,6 +43,32 @@ def getGlobalStats(request ):
return
JsonHttpResponse
(
data
)
def getGlobalStatsISTEXT(request):
    """Forward a POSTed query to the ISTEX document API and return the reply.

    For a POST request, the ``query`` form field is turned into an ISTEX
    search URL (spaces replaced by ``+``), fetched through
    ``MedlineFetcher.test_downloadFile`` and the UTF-8 decoded body is sent
    back. If fetching or decoding fails, the payload is a one-element list
    holding the error message. For any other HTTP method the placeholder
    list ``["bar", "foo"]`` is returned.

    Parameters:
        request: the incoming Django request.

    Returns:
        ``JsonHttpResponse`` wrapping either the decoded response text, the
        error list, or the placeholder list.

    Raises:
        Whatever ``request.POST["query"]`` raises when the field is missing;
        that lookup is outside the guarded section.
    """
    print(request.method)
    alist = ["bar", "foo"]

    if request.method == "POST":
        N = 100
        query = request.POST["query"]
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)

        # NOTE(review): only spaces are escaped; other reserved characters in
        # the user's query (e.g. '&', '#') reach the URL as-is -- confirm
        # whether the client pre-encodes the query before switching to
        # proper URL quoting.
        query_string = query.replace(" ", "+")
        url = "http://api.istex.fr/document/?q=" + query_string + "&output=*"

        tasks = MedlineFetcher()
        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
        try:
            thedata = tasks.test_downloadFile([url, filename])
            try:
                # NOTE(review): on success the payload is the raw response
                # text (a str), not a list like the other branches -- verify
                # that the front-end expects this.
                alist = thedata.read().decode('utf-8')
            finally:
                # Release the connection even when read/decode fails.
                thedata.close()
        except Exception as error:
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)
def
doTheQuery
(
request
,
project_id
):
alist
=
[
"hola"
,
"mundo"
]
...
...
@@ -97,8 +123,13 @@ def doTheQuery(request , project_id):
filename
=
MEDIA_ROOT
+
'/corpora/
%
s/
%
s'
%
(
request
.
user
,
str
(
datetime
.
datetime
.
now
()
.
isoformat
()))
tasks
.
q
.
put
(
[
url
,
filename
])
# put a task in the queue
tasks
.
q
.
join
()
# wait until everything is finished
dwnldsOK
=
0
for
filename
in
tasks
.
firstResults
:
if
filename
!=
False
:
corpus
.
add_resource
(
user
=
request
.
user
,
type
=
resource_type
,
file
=
filename
)
dwnldsOK
+=
1
if
dwnldsOK
==
0
:
return
JsonHttpResponse
([
"fail"
])
# do the WorkFlow
try
:
...
...
@@ -146,7 +177,8 @@ def testISTEX(request , project_id):
urlreqs
.
append
(
"http://api.istex.fr/document/?q="
+
query_string
+
"&output=*&"
+
"from="
+
str
(
k
[
0
])
+
"&size="
+
str
(
pagesize
))
print
(
urlreqs
)
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
urlreqs
=
[
"http://localhost/374255"
,
"http://localhost/374278"
]
print
(
urlreqs
)
resource_type
=
ResourceType
.
objects
.
get
(
name
=
"istext"
)
...
...
templates/project.html
View file @
45a1fa9d
...
...
@@ -313,6 +313,9 @@
console
.
log
(
"disabling "
+
"#"
+
value
.
id
)
$
(
"#"
+
value
.
id
).
prop
(
'onclick'
,
null
);
var
theType
=
$
(
"#id_type option:selected"
).
html
();
if
(
theType
==
"pubmed"
)
{
$
.
ajax
({
// contentType: "application/json",
url
:
window
.
location
.
origin
+
"/tests/pubmedquery"
,
...
...
@@ -348,9 +351,39 @@
});
}
if
(
theType
==
"istext"
)
{
console
.
log
(
window
.
location
.
origin
+
"tests/istextquery"
)
$
.
ajax
({
// contentType: "application/json",
url
:
window
.
location
.
origin
+
"/tests/istextquery"
,
data
:
formData
,
type
:
'POST'
,
beforeSend
:
function
(
xhr
)
{
xhr
.
setRequestHeader
(
"X-CSRFToken"
,
getCookie
(
"csrftoken"
));
},
success
:
function
(
data
)
{
console
.
log
(
"in getGlobalResults"
)
console
.
log
(
data
)
console
.
log
(
"enabling "
+
"#"
+
value
.
id
)
$
(
"#"
+
value
.
id
).
attr
(
'onclick'
,
'getGlobalResults(this);'
);
// $("#submit_thing").prop('disabled' , false)
$
(
"#submit_thing"
).
html
(
"Process a 100 sample!"
)
$
(
"#theresults"
).
html
(
"<i> <b>"
+
pubmedquery
+
"</b>: "
+
data
[
0
]
+
"</i><br>"
)
thequeries
=
data
},
error
:
function
(
result
)
{
console
.
log
(
"Data not found"
);
}
});
}
}
// CSS events for selecting one Radio-Input
function
FileOrNotFile
(
value
)
{
var
showfile
=
JSON
.
parse
(
value
)
var
theType
=
$
(
"#id_type option:selected"
).
html
();
// @upload-file events
if
(
showfile
)
{
console
.
log
(
"You've clicked the YES"
)
...
...
@@ -376,7 +409,7 @@
$
(
"#id_name"
).
on
(
'input'
,
function
(
e
){
console
.
log
(
$
(
this
).
val
())
testAjax
(
$
(
this
).
val
()
)
if
(
theType
==
"pubmed"
)
testPUBMED
(
$
(
this
).
val
()
)
});
}
}
...
...
@@ -384,8 +417,8 @@
//CSS events for changing the Select element
function
CustomForSelect
(
selected
)
{
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if
(
selected
==
"pubmed"
)
{
console
.
log
(
"show the button
"
)
if
(
selected
==
"pubmed"
||
selected
==
"istext"
)
{
console
.
log
(
"show the button
for: "
+
selected
)
$
(
"#pubmedcrawl"
).
css
(
"visibility"
,
"visible"
);
$
(
"#pubmedcrawl"
).
show
();
$
(
"#file_yes"
).
click
();
...
...
@@ -414,7 +447,7 @@
return
data
;
}
function
test
Ajax
(
query
)
{
function
test
PUBMED
(
query
)
{
LastData
=
[]
if
(
!
query
||
query
==
""
)
return
;
var
pubmedquery
=
encodeURIComponent
(
query
)
...
...
@@ -450,7 +483,7 @@
success
:
function
(
data
)
{
console
.
log
(
"ajax_success: in testISTEX()"
)
console
.
log
(
data
)
location
.
reload
();
//
location.reload();
},
error
:
function
(
result
)
{
console
.
log
(
"in testISTEX(). Data not found"
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment