Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
45a1fa9d
Commit
45a1fa9d
authored
10 years ago
by
PkSM3
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[UPDATE] istex|pubmed scrapper
parent
119671ab
master
1.0.0_Rose_Bonbon
2.0.0_Red_Lemon
EuropressOld
agentsBasedModelling
anoe-gargantext-light
c24b-stable
c24b-stable-patch
c24b-testing-
dev
dev-graphExplorerJSON
gargantext-light
haskell
install
patch-1
patch-2
prod
prod-dev
remote
sankey
sankey-inegalites
set_model
simon-auth
simon-big-clean-up
simon-data-import
simon-dev
simon-experimental
simon-experimental-share
simon-gargantext-light
simon-pipenv
simon-rest-doc
simon-tasks
simon-testing
simon-unstable
simon-unstable-lists-fix
simon-unstable-refact-models
simon-wip
stable
stable-help
stable-imt
stable-imt-hal
stable-imt-link
stable-imt-merge
stable-imt-notebook
stable-notebook
stable-origin
stable-patch
stable-v3-imt
testing
testing-distri
testing-graph-growth
testing-graph-public
testing-imt
testing-jwt
testing-langParsing
testing-merge
testing-mine
testing-notebook
testing-share
tina
unstable
unstable-docker
unstable-newBdd
unstable-notebook
unstable-philoNotebook
No related merge requests found
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
142 additions
and
59 deletions
+142
-59
urls.py
gargantext_web/urls.py
+1
-0
admin.py
node/admin.py
+2
-2
models.py
node/models.py
+4
-4
PubmedFileParser.py
parsing/FileParsers/PubmedFileParser.py
+20
-13
MedlineFetcherDavid2015.py
scrap_pubmed/MedlineFetcherDavid2015.py
+11
-1
views.py
scrap_pubmed/views.py
+34
-2
project.html
templates/project.html
+70
-37
No files found.
gargantext_web/urls.py
View file @
45a1fa9d
...
@@ -70,6 +70,7 @@ urlpatterns = patterns('',
...
@@ -70,6 +70,7 @@ urlpatterns = patterns('',
url
(
r'^tests/mvc$'
,
views
.
tests_mvc
),
url
(
r'^tests/mvc$'
,
views
.
tests_mvc
),
url
(
r'^tests/mvc-listdocuments$'
,
views
.
tests_mvc_listdocuments
),
url
(
r'^tests/mvc-listdocuments$'
,
views
.
tests_mvc_listdocuments
),
url
(
r'^tests/istextquery$'
,
pubmedscrapper
.
getGlobalStatsISTEXT
),
url
(
r'^tests/pubmedquery$'
,
pubmedscrapper
.
getGlobalStats
),
url
(
r'^tests/pubmedquery$'
,
pubmedscrapper
.
getGlobalStats
),
url
(
r'^tests/project/(\d+)/pubmedquery/go$'
,
pubmedscrapper
.
doTheQuery
),
url
(
r'^tests/project/(\d+)/pubmedquery/go$'
,
pubmedscrapper
.
doTheQuery
),
url
(
r'^tests/project/(\d+)/ISTEXquery/go$'
,
pubmedscrapper
.
testISTEX
)
url
(
r'^tests/project/(\d+)/ISTEXquery/go$'
,
pubmedscrapper
.
testISTEX
)
...
...
This diff is collapsed.
Click to expand it.
node/admin.py
View file @
45a1fa9d
...
@@ -125,8 +125,8 @@ class CustomForm(forms.Form):
...
@@ -125,8 +125,8 @@ class CustomForm(forms.Form):
# file_.name = str(datetime.now().microsecond)
# file_.name = str(datetime.now().microsecond)
# # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
# # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
# #File size
# #File size
if
len
(
file_
)
>
104857600
:
#
if len(file_)>104857600:
raise
forms
.
ValidationError
(
_
(
'File to heavy! (<100MB).'
))
#
raise forms.ValidationError(_('File to heavy! (<100MB).'))
## File type:
## File type:
# if file_.content_type == "application/zip":
# if file_.content_type == "application/zip":
# raise forms.ValidationError(_('We need a zip pls.'))
# raise forms.ValidationError(_('We need a zip pls.'))
...
...
This diff is collapsed.
Click to expand it.
node/models.py
View file @
45a1fa9d
...
@@ -81,7 +81,6 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
...
@@ -81,7 +81,6 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
metadata_cache
=
{
metadata
.
name
:
metadata
for
metadata
in
Metadata
.
objects
.
all
()}
metadata_cache
=
{
metadata
.
name
:
metadata
for
metadata
in
Metadata
.
objects
.
all
()}
data
=
[]
data
=
[]
for
node
in
self
:
for
node
in
self
:
print
(
node
.
id
)
for
key
,
value
in
node
.
metadata
.
items
():
for
key
,
value
in
node
.
metadata
.
items
():
if
key
in
metadata_cache
:
if
key
in
metadata_cache
:
metadata
=
metadata_cache
[
key
]
metadata
=
metadata_cache
[
key
]
...
@@ -249,13 +248,14 @@ class Node(CTENode):
...
@@ -249,13 +248,14 @@ class Node(CTENode):
@
current_app
.
task
(
filter
=
task_method
)
@
current_app
.
task
(
filter
=
task_method
)
def
workflow
(
self
,
keys
=
None
,
ngramsextractorscache
=
None
,
ngramscaches
=
None
,
verbose
=
False
):
def
workflow
(
self
,
keys
=
None
,
ngramsextractorscache
=
None
,
ngramscaches
=
None
,
verbose
=
False
):
import
time
import
time
total
=
0
print
(
"LOG::TIME: In workflow() parse_resources()"
)
print
(
"LOG::TIME: In workflow() parse_resources()"
)
start
=
time
.
time
()
start
=
time
.
time
()
self
.
metadata
[
'Processing'
]
=
1
self
.
metadata
[
'Processing'
]
=
1
self
.
save
()
self
.
save
()
self
.
parse_resources
()
self
.
parse_resources
()
end
=
time
.
time
()
end
=
time
.
time
()
total
+=
(
end
-
start
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" parse_resources() [s]"
,(
end
-
start
))
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" parse_resources() [s]"
,(
end
-
start
))
print
(
"LOG::TIME: In workflow() / parse_resources()"
)
print
(
"LOG::TIME: In workflow() / parse_resources()"
)
...
@@ -266,7 +266,7 @@ class Node(CTENode):
...
@@ -266,7 +266,7 @@ class Node(CTENode):
self
.
children
.
filter
(
type_id
=
type_document
.
pk
)
.
extract_ngrams
(
keys
=
[
'title'
,])
self
.
children
.
filter
(
type_id
=
type_document
.
pk
)
.
extract_ngrams
(
keys
=
[
'title'
,])
end
=
time
.
time
()
end
=
time
.
time
()
print
(
"- - - - - - - - - -
\n
"
)
print
(
"- - - - - - - - - -
\n
"
)
print
(
"LOG::TIME: "
,(
end
-
start
)
)
total
+=
(
end
-
start
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" extract_ngrams() [s]"
,(
end
-
start
))
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" extract_ngrams() [s]"
,(
end
-
start
))
print
(
"LOG::TIME: In workflow() / extract_ngrams()"
)
print
(
"LOG::TIME: In workflow() / extract_ngrams()"
)
...
@@ -275,9 +275,9 @@ class Node(CTENode):
...
@@ -275,9 +275,9 @@ class Node(CTENode):
from
analysis.functions
import
do_tfidf
from
analysis.functions
import
do_tfidf
do_tfidf
(
self
)
do_tfidf
(
self
)
end
=
time
.
time
()
end
=
time
.
time
()
total
+=
(
end
-
start
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" do_tfidf() [s]"
,(
end
-
start
))
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" do_tfidf() [s]"
,(
end
-
start
))
print
(
"LOG::TIME: In workflow() / do_tfidf()"
)
print
(
"LOG::TIME: In workflow() / do_tfidf()"
)
print
(
"In workflow() END"
)
print
(
"In workflow() END"
)
self
.
metadata
[
'Processing'
]
=
0
self
.
metadata
[
'Processing'
]
=
0
self
.
save
()
self
.
save
()
...
...
This diff is collapsed.
Click to expand it.
parsing/FileParsers/PubmedFileParser.py
View file @
45a1fa9d
...
@@ -80,12 +80,19 @@ class PubmedFileParser(FileParser):
...
@@ -80,12 +80,19 @@ class PubmedFileParser(FileParser):
if
len
(
RealDate
)
>
4
:
if
len
(
RealDate
)
>
4
:
if
len
(
RealDate
)
>
8
:
if
len
(
RealDate
)
>
8
:
try
:
Decision
=
datetime
.
strptime
(
RealDate
,
'
%
Y
%
b
%
d'
)
.
date
()
try
:
Decision
=
datetime
.
strptime
(
RealDate
,
'
%
Y
%
b
%
d'
)
.
date
()
except
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
except
:
try
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
except
:
Decision
=
False
else
:
else
:
try
:
Decision
=
datetime
.
strptime
(
RealDate
,
'
%
Y
%
b'
)
.
date
()
try
:
Decision
=
datetime
.
strptime
(
RealDate
,
'
%
Y
%
b'
)
.
date
()
except
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
except
:
else
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
try
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
except
:
Decision
=
False
else
:
try
:
Decision
=
datetime
.
strptime
(
PubmedDate
,
'
%
Y
%
m
%
d'
)
.
date
()
except
:
Decision
=
False
if
Decision
!=
False
:
if
"publication_year"
in
metadata
:
metadata
[
"publication_year"
]
=
str
(
Decision
.
year
)
if
"publication_year"
in
metadata
:
metadata
[
"publication_year"
]
=
str
(
Decision
.
year
)
if
"publication_month"
in
metadata
:
metadata
[
"publication_month"
]
=
str
(
Decision
.
month
)
if
"publication_month"
in
metadata
:
metadata
[
"publication_month"
]
=
str
(
Decision
.
month
)
if
"publication_day"
in
metadata
:
metadata
[
"publication_day"
]
=
str
(
Decision
.
day
)
if
"publication_day"
in
metadata
:
metadata
[
"publication_day"
]
=
str
(
Decision
.
day
)
...
...
This diff is collapsed.
Click to expand it.
scrap_pubmed/MedlineFetcherDavid2015.py
View file @
45a1fa9d
...
@@ -105,6 +105,13 @@ class MedlineFetcher:
...
@@ -105,6 +105,13 @@ class MedlineFetcher:
print
(
threading
.
current_thread
()
.
name
,
filename
+
" OK"
)
print
(
threading
.
current_thread
()
.
name
,
filename
+
" OK"
)
return
filename
return
filename
# generic!
def
test_downloadFile
(
self
,
item
):
url
=
item
[
0
]
filename
=
item
[
1
]
print
(
"
\t
in downloadFile:"
)
data
=
urlopen
(
url
)
return
data
# generic!
# generic!
def
do_work
(
self
,
item
):
def
do_work
(
self
,
item
):
...
@@ -124,7 +131,10 @@ class MedlineFetcher:
...
@@ -124,7 +131,10 @@ class MedlineFetcher:
def
worker2
(
self
):
def
worker2
(
self
):
while
True
:
while
True
:
item
=
self
.
q
.
get
()
item
=
self
.
q
.
get
()
self
.
firstResults
.
append
(
self
.
downloadFile
(
item
))
results
=
[]
try
:
result
=
self
.
downloadFile
(
item
)
except
:
result
=
False
self
.
firstResults
.
append
(
result
)
self
.
q
.
task_done
()
self
.
q
.
task_done
()
def
chunks
(
self
,
l
,
n
):
def
chunks
(
self
,
l
,
n
):
...
...
This diff is collapsed.
Click to expand it.
scrap_pubmed/views.py
View file @
45a1fa9d
...
@@ -43,6 +43,32 @@ def getGlobalStats(request ):
...
@@ -43,6 +43,32 @@ def getGlobalStats(request ):
return
JsonHttpResponse
(
data
)
return
JsonHttpResponse
(
data
)
def
getGlobalStatsISTEXT
(
request
):
print
(
request
.
method
)
alist
=
[
"bar"
,
"foo"
]
if
request
.
method
==
"POST"
:
N
=
100
query
=
request
.
POST
[
"query"
]
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" query ="
,
query
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" N ="
,
N
)
query_string
=
query
.
replace
(
" "
,
"+"
)
url
=
"http://api.istex.fr/document/?q="
+
query_string
+
"&output=*"
tasks
=
MedlineFetcher
()
filename
=
MEDIA_ROOT
+
'/corpora/
%
s/
%
s'
%
(
request
.
user
,
str
(
datetime
.
datetime
.
now
()
.
isoformat
()))
try
:
thedata
=
tasks
.
test_downloadFile
(
[
url
,
filename
]
)
alist
=
thedata
.
read
()
.
decode
(
'utf-8'
)
except
Exception
as
error
:
alist
=
[
str
(
error
)]
data
=
alist
return
JsonHttpResponse
(
data
)
def
doTheQuery
(
request
,
project_id
):
def
doTheQuery
(
request
,
project_id
):
alist
=
[
"hola"
,
"mundo"
]
alist
=
[
"hola"
,
"mundo"
]
...
@@ -97,8 +123,13 @@ def doTheQuery(request , project_id):
...
@@ -97,8 +123,13 @@ def doTheQuery(request , project_id):
filename
=
MEDIA_ROOT
+
'/corpora/
%
s/
%
s'
%
(
request
.
user
,
str
(
datetime
.
datetime
.
now
()
.
isoformat
()))
filename
=
MEDIA_ROOT
+
'/corpora/
%
s/
%
s'
%
(
request
.
user
,
str
(
datetime
.
datetime
.
now
()
.
isoformat
()))
tasks
.
q
.
put
(
[
url
,
filename
])
#put a task in th queue
tasks
.
q
.
put
(
[
url
,
filename
])
#put a task in th queue
tasks
.
q
.
join
()
# wait until everything is finished
tasks
.
q
.
join
()
# wait until everything is finished
dwnldsOK
=
0
for
filename
in
tasks
.
firstResults
:
for
filename
in
tasks
.
firstResults
:
if
filename
!=
False
:
corpus
.
add_resource
(
user
=
request
.
user
,
type
=
resource_type
,
file
=
filename
)
corpus
.
add_resource
(
user
=
request
.
user
,
type
=
resource_type
,
file
=
filename
)
dwnldsOK
+=
1
if
dwnldsOK
==
0
:
return
JsonHttpResponse
([
"fail"
])
# do the WorkFlow
# do the WorkFlow
try
:
try
:
...
@@ -146,7 +177,8 @@ def testISTEX(request , project_id):
...
@@ -146,7 +177,8 @@ def testISTEX(request , project_id):
urlreqs
.
append
(
"http://api.istex.fr/document/?q="
+
query_string
+
"&output=*&"
+
"from="
+
str
(
k
[
0
])
+
"&size="
+
str
(
pagesize
))
urlreqs
.
append
(
"http://api.istex.fr/document/?q="
+
query_string
+
"&output=*&"
+
"from="
+
str
(
k
[
0
])
+
"&size="
+
str
(
pagesize
))
print
(
urlreqs
)
print
(
urlreqs
)
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
urlreqs
=
[
"http://localhost/374255"
,
"http://localhost/374278"
]
print
(
urlreqs
)
resource_type
=
ResourceType
.
objects
.
get
(
name
=
"istext"
)
resource_type
=
ResourceType
.
objects
.
get
(
name
=
"istext"
)
...
...
This diff is collapsed.
Click to expand it.
templates/project.html
View file @
45a1fa9d
...
@@ -313,6 +313,9 @@
...
@@ -313,6 +313,9 @@
console
.
log
(
"disabling "
+
"#"
+
value
.
id
)
console
.
log
(
"disabling "
+
"#"
+
value
.
id
)
$
(
"#"
+
value
.
id
).
prop
(
'onclick'
,
null
);
$
(
"#"
+
value
.
id
).
prop
(
'onclick'
,
null
);
var
theType
=
$
(
"#id_type option:selected"
).
html
();
if
(
theType
==
"pubmed"
)
{
$
.
ajax
({
$
.
ajax
({
// contentType: "application/json",
// contentType: "application/json",
url
:
window
.
location
.
origin
+
"/tests/pubmedquery"
,
url
:
window
.
location
.
origin
+
"/tests/pubmedquery"
,
...
@@ -348,9 +351,39 @@
...
@@ -348,9 +351,39 @@
});
});
}
}
if
(
theType
==
"istext"
)
{
console
.
log
(
window
.
location
.
origin
+
"tests/istextquery"
)
$
.
ajax
({
// contentType: "application/json",
url
:
window
.
location
.
origin
+
"/tests/istextquery"
,
data
:
formData
,
type
:
'POST'
,
beforeSend
:
function
(
xhr
)
{
xhr
.
setRequestHeader
(
"X-CSRFToken"
,
getCookie
(
"csrftoken"
));
},
success
:
function
(
data
)
{
console
.
log
(
"in getGlobalResults"
)
console
.
log
(
data
)
console
.
log
(
"enabling "
+
"#"
+
value
.
id
)
$
(
"#"
+
value
.
id
).
attr
(
'onclick'
,
'getGlobalResults(this);'
);
// $("#submit_thing").prop('disabled' , false)
$
(
"#submit_thing"
).
html
(
"Process a 100 sample!"
)
$
(
"#theresults"
).
html
(
"<i> <b>"
+
pubmedquery
+
"</b>: "
+
data
[
0
]
+
"</i><br>"
)
thequeries
=
data
},
error
:
function
(
result
)
{
console
.
log
(
"Data not found"
);
}
});
}
}
// CSS events for selecting one Radio-Input
// CSS events for selecting one Radio-Input
function
FileOrNotFile
(
value
)
{
function
FileOrNotFile
(
value
)
{
var
showfile
=
JSON
.
parse
(
value
)
var
showfile
=
JSON
.
parse
(
value
)
var
theType
=
$
(
"#id_type option:selected"
).
html
();
// @upload-file events
// @upload-file events
if
(
showfile
)
{
if
(
showfile
)
{
console
.
log
(
"You've clicked the YES"
)
console
.
log
(
"You've clicked the YES"
)
...
@@ -376,7 +409,7 @@
...
@@ -376,7 +409,7 @@
$
(
"#id_name"
).
on
(
'input'
,
function
(
e
){
$
(
"#id_name"
).
on
(
'input'
,
function
(
e
){
console
.
log
(
$
(
this
).
val
())
console
.
log
(
$
(
this
).
val
())
testAjax
(
$
(
this
).
val
()
)
if
(
theType
==
"pubmed"
)
testPUBMED
(
$
(
this
).
val
()
)
});
});
}
}
}
}
...
@@ -384,8 +417,8 @@
...
@@ -384,8 +417,8 @@
//CSS events for changing the Select element
//CSS events for changing the Select element
function
CustomForSelect
(
selected
)
{
function
CustomForSelect
(
selected
)
{
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if
(
selected
==
"pubmed"
)
{
if
(
selected
==
"pubmed"
||
selected
==
"istext"
)
{
console
.
log
(
"show the button
"
)
console
.
log
(
"show the button
for: "
+
selected
)
$
(
"#pubmedcrawl"
).
css
(
"visibility"
,
"visible"
);
$
(
"#pubmedcrawl"
).
css
(
"visibility"
,
"visible"
);
$
(
"#pubmedcrawl"
).
show
();
$
(
"#pubmedcrawl"
).
show
();
$
(
"#file_yes"
).
click
();
$
(
"#file_yes"
).
click
();
...
@@ -414,7 +447,7 @@
...
@@ -414,7 +447,7 @@
return
data
;
return
data
;
}
}
function
test
Ajax
(
query
)
{
function
test
PUBMED
(
query
)
{
LastData
=
[]
LastData
=
[]
if
(
!
query
||
query
==
""
)
return
;
if
(
!
query
||
query
==
""
)
return
;
var
pubmedquery
=
encodeURIComponent
(
query
)
var
pubmedquery
=
encodeURIComponent
(
query
)
...
@@ -450,7 +483,7 @@
...
@@ -450,7 +483,7 @@
success
:
function
(
data
)
{
success
:
function
(
data
)
{
console
.
log
(
"ajax_success: in testISTEX()"
)
console
.
log
(
"ajax_success: in testISTEX()"
)
console
.
log
(
data
)
console
.
log
(
data
)
location
.
reload
();
//
location.reload();
},
},
error
:
function
(
result
)
{
error
:
function
(
result
)
{
console
.
log
(
"in testISTEX(). Data not found"
);
console
.
log
(
"in testISTEX(). Data not found"
);
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment