humanities / gargantext / Commits / 7476e4c0

Commit 7476e4c0
Authored Feb 16, 2015 by Administrator
Merge branch 'samuel' into testing
Parents: 2ea48b86, 4cfacd18

Showing 8 changed files with 180 additions and 87 deletions.
backupdb.py                                  +3   -0
gargantext_web/urls.py                       +1   -0
node/admin.py                                +2   -2
node/models.py                               +4   -4
parsing/FileParsers/PubmedFileParser.py     +20  -13
scrap_pubmed/MedlineFetcherDavid2015.py     +16   -4
scrap_pubmed/views.py                       +54  -27
templates/project.html                      +80  -37
backupdb.py  (new file, mode 100644)  +3 -0

import os
command = 'export PGPASSWORD=C8kdcUrAQy66U\npg_dump -U alexandre -h localhost gargandb| gzip > %s' % "mysqldump.db"
os.system(command)
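The new script shells out with the database password embedded in the command string. As a point of comparison only, a minimal sketch of the same dump (assuming the same database, user and output name, and Python 3.5+ for subprocess.run) passes the password through the process environment instead of the command line:

import gzip
import os
import subprocess

# Sketch only: same pg_dump call as backupdb.py, but the credential is passed
# via the environment rather than interpolated into a shell string.
env = dict(os.environ, PGPASSWORD='C8kdcUrAQy66U')   # credential taken from backupdb.py
dump = subprocess.run(
    ['pg_dump', '-U', 'alexandre', '-h', 'localhost', 'gargandb'],
    env=env, stdout=subprocess.PIPE, check=True,
)
with gzip.open('mysqldump.db', 'wb') as fh:          # same output name as the original
    fh.write(dump.stdout)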
gargantext_web/urls.py  +1 -0

@@ -70,6 +70,7 @@ urlpatterns = patterns('',
    url(r'^tests/mvc$', views.tests_mvc),
    url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
    url(r'^tests/istextquery$', pubmedscrapper.getGlobalStatsISTEXT),
    url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
    url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
    url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
node/admin.py  +2 -2

@@ -125,8 +125,8 @@ class CustomForm(forms.Form):
        # file_.name = str(datetime.now().microsecond)
        # # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
        # #File size
        if len(file_)>104857600:
            raise forms.ValidationError(_('File to heavy! (<100MB).'))
        # if len(file_)>104857600:
        #     raise forms.ValidationError(_('File to heavy! (<100MB).'))
        ## File type:
        # if file_.content_type == "application/zip":
        #     raise forms.ValidationError(_('We need a zip pls.'))
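For reference, the limit toggled here (104857600 bytes, i.e. 100 MiB) is the usual Django upload-size guard. A small illustrative sketch with a named constant and an assumed field name, not the project's actual CustomForm; Django's UploadedFile exposes the byte count as .size:

from django import forms
from django.utils.translation import ugettext_lazy as _

MAX_UPLOAD_BYTES = 100 * 1024 * 1024   # 104857600, the limit used in CustomForm

class UploadForm(forms.Form):           # hypothetical form, for illustration only
    file = forms.FileField()

    def clean_file(self):
        file_ = self.cleaned_data['file']
        if file_.size > MAX_UPLOAD_BYTES:
            raise forms.ValidationError(_('File too heavy! (<100MB).'))
        return file_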
node/models.py  +4 -4

@@ -81,7 +81,6 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
        metadata_cache = {metadata.name: metadata for metadata in Metadata.objects.all()}
        data = []
        for node in self:
            print(node.id)
            for key, value in node.metadata.items():
                if key in metadata_cache:
                    metadata = metadata_cache[key]

@@ -249,13 +248,14 @@ class Node(CTENode):
    @current_app.task(filter=task_method)
    def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
        import time
        total = 0
        print("LOG::TIME: In workflow() parse_resources()")
        start = time.time()
        self.metadata['Processing'] = 1
        self.save()
        self.parse_resources()
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " parse_resources() [s]", (end - start))
        print("LOG::TIME: In workflow() / parse_resources()")

@@ -266,7 +266,7 @@ class Node(CTENode):
        self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
        end = time.time()
        print("- - - - - - - - - - \n")
        print("LOG::TIME: ", (end - start))
        total += (end - start)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " extract_ngrams() [s]", (end - start))
        print("LOG::TIME: In workflow() / extract_ngrams()")

@@ -275,9 +275,9 @@ class Node(CTENode):
        from analysis.functions import do_tfidf
        do_tfidf(self)
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " do_tfidf() [s]", (end - start))
        print("LOG::TIME: In workflow() / do_tfidf()")
        print("In workflow() END")
        self.metadata['Processing'] = 0
        self.save()
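workflow() times each stage by hand (start = time.time(), end = time.time(), total += ...). The same LOG::TIME pattern can be expressed once with a context manager; this is an illustrative sketch, not code from the repository:

import datetime
import time
from contextlib import contextmanager

@contextmanager
def log_time(stage):
    """Print the elapsed wall-clock time of a stage, LOG::TIME style."""
    start = time.time()
    yield
    elapsed = time.time() - start
    print("LOG::TIME:_ " + datetime.datetime.now().isoformat() +
          " %s [s]" % stage, elapsed)

# usage, mirroring the stages timed in Node.workflow():
# with log_time("parse_resources()"):
#     self.parse_resources()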
parsing/FileParsers/PubmedFileParser.py  +20 -13

@@ -80,21 +80,28 @@ class PubmedFileParser(FileParser):
            if len(RealDate) > 4:
                if len(RealDate) > 8:
                    try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
                    except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
                    except:
                        try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
                        except: Decision = False
                else:
                    try: Decision = datetime.strptime(RealDate, '%Y %b').date()
                    except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
            else:
                Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
                    except:
                        try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
                        except: Decision = False
            else:
                try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
                except: Decision = False
            if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
            if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
            if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
            if "realdate_year_" in metadata: metadata.pop("realdate_year_")
            if "realdate_month_" in metadata: metadata.pop("realdate_month_")
            if "realdate_day_" in metadata: metadata.pop("realdate_day_")
            if "title2" in metadata: metadata.pop("title2")
            # print(metadata)
            metadata_list.append(metadata)
            if Decision != False:
                if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
                if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
                if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
                if "realdate_year_" in metadata: metadata.pop("realdate_year_")
                if "realdate_month_" in metadata: metadata.pop("realdate_month_")
                if "realdate_day_" in metadata: metadata.pop("realdate_day_")
                if "title2" in metadata: metadata.pop("title2")
                # print(metadata)
                metadata_list.append(metadata)
        # return the list of metadata
        return metadata_list
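The hunk layers try/except so that a malformed RealDate falls back to PubmedDate, and a record is only appended when a date could actually be parsed. A standalone sketch of that decision logic follows; the variable names and format strings come from the diff, but the exact whitespace inside the format strings is an assumption:

from datetime import datetime

def resolve_date(RealDate, PubmedDate):
    """Sketch of the fallback order used in PubmedFileParser: prefer the
    'real' date string, fall back to the Pubmed date, else return False."""
    candidates = []
    if len(RealDate) > 8:
        candidates.append((RealDate, '%Y %b %d'))    # e.g. '2015 Feb 16'
    elif len(RealDate) > 4:
        candidates.append((RealDate, '%Y %b'))       # e.g. '2015 Feb'
    candidates.append((PubmedDate, '%Y %m %d'))      # e.g. '2015 02 16'
    for value, fmt in candidates:
        try:
            return datetime.strptime(value, fmt).date()
        except ValueError:
            continue
    return False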
scrap_pubmed/MedlineFetcherDavid2015.py  +16 -4

@@ -12,6 +12,7 @@ import time
from lxml import etree
import datetime
from django.core.files import File
import codecs
import threading
from queue import Queue

@@ -39,6 +40,7 @@ class MedlineFetcher:
        "Get number of results for query 'query' in variable 'count'"
        "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
        print(query)
        origQuery = query
        query = query.replace(' ', '%20')

@@ -92,10 +94,10 @@ class MedlineFetcher:
    def downloadFile(self, item):
        url = item[0]
        filename = item[1]
        print("\t in downloadFile:")
        print(url, filename)
        print("\t in test_downloadFile:")
        # print(url,filename)
        data = urlopen(url)
        f = open(filename, 'w')
        f = codecs.open(filename, "w", encoding='utf-8')
        myfile = File(f)
        myfile.write(data.read().decode('utf-8'))
        myfile.close()

@@ -104,6 +106,13 @@ class MedlineFetcher:
        print(threading.current_thread().name, filename + " OK")
        return filename

    # generic!
    def test_downloadFile(self, item):
        url = item[0]
        filename = item[1]
        print("\t in downloadFile:")
        data = urlopen(url)
        return data

    # generic!
    def do_work(self, item):

@@ -123,7 +132,10 @@ class MedlineFetcher:
    def worker2(self):
        while True:
            item = self.q.get()
            self.firstResults.append(self.downloadFile(item))
            results = []
            try:
                result = self.downloadFile(item)
            except:
                result = False
            self.firstResults.append(result)
            self.q.task_done()

    def chunks(self, l, n):
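The worker2() change wraps each download in try/except, so one failing URL no longer kills the worker thread: the failure is recorded as False and the queue item is still marked done, which keeps q.join() from blocking forever. A self-contained sketch of that pattern, with names following the diff and a stand-in downloader:

import threading
from queue import Queue
from urllib.request import urlopen

class Fetcher:
    """Sketch of the MedlineFetcher queue pattern after this commit."""
    def __init__(self):
        self.q = Queue()
        self.firstResults = []

    def downloadFile(self, item):          # stand-in for the real method
        url, filename = item
        return urlopen(url).read()

    def worker2(self):
        while True:
            item = self.q.get()
            try:
                result = self.downloadFile(item)
            except Exception:
                result = False             # keep the slot, record the failure
            self.firstResults.append(result)
            self.q.task_done()             # always called, so q.join() returns

# usage: start daemon threads on worker2, enqueue [url, filename] pairs, then q.join()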
scrap_pubmed/views.py  +54 -27

@@ -43,6 +43,32 @@ def getGlobalStats(request ):
    return JsonHttpResponse(data)

def getGlobalStatsISTEXT(request ):
    print(request.method)
    alist = ["bar", "foo"]
    if request.method == "POST":
        N = 100
        query = request.POST["query"]
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
        query_string = query.replace(" ", "+")
        url = "http://api.istex.fr/document/?q=" + query_string
        tasks = MedlineFetcher()
        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
        try:
            thedata = tasks.test_downloadFile([url, filename])
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]
    data = alist
    return JsonHttpResponse(data)

def doTheQuery(request , project_id):
    alist = ["hola", "mundo"]

@@ -85,36 +111,36 @@ def doTheQuery(request , project_id):
        corpus.save()

        try:
            tasks = MedlineFetcher()
            tasks.ensure_dir(MEDIA_ROOT + '/corpora/' + str(request.user) + "/")
            # configuring your queue with the event
            for i in range(8):
                t = threading.Thread(target=tasks.worker2)  #thing to do
                t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
                t.start()
            for url in urlreqs:
                filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
                tasks.q.put([url, filename])  #put a task in th queue
            tasks.q.join()  # wait until everything is finished
            for filename in tasks.firstResults:
                corpus.add_resource(user=request.user, type=resource_type, file=filename)
            # do the WorkFlow
            try:
                if DEBUG is True:
                    corpus.workflow()
                else:
                    corpus.workflow.apply_async((), countdown=3)

        tasks = MedlineFetcher()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  #thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
            tasks.q.put([url, filename])  #put a task in th queue
        tasks.q.join()  # wait until everything is finished

                return JsonHttpResponse(["workflow", "finished"])
        except Exception as error:
            print(error)

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                corpus.add_resource(user=request.user, type=resource_type, file=filename)
                dwnldsOK += 1
        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        return JsonHttpResponse(["workflow", "finished", "outside the try-except"])

        # do the WorkFlow
        try:
            if DEBUG is True:
                corpus.workflow()
            else:
                corpus.workflow.apply_async((), countdown=3)
            return JsonHttpResponse(["workflow", "finished"])
        except Exception as error:
            print("lele", error)
            print(error)
        return JsonHttpResponse(["workflow", "finished", "outside the try-except"])

    data = alist
    return JsonHttpResponse(data)

@@ -146,7 +172,8 @@ def testISTEX(request , project_id):
        urlreqs.append("http://api.istex.fr/document/?q=" + query_string + "&output=*&" + "from=" + str(k[0]) + "&size=" + str(pagesize))
    print(urlreqs)
    # urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
    urlreqs = ["http://localhost/374255", "http://localhost/374278"]
    print(urlreqs)
    resource_type = ResourceType.objects.get(name="istext")
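The doTheQuery() rewrite only attaches downloads that actually succeeded and refuses to launch the workflow when none did. A condensed sketch of that control flow; corpus, tasks and resource_type are stand-ins for the objects used in the view:

def attach_and_run(corpus, tasks, resource_type, user, debug=True):
    """Sketch: add only successful downloads, bail out if there were none."""
    dwnldsOK = 0
    for filename in tasks.firstResults:
        if filename is not False:          # worker2() stores False on failure
            corpus.add_resource(user=user, type=resource_type, file=filename)
            dwnldsOK += 1
    if dwnldsOK == 0:
        return ["fail"]
    try:
        if debug:
            corpus.workflow()              # run synchronously
        else:
            corpus.workflow.apply_async((), countdown=3)   # deferred Celery task
        return ["workflow", "finished"]
    except Exception as error:
        print(error)
        return ["workflow", "finished", "outside the try-except"]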
templates/project.html  +80 -37

@@ -313,44 +313,87 @@
            console.log("disabling " + "#" + value.id)
            $("#" + value.id).prop('onclick', null);

            $.ajax({
                // contentType: "application/json",
                url: window.location.origin + "/tests/pubmedquery",
                data: formData,
                type: 'POST',
                beforeSend: function(xhr) {
                    xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
                },
                success: function(data) {
                    console.log("in getGlobalResults")
                    console.log(data)
                    console.log("enabling " + "#" + value.id)
                    $("#" + value.id).attr('onclick', 'getGlobalResults(this);');
                    // $("#submit_thing").prop('disabled' , false)
                    $("#submit_thing").html("Process a 100 sample!")
                    thequeries = data
                    var N = 0, k = 0;
                    for (var i in thequeries) N += thequeries[i].count
                    if (N > 0) {
                        $("#theresults").html("<i> <b>" + pubmedquery + "</b>: " + N + " publications in the last 5 years</i><br>")
                        $('#submit_thing').prop('disabled', false);
                    } else {
                        $("#theresults").html("<i> <b>" + pubmedquery + "</b>: No results!.</i><br>")
                        $('#submit_thing').prop('disabled', true);
                    }

            var theType = $("#id_type option:selected").html();

            if (theType == "pubmed") {
                $.ajax({
                    // contentType: "application/json",
                    url: window.location.origin + "/tests/pubmedquery",
                    data: formData,
                    type: 'POST',
                    beforeSend: function(xhr) {
                        xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
                    },
                    success: function(data) {
                        console.log("in getGlobalResults")
                        console.log(data)
                        console.log("enabling " + "#" + value.id)
                        $("#" + value.id).attr('onclick', 'getGlobalResults(this);');
                        // $("#submit_thing").prop('disabled' , false)
                        $("#submit_thing").html("Process a 100 sample!")
                        thequeries = data
                        var N = 0, k = 0;
                        for (var i in thequeries) N += thequeries[i].count
                        if (N > 0) {
                            $("#theresults").html("<i> <b>" + pubmedquery + "</b>: " + N + " publications in the last 5 years</i><br>")
                            $('#submit_thing').prop('disabled', false);
                        } else {
                            $("#theresults").html("<i> <b>" + pubmedquery + "</b>: No results!.</i><br>")
                            $('#submit_thing').prop('disabled', true);
                        }
                    },
                    error: function(result) {
                        console.log("Data not found");
                    }
                });
                },
                error: function(result) {
                    console.log("Data not found");
                }
            });
            }

            if (theType == "istext") {
                console.log(window.location.origin + "tests/istextquery")
                $.ajax({
                    // contentType: "application/json",
                    url: window.location.origin + "/tests/istextquery",
                    data: formData,
                    type: 'POST',
                    beforeSend: function(xhr) {
                        xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
                    },
                    success: function(data) {
                        console.log("in getGlobalResults")
                        console.log(data)
                        console.log("enabling " + "#" + value.id)
                        $("#" + value.id).attr('onclick', 'getGlobalResults(this);');
                        // $("#submit_thing").prop('disabled' , false)
                        $("#submit_thing").html("Process a 100 sample!")
                        thequeries = data
                        var N = data.length, k = 0;
                        console.log("N: " + N)
                        // for(var i in thequeries) N += thequeries[i].count
                        if (N > 1) {
                            var total = JSON.parse(data).total
                            $("#theresults").html("<i> <b>" + pubmedquery + "</b>: " + total + " publications.</i><br>")
                            $('#submit_thing').prop('disabled', false);
                        } else {
                            $("#theresults").html("<i> <b>" + data[0] + "</b></i><br>")
                            $('#submit_thing').prop('disabled', true);
                        }
                    },
                    error: function(result) {
                        console.log("Data not found");
                    }
                });
            }
        }

        // CSS events for selecting one Radio-Input
        function FileOrNotFile(value) {
            var showfile = JSON.parse(value)
            var theType = $("#id_type option:selected").html();
            // @upload-file events
            if (showfile) {
                console.log("You've clicked the YES")

@@ -376,7 +419,7 @@
            $("#id_name").on('input', function(e) {
                console.log($(this).val())
                testAjax($(this).val())
                if (theType == "pubmed")
                    testPUBMED($(this).val())
            });
        }
    }

@@ -384,8 +427,8 @@
    //CSS events for changing the Select element
    function CustomForSelect(selected) {
        // show Radio-Inputs and trigger FileOrNotFile>@upload-file events
        if (selected == "pubmed") {
            console.log("show the button")
        if (selected == "pubmed" || selected == "istext") {
            console.log("show the button for: " + selected)
            $("#pubmedcrawl").css("visibility", "visible");
            $("#pubmedcrawl").show();
            $("#file_yes").click();

@@ -414,7 +457,7 @@
        return data;
    }

    function testAjax(query) {
    function testPUBMED(query) {
        LastData = []
        if (!query || query == "") return;
        var pubmedquery = encodeURIComponent(query)

@@ -450,7 +493,7 @@
        success: function(data) {
            console.log("ajax_success: in testISTEX()")
            console.log(data)
            location.reload();
            // location.reload();
        },
        error: function(result) {
            console.log("in testISTEX(). Data not found");
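The template now branches on the selected source: the pubmed handler sums a count field over the objects it receives, while the istext handler parses the response body and reads its total field. A small sketch of the two payload shapes this client code implies, with illustrative values only and field names beyond count and total being assumptions:

import json

# Shape the "pubmed" branch iterates over: N += thequeries[i].count
pubmed_payload = [
    {"string": "2011[dp] query", "count": 12},   # "string" is an assumed field name
    {"string": "2012[dp] query", "count": 7},
]

# Shape the "istext" branch parses: JSON.parse(data).total
istext_payload = json.dumps({"total": 1234})      # other ISTEX fields omitted

print(sum(item["count"] for item in pubmed_payload))   # 19
print(json.loads(istext_payload)["total"])             # 1234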