Project: humanities / gargantext

Commit a99a158b
authored Feb 01, 2016 by delanoe
Merge remote-tracking branch 'origin/romain-unstable-N_query' into unstable
Parents: 355ebe6b 502c5086

Showing 5 changed files, with 118 additions and 27 deletions:
    gargantext.ini                        +12   -0
    gargantext_web/views_optimized.py     +17   -2
    scrappers/scrap_pubmed/views.py       +68  -14
    templates/project.html                +19  -11
    tests/urls.py                          +2   -0
gargantext.ini

@@ -59,3 +59,15 @@ max-requests = 5000
 uid = 1000
 gid = 1000
+
+################### other gargantext constants ###################
+
+[scrappers]
+# default number of docs POSTed to scrappers.views.py
+# (at page project > add a corpus > scan/process sample)
+QUERY_SIZE_N_DEFAULT = 1000
+
+# checked just before scrap to prevent running impossible workflows
+# even if somebody would set "query size N" manually in POST data
+QUERY_SIZE_N_MAX = 20000
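Both Python modules changed below read this new [scrappers] section at import time (the French comment above their import block says it is there to read the [scrappers] section of gargantext.ini). As a minimal standalone sketch of that pattern, assuming the .ini file sits next to the module and using a local stand-in for gargantext_web.settings.BASE_DIR, and remembering that ConfigParser returns strings:

# Minimal sketch, not project code: reading the new [scrappers] constants.
# BASE_DIR here is a stand-in for gargantext_web.settings.BASE_DIR.
from configparser import ConfigParser
from os import path

BASE_DIR = path.dirname(path.abspath(__file__))

CONF = ConfigParser()
with open(path.join(BASE_DIR, 'gargantext.ini')) as inifile:
    CONF.read_file(inifile)

# ConfigParser values are strings, so numeric limits need an explicit cast
QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])   # 1000
QUERY_SIZE_N_MAX     = int(CONF['scrappers']['QUERY_SIZE_N_MAX'])       # 20000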
gargantext_web/views_optimized.py

@@ -18,7 +18,7 @@ from gargantext_web.db import *
 from gargantext_web.db import get_or_create_node
 from gargantext_web.views import session
-from gargantext_web.settings import DEBUG, MEDIA_ROOT
+from gargantext_web.settings import DEBUG, MEDIA_ROOT, BASE_DIR
 from rest_v1_0.api import JsonHttpResponse
 from django.db import connection

@@ -32,6 +32,19 @@ from gargantext_web.celery import apply_workflow
 from admin.utils import ensure_dir

+# pour lire la section [scrappers] de gargantext.ini
+from configparser import ConfigParser
+from os import path
+
+# --------------------------------------------------------------------
+# importing constants from config file
+CONF = ConfigParser()
+with open(path.join(BASE_DIR, 'gargantext.ini')) as inifile:
+    CONF.read_file(inifile)
+
+QUERY_SIZE_N_DEFAULT = CONF['scrappers']['QUERY_SIZE_N_DEFAULT']
+# --------------------------------------------------------------------
+
 def project(request, project_id):
     # do we have a valid project id?
     try:

@@ -64,7 +77,7 @@ def project(request, project_id):
     if not in_group:
         return JsonHttpResponse({"request": "forbidden"})

-    # Let's find out about the children nodes of the project
+    # Let's find out about the children nodes of the corpus
     ChildrenNode = aliased(Node)

     # This query is giving you the wrong number of docs from the pubmedquerier (x 5)
     # ... sqlalchemy.func by Resource.type_id is the guilty

@@ -196,6 +209,8 @@ def project(request, project_id):
         'blacklists': '',
         'cooclists': '',
         'number': corpora_count,
+        'query_size': QUERY_SIZE_N_DEFAULT,
+        'user_is_admin': user.is_superuser
     })

 def tfidf(request, corpus_id, ngram_ids):
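Note that views_optimized.py keeps QUERY_SIZE_N_DEFAULT as the raw ConfigParser string (no int() cast): it is only handed to the template context as 'query_size' and parsed client-side with parseInt() in templates/project.html below. A hypothetical, much-reduced sketch of that hand-off (the real project() view builds a far larger context and runs permission and corpus queries first):

# Hypothetical, much-reduced sketch of the context change in project();
# not the real view, which assembles many more keys before rendering.
from django.shortcuts import render

QUERY_SIZE_N_DEFAULT = "1000"   # stand-in for the value read from gargantext.ini (a string)

def project(request, project_id):
    # ... permission checks and corpora counting happen here in the real view ...
    return render(request, 'project.html', {
        'query_size': QUERY_SIZE_N_DEFAULT,          # parsed with parseInt() in the template JS
        'user_is_admin': request.user.is_superuser,  # the template notes "TODO if is_admin"
    })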
scrappers/scrap_pubmed/views.py

@@ -30,7 +30,7 @@ import threading
 from node.admin import CustomForm
 from gargantext_web.db import *
 from gargantext_web.db import get_sessionmaker, session, get_session
-from gargantext_web.settings import DEBUG, MEDIA_ROOT
+from gargantext_web.settings import DEBUG, MEDIA_ROOT, BASE_DIR
 from rest_v1_0.api import JsonHttpResponse
 from parsing.corpustools import add_resource, parse_resources, extract_ngrams

@@ -41,16 +41,50 @@ from time import sleep
 from admin.utils import ensure_dir

+# pour lire la section [scrappers] de gargantext.ini
+from configparser import ConfigParser
+from os import path
+
+# --------------------------------------------------------------------
+# importing constants from config file
+CONF = ConfigParser()
+with open(path.join(BASE_DIR, 'gargantext.ini')) as inifile:
+    CONF.read_file(inifile)
+
+QUERY_SIZE_N_MAX = int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
+# QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
+# --------------------------------------------------------------------

 def getGlobalStats(request):
+    """
+    Pubmed year by year results
+
+    # alist = [
+    #     {'string': '2011[dp] serendipity', 'queryKey': '1',
+    #      'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
+    #     {'string': '2012[dp] serendipity', 'queryKey': '1',
+    #      'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
+    #     ... ]
+
+    (reused as thequeries in doTheQuery)
+    """
     print(request.method)
-    alist = ["bar", "foo"]
+    alist = []

     if request.method == "POST":
-        N = 1000
         query = request.POST["query"]
+        N = int(request.POST["N"])
+
+        if N > QUERY_SIZE_N_MAX:
+            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
+            print("ERROR(scrap: pubmed stats): ", msg)
+            raise ValueError(msg)
+
         print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
         print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
         instancia = MedlineFetcher()

+        # serialFetcher (n_last_years, query, query_size)
         alist = instancia.serialFetcher(5, query, N)

     data = alist

@@ -59,12 +93,17 @@ def getGlobalStats(request ):
 def getGlobalStatsISTEXT(request):
+    """
+    ISTEX simply the total of hits for a query
+    (not reused in testISTEX)
+    """
     print(request.method)
     alist = ["bar", "foo"]

     if request.method == "POST":
-        N = 1000
         query = request.POST["query"]
+        N = int(request.POST["N"])
         print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
         print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
         query_string = query.replace(" ", "+")

@@ -109,11 +148,18 @@ def doTheQuery(request , project_id):
     if request.method == "POST":
-        query = request.POST["query"]
+        queries = request.POST["query"]
         name = request.POST["string"]

+        # here we just realize queries already prepared by getGlobalStats
+        # ===> no need to repeat N parameter like in testISTEX <===
+
         instancia = MedlineFetcher()
-        thequeries = json.loads(query)
+        thequeries = json.loads(queries)

+        # fyi the sum of our prepared yearly proportional quotas
+        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
+        print("Scrapping Pubmed: '%s' (N=%i)" % (name, sampled_sum))
+
         urlreqs = []
         for yearquery in thequeries:

@@ -214,15 +260,22 @@ def testISTEX(request , project_id):
     if request.method == "POST":
         # print(alist)
         query = "-"
         query_string = "-"
-        N = 1000
+        N = 0

         if "query" in request.POST:
             query = request.POST["query"]
-        if "string" in request.POST:
-            query_string = request.POST["string"].replace(" ", "+")
-        # if "N" in request.POST: N = request.POST["N"]
-        print(query_string, query, N)
+            query_string = query.replace(" ", "+")    # url encoded q
+
+        if "N" in request.POST:
+            N = int(request.POST["N"])                # query_size from views_opti
+            if N > QUERY_SIZE_N_MAX:
+                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
+                print("ERROR (scrap: istex d/l ): ", msg)
+                raise ValueError(msg)
+
+        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

         urlreqs = []
         pagesize = 50

@@ -248,6 +301,7 @@ def testISTEX(request , project_id):
         session.commit()
         corpus_id = corpus.id
+        print("NEW CORPUS", corpus_id)
         ensure_dir(request.user)
         tasks = MedlineFetcher()
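The same guard now appears twice above: getGlobalStats() and testISTEX() both cast N from the POST data and refuse anything above QUERY_SIZE_N_MAX. A hedged sketch that factors the repeated check into a helper (check_query_size is not in the repository; it only restates the rule added in the two views):

# Hypothetical helper, not repository code: restates the N-cap guard
# duplicated in getGlobalStats() and testISTEX().
QUERY_SIZE_N_MAX = 20000   # stand-in for the value read from gargantext.ini [scrappers]

def check_query_size(n, where=""):
    """Return n as an int, raising ValueError when it exceeds the configured cap."""
    n = int(n)
    if n > QUERY_SIZE_N_MAX:
        msg = "Invalid sample size N = %i (max = %i)" % (n, QUERY_SIZE_N_MAX)
        print("ERROR(%s): " % where, msg)
        raise ValueError(msg)
    return n

# usage mirroring the added code:
#   N = check_query_size(request.POST["N"], where="scrap: pubmed stats")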
templates/project.html

@@ -249,7 +249,12 @@
         return cookieValue;
     }

-    var thequeries = []
+    var thequeries = [] ;
+
+    // load the template's value for N scan size
+    var querySize = parseInt({{ query_size }});
+
+    // TODO if is_admin

     function doTheQuery() {
         if ( $('#submit_thing').prop('disabled') ) return;

@@ -257,7 +262,11 @@
         var origQuery = $("#id_name").val()

-        var pubmedifiedQuery = { query : JSON.stringify(thequeries) , string : origQuery } ;
+        var pubmedifiedQuery = { query : JSON.stringify(thequeries) , string : origQuery , N : querySize } ;
         console.log(pubmedifiedQuery)

         var projectid = window.location.href.split("project")[1].replace(/\//g, '') //replace all the slashes

@@ -299,7 +308,7 @@
                 var origQuery = $("#id_name").val()
                 console.log("printing the results:")
                 console.log(origQuery)
-                testISTEX(origQuery.replace(" ", "+"), 1000)
+                testISTEX(origQuery.replace(" ", "+"), querySize)
             }
         }
         else {

@@ -324,9 +333,9 @@
         console.log("in getGlobalResults()")
         // AJAX to django
         var pubmedquery = $("#id_name").val()
-        var Npubs = $("#id_N").val();
+        // var Npubs = $("#id_N").val();

         if (pubmedquery == "") return;

-        var formData = {query: pubmedquery, N: Npubs}
+        var formData = {query: pubmedquery, N: querySize}

         $("#theresults").html('<img width="30px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img>')
         console.log("disabling " + "#" + value.id)
         $("#"+value.id).prop('onclick', null);

@@ -349,7 +358,7 @@
                 console.log("enabling " + "#" + value.id)
                 $("#"+value.id).attr('onclick', 'getGlobalResults(this);');
                 // $("#submit_thing").prop('disabled' , false)
-                $("#submit_thing").html("Process a 1000 sample!")
+                $("#submit_thing").html("Process a {{ query_size }} sample!")

                 thequeries = data
                 var N = 0, k = 0;

@@ -388,7 +397,7 @@
                 console.log("enabling " + "#" + value.id)
                 $("#"+value.id).attr('onclick', 'getGlobalResults(this);');
                 // $("#submit_thing").prop('disabled' , false)
-                $("#submit_thing").html("Process a 1000 sample!")
+                $("#submit_thing").html("Process a {{ query_size }} sample!")

                 thequeries = data
                 var N = data.length, k = 0;

@@ -494,20 +503,19 @@
             return false;
         }

-    function testISTEX(query, Npubs) {
+    function testISTEX(query, N) {
         console.log("in testISTEX:");
         if(!query || query=="") return;

         var origQuery = query

-        var pubmedifiedQuery = { query : query , string : query }
+        var postQuery = { query : query , N : N }

         var projectid = window.location.href.split("project")[1].replace(/\//g, '') //replace all the slashes

         $.ajax({
             // contentType: "application/json",
             url: window.location.origin + "/tests/project/" + projectid + "/ISTEXquery/go",
-            data: pubmedifiedQuery,
+            data: postQuery,
             type: 'POST',
             beforeSend: function(xhr) {
                 xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
tests/urls.py

@@ -14,6 +14,8 @@ urlpatterns = patterns('',
     url(r'paginator/corpus/(\d+)/$', views.newpaginatorJSON),
     url(r'move2trash/$', views.move_to_trash_multiple),
+    # TODO correct and move to scappers
     url(r'istextquery$', pubmedscrapper.getGlobalStatsISTEXT), # api/query?type=istext ?
     url(r'pubmedquery$', pubmedscrapper.getGlobalStats),
     url(r'project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
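End to end, the template now POSTs both query and N to these routes, and the views reject anything above QUERY_SIZE_N_MAX. A hedged sketch of exercising that with Django's test client, assuming tests/urls.py is mounted under /tests/ (as the template's AJAX URL "/tests/project/<id>/ISTEXquery/go" suggests) and ignoring any authentication the real views may expect:

# Hedged sketch, run inside the project's Django environment; the /tests/ prefix
# is an assumption inferred from the template's AJAX URLs.
from django.test import Client

c = Client()

# within the cap: getGlobalStats() reads N = int(request.POST["N"]) and proceeds
resp = c.post('/tests/pubmedquery', {'query': 'serendipity', 'N': '1000'})

# above QUERY_SIZE_N_MAX (20000 in gargantext.ini): the view raises ValueError
try:
    c.post('/tests/pubmedquery', {'query': 'serendipity', 'N': '50000'})
except ValueError as err:
    print("rejected:", err)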