Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
ed1311f3
Commit
ed1311f3
authored
Jan 27, 2015
by
PkSM3
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEATURE] dynamic query for pubmed: OK
parent
44dae6cb
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
76 additions
and
158 deletions
+76
-158
functions.py
analysis/functions.py
+0
-1
urls.py
gargantext_web/urls.py
+1
-0
views.py
gargantext_web/views.py
+13
-85
models.py
node/models.py
+6
-1
MedlineFetcherDavid2015.py
scrap_pubmed/MedlineFetcherDavid2015.py
+12
-12
views.py
scrap_pubmed/views.py
+38
-57
project.html
templates/project.html
+6
-2
No files found.
analysis/functions.py
View file @
ed1311f3
...
@@ -269,7 +269,6 @@ from analysis.tfidf import tfidf
...
@@ -269,7 +269,6 @@ from analysis.tfidf import tfidf
def
do_tfidf
(
corpus
,
reset
=
True
):
def
do_tfidf
(
corpus
,
reset
=
True
):
print
(
"doing tfidf"
)
print
(
"doing tfidf"
)
print
(
"
\t
"
,
corpus
.
type
)
with
transaction
.
atomic
():
with
transaction
.
atomic
():
if
reset
==
True
:
if
reset
==
True
:
NodeNodeNgram
.
objects
.
filter
(
nodex
=
corpus
)
.
delete
()
NodeNodeNgram
.
objects
.
filter
(
nodex
=
corpus
)
.
delete
()
...
...
gargantext_web/urls.py
View file @
ed1311f3
...
@@ -67,6 +67,7 @@ urlpatterns = patterns('',
...
@@ -67,6 +67,7 @@ urlpatterns = patterns('',
url
(
r'^nodeinfo/(\d+)$'
,
views
.
nodeinfo
),
url
(
r'^nodeinfo/(\d+)$'
,
views
.
nodeinfo
),
url
(
r'^tests/mvc$'
,
views
.
tests_mvc
),
url
(
r'^tests/mvc$'
,
views
.
tests_mvc
),
url
(
r'^tests/mvc-listdocuments$'
,
views
.
tests_mvc_listdocuments
),
url
(
r'^tests/mvc-listdocuments$'
,
views
.
tests_mvc_listdocuments
),
url
(
r'^tests/pubmedquery$'
,
pubmedscrapper
.
getGlobalStats
),
url
(
r'^tests/pubmedquery$'
,
pubmedscrapper
.
getGlobalStats
),
url
(
r'^tests/project/(\d+)/pubmedquery/go$'
,
pubmedscrapper
.
doTheQuery
)
url
(
r'^tests/project/(\d+)/pubmedquery/go$'
,
pubmedscrapper
.
doTheQuery
)
...
...
gargantext_web/views.py
View file @
ed1311f3
...
@@ -212,6 +212,8 @@ def project(request, project_id):
...
@@ -212,6 +212,8 @@ def project(request, project_id):
cooclists
=
""
#.children.filter(type=type_cooclist)
cooclists
=
""
#.children.filter(type=type_cooclist)
for
corpus
in
corpora
:
for
corpus
in
corpora
:
# print("corpus", corpus.pk , corpus.name , corpus.type_id)
docs_count
=
corpus
.
children
.
count
()
docs_count
=
corpus
.
children
.
count
()
docs_total
+=
docs_count
docs_total
+=
docs_count
...
@@ -219,10 +221,17 @@ def project(request, project_id):
...
@@ -219,10 +221,17 @@ def project(request, project_id):
corpus_view
[
'id'
]
=
corpus
.
pk
corpus_view
[
'id'
]
=
corpus
.
pk
corpus_view
[
'name'
]
=
corpus
.
name
corpus_view
[
'name'
]
=
corpus
.
name
corpus_view
[
'count'
]
=
corpus
.
children
.
count
()
corpus_view
[
'count'
]
=
corpus
.
children
.
count
()
for
node_resource
in
Node_Resource
.
objects
.
filter
(
node
=
corpus
):
#just get first element of the corpora and get his type.
donut_part
[
node_resource
.
resource
.
type
]
+=
docs_count
corpus_type
=
Node_Resource
.
objects
.
filter
(
node
=
corpus
)[
0
]
.
resource
.
type
list_corpora
[
node_resource
.
resource
.
type
.
name
]
.
append
(
corpus_view
)
list_corpora
[
corpus_type
]
.
append
(
corpus_view
)
## For avoiding to list repeated elements, like when u use the dynamic query (per each xml, 1)
# for node_resource in Node_Resource.objects.filter(node=corpus):
# print( "node_resource.id:",node_resource.id , node_resource.resource.file )
# donut_part[node_resource.resource.type] += docs_count
# list_corpora[node_resource.resource.type.name].append(corpus_view)
# print(node_resource.resource.type.name)
list_corpora
=
dict
(
list_corpora
)
list_corpora
=
dict
(
list_corpora
)
if
docs_total
==
0
or
docs_total
is
None
:
if
docs_total
==
0
or
docs_total
is
None
:
...
@@ -235,8 +244,6 @@ def project(request, project_id):
...
@@ -235,8 +244,6 @@ def project(request, project_id):
if
request
.
method
==
'POST'
:
if
request
.
method
==
'POST'
:
print
(
"original file:"
)
print
(
request
.
FILES
)
form
=
CustomForm
(
request
.
POST
,
request
.
FILES
)
form
=
CustomForm
(
request
.
POST
,
request
.
FILES
)
if
form
.
is_valid
():
if
form
.
is_valid
():
...
@@ -249,9 +256,6 @@ def project(request, project_id):
...
@@ -249,9 +256,6 @@ def project(request, project_id):
print
(
"-------------"
)
print
(
"-------------"
)
print
(
name
,
"|"
,
resource_type
,
"|"
,
thefile
)
print
(
name
,
"|"
,
resource_type
,
"|"
,
thefile
)
print
(
"-------------"
)
print
(
"-------------"
)
print
(
"new file:"
)
print
(
thefile
)
try
:
try
:
parent
=
Node
.
objects
.
get
(
id
=
project_id
)
parent
=
Node
.
objects
.
get
(
id
=
project_id
)
...
@@ -280,8 +284,6 @@ def project(request, project_id):
...
@@ -280,8 +284,6 @@ def project(request, project_id):
corpus
.
save
()
corpus
.
save
()
print
(
request
.
user
,
resource_type
,
thefile
)
corpus
.
add_resource
(
corpus
.
add_resource
(
user
=
request
.
user
,
user
=
request
.
user
,
type
=
resource_type
,
type
=
resource_type
,
...
@@ -324,80 +326,6 @@ def project(request, project_id):
...
@@ -324,80 +326,6 @@ def project(request, project_id):
})
})
else
:
else
:
form
=
CustomForm
()
form
=
CustomForm
()
# if request.method == 'POST':
# #form = CorpusForm(request.POST, request.FILES)
# #print(str(request.POST))
# name = str(request.POST['name'])
# try:
# resource_type = ResourceType.objects.get(id=str(request.POST['type']))
# except Exception as error:
# print(error)
# resource_type = None
# try:
# file = request.FILES['file']
# except Exception as error:
# print(error)
# file = None
# #if name != "" and resource_type is not None and file is not None:
# try:
# parent = Node.objects.get(id=project_id)
# node_type = NodeType.objects.get(name='Corpus')
# if resource_type.name == "europress_french":
# language = Language.objects.get(iso2='fr')
# elif resource_type.name == "europress_english":
# language = Language.objects.get(iso2='en')
# try:
# corpus = Node(
# user=request.user,
# parent=parent,
# type=node_type,
# language=language,
# name=name,
# )
# except:
# corpus = Node(
# user=request.user,
# parent=parent,
# type=node_type,
# name=name,
# )
# corpus.save()
# print(request.user, resource_type , file )
# print(corpus.language)
# corpus.add_resource(
# user=request.user,
# type=resource_type,
# file=file
# )
# try:
# #corpus.parse_and_extract_ngrams()
# #corpus.parse_and_extract_ngrams.apply_async((), countdown=3)
# if DEBUG is True:
# corpus.workflow()
# else:
# corpus.workflow.apply_async((), countdown=3)
# except Exception as error:
# print(error)
# return HttpResponseRedirect('/project/' + str(project_id))
# except Exception as error:
# print('ee', error)
# form = CorpusForm(request=request)
# formResource = ResourceForm()
# else:
# form = CorpusForm(request=request)
# formResource = ResourceForm()
return
render
(
request
,
'project.html'
,
{
return
render
(
request
,
'project.html'
,
{
'form'
:
form
,
'form'
:
form
,
...
...
node/models.py
View file @
ed1311f3
...
@@ -236,12 +236,17 @@ class Node(CTENode):
...
@@ -236,12 +236,17 @@ class Node(CTENode):
@
current_app
.
task
(
filter
=
task_method
)
@
current_app
.
task
(
filter
=
task_method
)
def
workflow
(
self
,
keys
=
None
,
ngramsextractorscache
=
None
,
ngramscaches
=
None
,
verbose
=
False
):
def
workflow
(
self
,
keys
=
None
,
ngramsextractorscache
=
None
,
ngramscaches
=
None
,
verbose
=
False
):
print
(
"In workflow()
START
"
)
print
(
"In workflow()
parse_resources()
"
)
self
.
parse_resources
()
self
.
parse_resources
()
print
(
"In workflow() / parse_resources()"
)
print
(
"In workflow() extract_ngrams()"
)
type_document
=
NodeType
.
objects
.
get
(
name
=
'Document'
)
type_document
=
NodeType
.
objects
.
get
(
name
=
'Document'
)
self
.
children
.
filter
(
type_id
=
type_document
.
pk
)
.
extract_ngrams
(
keys
=
[
'title'
,])
self
.
children
.
filter
(
type_id
=
type_document
.
pk
)
.
extract_ngrams
(
keys
=
[
'title'
,])
print
(
"In workflow() / extract_ngrams()"
)
print
(
"In workflow() do_tfidf()"
)
from
analysis.functions
import
do_tfidf
from
analysis.functions
import
do_tfidf
do_tfidf
(
self
)
do_tfidf
(
self
)
print
(
"In workflow() / do_tfidf()"
)
print
(
"In workflow() END"
)
print
(
"In workflow() END"
)
class
Node_Metadata
(
models
.
Model
):
class
Node_Metadata
(
models
.
Model
):
...
...
scrap_pubmed/MedlineFetcherDavid2015.py
View file @
ed1311f3
...
@@ -56,6 +56,7 @@ class MedlineFetcher:
...
@@ -56,6 +56,7 @@ class MedlineFetcher:
# webEnv = doc.xpathEval('eSearchResult/WebEnv/text()')[0]
# webEnv = doc.xpathEval('eSearchResult/WebEnv/text()')[0]
# print count, queryKey, webEnv
# print count, queryKey, webEnv
values
=
{
"count"
:
int
(
str
(
count
)),
"queryKey"
:
queryKey
,
"webEnv"
:
webEnv
}
values
=
{
"count"
:
int
(
str
(
count
)),
"queryKey"
:
queryKey
,
"webEnv"
:
webEnv
}
print
(
values
)
return
values
return
values
...
@@ -126,28 +127,27 @@ class MedlineFetcher:
...
@@ -126,28 +127,27 @@ class MedlineFetcher:
# medlineEfetchRAW(str(year) + '[dp] '+query , retmax=300)
# medlineEfetchRAW(str(year) + '[dp] '+query , retmax=300)
pubmedquery
=
str
(
year
)
+
'[dp] '
+
query
pubmedquery
=
str
(
year
)
+
'[dp] '
+
query
globalresults
=
self
.
medlineEsearch
(
pubmedquery
)
globalresults
=
self
.
medlineEsearch
(
pubmedquery
)
N
+=
globalresults
[
"count"
]
if
globalresults
[
"count"
]
>
0
:
querymetadata
=
{
N
+=
globalresults
[
"count"
]
"string"
:
pubmedquery
,
querymetadata
=
{
"count"
:
globalresults
[
"count"
]
,
"string"
:
pubmedquery
,
"queryKey"
:
globalresults
[
"queryKey"
]
,
"count"
:
globalresults
[
"count"
]
,
"webEnv"
:
globalresults
[
"webEnv"
]
,
"queryKey"
:
globalresults
[
"queryKey"
]
,
"retmax"
:
0
"webEnv"
:
globalresults
[
"webEnv"
]
,
}
"retmax"
:
0
thequeries
.
append
(
querymetadata
)
}
thequeries
.
append
(
querymetadata
)
print
(
"Total Number:"
,
N
,
"publications"
)
print
(
"Total Number:"
,
N
,
"publications"
)
print
(
"And i want just:"
,
globalLimit
,
"publications"
)
print
(
"And i want just:"
,
globalLimit
,
"publications"
)
print
(
"---------------------------------------
\n
"
)
print
(
"---------------------------------------
\n
"
)
for
query
in
thequeries
:
for
i
,
query
in
enumerate
(
thequeries
)
:
k
=
query
[
"count"
]
k
=
query
[
"count"
]
percentage
=
k
/
float
(
N
)
percentage
=
k
/
float
(
N
)
retmax_forthisyear
=
int
(
round
(
globalLimit
*
percentage
))
retmax_forthisyear
=
int
(
round
(
globalLimit
*
percentage
))
query
[
"retmax"
]
=
retmax_forthisyear
query
[
"retmax"
]
=
retmax_forthisyear
# self.medlineEfetchRAW( query )
print
(
'Done !'
)
return
thequeries
return
thequeries
...
...
scrap_pubmed/views.py
View file @
ed1311f3
...
@@ -12,6 +12,11 @@ from gargantext_web.api import JsonHttpResponse
...
@@ -12,6 +12,11 @@ from gargantext_web.api import JsonHttpResponse
from
urllib.request
import
urlopen
,
urlretrieve
from
urllib.request
import
urlopen
,
urlretrieve
import
json
import
json
from
gargantext_web.settings
import
MEDIA_ROOT
from
datetime
import
datetime
from
django.core.files
import
File
from
gargantext_web.settings
import
DEBUG
from
node.models
import
Language
,
ResourceType
,
Resource
,
\
from
node.models
import
Language
,
ResourceType
,
Resource
,
\
Node
,
NodeType
,
Node_Resource
,
Project
,
Corpus
,
\
Node
,
NodeType
,
Node_Resource
,
Project
,
Corpus
,
\
Ngram
,
Node_Ngram
,
NodeNgramNgram
,
NodeNodeNgram
Ngram
,
Node_Ngram
,
NodeNgramNgram
,
NodeNodeNgram
...
@@ -24,7 +29,7 @@ def getGlobalStats(request ):
...
@@ -24,7 +29,7 @@ def getGlobalStats(request ):
if
request
.
method
==
"POST"
:
if
request
.
method
==
"POST"
:
query
=
request
.
POST
[
"query"
]
query
=
request
.
POST
[
"query"
]
instancia
=
MedlineFetcher
()
instancia
=
MedlineFetcher
()
alist
=
instancia
.
serialFetcher
(
5
,
query
,
2
00
)
alist
=
instancia
.
serialFetcher
(
5
,
query
,
1
00
)
data
=
alist
data
=
alist
return
JsonHttpResponse
(
data
)
return
JsonHttpResponse
(
data
)
...
@@ -43,8 +48,6 @@ def doTheQuery(request , project_id):
...
@@ -43,8 +48,6 @@ def doTheQuery(request , project_id):
instancia
=
MedlineFetcher
()
instancia
=
MedlineFetcher
()
thequeries
=
json
.
loads
(
query
)
thequeries
=
json
.
loads
(
query
)
print
(
"------------------"
)
urlreqs
=
[]
urlreqs
=
[]
for
yearquery
in
thequeries
:
for
yearquery
in
thequeries
:
urlreqs
.
append
(
instancia
.
medlineEfetchRAW
(
yearquery
)
)
urlreqs
.
append
(
instancia
.
medlineEfetchRAW
(
yearquery
)
)
...
@@ -58,71 +61,49 @@ def doTheQuery(request , project_id):
...
@@ -58,71 +61,49 @@ def doTheQuery(request , project_id):
"""
"""
thefile
=
"how we do this here?"
thefile
=
"how we do this here?"
resource_type
=
ResourceType
()
resource_type
=
ResourceType
.
objects
.
get
(
name
=
"pubmed"
)
resource_type
.
name
=
name
try
:
parent
=
Node
.
objects
.
get
(
id
=
project_id
)
node_type
=
NodeType
.
objects
.
get
(
name
=
'Corpus'
)
type_id
=
NodeType
.
objects
.
get
(
name
=
'Document'
)
.
id
user_id
=
User
.
objects
.
get
(
username
=
request
.
user
)
.
id
corpus
=
Node
(
user
=
request
.
user
,
parent
=
parent
,
type
=
node_type
,
name
=
name
,
)
corpus
.
save
()
parser
=
PubmedFileParser
()
metadata_list
=
[]
for
url
in
urlreqs
:
data
=
urlopen
(
url
)
metadata_list
+=
parser
.
parse
(
data
.
read
()
)
# corpus.add_resource( user=request.user, type=resource_type, file=data.read() )
break
parent
=
Node
.
objects
.
get
(
id
=
project_id
)
node_type
=
NodeType
.
objects
.
get
(
name
=
'Corpus'
)
type_id
=
NodeType
.
objects
.
get
(
name
=
'Document'
)
.
id
user_id
=
User
.
objects
.
get
(
username
=
request
.
user
)
.
id
from
parsing.Caches
import
LanguagesCache
corpus
=
Node
(
langages_cache
=
LanguagesCache
()
user
=
request
.
user
,
for
i
,
metadata_values
in
enumerate
(
metadata_list
):
parent
=
parent
,
name
=
metadata_values
.
get
(
'title'
,
''
)[:
200
]
type
=
node_type
,
language
=
langages_cache
[
metadata_values
[
'language_iso2'
]]
if
'language_iso2'
in
metadata_values
else
None
,
name
=
name
,
if
isinstance
(
language
,
tuple
):
)
language
=
language
[
0
]
Node
(
corpus
.
save
()
user_id
=
user_id
,
type_id
=
type_id
,
name
=
name
,
parent
=
parent
,
language_id
=
language
.
id
if
language
else
None
,
metadata
=
metadata_values
)
.
save
()
parent
.
children
.
all
()
.
make_metadata_filterable
()
try
:
for
url
in
urlreqs
:
type_document
=
NodeType
.
objects
.
get
(
name
=
'Document'
)
print
(
url
)
print
(
"printing here 01"
)
data
=
urlopen
(
url
)
parent
.
children
.
filter
(
type_id
=
type_document
.
pk
)
.
extract_ngrams
(
keys
=
[
'title'
,])
xmlname
=
MEDIA_ROOT
+
'/corpora/
%
s/
%
s.xml'
%
(
request
.
user
,
str
(
datetime
.
now
()
.
microsecond
))
print
(
"printing here 02"
)
f
=
open
(
xmlname
,
'w'
)
myfile
=
File
(
f
)
myfile
.
write
(
data
.
read
()
.
decode
(
'utf-8'
)
)
myfile
.
close
()
f
.
close
()
corpus
.
add_resource
(
user
=
request
.
user
,
type
=
resource_type
,
file
=
xmlname
)
print
(
"now we've to apply do_tfidf..."
)
try
:
if
DEBUG
is
True
:
corpus
.
workflow
()
else
:
corpus
.
workflow
.
apply_async
((),
countdown
=
3
)
return
JsonHttpResponse
([
"workflow"
,
"finished"
])
# thetitles = parent.children.filter(type_id=type_document.pk)
except
Exception
as
error
:
# print(Node.objects.filter(parent=parent))
print
(
error
)
# from analysis.functions import do_tfidf
# do_tfidf(corpus)
print
(
"ca va?"
)
return
JsonHttpResponse
([
"workflow"
,
"finished"
,
"outside the try-except"
]
)
except
Exception
as
error
:
except
Exception
as
error
:
print
(
"lele"
,
error
)
print
(
"lele"
,
error
)
data
=
alist
data
=
alist
return
JsonHttpResponse
(
data
)
return
JsonHttpResponse
(
data
)
\ No newline at end of file
templates/project.html
View file @
ed1311f3
...
@@ -213,6 +213,7 @@
...
@@ -213,6 +213,7 @@
success
:
function
(
data
)
{
success
:
function
(
data
)
{
console
.
log
(
"in doTheQuery()"
)
console
.
log
(
"in doTheQuery()"
)
console
.
log
(
data
)
console
.
log
(
data
)
location
.
reload
();
},
},
error
:
function
(
result
)
{
error
:
function
(
result
)
{
console
.
log
(
"in doTheQuery(). Data not found"
);
console
.
log
(
"in doTheQuery(). Data not found"
);
...
@@ -241,11 +242,14 @@
...
@@ -241,11 +242,14 @@
thequeries
=
data
thequeries
=
data
var
N
=
0
,
k
=
0
;
var
N
=
0
,
k
=
0
;
for
(
var
i
in
thequeries
)
N
+=
thequeries
[
i
].
count
if
(
N
>
0
)
{
for
(
var
i
in
thequeries
)
N
+=
thequeries
[
i
].
count
if
(
N
>
0
)
{
$
(
"#results"
).
html
(
"Result: "
+
N
+
" publications in the last 5 years"
)
$
(
"#results"
).
html
(
"Result: "
+
N
+
" publications in the last 5 years"
)
$
(
'#id_thebutton'
).
prop
(
'disabled'
,
false
);
$
(
'#id_thebutton'
).
prop
(
'disabled'
,
false
);
}
else
{
$
(
"#results"
).
html
(
"No results!."
)
$
(
'#id_thebutton'
).
prop
(
'disabled'
,
true
);
}
}
},
},
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment