Commit ed1311f3 authored by PkSM3

[FEATURE] dynamic query for pubmed: OK

parent 44dae6cb
...@@ -269,7 +269,6 @@ from analysis.tfidf import tfidf ...@@ -269,7 +269,6 @@ from analysis.tfidf import tfidf
def do_tfidf(corpus, reset=True): def do_tfidf(corpus, reset=True):
print("doing tfidf") print("doing tfidf")
print("\t",corpus.type)
with transaction.atomic(): with transaction.atomic():
if reset==True: if reset==True:
NodeNodeNgram.objects.filter(nodex=corpus).delete() NodeNodeNgram.objects.filter(nodex=corpus).delete()
......
...@@ -67,6 +67,7 @@ urlpatterns = patterns('', ...@@ -67,6 +67,7 @@ urlpatterns = patterns('',
url(r'^nodeinfo/(\d+)$', views.nodeinfo), url(r'^nodeinfo/(\d+)$', views.nodeinfo),
url(r'^tests/mvc$', views.tests_mvc), url(r'^tests/mvc$', views.tests_mvc),
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments), url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats), url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery) url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery)
......
...@@ -212,6 +212,8 @@ def project(request, project_id): ...@@ -212,6 +212,8 @@ def project(request, project_id):
cooclists = ""#.children.filter(type=type_cooclist) cooclists = ""#.children.filter(type=type_cooclist)
for corpus in corpora: for corpus in corpora:
# print("corpus", corpus.pk , corpus.name , corpus.type_id)
docs_count = corpus.children.count() docs_count = corpus.children.count()
docs_total += docs_count docs_total += docs_count
...@@ -219,10 +221,17 @@ def project(request, project_id): ...@@ -219,10 +221,17 @@ def project(request, project_id):
corpus_view['id'] = corpus.pk corpus_view['id'] = corpus.pk
corpus_view['name'] = corpus.name corpus_view['name'] = corpus.name
corpus_view['count'] = corpus.children.count() corpus_view['count'] = corpus.children.count()
for node_resource in Node_Resource.objects.filter(node=corpus): #just get first element of the corpora and get his type.
donut_part[node_resource.resource.type] += docs_count corpus_type = Node_Resource.objects.filter(node=corpus)[0].resource.type
list_corpora[node_resource.resource.type.name].append(corpus_view) list_corpora[corpus_type].append(corpus_view)
## For avoiding to list repeated elements, like when u use the dynamic query (per each xml, 1)
# for node_resource in Node_Resource.objects.filter(node=corpus):
# print( "node_resource.id:",node_resource.id , node_resource.resource.file )
# donut_part[node_resource.resource.type] += docs_count
# list_corpora[node_resource.resource.type.name].append(corpus_view)
# print(node_resource.resource.type.name)
list_corpora = dict(list_corpora) list_corpora = dict(list_corpora)
if docs_total == 0 or docs_total is None: if docs_total == 0 or docs_total is None:
...@@ -235,8 +244,6 @@ def project(request, project_id): ...@@ -235,8 +244,6 @@ def project(request, project_id):
if request.method == 'POST': if request.method == 'POST':
print("original file:")
print(request.FILES)
form = CustomForm(request.POST, request.FILES) form = CustomForm(request.POST, request.FILES)
if form.is_valid(): if form.is_valid():
...@@ -249,9 +256,6 @@ def project(request, project_id): ...@@ -249,9 +256,6 @@ def project(request, project_id):
print("-------------") print("-------------")
print(name,"|",resource_type,"|",thefile) print(name,"|",resource_type,"|",thefile)
print("-------------") print("-------------")
print("new file:")
print(thefile)
try: try:
parent = Node.objects.get(id=project_id) parent = Node.objects.get(id=project_id)
...@@ -280,8 +284,6 @@ def project(request, project_id): ...@@ -280,8 +284,6 @@ def project(request, project_id):
corpus.save() corpus.save()
print(request.user, resource_type , thefile )
corpus.add_resource( corpus.add_resource(
user=request.user, user=request.user,
type=resource_type, type=resource_type,
...@@ -324,80 +326,6 @@ def project(request, project_id): ...@@ -324,80 +326,6 @@ def project(request, project_id):
}) })
else: else:
form = CustomForm() form = CustomForm()
# if request.method == 'POST':
# #form = CorpusForm(request.POST, request.FILES)
# #print(str(request.POST))
# name = str(request.POST['name'])
# try:
# resource_type = ResourceType.objects.get(id=str(request.POST['type']))
# except Exception as error:
# print(error)
# resource_type = None
# try:
# file = request.FILES['file']
# except Exception as error:
# print(error)
# file = None
# #if name != "" and resource_type is not None and file is not None:
# try:
# parent = Node.objects.get(id=project_id)
# node_type = NodeType.objects.get(name='Corpus')
# if resource_type.name == "europress_french":
# language = Language.objects.get(iso2='fr')
# elif resource_type.name == "europress_english":
# language = Language.objects.get(iso2='en')
# try:
# corpus = Node(
# user=request.user,
# parent=parent,
# type=node_type,
# language=language,
# name=name,
# )
# except:
# corpus = Node(
# user=request.user,
# parent=parent,
# type=node_type,
# name=name,
# )
# corpus.save()
# print(request.user, resource_type , file )
# print(corpus.language)
# corpus.add_resource(
# user=request.user,
# type=resource_type,
# file=file
# )
# try:
# #corpus.parse_and_extract_ngrams()
# #corpus.parse_and_extract_ngrams.apply_async((), countdown=3)
# if DEBUG is True:
# corpus.workflow()
# else:
# corpus.workflow.apply_async((), countdown=3)
# except Exception as error:
# print(error)
# return HttpResponseRedirect('/project/' + str(project_id))
# except Exception as error:
# print('ee', error)
# form = CorpusForm(request=request)
# formResource = ResourceForm()
# else:
# form = CorpusForm(request=request)
# formResource = ResourceForm()
return render(request, 'project.html', { return render(request, 'project.html', {
'form' : form, 'form' : form,
......
...@@ -236,12 +236,17 @@ class Node(CTENode): ...@@ -236,12 +236,17 @@ class Node(CTENode):
@current_app.task(filter=task_method) @current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False): def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
print("In workflow() START") print("In workflow() parse_resources()")
self.parse_resources() self.parse_resources()
print("In workflow() / parse_resources()")
print("In workflow() extract_ngrams()")
type_document = NodeType.objects.get(name='Document') type_document = NodeType.objects.get(name='Document')
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',]) self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
print("In workflow() / extract_ngrams()")
print("In workflow() do_tfidf()")
from analysis.functions import do_tfidf from analysis.functions import do_tfidf
do_tfidf(self) do_tfidf(self)
print("In workflow() / do_tfidf()")
print("In workflow() END") print("In workflow() END")
class Node_Metadata(models.Model): class Node_Metadata(models.Model):
......
...@@ -56,6 +56,7 @@ class MedlineFetcher: ...@@ -56,6 +56,7 @@ class MedlineFetcher:
# webEnv = doc.xpathEval('eSearchResult/WebEnv/text()')[0] # webEnv = doc.xpathEval('eSearchResult/WebEnv/text()')[0]
# print count, queryKey, webEnv # print count, queryKey, webEnv
values = { "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv } values = { "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
print(values)
return values return values
...@@ -126,28 +127,27 @@ class MedlineFetcher: ...@@ -126,28 +127,27 @@ class MedlineFetcher:
# medlineEfetchRAW(str(year) + '[dp] '+query , retmax=300) # medlineEfetchRAW(str(year) + '[dp] '+query , retmax=300)
pubmedquery = str(year) + '[dp] '+query pubmedquery = str(year) + '[dp] '+query
globalresults = self.medlineEsearch(pubmedquery) globalresults = self.medlineEsearch(pubmedquery)
N+=globalresults["count"] if globalresults["count"]>0:
querymetadata = { N+=globalresults["count"]
"string": pubmedquery , querymetadata = {
"count": globalresults["count"] , "string": pubmedquery ,
"queryKey":globalresults["queryKey"] , "count": globalresults["count"] ,
"webEnv":globalresults["webEnv"] , "queryKey":globalresults["queryKey"] ,
"retmax":0 "webEnv":globalresults["webEnv"] ,
} "retmax":0
thequeries.append ( querymetadata ) }
thequeries.append ( querymetadata )
print("Total Number:", N,"publications") print("Total Number:", N,"publications")
print("And i want just:",globalLimit,"publications") print("And i want just:",globalLimit,"publications")
print("---------------------------------------\n") print("---------------------------------------\n")
for query in thequeries: for i,query in enumerate(thequeries):
k = query["count"] k = query["count"]
percentage = k/float(N) percentage = k/float(N)
retmax_forthisyear = int(round(globalLimit*percentage)) retmax_forthisyear = int(round(globalLimit*percentage))
query["retmax"] = retmax_forthisyear query["retmax"] = retmax_forthisyear
# self.medlineEfetchRAW( query )
print ('Done !')
return thequeries return thequeries
......
...@@ -12,6 +12,11 @@ from gargantext_web.api import JsonHttpResponse ...@@ -12,6 +12,11 @@ from gargantext_web.api import JsonHttpResponse
from urllib.request import urlopen, urlretrieve from urllib.request import urlopen, urlretrieve
import json import json
from gargantext_web.settings import MEDIA_ROOT
from datetime import datetime
from django.core.files import File
from gargantext_web.settings import DEBUG
from node.models import Language, ResourceType, Resource, \ from node.models import Language, ResourceType, Resource, \
Node, NodeType, Node_Resource, Project, Corpus, \ Node, NodeType, Node_Resource, Project, Corpus, \
Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram
...@@ -24,7 +29,7 @@ def getGlobalStats(request ): ...@@ -24,7 +29,7 @@ def getGlobalStats(request ):
if request.method == "POST": if request.method == "POST":
query = request.POST["query"] query = request.POST["query"]
instancia = MedlineFetcher() instancia = MedlineFetcher()
alist = instancia.serialFetcher( 5, query , 200 ) alist = instancia.serialFetcher( 5, query , 100 )
data = alist data = alist
return JsonHttpResponse(data) return JsonHttpResponse(data)
...@@ -43,8 +48,6 @@ def doTheQuery(request , project_id): ...@@ -43,8 +48,6 @@ def doTheQuery(request , project_id):
instancia = MedlineFetcher() instancia = MedlineFetcher()
thequeries = json.loads(query) thequeries = json.loads(query)
print("------------------")
urlreqs = [] urlreqs = []
for yearquery in thequeries: for yearquery in thequeries:
urlreqs.append( instancia.medlineEfetchRAW( yearquery ) ) urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
...@@ -58,71 +61,49 @@ def doTheQuery(request , project_id): ...@@ -58,71 +61,49 @@ def doTheQuery(request , project_id):
""" """
thefile = "how we do this here?" thefile = "how we do this here?"
resource_type = ResourceType() resource_type = ResourceType.objects.get(name="pubmed" )
resource_type.name = name
try:
parent = Node.objects.get(id=project_id)
node_type = NodeType.objects.get(name='Corpus')
type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id
corpus = Node(
user=request.user,
parent=parent,
type=node_type,
name=name,
)
corpus.save()
parser = PubmedFileParser()
metadata_list = []
for url in urlreqs:
data = urlopen(url)
metadata_list += parser.parse( data.read() )
# corpus.add_resource( user=request.user, type=resource_type, file=data.read() )
break
parent = Node.objects.get(id=project_id)
node_type = NodeType.objects.get(name='Corpus')
type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id
from parsing.Caches import LanguagesCache corpus = Node(
langages_cache = LanguagesCache() user=request.user,
for i, metadata_values in enumerate(metadata_list): parent=parent,
name = metadata_values.get('title', '')[:200] type=node_type,
language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None, name=name,
if isinstance(language, tuple): )
language = language[0]
Node( corpus.save()
user_id = user_id,
type_id = type_id,
name = name,
parent = parent,
language_id = language.id if language else None,
metadata = metadata_values
).save()
parent.children.all().make_metadata_filterable() try:
for url in urlreqs:
type_document = NodeType.objects.get(name='Document') print(url)
print("printing here 01") data = urlopen(url)
parent.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',]) xmlname = MEDIA_ROOT + '/corpora/%s/%s.xml' % (request.user, str(datetime.now().microsecond))
print("printing here 02") f = open(xmlname, 'w')
myfile = File(f)
myfile.write( data.read().decode('utf-8') )
myfile.close()
f.close()
corpus.add_resource( user=request.user, type=resource_type, file=xmlname )
print("now we've to apply do_tfidf...") try:
if DEBUG is True:
corpus.workflow()
else:
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
# thetitles = parent.children.filter(type_id=type_document.pk) except Exception as error:
# print(Node.objects.filter(parent=parent)) print(error)
# from analysis.functions import do_tfidf
# do_tfidf(corpus)
print("ca va?") return JsonHttpResponse(["workflow","finished","outside the try-except"])
except Exception as error: except Exception as error:
print("lele",error) print("lele",error)
data = alist data = alist
return JsonHttpResponse(data) return JsonHttpResponse(data)
\ No newline at end of file
...@@ -213,6 +213,7 @@ ...@@ -213,6 +213,7 @@
success: function(data) { success: function(data) {
console.log("in doTheQuery()") console.log("in doTheQuery()")
console.log(data) console.log(data)
location.reload();
}, },
error: function(result) { error: function(result) {
console.log("in doTheQuery(). Data not found"); console.log("in doTheQuery(). Data not found");
...@@ -241,11 +242,14 @@ ...@@ -241,11 +242,14 @@
thequeries = data thequeries = data
var N=0,k=0; var N=0,k=0;
for(var i in thequeries) N += thequeries[i].count
if(N>0) { for(var i in thequeries) N += thequeries[i].count
if( N>0) {
$("#results").html("Result: "+N+" publications in the last 5 years") $("#results").html("Result: "+N+" publications in the last 5 years")
$('#id_thebutton').prop('disabled', false); $('#id_thebutton').prop('disabled', false);
} else {
$("#results").html("No results!.")
$('#id_thebutton').prop('disabled', true);
} }
}, },
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment