Commit d0003ef9 authored by PkSM3's avatar PkSM3

[FEAT] pubmed scrapper

parent ed1311f3
...@@ -274,7 +274,6 @@ def do_tfidf(corpus, reset=True):
NodeNodeNgram.objects.filter(nodex=corpus).delete()
if isinstance(corpus, Node) and corpus.type.name == "Corpus":
print(Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")))
for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
for node_ngram in Node_Ngram.objects.filter(node=document):
try:
...
...@@ -69,7 +69,8 @@ urlpatterns = patterns('',
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery)
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
)
...
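For context, a minimal sketch of how a client could exercise the new routes (the host, the project id and the authenticated session are assumptions; the POST field names come from the views further down, and testISTEX currently ignores any "N" field and fetches 60 documents):

```python
# Illustrative only: call the new scraper endpoints from a client.
import requests

BASE = "http://localhost:8000"   # assumed dev server; a real call needs a logged-in session

# Per-year PubMed counts for a query (handled by getGlobalStats):
stats = requests.post(BASE + "/tests/pubmedquery", data={"query": "microbiota"})
print(stats.json())

# Launch the ISTEX harvest for project 1 (handled by testISTEX):
istex = requests.post(BASE + "/tests/project/1/ISTEXquery/go",
                      data={"query": "microbiota", "string": "microbiota"})
print(istex.json())
```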
...@@ -223,8 +223,14 @@ def project(request, project_id):
corpus_view['count'] = corpus.children.count()
#just get first element of the corpora and get his type.
corpus_type = Node_Resource.objects.filter(node=corpus)[0].resource.type
list_corpora[corpus_type].append(corpus_view)
resource_corpus = Node_Resource.objects.filter(node=corpus)
if len(resource_corpus)>0:
# print(Node_Resource.objects.filter(node=corpus).all())
corpus_type = Node_Resource.objects.filter(node=corpus)[0].resource.type
list_corpora[corpus_type].append(corpus_view)
donut_part[corpus_type] += docs_count
else: print(" Node_Resource = this.corpus(",corpus.pk,") ... nothing, why?")
## For avoiding to list repeated elements, like when u use the dynamic query (per each xml, 1)
# for node_resource in Node_Resource.objects.filter(node=corpus):
...@@ -237,6 +243,8 @@ def project(request, project_id):
if docs_total == 0 or docs_total is None:
docs_total = 1
# The donut will show: percentage by
donut = [ {'source': key,
'count': donut_part[key] ,
'part' : round(donut_part[key] * 100 / docs_total) } \
...@@ -246,12 +254,15 @@ def project(request, project_id):
if request.method == 'POST':
form = CustomForm(request.POST, request.FILES)
if form.is_valid():
name = form.cleaned_data['name']
thefile = form.cleaned_data['file']
resource_type = ResourceType.objects.get(id=str( form.cleaned_data['type'] ))
print(request.POST['type'])
print(form.cleaned_data['type'])
resource_type = ResourceType.objects.get(name=str( form.cleaned_data['type'] ))
print("-------------") print("-------------")
print(name,"|",resource_type,"|",thefile) print(name,"|",resource_type,"|",thefile)
...@@ -326,6 +337,7 @@ def project(request, project_id): ...@@ -326,6 +337,7 @@ def project(request, project_id):
}) })
else: else:
form = CustomForm() form = CustomForm()
return render(request, 'project.html', { return render(request, 'project.html', {
'form' : form, 'form' : form,
...@@ -748,9 +760,12 @@ def node_link(request, corpus_id): ...@@ -748,9 +760,12 @@ def node_link(request, corpus_id):
''' '''
Create the HttpResponse object with the node_link dataset. Create the HttpResponse object with the node_link dataset.
''' '''
import time
print("In node_link() START") print("In node_link() START")
start = time.time()
data = get_cooc(request=request, corpus_id=corpus_id, type="node_link") data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
end = time.time()
print ("LOG::TIME: get_cooc() [s]",(end - start))
print("In node_link() END") print("In node_link() END")
return JsonHttpResponse(data) return JsonHttpResponse(data)
......
...@@ -98,13 +98,10 @@ from django import forms
from django.utils.translation import ugettext_lazy as _
class CustomForm(forms.Form):
name = forms.CharField( label='Name', max_length=199 , required=True)
parsing_options = ResourceType.objects.all().values_list('id', 'name')
type = forms.IntegerField( widget=forms.Select( choices= parsing_options) , required=True )
name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
type = ModelChoiceField( ResourceType.objects.all() , widget=forms.Select(attrs={'onchange':'CustomForSelect( $("option:selected", this).text() );'}) )
file = forms.FileField()
# Description: clean_file()
"""
* file_.content_type - Example: ['application/pdf', 'image/jpeg']
...
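Since type is now a ModelChoiceField over ResourceType, form.cleaned_data['type'] yields a model instance rather than a bare id, which is why the project() view above switches the lookup from get(id=...) to get(name=...). A minimal sketch of the reworked form, assuming the gargantext ResourceType model (the import path is a guess):

```python
# Sketch of the reworked upload form; node.models is an assumed import path.
from django import forms
from django.forms import ModelChoiceField

from node.models import ResourceType

class CustomForm(forms.Form):
    name = forms.CharField(label='Name', max_length=199,
                           widget=forms.TextInput(attrs={'required': 'true'}))
    # The <select> is bound to ResourceType rows; cleaned_data['type'] is an instance.
    type = ModelChoiceField(
        ResourceType.objects.all(),
        widget=forms.Select(attrs={
            'onchange': 'CustomForSelect( $("option:selected", this).text() );'}))
    file = forms.FileField()

# In the view, str(instance) presumably returns the type's name, hence:
#   resource_type = ResourceType.objects.get(name=str(form.cleaned_data['type']))
# (using cleaned_data['type'] directly would avoid the second query altogether)
```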
...@@ -163,6 +163,7 @@ class Node(CTENode):
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
'istext' : ISText,
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
...@@ -171,6 +172,7 @@ class Node(CTENode):
'europress_english' : EuropressFileParser,
})[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
# print(parser.parse(str(resource.file)))
# retrieve info from the database
type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache()
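The defaultdict above maps resource.type.name to a parser class and instantiates it, silently falling back to the base FileParser for unknown type names. A stand-alone sketch of that dispatch idiom (the parser classes here are dummies, not the real ones):

```python
# Dispatch idiom used by parse_resources(); the classes below are stand-ins.
from collections import defaultdict

class FileParser:                      # generic fallback
    def parse(self, path): return []

class PubmedFileParser(FileParser): pass
class ISText(FileParser): pass

def get_parser(type_name):
    # Missing keys call the lambda instead of raising KeyError,
    # so unknown resource types get the base FileParser.
    return defaultdict(lambda: FileParser, {
        'pubmed': PubmedFileParser,
        'istext': ISText,
    })[type_name]()

print(type(get_parser('pubmed')).__name__)    # PubmedFileParser
print(type(get_parser('unknown')).__name__)   # FileParser
```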
...@@ -183,6 +185,8 @@ class Node(CTENode):
language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
if isinstance(language, tuple):
language = language[0]
# print("metadata_values:")
# print("\t",metadata_values,"\n- - - - - - - - - - - - ")
Node(
user_id = user_id,
type_id = type_id,
...@@ -191,7 +195,6 @@ class Node(CTENode):
language_id = language.id if language else None,
metadata = metadata_values
).save()
# make metadata filterable
self.children.all().make_metadata_filterable()
...@@ -236,17 +239,32 @@ class Node(CTENode):
@current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
print("In workflow() parse_resources()")
import time
print("LOG::TIME: In workflow() parse_resources()")
start = time.time()
self.parse_resources()
print("In workflow() / parse_resources()")
print("In workflow() extract_ngrams()")
end = time.time()
print ("LOG::TIME: parse_resources() [s]",(end - start))
print("LOG::TIME: In workflow() / parse_resources()")
start = time.time()
print("LOG::TIME: In workflow() extract_ngrams()")
type_document = NodeType.objects.get(name='Document')
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
print("In workflow() / extract_ngrams()")
end = time.time()
print ("LOG::TIME: ",(end - start))
print ("LOG::TIME: extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()")
start = time.time()
print("In workflow() do_tfidf()")
from analysis.functions import do_tfidf
do_tfidf(self)
print("In workflow() / do_tfidf()")
end = time.time()
print ("LOG::TIME: do_tfidf() [s]",(end - start))
print("LOG::TIME: In workflow() / do_tfidf()")
print("In workflow() END")
class Node_Metadata(models.Model):
...
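The workflow() method now brackets each stage with start/end = time.time() pairs and a LOG::TIME print. The same instrumentation could be captured by a small helper; a purely illustrative sketch (the diff itself keeps the calls inline):

```python
# Timing pattern used in workflow(), factored into a reusable context manager.
import time
from contextlib import contextmanager

@contextmanager
def log_time(label):
    start = time.time()
    try:
        yield
    finally:
        print("LOG::TIME:", label, "[s]", time.time() - start)

# Equivalent to the instrumented workflow() stages:
with log_time("parse_resources()"):
    time.sleep(0.1)        # stand-in for self.parse_resources()
with log_time("extract_ngrams()"):
    time.sleep(0.1)        # stand-in for extract_ngrams(keys=['title'])
with log_time("do_tfidf()"):
    time.sleep(0.1)        # stand-in for do_tfidf(self)
```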
...@@ -25,6 +25,7 @@ class PubmedFileParser(FileParser):
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
...@@ -51,6 +52,13 @@ class PubmedFileParser(FileParser):
except:
pass
#Title-Decision
Title=""
if not metadata["title"] or metadata["title"]=="":
if "title2" in metadata:
metadata["title"] = metadata["title2"]
else: metadata["title"] = ""
# Date-Decision
# forge.iscpif.fr/issues/1418
RealDate = ""
...@@ -68,19 +76,25 @@ class PubmedFileParser(FileParser):
if "publication_month" in metadata: PubmedDate+=" "+metadata["publication_month"]
if "publication_day" in metadata: PubmedDate+=" "+metadata["publication_day"]
Decision=""
if len(RealDate)>4:
if len(RealDate)>8: decision = datetime.strptime(RealDate, '%Y %b %d').date()
else: decision = datetime.strptime(RealDate, '%Y %b').date()
else: decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else:
try: Decision = datetime.strptime(RealDate, '%Y %b').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
if "publication_year" in metadata: metadata["publication_year"] = str(decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(decision.day)
if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
if "realdate_year_" in metadata: metadata.pop("realdate_year_")
if "realdate_month_" in metadata: metadata.pop("realdate_month_")
if "realdate_day_" in metadata: metadata.pop("realdate_day_")
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
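The reworked "Date-Decision" first tries to parse the free-text MedlineDate and falls back to the structured year/month/day fields when that fails. A runnable reconstruction of that fallback, purely for illustration (the example dates are made up):

```python
# Illustrative reconstruction of the Date-Decision fallback in PubmedFileParser.
# RealDate mimics .../PubDate/MedlineDate; PubmedDate is rebuilt from the
# structured publication_year/month/day fields.
from datetime import datetime

def decide_date(RealDate, PubmedDate):
    if len(RealDate) > 4:
        if len(RealDate) > 8:
            try: return datetime.strptime(RealDate, '%Y %b %d').date()
            except ValueError: return datetime.strptime(PubmedDate, '%Y %m %d').date()
        try: return datetime.strptime(RealDate, '%Y %b').date()
        except ValueError: return datetime.strptime(PubmedDate, '%Y %m %d').date()
    return datetime.strptime(PubmedDate, '%Y %m %d').date()

print(decide_date("2014 Nov 12", "2014 11 12"))   # MedlineDate parses as-is
print(decide_date("2014 Jan-Feb", "2014 1 15"))   # falls back to the structured fields
print(decide_date("", "2013 7 1"))                # no MedlineDate at all
```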
...@@ -2,3 +2,4 @@ from parsing.FileParsers.RisFileParser import RisFileParser
from parsing.FileParsers.IsiFileParser import IsiFileParser
from parsing.FileParsers.PubmedFileParser import PubmedFileParser
from parsing.FileParsers.EuropressFileParser import EuropressFileParser
from parsing.FileParsers.ISText import ISText
...@@ -10,29 +10,36 @@ import os
import time
# import libxml2
from lxml import etree
from datetime import datetime
from django.core.files import File
import threading
from queue import Queue
import time
class MedlineFetcher:
def __init__(self):
self.queue_size = 8
self.q = Queue()
self.firstResults = []
self.lock = threading.Lock() # lock to serialize console output
self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
self.pubMedDB = 'Pubmed'
self.reportType = 'medline'
self.personalpath_mainPath = 'MedLine/'
if not os.path.isdir(self.personalpath_mainPath):
os.makedirs(self.personalpath_mainPath)
print ('Created directory ' + self.personalpath_mainPath)
# Return the:
# Return the globalResults!:
# - count =
# - queryKey =
# - webEnv =
def medlineEsearch(self , query):
print ("MedlineFetcher::medlineEsearch :")
# print ("MedlineFetcher::medlineEsearch :")
"Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
origQuery = query
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' %(self.pubMedEutilsURL, self.pubMedDB, query)
...@@ -50,13 +57,7 @@ class MedlineFetcher:
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
# doc = libxml2.parseDoc(data)
# count = doc.xpathEval('eSearchResult/Count/text()')[0]
# queryKey = doc.xpathEval('eSearchResult/QueryKey/text()')[0]
# webEnv = doc.xpathEval('eSearchResult/WebEnv/text()')[0]
# print count, queryKey, webEnv
values = { "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
print(values)
values = { "query":origQuery , "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
return values
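medlineEsearch() wraps NCBI's esearch endpoint with usehistory=y, so the interesting outputs are the hit count plus the queryKey/webEnv pair that efetch reuses later. A simplified, stand-alone sketch of the same round trip (it uses findtext instead of the precompiled XPath objects above, and assumes the eutils URL still answers at this address):

```python
# Stand-alone sketch of the esearch round trip wrapped by medlineEsearch().
from urllib.request import urlopen
from lxml import etree

query = '2014[dp] microbiota'.replace(' ', '%20')
url = ('http://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
       '?db=Pubmed&retmax=1&usehistory=y&term=' + query)
root = etree.XML(urlopen(url).read())

values = {
    "query":    query,
    "count":    int(root.findtext("Count")),   # total hits for the query
    "queryKey": root.findtext("QueryKey"),     # history-server handle ...
    "webEnv":   root.findtext("WebEnv"),       # ... reused by efetch later
}
print(values)
```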
...@@ -72,40 +73,58 @@ class MedlineFetcher:
queryKey = fullquery["queryKey"]
webEnv = fullquery["webEnv"]
print ("MedlineFetcher::medlineEfetchRAW :")
"Fetch medline result for query 'query', saving results to file every 'retmax' articles"
queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors
print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
# pubmedqueryfolder = personalpath.pubMedAbstractsPath + 'Pubmed_' + queryNoSpace
# if not os.path.isdir(pubmedqueryfolder):
# os.makedirs(pubmedqueryfolder)
pubMedResultFileName = self.personalpath_mainPath + 'Pubmed_' + queryNoSpace + '.xml'
pubMedResultFile = open(pubMedResultFileName, 'w')
print ('Query "' , query , '"\t:\t' , count , ' results')
print ('Starting fetching at ' , time.asctime(time.localtime()) )
retstart = 0
# while(retstart < count):
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
return eFetch
# if sys.version_info >= (3, 0): pubMedResultFile.write(eFetchResult.read().decode('utf-8'))
# else: pubMedResultFile.write(eFetchResult.read())
# retstart += retmax
# break # you shall not pass !!
# pubMedResultFile.close()
# print ('Fetching for query ' , query , ' finished at ' , time.asctime(time.localtime()) )
# print (retmax , ' results written to file ' , pubMedResultFileName , '\n' )
# print("------------------------------------------")
# return ["everything","ok"]
# generic!
def downloadFile(self, item):
url = item[0]
filename = item[1]
print("\tin downloadFile:")
print(url,filename)
data = urlopen(url)
f = open(filename, 'w')
myfile = File(f)
myfile.write( data.read().decode('utf-8') )
myfile.close()
f.close()
with self.lock:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def do_work(self,item):
# time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item)
with self.lock:
print(threading.current_thread().name, item)
return returnvalue
# The worker thread pulls an item from the queue and processes it
def worker(self):
while True:
item = self.q.get()
self.firstResults.append(self.do_work(item))
self.q.task_done()
def worker2(self):
while True:
item = self.q.get()
self.firstResults.append(self.downloadFile(item))
self.q.task_done()
def chunks(self , l , n):
print("chunks:")
for i in range(0, len(l), n):
yield l[i:i+n]
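chunks() slices a sequence into fixed-size pieces; testISTEX() below uses it to paginate the ISTEX API. For illustration, with the values that view uses:

```python
# What chunks() yields for N=60 and pagesize=50, as used by testISTEX() below.
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i+n]

pieces = list(chunks(range(60), 50))
print([(p[0], len(p)) for p in pieces])   # [(0, 50), (50, 10)]
```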
# GLOBALLIMIT:
# I will retrieve this exact amount of publications.
...@@ -115,22 +134,34 @@ class MedlineFetcher:
# - GlobalLimit : Number of publications i want.
def serialFetcher(self , yearsNumber , query, globalLimit):
# Create the queue and thread pool.
for i in range(self.queue_size):
t = threading.Thread(target=self.worker)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
start = time.perf_counter()
N = 0
print ("MedlineFetcher::serialFetcher :")
thequeries = []
globalresults = []
for i in range(yearsNumber):
year = str(2015 - i)
print ('YEAR ' + year)
print ('---------\n')
# medlineEfetch(str(year) + '[dp] '+query , 20000)
# medlineEfetchRAW(str(year) + '[dp] '+query , retmax=300)
pubmedquery = str(year) + '[dp] '+query
globalresults = self.medlineEsearch(pubmedquery)
self.q.put( pubmedquery ) #put task in the queue
self.q.join()
print('time:',time.perf_counter() - start)
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
if globalresults["count"]>0: if globalresults["count"]>0:
N+=globalresults["count"] N+=globalresults["count"]
querymetadata = { querymetadata = {
"string": pubmedquery , "string": globalresults["query"] ,
"count": globalresults["count"] , "count": globalresults["count"] ,
"queryKey":globalresults["queryKey"] , "queryKey":globalresults["queryKey"] ,
"webEnv":globalresults["webEnv"] , "webEnv":globalresults["webEnv"] ,
...@@ -149,11 +180,3 @@ class MedlineFetcher:
query["retmax"] = retmax_forthisyear
return thequeries
# serialFetcher(yearsNumber=3, 'microbiota' , globalLimit=100 )
# query = str(2015)+ '[dp] '+'microbiota'
# medlineEsearch( query )
#
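serialFetcher() now fans the per-year esearch calls out to eight daemon worker threads through a Queue and collects the results in firstResults; doTheQuery() and testISTEX() reuse the same machinery with worker2/downloadFile for the actual downloads. A self-contained sketch of that producer/consumer pattern, with a dummy task in place of medlineEsearch():

```python
# Minimal sketch of the Queue + daemon-thread pattern behind serialFetcher().
import threading
from queue import Queue

q = Queue()
results = []
lock = threading.Lock()

def do_work(item):
    return {"query": item, "count": len(item)}    # stand-in for medlineEsearch(item)

def worker():
    while True:
        item = q.get()
        result = do_work(item)
        with lock:                 # serialize writes to the shared list
            results.append(result)
        q.task_done()

for _ in range(8):                 # mirrors queue_size in MedlineFetcher.__init__
    t = threading.Thread(target=worker)
    t.daemon = True                # workers die with the main thread
    t.start()

for year in (2015, 2014, 2013):
    q.put('%s[dp] microbiota' % year)   # one task per year, like serialFetcher()
q.join()                           # block until every task has been processed
print(results)
```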
...@@ -14,6 +14,8 @@ import json
from gargantext_web.settings import MEDIA_ROOT
from datetime import datetime
import time
import threading
from django.core.files import File
from gargantext_web.settings import DEBUG
...@@ -28,14 +30,16 @@ def getGlobalStats(request ):
if request.method == "POST":
query = request.POST["query"]
print ("LOG::TIME: query =", query )
print ("LOG::TIME: N =", 300 )
instancia = MedlineFetcher()
alist = instancia.serialFetcher( 5, query , 100 )
# alist = instancia.serialFetcher( 5, query , int(request.POST["N"]) )
alist = instancia.serialFetcher( 5, query , 300 )
data = alist
return JsonHttpResponse(data)
from parsing.FileParsers import PubmedFileParser
def doTheQuery(request , project_id):
alist = ["hola","mundo"]
...@@ -78,17 +82,20 @@ def doTheQuery(request , project_id):
corpus.save()
try:
tasks = MedlineFetcher()
# configuring your queue with the event
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
print(url)
data = urlopen(url)
xmlname = MEDIA_ROOT + '/corpora/%s/%s.xml' % (request.user, str(datetime.now().microsecond))
f = open(xmlname, 'w')
myfile = File(f)
myfile.write( data.read().decode('utf-8') )
myfile.close()
f.close()
corpus.add_resource( user=request.user, type=resource_type, file=xmlname )
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
...@@ -96,7 +103,6 @@ def doTheQuery(request , project_id):
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print(error)
...@@ -106,4 +112,79 @@ def doTheQuery(request , project_id):
print("lele",error)
data = alist
return JsonHttpResponse(data)
\ No newline at end of file
def testISTEX(request , project_id):
print(request.method)
alist = ["bar","foo"]
if request.method == "POST":
# print(alist)
query = "-"
query_string = "-"
N = 60
if "query" in request.POST: query = request.POST["query"]
if "string" in request.POST: query_string = request.POST["string"].replace(" ","+")
# if "N" in request.POST: N = request.POST["N"]
print(query_string , query , N)
urlreqs = []
pagesize = 50
tasks = MedlineFetcher()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
print(urlreqs)
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
resource_type = ResourceType.objects.get(name="istext" )
parent = Node.objects.get(id=project_id)
node_type = NodeType.objects.get(name='Corpus')
type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id
corpus = Node(
user=request.user,
parent=parent,
type=node_type,
name=query,
)
corpus.save()
# configuring your queue with the event
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
corpus.save()
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
else:
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print(error)
data = [query_string,query,N]
return JsonHttpResponse(data)
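The pagination above turns N into from/size query parameters for the ISTEX API, shrinking the last page to whatever is left. A worked example with the view's fixed values (N=60, pagesize=50; the query string is an assumed example, and the min() form is just an equivalent way to write the last-page adjustment):

```python
# The two ISTEX requests built by testISTEX() for N=60, pagesize=50.
query_string = "microbiota"                    # assumed example query
N, pagesize = 60, 50
urlreqs = []
for start in range(0, N, pagesize):            # chunk starts: 0, 50
    size = min(pagesize, N - start)            # 50, then the remaining 10
    urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                   + "&output=*&from=" + str(start) + "&size=" + str(size))

for u in urlreqs:
    print(u)
# http://api.istex.fr/document/?q=microbiota&output=*&from=0&size=50
# http://api.istex.fr/document/?q=microbiota&output=*&from=50&size=10
```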
[Note: the diff of one additional file in this commit is collapsed and not shown here.]