Commit a8dff456 authored by Administrator

Merge branch 'unstable' into testing

parents 5872b406 f51ea0e3
......@@ -66,6 +66,7 @@ INSTALLED_APPS = (
'cte_tree',
'node',
'ngram',
'scrap_pubmed',
'django_hstore',
'djcelery',
'aldjemy',
......
......@@ -6,6 +6,7 @@ from django.contrib.auth.views import login
from gargantext_web import views
import gargantext_web.api
import scrap_pubmed.views as pubmedscrapper
admin.autodiscover()
......@@ -67,7 +68,12 @@ urlpatterns = patterns('',
url(r'^ngrams$', views.ngrams),
url(r'^nodeinfo/(\d+)$', views.nodeinfo),
url(r'^tests/mvc$', views.tests_mvc),
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments)
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
)
......
......@@ -260,6 +260,8 @@ def project(request, project_id):
cooclists = ""#.children.filter(type=type_cooclist)
for corpus in corpora:
# print("corpus", corpus.pk , corpus.name , corpus.type_id)
docs_count = corpus.children.count()
docs_total += docs_count
......@@ -267,15 +269,30 @@ def project(request, project_id):
corpus_view['id'] = corpus.pk
corpus_view['name'] = corpus.name
corpus_view['count'] = corpus.children.count()
for node_resource in Node_Resource.objects.filter(node=corpus):
donut_part[node_resource.resource.type] += docs_count
list_corpora[node_resource.resource.type.name].append(corpus_view)
# Just take the first resource of this corpus and use its type.
resource_corpus = Node_Resource.objects.filter(node=corpus)
if len(resource_corpus)>0:
# print(Node_Resource.objects.filter(node=corpus).all())
corpus_type = resource_corpus[0].resource.type
list_corpora[corpus_type].append(corpus_view)
donut_part[corpus_type] += docs_count
else: print(" Node_Resource = this.corpus(",corpus.pk,") ... nothing, why?")
## Avoid listing repeated elements, e.g. when using the dynamic query (one entry per XML).
# for node_resource in Node_Resource.objects.filter(node=corpus):
# print( "node_resource.id:",node_resource.id , node_resource.resource.file )
# donut_part[node_resource.resource.type] += docs_count
# list_corpora[node_resource.resource.type.name].append(corpus_view)
# print(node_resource.resource.type.name)
list_corpora = dict(list_corpora)
if docs_total == 0 or docs_total is None:
docs_total = 1
# The donut shows each resource type's share of the documents (percentage of docs_total).
donut = [ {'source': key,
'count': donut_part[key] ,
'part' : round(donut_part[key] * 100 / docs_total) } \
......@@ -283,23 +300,21 @@ def project(request, project_id):
if request.method == 'POST':
print("original file:")
print(request.FILES)
form = CustomForm(request.POST, request.FILES)
if form.is_valid():
name = form.cleaned_data['name']
thefile = form.cleaned_data['file']
resource_type = ResourceType.objects.get(id=str( form.cleaned_data['type'] ))
print(request.POST['type'])
print(form.cleaned_data['type'])
resource_type = ResourceType.objects.get(name=str( form.cleaned_data['type'] ))
print("-------------")
print(name,"|",resource_type,"|",thefile)
print("-------------")
print("new file:")
print(thefile)
try:
parent = Node.objects.get(id=project_id)
......@@ -328,8 +343,6 @@ def project(request, project_id):
corpus.save()
print(request.user, resource_type , thefile )
corpus.add_resource(
user=request.user,
type=resource_type,
......@@ -373,79 +386,6 @@ def project(request, project_id):
else:
form = CustomForm()
# if request.method == 'POST':
# #form = CorpusForm(request.POST, request.FILES)
# #print(str(request.POST))
# name = str(request.POST['name'])
# try:
# resource_type = ResourceType.objects.get(id=str(request.POST['type']))
# except Exception as error:
# print(error)
# resource_type = None
# try:
# file = request.FILES['file']
# except Exception as error:
# print(error)
# file = None
# #if name != "" and resource_type is not None and file is not None:
# try:
# parent = Node.objects.get(id=project_id)
# node_type = NodeType.objects.get(name='Corpus')
# if resource_type.name == "europress_french":
# language = Language.objects.get(iso2='fr')
# elif resource_type.name == "europress_english":
# language = Language.objects.get(iso2='en')
# try:
# corpus = Node(
# user=request.user,
# parent=parent,
# type=node_type,
# language=language,
# name=name,
# )
# except:
# corpus = Node(
# user=request.user,
# parent=parent,
# type=node_type,
# name=name,
# )
# corpus.save()
# print(request.user, resource_type , file )
# print(corpus.language)
# corpus.add_resource(
# user=request.user,
# type=resource_type,
# file=file
# )
# try:
# #corpus.parse_and_extract_ngrams()
# #corpus.parse_and_extract_ngrams.apply_async((), countdown=3)
# if DEBUG is True:
# corpus.workflow()
# else:
# corpus.workflow.apply_async((), countdown=3)
# except Exception as error:
# print(error)
# return HttpResponseRedirect('/project/' + str(project_id))
# except Exception as error:
# print('ee', error)
# form = CorpusForm(request=request)
# formResource = ResourceForm()
# else:
# form = CorpusForm(request=request)
# formResource = ResourceForm()
return render(request, 'project.html', {
'form' : form,
......@@ -874,9 +814,12 @@ def node_link(request, corpus_id):
'''
Create the HttpResponse object with the node_link dataset.
'''
import time
print("In node_link() START")
start = time.time()
data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
end = time.time()
print ("LOG::TIME: get_cooc() [s]",(end - start))
print("In node_link() END")
return JsonHttpResponse(data)
......
......@@ -98,13 +98,10 @@ from django import forms
from django.utils.translation import ugettext_lazy as _
class CustomForm(forms.Form):
name = forms.CharField( label='Name', max_length=199 , required=True)
parsing_options = ResourceType.objects.all().values_list('id', 'name')
type = forms.IntegerField( widget=forms.Select( choices= parsing_options) , required=True )
name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
type = ModelChoiceField( ResourceType.objects.all() , widget=forms.Select(attrs={'onchange':'CustomForSelect( $("option:selected", this).text() );'}) )
file = forms.FileField()
# Description: clean_file()
"""
* file_.content_type - Example: ['application/pdf', 'image/jpeg']
......@@ -120,12 +117,14 @@ class CustomForm(forms.Form):
"""
def clean_file(self):
file_ = self.cleaned_data.get('file')
#Filename length
if len(file_.name)>30:
from datetime import datetime
file_.name = str(datetime.now().microsecond)
# raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
#File size
from datetime import datetime
file_.name = str(datetime.now().microsecond)
# #Filename length
# if len(file_.name)>30:
# from datetime import datetime
# file_.name = str(datetime.now().microsecond)
# # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
# #File size
if len(file_)>104857600:
raise forms.ValidationError(_('File too heavy! (must be under 100MB).'))
## File type:
......
......@@ -163,6 +163,7 @@ class Node(CTENode):
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
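# Dispatch on the resource type name; unknown types fall back to the generic FileParser (the default factory below).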
parser = defaultdict(lambda:FileParser.FileParser, {
'istext' : ISText,
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
......@@ -171,6 +172,7 @@ class Node(CTENode):
'europress_english' : EuropressFileParser,
})[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
# print(parser.parse(str(resource.file)))
# retrieve info from the database
type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache()
......@@ -183,6 +185,8 @@ class Node(CTENode):
language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
if isinstance(language, tuple):
language = language[0]
# print("metadata_values:")
# print("\t",metadata_values,"\n- - - - - - - - - - - - ")
Node(
user_id = user_id,
type_id = type_id,
......@@ -191,7 +195,6 @@ class Node(CTENode):
language_id = language.id if language else None,
metadata = metadata_values
).save()
# make metadata filterable
self.children.all().make_metadata_filterable()
......@@ -236,14 +239,34 @@ class Node(CTENode):
@current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
print("In workflow() START")
import time
print("LOG::TIME: In workflow() parse_resources()")
start = time.time()
self.metadata['Processing'] = 1
self.save()
self.parse_resources()
end = time.time()
print ("LOG::TIME: parse_resources() [s]",(end - start))
print("LOG::TIME: In workflow() / parse_resources()")
start = time.time()
print("LOG::TIME: In workflow() extract_ngrams()")
type_document = NodeType.objects.get(name='Document')
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
end = time.time()
print ("LOG::TIME: ",(end - start))
print ("LOG::TIME: extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()")
start = time.time()
print("In workflow() do_tfidf()")
from analysis.functions import do_tfidf
do_tfidf(self)
end = time.time()
print ("LOG::TIME: do_tfidf() [s]",(end - start))
print("LOG::TIME: In workflow() / do_tfidf()")
print("In workflow() END")
self.metadata['Processing'] = 0
self.save()
......
from django.db import transaction
from lxml import etree
from .FileParser import FileParser
from ..NgramsExtractors import *
from datetime import datetime
from io import BytesIO
import json
class ISText(FileParser):
def _parse(self, thefile):
json_data=open(thefile,"r")
data = json.load(json_data)
json_data.close()
json_docs = data["hits"]
metadata_list = []
metadata_path = {
"id" : "id",
"source" : 'corpusName',
"title" : 'title',
"genre" : "genre",
# "language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'doi',
"host" : 'host',
"publication_date" : 'pubdate',
# "authors" : 'author',
"authorsRAW" : 'author',
"keywords" : "keywords"
}
for json_doc in json_docs:
    metadata = {}  # fresh dict per document; reusing one shared dict would make every appended entry identical
for key, path in metadata_path.items():
try:
# print(path," ==> ",len(json_doc[path]))
metadata[key] = json_doc[path]
except: pass
# print("|",metadata["publication_date"])
if "doi" in metadata: metadata["doi"] = metadata["doi"][0]
keywords = []
if "keywords" in metadata:
for keyw in metadata["keywords"]:
keywords.append(keyw["value"] )
metadata["keywords"] = ", ".join( keywords )
moredate=False
moresource=False
if "host" in metadata:
if "genre" in metadata["host"] and len(metadata["host"]["genre"])>0:
if "genre" in metadata and len(metadata["genre"])==0:
metadata["genre"] = metadata["host"]["genre"]
# print(metadata["host"])
if "pubdate" in metadata["host"]:
onebuffer = metadata["publication_date"]
metadata["publication_date"] = []
metadata["publication_date"].append(onebuffer)
metadata["publication_date"].append( metadata["host"]["pubdate"] )
if "title" in metadata["host"]:
metadata["journal"] = metadata["host"]["title"]
authors=False
if "authorsRAW" in metadata:
names = []
for author in metadata["authorsRAW"]:
names.append(author["name"])
metadata["authors"] = ", ".join(names)
if "host" in metadata: metadata.pop("host")
if "genre" in metadata:
if len(metadata["genre"])==0:
metadata.pop("genre")
if "publication_date" in metadata and isinstance(metadata["publication_date"], list):
if len(metadata["publication_date"])>1:
d1 = metadata["publication_date"][0]
d2 = metadata["publication_date"][1]
# print("date1:",d1)
# print("date2:",d2)
if len(d1)==len(d2):
metadata["publication_date"] = d2
# if int(d1)>int(d2): metadata["publication_date"] = d2
else:
fulldate = ""
year = d2[:4]
fulldate+=year
if len(d2)>4:
month = d2[4:6]
fulldate+="-"+month
if len(d2)>6:
day = d2[6:8]
fulldate+="-"+day
metadata["publication_date"] = fulldate
else:
if "copyrightdate" in json_doc:
metadata["publication_date"] = json_doc["copyrightdate"]
else:
if "copyrightdate" in json_doc:
metadata["publication_date"] = json_doc["copyrightdate"]
print("||",metadata["title"])
metadata_list.append(metadata)
print("=============================")
print("\nlen list:",len(metadata_list))
return metadata_list
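A minimal usage sketch for the new ISText parser (the path is hypothetical, assuming an ISTEX "hits" page saved to disk; calling the internal _parse directly for illustration):

    parser = ISText()
    docs = parser._parse("/tmp/istex_page.json")  # hypothetical path to a saved ISTEX result page
    print(len(docs), docs[0].get("title") if docs else None)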
......@@ -2,13 +2,19 @@ from django.db import transaction
from lxml import etree
from .FileParser import FileParser
from ..NgramsExtractors import *
from datetime import datetime
from io import BytesIO
class PubmedFileParser(FileParser):
def _parse(self, file):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
xml = etree.parse(file, parser=xml_parser)
xml = ""
if type(file)==bytes: xml = etree.parse( BytesIO(file) , parser=xml_parser)
else: xml = etree.parse(file, parser=xml_parser)
xml_articles = xml.findall('PubmedArticle')
# initialize the list of metadata
metadata_list = []
......@@ -19,9 +25,13 @@ class PubmedFileParser(FileParser):
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
"realdate_year_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Year',
"realdate_month_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Month',
"realdate_day_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Day',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
......@@ -30,6 +40,7 @@ class PubmedFileParser(FileParser):
for key, path in metadata_path.items():
try:
xml_node = xml_article.find(path)
# Authors tag
if key == 'authors':
metadata[key] = ', '.join([
xml_author.find('ForeName').text + ' ' + xml_author.find('LastName').text
......@@ -37,8 +48,53 @@ class PubmedFileParser(FileParser):
])
else:
metadata[key] = xml_node.text
except:
pass
# Title decision: fall back to VernacularTitle when ArticleTitle is missing or empty
if not metadata.get("title"):
    metadata["title"] = metadata.get("title2", "")
# Date-Decision
# forge.iscpif.fr/issues/1418
RealDate = ""
if "realdate_full_" in metadata:
RealDate = metadata["realdate_full_"]
else:
if "realdate_year_" in metadata: RealDate+=metadata["realdate_year_"]
if "realdate_month_" in metadata: RealDate+=" "+metadata["realdate_month_"]
if "realdate_day_" in metadata: RealDate+=" "+metadata["realdate_day_"]
metadata["realdate_full_"] = RealDate
RealDate = RealDate.split("-")[0]
PubmedDate = ""
if "publication_year" in metadata: PubmedDate+=metadata["publication_year"]
if "publication_month" in metadata: PubmedDate+=" "+metadata["publication_month"]
if "publication_day" in metadata: PubmedDate+=" "+metadata["publication_day"]
Decision=""
if len(RealDate)>4:
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else:
try: Decision = datetime.strptime(RealDate, '%Y %b').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
if "realdate_year_" in metadata: metadata.pop("realdate_year_")
if "realdate_month_" in metadata: metadata.pop("realdate_month_")
if "realdate_day_" in metadata: metadata.pop("realdate_day_")
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
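A short sketch of the two input forms the parser now accepts (filenames are hypothetical; calling the internal _parse directly for illustration):

    parser = PubmedFileParser()
    docs_from_path = parser._parse("pubmed_batch.xml")   # file path / file object branch
    with open("pubmed_batch.xml", "rb") as fh:
        docs_from_bytes = parser._parse(fh.read())       # raw bytes branch, routed through BytesIO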
......@@ -2,3 +2,4 @@ from parsing.FileParsers.RisFileParser import RisFileParser
from parsing.FileParsers.IsiFileParser import IsiFileParser
from parsing.FileParsers.PubmedFileParser import PubmedFileParser
from parsing.FileParsers.EuropressFileParser import EuropressFileParser
from parsing.FileParsers.ISText import ISText
# ****************************
# ***** Medline Fetcher *****
# ****************************
# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays
import sys
if sys.version_info >= (3, 0): from urllib.request import urlopen
else: from urllib import urlopen
import os
import time
# import libxml2
from lxml import etree
from datetime import datetime
from django.core.files import File
import threading
from queue import Queue
class MedlineFetcher:
def __init__(self):
self.queue_size = 8
self.q = Queue()
self.firstResults = []
self.lock = threading.Lock() # lock to serialize console output
self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
self.pubMedDB = 'Pubmed'
self.reportType = 'medline'
# Returns the globalResults:
# - count    : number of results for the query
# - queryKey : NCBI history key, reused later by efetch
# - webEnv   : NCBI history web environment, reused later by efetch
def medlineEsearch(self , query):
# print ("MedlineFetcher::medlineEsearch :")
"Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
origQuery = query
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' %(self.pubMedEutilsURL, self.pubMedDB, query)
eSearchResult = urlopen(eSearch)
data = eSearchResult.read()
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
count = findcount(root)[0]
findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
queryKey = findquerykey(root)[0]
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
values = { "query":origQuery , "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
return values
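# Example (illustrative values):
#   medlineEsearch('brain [dp] 2014')
#   -> {"query": "brain [dp] 2014", "count": 1234, "queryKey": "1", "webEnv": "NCID_1_..."}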
# RETMAX:
# Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
# maximum of 100,000 records
def medlineEfetchRAW( self , fullquery):
query = fullquery["string"]
retmax = fullquery["retmax"]
count = fullquery["count"]
queryKey = fullquery["queryKey"]
webEnv = fullquery["webEnv"]
"Fetch medline result for query 'query', saving results to file every 'retmax' articles"
queryNoSpace = query.replace(' ', '') # no spaces in directory or file names, to avoid path errors
print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
retstart = 0
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
return eFetch
def ensure_dir(self , f):
d = os.path.dirname(f)
if not os.path.exists(d):
os.makedirs(d)
# generic!
def downloadFile(self, item):
url = item[0]
filename = item[1]
print("\tin downloadFile:")
print(url,filename)
data = urlopen(url)
f = open(filename, 'w')
myfile = File(f)
myfile.write( data.read().decode('utf-8') )
myfile.close()
f.close()
with self.lock:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def do_work(self,item):
# time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item)
with self.lock:
print(threading.current_thread().name, item)
return returnvalue
# The worker thread pulls an item from the queue and processes it
def worker(self):
while True:
item = self.q.get()
self.firstResults.append(self.do_work(item))
self.q.task_done()
def worker2(self):
while True:
item = self.q.get()
self.firstResults.append(self.downloadFile(item))
self.q.task_done()
def chunks(self , l , n):
print("chunks:")
for i in range(0, len(l), n):
yield l[i:i+n]
# GLOBALLIMIT:
# Retrieve exactly this many publications in total.
# Publications retrieved for a given year = (k/N) * GlobalLimit  <- used as RETMAX
# - k : number of publications in year x (according to PubMed)
# - N : sum of all k over the requested years (total count according to PubMed)
# - GlobalLimit : total number of publications wanted.
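# Worked example (illustrative): yearly counts k = [100, 300] give N = 400;
# with GlobalLimit = 200 the per-year retmax values are round((k/N)*200) = [50, 150].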
def serialFetcher(self , yearsNumber , query, globalLimit):
# Create the queue and thread pool.
for i in range(self.queue_size):
t = threading.Thread(target=self.worker)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
start = time.perf_counter()
N = 0
print ("MedlineFetcher::serialFetcher :")
thequeries = []
globalresults = []
for i in range(yearsNumber):
year = str(2015 - i)
print ('YEAR ' + year)
print ('---------\n')
pubmedquery = str(year) + '[dp] '+query
self.q.put( pubmedquery ) #put task in the queue
self.q.join()
print('time:',time.perf_counter() - start)
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
if globalresults["count"]>0:
N+=globalresults["count"]
querymetadata = {
"string": globalresults["query"] ,
"count": globalresults["count"] ,
"queryKey":globalresults["queryKey"] ,
"webEnv":globalresults["webEnv"] ,
"retmax":0
}
thequeries.append ( querymetadata )
print("Total Number:", N,"publications")
print("And i want just:",globalLimit,"publications")
print("---------------------------------------\n")
for i,query in enumerate(thequeries):
k = query["count"]
percentage = k/float(N)
retmax_forthisyear = int(round(globalLimit*percentage))
query["retmax"] = retmax_forthisyear
return thequeries
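A hedged usage sketch of the fetcher (query string and limit are illustrative; requires network access to NCBI):

    fetcher = MedlineFetcher()
    queries = fetcher.serialFetcher(5, 'brain imaging', 300)   # last 5 years, 300 documents total
    for q in queries:
        print(q["string"], q["count"], q["retmax"])            # each year's share of the 300-document budget
    urls = [fetcher.medlineEfetchRAW(q) for q in queries]      # efetch URLs, ready for urlopen()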
from django.contrib import admin
# Register your models here.
from django.db import models
# Create your models here.
from django.test import TestCase
# Create your tests here.
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect
from django.template.loader import get_template
from django.template import Context
from django.contrib.auth.models import User
from scrap_pubmed.MedlineFetcherDavid2015 import MedlineFetcher
from gargantext_web.api import JsonHttpResponse
from urllib.request import urlopen, urlretrieve
import json
from gargantext_web.settings import MEDIA_ROOT
from datetime import datetime
import time
import os
import threading
from django.core.files import File
from gargantext_web.settings import DEBUG
from node.models import Language, ResourceType, Resource, \
Node, NodeType, Node_Resource, Project, Corpus, \
Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram
def getGlobalStats(request ):
print(request.method)
alist = ["bar","foo"]
if request.method == "POST":
query = request.POST["query"]
print ("LOG::TIME: query =", query )
print ("LOG::TIME: N =", 300 )
instancia = MedlineFetcher()
# alist = instancia.serialFetcher( 5, query , int(request.POST["N"]) )
alist = instancia.serialFetcher( 5, query , 300 )
data = alist
return JsonHttpResponse(data)
def doTheQuery(request , project_id):
alist = ["hola","mundo"]
if request.method == "POST":
query = request.POST["query"]
name = request.POST["string"]
instancia = MedlineFetcher()
thequeries = json.loads(query)
urlreqs = []
for yearquery in thequeries:
urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
alist = ["tudo fixe" , "tudo bem"]
"""
urlreqs: List of urls to query.
- Then, to each url in urlreqs you do:
eFetchResult = urlopen(url)
eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
"""
thefile = "how we do this here?"
resource_type = ResourceType.objects.get(name="pubmed" )
parent = Node.objects.get(id=project_id)
node_type = NodeType.objects.get(name='Corpus')
type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id
corpus = Node(
user=request.user,
parent=parent,
type=node_type,
name=name,
)
corpus.save()
try:
tasks = MedlineFetcher()
tasks.ensure_dir ( MEDIA_ROOT + '/corpora/'+str(request.user)+"/" )
# spin up the worker threads that will consume the download queue
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
tasks.q.put( [url , filename]) # put a task in the queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
else:
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print(error)
return JsonHttpResponse(["workflow","finished","outside the try-except"])
except Exception as error:
print("lele",error)
data = alist
return JsonHttpResponse(data)
def testISTEX(request , project_id):
print(request.method)
alist = ["bar","foo"]
if request.method == "POST":
# print(alist)
query = "-"
query_string = "-"
N = 60
if "query" in request.POST: query = request.POST["query"]
if "string" in request.POST: query_string = request.POST["string"].replace(" ","+")
# if "N" in request.POST: N = request.POST["N"]
print(query_string , query , N)
urlreqs = []
pagesize = 50
tasks = MedlineFetcher()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
print(urlreqs)
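# e.g. with N=60 and pagesize=50 this builds two requests: from=0&size=50 and from=50&size=10 (illustrative).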
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
resource_type = ResourceType.objects.get(name="istext" )
parent = Node.objects.get(id=project_id)
node_type = NodeType.objects.get(name='Corpus')
type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id
corpus = Node(
user=request.user,
parent=parent,
type=node_type,
name=query,
)
corpus.save()
# spin up the worker threads that will consume the download queue
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
tasks.q.put( [url , filename]) # put a task in the queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
corpus.save()
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
else:
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print(error)
data = [query_string,query,N]
return JsonHttpResponse(data)
{% extends "menu.html" %}
{% block css %}
{% load staticfiles %}
<link rel="stylesheet" href="{% static "css/bootstrap.css" %}">
<link rel="stylesheet" type="text/css" href="{% static "css/morris.css" %}">
<link rel="stylesheet" type="text/css" href="{% static "css/jquery.easy-pie-chart.css"%}">
<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
<script src="{% static "js/raphael-min.js"%}"></script>
<script src="{% static "js/morris.min.js"%}"></script>
{% endblock %}
{% block content %}
<div class="container theme-showcase" role="main">
<div class="jumbotron">
<div class="row">
<div class="col-md-6">
{% if project %}
<h1>{{ project.name }}</h1>
<!--<h3> {{number}} corpora </h3>-->
{% endif %}
</div>
<div class="col-md-4">
<p>
{% if donut %}
<div id="hero-donut" style="height: 200px;"></div>
{% endif %}
<center>
<button
type="button"
class="btn btn-primary btn-lg"
data-container="body"
data-toggle="popover"
data-placement="bottom"
>Add a corpus</button>
<div id="popover-content" class="hide">
<form enctype="multipart/form-data" action="/project/{{project.id}}/" method="post">
{% csrf_token %}
{{ form.non_field_errors }}
{{ form.as_p}}
{{ formResource.non_field_errors }}
{{ formResource.as_p}}
<input onclick='$("#semLoader").css("visibility", "visible"); $("#semLoader").show();' type="submit" name="submit" id="submit" class="btn" value="Add this corpus" /></form>
</center>
</p>
</div>
</div>
</div>
</div>
</div>
<!-- Add jumbotron container for each type of corpus (presse, science etc.) -->
<div id="semLoader" style="position:absolute; top:50%; left:40%; width:80px; visibility: hidden;">
<img src="{% static "js/libs/img2/loading-bar.gif" %}">
</div>
<div class="container">
{% if list_corpora %}
<h1>Resources</h1>
<h2>Corpora</h2>
<ul>
{% for key, corpora in list_corpora.items %}
<li>{{ key }}</li>
<ul>
{% for corpus in corpora %}
<li> {% ifnotequal corpus.count 0 %}
<a href="/project/{{project.id}}/corpus/{{corpus.id}}">
{{corpus.name}}
</a>
, {{ corpus.count }} Documents
{% else %}
{{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"> Processing, drink a cup of tea, and refresh the page :)
{% endifnotequal %}
<button type="button" class="btn btn-xs btn-default" data-container="body" data-toggle="popover" data-placement="bottom"
data-content='
<ul>
<li> Rename </li>
<li> Add new documents </li>
<li><a href="/project/{{ project.id }}/corpus/{{ corpus.id}}/delete">Delete</a></li>
</ul>
'>Manage</button>
</li>
{% endfor %}
</ul>
{% endfor %}
</ul>
{% endif %}
{% if list_corporax %}
<div class="col-md-4">
<h3><a href="/project/{{project.id}}/corpus/{{corpus.id}}">{{corpus.name}}</a>
</h3>
<h4>{{ corpus.count }} Documents </h4>
<h5>Activity:</h5>
<div class="chart" data-percent="73">73%</div>
</div>
{% endif %}
{% if whitelists %}
<h2>Lists of Ngrams</h2>
<h3>White Lists</h3>
{% for list in whitelists %}
<ul>
<li> {{list.name }}
</ul>
{% endfor %}
{% endif %}
{% if blacklists %}
<h3>Black Lists</h3>
{% for list in blacklists %}
<ul>
<li> {{list.name }}
</ul>
{% endfor %}
{% endif %}
{% if cooclists %}
<h2>Results (graphs)</h2>
<h3>Cooccurrences Lists</h3>
{% for list in cooclists %}
<ul>
<li> {{list.name }}
</ul>
{% endfor %}
{% endif %}
</div>
<script>
// Morris Donut Chart
Morris.Donut({
element: 'hero-donut',
data: [
{% if donut %}
{% for part in donut %}
{label: '{{ part.source }}', value: {{ part.part }} },
{% endfor %}
{% endif %}
],
colors: ["@white", "@white"],
//colors: ["#30a1ec", "#76bdee"],
formatter: function (y) { return y + "%" }
});
</script>
{% endblock %}
......@@ -63,7 +63,7 @@ function deleteDuplicates(url) {
success: function(data) {
console.log("in DeleteDuplicates")
console.log(data)
$("#delAll").remove();
location.reload();
},
error: function(result) {
console.log("Data not found");
......
......@@ -240,6 +240,7 @@
<li ng-repeat="filter in filters">
<button ng-click="removeFilter($index)" title="remove this filter">x</button>
<span>...where the </span>
<select ng-model="filter.entity" ng-options="entity as entity.key for entity in entities"></select>
<span ng-if="filter.entity.key != 'ngrams'">
<select ng-if="filter.entity" ng-model="filter.column" ng-options="column as column.key for column in filter.entity.columns | orderBy:'key'"></select>
......