Commit d0003ef9 authored by PkSM3

[FEAT] PubMed scraper

parent ed1311f3
......@@ -274,7 +274,6 @@ def do_tfidf(corpus, reset=True):
NodeNodeNgram.objects.filter(nodex=corpus).delete()
if isinstance(corpus, Node) and corpus.type.name == "Corpus":
print(Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")))
for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
for node_ngram in Node_Ngram.objects.filter(node=document):
try:
......
......@@ -69,7 +69,8 @@ urlpatterns = patterns('',
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery)
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
)
......
......@@ -223,8 +223,14 @@ def project(request, project_id):
corpus_view['count'] = corpus.children.count()
# just get the first element of the corpora and get its type.
corpus_type = Node_Resource.objects.filter(node=corpus)[0].resource.type
list_corpora[corpus_type].append(corpus_view)
resource_corpus = Node_Resource.objects.filter(node=corpus)
if len(resource_corpus)>0:
# print(Node_Resource.objects.filter(node=corpus).all())
corpus_type = Node_Resource.objects.filter(node=corpus)[0].resource.type
list_corpora[corpus_type].append(corpus_view)
donut_part[corpus_type] += docs_count
else: print(" Node_Resource = this.corpus(",corpus.pk,") ... nothing, why?")
## To avoid listing repeated elements, as when using the dynamic query (one per XML)
# for node_resource in Node_Resource.objects.filter(node=corpus):
......@@ -237,6 +243,8 @@ def project(request, project_id):
if docs_total == 0 or docs_total is None:
docs_total = 1
# The donut will show: percentage by resource type
donut = [ {'source': key,
'count': donut_part[key] ,
'part' : round(donut_part[key] * 100 / docs_total) } \
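A minimal standalone restatement of the donut computation above, with hypothetical counts (in the view, donut_part is accumulated per resource type while iterating the corpora):

# Sketch only: the counts here are made up for illustration.
donut_part = {'pubmed': 120, 'europress_english': 60, 'istext': 20}
docs_total = sum(donut_part.values()) or 1   # the view falls back to 1 to avoid dividing by zero
donut = [{'source': key,
          'count': donut_part[key],
          'part': round(donut_part[key] * 100 / docs_total)}
         for key in donut_part]
# -> e.g. [{'source': 'pubmed', 'count': 120, 'part': 60}, ...]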
......@@ -246,12 +254,15 @@ def project(request, project_id):
if request.method == 'POST':
form = CustomForm(request.POST, request.FILES)
if form.is_valid():
name = form.cleaned_data['name']
thefile = form.cleaned_data['file']
resource_type = ResourceType.objects.get(id=str( form.cleaned_data['type'] ))
print(request.POST['type'])
print(form.cleaned_data['type'])
resource_type = ResourceType.objects.get(name=str( form.cleaned_data['type'] ))
print("-------------")
print(name,"|",resource_type,"|",thefile)
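A note on the get(id=...) to get(name=...) switch: with the new ModelChoiceField (see the forms hunk below), form.cleaned_data['type'] is now a ResourceType instance rather than an id, so the name lookup only works if str() of a ResourceType yields its name, which this diff assumes but does not show:

# Sketch, assuming ResourceType.__str__() returns self.name (not shown in this diff).
resource_type_instance = form.cleaned_data['type']   # a ResourceType instance, not an id
resource_type = ResourceType.objects.get(name=str(resource_type_instance))
# If the assumption holds, the second query is redundant:
# resource_type = form.cleaned_data['type']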
......@@ -326,6 +337,7 @@ def project(request, project_id):
})
else:
form = CustomForm()
return render(request, 'project.html', {
'form' : form,
......@@ -748,9 +760,12 @@ def node_link(request, corpus_id):
'''
Create the HttpResponse object with the node_link dataset.
'''
import time
print("In node_link() START")
start = time.time()
data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
end = time.time()
print ("LOG::TIME: get_cooc() [s]",(end - start))
print("In node_link() END")
return JsonHttpResponse(data)
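The start/end timing pattern used here, and again in workflow() below, could be factored into a small helper; a minimal sketch, not part of this commit:

import time
from contextlib import contextmanager

@contextmanager
def log_time(label):
    # prints "LOG::TIME: <label> [s] <elapsed>" around the wrapped block
    start = time.time()
    yield
    print("LOG::TIME:", label, "[s]", time.time() - start)

# usage:
# with log_time("get_cooc()"):
#     data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")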
......
......@@ -98,13 +98,10 @@ from django import forms
from django.utils.translation import ugettext_lazy as _
class CustomForm(forms.Form):
name = forms.CharField( label='Name', max_length=199 , required=True)
parsing_options = ResourceType.objects.all().values_list('id', 'name')
type = forms.IntegerField( widget=forms.Select( choices= parsing_options) , required=True )
name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
type = ModelChoiceField( ResourceType.objects.all() , widget=forms.Select(attrs={'onchange':'CustomForSelect( $("option:selected", this).text() );'}) )
file = forms.FileField()
# Description: clean_file()
"""
* file_.content_type - Example: ['application/pdf', 'image/jpeg']
......
......@@ -163,6 +163,7 @@ class Node(CTENode):
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
'istext' : ISText,
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
......@@ -171,6 +172,7 @@ class Node(CTENode):
'europress_english' : EuropressFileParser,
})[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
# print(parser.parse(str(resource.file)))
# retrieve info from the database
type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache()
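The defaultdict dispatch above means an unrecognized resource.type.name silently falls back to the generic FileParser instead of raising KeyError; a self-contained sketch of the pattern, with stand-in classes:

from collections import defaultdict

class FileParser: pass                    # stand-ins for the real parser classes
class PubmedFileParser(FileParser): pass
class ISText(FileParser): pass

parser = defaultdict(lambda: FileParser, {
    'istext': ISText,
    'pubmed': PubmedFileParser,
})['some-unknown-type']()                 # -> a generic FileParser instance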
......@@ -183,6 +185,8 @@ class Node(CTENode):
language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
if isinstance(language, tuple):
language = language[0]
# print("metadata_values:")
# print("\t",metadata_values,"\n- - - - - - - - - - - - ")
Node(
user_id = user_id,
type_id = type_id,
......@@ -191,7 +195,6 @@ class Node(CTENode):
language_id = language.id if language else None,
metadata = metadata_values
).save()
# make metadata filterable
self.children.all().make_metadata_filterable()
......@@ -236,17 +239,32 @@ class Node(CTENode):
@current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
print("In workflow() parse_resources()")
import time
print("LOG::TIME: In workflow() parse_resources()")
start = time.time()
self.parse_resources()
print("In workflow() / parse_resources()")
print("In workflow() extract_ngrams()")
end = time.time()
print ("LOG::TIME: parse_resources() [s]",(end - start))
print("LOG::TIME: In workflow() / parse_resources()")
start = time.time()
print("LOG::TIME: In workflow() extract_ngrams()")
type_document = NodeType.objects.get(name='Document')
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
print("In workflow() / extract_ngrams()")
end = time.time()
print ("LOG::TIME: ",(end - start))
print ("LOG::TIME: extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()")
start = time.time()
print("In workflow() do_tfidf()")
from analysis.functions import do_tfidf
do_tfidf(self)
print("In workflow() / do_tfidf()")
end = time.time()
print ("LOG::TIME: do_tfidf() [s]",(end - start))
print("LOG::TIME: In workflow() / do_tfidf()")
print("In workflow() END")
class Node_Metadata(models.Model):
......
......@@ -25,6 +25,7 @@ class PubmedFileParser(FileParser):
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
......@@ -51,6 +52,13 @@ class PubmedFileParser(FileParser):
except:
pass
# Title decision: use VernacularTitle (title2) when ArticleTitle is empty
if not metadata["title"] or metadata["title"]=="":
if "title2" in metadata:
metadata["title"] = metadata["title2"]
else: metadata["title"] = ""
# Date decision
# forge.iscpif.fr/issues/1418
RealDate = ""
......@@ -68,19 +76,25 @@ class PubmedFileParser(FileParser):
if "publication_month" in metadata: PubmedDate+=" "+metadata["publication_month"]
if "publication_day" in metadata: PubmedDate+=" "+metadata["publication_day"]
Decision=""
if len(RealDate)>4:
if len(RealDate)>8: decision = datetime.strptime(RealDate, '%Y %b %d').date()
else: decision = datetime.strptime(RealDate, '%Y %b').date()
else: decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else:
try: Decision = datetime.strptime(RealDate, '%Y %b').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
if "publication_year" in metadata: metadata["publication_year"] = str(decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(decision.day)
if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
if "realdate_year_" in metadata: metadata.pop("realdate_year_")
if "realdate_month_" in metadata: metadata.pop("realdate_month_")
if "realdate_day_" in metadata: metadata.pop("realdate_day_")
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
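The date decision above, restated as a runnable worked example; the two input strings are hypothetical. MedlineDate free text such as "2014 Oct 15" parses with '%Y %b %d', "2014 Oct" with '%Y %b', and anything else falls back to the numeric PubmedDate fields:

from datetime import datetime

RealDate   = "2014 Oct 15"   # hypothetical MedlineDate value
PubmedDate = "2014 10 15"    # hypothetical year/month/day fallback

if len(RealDate) > 4:
    if len(RealDate) > 8:
        try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
        except ValueError: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
    else:
        try: Decision = datetime.strptime(RealDate, '%Y %b').date()
        except ValueError: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else:
    Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()

print(Decision)  # 2014-10-15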
......@@ -2,3 +2,4 @@ from parsing.FileParsers.RisFileParser import RisFileParser
from parsing.FileParsers.IsiFileParser import IsiFileParser
from parsing.FileParsers.PubmedFileParser import PubmedFileParser
from parsing.FileParsers.EuropressFileParser import EuropressFileParser
from parsing.FileParsers.ISText import ISText
......@@ -10,29 +10,36 @@ import os
import time
# import libxml2
from lxml import etree
from datetime import datetime
from django.core.files import File
import threading
from queue import Queue
import time
class MedlineFetcher:
def __init__(self):
self.queue_size = 8
self.q = Queue()
self.firstResults = []
self.lock = threading.Lock() # lock to serialize console output
self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
self.pubMedDB = 'Pubmed'
self.reportType = 'medline'
self.personalpath_mainPath = 'MedLine/'
if not os.path.isdir(self.personalpath_mainPath):
os.makedirs(self.personalpath_mainPath)
print ('Created directory ' + self.personalpath_mainPath)
# Return the:
# Return the globalResults!:
# - count = number of results for the query
# - queryKey = Entrez history query key (passed to efetch)
# - webEnv = Entrez history WebEnv token (passed to efetch)
def medlineEsearch(self , query):
print ("MedlineFetcher::medlineEsearch :")
# print ("MedlineFetcher::medlineEsearch :")
"Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
origQuery = query
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' %(self.pubMedEutilsURL, self.pubMedDB, query)
......@@ -50,13 +57,7 @@ class MedlineFetcher:
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
# doc = libxml2.parseDoc(data)
# count = doc.xpathEval('eSearchResult/Count/text()')[0]
# queryKey = doc.xpathEval('eSearchResult/QueryKey/text()')[0]
# webEnv = doc.xpathEval('eSearchResult/WebEnv/text()')[0]
# print count, queryKey, webEnv
values = { "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
print(values)
values = { "query":origQuery , "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
return values
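The dict now echoes the original query back because serialFetcher (below) collects results from worker threads, whose completion order is not the submission order; without the "query" key the results could not be matched to their year. An illustrative return value, with all values hypothetical:

values = {
    "query"   : "2014[dp] microbiota",   # original query, echoed back
    "count"   : 4321,                    # total hits for that year
    "queryKey": "1",                     # Entrez history key, passed to efetch
    "webEnv"  : "NCID_1_...",            # Entrez history token, passed to efetch
}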
......@@ -72,40 +73,58 @@ class MedlineFetcher:
queryKey = fullquery["queryKey"]
webEnv = fullquery["webEnv"]
print ("MedlineFetcher::medlineEfetchRAW :")
"Fetch medline result for query 'query', saving results to file every 'retmax' articles"
queryNoSpace = query.replace(' ', '') # no spaces in directory and file names, to avoid path errors
# pubmedqueryfolder = personalpath.pubMedAbstractsPath + 'Pubmed_' + queryNoSpace
# if not os.path.isdir(pubmedqueryfolder):
# os.makedirs(pubmedqueryfolder)
pubMedResultFileName = self.personalpath_mainPath + 'Pubmed_' + queryNoSpace + '.xml'
pubMedResultFile = open(pubMedResultFileName, 'w')
print ('Query "' , query , '"\t:\t' , count , ' results')
print ('Starting fetching at ' , time.asctime(time.localtime()) )
print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
retstart = 0
# while(retstart < count):
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
return eFetch
# if sys.version_info >= (3, 0): pubMedResultFile.write(eFetchResult.read().decode('utf-8'))
# else: pubMedResultFile.write(eFetchResult.read())
# retstart += retmax
# break # you shall not pass !!
# pubMedResultFile.close()
# print ('Fetching for query ' , query , ' finished at ' , time.asctime(time.localtime()) )
# print (retmax , ' results written to file ' , pubMedResultFileName , '\n' )
# print("------------------------------------------")
# return ["everything","ok"]
# generic!
def downloadFile(self, item):
url = item[0]
filename = item[1]
print("\tin downloadFile:")
print(url,filename)
data = urlopen(url)
f = open(filename, 'w')
myfile = File(f)
myfile.write( data.read().decode('utf-8') )
myfile.close()
f.close()
with self.lock:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def do_work(self,item):
# time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item)
with self.lock:
print(threading.current_thread().name, item)
return returnvalue
# The worker thread pulls an item from the queue and processes it
def worker(self):
while True:
item = self.q.get()
self.firstResults.append(self.do_work(item))
self.q.task_done()
def worker2(self):
while True:
item = self.q.get()
self.firstResults.append(self.downloadFile(item))
self.q.task_done()
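A minimal self-contained sketch of the Queue/daemon-worker pattern used by worker and worker2, with a stand-in task in place of medlineEsearch/downloadFile:

import threading
from queue import Queue

q = Queue()
results = []

def worker():
    while True:
        item = q.get()
        results.append(item * item)   # stand-in for the real work
        q.task_done()

for _ in range(8):                    # mirrors queue_size above
    t = threading.Thread(target=worker)
    t.daemon = True                   # dies when the main thread exits
    t.start()

for item in range(5):
    q.put(item)                       # put a task in the queue
q.join()                              # block until every task is processed
print(sorted(results))                # [0, 1, 4, 9, 16]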
def chunks(self , l , n):
print("chunks:")
for i in range(0, len(l), n):
yield l[i:i+n]
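chunks() relies only on len() and slicing, so in Python 3 it also accepts range objects, which is how testISTEX (below) uses it:

def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i+n]

print(list(chunks(range(10), 4)))   # [range(0, 4), range(4, 8), range(8, 10)]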
# GLOBALLIMIT:
# Retrieve exactly this number of publications.
......@@ -115,22 +134,34 @@ class MedlineFetcher:
# - GlobalLimit : number of publications to retrieve.
def serialFetcher(self , yearsNumber , query, globalLimit):
# Create the queue and thread pool.
for i in range(self.queue_size):
t = threading.Thread(target=self.worker)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
start = time.perf_counter()
N = 0
print ("MedlineFetcher::serialFetcher :")
thequeries = []
globalresults = []
for i in range(yearsNumber):
year = str(2015 - i)
print ('YEAR ' + year)
print ('---------\n')
# medlineEfetch(str(year) + '[dp] '+query , 20000)
# medlineEfetchRAW(str(year) + '[dp] '+query , retmax=300)
pubmedquery = str(year) + '[dp] '+query
globalresults = self.medlineEsearch(pubmedquery)
self.q.put( pubmedquery ) #put task in the queue
self.q.join()
print('time:',time.perf_counter() - start)
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
if globalresults["count"]>0:
N+=globalresults["count"]
querymetadata = {
"string": pubmedquery ,
"string": globalresults["query"] ,
"count": globalresults["count"] ,
"queryKey":globalresults["queryKey"] ,
"webEnv":globalresults["webEnv"] ,
......@@ -149,11 +180,3 @@ class MedlineFetcher:
query["retmax"] = retmax_forthisyear
return thequeries
# serialFetcher(yearsNumber=3, 'microbiota' , globalLimit=100 )
# query = str(2015)+ '[dp] '+'microbiota'
# medlineEsearch( query )
#
......@@ -14,6 +14,8 @@ import json
from gargantext_web.settings import MEDIA_ROOT
from datetime import datetime
import time
import threading
from django.core.files import File
from gargantext_web.settings import DEBUG
......@@ -28,14 +30,16 @@ def getGlobalStats(request ):
if request.method == "POST":
query = request.POST["query"]
print ("LOG::TIME: query =", query )
print ("LOG::TIME: N =", 300 )
instancia = MedlineFetcher()
alist = instancia.serialFetcher( 5, query , 100 )
# alist = instancia.serialFetcher( 5, query , int(request.POST["N"]) )
alist = instancia.serialFetcher( 5, query , 300 )
data = alist
return JsonHttpResponse(data)
from parsing.FileParsers import PubmedFileParser
def doTheQuery(request , project_id):
alist = ["hola","mundo"]
......@@ -78,17 +82,20 @@ def doTheQuery(request , project_id):
corpus.save()
try:
tasks = MedlineFetcher()
# configuring your queue with the event
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
print(url)
data = urlopen(url)
xmlname = MEDIA_ROOT + '/corpora/%s/%s.xml' % (request.user, str(datetime.now().microsecond))
f = open(xmlname, 'w')
myfile = File(f)
myfile.write( data.read().decode('utf-8') )
myfile.close()
f.close()
corpus.add_resource( user=request.user, type=resource_type, file=xmlname )
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
tasks.q.put( [url , filename]) # put a task in the queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
......@@ -96,7 +103,6 @@ def doTheQuery(request , project_id):
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print(error)
......@@ -106,4 +112,79 @@ def doTheQuery(request , project_id):
print("lele",error)
data = alist
return JsonHttpResponse(data)
\ No newline at end of file
return JsonHttpResponse(data)
def testISTEX(request , project_id):
print(request.method)
alist = ["bar","foo"]
if request.method == "POST":
# print(alist)
query = "-"
query_string = "-"
N = 60
if "query" in request.POST: query = request.POST["query"]
if "string" in request.POST: query_string = request.POST["string"].replace(" ","+")
# if "N" in request.POST: N = request.POST["N"]
print(query_string , query , N)
urlreqs = []
pagesize = 50
tasks = MedlineFetcher()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
print(urlreqs)
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
resource_type = ResourceType.objects.get(name="istext" )
parent = Node.objects.get(id=project_id)
node_type = NodeType.objects.get(name='Corpus')
type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id
corpus = Node(
user=request.user,
parent=parent,
type=node_type,
name=query,
)
corpus.save()
# configuring your queue with the event
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
tasks.q.put( [url , filename]) # put a task in the queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
corpus.save()
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
else:
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print(error)
data = [query_string,query,N]
return JsonHttpResponse(data)
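A worked example of the ISTEX pagination in testISTEX, using the same N=60 and pagesize=50 as the view ("QUERY" stands in for the encoded query string):

def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i+n]

N, pagesize = 60, 50
urlreqs = []
for k in list(chunks(range(N), pagesize)):
    if (k[0] + pagesize) > N:
        pagesize = N - k[0]           # trim the final page: 60 - 50 = 10
    urlreqs.append("http://api.istex.fr/document/?q=QUERY&output=*&from=%s&size=%s" % (k[0], pagesize))
print(urlreqs)
# ['...&from=0&size=50', '...&from=50&size=10']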
......@@ -11,7 +11,15 @@
<script src="{% static "js/raphael-min.js"%}"></script>
<script src="{% static "js/morris.min.js"%}"></script>
<link rel="stylesheet" href="http://code.jquery.com/ui/1.11.2/themes/smoothness/jquery-ui.css">
<style>
.ui-autocomplete {
z-index: 5000;
}
.ui-autocomplete .ui-menu-item
{ font-size:x-small;}
</style>
{% endblock %}
......@@ -35,24 +43,19 @@
<div id="hero-donut" style="height: 200px;"></div>
{% endif %}
<center>
<button
type="button"
class="btn btn-primary btn-lg"
data-container="body"
data-toggle="popover"
data-placement="bottom"
>Add a corpus</button>
<div id="popover-content" class="hide">
<form enctype="multipart/form-data" action="/project/{{project.id}}/" method="post">
{% csrf_token %}
{{ form.non_field_errors }}
{{ form.as_p}}
{{ formResource.non_field_errors }}
{{ formResource.as_p}}
<input onclick='$("#semLoader").css("visibility", "visible"); $("#semLoader").show();' type="submit" name="submit" id="submit" class="btn" value="Add this corpus" /><div>
<div id="pubmedcrawl" align="right"><a data-toggle="modal" href="#stack1">&#10142; Query directly in PubMed</a></div>
<a data-toggle="modal" href="#addcorpus">
<button
type="button"
class="btn btn-primary btn-lg"
data-container="body"
data-toggle="popover"
data-placement="bottom"
>Add a corpus
</button>
</a>
<!-- <div id="popover-content" class="hide"> -->
</center>
</p>
......@@ -172,6 +175,60 @@
</div><!-- /.modal -->
<!-- Modal -->
<div class="modal fade" id="addcorpus" tabindex="-1" role="dialog" aria-labelledby="myModalLabel2" aria-hidden="true">
<div class="modal-dialog">
<div class="modal-content">
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
<h3>Add a Corpus</h3>
</div>
<div class="modal-body">
<form id="id_form" enctype="multipart/form-data" action="/project/{{project.id}}/" method="post">
{% csrf_token %}
<table cellpadding="5">
{% for field in form %}
<tr>
<th>{{field.label_tag}}</th>
<td>
{{ field.errors }}
{{ field }}
{% if field.name == "name" %}
<span onclick="getGlobalResults(this);" id="scanpubmed"></span><div id="theresults"></div>
{% endif %}
</td>
</tr>
{% endfor %}
<tr>
<th></th>
<td>
<div id="pubmedcrawl" style="visibility: hidden;">
Do you have a file already? &nbsp;
<input type="radio" id="file_yes" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="true" checked>Yes </input>
<input type="radio" id="file_no" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="false">No </input>
</div>
</td>
</tr>
</table>
</form>
<div class="modal-footer">
<!-- <div id="pubmedcrawl" align="right" style="visibility: hidden;"><a data-toggle="modal" href="#stack1">&#10142; Query directly in PubMed</a></div> -->
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
<button onclick='bringDaNoise();' id="submit_thing" disabled class="btn btn-primary" >Process this!</button><span id="simpleloader"></span>
</div>
</div>
</div><!-- /.modal-content -->
</div><!-- /.modal-dialog -->
</div><!-- /.modal -->
<script src="http://code.jquery.com/ui/1.11.2/jquery-ui.js"></script>
<script>
function getCookie(name) {
var cookieValue = null;
......@@ -189,12 +246,14 @@
return cookieValue;
}
var thequeries = []
function doTheQuery() {
if ( $('#id_thebutton').prop('disabled') ) return;
if ( $('#submit_thing').prop('disabled') ) return;
console.log("in doTheQuery:");
var origQuery = $("#daquery").val()
var origQuery = $("#id_name").val()
var pubmedifiedQuery = { query : JSON.stringify(thequeries) , string: origQuery } ;
......@@ -219,14 +278,44 @@
console.log("in doTheQuery(). Data not found");
}
});
}
function bringDaNoise() {
var theresults = $("#theresults").html()
if( theresults && theresults.search("No results")==-1 ) {
console.log("we've in dynamic mode")
$("#simpleloader").html('<img width="30px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img>')
$("#submit_thing").prop('onclick',null);
doTheQuery();
}
else {
console.log("we dont have nothing inside results div")
if ( $("#id_file").is(':visible') ) {
console.log("we're in upload-file mode")
var namefield = $("#id_name").val()!=""
var typefield = $("#id_type").val()!=""
var filefield = $("#id_file").val()!=""
if( namefield && typefield && filefield ) {
$("#simpleloader").html('<img width="30px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img>')
$("#submit_thing").prop('onclick',null);
$( "#id_form" ).submit();
}
}
}
}
function getGlobalResults(){
function getGlobalResults(value){
console.log("in getGlobalResults()")
// AJAX to django
var pubmedquery = $("#daquery").val()
var formData = {query:pubmedquery}
$("#results").html('<img width="30px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img>')
var pubmedquery = $("#id_name").val()
var Npubs = $("#id_N").val();
if(pubmedquery=="") return;
var formData = {query:pubmedquery , N:Npubs}
$("#theresults").html('<img width="30px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img>')
console.log("disabling "+"#"+value.id)
$("#"+value.id).prop('onclick',null);
$.ajax({
// contentType: "application/json",
......@@ -239,27 +328,137 @@
success: function(data) {
console.log("in getGlobalResults")
console.log(data)
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
thequeries = data
var N=0,k=0;
for(var i in thequeries) N += thequeries[i].count
if( N>0) {
$("#results").html("Result: "+N+" publications in the last 5 years")
$('#id_thebutton').prop('disabled', false);
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications in the last 5 years</i><br>")
$('#submit_thing').prop('disabled', false);
} else {
$("#results").html("No results!.")
$('#id_thebutton').prop('disabled', true);
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
$('#submit_thing').prop('disabled', true);
}
},
error: function(result) {
console.log("Data not found");
}
});
}
// CSS events for selecting one Radio-Input
function FileOrNotFile( value ) {
var showfile = JSON.parse(value)
// @upload-file events
if (showfile) {
console.log("You've clicked the YES")
$("#id_file").show()
$('label[for=id_file]').show();
$("#id_name").attr("placeholder", "");
$("#scanpubmed").html("")
$("#theresults").html("")
$('#submit_thing').prop('disabled', false);
$( "#id_name" ).on('input',null);
}
// @dynamic-query events
else {
console.log("You've clicked the NO")
$("#id_file").hide()
$('label[for=id_file]').hide();
$("#id_name").attr("placeholder", " [ Enter your query here ] ");
$("#id_name").focus();
$("#scanpubmed").html('<a class="btn btn-primary">Scan</a>')//+'Get: <input id="id_N" size="2" type="text"></input>')
$("#theresults").html("")
$("#submit_thing").prop('disabled' , true)
$( "#id_name" ).on('input',function(e){
console.log($(this).val())
testAjax( $(this).val() )
});
}
}
//CSS events for changing the Select element
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if(selected=="pubmed") {
console.log("show the button")
$("#pubmedcrawl").css("visibility", "visible");
$("#pubmedcrawl").show();
$("#file_yes").click();
}
// hide Radio-Inputs and trigger @upload-file events
else {
console.log("hide the button")
$("#pubmedcrawl").css("visibility", "hidden");
$("#id_file").show()
$('label[for=id_file]').show();
FileOrNotFile( "true" )
}
}
var LastData = []
function NSuggest_CreateData(q, data) {
console.log("in the new NSuggest_CreateData:")
LastData = data;
// console.log(LastData)
console.log("adding class ui-widget")
$("#id_name").removeClass( "ui-widget" ).addClass( "ui-widget" )
$( "#id_name" ).autocomplete({
source: LastData
});
return data;
}
function testAjax( query ) {
LastData = []
if(!query || query=="") return;
var pubmedquery = encodeURIComponent(query)
$.ajax({
type: 'GET',
url: "http://www.ncbi.nlm.nih.gov/portal/utils/autocomp.fcgi?dict=pm_related_queries_2&q="+pubmedquery,
// data:"db="+db+"&query="+query,
contentType: "application/json",
dataType: 'jsonp'
});
return false;
}
function testISTEX(query,Npubs) {
console.log("in testISTEX:");
if(!query || query=="") return;
var origQuery = query
var pubmedifiedQuery = { query : query , string: query }
// console.log(pubmedifiedQuery)
var projectid = window.location.href.split("project")[1].replace(/\//g, '')//replace all the slashes
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/project/"+projectid+"/ISTEXquery/go",
data: pubmedifiedQuery,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("ajax_success: in testISTEX()")
console.log(data)
location.reload();
},
error: function(result) {
console.log("in testISTEX(). Data not found");
}
});
}
// Morris Donut Chart
Morris.Donut({
element: 'hero-donut',
......