[UPDATE] istex|pubmed scrapper

45a1fa9d · PkSM3 · 119671ab · 45a1fa9d · 45a1fa9d · 45a1fa9d
Commit 45a1fa9d authored Feb 16, 2015 by PkSM3
7 changed files
--- a/gargantext_web/urls.py
+++ b/gargantext_web/urls.py
@@ -70,6 +70,7 @@ urlpatterns = patterns('',
    url(r'^tests/mvc$', views.tests_mvc),
    url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),

+    url(r'^tests/istextquery$', pubmedscrapper.getGlobalStatsISTEXT),
    url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
    url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
    url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)

--- a/node/admin.py
+++ b/node/admin.py
@@ -125,8 +125,8 @@ class CustomForm(forms.Form):
        #     file_.name = str(datetime.now().microsecond)
        #     # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
        # #File size
-        if len(file_)>104857600:
-            raise forms.ValidationError(_('File to heavy! (<100MB).'))
+        # if len(file_)>104857600:
+        #     raise forms.ValidationError(_('File to heavy! (<100MB).'))
        ## File type:
        # if file_.content_type == "application/zip":
        #     raise forms.ValidationError(_('We need a zip pls.'))

--- a/node/models.py
+++ b/node/models.py
@@ -81,7 +81,6 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
        metadata_cache = {metadata.name: metadata for metadata in Metadata.objects.all()}
        data = []
        for node in self:
-            print(node.id)
            for key, value in node.metadata.items():
                if key in metadata_cache:
                    metadata = metadata_cache[key]
@@ -249,13 +248,14 @@ class Node(CTENode):
    @current_app.task(filter=task_method)
    def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
        import time
-
+        total = 0
        print("LOG::TIME: In workflow()    parse_resources()")
        start = time.time()
        self.metadata['Processing'] = 1
        self.save()
        self.parse_resources()
        end = time.time()
+        total += (end - start)
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources() [s]",(end - start))
        print("LOG::TIME: In workflow()    / parse_resources()")

@@ -266,7 +266,7 @@ class Node(CTENode):
        self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
        end = time.time()
        print("- - - - - - - - - - \n")
-        print ("LOG::TIME: ",(end - start))
+        total += (end - start)
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start))
        print("LOG::TIME: In workflow()    / extract_ngrams()")
        
@@ -275,9 +275,9 @@ class Node(CTENode):
        from analysis.functions import do_tfidf
        do_tfidf(self)
        end = time.time()
+        total += (end - start)
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
        print("LOG::TIME: In workflow()    / do_tfidf()")
-
        print("In workflow() END")
        self.metadata['Processing'] = 0
        self.save()

--- a/parsing/FileParsers/PubmedFileParser.py
+++ b/parsing/FileParsers/PubmedFileParser.py
@@ -80,12 +80,19 @@ class PubmedFileParser(FileParser):
            if len(RealDate)>4:
                if len(RealDate)>8:
                    try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
-                    except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
+                    except: 
+                        try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
+                        except: Decision=False
                else: 
                    try: Decision = datetime.strptime(RealDate, '%Y %b').date()
-                    except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
-            else: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
+                    except: 
+                        try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
+                        except: Decision=False
+            else: 
+                try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
+                except: Decision=False

+            if Decision!=False:
                if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
                if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
                if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)

--- a/scrap_pubmed/MedlineFetcherDavid2015.py
+++ b/scrap_pubmed/MedlineFetcherDavid2015.py
@@ -105,6 +105,13 @@ class MedlineFetcher:
            print(threading.current_thread().name, filename+" OK")
            return filename

+    # generic! Debug helper: opens item[0] and hands back the urlopen() result; nothing is written to disk here.
+    def test_downloadFile(self, item):
+        url = item[0]  # item is indexed as [url, filename]
+        filename = item[1]  # read but not used anywhere below
+        print("\tin downloadFile:")
+        data = urlopen(url)  # not read or closed here; both are left to the caller
+        return data

    # generic!
    def do_work(self,item):
@@ -124,7 +131,10 @@ class MedlineFetcher:
    def worker2(self):
        while True:
            item = self.q.get()
-            self.firstResults.append(self.downloadFile(item))
+            results = []  # NOTE(review): never read in this method -- looks like a leftover
+            try: result = self.downloadFile(item)
+            except: result = False  # bare except: any failure is stored as False instead of propagating
+            self.firstResults.append(result)  # firstResults can now hold False entries next to download results
            self.q.task_done()

    def chunks(self , l , n):

--- a/scrap_pubmed/views.py
+++ b/scrap_pubmed/views.py
@@ -43,6 +43,32 @@ def getGlobalStats(request ):
 	return JsonHttpResponse(data)


+# Test view: on POST, sends request.POST["query"] to the ISTEX document API and returns the reply text.
+def getGlobalStatsISTEXT(request ):
+	print(request.method)
+	alist = ["bar","foo"]  # placeholder payload; returned unchanged when the request is not a POST
+
+	if request.method == "POST":
+		N = 100  # only logged below; it is not added to the request URL
+		query = request.POST["query"]  # outside the try below, so a missing key is not caught here
+		print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
+		print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
+		query_string = query.replace(" ","+")  # NOTE(review): only spaces are rewritten; & or # in the query reach the URL unescaped
+		url = "http://api.istex.fr/document/?q="+query_string+"&output=*"
+
+		tasks = MedlineFetcher()
+
+		filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))  # passed on as item[1]
+
+		try: 
+			thedata = tasks.test_downloadFile( [url,filename] )
+			alist = thedata.read().decode('utf-8')  # alist becomes one str (the whole body); thedata is not closed
+		except Exception as error:
+			alist = [str(error)]  # on failure the payload is a one-element list with the error text
+	data = alist
+	return JsonHttpResponse(data)
+
+
 def doTheQuery(request , project_id):
 	alist = ["hola","mundo"]

@@ -97,8 +123,13 @@ def doTheQuery(request , project_id):
 				filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
 				tasks.q.put( [url , filename]) #put a task in th queue
 			tasks.q.join() # wait until everything is finished
+
+			dwnldsOK = 0
 			for filename in tasks.firstResults:
+				if filename!=False:
 					corpus.add_resource( user=request.user, type=resource_type, file=filename )
+					dwnldsOK+=1
+			if dwnldsOK == 0: return JsonHttpResponse(["fail"])

 			# do the WorkFlow
 			try:
@@ -146,7 +177,8 @@ def testISTEX(request , project_id):
 			urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
 		print(urlreqs)

-		# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
+		urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
+		print(urlreqs)

 		resource_type = ResourceType.objects.get(name="istext" )


--- a/templates/project.html
+++ b/templates/project.html
@@ -313,6 +313,9 @@
 			console.log("disabling "+"#"+value.id)
 			$("#"+value.id).prop('onclick',null);

+			var theType = $("#id_type option:selected").html();
+
+			if(theType=="pubmed") {
 			    $.ajax({
 				  // contentType: "application/json",
 			      url: window.location.origin+"/tests/pubmedquery",
@@ -348,9 +351,39 @@
 			    });	
 			}

+			if(theType=="istext") {
+				console.log(window.location.origin+"tests/istextquery")
+			    $.ajax({
+				  // contentType: "application/json",
+			      url: window.location.origin+"/tests/istextquery",
+			      data: formData,
+			      type: 'POST',
+			      beforeSend: function(xhr) {
+			        xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
+			      },
+			      success: function(data) {
+					console.log("in getGlobalResults")
+			        console.log(data)
+					console.log("enabling "+"#"+value.id)
+					$("#"+value.id).attr('onclick','getGlobalResults(this);');
+					// $("#submit_thing").prop('disabled' , false)
+					$("#submit_thing").html("Process a 100 sample!")
+
+					$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+data[0]+"</i><br>")
+		            thequeries = data
+
+			      },
+			        error: function(result) {
+			            console.log("Data not found");
+			        }
+			    });	
+			}
+		}
+
 		// CSS events for selecting one Radio-Input
 		function FileOrNotFile( value ) {
 			var showfile = JSON.parse(value)
+			var theType = $("#id_type option:selected").html();
 			// @upload-file events
 			if (showfile) {
 				console.log("You've clicked the YES") 
@@ -376,7 +409,7 @@

 				$( "#id_name" ).on('input',function(e){
 					console.log($(this).val())
-					testAjax( $(this).val() )
+					if(theType=="pubmed") testPUBMED( $(this).val() )
 				}); 
 			}
 		}
@@ -384,8 +417,8 @@
 		//CSS events for changing the Select element
 		function CustomForSelect( selected ) {
 			// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
-			if(selected=="pubmed") {
-				console.log("show the button")
+			if(selected=="pubmed" || selected=="istext") {
+				console.log("show the button for: "+selected)
 				$("#pubmedcrawl").css("visibility", "visible"); 
 				$("#pubmedcrawl").show();
 				$("#file_yes").click();
@@ -414,7 +447,7 @@
 			return data;
 		}

-		function testAjax( query ) {
+		function testPUBMED( query ) {
 			LastData = []
 			if(!query || query=="") return;
 			var pubmedquery = encodeURIComponent(query)
@@ -450,7 +483,7 @@
 		      success: function(data) {
 				console.log("ajax_success: in testISTEX()")
 		        console.log(data)
-		        location.reload();
+		        // location.reload();
 		      },
 		        error: function(result) {
 		            console.log("in testISTEX(). Data not found");