Commit 966cea3a authored by PkSM3's avatar PkSM3

[UPDATE] ISTex fix and ngrams-table advances

parent dfec6bab
......@@ -302,9 +302,19 @@ def newpaginatorJSON(request , corpus_id):
# project = session.query(Node).filter(Node.id==project_id).first()
corpus = session.query(Node).filter(Node.id==corpus_id).first()
type_document_id = cache.NodeType['Document'].id
documents = session.query(Node).filter(Node.parent_id==corpus_id , Node.type_id == type_document_id ).all()
user_id = request.user.id
# documents = session.query(Node).filter(Node.parent_id==corpus_id , Node.type_id == type_document_id ).all()
documents = session.query(Node).filter(Node.user_id == user_id , Node.parent_id==corpus_id , Node.type_id == type_document_id ).all()
# for doc in documents:
# print(doc.name)
# if "publication_date" in doc.hyperdata:
# print(doc.hyperdata["publication_date"])
# else: print ("No date")
# print(" - - - - - - -")
# print(" = = = = = = = = = = = = = = = == = = = ")
filtered_docs = []
for doc in documents:
if "publication_date" in doc.hyperdata:
......
from django.db import transaction
from lxml import etree
from .FileParser import FileParser
from ..NgramsExtractors import *
from datetime import datetime
from io import BytesIO
import json
class ISText(FileParser):
    """Parser for ISTex API result files (JSON), producing hyperdata dicts."""

    def _parse(self, thefile):
        """Parse an ISTex JSON response file into a list of hyperdata dicts.

        :param thefile: path to a JSON file holding an ISTex API response
                        with a top-level "hits" list of document records.
        :return: list of dicts, one per document, with normalized keys
                 (title, authors, journal, publication_date, ...).
        """
        # Use a context manager so the file is closed even if json.load raises.
        with open(thefile, "r") as json_data:
            data = json.load(json_data)
        json_docs = data["hits"]
        hyperdata_list = []
        # Maps our normalized key -> key in the raw ISTex record.
        hyperdata_path = {
            "id"               : "id",
            "source"           : 'corpusName',
            "title"            : 'title',
            "genre"            : "genre",
            "doi"              : 'doi',
            "host"             : 'host',
            "publication_date" : 'pubdate',
            "authorsRAW"       : 'author',
            "keywords"         : "keywords",
        }
        for json_doc in json_docs:
            # BUGFIX: build a fresh dict per document. The original created
            # one dict before the loop, so every element of hyperdata_list
            # aliased the same object and keys leaked between documents.
            hyperdata = {}
            for key, path in hyperdata_path.items():
                # Copy only the fields present in this record.
                try:
                    hyperdata[key] = json_doc[path]
                except KeyError:
                    pass
            # ISTex returns "doi" as a list; keep the first entry only.
            if "doi" in hyperdata:
                hyperdata["doi"] = hyperdata["doi"][0]
            # Flatten keyword objects ({"value": ...}) into one CSV string.
            if "keywords" in hyperdata:
                keywords = [keyw["value"] for keyw in hyperdata["keywords"]]
                hyperdata["keywords"] = ", ".join(keywords)
            # The "host" sub-record (the containing journal/book) may carry
            # a better genre, an extra publication date, and the journal name.
            if "host" in hyperdata:
                if "genre" in hyperdata["host"] and len(hyperdata["host"]["genre"]) > 0:
                    if "genre" in hyperdata and len(hyperdata["genre"]) == 0:
                        hyperdata["genre"] = hyperdata["host"]["genre"]
                if "pubdate" in hyperdata["host"]:
                    # Keep both candidate dates; resolved below.
                    onebuffer = hyperdata["publication_date"]
                    hyperdata["publication_date"] = [onebuffer, hyperdata["host"]["pubdate"]]
                if "title" in hyperdata["host"]:
                    hyperdata["journal"] = hyperdata["host"]["title"]
            # Join author names into one display string.
            if "authorsRAW" in hyperdata:
                names = [author["name"] for author in hyperdata["authorsRAW"]]
                hyperdata["authors"] = ", ".join(names)
            # The raw host record is no longer needed once flattened.
            if "host" in hyperdata:
                hyperdata.pop("host")
            # Drop an empty genre rather than storing a useless value.
            if "genre" in hyperdata and len(hyperdata["genre"]) == 0:
                hyperdata.pop("genre")
            if "publication_date" in hyperdata and isinstance(hyperdata["publication_date"], list):
                if len(hyperdata["publication_date"]) > 1:
                    d1 = hyperdata["publication_date"][0]
                    d2 = hyperdata["publication_date"][1]
                    if len(d1) == len(d2):
                        # Same precision: prefer the host's date.
                        hyperdata["publication_date"] = d2
                    else:
                        # d2 is a compact YYYY[MM[DD]] string; expand it to
                        # ISO-style YYYY[-MM[-DD]].
                        fulldate = d2[:4]
                        if len(d2) > 4:
                            fulldate += "-" + d2[4:6]
                        if len(d2) > 6:
                            fulldate += "-" + d2[6:8]
                        hyperdata["publication_date"] = fulldate
                else:
                    # Only one (or zero) candidate dates: fall back to the
                    # record's copyright date when available.
                    if "copyrightdate" in json_doc:
                        hyperdata["publication_date"] = json_doc["copyrightdate"]
            else:
                if "copyrightdate" in json_doc:
                    hyperdata["publication_date"] = json_doc["copyrightdate"]
            print("||", hyperdata["title"])
            hyperdata_list.append(hyperdata)
        print("=============================")
        print("\nlen list:", len(hyperdata_list))
        return hyperdata_list
......@@ -4,5 +4,5 @@ from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
from .ISTex import ISTex
from .CSVParser import CSVParser
......@@ -102,10 +102,17 @@ def parse_resources(corpus, user=None, user_id=None):
.filter(Node_Resource.parsed == False)
)
# make a new node for every parsed document of the corpus
print("HERE MOFOs")
print(resources_query)
dbg.show('analyze documents')
nodes = list()
for resource, resourcetype in resources_query:
# print("resource: ",resource)
# print("resourcetype:",resourcetype)
# print(resourcetype.name)
# print(resource.file)
parser = parsers[resourcetype.name]
# print(parser.parse(resource.file))
for hyperdata_dict in parser.parse(resource.file):
# retrieve language ID from hyperdata
if 'language_iso2' in hyperdata_dict:
......@@ -116,6 +123,7 @@ def parse_resources(corpus, user=None, user_id=None):
else:
language_id = None
# create new node
# print(hyperdata_dict.get('title', '')[:200])
node = Node(
name = hyperdata_dict.get('title', '')[:200],
parent_id = corpus_id,
......@@ -154,6 +162,7 @@ def parse_resources(corpus, user=None, user_id=None):
hyperdata.id,
hyperdata_value,
))
for key, values in node_hyperdata_lists.items():
bulk_insert(Node_Hyperdata, ['node_id', 'hyperdata_id', 'value_'+key], values)
# mark the corpus as parsed
......
......@@ -10,5 +10,6 @@ parsers = {
'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser,
'CSVParser' : CSVParser,
'ISTex' : ISTex,
}
......@@ -61,7 +61,7 @@ def getGlobalStatsISTEXT(request ):
alist = ["bar","foo"]
if request.method == "POST":
N = 100
N = 1000
query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
......@@ -184,34 +184,111 @@ def doTheQuery(request , project_id):
# NOTE(review): this block is shown with indentation flattened by a diff
# rendering, and contains both removed and added lines from the commit
# (duplicated assignments below); comments mark the suspicious spots.
def testISTEX(request , project_id):
# Django view: validate the project/user, build ISTex API page URLs for the
# query, download them with a worker pool, attach each downloaded file as a
# corpus resource, then kick off the parsing workflow.
print("testISTEX:")
print(request.method)
alist = ["bar","foo"]
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.type_id == cache.NodeType['Project'].id)
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
if request.method == "POST":
# print(alist)
query = "-"
query_string = "-"
# NOTE(review): two consecutive assignments left by the diff; the later
# one wins, so N is 1000 at runtime.
N = 60
N = 1000
if "query" in request.POST: query = request.POST["query"]
# "+"-join the words so the string can be embedded in the API URL.
if "string" in request.POST: query_string = request.POST["string"].replace(" ","+")
# if "N" in request.POST: N = request.POST["N"]
print(query_string , query , N)
# urlreqs = []
# pagesize = 50
# tasks = MedlineFetcher()
# chunks = list(tasks.chunks(range(N), pagesize))
# for k in chunks:
# if (k[0]+pagesize)>N: pagesize = N-k[0]
# urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
# print(urlreqs)
# Split the first N results into pages of up to 50 and build one API
# request URL per page; the last page is shrunk to fit exactly N.
urlreqs = []
pagesize = 50
tasks = MedlineFetcher()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
resourcetype = cache.ResourceType["ISTex"]
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = None,
hyperdata = {'Processing' : 1,}
)
session.add(corpus)
session.commit()
ensure_dir(request.user)
tasks = MedlineFetcher()
# Spawn 8 daemon workers that consume (url, filename) pairs from tasks.q.
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished
# Count successful downloads and register each file on the corpus.
dwnldsOK = 0
for filename in tasks.firstResults:
if filename!=False:
# add the uploaded resource to the corpus
add_resource(corpus,
user_id = request.user.id,
type_id = resourcetype.id,
file = filename,
)
dwnldsOK+=1
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
# print(urlreqs)
# Launch the parsing workflow: async via Celery in production, in a
# daemon thread when DEBUG is set.
try:
if not DEBUG:
apply_workflow.apply_async((corpus.id,),)
else:
thread = threading.Thread(target=apply_workflow, args=(corpus.id, ), daemon=True)
thread.start()
except Exception as error:
print('WORKFLOW ERROR')
print(error)
sleep(1)
return HttpResponseRedirect('/project/' + str(project_id))
# resource_type = ResourceType.objects.get(name="istext" )
# parent = Node.objects.get(id=project_id)
......
......@@ -211,20 +211,20 @@ function transformContent2(rec_id) {
function overRide(elem) {
var id = elem.id
var current_flag = $("input[type='radio'][name='radios']:checked").val()
var val = elem.checked
var this_newflag = (current_flag==AjaxRecords[id]["flag"])?false:current_flag
console.log("striking: "+id+" | this-elem_flag: "+AjaxRecords[id]["flag"]+" | current_flag: "+current_flag)
console.log("\t so the new flag is: "+this_newflag)
if(this_newflag)
FlagsBuffer[this_newflag][id] = true;
else
delete FlagsBuffer[ AjaxRecords[id]["flag"] ][id];
// if(this_newflag)
// FlagsBuffer[this_newflag][id] = true;
// else
// delete FlagsBuffer[ AjaxRecords[id]["flag"] ][id];
AjaxRecords[id]["flag"] = this_newflag;
AjaxRecords[id]["flag"] = Mark_NGram ( id , AjaxRecords[id]["flag"] , this_newflag );
var sum__selected_elems = 0;
for(var i in FlagsBuffer)
......@@ -255,9 +255,53 @@ function transformContent(rec_id , header , content) {
// Untick the "select all" master checkbox and, if any ngrams are queued
// for grouping, show the confirmation modal listing their names.
function DeactivateSelectAll() {
    if( $("#multiple_selection").length>0 )
        $("#multiple_selection")[0].checked = false;
    if( Object.keys(FlagsBuffer["to_group"]).length ){
        // Pop the "Group NGrams" modal, vertically centered.
        $("#savemodal").modal("show").css({
            'margin-top': function () { //vertical centering
                console.log($(".modal-content").height())
                return ($(this).height() / 2);
            }
        });
        console.log("OH OH")
        console.log("There are some nodes in group array!:")
        // Collect the display names of the ngrams flagged for grouping.
        var labels = []
        for (var i in FlagsBuffer["to_group"]){
            var fake_id = i
            console.log( AjaxRecords[fake_id] )
            labels.push(AjaxRecords[fake_id].name)
        }
        // BUGFIX: the <font> element was wrongly closed with </div>;
        // close it with </font> so the markup is well-formed.
        $("#to_group").html( '<font color="blue">' + labels.join(" , ") + '</font>' );
    }
}
// Keep FlagsBuffer consistent for one ngram: an ngram may live in at most
// one flag bucket at a time. Returns the flag that was applied.
function Mark_NGram( ngram_id , old_flag , new_flag ) {
    if (!new_flag) {
        // Flag cleared: just drop the ngram from its previous bucket.
        delete FlagsBuffer[ old_flag ][ngram_id];
        return new_flag;
    }
    // Flag set: register under the matching bucket, purge from every other.
    for (var bucket in FlagsBuffer) {
        if (bucket == new_flag)
            FlagsBuffer[bucket][ngram_id] = true;
        else
            delete FlagsBuffer[bucket][ngram_id];
    }
    return new_flag;
}
// Debug stub: dump every record currently flagged for grouping.
function GroupNGrams() {
    for (var rec_id in FlagsBuffer["to_group"]) {
        console.log( AjaxRecords[rec_id] )
    }
}
//generic enough
function ulWriter(rowIndex, record, columns, cellWriter) {
// pr("\tulWriter: "+record.id)
......@@ -282,10 +326,21 @@ function ulWriter(rowIndex, record, columns, cellWriter) {
}
// Apply (or clear) the currently selected radio flag to every visible row,
// then refresh the dynatable view.
// NOTE(review): this block is a flattened diff; it contains both the old and
// the new version of some lines (the_flag vs current_flag) — see notes below.
function SelectAll( the_checkbox ) {
// NOTE(review): duplicated assignments from the diff; both read the same
// checked radio value, so behavior is unaffected.
var the_flag = $("input[type='radio'][name='radios']:checked").val()
var current_flag = $("input[type='radio'][name='radios']:checked").val()
$("tbody tr").each(function (i, row) {
var id = $(row).data('stuff')
// NOTE(review): leftover pre-change line — it sets the flag directly, but
// the Mark_NGram call below overwrites it and updates FlagsBuffer too.
AjaxRecords[id]["flag"] = (the_checkbox.checked)?the_flag:false;
// AjaxRecords[id]["flag"] = (the_checkbox.checked)?the_flag:false;
// Checked -> apply the selected flag; unchecked -> clear (false).
var this_newflag = (the_checkbox.checked)?current_flag:false;
// console.log("striking: "+id+" | this-elem_flag: "+AjaxRecords[id]["flag"]+" | current_flag: "+current_flag)
// console.log("\t so the new flag is: "+this_newflag)
AjaxRecords[id]["flag"] = Mark_NGram ( id , AjaxRecords[id]["flag"] , this_newflag );
});
// Redraw the table so the flag changes become visible.
MyTable.data('dynatable').dom.update();
}
......@@ -302,6 +357,8 @@ $("#Clean_All").click(function(){
for(var j in FlagsBuffer[i])
delete FlagsBuffer[i][j];
$("#Clean_All, #Save_All").attr( "disabled", "disabled" );
});
$("#Save_All").click(function(){
......
......@@ -282,8 +282,13 @@
$("#submit_thing").prop('onclick',null);
var theType = $("#id_type option:selected").html();
console.log("consoling the typeeee: ")
console.log(theType)
if(theType=="Pubmed (xml format)") doTheQuery();
if(theType=="istex") {
if(theType=="ISTex") {
var origQuery = $("#id_name").val()
console.log("printing the results:")
console.log(origQuery)
testISTEX(origQuery.replace(" ","+"),1000)
}
}
......@@ -354,7 +359,7 @@
});
}
if(theType=="istext") {
if(theType=="ISTex") {
console.log(window.location.origin+"tests/istextquery")
$.ajax({
// contentType: "application/json",
......@@ -365,8 +370,7 @@
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("in getGlobalResults")
console.log(data)
console.log("in getGlobalResults: Ajax(ISTex)")
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
......@@ -374,10 +378,10 @@
thequeries = data
var N=data.length,k=0;
console.log("N: "+N)
// for(var i in thequeries) N += thequeries[i].count
if( N>1) {
var total = JSON.parse(data).total
console.log("N: "+total)
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+total+" publications.</i><br>")
$('#submit_thing').prop('disabled', false);
} else {
......@@ -422,7 +426,8 @@
$( "#id_name" ).on('input',function(e){
console.log($(this).val())
if(theType=="Pubmed (xml format)") testPUBMED( $(this).val() )
if(theType=="Pubmed (xml format)")
testPUBMED( $(this).val() )
});
}
}
......@@ -430,7 +435,7 @@
//CSS events for changing the Select element
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if(selected=="Pubmed (xml format)" || selected=="istext") {
if(selected=="Pubmed (xml format)" || selected=="ISTex") {
// if(selected=="pubmed") {
console.log("show the button for: "+selected)
$("#pubmedcrawl").css("visibility", "visible");
......@@ -497,7 +502,7 @@
success: function(data) {
console.log("ajax_success: in testISTEX()")
console.log(data)
// location.reload();
location.reload();
},
error: function(result) {
console.log("in testISTEX(). Data not found");
......
......@@ -249,6 +249,33 @@ input[type=radio]:checked + label {
</div>
</div>
<div id="savemodal" class="modal fade">
<div class="modal-dialog">
<div class="modal-content">
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
<h4 class="modal-title">Group NGrams</h4>
</div>
<div class="modal-body form-horizontal">
Do you want to merge these elements before continuing?
<div id="to_group"></div>
</div>
<div class="modal-footer">
<button id="closesavemodal" type="button" class="btn btn-default" data-dismiss="modal">Close</button>
<button type="button" class="btn btn-primary" onclick="GroupNGrams();">Save</button>
</div>
</div>
</div>
</div>
<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
<script src="{% static "js/charts/bootstrap.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/jquery/jquery.dynatable.js" %}"></script>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment