Commit 966cea3a authored by PkSM3

[UPDATE] ISTex fix and ngrams-table advances

parent dfec6bab
...@@ -302,9 +302,19 @@ def newpaginatorJSON(request , corpus_id): ...@@ -302,9 +302,19 @@ def newpaginatorJSON(request , corpus_id):
# project = session.query(Node).filter(Node.id==project_id).first() # project = session.query(Node).filter(Node.id==project_id).first()
corpus = session.query(Node).filter(Node.id==corpus_id).first() corpus = session.query(Node).filter(Node.id==corpus_id).first()
type_document_id = cache.NodeType['Document'].id type_document_id = cache.NodeType['Document'].id
documents = session.query(Node).filter(Node.parent_id==corpus_id , Node.type_id == type_document_id ).all() user_id = request.user.id
# documents = session.query(Node).filter(Node.parent_id==corpus_id , Node.type_id == type_document_id ).all()
documents = session.query(Node).filter(Node.user_id == user_id , Node.parent_id==corpus_id , Node.type_id == type_document_id ).all()
# for doc in documents:
# print(doc.name)
# if "publication_date" in doc.hyperdata:
# print(doc.hyperdata["publication_date"])
# else: print ("No date")
# print(" - - - - - - -")
# print(" = = = = = = = = = = = = = = = == = = = ")
filtered_docs = [] filtered_docs = []
for doc in documents: for doc in documents:
if "publication_date" in doc.hyperdata: if "publication_date" in doc.hyperdata:
......
from django.db import transaction
from lxml import etree
from .FileParser import FileParser
from ..NgramsExtractors import *
from datetime import datetime
from io import BytesIO
import json
class ISText(FileParser):
    """Parser for ISTEX JSON result files.

    Every entry of the file's top-level ``"hits"`` array is converted into
    one flat hyperdata dictionary.
    """

    @staticmethod
    def _merge_dates(doc_date, host_date):
        """Pick the publication date when both document and host have one.

        The host date always wins.  When the two dates have the same length
        it is returned untouched; otherwise it is treated as a compact
        ``YYYY[MM[DD]]`` string and rewritten as ``YYYY[-MM[-DD]]``.
        """
        if len(doc_date) == len(host_date):
            return host_date
        parts = [host_date[:4]]
        if len(host_date) > 4:
            parts.append(host_date[4:6])
        if len(host_date) > 6:
            parts.append(host_date[6:8])
        return "-".join(parts)

    def _parse(self, thefile):
        """Read the JSON file at path ``thefile`` and return its documents.

        Returns a list with one dict per hit.  Possible keys: ``id``,
        ``source``, ``title``, ``genre``, ``doi``, ``publication_date``,
        ``authorsRAW``, ``authors``, ``keywords`` and ``journal``; a key is
        only present when the corresponding data exists in the hit.
        """
        # The context manager closes the file even if json.load() raises.
        with open(thefile, "r") as json_file:
            data = json.load(json_file)

        # output hyperdata key -> key read from each ISTEX hit
        hyperdata_path = {
            "id": "id",
            "source": "corpusName",
            "title": "title",
            "genre": "genre",
            "doi": "doi",
            "host": "host",
            "publication_date": "pubdate",
            "authorsRAW": "author",
            "keywords": "keywords",
        }

        hyperdata_list = []
        for json_doc in data["hits"]:
            # A fresh dict per document: sharing one dict across iterations
            # made every list entry point at the same object and let fields
            # of earlier documents leak into later ones.
            hyperdata = {
                key: json_doc[path]
                for key, path in hyperdata_path.items()
                if path in json_doc
            }

            # Only the first DOI of the list is kept; an empty list is dropped.
            if isinstance(hyperdata.get("doi"), (list, tuple)):
                if hyperdata["doi"]:
                    hyperdata["doi"] = hyperdata["doi"][0]
                else:
                    del hyperdata["doi"]

            if "keywords" in hyperdata:
                hyperdata["keywords"] = ", ".join(
                    keyword["value"] for keyword in hyperdata["keywords"]
                )

            if "authorsRAW" in hyperdata:
                hyperdata["authors"] = ", ".join(
                    author["name"] for author in hyperdata["authorsRAW"]
                )

            # "host" is only a source of fallback values, never an output key.
            host = hyperdata.pop("host", None)
            if not isinstance(host, dict):
                host = {}

            # The host genre fills in for a document genre that is present
            # but empty.
            if host.get("genre") and "genre" in hyperdata and not hyperdata["genre"]:
                hyperdata["genre"] = host["genre"]
            if "title" in host:
                hyperdata["journal"] = host["title"]

            if "genre" in hyperdata and not hyperdata["genre"]:
                del hyperdata["genre"]

            if "pubdate" in host:
                if "publication_date" in hyperdata:
                    hyperdata["publication_date"] = self._merge_dates(
                        hyperdata["publication_date"], host["pubdate"]
                    )
                else:
                    # Previously a KeyError: the document has no date of its
                    # own, so the host date is used as is.
                    hyperdata["publication_date"] = host["pubdate"]
            elif "copyrightdate" in json_doc:
                # Without a host date the copyright date takes precedence
                # over the document's own pubdate.
                hyperdata["publication_date"] = json_doc["copyrightdate"]

            print("||", hyperdata.get("title"))
            hyperdata_list.append(hyperdata)

        print("=============================")
        print("\nlen list:", len(hyperdata_list))
        return hyperdata_list
...@@ -4,5 +4,5 @@ from .JstorFileParser import JstorFileParser ...@@ -4,5 +4,5 @@ from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser from .EuropressFileParser import EuropressFileParser
from .ISText import ISText from .ISTex import ISTex
from .CSVParser import CSVParser from .CSVParser import CSVParser
...@@ -102,10 +102,17 @@ def parse_resources(corpus, user=None, user_id=None): ...@@ -102,10 +102,17 @@ def parse_resources(corpus, user=None, user_id=None):
.filter(Node_Resource.parsed == False) .filter(Node_Resource.parsed == False)
) )
# make a new node for every parsed document of the corpus # make a new node for every parsed document of the corpus
print("HERE MOFOs")
print(resources_query)
dbg.show('analyze documents') dbg.show('analyze documents')
nodes = list() nodes = list()
for resource, resourcetype in resources_query: for resource, resourcetype in resources_query:
# print("resource: ",resource)
# print("resourcetype:",resourcetype)
# print(resourcetype.name)
# print(resource.file)
parser = parsers[resourcetype.name] parser = parsers[resourcetype.name]
# print(parser.parse(resource.file))
for hyperdata_dict in parser.parse(resource.file): for hyperdata_dict in parser.parse(resource.file):
# retrieve language ID from hyperdata # retrieve language ID from hyperdata
if 'language_iso2' in hyperdata_dict: if 'language_iso2' in hyperdata_dict:
...@@ -116,6 +123,7 @@ def parse_resources(corpus, user=None, user_id=None): ...@@ -116,6 +123,7 @@ def parse_resources(corpus, user=None, user_id=None):
else: else:
language_id = None language_id = None
# create new node # create new node
# print(hyperdata_dict.get('title', '')[:200])
node = Node( node = Node(
name = hyperdata_dict.get('title', '')[:200], name = hyperdata_dict.get('title', '')[:200],
parent_id = corpus_id, parent_id = corpus_id,
...@@ -154,6 +162,7 @@ def parse_resources(corpus, user=None, user_id=None): ...@@ -154,6 +162,7 @@ def parse_resources(corpus, user=None, user_id=None):
hyperdata.id, hyperdata.id,
hyperdata_value, hyperdata_value,
)) ))
for key, values in node_hyperdata_lists.items(): for key, values in node_hyperdata_lists.items():
bulk_insert(Node_Hyperdata, ['node_id', 'hyperdata_id', 'value_'+key], values) bulk_insert(Node_Hyperdata, ['node_id', 'hyperdata_id', 'value_'+key], values)
# mark the corpus as parsed # mark the corpus as parsed
......
...@@ -10,5 +10,6 @@ parsers = { ...@@ -10,5 +10,6 @@ parsers = {
'Europress (French)' : EuropressFileParser, 'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser, 'Europress (English)' : EuropressFileParser,
'CSVParser' : CSVParser, 'CSVParser' : CSVParser,
'ISTex' : ISTex,
} }
...@@ -61,7 +61,7 @@ def getGlobalStatsISTEXT(request ): ...@@ -61,7 +61,7 @@ def getGlobalStatsISTEXT(request ):
alist = ["bar","foo"] alist = ["bar","foo"]
if request.method == "POST": if request.method == "POST":
N = 100 N = 1000
query = request.POST["query"] query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
...@@ -184,34 +184,111 @@ def doTheQuery(request , project_id): ...@@ -184,34 +184,111 @@ def doTheQuery(request , project_id):
def testISTEX(request , project_id): def testISTEX(request , project_id):
print("testISTEX:")
print(request.method) print(request.method)
alist = ["bar","foo"] alist = ["bar","foo"]
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.type_id == cache.NodeType['Project'].id)
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
if request.method == "POST": if request.method == "POST":
# print(alist) # print(alist)
query = "-" query = "-"
query_string = "-" query_string = "-"
N = 60 N = 1000
if "query" in request.POST: query = request.POST["query"] if "query" in request.POST: query = request.POST["query"]
if "string" in request.POST: query_string = request.POST["string"].replace(" ","+") if "string" in request.POST: query_string = request.POST["string"].replace(" ","+")
# if "N" in request.POST: N = request.POST["N"] # if "N" in request.POST: N = request.POST["N"]
print(query_string , query , N) print(query_string , query , N)
# urlreqs = [] urlreqs = []
# pagesize = 50 pagesize = 50
# tasks = MedlineFetcher() tasks = MedlineFetcher()
# chunks = list(tasks.chunks(range(N), pagesize)) chunks = list(tasks.chunks(range(N), pagesize))
# for k in chunks: for k in chunks:
# if (k[0]+pagesize)>N: pagesize = N-k[0] if (k[0]+pagesize)>N: pagesize = N-k[0]
# urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize)) urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
# print(urlreqs)
resourcetype = cache.ResourceType["ISTex"]
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = None,
hyperdata = {'Processing' : 1,}
)
session.add(corpus)
session.commit()
ensure_dir(request.user)
tasks = MedlineFetcher()
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults:
if filename!=False:
# add the uploaded resource to the corpus
add_resource(corpus,
user_id = request.user.id,
type_id = resourcetype.id,
file = filename,
)
dwnldsOK+=1
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
# print(urlreqs) # print(urlreqs)
try:
if not DEBUG:
apply_workflow.apply_async((corpus.id,),)
else:
thread = threading.Thread(target=apply_workflow, args=(corpus.id, ), daemon=True)
thread.start()
except Exception as error:
print('WORKFLOW ERROR')
print(error)
sleep(1)
return HttpResponseRedirect('/project/' + str(project_id))
# resource_type = ResourceType.objects.get(name="istext" ) # resource_type = ResourceType.objects.get(name="istext" )
# parent = Node.objects.get(id=project_id) # parent = Node.objects.get(id=project_id)
......
...@@ -211,20 +211,20 @@ function transformContent2(rec_id) { ...@@ -211,20 +211,20 @@ function transformContent2(rec_id) {
function overRide(elem) { function overRide(elem) {
var id = elem.id var id = elem.id
var current_flag = $("input[type='radio'][name='radios']:checked").val() var current_flag = $("input[type='radio'][name='radios']:checked").val()
var val = elem.checked
var this_newflag = (current_flag==AjaxRecords[id]["flag"])?false:current_flag var this_newflag = (current_flag==AjaxRecords[id]["flag"])?false:current_flag
console.log("striking: "+id+" | this-elem_flag: "+AjaxRecords[id]["flag"]+" | current_flag: "+current_flag) console.log("striking: "+id+" | this-elem_flag: "+AjaxRecords[id]["flag"]+" | current_flag: "+current_flag)
console.log("\t so the new flag is: "+this_newflag) console.log("\t so the new flag is: "+this_newflag)
if(this_newflag) // if(this_newflag)
FlagsBuffer[this_newflag][id] = true; // FlagsBuffer[this_newflag][id] = true;
else // else
delete FlagsBuffer[ AjaxRecords[id]["flag"] ][id]; // delete FlagsBuffer[ AjaxRecords[id]["flag"] ][id];
AjaxRecords[id]["flag"] = this_newflag; AjaxRecords[id]["flag"] = Mark_NGram ( id , AjaxRecords[id]["flag"] , this_newflag );
var sum__selected_elems = 0; var sum__selected_elems = 0;
for(var i in FlagsBuffer) for(var i in FlagsBuffer)
...@@ -255,8 +255,52 @@ function transformContent(rec_id , header , content) { ...@@ -255,8 +255,52 @@ function transformContent(rec_id , header , content) {
/**
 * Untick the "select all" checkbox and, when n-grams are waiting in the
 * "to_group" bucket of FlagsBuffer, open the #savemodal dialog and list
 * their names inside #to_group.
 */
function DeactivateSelectAll() {
    if( $("#multiple_selection").length>0 )
        $("#multiple_selection")[0].checked = false;

    if( Object.keys(FlagsBuffer["to_group"]).length ){
        $("#savemodal").modal("show").css({
            'margin-top': function () { //vertical centering
                console.log($(".modal-content").height())
                return ($(this).height() / 2);
            }
        });
        console.log("OH OH")
        console.log("There are some nodes in group array!:")

        var labels = []
        for (var fake_id in FlagsBuffer["to_group"]){
            console.log( AjaxRecords[fake_id] )
            labels.push(AjaxRecords[fake_id].name)
        }

        // Build the element instead of concatenating markup: the original
        // string opened <font> but closed </div>, and inserted n-gram names
        // as raw HTML. .text() keeps the names as plain text.
        $("#to_group").empty().append(
            $('<font color="blue"></font>').text( labels.join(" , ") )
        );
    }
}
/**
 * Record the flag change of one n-gram in FlagsBuffer.
 *
 * With a truthy `new_flag` the n-gram is registered in that bucket and
 * removed from every other bucket. With a falsy `new_flag` it is removed
 * from the `old_flag` bucket only.
 *
 * @param ngram_id  key of the n-gram inside the buckets
 * @param old_flag  flag the n-gram had so far (may be false)
 * @param new_flag  flag to apply, or false to unflag
 * @returns new_flag, unchanged, so callers can store it on the record
 */
function Mark_NGram( ngram_id , old_flag , new_flag ) {
    if (new_flag) {
        for (var f in FlagsBuffer) {
            if ( new_flag == f )
                FlagsBuffer[f][ngram_id] = true;
            else
                delete FlagsBuffer[f][ngram_id];
        }
    } else if (old_flag && FlagsBuffer[old_flag]) {
        // Guarded: an n-gram that was never flagged has old_flag === false,
        // and FlagsBuffer[false] is undefined, so the unguarded delete threw
        // a TypeError (e.g. when "select all" is unticked).
        delete FlagsBuffer[old_flag][ngram_id];
    }
    return new_flag;
}
/**
 * Handler of the modal's "Save" button. For now it only logs the record
 * of every n-gram currently held in the "to_group" bucket.
 */
function GroupNGrams() {
    var pending = FlagsBuffer["to_group"];
    for (var ngram_id in pending) {
        console.log( AjaxRecords[ngram_id] )
    }
}
//generic enough //generic enough
function ulWriter(rowIndex, record, columns, cellWriter) { function ulWriter(rowIndex, record, columns, cellWriter) {
...@@ -282,10 +326,21 @@ function ulWriter(rowIndex, record, columns, cellWriter) { ...@@ -282,10 +326,21 @@ function ulWriter(rowIndex, record, columns, cellWriter) {
} }
function SelectAll( the_checkbox ) { function SelectAll( the_checkbox ) {
var the_flag = $("input[type='radio'][name='radios']:checked").val() var current_flag = $("input[type='radio'][name='radios']:checked").val()
$("tbody tr").each(function (i, row) { $("tbody tr").each(function (i, row) {
var id = $(row).data('stuff') var id = $(row).data('stuff')
AjaxRecords[id]["flag"] = (the_checkbox.checked)?the_flag:false; // AjaxRecords[id]["flag"] = (the_checkbox.checked)?the_flag:false;
var this_newflag = (the_checkbox.checked)?current_flag:false;
// console.log("striking: "+id+" | this-elem_flag: "+AjaxRecords[id]["flag"]+" | current_flag: "+current_flag)
// console.log("\t so the new flag is: "+this_newflag)
AjaxRecords[id]["flag"] = Mark_NGram ( id , AjaxRecords[id]["flag"] , this_newflag );
}); });
MyTable.data('dynatable').dom.update(); MyTable.data('dynatable').dom.update();
} }
...@@ -302,6 +357,8 @@ $("#Clean_All").click(function(){ ...@@ -302,6 +357,8 @@ $("#Clean_All").click(function(){
for(var j in FlagsBuffer[i]) for(var j in FlagsBuffer[i])
delete FlagsBuffer[i][j]; delete FlagsBuffer[i][j];
$("#Clean_All, #Save_All").attr( "disabled", "disabled" );
}); });
$("#Save_All").click(function(){ $("#Save_All").click(function(){
......
...@@ -282,8 +282,13 @@ ...@@ -282,8 +282,13 @@
$("#submit_thing").prop('onclick',null); $("#submit_thing").prop('onclick',null);
var theType = $("#id_type option:selected").html(); var theType = $("#id_type option:selected").html();
console.log("consoling the typeeee: ")
console.log(theType)
if(theType=="Pubmed (xml format)") doTheQuery(); if(theType=="Pubmed (xml format)") doTheQuery();
if(theType=="istex") { if(theType=="ISTex") {
var origQuery = $("#id_name").val()
console.log("printing the results:")
console.log(origQuery)
testISTEX(origQuery.replace(" ","+"),1000) testISTEX(origQuery.replace(" ","+"),1000)
} }
} }
...@@ -354,7 +359,7 @@ ...@@ -354,7 +359,7 @@
}); });
} }
if(theType=="istext") { if(theType=="ISTex") {
console.log(window.location.origin+"tests/istextquery") console.log(window.location.origin+"tests/istextquery")
$.ajax({ $.ajax({
// contentType: "application/json", // contentType: "application/json",
...@@ -365,8 +370,7 @@ ...@@ -365,8 +370,7 @@
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken")); xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
}, },
success: function(data) { success: function(data) {
console.log("in getGlobalResults") console.log("in getGlobalResults: Ajax(ISTex)")
console.log(data)
console.log("enabling "+"#"+value.id) console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);'); $("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false) // $("#submit_thing").prop('disabled' , false)
...@@ -374,10 +378,10 @@ ...@@ -374,10 +378,10 @@
thequeries = data thequeries = data
var N=data.length,k=0; var N=data.length,k=0;
console.log("N: "+N)
// for(var i in thequeries) N += thequeries[i].count // for(var i in thequeries) N += thequeries[i].count
if( N>1) { if( N>1) {
var total = JSON.parse(data).total var total = JSON.parse(data).total
console.log("N: "+total)
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+total+" publications.</i><br>") $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+total+" publications.</i><br>")
$('#submit_thing').prop('disabled', false); $('#submit_thing').prop('disabled', false);
} else { } else {
...@@ -422,7 +426,8 @@ ...@@ -422,7 +426,8 @@
$( "#id_name" ).on('input',function(e){ $( "#id_name" ).on('input',function(e){
console.log($(this).val()) console.log($(this).val())
if(theType=="Pubmed (xml format)") testPUBMED( $(this).val() ) if(theType=="Pubmed (xml format)")
testPUBMED( $(this).val() )
}); });
} }
} }
...@@ -430,7 +435,7 @@ ...@@ -430,7 +435,7 @@
//CSS events for changing the Select element //CSS events for changing the Select element
function CustomForSelect( selected ) { function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events // show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if(selected=="Pubmed (xml format)" || selected=="istext") { if(selected=="Pubmed (xml format)" || selected=="ISTex") {
// if(selected=="pubmed") { // if(selected=="pubmed") {
console.log("show the button for: "+selected) console.log("show the button for: "+selected)
$("#pubmedcrawl").css("visibility", "visible"); $("#pubmedcrawl").css("visibility", "visible");
...@@ -497,7 +502,7 @@ ...@@ -497,7 +502,7 @@
success: function(data) { success: function(data) {
console.log("ajax_success: in testISTEX()") console.log("ajax_success: in testISTEX()")
console.log(data) console.log(data)
// location.reload(); location.reload();
}, },
error: function(result) { error: function(result) {
console.log("in testISTEX(). Data not found"); console.log("in testISTEX(). Data not found");
......
...@@ -249,6 +249,33 @@ input[type=radio]:checked + label { ...@@ -249,6 +249,33 @@ input[type=radio]:checked + label {
</div> </div>
</div> </div>
<div id="savemodal" class="modal fade">
<div class="modal-dialog">
<div class="modal-content">
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
<h4 class="modal-title">Group NGrams</h4>
</div>
<div class="modal-body form-horizontal">
Do you want to merge this elements before continuing?:
<div id="to_group"></div>
</div>
<div class="modal-footer">
<button id="closesavemodal" type="button" class="btn btn-default" data-dismiss="modal">Close</button>
<button type="button" class="btn btn-primary" onclick="GroupNGrams();">Save</button>
</div>
</div>
</div>
</div>
<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script> <script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
<script src="{% static "js/charts/bootstrap.min.js" %}"></script> <script src="{% static "js/charts/bootstrap.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/jquery/jquery.dynatable.js" %}"></script> <script type="text/javascript" src="{% static "js/jquery/jquery.dynatable.js" %}"></script>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment