Commit 966cea3a authored by PkSM3's avatar PkSM3

[UPDATE] ISTex fix and ngrams-table advances

parent dfec6bab
......@@ -302,9 +302,19 @@ def newpaginatorJSON(request , corpus_id):
# project = session.query(Node).filter(Node.id==project_id).first()
corpus = session.query(Node).filter(Node.id==corpus_id).first()
type_document_id = cache.NodeType['Document'].id
documents = session.query(Node).filter(Node.parent_id==corpus_id , Node.type_id == type_document_id ).all()
user_id = request.user.id
# documents = session.query(Node).filter(Node.parent_id==corpus_id , Node.type_id == type_document_id ).all()
documents = session.query(Node).filter(Node.user_id == user_id , Node.parent_id==corpus_id , Node.type_id == type_document_id ).all()
# for doc in documents:
# print(doc.name)
# if "publication_date" in doc.hyperdata:
# print(doc.hyperdata["publication_date"])
# else: print ("No date")
# print(" - - - - - - -")
# print(" = = = = = = = = = = = = = = = == = = = ")
filtered_docs = []
for doc in documents:
if "publication_date" in doc.hyperdata:
......
from django.db import transaction
from lxml import etree
from .FileParser import FileParser
from ..NgramsExtractors import *
from datetime import datetime
from io import BytesIO
import json
class ISText(FileParser):
    """Parser for ISTex API result files (JSON), producing hyperdata dicts."""

    def _parse(self, thefile):
        """Parse an ISTex JSON response file into a list of hyperdata dicts.

        :param thefile: path to a JSON file holding an ISTex API response
                        with a top-level "hits" list of document records.
        :return: list of dicts, one per document, with normalized keys
                 (title, authors, journal, publication_date, ...).
        """
        # Use a context manager so the file is closed even if json.load raises.
        with open(thefile, "r") as json_data:
            data = json.load(json_data)
        json_docs = data["hits"]
        hyperdata_list = []
        # Maps our normalized key -> key in the raw ISTex record.
        hyperdata_path = {
            "id"               : "id",
            "source"           : 'corpusName',
            "title"            : 'title',
            "genre"            : "genre",
            "doi"              : 'doi',
            "host"             : 'host',
            "publication_date" : 'pubdate',
            "authorsRAW"       : 'author',
            "keywords"         : "keywords",
        }
        for json_doc in json_docs:
            # BUGFIX: build a fresh dict per document. The original created
            # one dict before the loop, so every element of hyperdata_list
            # aliased the same object and keys leaked between documents.
            hyperdata = {}
            for key, path in hyperdata_path.items():
                # Copy only the fields present in this record.
                try:
                    hyperdata[key] = json_doc[path]
                except KeyError:
                    pass
            # ISTex returns "doi" as a list; keep the first entry only.
            if "doi" in hyperdata:
                hyperdata["doi"] = hyperdata["doi"][0]
            # Flatten keyword objects ({"value": ...}) into one CSV string.
            if "keywords" in hyperdata:
                keywords = [keyw["value"] for keyw in hyperdata["keywords"]]
                hyperdata["keywords"] = ", ".join(keywords)
            # The "host" sub-record (the containing journal/book) may carry
            # a better genre, an extra publication date, and the journal name.
            if "host" in hyperdata:
                if "genre" in hyperdata["host"] and len(hyperdata["host"]["genre"]) > 0:
                    if "genre" in hyperdata and len(hyperdata["genre"]) == 0:
                        hyperdata["genre"] = hyperdata["host"]["genre"]
                if "pubdate" in hyperdata["host"]:
                    # Keep both candidate dates; resolved below.
                    onebuffer = hyperdata["publication_date"]
                    hyperdata["publication_date"] = [onebuffer, hyperdata["host"]["pubdate"]]
                if "title" in hyperdata["host"]:
                    hyperdata["journal"] = hyperdata["host"]["title"]
            # Join author names into one display string.
            if "authorsRAW" in hyperdata:
                names = [author["name"] for author in hyperdata["authorsRAW"]]
                hyperdata["authors"] = ", ".join(names)
            # The raw host record is no longer needed once flattened.
            if "host" in hyperdata:
                hyperdata.pop("host")
            # Drop an empty genre rather than storing a useless value.
            if "genre" in hyperdata and len(hyperdata["genre"]) == 0:
                hyperdata.pop("genre")
            if "publication_date" in hyperdata and isinstance(hyperdata["publication_date"], list):
                if len(hyperdata["publication_date"]) > 1:
                    d1 = hyperdata["publication_date"][0]
                    d2 = hyperdata["publication_date"][1]
                    if len(d1) == len(d2):
                        # Same precision: prefer the host's date.
                        hyperdata["publication_date"] = d2
                    else:
                        # d2 is a compact YYYY[MM[DD]] string; expand it to
                        # ISO-style YYYY[-MM[-DD]].
                        fulldate = d2[:4]
                        if len(d2) > 4:
                            fulldate += "-" + d2[4:6]
                        if len(d2) > 6:
                            fulldate += "-" + d2[6:8]
                        hyperdata["publication_date"] = fulldate
                else:
                    # Only one (or zero) candidate dates: fall back to the
                    # record's copyright date when available.
                    if "copyrightdate" in json_doc:
                        hyperdata["publication_date"] = json_doc["copyrightdate"]
            else:
                if "copyrightdate" in json_doc:
                    hyperdata["publication_date"] = json_doc["copyrightdate"]
            print("||", hyperdata["title"])
            hyperdata_list.append(hyperdata)
        print("=============================")
        print("\nlen list:", len(hyperdata_list))
        return hyperdata_list
......@@ -4,5 +4,5 @@ from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
from .ISTex import ISTex
from .CSVParser import CSVParser
......@@ -102,10 +102,17 @@ def parse_resources(corpus, user=None, user_id=None):
.filter(Node_Resource.parsed == False)
)
# make a new node for every parsed document of the corpus
print("HERE MOFOs")
print(resources_query)
dbg.show('analyze documents')
nodes = list()
for resource, resourcetype in resources_query:
# print("resource: ",resource)
# print("resourcetype:",resourcetype)
# print(resourcetype.name)
# print(resource.file)
parser = parsers[resourcetype.name]
# print(parser.parse(resource.file))
for hyperdata_dict in parser.parse(resource.file):
# retrieve language ID from hyperdata
if 'language_iso2' in hyperdata_dict:
......@@ -116,6 +123,7 @@ def parse_resources(corpus, user=None, user_id=None):
else:
language_id = None
# create new node
# print(hyperdata_dict.get('title', '')[:200])
node = Node(
name = hyperdata_dict.get('title', '')[:200],
parent_id = corpus_id,
......@@ -154,6 +162,7 @@ def parse_resources(corpus, user=None, user_id=None):
hyperdata.id,
hyperdata_value,
))
for key, values in node_hyperdata_lists.items():
bulk_insert(Node_Hyperdata, ['node_id', 'hyperdata_id', 'value_'+key], values)
# mark the corpus as parsed
......
......@@ -10,5 +10,6 @@ parsers = {
'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser,
'CSVParser' : CSVParser,
'ISTex' : ISTex,
}
......@@ -61,7 +61,7 @@ def getGlobalStatsISTEXT(request ):
alist = ["bar","foo"]
if request.method == "POST":
N = 100
N = 1000
query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
......@@ -184,34 +184,111 @@ def doTheQuery(request , project_id):
# NOTE(review): this block is shown with indentation flattened by a diff
# rendering, and contains both removed and added lines from the commit
# (duplicated assignments below); comments mark the suspicious spots.
def testISTEX(request , project_id):
# Django view: validate the project/user, build ISTex API page URLs for the
# query, download them with a worker pool, attach each downloaded file as a
# corpus resource, then kick off the parsing workflow.
print("testISTEX:")
print(request.method)
alist = ["bar","foo"]
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.type_id == cache.NodeType['Project'].id)
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
if request.method == "POST":
# print(alist)
query = "-"
query_string = "-"
# NOTE(review): two consecutive assignments left by the diff; the later
# one wins, so N is 1000 at runtime.
N = 60
N = 1000
if "query" in request.POST: query = request.POST["query"]
# "+"-join the words so the string can be embedded in the API URL.
if "string" in request.POST: query_string = request.POST["string"].replace(" ","+")
# if "N" in request.POST: N = request.POST["N"]
print(query_string , query , N)
# urlreqs = []
# pagesize = 50
# tasks = MedlineFetcher()
# chunks = list(tasks.chunks(range(N), pagesize))
# for k in chunks:
# if (k[0]+pagesize)>N: pagesize = N-k[0]
# urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
# print(urlreqs)
# Split the first N results into pages of up to 50 and build one API
# request URL per page; the last page is shrunk to fit exactly N.
urlreqs = []
pagesize = 50
tasks = MedlineFetcher()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
resourcetype = cache.ResourceType["ISTex"]
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = None,
hyperdata = {'Processing' : 1,}
)
session.add(corpus)
session.commit()
ensure_dir(request.user)
tasks = MedlineFetcher()
# Spawn 8 daemon workers that consume (url, filename) pairs from tasks.q.
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished
# Count successful downloads and register each file on the corpus.
dwnldsOK = 0
for filename in tasks.firstResults:
if filename!=False:
# add the uploaded resource to the corpus
add_resource(corpus,
user_id = request.user.id,
type_id = resourcetype.id,
file = filename,
)
dwnldsOK+=1
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
# print(urlreqs)
# Launch the parsing workflow: async via Celery in production, in a
# daemon thread when DEBUG is set.
try:
if not DEBUG:
apply_workflow.apply_async((corpus.id,),)
else:
thread = threading.Thread(target=apply_workflow, args=(corpus.id, ), daemon=True)
thread.start()
except Exception as error:
print('WORKFLOW ERROR')
print(error)
sleep(1)
return HttpResponseRedirect('/project/' + str(project_id))
# resource_type = ResourceType.objects.get(name="istext" )
# parent = Node.objects.get(id=project_id)
......
......@@ -211,20 +211,20 @@ function transformContent2(rec_id) {
function overRide(elem) {
var id = elem.id
var current_flag = $("input[type='radio'][name='radios']:checked").val()
var val = elem.checked
var this_newflag = (current_flag==AjaxRecords[id]["flag"])?false:current_flag
console.log("striking: "+id+" | this-elem_flag: "+AjaxRecords[id]["flag"]+" | current_flag: "+current_flag)
console.log("\t so the new flag is: "+this_newflag)
if(this_newflag)
FlagsBuffer[this_newflag][id] = true;
else
delete FlagsBuffer[ AjaxRecords[id]["flag"] ][id];
// if(this_newflag)
// FlagsBuffer[this_newflag][id] = true;
// else
// delete FlagsBuffer[ AjaxRecords[id]["flag"] ][id];
AjaxRecords[id]["flag"] = this_newflag;
AjaxRecords[id]["flag"] = Mark_NGram ( id , AjaxRecords[id]["flag"] , this_newflag );
var sum__selected_elems = 0;
for(var i in FlagsBuffer)
......@@ -255,9 +255,53 @@ function transformContent(rec_id , header , content) {
// Untick the "select all" master checkbox and, if any ngrams are queued
// for grouping, show the confirmation modal listing their names.
function DeactivateSelectAll() {
    if( $("#multiple_selection").length>0 )
        $("#multiple_selection")[0].checked = false;
    if( Object.keys(FlagsBuffer["to_group"]).length ){
        // Pop the "Group NGrams" modal, vertically centered.
        $("#savemodal").modal("show").css({
            'margin-top': function () { //vertical centering
                console.log($(".modal-content").height())
                return ($(this).height() / 2);
            }
        });
        console.log("OH OH")
        console.log("There are some nodes in group array!:")
        // Collect the display names of the ngrams flagged for grouping.
        var labels = []
        for (var i in FlagsBuffer["to_group"]){
            var fake_id = i
            console.log( AjaxRecords[fake_id] )
            labels.push(AjaxRecords[fake_id].name)
        }
        // BUGFIX: the <font> element was wrongly closed with </div>;
        // close it with </font> so the markup is well-formed.
        $("#to_group").html( '<font color="blue">' + labels.join(" , ") + '</font>' );
    }
}
// Keep FlagsBuffer consistent for one ngram: an ngram may live in at most
// one flag bucket at a time. Returns the flag that was applied.
function Mark_NGram( ngram_id , old_flag , new_flag ) {
    if (!new_flag) {
        // Flag cleared: just drop the ngram from its previous bucket.
        delete FlagsBuffer[ old_flag ][ngram_id];
        return new_flag;
    }
    // Flag set: register under the matching bucket, purge from every other.
    for (var bucket in FlagsBuffer) {
        if (bucket == new_flag)
            FlagsBuffer[bucket][ngram_id] = true;
        else
            delete FlagsBuffer[bucket][ngram_id];
    }
    return new_flag;
}
// Debug stub: dump every record currently flagged for grouping.
function GroupNGrams() {
    for (var rec_id in FlagsBuffer["to_group"]) {
        console.log( AjaxRecords[rec_id] )
    }
}
//generic enough
function ulWriter(rowIndex, record, columns, cellWriter) {
// pr("\tulWriter: "+record.id)
......@@ -282,10 +326,21 @@ function ulWriter(rowIndex, record, columns, cellWriter) {
}
// Apply (or clear) the currently selected radio flag to every visible row,
// then refresh the dynatable view.
// NOTE(review): this block is a flattened diff; it contains both the old and
// the new version of some lines (the_flag vs current_flag) — see notes below.
function SelectAll( the_checkbox ) {
// NOTE(review): duplicated assignments from the diff; both read the same
// checked radio value, so behavior is unaffected.
var the_flag = $("input[type='radio'][name='radios']:checked").val()
var current_flag = $("input[type='radio'][name='radios']:checked").val()
$("tbody tr").each(function (i, row) {
var id = $(row).data('stuff')
// NOTE(review): leftover pre-change line — it sets the flag directly, but
// the Mark_NGram call below overwrites it and updates FlagsBuffer too.
AjaxRecords[id]["flag"] = (the_checkbox.checked)?the_flag:false;
// AjaxRecords[id]["flag"] = (the_checkbox.checked)?the_flag:false;
// Checked -> apply the selected flag; unchecked -> clear (false).
var this_newflag = (the_checkbox.checked)?current_flag:false;
// console.log("striking: "+id+" | this-elem_flag: "+AjaxRecords[id]["flag"]+" | current_flag: "+current_flag)
// console.log("\t so the new flag is: "+this_newflag)
AjaxRecords[id]["flag"] = Mark_NGram ( id , AjaxRecords[id]["flag"] , this_newflag );
});
// Redraw the table so the flag changes become visible.
MyTable.data('dynatable').dom.update();
}
......@@ -302,6 +357,8 @@ $("#Clean_All").click(function(){
for(var j in FlagsBuffer[i])
delete FlagsBuffer[i][j];
$("#Clean_All, #Save_All").attr( "disabled", "disabled" );
});
$("#Save_All").click(function(){
......
......@@ -282,8 +282,13 @@
$("#submit_thing").prop('onclick',null);
var theType = $("#id_type option:selected").html();
console.log("consoling the typeeee: ")
console.log(theType)
if(theType=="Pubmed (xml format)") doTheQuery();
if(theType=="istex") {
if(theType=="ISTex") {
var origQuery = $("#id_name").val()
console.log("printing the results:")
console.log(origQuery)
testISTEX(origQuery.replace(" ","+"),1000)
}
}
......@@ -354,7 +359,7 @@
});
}
if(theType=="istext") {
if(theType=="ISTex") {
console.log(window.location.origin+"tests/istextquery")
$.ajax({
// contentType: "application/json",
......@@ -365,8 +370,7 @@
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("in getGlobalResults")
console.log(data)
console.log("in getGlobalResults: Ajax(ISTex)")
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
......@@ -374,10 +378,10 @@
thequeries = data
var N=data.length,k=0;
console.log("N: "+N)
// for(var i in thequeries) N += thequeries[i].count
if( N>1) {
var total = JSON.parse(data).total
console.log("N: "+total)
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+total+" publications.</i><br>")
$('#submit_thing').prop('disabled', false);
} else {
......@@ -422,7 +426,8 @@
$( "#id_name" ).on('input',function(e){
console.log($(this).val())
if(theType=="Pubmed (xml format)") testPUBMED( $(this).val() )
if(theType=="Pubmed (xml format)")
testPUBMED( $(this).val() )
});
}
}
......@@ -430,7 +435,7 @@
//CSS events for changing the Select element
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if(selected=="Pubmed (xml format)" || selected=="istext") {
if(selected=="Pubmed (xml format)" || selected=="ISTex") {
// if(selected=="pubmed") {
console.log("show the button for: "+selected)
$("#pubmedcrawl").css("visibility", "visible");
......@@ -497,7 +502,7 @@
success: function(data) {
console.log("ajax_success: in testISTEX()")
console.log(data)
// location.reload();
location.reload();
},
error: function(result) {
console.log("in testISTEX(). Data not found");
......
......@@ -249,6 +249,33 @@ input[type=radio]:checked + label {
</div>
</div>
<div id="savemodal" class="modal fade">
<div class="modal-dialog">
<div class="modal-content">
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
<h4 class="modal-title">Group NGrams</h4>
</div>
<div class="modal-body form-horizontal">
Do you want to merge these elements before continuing?
<div id="to_group"></div>
</div>
<div class="modal-footer">
<button id="closesavemodal" type="button" class="btn btn-default" data-dismiss="modal">Close</button>
<button type="button" class="btn btn-primary" onclick="GroupNGrams();">Save</button>
</div>
</div>
</div>
</div>
<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
<script src="{% static "js/charts/bootstrap.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/jquery/jquery.dynatable.js" %}"></script>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment