Commit 2ea48b86 authored by Administrator's avatar Administrator

Merge branch 'unstable' into testing

parents a8dff456 4c89ca94
......@@ -274,14 +274,19 @@ def do_tfidf(corpus, reset=True):
NodeNodeNgram.objects.filter(nodex=corpus).delete()
if isinstance(corpus, Node) and corpus.type.name == "Corpus":
# print("\n- - - - - - - - - - ")
# for i in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
# print("^^^",i)
for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
for node_ngram in Node_Ngram.objects.filter(node=document):
try:
# print("\t",node_ngram.ngram)
nnn = NodeNodeNgram.objects.get(nodex=corpus, nodey=document, ngram=node_ngram.ngram)
except:
score = tfidf(corpus, document, node_ngram.ngram)
nnn = NodeNodeNgram(nodex=corpus, nodey=node_ngram.node, ngram=node_ngram.ngram, score=score)
nnn.save()
# print("- - - - - - - - - - \n")
else:
print("Only corpus implemented yet, you put instead:", type(corpus))
......
# URL routing for the gargantext_web Django project (Django 1.x-style
# `patterns(...)` URLconf; numeric captures are positional object ids).
from django.conf.urls import patterns, include, url
from django.contrib import admin
from django.contrib.auth.views import login
from gargantext_web import views
import gargantext_web.api

# Auto-discover each installed app's admin.py registrations.
admin.autodiscover()

urlpatterns = patterns('',
    # Admin views
    url(r'^admin/', include(admin.site.urls)),
    # NOTE(review): '^login/' includes the *admin* URLconf rather than a
    # dedicated login view (django.contrib.auth.views.login is imported but
    # unused) — confirm this is intentional.
    url(r'^login/', include(admin.site.urls)),
    url(r'^grappelli/', include('grappelli.urls')),

    # User views: home page plus project / corpus pages and deletion.
    url(r'^$', views.home),
    url(r'^projects/$', views.projects),
    url(r'^project/(\d+)/delete/$', views.delete_project),
    url(r'^project/(\d+)/$', views.project),
    url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
    url(r'^project/(\d+)/corpus/(\d+)/delete/$', views.delete_corpus),

    # Visualizations (graph explorer and matrix views for a corpus).
    url(r'^corpus/(\d+)/explorer$', views.explorer_graph),
    url(r'^corpus/(\d+)/matrix$', views.explorer_matrix),

    # Getting data: CSV / JSON endpoints consumed by the front-end charts.
    url(r'^chart/corpus/(\d+)/data.csv$', views.send_csv),
    url(r'^corpus/(\d+)/node_link.json$', views.node_link),
    url(r'^corpus/(\d+)/adjacency.json$', views.adjacency),

    # API endpoints.
    # NOTE(review): api.Root is routed without .as_view(), unlike the
    # class-based views below — verify it is a plain function-based view.
    url(r'^api$', gargantext_web.api.Root),
    url(r'^api/nodes/(\d+)/children/metadata$', gargantext_web.api.NodesChildrenMetatadata.as_view()),
    url(r'^api/nodes/(\d+)/children/queries$', gargantext_web.api.NodesChildrenQueries.as_view()),
    #url(r'^api/nodes$', gargantext_web.api.NodesController.get),
    url(r'^api/nodes/(\d+)/ngrams$', gargantext_web.api.CorpusController.ngrams),
    url(r'^api/nodes/(\d+)/data$', gargantext_web.api.CorpusController.data),
    url(r'^graph-it$', views.graph_it),
    url(r'^ngrams$', views.ngrams),
)

from django.conf import settings
# Development-only: serve uploaded media and static assets directly from
# Django when DEBUG is on (a real web server should do this in production).
if settings.DEBUG:
    urlpatterns += patterns('',
        url(r'^media/(?P<path>.*)$', 'django.views.static.serve', {
            'document_root': settings.MEDIA_ROOT,
        }),
        url(r'^static/(?P<path>.*)$', 'django.views.static.serve', {
            'document_root': settings.STATIC_ROOT,
        }),
    )
......@@ -34,6 +34,8 @@ from django.template import RequestContext
from django.contrib.auth.decorators import login_required
from django.contrib.auth import authenticate, login, logout
from scrap_pubmed.admin import Logger
def login_user(request):
logout(request)
username = password = ''
......@@ -194,6 +196,7 @@ def projects(request):
user = request.user
date = datetime.datetime.now()
print(Logger.write("STATIC_ROOT"))
project_type = NodeType.objects.get(name='Project')
projects = Node.objects.filter(user=user, type_id = project_type.id).order_by("-date")
......@@ -299,6 +302,10 @@ def project(request, project_id):
for key in donut_part.keys() ]
dauser = User.objects.get( username=user )
groups = len(dauser.groups.filter(name="PubMed_0.1"))
print("*groupslen*:",groups)
if request.method == 'POST':
form = CustomForm(request.POST, request.FILES)
......@@ -308,8 +315,6 @@ def project(request, project_id):
name = form.cleaned_data['name']
thefile = form.cleaned_data['file']
print(request.POST['type'])
print(form.cleaned_data['type'])
resource_type = ResourceType.objects.get(name=str( form.cleaned_data['type'] ))
print("-------------")
......@@ -819,7 +824,7 @@ def node_link(request, corpus_id):
start = time.time()
data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
end = time.time()
print ("LOG::TIME: get_cooc() [s]",(end - start))
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" get_cooc() [s]",(end - start))
print("In node_link() END")
return JsonHttpResponse(data)
......
......@@ -217,10 +217,10 @@ corpus_pubmed.add_resource(
for resource in corpus_pubmed.get_resources():
print('Resource #%d - %s - %s' % (resource.id, resource.digest, resource.file))
print('Parse corpus #%d...' % (corpus_pubmed.id, ))
corpus_pubmed.parse_resources(verbose=True)
print('Extract corpus #%d...' % (corpus_pubmed.id, ))
corpus_pubmed.children.all().extract_ngrams(['title',])
print('Parsed corpus #%d.' % (corpus_pubmed.id, ))
# print('Parse corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.parse_resources(verbose=True)
# print('Extract corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.children.all().extract_ngrams(['title',])
# print('Parsed corpus #%d.' % (corpus_pubmed.id, ))
exit()
......@@ -11,6 +11,7 @@ from cte_tree.models import CTENode, CTENodeManager
from parsing.Caches import LanguagesCache, NgramsExtractorsCache, NgramsCaches
from parsing.FileParsers import *
from time import time
import datetime
from collections import defaultdict
import hashlib
......@@ -160,6 +161,9 @@ class Node(CTENode):
def parse_resources(self, verbose=False):
# parse all resources into a list of metadata
metadata_list = []
print("not parsed resources:")
print(self.node_resource.filter(parsed=False))
print("= = = = = = = = = = =\n")
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
......@@ -173,7 +177,11 @@ class Node(CTENode):
})[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
# print(parser.parse(str(resource.file)))
# retrieve info from the database
# # retrieve info from the database
# print("\n - - -- - - - - - - - ")
# for i in metadata_list:
# print("***",i["title"])
# print("- - -- - - - - - - - \n")
type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache()
user_id = self.user.id
......@@ -228,6 +236,7 @@ class Node(CTENode):
associations[terms] += 1
#print(associations)
# insert the occurrences in the database
# print(associations.items())
Node_Ngram.objects.bulk_create([
Node_Ngram(
node = self,
......@@ -247,16 +256,18 @@ class Node(CTENode):
self.save()
self.parse_resources()
end = time.time()
print ("LOG::TIME: parse_resources() [s]",(end - start))
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources() [s]",(end - start))
print("LOG::TIME: In workflow() / parse_resources()")
start = time.time()
print("LOG::TIME: In workflow() extract_ngrams()")
print("\n- - - - - - - - - -")
type_document = NodeType.objects.get(name='Document')
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
end = time.time()
print("- - - - - - - - - - \n")
print ("LOG::TIME: ",(end - start))
print ("LOG::TIME: extract_ngrams() [s]",(end - start))
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()")
start = time.time()
......@@ -264,7 +275,7 @@ class Node(CTENode):
from analysis.functions import do_tfidf
do_tfidf(self)
end = time.time()
print ("LOG::TIME: do_tfidf() [s]",(end - start))
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
print("LOG::TIME: In workflow() / do_tfidf()")
print("In workflow() END")
......
......@@ -10,12 +10,12 @@ import os
import time
# import libxml2
from lxml import etree
from datetime import datetime
import datetime
from django.core.files import File
import threading
from queue import Queue
import time
# import time
class MedlineFetcher:
......@@ -180,8 +180,9 @@ class MedlineFetcher:
for i,query in enumerate(thequeries):
k = query["count"]
percentage = k/float(N)
retmax_forthisyear = int(round(globalLimit*percentage))
proportion = k/float(N)
retmax_forthisyear = int(round(globalLimit*proportion))
query["retmax"] = retmax_forthisyear
if query["retmax"]==0: query["retmax"]+=1
return thequeries
from django.contrib import admin
from gargantext_web.settings import STATIC_ROOT
# Register your models here.
import os
import datetime
class Logger():
    """Minimal file logger writing timestamped lines under ``Logs/``.

    Used as ``Logger.write("message")`` (see the call sites in the views);
    no instance is ever created, so both methods are static.
    """

    @staticmethod
    def write(msg):
        """Append *msg* to today's log file and return the log directory.

        Fix: the original computed the date/time and ensured the directory
        but never used ``msg`` at all — nothing was ever written. Now each
        call appends one ``"<time> <msg>"`` line to ``Logs/<date>.log``.

        Returns the log directory path (``"Logs/"``), as before.
        """
        path = "Logs/"
        Logger.ensure_dir(path)
        # ISO timestamp split into its date and time halves.
        nowfull = datetime.datetime.now().isoformat().split("T")
        date = nowfull[0]
        time = nowfull[1]
        # One dated file per day; append so concurrent requests accumulate.
        with open(os.path.join(path, date + ".log"), "a") as logfile:
            logfile.write("%s %s\n" % (time, msg))
        return path

    @staticmethod
    def ensure_dir(f):
        """Create the directory component of path *f* if it is missing."""
        d = os.path.dirname(f)
        # Guard: dirname of a bare filename is "" — nothing to create
        # (the original would have crashed calling makedirs("")).
        if d and not os.path.exists(d):
            os.makedirs(d)
......@@ -4,7 +4,7 @@ from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect
from django.template.loader import get_template
from django.template import Context
from django.contrib.auth.models import User
from django.contrib.auth.models import User, Group
from scrap_pubmed.MedlineFetcherDavid2015 import MedlineFetcher
......@@ -13,8 +13,9 @@ from urllib.request import urlopen, urlretrieve
import json
from gargantext_web.settings import MEDIA_ROOT
from datetime import datetime
# from datetime import datetime
import time
import datetime
import os
import threading
from django.core.files import File
......@@ -30,12 +31,13 @@ def getGlobalStats(request ):
alist = ["bar","foo"]
if request.method == "POST":
N = 100
query = request.POST["query"]
print ("LOG::TIME: query =", query )
print ("LOG::TIME: N =", 300 )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
instancia = MedlineFetcher()
# alist = instancia.serialFetcher( 5, query , int(request.POST["N"]) )
alist = instancia.serialFetcher( 5, query , 300 )
alist = instancia.serialFetcher( 5, query , N )
data = alist
return JsonHttpResponse(data)
......@@ -73,6 +75,7 @@ def doTheQuery(request , project_id):
type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id
corpus = Node(
user=request.user,
parent=parent,
......@@ -91,7 +94,7 @@ def doTheQuery(request , project_id):
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
......
......@@ -221,8 +221,6 @@
<button onclick='bringDaNoise();' id="submit_thing" disabled class="btn btn-primary" >Process this!</button><span id="simpleloader"></span>
</div>
</div>
</div><!-- /.modal-content -->
......@@ -246,8 +244,6 @@
return cookieValue;
}
var thequeries = []
function doTheQuery() {
......@@ -331,6 +327,7 @@
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!")
thequeries = data
var N=0,k=0;
......@@ -364,6 +361,7 @@
$("#theresults").html("")
$('#submit_thing').prop('disabled', false);
$( "#id_name" ).on('input',null);
$("#submit_thing").html("Process this!")
}
// @dynamic-query events
else {
......@@ -391,6 +389,7 @@
$("#pubmedcrawl").css("visibility", "visible");
$("#pubmedcrawl").show();
$("#file_yes").click();
$("#submit_thing").html("Process this!")
}
// hide Radio-Inputs and trigger @upload-file events
else {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment