Commit e7ac6426 authored by Alexandre Delanoë

Merge remote-tracking branch 'gargantext.org/simon-unstable-notebook' into unstable-merge

parents dae0243d 30c1dbdc
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import relationship from sqlalchemy.orm import relationship, validates
from sqlalchemy.types import TypeDecorator, \ from sqlalchemy.types import TypeDecorator, \
Integer, Float, Boolean, DateTime, String, Text Integer, Float, Boolean, DateTime, String, Text
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
...@@ -7,6 +7,7 @@ from sqlalchemy.ext.mutable import MutableDict, MutableList ...@@ -7,6 +7,7 @@ from sqlalchemy.ext.mutable import MutableDict, MutableList
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
__all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship", __all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
"validates", "ValidatorMixin",
"Integer", "Float", "Boolean", "DateTime", "String", "Text", "Integer", "Float", "Boolean", "DateTime", "String", "Text",
"TypeDecorator", "TypeDecorator",
"JSONB", "Double", "JSONB", "Double",
...@@ -18,6 +19,25 @@ __all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship", ...@@ -18,6 +19,25 @@ __all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
# all tables handled by Alembic migration scripts. # all tables handled by Alembic migration scripts.
Base = declarative_base() Base = declarative_base()
# To be used by tables already handled by Django ORM, such as User model. We # To be used by tables already handled by Django ORM, such as User model. We
# separate them in order to keep those out of Alembic sight. # separate them in order to keep those out of Alembic sight.
DjangoBase = declarative_base() DjangoBase = declarative_base()
class ValidatorMixin(object):
    """Mixin providing reusable helpers for ``@validates`` hooks of models."""

    def enforce_length(self, key, value):
        """Truncate a string according to its column length.

        The maximum length is read from the type of the first column mapped
        to the attribute named ``key`` on the model class.

        :param key: name of the mapped attribute being validated.
        :param value: candidate value for that attribute.
        :returns: ``value`` cut down to the column length when it is longer;
            otherwise ``value`` unchanged. Falsy values (``None``, ``''``)
            and values for columns that declare no length are returned as is.

        Usage example (the validator must *return* the result, otherwise the
        attribute would be set to ``None``):

        .. code-block:: python

            @validates('some_column')
            def validate_some_column(self, key, value):
                return self.enforce_length(key, value)
        """
        column_type = getattr(self.__class__, key).prop.columns[0].type
        # Not every column type carries a ``length`` attribute, and unbounded
        # string columns report ``None``: in both cases there is no limit to
        # enforce (and comparing ``len(value)`` against ``None`` would raise).
        max_len = getattr(column_type, 'length', None)
        if max_len is None or not value:
            return value
        if len(value) > max_len:
            return value[:max_len]
        return value
...@@ -9,7 +9,7 @@ from datetime import datetime ...@@ -9,7 +9,7 @@ from datetime import datetime
from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \ from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
Integer, Float, String, DateTime, JSONB, \ Integer, Float, String, DateTime, JSONB, \
MutableList, MutableDict MutableList, MutableDict, validates, ValidatorMixin
from .users import User from .users import User
__all__ = ['Node', 'NodeNode', 'CorpusNode'] __all__ = ['Node', 'NodeNode', 'CorpusNode']
...@@ -26,7 +26,7 @@ class NodeType(TypeDecorator): ...@@ -26,7 +26,7 @@ class NodeType(TypeDecorator):
return NODETYPES[typeindex] return NODETYPES[typeindex]
class Node(Base): class Node(ValidatorMixin, Base):
"""This model can fit many purposes: """This model can fit many purposes:
myFirstCorpus = session.query(CorpusNode).first() myFirstCorpus = session.query(CorpusNode).first()
...@@ -112,6 +112,10 @@ class Node(Base): ...@@ -112,6 +112,10 @@ class Node(Base):
'user_id={0.user_id}, parent_id={0.parent_id}, ' \ 'user_id={0.user_id}, parent_id={0.parent_id}, ' \
'name={0.name!r}, date={0.date})>'.format(self) 'name={0.name!r}, date={0.date})>'.format(self)
@validates('name')
def validate_name(self, key, value):
return self.enforce_length(key, value)
@property @property
def ngrams(self): def ngrams(self):
"""Pseudo-attribute allowing to retrieve a node's ngrams. """Pseudo-attribute allowing to retrieve a node's ngrams.
......
...@@ -73,7 +73,8 @@ from rest_framework.views import APIView ...@@ -73,7 +73,8 @@ from rest_framework.views import APIView
from gargantext.util.json import json_encoder from gargantext.util.json import json_encoder
def JsonHttpResponse(data, status=200): def JsonHttpResponse(data, status=200):
return HttpResponse( return HttpResponse(
content = json_encoder.encode(data), content = data.encode('utf-8') if isinstance(data, str) else \
json_encoder.encode(data),
content_type = 'application/json; charset=utf-8', content_type = 'application/json; charset=utf-8',
status = status status = status
) )
......
#!/usr/bin/env python
""" """
Gargantext Software Copyright (c) 2016-2017 CNRS ISC-PIF - Gargantext Software Copyright (c) 2016-2017 CNRS ISC-PIF -
http://iscpif.fr http://iscpif.fr
...@@ -6,45 +7,29 @@ http://gitlab.iscpif.fr/humanities/gargantext/blob/stable/LICENSE ) ...@@ -6,45 +7,29 @@ http://gitlab.iscpif.fr/humanities/gargantext/blob/stable/LICENSE )
- In France : a CECILL variant affero compliant - In France : a CECILL variant affero compliant
- GNU aGPLV3 for all other countries - GNU aGPLV3 for all other countries
""" """
#!/usr/bin/env python
import sys
import os
import os
import django
# Django settings os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
dirname = os.path.dirname(os.path.realpath(__file__)) django.setup()
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext.settings")
# initialize Django application from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from django.core.wsgi import get_wsgi_application from gargantext.models import ProjectNode, DocumentNode, UserNode, User
application = get_wsgi_application() from gargantext.util.db import session, get_engine
from collections import Counter
import importlib
from django.http import Http404
from gargantext.util.toolchain.main import parse_extract_indexhyperdata
from gargantext.util.db import *
from gargantext.models import Node
from gargantext.util.toolchain.main import parse_extract_indexhyperdata
from nltk.tokenize import wordpunct_tokenize
from gargantext.models import * class NotebookError(Exception):
from nltk.tokenize import word_tokenize pass
import nltk as nltk
from statistics import mean
from math import log
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import datetime
from collections import Counter
from langdetect import detect as detect_lang
def documents(corpus_id): def documents(corpus_id):
return (session.query(Node).filter( Node.parent_id==corpus_id return (session.query(DocumentNode).filter_by(parent_id=corpus_id)
, Node.typename=="DOCUMENT" #.order_by(Node.hyperdata['publication_date'])
) .all())
# .order_by(Node.hyperdata['publication_date'])
.all()
)
#import seaborn as sns #import seaborn as sns
...@@ -56,18 +41,21 @@ def chart(docs, field): ...@@ -56,18 +41,21 @@ def chart(docs, field):
frame1 = pd.DataFrame(year_publis, columns=['Date', 'DateValue'], index=frame0.Date) frame1 = pd.DataFrame(year_publis, columns=['Date', 'DateValue'], index=frame0.Date)
return frame1 return frame1
from gargantext.util.crawlers.HAL import HalCrawler from gargantext.util.crawlers.HAL import HalCrawler
def scan_hal(request):
    """Return whatever ``HalCrawler.scan_results`` yields for ``request``.

    A fresh ``HalCrawler`` is instantiated on every call.
    """
    return HalCrawler().scan_results(request)
def scan_gargantext(corpus_id, lang, request):
    """Count the nodes of a corpus matching a PostgreSQL full-text query.

    :param corpus_id: id compared against ``nodes.parent_id``.
    :param lang: text search configuration given to ``to_tsvector``.
    :param request: query string given to ``to_tsquery``.
    :returns: the value of ``count(n.id)`` for the matching rows.
    """
    # TODO add some sugar to the request (ideally request should be the same
    # for hal and garg)
    # NOTE(review): ``hyperdata ->> 'abstract' || 'title'`` concatenates the
    # abstract with the literal string 'title'; presumably the title field
    # was meant to be searched as well -- confirm before changing the query.
    #
    # Values are handed to the driver as bound parameters instead of being
    # interpolated into the SQL text, so user-supplied ``lang``/``request``
    # cannot inject SQL.
    query = """select count(n.id) from nodes n
               where to_tsvector(%s::regconfig, hyperdata ->> 'abstract' || 'title')
                     @@ to_tsquery(%s)
               AND n.parent_id = %s;"""
    connection = get_engine().connect()
    try:
        return connection.execute(query, (lang, request, corpus_id)).scalar()
    finally:
        # Previously placed after ``return`` and therefore never executed.
        connection.close()
...@@ -77,47 +65,117 @@ def myProject_fromUrl(url): ...@@ -77,47 +65,117 @@ def myProject_fromUrl(url):
myProject :: String -> Project myProject :: String -> Project
""" """
project_id = url.split("/")[4] project_id = url.split("/")[4]
project = session.query(Node).filter(Node.id == project_id).first() project = session.query(ProjectNode).get(project_id)
return project return project
def newCorpus(project, resourceName=11, name="Machine learning", query="LSTM"): def newCorpus(project, source, name=None, query=None):
print("Corpus \"%s\" in project \"%s\" created" % (name, project.name)) error = False
corpus = project.add_child(name="Corpus name", typename='CORPUS') if name is None:
corpus.hyperdata["resources"] = [{"extracted" : "true", "type" : 11}] name = query
corpus.hyperdata["statuses"] = [{"action" : "notebook", "complete" : "true"}]
# [TODO] Add informations needed to get buttons on the Project view. if not isinstance(project, ProjectNode):
session.add(corpus) error = "a valid project"
session.commit() if not isinstance(source, int) and not isinstance(source, str):
error = "a valid source identifier: id or name"
hal = HalCrawler() elif not isinstance(query, str):
max_result = hal.scan_results(query) error = "a valid query"
paging = 100 elif not isinstance(name, str):
for page in range(0, max_result, paging): error = "a valid name"
print("%s documents downloaded / %s." % (str( paging * (page +1)), str(max_result) ))
docs = (hal._get(query, fromPage=page, count=paging) if error:
.get("response", {}) raise NotebookError("Please provide %s." % error)
.get("docs", [])
) resource = get_resource(source) if isinstance(source, int) else \
get_resource_by_name(source)
from gargantext.util.parsers.HAL import HalParser
# [TODO] fix boilerplate for docs here moissonneur_name = get_moissonneur_name(resource) if resource else \
new_docs = HalParser(docs)._parse(docs) source.lower()
for doc in new_docs: try:
new_doc = (corpus.add_child( name = doc["title"][:255] moissonneur = get_moissonneur(moissonneur_name)
, typename = 'DOCUMENT') except ImportError:
) raise NotebookError("Invalid source identifier: %r" % source)
new_doc["hyperdata"] = doc
session.add(new_doc) return run_moissonneur(moissonneur, project, name, query)
session.commit()
print("Extracting the ngrams")
parse_extract_indexhyperdata(corpus)
print("Corpus is ready to explore:")
print("http://imt.gargantext.org/projects/%s/corpora/%s/" % (project.id, corpus.id))
return corpus
def get_moissonneur_name(ident):
    """Return moissonneur module name from RESOURCETYPE or crawler name.

    :param ident: either a mapping-like object exposing ``get`` (its
        ``'crawler'`` entry is used), or a string.
    :returns: for a string ending in ``'Crawler'``, the lower-cased prefix
        (``'HalCrawler'`` -> ``'hal'``); any other string is returned
        unchanged, being assumed to already be a moissonneur name; ``None``
        when no string could be obtained.
    """
    # Does it quack like a RESOURCETYPE?
    if hasattr(ident, 'get'):
        ident = ident.get('crawler')

    if not isinstance(ident, str):
        return None

    # Extract name from crawler class name...
    if ident.endswith('Crawler'):
        return ident[:-len('Crawler')].lower()

    # ...otherwise assume ident is already a moissonneur name (this case used
    # to fall through and silently return None).
    return ident
def get_moissonneur(name):
    """Import and return the ``moissonneurs.<name>`` module.

    The imported module gets a ``name`` attribute set to ``name`` before it
    is returned.

    :raises NotebookError: when ``name`` is not an all-lowercase string.
    :raises ImportError: propagated from ``importlib.import_module``.
    """
    looks_valid = isinstance(name, str) and name.islower()
    if not looks_valid:
        raise NotebookError("Invalid moissonneur name: %r" % name)

    moissonneur = importlib.import_module('moissonneurs.%s' % name)
    moissonneur.name = name
    return moissonneur
def run_moissonneur(moissonneur, project, name, query):
    """Run moissonneur and return resulting corpus.

    A stand-in request object is built and passed first to
    ``moissonneur.query`` to learn how many results are available, then to
    ``moissonneur.save`` (with ``return_corpus=True``) when there is at least
    one.

    :param moissonneur: module carrying ``name``, ``query`` and ``save``.
    :param project: object providing ``id`` and ``user_id``.
    :param name: name to give to the resulting corpus.
    :param query: query string submitted to the moissonneur.
    :returns: the corpus returned by ``moissonneur.save``, or ``None`` when
        the reported result count is not positive.
    """
    import json

    # XXX Kludge: the moissonneurs expect a request object, so fake the few
    # attributes that are set here with bare placeholder objects.
    class Dummy(object):
        pass

    fake_user = Dummy()
    fake_user.id = project.user_id
    fake_user.is_authenticated = lambda: True

    request = Dummy()
    request.method = 'POST'
    request.path = 'nowhere'
    request.META = {}
    # XXX 'string' only has an effect on moissonneurs.pubmed; its value is
    #     added when processing the request client-side, see
    #     templates/projects/project.html for more details.
    request.POST = {
        'string': name,
        'query': query,
        'N': QUERY_SIZE_N_MAX,
    }
    request.user = fake_user

    source = moissonneur.name

    if source == 'istex':
        # Replace ALL spaces by plus signs (empty chunks are dropped, so runs
        # of spaces collapse into a single '+').
        words = [word for word in query.split(' ') if word]
        request.POST['query'] = '+'.join(words)

    # ValueError / Http404 raised below simply propagate to the caller.
    response = moissonneur.query(request)
    raw_json = response.content.decode('utf-8')
    data = json.loads(raw_json)

    if source == 'pubmed':
        count = sum(entry['count'] for entry in data)
        # The raw JSON answer replaces the query before calling save().
        request.POST['query'] = raw_json
    elif source == 'istex':
        count = data.get('total', 0)
    else:
        count = data.get('results_nb', 0)

    if not count > 0:
        return None

    corpus = moissonneur.save(request, project.id, return_corpus=True)

    # Sometimes strange things happen: make sure the corpus carries the
    # requested name.
    if corpus.name != name:
        corpus.name = name
        session.commit()

    return corpus
...@@ -30,7 +30,7 @@ def query( request): ...@@ -30,7 +30,7 @@ def query( request):
#ids = crawlerbot.get_ids(query) #ids = crawlerbot.get_ids(query)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb}) return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id): def save(request, project_id, return_corpus=False):
'''save''' '''save'''
if request.method == "POST": if request.method == "POST":
...@@ -101,6 +101,9 @@ def save(request, project_id): ...@@ -101,6 +101,9 @@ def save(request, project_id):
session.rollback() session.rollback()
# -------------------------------------------- # --------------------------------------------
if return_corpus:
return corpus
return render( return render(
template_name = 'pages/projects/wait.html', template_name = 'pages/projects/wait.html',
request = request, request = request,
......
...@@ -33,7 +33,7 @@ def query( request): ...@@ -33,7 +33,7 @@ def query( request):
print(results) print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb}) return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id): def save(request, project_id, return_corpus=False):
'''save''' '''save'''
if request.method == "POST": if request.method == "POST":
...@@ -103,6 +103,9 @@ def save(request, project_id): ...@@ -103,6 +103,9 @@ def save(request, project_id):
session.rollback() session.rollback()
# -------------------------------------------- # --------------------------------------------
if return_corpus:
return corpus
return render( return render(
template_name = 'pages/projects/wait.html', template_name = 'pages/projects/wait.html',
request = request, request = request,
......
...@@ -29,7 +29,7 @@ def query( request): ...@@ -29,7 +29,7 @@ def query( request):
#ids = crawlerbot.get_ids(query) #ids = crawlerbot.get_ids(query)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb}) return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id): def save(request, project_id, return_corpus=False):
'''save''' '''save'''
if request.method == "POST": if request.method == "POST":
...@@ -100,6 +100,9 @@ def save(request, project_id): ...@@ -100,6 +100,9 @@ def save(request, project_id):
session.rollback() session.rollback()
# -------------------------------------------- # --------------------------------------------
if return_corpus:
return corpus
return render( return render(
template_name = 'pages/projects/wait.html', template_name = 'pages/projects/wait.html',
request = request, request = request,
......
...@@ -52,7 +52,7 @@ def query( request ): ...@@ -52,7 +52,7 @@ def query( request ):
def save(request , project_id): def save(request , project_id, return_corpus=False):
print("testISTEX:") print("testISTEX:")
print(request.method) print(request.method)
alist = ["bar","foo"] alist = ["bar","foo"]
...@@ -171,6 +171,9 @@ def save(request , project_id): ...@@ -171,6 +171,9 @@ def save(request , project_id):
session.rollback() session.rollback()
# -------------------------------------------- # --------------------------------------------
if return_corpus:
return corpus
return render( return render(
template_name = 'pages/projects/wait.html', template_name = 'pages/projects/wait.html',
request = request, request = request,
......
...@@ -33,7 +33,7 @@ def query( request): ...@@ -33,7 +33,7 @@ def query( request):
print(results) print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb}) return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id): def save(request, project_id, return_corpus=False):
'''save''' '''save'''
if request.method == "POST": if request.method == "POST":
...@@ -104,6 +104,9 @@ def save(request, project_id): ...@@ -104,6 +104,9 @@ def save(request, project_id):
session.rollback() session.rollback()
# -------------------------------------------- # --------------------------------------------
if return_corpus:
return corpus
return render( return render(
template_name = 'pages/projects/wait.html', template_name = 'pages/projects/wait.html',
request = request, request = request,
......
...@@ -69,7 +69,7 @@ def query( request ): ...@@ -69,7 +69,7 @@ def query( request ):
return JsonHttpResponse(data) return JsonHttpResponse(data)
def save( request , project_id ) : def save( request , project_id, return_corpus=False ) :
# implicit global session # implicit global session
# do we have a valid project id? # do we have a valid project id?
try: try:
...@@ -164,6 +164,10 @@ def save( request , project_id ) : ...@@ -164,6 +164,10 @@ def save( request , project_id ) :
session.rollback() session.rollback()
# -------------------------------------------- # --------------------------------------------
sleep(1) sleep(1)
if return_corpus:
return corpus
return HttpResponseRedirect('/projects/' + str(project_id)) return HttpResponseRedirect('/projects/' + str(project_id))
data = alist data = alist
......
This diff is collapsed.
...@@ -57,7 +57,7 @@ ...@@ -57,7 +57,7 @@
<center id="corpus" class="help"> <center id="corpus" class="help">
<a data-toggle="modal" href="#addcorpus" > <a data-toggle="modal" href="#addcorpus" >
<button <button
type="button" type="button"
...@@ -532,7 +532,7 @@ ...@@ -532,7 +532,7 @@
$("#submit_thing").html("Process a {{ query_size }} sample!") $("#submit_thing").html("Process a {{ query_size }} sample!")
thequeries = data thequeries = data
var N=0,k=0; var N=0;
for(var i in thequeries) N += thequeries[i].count for(var i in thequeries) N += thequeries[i].count
if( N>0) { if( N>0) {
...@@ -571,12 +571,11 @@ ...@@ -571,12 +571,11 @@
$("#submit_thing").html("Process a {{ query_size }} sample!") $("#submit_thing").html("Process a {{ query_size }} sample!")
thequeries = data thequeries = data
var N=data.length,k=0; var N = data.total;
// for(var i in thequeries) N += thequeries[i].count
if( N>1) { if (N > 0) {
var total = JSON.parse(data).total console.log("N: "+N)
console.log("N: "+total) $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications.</i><br>")
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+total+" publications.</i><br>")
$('#submit_thing').prop('disabled', false); $('#submit_thing').prop('disabled', false);
} else { } else {
$("#theresults").html("<i> <b>"+data[0]+"</b></i><br>") $("#theresults").html("<i> <b>"+data[0]+"</b></i><br>")
...@@ -661,7 +660,7 @@ ...@@ -661,7 +660,7 @@
console.log(data) console.log(data)
console.log("SUCCESS") console.log("SUCCESS")
console.log("enabling "+"#"+value.id) console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);'); // $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false) $("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!") //$("#submit_thing").html("Process a {{ query_size }} sample!")
...@@ -721,7 +720,7 @@ ...@@ -721,7 +720,7 @@
console.log(data) console.log(data)
console.log("SUCCESS") console.log("SUCCESS")
console.log("enabling "+"#"+value.id) console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);'); // $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false) $("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!") //$("#submit_thing").html("Process a {{ query_size }} sample!")
...@@ -781,7 +780,7 @@ ...@@ -781,7 +780,7 @@
console.log(data) console.log(data)
console.log("SUCCESS") console.log("SUCCESS")
console.log("enabling "+"#"+value.id) console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);'); // $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false) $("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!") //$("#submit_thing").html("Process a {{ query_size }} sample!")
...@@ -876,12 +875,12 @@ ...@@ -876,12 +875,12 @@
console.log("selected:", selectedId); console.log("selected:", selectedId);
// by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN // by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN
if ( selectedId == "3" if ( selectedId == "3"
|| selectedId == "8" || selectedId == "8"
|| selectedId == "9" || selectedId == "9"
|| selectedId == "10" || selectedId == "10"
|| selectedId == "11" || selectedId == "11"
|| selectedId == "12" || selectedId == "12"
) { ) {
console.log("show the button for: " + selectedId) console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible"); $("#div-fileornot").css("visibility", "visible");
...@@ -1019,16 +1018,16 @@ ...@@ -1019,16 +1018,16 @@
function saveMultivac(query, N){ function saveMultivac(query, N){
console.log("In Multivac") console.log("In Multivac")
if(!query || query=="") return; if(!query || query=="") return;
console.log(query) console.log(query)
//var origQuery = query //var origQuery = query
var data = { "query" : query , "N": N }; var data = { "query" : query , "N": N };
// Replace all the slashes // Replace all the slashes
var projectid = window.location.href.split("projects")[1].replace(/\//g, '') var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
console.log(data) console.log(data)
$.ajax({ $.ajax({
dataType: 'json', dataType: 'json',
...@@ -1066,16 +1065,16 @@ ...@@ -1066,16 +1065,16 @@
function save(query, N, urlGarg){ function save(query, N, urlGarg){
console.log("In Gargantext") console.log("In Gargantext")
if(!query || query=="") return; if(!query || query=="") return;
console.log(query) console.log(query)
//var origQuery = query //var origQuery = query
var data = { "query" : query , "N": N }; var data = { "query" : query , "N": N };
// Replace all the slashes // Replace all the slashes
var projectid = window.location.href.split("projects")[1].replace(/\//g, '') var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
console.log(data) console.log(data)
$.ajax({ $.ajax({
dataType: 'json', dataType: 'json',
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment