Commit e9081a75 authored by Administrator's avatar Administrator

Merge branch 'testing' into stable

parents 63d15bd8 370d00c5
#from .celery import app as async_app
# -*- coding: utf-8 -*-
#import os
#import djcelery
#
#from celery import Celery
#
#from django.conf import settings
#
## set the default Django settings module for the 'celery' program.
#os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext_web.settings')
#
#app = Celery('gargantext_web')
#
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.database:DatabaseBackend',
#)
#
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.cache:CacheBackend',
#)
#
## Using a string here means the worker will not have to
## pickle the object when using Windows.
##app.config_from_object('django.conf:settings')
#app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)
#
from celery import shared_task
from node import models
#@app.task(bind=True)
@shared_task
def debug_task(request):
    """Debugging task: print the ``repr`` of ``request`` to stdout."""
    message = 'Request: {0!r}'.format(request)
    print(message)
from gargantext_web.db import session, Node
@shared_task
def apply_sum(x, y):
    """Print ``x + y``, then the first row of a ``Node.name`` query.

    The second print goes through the module-level SQLAlchemy ``session``,
    so running this task also shows whether the database is reachable
    from wherever the task executes.
    """
    total = x + y
    print(total)

    first_name_row = session.query(Node.name).first()
    print(first_name_row)
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
@shared_task
def apply_workflow(corpus_id):
    """Run the processing pipeline on the corpus identified by ``corpus_id``.

    Steps, in order:
      1. load the corpus node through the SQLAlchemy ``session``;
      2. parse its resources;
      3. set ``metadata['Processing']`` to 0 on the Django-side node
         (best effort: a failure here is printed and does not stop the run);
      4. extract n-grams from the ``title`` field and compute TF-IDF.

    Args:
        corpus_id: primary key of the corpus node to process.

    Raises:
        ValueError: if no node with id ``corpus_id`` exists.
    """
    corpus = session.query(Node).filter(Node.id == corpus_id).first()
    if corpus is None:
        # ``first()`` yields None when nothing matches; fail here with a clear
        # message instead of deep inside ``parse_resources``.
        raise ValueError('No corpus found with id %r' % (corpus_id,))

    parse_resources(corpus)

    try:
        print("-" * 60)
        # With Django ORM
        corpus_django = models.Node.objects.get(id=corpus_id)
        corpus_django.metadata['Processing'] = 0
        corpus_django.save()
        # TODO: do the same with the SQLAlchemy ORM (does not work yet, why?):
        #   corpus.metadata['Processing'] = 0
        #   session.add(corpus)
        #   session.flush()
    except Exception as error:
        # Deliberately best effort: failing to clear the flag must not
        # prevent the n-gram extraction and TF-IDF steps from running.
        print(error)

    extract_ngrams(corpus, ['title'])
    compute_tfidf(corpus)
......@@ -2,6 +2,7 @@ from gargantext_web import settings
from node import models
__all__ = ['literalquery', 'session', 'cache', 'Session', 'bulk_insert', 'engine', 'get_cursor']
......@@ -56,6 +57,23 @@ for model_name, model in models.__dict__.items():
NodeNgram = Node_Ngram
NodeResource = Node_Resource
# manually declare the Node table...
from datetime import datetime
from sqlalchemy.types import *
from sqlalchemy.schema import Column, ForeignKey
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import relationship, aliased
# class Node(Base):
# __tablename__ = 'node_node'
# id = Column(Integer, primary_key=True)
# user_id = Column(Integer, ForeignKey('auth_user.id', ondelete='CASCADE'), index=True, nullable=False)
# type_id = Column(Integer, ForeignKey('node_nodetype.id', ondelete='CASCADE'), index=True, nullable=False)
# name = Column(String(255))
# language_id = Column(Integer, ForeignKey('node_language.id', ondelete='CASCADE'), index=True, nullable=False)
# date = Column(DateTime(), default=datetime.utcnow, nullable=True)
# metadata = Column(JSONB, default={}, nullable=False)
# debugging tool, to translate SQLAlchemy queries to string
......@@ -67,7 +85,6 @@ def literalquery(statement, dialect=None):
purposes only. Executing SQL statements with inline-rendered user values is
extremely insecure.
"""
from datetime import datetime
import sqlalchemy.orm
if isinstance(statement, sqlalchemy.orm.Query):
if dialect is None:
......
......@@ -14,11 +14,33 @@ BASE_DIR = os.path.dirname(os.path.dirname(__file__))
PROJECT_PATH = os.path.join(BASE_DIR, os.pardir)
PROJECT_PATH = os.path.abspath(PROJECT_PATH)
######################################################################
# ASYNCHRONOUS TASKS
import djcelery
djcelery.setup_loader()
BROKER_URL = 'amqp://guest:guest@localhost:5672/'
CELERY_IMPORTS=("node.models",)
CELERY_IMPORTS=("node.models","gargantext_web.celery")
#
#from celery import Celery
#
#app = Celery('gargantext_web')
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.database:DatabaseBackend',
#)
#
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.cache:CacheBackend',
#)
#
######################################################################
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.6/howto/deployment/checklist/
......
from celery import shared_task
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
@shared_task
def apply_workflow(corpus):
    """Process ``corpus``: parse resources, extract n-grams, compute TF-IDF.

    The three steps run sequentially on the same ``corpus`` object; n-grams
    are extracted from the ``title`` field only.
    """
    ngram_fields = ['title']

    parse_resources(corpus)
    extract_ngrams(corpus, ngram_fields)
    compute_tfidf(corpus)
......@@ -46,6 +46,8 @@ from sqlalchemy import or_, func
from gargantext_web import about
def login_user(request):
logout(request)
username = password = ''
......@@ -199,7 +201,6 @@ def home_view(request):
t = get_template('home.html')
user = request.user
date = datetime.datetime.now()
html = t.render(Context({\
'user': user,\
'date': date,\
......
......@@ -7,6 +7,7 @@ from sqlalchemy.orm import aliased
from collections import defaultdict
from datetime import datetime
from time import sleep
from threading import Thread
from node.admin import CustomForm
......@@ -14,14 +15,14 @@ from gargantext_web.db import *
from gargantext_web.settings import DEBUG, MEDIA_ROOT
from gargantext_web.api import JsonHttpResponse
import json
import re
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
def project(request, project_id):
from gargantext_web.celery import apply_workflow
# SQLAlchemy session
session = Session()
def project(request, project_id):
# do we have a valid project id?
try:
......@@ -51,7 +52,7 @@ def project(request, project_id):
# ... sqlalchemy.func by Resource.type_id is the guilty
# ISSUE L51
corpus_query = (session
.query(Node.id, Node.name, func.count(ChildrenNode.id))
.query(Node.id, Node.name, func.count(ChildrenNode.id), Node.metadata['Processing'])
#.query(Node.id, Node.name, Resource.type_id, func.count(ChildrenNode.id))
#.join(Node_Resource, Node_Resource.node_id == Node.id)
#.join(Resource, Resource.id == Node_Resource.resource_id)
......@@ -66,8 +67,10 @@ def project(request, project_id):
documents_count_by_resourcetype = defaultdict(int)
corpora_count = 0
corpusID_dict = {}
for corpus_id, corpus_name, document_count in corpus_query:
for corpus_id, corpus_name, document_count, processing in corpus_query:
#print(corpus_id, processing)
# Not optimized GOTO ISSUE L51
resource_type_id = (session.query(Resource.type_id)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
......@@ -82,9 +85,10 @@ def project(request, project_id):
resourcetype = cache.ResourceType[resource_type_id]
resourcetype_name = resourcetype.name
corpora_by_resourcetype[resourcetype_name].append({
'id': corpus_id,
'name': corpus_name,
'count': document_count,
'id' : corpus_id,
'name' : corpus_name,
'count' : document_count,
'processing': processing,
})
documents_count_by_resourcetype[resourcetype_name] += document_count
corpora_count += 1
......@@ -93,7 +97,7 @@ def project(request, project_id):
# do the donut
total_documents_count = sum(documents_count_by_resourcetype.values())
donut = [
{ 'source': key,
{ 'source': re.sub(' \(.*$', '', key),
'count': value,
'part' : round(value * 100 / total_documents_count) if total_documents_count else 0,
}
......@@ -112,20 +116,21 @@ def project(request, project_id):
resourcetype = cache.ResourceType[form.cleaned_data['type']]
# which default language shall be used?
if resourcetype.name == "europress_french":
if resourcetype.name == "Europress (French)":
language_id = cache.Language['fr'].id
elif resourcetype.name == "europress_english":
elif resourcetype.name == "Europress (English)":
language_id = cache.Language['en'].id
else:
language_id = None
# corpus node instanciation as a Django model
corpus = Node(
name = name,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
name = name,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = language_id,
metadata = {'Processing' : 1,}
)
session.add(corpus)
session.commit()
......@@ -142,25 +147,25 @@ def project(request, project_id):
)
# let's start the workflow
try:
def apply_workflow(corpus):
parse_resources(corpus)
extract_ngrams(corpus, ['title'])
compute_tfidf(corpus)
if DEBUG:
apply_workflow(corpus)
if DEBUG is False:
apply_workflow.apply_async((corpus.id,),)
else:
thread = Thread(target=apply_workflow, args=(corpus, ), daemon=True)
thread.start()
#apply_workflow(corpus)
thread = Thread(target=apply_workflow, args=(corpus.id, ), daemon=True)
thread.start()
except Exception as error:
print('WORKFLOW ERROR')
print(error)
# redirect to the main project page
# TODO need to wait before response (need corpus update)
sleep(1)
return HttpResponseRedirect('/project/' + str(project_id))
else:
print('ERROR: BAD FORM')
else:
form = CustomForm()
# HTML output
return render(request, 'project.html', {
'form' : form,
......
......@@ -104,30 +104,15 @@ except Exception as error:
# In[33]:
try:
typePubmed = ResourceType.objects.get(name='pubmed')
typeIsi = ResourceType.objects.get(name='isi')
typeRis = ResourceType.objects.get(name='ris')
typePresseFrench = ResourceType.objects.get(name='europress_french')
typePresseEnglish = ResourceType.objects.get(name='europress_english')
from parsing.parsers_config import parsers
except Exception as error:
print(error)
typePubmed = ResourceType(name='pubmed')
typePubmed.save()
typeIsi = ResourceType(name='isi')
typeIsi.save()
typeRis = ResourceType(name='ris')
typeRis.save()
typePresseFrench = ResourceType(name='europress_french')
typePresseFrench.save()
typePresseEnglish = ResourceType(name='europress_english')
typePresseEnglish.save()
ResourceType.objects.all().delete()
for key in parsers.keys():
try:
ResourceType.objects.get_or_create(name=key)
except Exception as error:
print("Ressource Error: ", error)
# In[34]:
......
......@@ -4,6 +4,7 @@ psql -d gargandb -f init.sql
sleep 2
../manage.py syncdb
psql -d gargandb -f init2.sql
......
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE ;
......@@ -54,16 +54,13 @@ import pycountry
Language.objects.all().delete()
for language in pycountry.languages:
if 'alpha2' in language.__dict__:
Language(
models.Language(
iso2 = language.alpha2,
iso3 = language.bibliographic,
fullname = language.name,
implemented = 1 if language.alpha2 in ['en', 'fr'] else 0,
).save()
english = Language.objects.get(iso2='en')
french = Language.objects.get(iso2='fr')
# Integration: users
......@@ -99,56 +96,10 @@ for node_type in node_types:
print('Initialize resource...')
resources = [
'pubmed', 'isi', 'ris', 'europress_french', 'europress_english']
for resource in resources:
models.ResourceType.objects.get_or_create(name=resource)
# TODO
# here some tests
# add a new project and some corpora to test it
# Integration: project
#
#print('Initialize project...')
#try:
# project = Node.objects.get(name='Bees project')
#except:
# project = Node(name='Bees project', type=typeProject, user=me)
# project.save()
#
# Integration: corpus
#print('Initialize corpus...')
#try:
# corpus_pubmed = Node.objects.get(name='PubMed corpus')
#except:
# corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)
# corpus_pubmed.save()
#
#print('Initialize resource...')
#corpus_pubmed.add_resource(
# # file='./data_samples/pubmed.zip',
# #file='./data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
# file='/srv/gargantext_lib/data_samples/pubmed.xml',
# type=typePubmed,
# user=me
#)
#
#for resource in corpus_pubmed.get_resources():
# print('Resource #%d - %s - %s' % (resource.id, resource.digest, resource.file))
#
## print('Parse corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.parse_resources(verbose=True)
# print('Extract corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.children.all().extract_ngrams(['title',])
# print('Parsed corpus #%d.' % (corpus_pubmed.id, ))
from parsing.parsers_config import parsers
for parser in parsers.keys():
models.ResourceType.objects.get_or_create(name=parser)
......
......@@ -29,6 +29,7 @@ from celery import current_app
import os
import subprocess
from parsing.parsers_config import parsers
# Some useful functions
# TODO: start the function name with an underscore (private)
......@@ -194,15 +195,19 @@ class Node(CTENode):
print("= = = = = = = = = = =\n")
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
'istext' : ISText,
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
})[resource.type.name]()
parser = defaultdict(lambda:FileParser.FileParser, parsers
# {
# 'istext' : ISText,
# 'pubmed' : PubmedFileParser,
# 'isi' : IsiFileParser,
# 'ris' : RisFileParser,
# 'RIS (Jstor)' : JstorFileParser,
# 'europress' : EuropressFileParser,
# 'europress_french' : EuropressFileParser,
# 'europress_english' : EuropressFileParser,
# }
)[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache()
......
from .RisFileParser import RisFileParser
class IsiFileParser(RisFileParser):
_parameters = {
......
from .RisFileParser import RisFileParser
def _metadata(key, separator=None):
    """Build the rule dict for a tag whose value is stored under ``key``.

    ``separator`` is only included in the rule when it is given.
    """
    rule = {"type": "metadata", "key": key}
    if separator is not None:
        rule["separator"] = separator
    return rule


class JstorFileParser(RisFileParser):
    """``RisFileParser`` subclass that only overrides the tag table."""

    # Maps each two-letter tag (as bytes) to its rule: either the record
    # delimiter or a metadata rule naming the target key.
    # NOTE(review): "PD" and "WC" look like ISI/Web of Science tags rather
    # than RIS ones, and "UR" is stored under the "doi" key -- confirm
    # against real Jstor exports.
    _parameters = {
        b"ER": {"type": "delimiter"},
        b"TI": _metadata("title", separator=" "),
        b"AU": _metadata("authors", separator=", "),
        b"UR": _metadata("doi"),
        b"Y1": _metadata("publication_year"),
        b"PD": _metadata("publication_month"),
        b"LA": _metadata("language_iso2"),
        b"AB": _metadata("abstract", separator=" "),
        b"WC": _metadata("fields"),
    }
from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
......@@ -7,8 +7,7 @@ from math import log
from gargantext_web.db import *
from .FileParsers import *
from .parsers_config import parsers as _parsers
class DebugTime:
......@@ -31,17 +30,12 @@ class DebugTime:
# keep all the parsers in a cache
class Parsers(defaultdict):
_parsers = {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
}
def __init__(self):
self._parsers = _parsers
def __missing__(self, key):
if key not in self._parsers:
#print(self._parsers.keys())
if key not in self._parsers.keys():
raise NotImplementedError('No such parser: "%s"' % (key))
parser = self._parsers[key]()
self[key] = parser
......
from .FileParsers import *
# Registry of the available file parsers, keyed by a human-readable source
# name. The keys are matched literally elsewhere (resource-type names,
# form option labels), so renaming one is not a local change.
parsers = {
    # bibliographic databases
    'Pubmed (xml format)': PubmedFileParser,
    'Web of Science (ISI format)': IsiFileParser,
    'Scopus (RIS format)': RisFileParser,
    'Zotero (RIS format)': RisFileParser,
    'Jstor (RIS format)': JstorFileParser,
    # press archives; the generic entry is disabled in favour of the
    # per-language ones:
    # 'Europress': EuropressFileParser,
    'Europress (French)': EuropressFileParser,
    'Europress (English)': EuropressFileParser,
}
......@@ -40,7 +40,7 @@ def getGlobalStats(request ):
alist = ["bar","foo"]
if request.method == "POST":
N = 1000
N = 100
query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
......@@ -81,9 +81,6 @@ def getGlobalStatsISTEXT(request ):
def doTheQuery(request , project_id):
alist = ["hola","mundo"]
# SQLAlchemy session
session = Session()
# do we have a valid project id?
try:
project_id = int(project_id)
......@@ -120,7 +117,7 @@ def doTheQuery(request , project_id):
urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
alist = ["tudo fixe" , "tudo bem"]
resourcetype = cache.ResourceType["pubmed"]
resourcetype = cache.ResourceType["Pubmed (xml format)"]
# corpus node instanciation as a Django model
corpus = Node(
......
......@@ -83,11 +83,12 @@
<li>{{ key }}</li>
<ul>
{% for corpus in corpora %}
<li> {% ifnotequal corpus.count 0 %}
<a href="/project/{{project.id}}/corpus/{{corpus.id}}"> {{corpus.name}} </a> , {{ corpus.count }} Documents
{% else %}
<li>
{% ifequal corpus.processing 1 %}
{{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Processing, drink a cup of tea, and refresh the page :)
{% endifnotequal %}
{% else %}
<a href="/project/{{project.id}}/corpus/{{corpus.id}}"> {{corpus.name}} </a> , {{ corpus.count }} Documents
{% endifequal %}
<button type="button" class="btn btn-xs btn-default" data-container="body" data-toggle="popover" data-placement="bottom"
data-content='
<ul>
......@@ -312,7 +313,7 @@
var theType = $("#id_type option:selected").html();
if(theType=="pubmed") {
if(theType=="Pubmed (xml format)") {
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/pubmedquery",
......@@ -416,7 +417,7 @@
$( "#id_name" ).on('input',function(e){
console.log($(this).val())
if(theType=="pubmed") testPUBMED( $(this).val() )
if(theType=="Pubmed (xml format)") testPUBMED( $(this).val() )
});
}
}
......@@ -424,7 +425,7 @@
//CSS events for changing the Select element
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if(selected=="pubmed" || selected=="istext") {
if(selected=="Pubmed (xml format)" || selected=="istext") {
// if(selected=="pubmed") {
console.log("show the button for: "+selected)
$("#pubmedcrawl").css("visibility", "visible");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment