Commit b82a67ad authored by PkSM3's avatar PkSM3

[UPDATE] pull unstable and explorer view OK

parents 6b2cbdd1 7af355a0
......@@ -25,7 +25,7 @@ def get_session():
from aldjemy.core import get_engine
alias = 'default'
connection = connections[alias]
engine = create_engine("postgresql+psycopg2://alexandre:C8kdcUrAQy66U@localhost/gargandb",
engine = create_engine("postgresql+psycopg2://gargantua:C8kdcUrAQy66U@localhost/gargandb",
use_native_hstore=True)
Session = sessionmaker(bind=engine)
return Session()
......
......@@ -249,35 +249,3 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
#print(data)
return data
from analysis.tfidf import tfidf
def do_tfidf(corpus, reset=True):
    """Compute and persist tf-idf scores for every ngram of every document in *corpus*.

    Scores are stored as NodeNodeNgram rows (nodex=corpus, nodey=document).
    When *reset* is True, previously stored scores for this corpus are wiped first.
    Only nodes whose type is "Corpus" are handled; anything else is reported and skipped.
    """
    with transaction.atomic():
        if reset:
            NodeNodeNgram.objects.filter(nodex=corpus).delete()

        if isinstance(corpus, Node) and corpus.type.name == "Corpus":
            for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
                for node_ngram in Node_Ngram.objects.filter(node=document):
                    try:
                        # get-or-create: skip pairs that already have a score
                        nnn = NodeNodeNgram.objects.get(nodex=corpus, nodey=document, ngram=node_ngram.ngram)
                    except NodeNodeNgram.DoesNotExist:
                        # fixed: was a bare `except:` that swallowed every error
                        # (DB failures, typos), not just the missing-row case.
                        score = tfidf(corpus, document, node_ngram.ngram)
                        # node_ngram.node is the same document we filtered on above
                        nnn = NodeNodeNgram(nodex=corpus, nodey=node_ngram.node, ngram=node_ngram.ngram, score=score)
                        nnn.save()
        else:
            print("Only corpus implemented yet, you put instead:", type(corpus))
......@@ -21,7 +21,7 @@ def get_session():
from aldjemy.core import get_engine
alias = 'default'
connection = connections[alias]
engine = create_engine("postgresql+psycopg2://alexandre:C8kdcUrAQy66U@localhost/gargandb",
engine = create_engine("postgresql+psycopg2://gargantua:C8kdcUrAQy66U@localhost/gargandb",
use_native_hstore=True)
Session = sessionmaker(bind=engine)
return Session()
......
#from .celery import app as async_app
......@@ -11,9 +11,9 @@ from sqlalchemy import text, distinct
from sqlalchemy.sql import func
from sqlalchemy.orm import aliased
from gargantext_web.views import move_to_trash
from .db import *
from node import models
def DebugHttpResponse(data):
    # Render *data* inside a minimal dark-themed HTML page, handy for
    # eyeballing arbitrary values in the browser while debugging.
    body = '<html><body style="background:#000;color:#FFF"><pre>%s</pre></body></html>' % (str(data), )
    return HttpResponse(body)
......@@ -47,10 +47,14 @@ _ngrams_order_columns = {
}
from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from rest_framework.permissions import IsAuthenticated
from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework.exceptions import APIException as _APIException
class APIException(_APIException):
    """REST-framework API error carrying a custom HTTP status code.

    Parameters
    ----------
    message : str
        Human-readable error detail returned to the client.
    code : int
        HTTP status code for the response (default 500).
    """
    def __init__(self, message, code=500):
        self.status_code = code
        # fixed: *message* was silently dropped — forward it to the DRF base
        # class so it becomes the exception's `detail` in the response body.
        super().__init__(message)
......@@ -200,7 +204,7 @@ class NodesChildrenDuplicates(APIView):
count = len(duplicate_nodes)
for node in duplicate_nodes:
print("deleting node ",node.id)
node.delete()
move_to_trash(node.id)
# print(delete_query)
# # delete_query.delete(synchronize_session=True)
# session.flush()
......@@ -552,11 +556,13 @@ class NodesChildrenQueries(APIView):
class NodesList(APIView):
authentication_classes = (SessionAuthentication, BasicAuthentication)
def get(self, request):
print("user id : " + str(request.user))
query = (session
.query(Node.id, Node.name, NodeType.name.label('type'))
.filter(Node.user_id == request.session._session_cache['_auth_user_id'])
.filter(Node.user_id == int(request.user.id))
.join(NodeType)
)
if 'type' in request.GET:
......@@ -579,6 +585,8 @@ class Nodes(APIView):
return JsonHttpResponse({
'id': node.id,
'name': node.name,
'parent_id': node.parent_id,
'type': cache.NodeType[node.type_id].name,
# 'type': node.type__name,
#'metadata': dict(node.metadata),
'metadata': node.metadata,
......@@ -589,13 +597,19 @@ class Nodes(APIView):
# it should take the subnodes into account as well,
# for better constistency...
def delete(self, request, node_id):
user = request.user
node = session.query(Node).filter(Node.id == node_id).first()
msgres = ""
msgres = str()
try:
node.delete()
msgres = node_id+" deleted!"
except:
msgres ="error deleting: "+node_id
move_to_trash(node_id)
msgres = node_id+" moved to Trash"
except Exception as error:
msgres ="error deleting : " + node_id + str(error)
return JsonHttpResponse({
'deleted': msgres,
......@@ -611,7 +625,7 @@ class CorpusController:
raise ValidationError('Corpora are identified by an integer.', 400)
corpusQuery = session.query(Node).filter(Node.id == corpus_id).first()
# print(str(corpusQuery))
# raise Http404("C'est toujours ça de pris.")
# raise Http404("404 error.")
if not corpusQuery:
raise Http404("No such corpus: %d" % (corpus_id, ))
corpus = corpusQuery.first()
......
# -*- coding: utf-8 -*-
#import os
#import djcelery
#
#from celery import Celery
#
#from django.conf import settings
#
## set the default Django settings module for the 'celery' program.
#os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext_web.settings')
#
#app = Celery('gargantext_web')
#
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.database:DatabaseBackend',
#)
#
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.cache:CacheBackend',
#)
#
## Using a string here means the worker will not have to
## pickle the object when using Windows.
##app.config_from_object('django.conf:settings')
#app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)
#
from celery import shared_task
from node import models
#@app.task(bind=True)
@shared_task
def debug_task(request):
    # Smoke-test task: log the repr of whatever request object the worker got.
    formatted = 'Request: {0!r}'.format(request)
    print(formatted)
from gargantext_web.db import session, Node
@shared_task
def apply_sum(x, y):
    # Sanity-check task: verifies both plain computation and database
    # access work from inside a Celery worker.
    total = x + y
    print(total)
    first_node_name = session.query(Node.name).first()
    print(first_node_name)
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
@shared_task
def apply_workflow(corpus_id):
    """Celery task: run the full import workflow for corpus *corpus_id*.

    Steps: parse the attached resources, clear the UI 'Processing' flag,
    extract ngrams from titles, then compute tf-idf scores.
    """
    # Fetch the corpus via SQLAlchemy for the processing helpers below.
    corpus = session.query(Node).filter(Node.id==corpus_id).first()

    parse_resources(corpus)

    try:
        print("-" *60)

        # With Django ORM
        # Clear the 'Processing' flag set at corpus creation so the UI stops
        # showing the corpus as busy.
        # NOTE(review): flag is cleared before ngram extraction / tfidf below
        # actually finish — confirm that is intended.
        corpus_django = models.Node.objects.get(id=corpus_id)
        corpus_django.metadata['Processing'] = 0
        corpus_django.save()

        print("-" *60)

        #TODO With SLA ORM (KO why?)
        # corpus.metadata['Processing'] = 0
        # session.add(corpus)
        # session.flush()
    except Exception as error:
        print(error)

    extract_ngrams(corpus, ['title'])
    compute_tfidf(corpus)
......@@ -2,6 +2,7 @@ from gargantext_web import settings
from node import models
__all__ = ['literalquery', 'session', 'cache', 'Session', 'bulk_insert', 'engine', 'get_cursor']
......@@ -56,6 +57,23 @@ for model_name, model in models.__dict__.items():
NodeNgram = Node_Ngram
NodeResource = Node_Resource
# manually declare the Node table...
from datetime import datetime
from sqlalchemy.types import *
from sqlalchemy.schema import Column, ForeignKey
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import relationship, aliased
# class Node(Base):
# __tablename__ = 'node_node'
# id = Column(Integer, primary_key=True)
# user_id = Column(Integer, ForeignKey('auth_user.id', ondelete='CASCADE'), index=True, nullable=False)
# type_id = Column(Integer, ForeignKey('node_nodetype.id', ondelete='CASCADE'), index=True, nullable=False)
# name = Column(String(255))
# language_id = Column(Integer, ForeignKey('node_language.id', ondelete='CASCADE'), index=True, nullable=False)
# date = Column(DateTime(), default=datetime.utcnow, nullable=True)
# metadata = Column(JSONB, default={}, nullable=False)
# debugging tool, to translate SQLAlchemy queries to string
......@@ -67,7 +85,6 @@ def literalquery(statement, dialect=None):
purposes only. Executing SQL statements with inline-rendered user values is
extremely insecure.
"""
from datetime import datetime
import sqlalchemy.orm
if isinstance(statement, sqlalchemy.orm.Query):
if dialect is None:
......
......@@ -14,11 +14,33 @@ BASE_DIR = os.path.dirname(os.path.dirname(__file__))
PROJECT_PATH = os.path.join(BASE_DIR, os.pardir)
PROJECT_PATH = os.path.abspath(PROJECT_PATH)
######################################################################
# ASYNCHRONOUS TASKS
import djcelery
djcelery.setup_loader()
BROKER_URL = 'amqp://guest:guest@localhost:5672/'
CELERY_IMPORTS=("node.models",)
CELERY_IMPORTS=("node.models","gargantext_web.celery")
#
#from celery import Celery
#
#app = Celery('gargantext_web')
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.database:DatabaseBackend',
#)
#
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.cache:CacheBackend',
#)
#
######################################################################
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.6/howto/deployment/checklist/
......@@ -48,8 +70,16 @@ TEMPLATE_DIRS = (
#ALLOWED_HOSTS = ['*',]
ALLOWED_HOSTS = ['localhost', 'master.polemic.be', 'beta.gargantext.org']
ALLOWED_HOSTS = ['localhost',
'gargantext.org',
'stable.gargantext.org',
'dev.gargantext.org',
'iscpif.gargantext.org',
'mines.gargantext.org',
'beta.gargantext.org',
'garg-dev.iscpif.fr',
'garg-stable.iscpif.fr',
]
# Application definition
......@@ -82,6 +112,16 @@ MIDDLEWARE_CLASSES = (
'django.middleware.clickjacking.XFrameOptionsMiddleware',
)
REST_SESSION_LOGIN = False
REST_FRAMEWORK = {
'DEFAULT_AUTHENTICATION_CLASSES': (
'rest_framework.authentication.TokenAuthentication',
'rest_framework.authentication.SessionAuthentication',
),
'DEFAULT_PERMISSION_CLASSES': (
'rest_framework.permissions.AllowAny',
),
}
WSGI_APPLICATION = 'wsgi.application'
......@@ -93,7 +133,7 @@ DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql_psycopg2',
'NAME': 'gargandb',
'USER': 'alexandre',
'USER': 'gargantua',
'PASSWORD': 'C8kdcUrAQy66U',
#'USER': 'gargantext',
#'PASSWORD': 'C8krdcURAQy99U',
......
from celery import shared_task
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
@shared_task
def apply_workflow(corpus):
    """Celery task: run the import workflow on *corpus* — parse its
    resources, extract ngrams from titles, then compute tf-idf."""
    parse_resources(corpus)
    extract_ngrams(corpus, ['title'])
    compute_tfidf(corpus)
......@@ -33,7 +33,7 @@ urlpatterns = patterns('',
# Project Management
url(r'^projects/$', views.projects),
url(r'^project/(\d+)/$', views_optimized.project),
url(r'^delete/(\d+)$', views.trash_node), # => api.node('id' = id, children = 'True', copies = False)
url(r'^delete/(\d+)$', views.delete_node), # => api.node('id' = id, children = 'True', copies = False)
# Corpus management
url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
......
......@@ -46,6 +46,8 @@ from sqlalchemy import or_, func
from gargantext_web import about
def login_user(request):
logout(request)
username = password = ''
......@@ -199,7 +201,6 @@ def home_view(request):
t = get_template('home.html')
user = request.user
date = datetime.datetime.now()
html = t.render(Context({\
'user': user,\
'date': date,\
......@@ -455,7 +456,21 @@ def empty_trash():
node.delete()
def trash_node(request, node_id):
def move_to_trash(node_id):
    """Soft-delete: switch node *node_id*'s type to 'Trash'.

    Returns the node's previous type id on success, or None when anything
    fails (node missing, DB error); failures are only logged, never raised.
    """
    try:
        node = session.query(Node).filter(Node.id == node_id).first()
        # If no such node exists, node is None and the next line raises,
        # which is caught and logged below.
        previous_type_id = node.type_id
        node.type_id = cache.NodeType['Trash'].id
        session.add(node)
        session.commit()
        return previous_type_id
    except Exception as error:
        # fixed: original concatenated node_id (possibly int) and the raw
        # exception object with "+", which itself raised a TypeError here.
        print("can not move to trash Node " + str(node_id) + ":" + str(error))
def delete_node(request, node_id):
# do we have a valid user?
user = request.user
......@@ -466,52 +481,18 @@ def trash_node(request, node_id):
if node.user_id != user.id:
return HttpResponseForbidden()
previous_type_id = node.type_id
node.type_id = cache.NodeType['Trash'].id
session.add(node)
session.commit()
if previous_type_id == cache.NodeType['Project'].id:
previous_type_id = move_to_trash(node_id)
if previous_type_id == cache.NodeType['Corpus'].id:
return HttpResponseRedirect('/project/' + str(node.parent_id))
else:
return HttpResponseRedirect('/projects/')
elif previous_type_id == cache.NodeType['Corpus'].id:
return HttpResponseRedirect('/project/' + str(session.query(Node.id).filter(Node.id==node.parent_id).first()[0]))
if settings.DEBUG == True:
empty_trash()
def delete_node(request, node_id):
    """Permanently delete node *node_id* and its children, then redirect.

    Unlike move_to_trash(), this removes the rows for good (Django ORM,
    inside one transaction). Redirects to the project list for a Project
    node, or to the parent project page for a Corpus node.
    NOTE(review): request.user is never checked against node.user_id here —
    confirm ownership is enforced by the caller / URL routing.
    """
    #nodes = session.query(Node).filter(or_(Node.id == node_id, Node.parent_id == node_id)).all()
    # try:
    # resources = session.query(Node_Resource).filter(Node_Resource.node_id==node_id).all()
    # if resources is not None:
    # for resource in resources:
    # session.delete(resource)
    #
    # except Exception as error:
    # print(error)
    #
    # node = session.query(Node).filter(Node.id == node_id).first()
    # if node is not None:
    # session.delete(node)
    # session.commit()
    node = models.Node.objects.get(id=node_id)
    with transaction.atomic():
        try:
            # Delete children first; a failure here is logged but does not
            # abort deleting the node itself.
            node.children.delete()
        except Exception as error:
            print(error)
        node.delete()

    if node.type_id == cache.NodeType['Project'].id:
        return HttpResponseRedirect('/projects/')
    elif node.type_id == cache.NodeType['Corpus'].id:
        # node_id comes from the URL pattern as a string, so "+" is valid here
        return HttpResponseRedirect('/project/' + node_id)
def delete_corpus(request, project_id, node_id):
# ORM Django
......
......@@ -7,6 +7,7 @@ from sqlalchemy.orm import aliased
from collections import defaultdict
from datetime import datetime
from time import sleep
from threading import Thread
from node.admin import CustomForm
......@@ -14,14 +15,14 @@ from gargantext_web.db import *
from gargantext_web.settings import DEBUG, MEDIA_ROOT
from gargantext_web.api import JsonHttpResponse
import json
import re
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
def project(request, project_id):
from gargantext_web.celery import apply_workflow
# SQLAlchemy session
session = Session()
def project(request, project_id):
# do we have a valid project id?
try:
......@@ -51,7 +52,7 @@ def project(request, project_id):
# ... sqlalchemy.func by Resource.type_id is the guilty
# ISSUE L51
corpus_query = (session
.query(Node.id, Node.name, func.count(ChildrenNode.id))
.query(Node.id, Node.name, func.count(ChildrenNode.id), Node.metadata['Processing'])
#.query(Node.id, Node.name, Resource.type_id, func.count(ChildrenNode.id))
#.join(Node_Resource, Node_Resource.node_id == Node.id)
#.join(Resource, Resource.id == Node_Resource.resource_id)
......@@ -66,8 +67,10 @@ def project(request, project_id):
documents_count_by_resourcetype = defaultdict(int)
corpora_count = 0
corpusID_dict = {}
for corpus_id, corpus_name, document_count in corpus_query:
for corpus_id, corpus_name, document_count, processing in corpus_query:
#print(corpus_id, processing)
# Not optimized GOTO ISSUE L51
resource_type_id = (session.query(Resource.type_id)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
......@@ -82,9 +85,10 @@ def project(request, project_id):
resourcetype = cache.ResourceType[resource_type_id]
resourcetype_name = resourcetype.name
corpora_by_resourcetype[resourcetype_name].append({
'id': corpus_id,
'name': corpus_name,
'count': document_count,
'id' : corpus_id,
'name' : corpus_name,
'count' : document_count,
'processing': processing,
})
documents_count_by_resourcetype[resourcetype_name] += document_count
corpora_count += 1
......@@ -93,7 +97,7 @@ def project(request, project_id):
# do the donut
total_documents_count = sum(documents_count_by_resourcetype.values())
donut = [
{ 'source': key,
{ 'source': re.sub(' \(.*$', '', key),
'count': value,
'part' : round(value * 100 / total_documents_count) if total_documents_count else 0,
}
......@@ -112,20 +116,21 @@ def project(request, project_id):
resourcetype = cache.ResourceType[form.cleaned_data['type']]
# which default language shall be used?
if resourcetype.name == "europress_french":
if resourcetype.name == "Europress (French)":
language_id = cache.Language['fr'].id
elif resourcetype.name == "europress_english":
elif resourcetype.name == "Europress (English)":
language_id = cache.Language['en'].id
else:
language_id = None
# corpus node instanciation as a Django model
corpus = Node(
name = name,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
name = name,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = language_id,
metadata = {'Processing' : 1,}
)
session.add(corpus)
session.commit()
......@@ -142,25 +147,25 @@ def project(request, project_id):
)
# let's start the workflow
try:
def apply_workflow(corpus):
parse_resources(corpus)
extract_ngrams(corpus, ['title'])
compute_tfidf(corpus)
if DEBUG:
apply_workflow(corpus)
if DEBUG is False:
apply_workflow.apply_async((corpus.id,),)
else:
thread = Thread(target=apply_workflow, args=(corpus, ), daemon=True)
thread.start()
#apply_workflow(corpus)
thread = Thread(target=apply_workflow, args=(corpus.id, ), daemon=True)
thread.start()
except Exception as error:
print('WORKFLOW ERROR')
print(error)
# redirect to the main project page
# TODO need to wait before response (need corpus update)
sleep(1)
return HttpResponseRedirect('/project/' + str(project_id))
else:
print('ERROR: BAD FORM')
else:
form = CustomForm()
# HTML output
return render(request, 'project.html', {
'form' : form,
......
......@@ -39,10 +39,10 @@ In PostreSQL
3) psql
4) CREATE USER alexandre WITH PASSWORD 'C8kdcUrAQy66U';
4) CREATE USER gargantua WITH PASSWORD 'C8kdcUrAQy66U';
(see gargantext_web/settings.py, DATABASES = { ... })
5) CREATE DATABASE gargandb WITH OWNER alexandre;
5) CREATE DATABASE gargandb WITH OWNER gargantua;
6) Ctrl + D
......@@ -80,7 +80,7 @@ Last steps of configuration
Warning: for ln, path has to be absolute!
5) patch CTE:
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/cte_tree.models.diff
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/patches/cte_tree.models.diff
6) init nodetypes and main variables
/srv/gargantext/manage.py shell < /srv/gargantext/init/init.py
......
sudo apt-get install postgresql
sudo apt-get install postgresql-contrib
sudo apt-get install python-virtualenv
sudo apt-get install libpng12-dev
sudo apt-get install libpng-dev
sudo apt-cache search freetype
sudo apt-get install libfreetype6-dev
sudo apt-cache search python-dev
sudo apt-get install python-dev
sudo apt-get install libpq-dev
sudo apt-get postgresql-contrib
sudo aptèget install libpq-dev
# To install all matplotlib dependencies (dirty approach — find a cleaner one)
sudo apt-get build-dep python-matplotlib
#Paquets Debian a installer
# easy_install -U distribute (matplotlib)
#lxml
sudo apt-get install libffi-dev
sudo apt-get install libxml2-dev
sudo apt-get install libxslt1-dev
# ipython readline
sudo apt-get install libncurses5-dev
sudo apt-get install pandoc
# scipy:
sudo apt-get install gfortran
sudo apt-get install libopenblas-dev
sudo apt-get install liblapack-dev
#nlpserver
sudo apt-get install libgflags-dev
sudo aptitude install libgoogle-glog-dev
source /srv/gargantext_env/bin/activate
pip3 install git+https://github.com/mathieurodic/aldjemy.git
ALTER TABLE ONLY node_node
ALTER COLUMN metadata
DROP NOT NULL
;
ALTER TABLE ONLY node_node
ALTER COLUMN metadata
DROP DEFAULT
;
ALTER TABLE ONLY node_node
ALTER COLUMN metadata
TYPE JSON
USING hstore_to_json(metadata)
;
ALTER TABLE ONLY node_node
ALTER COLUMN metadata
SET DEFAULT '{}'::json
;
ALTER TABLE ONLY node_node
ALTER COLUMN metadata
SET NOT NULL
;
......@@ -104,30 +104,15 @@ except Exception as error:
# In[33]:
try:
typePubmed = ResourceType.objects.get(name='pubmed')
typeIsi = ResourceType.objects.get(name='isi')
typeRis = ResourceType.objects.get(name='ris')
typePresseFrench = ResourceType.objects.get(name='europress_french')
typePresseEnglish = ResourceType.objects.get(name='europress_english')
from parsing.parsers_config import parsers
except Exception as error:
print(error)
typePubmed = ResourceType(name='pubmed')
typePubmed.save()
typeIsi = ResourceType(name='isi')
typeIsi.save()
typeRis = ResourceType(name='ris')
typeRis.save()
typePresseFrench = ResourceType(name='europress_french')
typePresseFrench.save()
typePresseEnglish = ResourceType(name='europress_english')
typePresseEnglish.save()
ResourceType.objects.all().delete()
for key in parsers.keys():
try:
ResourceType.objects.get_or_create(name=key)
except Exception as error:
print("Ressource Error: ", error)
# In[34]:
......
......@@ -4,6 +4,7 @@ psql -d gargandb -f init.sql
sleep 2
../manage.py syncdb
psql -d gargandb -f init2.sql
......
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE;
-- ALTER TABLE ONLY node_node ALTER COLUMN metadata SET DEFAULT ''::hstore;
-- ALTER TABLE ONLY node_node ALTER COLUMN metadata TYPE JSONB;
-- ALTER TABLE ONLY node_node ALTER COLUMN metadata SET DEFAULT '{}'::JSONB;
......@@ -54,16 +54,13 @@ import pycountry
Language.objects.all().delete()
for language in pycountry.languages:
if 'alpha2' in language.__dict__:
Language(
models.Language(
iso2 = language.alpha2,
iso3 = language.bibliographic,
fullname = language.name,
implemented = 1 if language.alpha2 in ['en', 'fr'] else 0,
).save()
english = Language.objects.get(iso2='en')
french = Language.objects.get(iso2='fr')
# Integration: users
......@@ -99,56 +96,10 @@ for node_type in node_types:
print('Initialize resource...')
resources = [
'pubmed', 'isi', 'ris', 'europress_french', 'europress_english']
for resource in resources:
models.ResourceType.objects.get_or_create(name=resource)
# TODO
# here some tests
# add a new project and some corpora to test it
# Integration: project
#
#print('Initialize project...')
#try:
# project = Node.objects.get(name='Bees project')
#except:
# project = Node(name='Bees project', type=typeProject, user=me)
# project.save()
#
# Integration: corpus
#print('Initialize corpus...')
#try:
# corpus_pubmed = Node.objects.get(name='PubMed corpus')
#except:
# corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)
# corpus_pubmed.save()
#
#print('Initialize resource...')
#corpus_pubmed.add_resource(
# # file='./data_samples/pubmed.zip',
# #file='./data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
# file='/srv/gargantext_lib/data_samples/pubmed.xml',
# type=typePubmed,
# user=me
#)
#
#for resource in corpus_pubmed.get_resources():
# print('Resource #%d - %s - %s' % (resource.id, resource.digest, resource.file))
#
## print('Parse corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.parse_resources(verbose=True)
# print('Extract corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.children.all().extract_ngrams(['title',])
# print('Parsed corpus #%d.' % (corpus_pubmed.id, ))
from parsing.parsers_config import parsers
for parser in parsers.keys():
models.ResourceType.objects.get_or_create(name=parser)
......
# See /usr/share/postfix/main.cf.dist for a commented, more complete version
# Debian specific: Specifying a file name will cause the first
# line of that file to be used as the name. The Debian default
# is /etc/mailname.
#myorigin = /etc/mailname
smtpd_banner = $myhostname ESMTP $mail_name (Debian)
biff = no
# appending .domain is the MUA's job.
append_dot_mydomain = no
# Uncomment the next line to generate "delayed mail" warnings
#delay_warning_time = 4h
readme_directory = no
# TLS parameters
smtpd_tls_cert_file=/etc/ssl/certs/ssl-cert-snakeoil.pem
smtpd_tls_key_file=/etc/ssl/private/ssl-cert-snakeoil.key
smtpd_use_tls=yes
smtpd_tls_session_cache_database = btree:${data_directory}/smtpd_scache
smtp_tls_session_cache_database = btree:${data_directory}/smtp_scache
# See /usr/share/doc/postfix/TLS_README.gz in the postfix-doc package for
# information on enabling SSL in the smtp client.
myhostname = garg-dev.iscpif.fr
alias_maps = hash:/etc/aliases
alias_database = hash:/etc/aliases
myorigin = /etc/mailname
mydestination = garg-dev.iscpif.fr, localhost.iscpif.fr, , localhost
relayhost = smtp.iscpif.fr
mynetworks = 127.0.0.0/8 [::ffff:127.0.0.0]/104 [::1]/128
mailbox_size_limit = 0
recipient_delimiter = +
inet_interfaces = all
#!/bin/bash
# Bootstrap a new server: mail setup, wheezy->jessie upgrade, SSH key
# exchange, and initial clone of the gargantext repository into /srv.

apt-get install sudo
sudo apt-get install postfix

# copy from tina
sudo cp 0*cf /etc/postfix/main.cf
sudo postfix reload

sed -i 's/wheezy/jessie/g' /etc/apt/sources.list
sudo aptitude update
sudo aptitude dist-upgrade
# dpkg-reconfigure locales => add GB

# generate our key, mail it out, then wait until it is authorized remotely
ssh-keygen
cat ~/.ssh/id_rsa.pub | mail alexandre@delanoe.org -s "Key Server $(hostname)"
echo "Put ~/.ssh/id_rsa.pub on remote to enable git pull please and press enter"
read answer

sudo mkdir /srv/gargantext
cd /srv
# fixed: /srv/gargantext was created by root above, so chown needs sudo too
sudo chown gargantua:www-data gargantext
# fixed: original line was malformed ("git clone ssh orign ssh://...")
git clone ssh://gitolite@delanoe.org:1979/gargantext
#!/bin/dash
# TODO do apt-get install --force-yes --force-yes
apt-get install --force-yes postgresql
apt-get install --force-yes postgresql-contrib
apt-get install --force-yes rabbitmq-server
apt-get install --force-yes tmux
apt-get install --force-yes uwsgi uwsgi-plugin-python3
apt-get install --force-yes python3.4-venv
#apt-get install --force-yes python-virtualenv
apt-get install --force-yes libpng12-dev
apt-get install --force-yes libpng-dev
apt-get install --force-yes libfreetype6-dev
apt-get install --force-yes python-dev
apt-get install --force-yes libpq-dev
apt-get install --force-yes libpq-dev
#apt-get build-dep python-matplotlib
#apt-get install --force-yes python-matplotlib
#Paquets Debian a installer
# easy_install --force-yes -U distribute (matplotlib)
#lxml
apt-get install --force-yes libffi-dev
apt-get install --force-yes libxml2-dev
apt-get install --force-yes libxslt1-dev
# ipython readline
apt-get install --force-yes libncurses5-dev
apt-get install --force-yes pandoc
# scipy:
apt-get install --force-yes gfortran
apt-get install --force-yes libopenblas-dev
apt-get install --force-yes liblapack-dev
#nlpserver
apt-get install --force-yes libgflags-dev
aptitude install --force-yes libgoogle-glog-dev
# MElt
# soon
## SERVER Configuration
# server configuration
apt-get install --force-yes nginx
# UWSGI with pcre support
apt-get install --force-yes libpcre3 libpcre3-dev
apt-get install --force-yes python3-pip
pip3 install --force-yes uwsgi
#!/bin/dash
sudo mkdir /srv/gargantext_env
sudo chown -R gargantua:www-data /srv/gargantext_env
pyvenv3 /srv/gargantext_env
source /srv/gargantext_env/bin/activate
pip install --upgrade pip
pip install -r 3-requirements.txt
pip3 install git+https://github.com/mathieurodic/aldjemy.git
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/patches/cte_tree.models.diff
......@@ -5,14 +5,14 @@ MarkupSafe==0.23
Pillow==2.5.3
Pygments==1.6
RandomWords==0.1.12
SQLAlchemy==0.9.8
SQLAlchemy==0.9.9
South==1.0
aldjemy==0.3.10
amqp==1.4.6
anyjson==0.3.3
bibtexparser==0.6.0
billiard==3.3.0.18
celery==3.1.15
billiard==3.3.0.19
celery==3.1.17
certifi==14.05.14
cffi==0.8.6
chardet==2.3.0
......@@ -36,7 +36,7 @@ djangorestframework==3.0.0
gensim==0.10.3
graphviz==0.4
ipython==2.2.0
kombu==3.0.23
kombu==3.0.24
lxml==3.4.1
#matplotlib==1.4.0
networkx==1.9
......@@ -52,7 +52,7 @@ pydot2==1.0.33
pyparsing==2.0.2
python-dateutil==2.2
python-igraph==0.7
pytz==2014.7
pytz==2015.2
pyzmq==14.3.1
readline==6.2.4.1
redis==2.10.3
......
#!/bin/bash
cd /tmp/
wget http://docs.delanoe.org/gargantext_lib.tar.bz2
cd /srv/
sudo mkdir gargantext_lib
sudo chown -R gargantua:www-data /srv/gargantext_lib
tar xvjf /tmp/gargantext_lib.tar.bz2
sudo chown -R gargantua:www-data /srv/gargantext_lib
cd /srv/gargantext_lib/js
git pull origin master git@github.com:PkSM3/garg.git
In PostgreSQL
-------------
1) Ensure postgres is started: sudo /etc/init.d/postgresql start
2) sudo su postgres
3) psql
4) CREATE USER gargantua WITH PASSWORD 'C8kdcUrAQy66U';
(see gargantext_web/settings.py, DATABASES = { ... })
5) CREATE DATABASE gargandb WITH OWNER gargantua;
6) Ctrl + D
7) psql gargandb
6) CREATE EXTENSION hstore;
7) Ctrl + D
# the upstream component nginx needs to connect to
upstream gargantext {
server unix:///tmp/gargantext.sock; # for a file socket
#server 127.0.0.1:8001; # for a web port socket (we'll use this first)
}
# configuration of the server
server {
# the port your site will be served on
listen 8002;
# the domain name it will serve for
server_name localhost; # substitute your machine's IP address or FQDN
charset utf-8;
# max upload size
client_max_body_size 75M; # adjust to taste
# Django media
location /media {
alias /var/www/gargantext/media; # your Django project's media files - amend as required
}
location /static {
alias /var/www/gargantext/static; # your Django project's static files - amend as required
}
# Finally, send all non-media requests to the Django server.
location / {
uwsgi_pass gargantext;
include uwsgi_params;
}
}
# django.ini file
[uwsgi]
env = DJANGO_SETTINGS_MODULE=gargantext_web.settings
#module = django.core.handlers.wsgi:WSGIHandler()
#touch-reload= /tmp/gargantext.reload
# the base directory
chdir = /srv/gargantext
# Django's wsgi file
#module = wsgi
wsgi-file = /srv/gargantext/wsgi.py
# the virtualenv
home = /srv/gargantext_env/
# master
master = true
# maximum number of processes
processes = 10
# the socket (use the full path to be safe)
socket = /tmp/gargantext.sock
# with appropriate permissions - *may* be needed
chmod-socket = 666
# clear environment on exit
vacuum = true
pidfile = /tmp/gargantext.pid
# respawn processes taking more than 20 seconds
harakiri = 120
# limit the project to 128 MB
#limit-as = 128
# respawn processes after serving 5000 requests
max-requests = 5000
# background the process & log
#daemonize = /var/log/uwsgi/gargantext.log
uid = 1000
gid = 1000
#!/bin/dash
#
echo "Copy nginx configuration in sites available"
sudo cp 4-NGINX_gargantext.conf /etc/nginx/sites-available
echo "Enable site"
cd /etc/nginx/sites-enable
sudo ln -s ../sites-available/gargantext.conf
sudo service nginx restart
echo "Copy UWSGI configuration"
sudo cp 4-UWSGI_gargantext.ini /etc/uwsgi/
sudo service uwsgi restart
#!/bin/bash
git checkout stable
source /srv/gargantext_env/bin/activate
cd /srv/gargantext
./manage.py collectstatic
chown -R gargantua:www-data /var/www/gargantext
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE ;
# Transfer ownership of every public-schema table in gargandb to gargantua.
for tbl in $(psql -qAt -c "select tablename from pg_tables where schemaname = 'public';" gargandb) ; do
    psql -c "alter table $tbl owner to gargantua" gargandb ;
done
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata DROP NOT NULL ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata DROP DEFAULT ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata TYPE JSONB USING hstore_to_json(metadata)::jsonb ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata SET DEFAULT '{}'::jsonb ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata SET NOT NULL ;
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE ;
......@@ -29,6 +29,7 @@ from celery import current_app
import os
import subprocess
from parsing.parsers_config import parsers
# Some usefull functions
# TODO: start the function name with an underscore (private)
......@@ -194,15 +195,19 @@ class Node(CTENode):
print("= = = = = = = = = = =\n")
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
'istext' : ISText,
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
})[resource.type.name]()
parser = defaultdict(lambda:FileParser.FileParser, parsers
# {
# 'istext' : ISText,
# 'pubmed' : PubmedFileParser,
# 'isi' : IsiFileParser,
# 'ris' : RisFileParser,
# 'RIS (Jstor)' : JstorFileParser,
# 'europress' : EuropressFileParser,
# 'europress_french' : EuropressFileParser,
# 'europress_english' : EuropressFileParser,
# }
)[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache()
......
from .RisFileParser import RisFileParser
class IsiFileParser(RisFileParser):
_parameters = {
......
from .RisFileParser import RisFileParser
class JstorFileParser(RisFileParser):
    """RIS parser variant preconfigured for Jstor exports.

    Only the tag table differs from the generic RIS parser: Jstor
    delivers the DOI under ``UR`` and splits the publication date
    across ``Y1`` (year) and ``PD`` (month).
    """

    # Maps each two-letter RIS tag (bytes) to its handling rule.
    # "delimiter" presumably closes the current record; "metadata"
    # stores the value under `key`, and `separator` looks like the
    # join string for repeated lines — confirm against RisFileParser.
    _parameters = {
        b"ER": dict(type="delimiter"),
        b"TI": dict(type="metadata", key="title", separator=" "),
        b"AU": dict(type="metadata", key="authors", separator=", "),
        b"UR": dict(type="metadata", key="doi"),
        b"Y1": dict(type="metadata", key="publication_year"),
        b"PD": dict(type="metadata", key="publication_month"),
        b"LA": dict(type="metadata", key="language_iso2"),
        b"AB": dict(type="metadata", key="abstract", separator=" "),
        b"WC": dict(type="metadata", key="fields"),
    }
from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
......@@ -7,8 +7,7 @@ from math import log
from gargantext_web.db import *
from .FileParsers import *
from .parsers_config import parsers as _parsers
class DebugTime:
......@@ -31,18 +30,12 @@ class DebugTime:
# keep all the parsers in a cache
class Parsers(defaultdict):
_parsers = {
'pubmed' : PubmedFileParser,
'istex' : ISText,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
}
def __init__(self):
self._parsers = _parsers
def __missing__(self, key):
if key not in self._parsers:
#print(self._parsers.keys())
if key not in self._parsers.keys():
raise NotImplementedError('No such parser: "%s"' % (key))
parser = self._parsers[key]()
self[key] = parser
......@@ -238,11 +231,13 @@ def extract_ngrams(corpus, keys):
terms = ' '.join([token for token, tag in ngram]).lower()
# TODO BUG here
if n == 1:
tag_id = cache.Tag[ngram[0][1]].id
#tag_id = 1
#tag_id = cache.Tag[ngram[0][1]].id
tag_id = 1
#print('tag_id', tag_id)
elif n > 1:
tag_id = cache.Tag['NN'].id
tag_id = 1
#tag_id = cache.Tag[ngram[0][1]].id
#tag_id = cache.Tag['NN'].id
#tag_id = 14
#print('tag_id_2', tag_id)
node_ngram_list[node_id][terms] += 1
......
from .FileParsers import *
# Registry mapping the human-readable resource-type name (the value
# stored in the ResourceType table and shown in the upload UI) to the
# FileParser subclass that parses files of that type.  Looked up via
# resource.type.name in Node.parse_resources and the Parsers cache.
parsers = {
'Pubmed (xml format)' : PubmedFileParser,
'Web of Science (ISI format)' : IsiFileParser,
'Scopus (RIS format)' : RisFileParser,
'Zotero (RIS format)' : RisFileParser,
'Jstor (RIS format)' : JstorFileParser,
# NOTE(review): generic 'Europress' key left commented out in favour of
# the two language-specific entries below (both use EuropressFileParser).
#'Europress' : EuropressFileParser,
'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser,
}
......@@ -34,13 +34,15 @@ from gargantext_web.api import JsonHttpResponse
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
from gargantext_web.celery import apply_workflow
from time import sleep
def getGlobalStats(request ):
print(request.method)
alist = ["bar","foo"]
if request.method == "POST":
N = 1000
N = 100
query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
......@@ -81,9 +83,6 @@ def getGlobalStatsISTEXT(request ):
def doTheQuery(request , project_id):
alist = ["hola","mundo"]
# SQLAlchemy session
session = Session()
# do we have a valid project id?
try:
project_id = int(project_id)
......@@ -120,7 +119,7 @@ def doTheQuery(request , project_id):
urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
alist = ["tudo fixe" , "tudo bem"]
resourcetype = cache.ResourceType["pubmed"]
resourcetype = cache.ResourceType["Pubmed (xml format)"]
# corpus node instanciation as a Django model
corpus = Node(
......@@ -129,6 +128,7 @@ def doTheQuery(request , project_id):
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = None,
metadata = {'Processing' : 1,}
)
session.add(corpus)
session.commit()
......@@ -165,18 +165,15 @@ def doTheQuery(request , project_id):
if dwnldsOK == 0: return JsonHttpResponse(["fail"])
try:
def apply_workflow(corpus):
parse_resources(corpus)
extract_ngrams(corpus, ['title'])
compute_tfidf(corpus)
if DEBUG:
apply_workflow(corpus)
if not DEBUG:
apply_workflow.apply_async((corpus.id,),)
else:
thread = threading.Thread(target=apply_workflow, args=(corpus, ), daemon=True)
thread = threading.Thread(target=apply_workflow, args=(corpus.id, ), daemon=True)
thread.start()
except Exception as error:
print('WORKFLOW ERROR')
print(error)
sleep(1)
return HttpResponseRedirect('/project/' + str(project_id))
data = alist
......
......@@ -83,11 +83,12 @@
<li>{{ key }}</li>
<ul>
{% for corpus in corpora %}
<li> {% ifnotequal corpus.count 0 %}
<a href="/project/{{project.id}}/corpus/{{corpus.id}}"> {{corpus.name}} </a> , {{ corpus.count }} Documents
{% else %}
<li>
{% ifequal corpus.processing 1 %}
{{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Processing, drink a cup of tea, and refresh the page :)
{% endifnotequal %}
{% else %}
<a href="/project/{{project.id}}/corpus/{{corpus.id}}"> {{corpus.name}} </a> , {{ corpus.count }} Documents
{% endifequal %}
<button type="button" class="btn btn-xs btn-default" data-container="body" data-toggle="popover" data-placement="bottom"
data-content='
<ul>
......@@ -321,7 +322,7 @@
console.log("theType:")
console.log(theType)
if(theType=="pubmed") {
if(theType=="Pubmed (xml format)") {
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/pubmedquery",
......@@ -336,7 +337,7 @@
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 1000 sample!")
$("#submit_thing").html("Process a 100 sample!")
thequeries = data
var N=0,k=0;
......@@ -425,7 +426,7 @@
$( "#id_name" ).on('input',function(e){
console.log($(this).val())
if(theType=="pubmed") testPUBMED( $(this).val() )
if(theType=="Pubmed (xml format)") testPUBMED( $(this).val() )
});
}
}
......@@ -433,7 +434,7 @@
//CSS events for changing the Select element
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if(selected=="pubmed" || selected=="istex") {
if(selected=="Pubmed (xml format)" || selected=="istext") {
// if(selected=="pubmed") {
console.log("show the button for: "+selected)
$("#pubmedcrawl").css("visibility", "visible");
......
......@@ -19,18 +19,21 @@
{% if documents %}
<div id="delAll" style="visibility: hidden;">
<button onclick="deleteDuplicates(theurl);">Delete Duplicates</button>
</div>
<ul>
{% for doc in documents %}
{% if doc.date %}
<li><div id="doc_{{doc.id}}"> <b>{{ doc.date }}</b>: <a target="_blank" href="/nodeinfo/{{doc.id}}">{{ doc.name}}</a> , @ {{ doc.metadata.source}}</div></li>
{% endif %}
{% endfor %}
<div id="delAll" style="visibility: hidden;">
<center>
<button onclick="deleteDuplicates(theurl);">Delete all Duplicates in one click</button>
</center>
</div>
</ul>
<script>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment