Commit 591915ef authored by sim

Remove notebook

parent 50872e57
install/notebook/gargantext_notebook.py
\ No newline at end of file
#!/bin/bash
# Build the garg-notebook Docker image from ./notebook.
#
# Steps:
#   1. make sure the host has a "notebooks" user,
#   2. remove the containers whose `docker ps -a` line contains "sh",
#   3. build the image and tag it garg-notebook:latest.

# Only create the user when it is missing, so the script can be re-run.
if ! id -u notebooks >/dev/null 2>&1; then
  sudo adduser --disabled-password --gecos "" notebooks
fi

# Collect the first column (container ID) of every matching line.
# `docker rm` fails when called without arguments, so it is only invoked
# when at least one container was found.
mapfile -t stale_containers < <(sudo docker ps -a | awk '/sh/ {print $1}')
if (( ${#stale_containers[@]} > 0 )); then
  sudo docker rm "${stale_containers[@]}"
fi

sudo docker build -t garg-notebook:latest ./notebook
#!/bin/bash
# Start the Jupyter notebook server inside a garg-notebook container.
#
# The server runs as user "notebooks" from the /env_3-5 virtualenv and
# listens on port 8899 on all interfaces.

#-v /srv/gargandata:/srv/gargandata \
#-v /srv/gargantext_lib:/srv/gargantext_lib \

# Remove previous notebook containers: every `docker ps -a` line that
# contains both "notebook" and "sh". `docker rm` fails when called without
# arguments, so it is only invoked when something matched.
mapfile -t stale_containers < <(
  sudo docker ps -a | grep notebook | awk '/sh/ {print $1}'
)
if (( ${#stale_containers[@]} > 0 )); then
  sudo docker rm "${stale_containers[@]}"
fi

#HOSTIP=$(ip route show 0.0.0.0/0 | awk '{print $3}')
#--add-host=localhost:${HOSTIP} \

run_options=(
  --name=garg-notebook
  --net=host
  # NOTE(review): with --net=host Docker is documented to ignore published
  # ports; the flag is kept as in the original -- confirm it is still wanted.
  -p 8899:8899
  --env POSTGRES_HOST=localhost
  -v /srv/gargantext:/srv/gargantext
  -it
)

sudo docker run "${run_options[@]}" garg-notebook:latest \
  /bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /home/notebooks && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser'"
# #&& jupyter nbextension enable --py widgetsnbextension --sys-prefix
#/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /srv/gargantext/ && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser --notebook-dir=/home/notebooks/'"
###########################################################
# Gargamelle WEB
###########################################################
# Build an image starting from the debian:stretch image
# which contains all the source code of the app
FROM debian:stretch
# MAINTAINER is deprecated by the Dockerfile reference; the maintainer is
# recorded as a label instead.
LABEL maintainer="ISCPIF <gargantext@iscpif.fr>"
USER root
### Update and install base dependencies
RUN echo "############ DEBIAN LIBS ###############"
RUN apt-get update && \
    apt-get install -y \
        apt-utils ca-certificates locales \
        sudo aptitude gcc g++ wget git vim \
        build-essential make \
        curl
# postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6 \
# postgresql-server-dev-9.6 libpq-dev libxml2 \
# postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6
# Install Stack
### Configure timezone and locale
RUN echo "########### LOCALES & TZ #################"
# Time zone: written to /etc/timezone and exported as TZ.
RUN echo "Europe/Paris" > /etc/timezone
ENV TZ="Europe/Paris"
# Uncomment the en_GB and fr_FR UTF-8 entries of /etc/locale.gen, regenerate
# the locales without prompting, then make fr_FR.UTF-8 the default LANG.
RUN sed -i \
        -e 's/# en_GB.UTF-8 UTF-8/en_GB.UTF-8 UTF-8/' \
        -e 's/# fr_FR.UTF-8 UTF-8/fr_FR.UTF-8 UTF-8/' \
        /etc/locale.gen && \
    dpkg-reconfigure --frontend=noninteractive locales && \
    echo 'LANG="fr_FR.UTF-8"' > /etc/default/locale
ENV LANG=fr_FR.UTF-8 \
    LANGUAGE=fr_FR.UTF-8 \
    LC_ALL=fr_FR.UTF-8
### Install main dependencies and python packages based on Debian distrib
RUN echo "############# PYTHON DEPENDENCIES ###############"
# Package groups, in the order they are listed below:
#   - XML, Fortran and PostgreSQL development files, Python 3.5 and headers
#   - for numpy, pandas and numpyperf
#   - python dependencies (pip)
#   - for lxml
RUN apt-get update && \
    apt-get install -y \
        libxml2-dev xml-core libgfortran-6-dev \
        libpq-dev \
        python3.5 \
        python3-dev \
        python3-six python3-numpy python3-setuptools \
        python3-numexpr \
        python3-pip \
        libxml2-dev libxslt-dev libxslt1-dev zlib1g-dev
# UPDATE AND CLEAN
RUN apt-get update && \
    apt-get autoclean && \
    rm -rf /var/lib/apt/lists/*
# NB (original author's note): removing the apt lists avoids significantly
# filling up the /var/ folder on the native system.
########################################################################
### PYTHON ENVIRONMENT (as ROOT)
########################################################################
# Unprivileged user that will own the virtualenv.
RUN adduser --disabled-password --gecos "" notebooks
# Create the virtualenv in /env_3-5 and add a "venv" alias to activate it.
RUN pip3 install virtualenv
RUN virtualenv /env_3-5
RUN echo 'alias venv="source /env_3-5/bin/activate"' >> ~/.bashrc
# CONFIG FILES
#ADD psql_configure.sh /
COPY requirements.txt django_configure.sh /
# Inside the virtualenv: install the pinned requirements, then SQLAlchemy
# from the rel_1_1 git branch, then download the NLTK perceptron tagger data.
RUN . /env_3-5/bin/activate && \
    pip3 install -r requirements.txt && \
    pip3 install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1 && \
    python3 -m nltk.downloader averaged_perceptron_tagger -d /usr/local/share/nltk_data
#RUN ./psql_configure.sh
#RUN ./django_configure.sh
# Hand the virtualenv over to the notebooks user.
RUN chown notebooks:notebooks -R /env_3-5
########################################################################
### POSTGRESQL DATA (as ROOT)
########################################################################
#RUN sed -iP "s%^data_directory.*%data_directory = \'\/srv\/gargandata\'%" /etc/postgresql/9.5/main/postgresql.conf
#RUN echo "host all all 0.0.0.0/0 md5" >> /etc/postgresql/9.5/main/pg_hba.conf
#RUN echo "listen_addresses='*'" >> /etc/postgresql/9.5/main/postgresql.conf
# 8899 is the port the notebook server is started on by the run script.
# NOTE(review): 5432 is presumably PostgreSQL, which this image does not
# install -- confirm the port still needs to be exposed.
EXPOSE 5432
EXPOSE 8899
VOLUME ["/srv/", "/home/notebooks/"]
########################################################################
### Notebook IHaskell and IPYTHON ENVIRONMENT
########################################################################
# Development libraries, listed alphabetically.
RUN apt-get update && \
    apt-get install -y \
        libblas-dev \
        libcairo2-dev \
        liblapack-dev \
        libmagic-dev \
        libpango1.0-dev \
        libtinfo-dev \
        libzmq3-dev
# The IHaskell installation below is disabled.
#USER notebooks
#
#RUN cd /home/notebooks \
# && curl -sSL https://get.haskellstack.org/ | sh \
# && stack setup \
# && git clone https://github.com/gibiansky/IHaskell \
# && . /env_3-5/bin/activate \
# && cd IHaskell \
# && stack install gtk2hs-buildtools \
# && stack install --fast \
# && /root/.local/bin/ihaskell install --stack
#
#!/bin/bash
##################################################
# __| |(_) __ _ _ __ __ _ ___
# / _` || |/ _` | '_ \ / _` |/ _ \
# | (_| || | (_| | | | | (_| | (_) |
# \__,_|/ |\__,_|_| |_|\__, |\___/
# |__/ |___/
##################################################
# Configure Django migrations.
#
# Runs the Django and database migrations as user "gargantua", creates a
# superuser, then stops the postgresql service.
##################################################
echo "::::: DJANGO :::::"
#echo "Starting Postgres"
#/usr/sbin/service postgresql start

# Commands run in one shell as gargantua. They are chained with "&&", so the
# first failing step stops the sequence. dbmigrate.py is run three times,
# as in the original script.
django_steps=(
  'source /srv/env_3-5/bin/activate'
  'echo "Activated env"'
  '/srv/gargantext/manage.py makemigrations'
  '/srv/gargantext/manage.py migrate'
  'echo "migrations ok"'
  '/srv/gargantext/dbmigrate.py'
  '/srv/gargantext/dbmigrate.py'
  '/srv/gargantext/dbmigrate.py'
  '/srv/gargantext/manage.py createsuperuser'
)

django_command=${django_steps[0]}
for step in "${django_steps[@]:1}"; do
  django_command+=" && ${step}"
done

su gargantua -c "${django_command}"
service postgresql stop
#!/usr/bin/env python
"""
Gargantext Software Copyright (c) 2016-2017 CNRS ISC-PIF -
http://iscpif.fr
Licence (see :
http://gitlab.iscpif.fr/humanities/gargantext/blob/stable/LICENSE )
- In France : a CECILL variant affero compliant
- GNU aGPLV3 for all other countries
"""
import os
import django
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django.setup()
from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from gargantext.models import (Node, ProjectNode, DocumentNode,
Ngram, NodeNgram, NodeNgramNgram, NodeNodeNgram)
from gargantext.util.db import session, get_engine, func, aliased, case
from collections import Counter
import importlib
from django.http import Http404
# Import those to be available by notebook user
from langdetect import detect as detect_lang
from gargantext.models import UserNode, User
import functools
class NotebookError(Exception):
    """Error raised by the notebook helpers of this module on invalid input."""
def documents(corpus_id):
    """Return the list of DocumentNode rows whose parent is `corpus_id`."""
    corpus_documents = session.query(DocumentNode).filter_by(parent_id=corpus_id)
    #.order_by(Node.hyperdata['publication_date'])
    return corpus_documents.all()
#import seaborn as sns
import pandas as pd
def countByField(docs, field):
    """Count how many documents carry each value of a hyperdata field.

    :param docs: iterable of objects exposing a `hyperdata` mapping
    :param field: hyperdata key to count by
    :return: list of `(value, count)` tuples

    Documents whose hyperdata is empty or does not contain `field` are
    skipped, so one incomplete document no longer aborts the whole count
    with a KeyError.
    """
    values = []
    for doc in docs:
        hyperdata = doc.hyperdata or {}
        if field in hyperdata:
            values.append(hyperdata[field])
    return list(Counter(values).items())
def chart(docs, field):
    """Return a DataFrame of the per-value document counts for `field`.

    The frame has the columns 'Date' (the field value) and 'DateValue' (the
    count), and is indexed by the 'Date' column.
    """
    counts = countByField(docs, field)
    columns = ['Date', 'DateValue']
    # This first frame is only built to obtain the 'Date' column used as index.
    positional = pd.DataFrame(counts, columns=columns)
    return pd.DataFrame(counts, columns=columns, index=positional.Date)
from gargantext.util.crawlers.HAL import HalCrawler
def scan_hal(request):
    """Return what a fresh HalCrawler's `scan_results` gives for `request`."""
    return HalCrawler().scan_results(request)
def _search_docs(corpus_id, request, fast=False):
    """Build the query selecting the documents of a corpus matching `request`.

    :param corpus_id: id of the parent corpus
    :param request: text to search for
    :param fast: when true, match `request` against `Node.title_abstract`;
        otherwise run a `~*` regular-expression match on the 'title' and
        'abstract' hyperdata fields
    :return: a query on DocumentNode (not yet executed)
    """
    q = session.query(DocumentNode).filter_by(parent_id=corpus_id)

    if fast:
        return q.filter(Node.title_abstract.match(request))

    # Only match <request> starting and ending with word boundary
    # Sequence of spaces will match any sequence of spaces
    # (raw strings: '\s' is not a valid escape in a normal string literal)
    words = filter(None, r'\m{}\M'.format(request).split(' '))
    pattern = r'\s+'.join(words)

    # Search ngram <pattern> in hyperdata <field>
    def in_field(field):
        return Node.hyperdata[field].astext.op('~*')(pattern)

    return q.filter(in_field('title') | in_field('abstract'))
def scan_gargantext(corpus_id, request, fast=False, documents=False):
    """Search `request` in the documents of corpus `corpus_id`.

    :param fast: passed through to `_search_docs`
    :param documents: when true, return the matching documents themselves;
        otherwise return the number of distinct matching document ids
    """
    matches = _search_docs(corpus_id, request, fast)
    if not documents:
        distinct_ids = func.count(DocumentNode.id.distinct())
        return matches.with_entities(distinct_ids).one()[0]
    return matches.all()
def scan_gargantext_and_delete(corpus_id, request, fast=False):
    """Delete the documents of corpus `corpus_id` matching `request`.

    The deletion is committed; the value returned by the query's `delete()`
    call is passed back to the caller.
    """
    matches = _search_docs(corpus_id, request, fast)
    deleted = matches.delete(synchronize_session='fetch')
    session.commit()
    return deleted
def myProject_fromUrl(url):
    """
    myProject :: String -> Project

    The project id is the fifth "/"-separated segment of `url` (index 4).
    """
    segments = url.split("/")
    return session.query(ProjectNode).get(segments[4])
def newCorpus(project, source, name=None, query=None):
    """Create a corpus in `project` by running the moissonneur for `source`.

    :param project: a ProjectNode
    :param source: source identifier, either a resource id (int) or a
        resource / moissonneur name (str)
    :param name: name of the new corpus; defaults to `query`
    :param query: query string sent to the source
    :return: whatever `run_moissonneur` returns
    :raises NotebookError: on invalid arguments or unknown source
    """
    error = False

    if name is None:
        name = query

    # As in the original, a problem found by the second chain takes
    # precedence over an invalid project in the reported message.
    if not isinstance(project, ProjectNode):
        error = "a valid project"
    if not isinstance(source, int) and not isinstance(source, str):
        error = "a valid source identifier: id or name"
    elif not isinstance(query, str):
        error = "a valid query"
    elif not isinstance(name, str):
        error = "a valid name"

    if error:
        raise NotebookError("Please provide %s." % error)

    if isinstance(source, int):
        resource = get_resource(source)
    else:
        resource = get_resource_by_name(source)

    if resource:
        moissonneur_name = get_moissonneur_name(resource)
    elif isinstance(source, str):
        # No known resource: assume `source` is itself a moissonneur name.
        moissonneur_name = source.lower()
    else:
        # Integer id without a matching resource: the original code called
        # `source.lower()` here and crashed with AttributeError.
        raise NotebookError("Invalid source identifier: %r" % source)

    try:
        moissonneur = get_moissonneur(moissonneur_name)
    except ImportError:
        raise NotebookError("Invalid source identifier: %r" % source)

    return run_moissonneur(moissonneur, project, name, query)
def get_moissonneur_name(ident):
    """ Return moissonneur module name from RESOURCETYPE or crawler name

    :param ident: an object with a `get` method (its 'crawler' entry is
        used), a crawler class name such as 'HalCrawler', or a moissonneur
        name
    :return: the lower-cased name without its 'Crawler' suffix, `ident`
        unchanged when it is a string without that suffix, or None when no
        string could be obtained
    """
    # Does it quack like a RESOURCETYPE?
    if hasattr(ident, 'get'):
        ident = ident.get('crawler')

    if not isinstance(ident, str):
        return None

    # Extract name from crawler class name, otherwise assume ident is already
    # a moissonneur name.
    if ident.endswith('Crawler'):
        return ident[:-len('Crawler')].lower()
    return ident
def get_moissonneur(name):
    """ Return moissonneur module from its name

    The module `gargantext.moissonneurs.<name>` is imported and its `name`
    attribute is set to `name` before it is returned.

    :raises NotebookError: when `name` is not a lower-case string
    """
    is_valid = isinstance(name, str) and name.islower()
    if not is_valid:
        raise NotebookError("Invalid moissonneur name: %r" % name)

    moissonneur = importlib.import_module('gargantext.moissonneurs.%s' % name)
    moissonneur.name = name
    return moissonneur
def run_moissonneur(moissonneur, project, name, query):
    """ Run moissonneur and return resulting corpus

    A stand-in for an HTTP POST request is built by hand and passed to the
    moissonneur's `query` and `save` functions.

    :return: the saved corpus, or None when the source reports no result
    """
    # XXX Uber-kludge with gory details. Spaghetti rulezzzzz!
    class Dummy(object):
        pass

    fake_user = Dummy()
    fake_user.id = project.user_id
    fake_user.is_authenticated = lambda: True

    request = Dummy()
    request.method = 'POST'
    request.path = 'nowhere'
    request.META = {}
    # XXX 'string' only have effect on moissonneurs.pubmed; its value is added
    # when processing request client-side, take a deep breath and see
    # templates/projects/project.html for more details.
    request.POST = {
        'string': name,
        'query': query,
        'N': QUERY_SIZE_N_MAX,
    }
    request.user = fake_user

    source_name = moissonneur.name

    if source_name == 'istex':
        # Replace ALL spaces by plus signs
        request.POST['query'] = '+'.join(filter(None, query.split(' ')))

    try:
        import json

        response = moissonneur.query(request)
        raw_json = response.content.decode('utf-8')
        data = json.loads(raw_json)

        # The result count is read differently for each source.
        if source_name == 'pubmed':
            count = sum(x['count'] for x in data)
            request.POST['query'] = raw_json
        elif source_name == 'istex':
            count = data.get('total', 0)
        else:
            count = data.get('results_nb', 0)

        if not count > 0:
            return None
        corpus = moissonneur.save(request, project.id, return_corpus=True)
    except (ValueError, Http404):
        # Propagated unchanged to the caller.
        raise

    # Sometimes strange things happens...
    if corpus.name != name:
        corpus.name = name
        session.commit()

    return corpus
ALL_LIST_TYPES = ['main', 'map', 'stop']


def _ngrams(corpus_id, list_types, entities):
    """Build the query selecting `entities` for the ngrams of a corpus' lists.

    :param corpus_id: id of the corpus owning the lists
    :param list_types: one list type (str) or an iterable of list types;
        values not in ALL_LIST_TYPES are ignored
    :param entities: columns / entities to select
    """
    if isinstance(list_types, str):
        list_types = (list_types,)

    wanted_typenames = []
    for list_type in list_types:
        if list_type in ALL_LIST_TYPES:
            wanted_typenames.append('{}LIST'.format(list_type.upper()))

    # `Node` is our list, ie. MAINLIST and/or MAPLIST and/or STOPLIST
    selection = session.query(*entities).select_from(Ngram)
    return selection.filter(NodeNgram.ngram_id==Ngram.id,
                            NodeNgram.node_id==Node.id,
                            Node.parent_id==corpus_id,
                            Node.typename.in_(wanted_typenames))
def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=False,
                with_count=False):
    """Build the query listing the ngrams of the lists of a corpus.

    :param corpus_id: id of the corpus
    :param list_types: list type(s) to include, among ALL_LIST_TYPES
    :param with_synonyms: also include the synonyms found in the GROUPLIST
    :param with_count: return `(type, ng, score)` rows, the score coming from
        the OCCURRENCES node of the corpus
    :return: a query yielding `(type, ng)` rows sorted by type then ngram,
        or an unsorted query of `(type, ng, score)` rows when `with_count`
        is true
    """
    # Link between a GROUPLIST, a normal form (ngram1), and a synonym (ngram2)
    NNN = NodeNgramNgram

    # Get the list type from the Node type -- as in CSV export
    list_type = (case([(Node.typename=='MAINLIST', 'main'),
                       (Node.typename=='MAPLIST', 'map'),
                       (Node.typename=='STOPLIST', 'stop')])
                 .label('type'))

    # We will retrieve each ngram as the following tuple:
    entities = (list_type, Ngram.terms.label('ng'))

    if with_count:
        entities += (Ngram.id.label('id'),)

    # First, get ngrams from wanted lists
    ngrams = _ngrams(corpus_id, list_types, entities)

    # Secondly, exclude "synonyms" (grouped ngrams that are not normal forms).
    # We have to exclude synonyms first because data is inconsistent and some
    # of them can be both in GROUPLIST and in MAIN/MAP/STOP lists. We want to
    # take synonyms from GROUPLIST only -- see below.
    Groups = aliased(Node, name='groups')
    query = (ngrams.outerjoin(Groups, (Groups.parent_id==corpus_id) & (Groups.typename=='GROUPLIST'))
                   .outerjoin(NNN, (NNN.node_id==Groups.id) & (NNN.ngram2_id==Ngram.id))
                   .filter(NNN.ngram1_id==None))

    # If `with_synonyms` is True, add them from GROUPLIST: this is the reliable
    # source for them
    if with_synonyms:
        Synonym = aliased(Ngram)
        # Both sides of the UNION must select the same number of columns, so
        # the synonym id is only selected when the main query selects the
        # ngram id too (ie. `with_count`). The original always selected it,
        # giving a 2-column / 3-column UNION when `with_count` was false.
        ent = (list_type, Synonym.terms.label('ng'))
        if with_count:
            ent += (Synonym.id.label('id'),)
        synonyms = (ngrams.with_entities(*ent)
                          .filter(NNN.ngram1_id==Ngram.id,
                                  NNN.ngram2_id==Synonym.id,
                                  NNN.node_id==Groups.id,
                                  Groups.parent_id==corpus_id,
                                  Groups.typename=='GROUPLIST'))
        query = query.union(synonyms)

    # Again, data is inconsistent: MAINLIST may intersect with MAPLIST and
    # we don't want that
    if 'main' in list_types and 'map' not in list_types:
        # Exclude MAPLIST ngrams from MAINLIST
        # NOTE(review): the excluded rows carry the type 'map' while MAINLIST
        # rows carry 'main', so this EXCEPT may never remove anything --
        # verify against real data.
        query = query.except_(_ngrams(corpus_id, 'map', entities))

    if with_count:
        N = query.subquery()
        return (session.query(N.c.type, N.c.ng, NodeNodeNgram.score)
                       .join(Node, (Node.parent_id==corpus_id) & (Node.typename=='OCCURRENCES'))
                       .outerjoin(NodeNodeNgram, (NodeNodeNgram.ngram_id==N.c.id) &
                                                 (NodeNodeNgram.node1_id==Node.id) &
                                                 (NodeNodeNgram.node2_id==corpus_id)))

    # Return found ngrams sorted by list type, and then alphabetically
    return query.order_by('type', 'ng')
#!/bin/bash
#######################################################################
## ____ _
## | _ \ ___ ___| |_ __ _ _ __ ___ ___
## | |_) / _ \/ __| __/ _` | '__/ _ \/ __|
## | __/ (_) \__ \ || (_| | | | __/\__ \
## |_| \___/|___/\__\__, |_| \___||___/
## |___/
#######################################################################
# Re-create the PostgreSQL data directory in /srv/gargandata, start the
# server, then create the "gargantua" role and the "gargandb" database.
#
# Environment:
#   GARGANTUA_DB_PASSWORD  password given to the gargantua role
#                          (optional; defaults to the historical value)
#
# WARNING: /srv/gargandata is deleted and re-created by this script.
#######################################################################
echo "::::: POSTGRESQL :::::"
su postgres -c 'pg_dropcluster 9.4 main --stop'
#done in docker but redoing it
rm -rf /srv/gargandata && mkdir /srv/gargandata && chown postgres:postgres /srv/gargandata
su postgres -c '/usr/lib/postgresql/9.6/bin/initdb -D /srv/gargandata/'
su postgres -c '/usr/lib/postgresql/9.6/bin/pg_ctl -D /srv/gargandata/ -l /srv/gargandata/journal_applicatif start'
su postgres -c 'pg_createcluster -D /srv/gargandata 9.6 main '
su postgres -c 'pg_ctlcluster -D /srv/gargandata 9.6 main start '
su postgres -c 'pg_ctlcluster 9.6 main start'
service postgresql start

# SECURITY: the default below is the password that used to be hard-coded in
# this script; set GARGANTUA_DB_PASSWORD to use another one.
db_password=${GARGANTUA_DB_PASSWORD:-C8kdcUrAQy66U}
# Double every single quote so the value is a valid SQL string literal.
single_quote="'"
sql_password=${db_password//${single_quote}/${single_quote}${single_quote}}
# The statement is sent on psql's standard input so the password does not go
# through another layer of shell quoting or the process argument list.
printf "CREATE user gargantua WITH PASSWORD '%s';\n" "${sql_password}" \
  | su postgres -c 'psql'

su postgres -c "createdb -O gargantua gargandb"
echo "Postgres configured"
#service postgresql stop
# try bottleneck
eventlet==0.20.1
amqp==1.4.9
anyjson==0.3.3
billiard==3.3.0.23
celery==3.1.25
chardet==2.3.0
dateparser==0.3.5
Django==1.10.5
django-celery==3.2.1
django-pgfields==1.4.4
django-pgjsonb==0.0.23
djangorestframework==3.5.3
html5lib==0.9999999
#python-igraph>=0.7.1
jdatetime==1.7.2
kombu==3.0.37 # messaging
langdetect==1.0.6 # language detection
nltk==3.1
numpy==1.13.1
psycopg2==2.6.2
pycountry==1.20
python-dateutil==2.4.2
pytz==2016.10 # timezones
PyYAML==3.11
RandomWords==0.1.12
ujson==1.35
umalqurra==0.2 # arabic calendars (?? why use ??)
networkx==1.11
pandas==0.18.0
six==1.10.0
lxml==3.5.0
requests-futures==0.9.7
bs4==0.0.1
requests==2.10.0
djangorestframework-jwt==1.9.0
jupyter==1.0.0
jupyter-client==5.0.0
jupyter-console==5.1.0
jupyter-core==4.3.0
ipython==5.2.0
ipython-genutils==0.1.0
ipywidgets
matplotlib==2.0.2
alembic>=0.9.2
SQLAlchemy==1.1.14
SQLAlchemy-Searchable==0.10.4
SQLAlchemy-Utils==0.32.16
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment