Commit 591915ef authored by sim's avatar sim

Remove notebook

parent 50872e57
install/notebook/gargantext_notebook.py
\ No newline at end of file
#!/bin/bash
# Build the garg-notebook Docker image from ./notebook.

# Create the host-side "notebooks" account only when it is missing, so the
# script can be re-run without adduser failing on an existing user.
if ! id -u notebooks >/dev/null 2>&1; then
    sudo adduser --disabled-password --gecos "" notebooks
fi

# Remove leftover notebook containers before rebuilding.
# The filter matches the one used by the run script (notebook + sh) instead of
# a bare "grep sh", which would also match unrelated containers.
# "xargs -r" skips "docker rm" entirely when nothing matched.
sudo docker ps -a | grep notebook | grep sh | awk '{print $1}' | xargs -r sudo docker rm

sudo docker build -t garg-notebook:latest ./notebook
#!/bin/bash
# Start the garg-notebook container and launch Jupyter on port 8899.

# Disabled volume mounts, kept for reference:
#-v /srv/gargandata:/srv/gargandata \
#-v /srv/gargantext_lib:/srv/gargantext_lib \

# Remove previous containers whose "docker ps -a" line mentions both
# "notebook" and "sh".
sudo docker rm $(sudo docker ps -a | awk '/notebook/ && /sh/ {print $1}')

# Disabled alternative to host networking, kept for reference:
#HOSTIP=$(ip route show 0.0.0.0/0 | awk '{print $3}')
#--add-host=localhost:${HOSTIP} \

sudo docker run \
    --name=garg-notebook \
    --net=host \
    --publish 8899:8899 \
    --env POSTGRES_HOST=localhost \
    --volume /srv/gargantext:/srv/gargantext \
    --interactive --tty \
    garg-notebook:latest \
    /bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /home/notebooks && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser'"

# Disabled variants of the launch command, kept for reference:
# #&& jupyter nbextension enable --py widgetsnbextension --sys-prefix
#/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /srv/gargantext/ && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser --notebook-dir=/home/notebooks/'"
###########################################################
# Gargamelle WEB
###########################################################
# Build an image starting from the debian:stretch image,
# which contains all the source code of the app.
FROM debian:stretch
# MAINTAINER is deprecated; the same contact is recorded as a label instead.
LABEL maintainer="ISCPIF <gargantext@iscpif.fr>"
USER root
### Refresh the package index and install the base toolchain
RUN echo "############ DEBIAN LIBS ###############"
RUN apt-get update \
 && apt-get install -y \
        apt-utils \
        aptitude \
        build-essential \
        ca-certificates \
        curl \
        g++ \
        gcc \
        git \
        locales \
        make \
        sudo \
        vim \
        wget
# Disabled PostgreSQL packages, kept for reference:
# postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6 \
# postgresql-server-dev-9.6 libpq-dev libxml2 \
# postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6
# Install Stack
### Configure timezone and locale
RUN echo "########### LOCALES & TZ #################"
RUN echo "Europe/Paris" > /etc/timezone
# key=value form: the space-separated "ENV key value" syntax is legacy.
ENV TZ="Europe/Paris"
# Uncomment the en_GB and fr_FR UTF-8 entries in /etc/locale.gen, regenerate
# the locales, then record fr_FR.UTF-8 as the system default.
RUN sed -i -e 's/# en_GB.UTF-8 UTF-8/en_GB.UTF-8 UTF-8/' /etc/locale.gen && \
    sed -i -e 's/# fr_FR.UTF-8 UTF-8/fr_FR.UTF-8 UTF-8/' /etc/locale.gen && \
    dpkg-reconfigure --frontend=noninteractive locales && \
    echo 'LANG="fr_FR.UTF-8"' > /etc/default/locale
ENV LANG=fr_FR.UTF-8 \
    LANGUAGE=fr_FR.UTF-8 \
    LC_ALL=fr_FR.UTF-8
### Install main dependencies and python packages based on Debian distrib
RUN echo "############# PYTHON DEPENDENCIES ###############"
# Package groups installed below:
#   - libxml2-dev, xml-core, libgfortran-6-dev, libpq-dev: native libraries
#   - python3-six, python3-numpy, python3-setuptools, python3-numexpr:
#     for numpy, pandas and numpyperf
#   - python3-pip: python dependencies
#   - libxslt-dev, libxslt1-dev, zlib1g-dev (plus libxml2-dev): for lxml
#
# UPDATE AND CLEAN
# The apt cache and package lists are removed in the same RUN as the install:
# deleting them in a later layer would not make this layer any smaller.
RUN apt-get update && apt-get install -y \
        libgfortran-6-dev \
        libpq-dev \
        libxml2-dev \
        libxslt-dev \
        libxslt1-dev \
        python3-dev \
        python3-numexpr \
        python3-numpy \
        python3-pip \
        python3-setuptools \
        python3-six \
        python3.5 \
        xml-core \
        zlib1g-dev \
    && apt-get autoclean \
    && rm -rf /var/lib/apt/lists/*
########################################################################
### PYTHON ENVIRONMENT (as ROOT)
########################################################################
# Unprivileged account that will own the virtualenv (see chown below).
RUN adduser --disabled-password --gecos "" notebooks
RUN pip3 install virtualenv
RUN virtualenv /env_3-5
RUN echo 'alias venv="source /env_3-5/bin/activate"' >> ~/.bashrc
# CONFIG FILES
# COPY instead of ADD: these are plain local files, so ADD's archive
# extraction / URL handling is not needed.
COPY requirements.txt /
#COPY psql_configure.sh /
COPY django_configure.sh /
# Install the pinned requirements inside the virtualenv, then SQLAlchemy from
# its rel_1_1 branch, then the NLTK tagger data. The requirements file is
# referenced by absolute path so the step does not depend on the working dir.
RUN . /env_3-5/bin/activate && pip3 install -r /requirements.txt && \
    pip3 install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1 && \
    python3 -m nltk.downloader averaged_perceptron_tagger -d /usr/local/share/nltk_data
#RUN ./psql_configure.sh
#RUN ./django_configure.sh
RUN chown notebooks:notebooks -R /env_3-5
########################################################################
### POSTGRESQL DATA (as ROOT)
########################################################################
# Disabled PostgreSQL configuration, kept for reference:
#RUN sed -iP "s%^data_directory.*%data_directory = \'\/srv\/gargandata\'%" /etc/postgresql/9.5/main/postgresql.conf
#RUN echo "host all all 0.0.0.0/0 md5" >> /etc/postgresql/9.5/main/pg_hba.conf
#RUN echo "listen_addresses='*'" >> /etc/postgresql/9.5/main/postgresql.conf

# Documented ports: 5432 and 8899 (8899 is the port Jupyter is started on).
EXPOSE 5432
EXPOSE 8899
# Mount points for data that lives outside the image.
VOLUME /srv/ /home/notebooks/
########################################################################
### Notebook IHaskell and IPYTHON ENVIRONMENT
########################################################################
# Development libraries for the notebook environment (presumably needed by the
# disabled IHaskell build below -- confirm before removing any of them).
# The package lists are removed in the same layer so they are not kept in it.
RUN apt-get update && apt-get install -y \
        libblas-dev \
        libcairo2-dev \
        liblapack-dev \
        libmagic-dev \
        libpango1.0-dev \
        libtinfo-dev \
        libzmq3-dev \
    && rm -rf /var/lib/apt/lists/*
# Disabled IHaskell installation, kept for reference:
#USER notebooks
#
#RUN cd /home/notebooks \
# && curl -sSL https://get.haskellstack.org/ | sh \
# && stack setup \
# && git clone https://github.com/gibiansky/IHaskell \
# && . /env_3-5/bin/activate \
# && cd IHaskell \
# && stack install gtk2hs-buildtools \
# && stack install --fast \
# && /root/.local/bin/ihaskell install --stack
#
#!/bin/bash
##################################################
# __| |(_) __ _ _ __ __ _ ___
# / _` || |/ _` | '_ \ / _` |/ _ \
# | (_| || | (_| | | | | (_| | (_) |
# \__,_|/ |\__,_|_| |_|\__, |\___/
# |__/ |___/
##################################################
# Configure Django: generate and apply migrations, run the project's
# dbmigrate.py script, then create a superuser interactively.
##################################################
echo "::::: DJANGO :::::"
# Starting PostgreSQL from this script is currently disabled:
#echo "Starting Postgres"
#/usr/sbin/service postgresql start
# Everything below runs as user "gargantua" in a single shell so that the
# activated virtualenv applies to every command; the "&&" chain stops at the
# first failing step.
# NOTE(review): dbmigrate.py is invoked three times in a row; the reason is
# not visible here -- confirm it is intentional.
# NOTE(review): "source" is a bash builtin; this assumes gargantua's shell
# accepts it -- confirm.
su gargantua -c 'source /srv/env_3-5/bin/activate &&\
echo "Activated env" &&\
/srv/gargantext/manage.py makemigrations &&\
/srv/gargantext/manage.py migrate && \
echo "migrations ok" &&\
/srv/gargantext/dbmigrate.py && \
/srv/gargantext/dbmigrate.py && \
/srv/gargantext/dbmigrate.py && \
/srv/gargantext/manage.py createsuperuser'
# PostgreSQL is stopped at the end even though the start above is disabled;
# presumably it is started by another script -- TODO confirm.
service postgresql stop
#!/usr/bin/env python
"""
Gargantext Software Copyright (c) 2016-2017 CNRS ISC-PIF -
http://iscpif.fr
Licence (see :
http://gitlab.iscpif.fr/humanities/gargantext/blob/stable/LICENSE )
- In France : a CECILL variant affero compliant
- GNU aGPLV3 for all other countries
"""
import os
import django
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django.setup()
from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from gargantext.models import (Node, ProjectNode, DocumentNode,
Ngram, NodeNgram, NodeNgramNgram, NodeNodeNgram)
from gargantext.util.db import session, get_engine, func, aliased, case
from collections import Counter
import importlib
from django.http import Http404
# Import those to be available by notebook user
from langdetect import detect as detect_lang
from gargantext.models import UserNode, User
import functools
class NotebookError(Exception):
    """Error raised by the notebook helpers when given invalid input."""
def documents(corpus_id):
    """Return all DocumentNode rows whose parent is `corpus_id`."""
    children = session.query(DocumentNode).filter_by(parent_id=corpus_id)
    # Ordering by publication date is currently disabled:
    # children = children.order_by(Node.hyperdata['publication_date'])
    return children.all()
#import seaborn as sns
import pandas as pd
def countByField(docs, field):
    """Count documents per distinct value of `hyperdata[field]`.

    Returns a list of `(value, count)` pairs. A document whose hyperdata has
    no `field` key raises KeyError.
    """
    tally = Counter(doc.hyperdata[field] for doc in docs)
    return list(tally.items())
def chart(docs, field):
    """Build a DataFrame of document counts per value of `field`.

    The frame has two columns, 'Date' (the field value) and 'DateValue'
    (the number of documents with that value), and is indexed by 'Date'.
    """
    counts = countByField(docs, field)
    labels = ['Date', 'DateValue']
    # A first frame is built only to obtain the 'Date' column used as index.
    positional = pd.DataFrame(counts, columns=labels)
    return pd.DataFrame(counts, columns=labels, index=positional['Date'])
from gargantext.util.crawlers.HAL import HalCrawler
def scan_hal(request):
    """Return `HalCrawler().scan_results(request)` for the given request."""
    return HalCrawler().scan_results(request)
def _search_docs(corpus_id, request, fast=False):
    """Build a query for the documents of a corpus that match `request`.

    corpus_id -- id of the parent node of the documents to search
    request   -- search string; in the default mode it is used as part of a
                 regular expression
    fast      -- when True, use `Node.title_abstract.match(request)` instead
                 of the regular-expression search

    Returns the (unexecuted) query.

    NOTE(review): `request` is inserted into the pattern without escaping, so
    regex metacharacters in it are interpreted -- confirm this is intended.
    """
    q = session.query(DocumentNode).filter_by(parent_id=corpus_id)

    if fast:
        return q.filter(Node.title_abstract.match(request))

    # Only match <request> starting and ending with a word boundary; any run
    # of spaces in <request> matches any run of whitespace. Raw strings are
    # used so that "\s", "\m" and "\M" are not treated as (invalid) Python
    # escape sequences.
    words = filter(None, r'\m{}\M'.format(request).split(' '))
    pattern = r'\s+'.join(words)

    def in_field(field):
        # Apply the '~*' operator to hyperdata <field>, read as text
        return Node.hyperdata[field].astext.op('~*')(pattern)

    return q.filter(in_field('title') | in_field('abstract'))
def scan_gargantext(corpus_id, request, fast=False, documents=False):
    """Search `request` in a corpus.

    Returns the matching documents when `documents` is true, otherwise the
    number of distinct matching document ids.
    """
    matches = _search_docs(corpus_id, request, fast)
    if not documents:
        distinct_ids = func.count(DocumentNode.id.distinct())
        return matches.with_entities(distinct_ids).one()[0]
    return matches.all()
def scan_gargantext_and_delete(corpus_id, request, fast=False):
    """Delete the documents of a corpus that match `request`, then commit.

    Returns the value returned by the query's `delete()` call.
    """
    matches = _search_docs(corpus_id, request, fast)
    deleted = matches.delete(synchronize_session='fetch')
    session.commit()
    return deleted
def myProject_fromUrl(url):
    """
    myProject :: String -> Project

    The project id is the fifth "/"-separated segment of `url` (index 4).
    """
    segments = url.split("/")
    return session.query(ProjectNode).get(segments[4])
def newCorpus(project, source, name=None, query=None):
    """Create a corpus in `project` by running a moissonneur on `query`.

    project -- a ProjectNode
    source  -- source identifier: a resource id (int) or a name (str)
    name    -- corpus name; defaults to `query`
    query   -- query string passed to the moissonneur

    Returns whatever `run_moissonneur` returns.
    Raises NotebookError when an argument is invalid or when the source
    cannot be resolved to a moissonneur module.
    """
    error = False

    if name is None:
        name = query

    if not isinstance(project, ProjectNode):
        error = "a valid project"

    # This chain is independent from the project check above: a failure here
    # replaces the project message.
    if not isinstance(source, (int, str)):
        error = "a valid source identifier: id or name"
    elif not isinstance(query, str):
        error = "a valid query"
    elif not isinstance(name, str):
        error = "a valid name"

    if error:
        raise NotebookError("Please provide %s." % error)

    if isinstance(source, int):
        resource = get_resource(source)
    else:
        resource = get_resource_by_name(source)

    if resource:
        moissonneur_name = get_moissonneur_name(resource)
    elif isinstance(source, str):
        # No known resource: treat the name itself as a moissonneur name
        moissonneur_name = source.lower()
    else:
        # An integer id without a resource has no name to fall back on;
        # report it instead of failing on `int.lower()`.
        raise NotebookError("Invalid source identifier: %r" % source)

    try:
        moissonneur = get_moissonneur(moissonneur_name)
    except ImportError:
        raise NotebookError("Invalid source identifier: %r" % source)

    return run_moissonneur(moissonneur, project, name, query)
def get_moissonneur_name(ident):
    """ Return moissonneur module name from RESOURCETYPE or crawler name

    ident -- an object with a `get` method (its 'crawler' entry is used), or
             a crawler class name such as 'HalCrawler', or a moissonneur name

    A string ending with 'Crawler' is stripped of that suffix and lowercased.
    Anything else is returned unchanged (None when there is no 'crawler'
    entry), so that a moissonneur name passes through as-is.
    """
    # Does it quack like a RESOURCETYPE?
    if hasattr(ident, 'get'):
        ident = ident.get('crawler')

    # Extract name from crawler class name
    if isinstance(ident, str) and ident.endswith('Crawler'):
        return ident[:-len('Crawler')].lower()

    # Otherwise assume ident is already a moissonneur name
    return ident
def get_moissonneur(name):
    """ Return moissonneur module from its name

    `name` must be a lowercase string, otherwise NotebookError is raised.
    The module `gargantext.moissonneurs.<name>` is imported (ImportError
    propagates) and its `name` attribute is set to `name`.
    """
    is_valid = isinstance(name, str) and name.islower()
    if not is_valid:
        raise NotebookError("Invalid moissonneur name: %r" % name)

    moissonneur = importlib.import_module('gargantext.moissonneurs.%s' % name)
    moissonneur.name = name
    return moissonneur
def run_moissonneur(moissonneur, project, name, query):
    """ Run moissonneur and return resulting corpus

    moissonneur -- module returned by `get_moissonneur` (has `name`, `query`
                   and `save`)
    project     -- project the corpus is saved into (`id` and `user_id` are
                   read)
    name        -- name given to the resulting corpus
    query       -- query string

    Returns the saved corpus, or None when the moissonneur reports no result.
    Exceptions raised by the moissonneur or by JSON decoding propagate
    unchanged.
    """
    import json

    # XXX Uber-kludge with gory details. Spaghetti rulezzzzz!
    # The moissonneur entry points expect a request object; a bare object
    # carrying only the attributes set below stands in for it.
    class Dummy(object):
        pass

    request = Dummy()
    request.method = 'POST'
    request.path = 'nowhere'
    request.META = {}
    # XXX 'string' only have effect on moissonneurs.pubmed; its value is added
    #     when processing request client-side, take a deep breath and see
    #     templates/projects/project.html for more details.
    request.POST = {'string': name,
                    'query': query,
                    'N': QUERY_SIZE_N_MAX}
    request.user = Dummy()
    request.user.id = project.user_id
    request.user.is_authenticated = lambda: True

    if moissonneur.name == 'istex':
        # Replace ALL spaces by plus signs
        request.POST['query'] = '+'.join(filter(None, query.split(' ')))

    # First ask the moissonneur how many results the query yields
    response = moissonneur.query(request)
    raw_json = response.content.decode('utf-8')
    data = json.loads(raw_json)

    if moissonneur.name == 'pubmed':
        count = sum(x['count'] for x in data)
        # For pubmed, the raw response replaces the query before saving
        request.POST['query'] = raw_json
    elif moissonneur.name == 'istex':
        count = data.get('total', 0)
    else:
        count = data.get('results_nb', 0)

    if not count > 0:
        return None

    corpus = moissonneur.save(request, project.id, return_corpus=True)

    # Sometimes strange things happens...
    if corpus.name != name:
        corpus.name = name
        session.commit()

    return corpus
ALL_LIST_TYPES = ['main', 'map', 'stop']


def _ngrams(corpus_id, list_types, entities):
    """Build a query selecting `entities` for ngrams in the lists of a corpus.

    corpus_id  -- id of the corpus whose list nodes are searched
    list_types -- one list type (str) or an iterable of them; values not in
                  ALL_LIST_TYPES are ignored
    entities   -- columns/entities to select
    """
    if isinstance(list_types, str):
        list_types = (list_types,)

    wanted_typenames = []
    for list_type in list_types:
        if list_type in ALL_LIST_TYPES:
            wanted_typenames.append('{}LIST'.format(list_type.upper()))

    # `Node` is our list, ie. MAINLIST and/or MAPLIST and/or STOPLIST
    selection = session.query(*entities).select_from(Ngram)
    return selection.filter(NodeNgram.ngram_id==Ngram.id,
                            NodeNgram.node_id==Node.id,
                            Node.parent_id==corpus_id,
                            Node.typename.in_(wanted_typenames))
def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=False,
                with_count=False):
    """Build a query listing the ngrams found in the lists of a corpus.

    corpus_id     -- id of the corpus
    list_types    -- list type(s) to include: 'main', 'map' and/or 'stop'
    with_synonyms -- also include grouped ngrams taken from the GROUPLIST
    with_count    -- return `(type, ng, score)` rows, the score coming from
                     the OCCURRENCES node of the corpus

    Returns an unexecuted query. Without `with_count`, rows are
    `(type, ng)` ordered by list type and then by ngram.
    """
    # Link between a GROUPLIST, a normal form (ngram1), and a synonym (ngram2)
    NNN = NodeNgramNgram

    # Get the list type from the Node type -- as in CSV export
    list_type = (case([(Node.typename=='MAINLIST', 'main'),
                       (Node.typename=='MAPLIST', 'map'),
                       (Node.typename=='STOPLIST', 'stop')])
                 .label('type'))

    # We will retrieve each ngram as the following tuple:
    entities = (list_type, Ngram.terms.label('ng'))

    if with_count:
        entities += (Ngram.id.label('id'),)

    # First, get ngrams from wanted lists
    ngrams = _ngrams(corpus_id, list_types, entities)

    # Secondly, exclude "synonyms" (grouped ngrams that are not normal forms).
    # We have to exclude synonyms first because data is inconsistent and some
    # of them can be both in GROUPLIST and in MAIN/MAP/STOP lists. We want to
    # take synonyms from GROUPLIST only -- see below.
    Groups = aliased(Node, name='groups')
    query = (ngrams.outerjoin(Groups, (Groups.parent_id==corpus_id) & (Groups.typename=='GROUPLIST'))
                   .outerjoin(NNN, (NNN.node_id==Groups.id) & (NNN.ngram2_id==Ngram.id))
                   .filter(NNN.ngram1_id==None))

    # If `with_synonyms` is True, add them from GROUPLIST: this is the reliable
    # source for them
    if with_synonyms:
        Synonym = aliased(Ngram)
        # Select the same columns as `entities`: the id column is only present
        # when `with_count` is set, and both sides of the UNION below must
        # have the same number of columns.
        ent = (list_type, Synonym.terms.label('ng'))
        if with_count:
            ent += (Synonym.id.label('id'),)
        synonyms = (ngrams.with_entities(*ent)
                          .filter(NNN.ngram1_id==Ngram.id,
                                  NNN.ngram2_id==Synonym.id,
                                  NNN.node_id==Groups.id,
                                  Groups.parent_id==corpus_id,
                                  Groups.typename=='GROUPLIST'))
        query = query.union(synonyms)

    # Again, data is inconsistent: MAINLIST may intersect with MAPLIST and
    # we don't want that
    if 'main' in list_types and 'map' not in list_types:
        # Exclude MAPLIST ngrams from MAINLIST
        query = query.except_(_ngrams(corpus_id, 'map', entities))

    if with_count:
        N = query.subquery()
        return (session.query(N.c.type, N.c.ng, NodeNodeNgram.score)
                       .join(Node, (Node.parent_id==corpus_id) & (Node.typename=='OCCURRENCES'))
                       .outerjoin(NodeNodeNgram, (NodeNodeNgram.ngram_id==N.c.id) &
                                                 (NodeNodeNgram.node1_id==Node.id) &
                                                 (NodeNodeNgram.node2_id==corpus_id)))

    # Return found ngrams sorted by list type, and then alphabetically
    return query.order_by('type', 'ng')
#!/bin/bash
#######################################################################
## ____ _
## | _ \ ___ ___| |_ __ _ _ __ ___ ___
## | |_) / _ \/ __| __/ _` | '__/ _ \/ __|
## | __/ (_) \__ \ || (_| | | | __/\__ \
## |_| \___/|___/\__\__, |_| \___||___/
## |___/
#######################################################################
# (Re)create the PostgreSQL data directory in /srv/gargandata, start the
# server, then create the "gargantua" role and the "gargandb" database.

# Password of the "gargantua" role. It can be overridden from the environment;
# the default is the value that used to be hard-coded in this script.
# The value is inserted into a quoted SQL string, so it must not contain
# quote characters.
GARGANTUA_DB_PASSWORD="${GARGANTUA_DB_PASSWORD:-C8kdcUrAQy66U}"

echo "::::: POSTGRESQL :::::"
# NOTE(review): this drops the 9.4 cluster while every step below targets
# 9.6 -- confirm which version is meant.
su postgres -c 'pg_dropcluster 9.4 main --stop'
#done in docker but redoing it
rm -rf /srv/gargandata && mkdir /srv/gargandata && chown postgres:postgres /srv/gargandata
su postgres -c '/usr/lib/postgresql/9.6/bin/initdb -D /srv/gargandata/'
su postgres -c '/usr/lib/postgresql/9.6/bin/pg_ctl -D /srv/gargandata/ -l /srv/gargandata/journal_applicatif start'
# NOTE(review): the cluster is created and started in several overlapping
# ways (pg_ctl, pg_createcluster, pg_ctlcluster, service); the sequence is
# kept as-is -- confirm which steps are actually required.
su postgres -c 'pg_createcluster -D /srv/gargandata 9.6 main '
su postgres -c 'pg_ctlcluster -D /srv/gargandata 9.6 main start '
su postgres -c 'pg_ctlcluster 9.6 main start'
service postgresql start
su postgres -c "psql -c \"CREATE user gargantua WITH PASSWORD '${GARGANTUA_DB_PASSWORD}'\""
su postgres -c "createdb -O gargantua gargandb"
echo "Postgres configured"
#service postgresql stop
# try bottleneck
eventlet==0.20.1
amqp==1.4.9
anyjson==0.3.3
billiard==3.3.0.23
celery==3.1.25
chardet==2.3.0
dateparser==0.3.5
Django==1.10.5
django-celery==3.2.1
django-pgfields==1.4.4
django-pgjsonb==0.0.23
djangorestframework==3.5.3
html5lib==0.9999999
#python-igraph>=0.7.1
jdatetime==1.7.2
kombu==3.0.37 # messaging
langdetect==1.0.6 # language detection
nltk==3.1
numpy==1.13.1
psycopg2==2.6.2
pycountry==1.20
python-dateutil==2.4.2
pytz==2016.10 # timezones
PyYAML==3.11
RandomWords==0.1.12
ujson==1.35
umalqurra==0.2 # arabic calendars (?? why use ??)
networkx==1.11
pandas==0.18.0
six==1.10.0
lxml==3.5.0
requests-futures==0.9.7
bs4==0.0.1
requests==2.10.0
djangorestframework-jwt==1.9.0
jupyter==1.0.0
jupyter-client==5.0.0
jupyter-console==5.1.0
jupyter-core==4.3.0
ipython==5.2.0
ipython-genutils==0.1.0
ipywidgets
matplotlib==2.0.2
alembic>=0.9.2
SQLAlchemy==1.1.14
SQLAlchemy-Searchable==0.10.4
SQLAlchemy-Utils==0.32.16
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Advanced Gargantext Tutorial (Python)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0, '/srv/gargantext')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# First import the library Gargantext Notebook\n",
"from gargantext_notebook import *\n",
"\n",
"# This enables to draw graphics later\n",
"%matplotlib inline "
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"source": [
"# Philomemies"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Instantiate the corpus you are working on"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"My corpus id is : 302695.\n"
]
}
],
"source": [
"corpus_url = \"http://localhost:8000/projects/302694/corpora/302695/\"\n",
"corpus_id = corpus_url.split(\"/\")[6]\n",
"print(\"My corpus id is : %s.\" % corpus_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Getting the Map Terms "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(21, 'environment'), (42, 'development'), (184, 'examples'), (196, 'water'), (368, 'problem'), (576, 'work'), (654, 'technology'), (712, 'number'), (738, 'operation'), (817, 'experiments')]\n"
]
}
],
"source": [
"from gargantext.models import *\n",
"import csv\n",
"\n",
"map_id = session.query(MaplistNode.id).filter(MaplistNode.parent_id == corpus_id).first()\n",
"\n",
"mapTerms = (session.query(Ngram).join( NodeNgram, NodeNgram.ngram_id == Ngram.id)\n",
" .filter(NodeNgram.node_id == map_id)\n",
" .all()\n",
" )\n",
"\n",
"print([(m.id, m.terms) for m in mapTerms[:10]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save in CSV File"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"toPrint = [(m.id,m.terms) for m in mapTerms]\n",
"csvfile = \"./MapTerms.csv\"\n",
"\n",
"#Assuming res is a flat list\n",
"with open(csvfile, \"w\") as output:\n",
" writer = csv.writer(output, lineterminator='\\n')\n",
" for val in toPrint:\n",
" writer.writerow([val])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Next:\n",
"# You can access your CSV file in the home of your Notebook!\n",
"# Click, rename, mv, delete in your Notebook\n",
"\n",
"#Assuming output is a list of lists\n",
"#with open(csvfile, \"w\") as output:\n",
"# writer = csv.writer(output, lineterminator='\\n')\n",
"# writer.writerows(res)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Occurrences of MapTerms by Year"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(21370, 4.0),\n",
" (59430, 1.0),\n",
" (651305, 1.0),\n",
" (2360749, 1.0),\n",
" (1449939, 1.0),\n",
" (643027, 1.0),\n",
" (2364476, 1.0),\n",
" (2360737, 1.0),\n",
" (2365155, 1.0),\n",
" (2363638, 1.0),\n",
" (1443297, 2.0),\n",
" (1384982, 1.0),\n",
" (2360189, 1.0),\n",
" (525938, 1.0),\n",
" (2362296, 1.0),\n",
" (1411893, 1.0),\n",
" (2361160, 2.0),\n",
" (2362533, 1.0),\n",
" (499432, 2.0),\n",
" (734655, 1.0),\n",
" (2363202, 1.0),\n",
" (635348, 1.0),\n",
" (2365036, 1.0),\n",
" (2360700, 1.0),\n",
" (2362383, 1.0),\n",
" (567202, 1.0),\n",
" (2177469, 1.0),\n",
" (1422236, 1.0),\n",
" (2361517, 1.0),\n",
" (598620, 2.0),\n",
" (2364289, 1.0),\n",
" (1629967, 1.0),\n",
" (731546, 1.0),\n",
" (680861, 1.0),\n",
" (2363196, 1.0),\n",
" (2358884, 4.0),\n",
" (674406, 2.0),\n",
" (13012, 3.0),\n",
" (2360148, 1.0),\n",
" (622538, 1.0),\n",
" (1585366, 2.0),\n",
" (647149, 1.0),\n",
" (2358559, 1.0),\n",
" (2365513, 2.0),\n",
" (512496, 3.0),\n",
" (2365043, 1.0),\n",
" (2359304, 1.0),\n",
" (2362593, 1.0),\n",
" (513711, 3.0),\n",
" (492233, 1.0),\n",
" (64331, 3.0),\n",
" (2365676, 1.0),\n",
" (2360498, 1.0),\n",
" (445661, 1.0),\n",
" (2364442, 1.0),\n",
" (2362607, 1.0),\n",
" (806318, 1.0),\n",
" (1895463, 1.0),\n",
" (2359089, 1.0),\n",
" (2361007, 1.0),\n",
" (157477, 1.0),\n",
" (2364599, 1.0),\n",
" (2364315, 2.0),\n",
" (2360787, 2.0),\n",
" (57317, 3.0),\n",
" (514091, 8.0),\n",
" (31997, 1.0),\n",
" (2362566, 1.0),\n",
" (685604, 1.0),\n",
" (2170353, 1.0),\n",
" (501580, 1.0),\n",
" (2360868, 1.0),\n",
" (26, 25.0),\n",
" (2358702, 4.0),\n",
" (620585, 1.0),\n",
" (742237, 3.0),\n",
" (22883, 2.0),\n",
" (2362338, 1.0),\n",
" (1688637, 1.0),\n",
" (2362267, 1.0),\n",
" (2364837, 1.0),\n",
" (2360978, 1.0),\n",
" (501714, 1.0),\n",
" (2364106, 1.0),\n",
" (486294, 2.0),\n",
" (2120016, 1.0),\n",
" (2365812, 1.0),\n",
" (491783, 1.0),\n",
" (2362771, 1.0),\n",
" (2363111, 1.0),\n",
" (1433227, 1.0),\n",
" (2362738, 2.0),\n",
" (2360235, 1.0),\n",
" (2359671, 1.0),\n",
" (929961, 1.0),\n",
" (2360102, 1.0),\n",
" (2360196, 1.0),\n",
" (622587, 1.0),\n",
" (2365674, 1.0),\n",
" (2359850, 1.0),\n",
" (2362364, 1.0),\n",
" (2365945, 1.0),\n",
" (2360968, 1.0),\n",
" (2364469, 1.0),\n",
" (818337, 1.0),\n",
" (2364330, 1.0),\n",
" (851756, 1.0),\n",
" (481888, 2.0),\n",
" (2363028, 1.0),\n",
" (2362572, 1.0),\n",
" (619602, 1.0),\n",
" (480556, 2.0),\n",
" (617985, 1.0),\n",
" (2359001, 1.0),\n",
" (1530066, 3.0),\n",
" (2361189, 1.0),\n",
" (2365119, 1.0),\n",
" (2359622, 1.0),\n",
" (2358673, 1.0),\n",
" (1770959, 1.0),\n",
" (2359417, 1.0),\n",
" (2364799, 1.0),\n",
" (1625445, 1.0),\n",
" (2359191, 1.0),\n",
" (1637994, 1.0),\n",
" (2364004, 1.0),\n",
" (2365841, 1.0),\n",
" (2361921, 1.0),\n",
" (2363485, 1.0),\n",
" (2364956, 1.0),\n",
" (2363993, 1.0),\n",
" (703437, 1.0),\n",
" (2365657, 1.0),\n",
" (477579, 4.0),\n",
" (2364405, 1.0),\n",
" (931092, 1.0),\n",
" (16034, 1.0),\n",
" (55673, 3.0),\n",
" (83733, 1.0),\n",
" (632306, 8.0),\n",
" (2365015, 1.0),\n",
" (1380705, 1.0),\n",
" (2364241, 1.0),\n",
" (2361341, 1.0),\n",
" (2365226, 1.0),\n",
" (2360270, 3.0),\n",
" (2359257, 1.0),\n",
" (513664, 2.0),\n",
" (2363752, 1.0),\n",
" (2358578, 5.0),\n",
" (462354, 1.0),\n",
" (2364333, 2.0),\n",
" (2365625, 1.0),\n",
" (2136540, 1.0),\n",
" (438777, 1.0),\n",
" (1395914, 1.0),\n",
" (509545, 1.0),\n",
" (2360917, 1.0),\n",
" (2364219, 1.0),\n",
" (2361672, 1.0),\n",
" (919892, 1.0),\n",
" (2361169, 1.0),\n",
" (2363689, 1.0),\n",
" (631491, 3.0),\n",
" (1608035, 1.0),\n",
" (2363660, 2.0),\n",
" (2363106, 1.0),\n",
" (1324144, 1.0),\n",
" (5561, 14.0),\n",
" (2361420, 1.0),\n",
" (2364011, 1.0),\n",
" (1438416, 2.0),\n",
" (629048, 2.0),\n",
" (586132, 1.0),\n",
" (690740, 1.0),\n",
" (494644, 6.0),\n",
" (2359973, 1.0),\n",
" (2364755, 1.0),\n",
" (673739, 1.0),\n",
" (296, 1.0),\n",
" (926220, 2.0),\n",
" (807705, 1.0),\n",
" (528702, 1.0),\n",
" (16802, 4.0),\n",
" (2360888, 1.0),\n",
" (568435, 1.0),\n",
" (2359540, 1.0),\n",
" (1387358, 3.0),\n",
" (2359551, 3.0),\n",
" (497582, 43.0),\n",
" (1677149, 1.0),\n",
" (1355614, 4.0),\n",
" (1201745, 1.0),\n",
" (505837, 1.0),\n",
" (559722, 1.0),\n",
" (2365008, 2.0),\n",
" (2365846, 1.0),\n",
" (2360910, 1.0),\n",
" (1516185, 1.0),\n",
" (2365055, 1.0),\n",
" (2360713, 1.0),\n",
" (2363076, 1.0),\n",
" (2363231, 1.0),\n",
" (2361690, 1.0),\n",
" (1514633, 1.0),\n",
" (2361995, 1.0),\n",
" (2363636, 1.0),\n",
" (2363301, 1.0),\n",
" (440819, 1.0),\n",
" (2365719, 2.0),\n",
" (2362375, 1.0),\n",
" (735539, 1.0),\n",
" (2361324, 1.0),\n",
" (120180, 1.0),\n",
" (2170107, 1.0),\n",
" (2363634, 1.0),\n",
" (2362961, 1.0),\n",
" (2364791, 1.0),\n",
" (2360526, 1.0),\n",
" (1921124, 1.0),\n",
" (2364312, 2.0),\n",
" (2359118, 3.0),\n",
" (63107, 2.0),\n",
" (2361984, 2.0),\n",
" (499205, 1.0),\n",
" (8604, 13.0),\n",
" (2362915, 1.0),\n",
" (2363378, 1.0),\n",
" (720125, 3.0),\n",
" (302111, 1.0),\n",
" (655753, 1.0),\n",
" (735895, 2.0),\n",
" (2365447, 2.0),\n",
" (2360850, 1.0),\n",
" (689048, 2.0),\n",
" (445111, 1.0),\n",
" (503269, 2.0),\n",
" (2359395, 2.0),\n",
" (1405763, 1.0),\n",
" (829454, 1.0),\n",
" (2365278, 1.0),\n",
" (2362406, 1.0),\n",
" (2362394, 1.0),\n",
" (494627, 1.0),\n",
" (2362131, 1.0),\n",
" (2362087, 1.0),\n",
" (1353261, 2.0),\n",
" (2361179, 1.0),\n",
" (2362444, 2.0),\n",
" (2360429, 1.0),\n",
" (2362294, 1.0),\n",
" (469284, 2.0),\n",
" (1893049, 1.0),\n",
" (2365809, 3.0),\n",
" (2359723, 2.0),\n",
" (2363078, 1.0),\n",
" (2360239, 1.0),\n",
" (2362494, 1.0),\n",
" (1877521, 1.0),\n",
" (2360110, 4.0),\n",
" (2363186, 1.0),\n",
" (884258, 1.0),\n",
" (2359352, 1.0),\n",
" (2522, 3.0),\n",
" (2362417, 1.0),\n",
" (450837, 1.0),\n",
" (2364726, 1.0),\n",
" (2363699, 1.0),\n",
" (2364702, 1.0),\n",
" (2359174, 1.0),\n",
" (1963, 1.0),\n",
" (559468, 1.0),\n",
" (6118, 9.0),\n",
" (2359177, 2.0),\n",
" (2362514, 1.0),\n",
" (2362221, 1.0),\n",
" (2365090, 1.0),\n",
" (2365503, 1.0),\n",
" (527113, 1.0),\n",
" (2362930, 1.0),\n",
" (2362782, 1.0),\n",
" (2365635, 1.0),\n",
" (54751, 1.0),\n",
" (513650, 1.0),\n",
" (2362227, 2.0),\n",
" (608048, 1.0),\n",
" (2360822, 1.0),\n",
" (2365091, 1.0),\n",
" (2364883, 1.0),\n",
" (2362610, 1.0),\n",
" (620473, 9.0),\n",
" (1411038, 2.0),\n",
" (29247, 12.0),\n",
" (624176, 4.0),\n",
" (2364503, 1.0),\n",
" (7150, 1.0),\n",
" (2358794, 2.0),\n",
" (2361782, 1.0),\n",
" (2362586, 1.0),\n",
" (2360037, 1.0),\n",
" (1429116, 1.0),\n",
" (2359620, 1.0),\n",
" (923, 4.0),\n",
" (2361933, 1.0),\n",
" (2360660, 1.0),\n",
" (2365277, 2.0),\n",
" (2191553, 1.0),\n",
" (2364895, 2.0),\n",
" (2364275, 2.0),\n",
" (2361536, 1.0),\n",
" (2365404, 1.0),\n",
" (2359764, 1.0),\n",
" (1561871, 2.0),\n",
" (559320, 1.0),\n",
" (873327, 1.0),\n",
" (658039, 1.0),\n",
" (2359213, 2.0),\n",
" (2359535, 1.0),\n",
" (2361736, 1.0),\n",
" (2364559, 1.0),\n",
" (1623384, 1.0),\n",
" (30980, 1.0),\n",
" (750366, 1.0),\n",
" (20356, 1.0),\n",
" (2365921, 1.0),\n",
" (2152944, 1.0),\n",
" (587010, 1.0),\n",
" (849909, 11.0),\n",
" (14527, 1.0),\n",
" (8011, 35.0),\n",
" (2361030, 1.0),\n",
" (1545504, 1.0),\n",
" (2361015, 1.0),\n",
" (2365040, 1.0),\n",
" (1447721, 9.0),\n",
" (2362086, 1.0),\n",
" (2362995, 1.0),\n",
" (63843, 1.0),\n",
" (2365793, 1.0),\n",
" (21, 7.0),\n",
" (545578, 1.0),\n",
" (2362704, 1.0),\n",
" (2360704, 1.0),\n",
" (10704, 3.0),\n",
" (3942, 4.0),\n",
" (5270, 6.0),\n",
" (2361778, 1.0),\n",
" (2363553, 1.0),\n",
" (2364310, 1.0),\n",
" (1301103, 1.0),\n",
" (444719, 3.0),\n",
" (2359886, 1.0),\n",
" (2362677, 1.0),\n",
" (2359658, 1.0),\n",
" (2358746, 10.0),\n",
" (21645, 2.0),\n",
" (2360518, 1.0),\n",
" (2364300, 1.0),\n",
" (1387595, 1.0),\n",
" (2362101, 1.0),\n",
" (2364435, 1.0),\n",
" (2365058, 2.0),\n",
" (2359112, 1.0),\n",
" (2360899, 1.0),\n",
" (2362248, 1.0),\n",
" (854727, 1.0),\n",
" (1423016, 1.0),\n",
" (1413873, 1.0),\n",
" (2363707, 1.0),\n",
" (2363157, 1.0),\n",
" (2153, 7.0),\n",
" (934934, 1.0),\n",
" (616231, 1.0),\n",
" (511566, 1.0),\n",
" (2364500, 1.0),\n",
" (2361001, 1.0),\n",
" (1397541, 1.0),\n",
" (587884, 2.0),\n",
" (2365532, 1.0),\n",
" (8410, 1.0),\n",
" (827517, 1.0),\n",
" (19604, 1.0),\n",
" (2359015, 1.0),\n",
" (2359056, 1.0),\n",
" (2362183, 1.0),\n",
" (2365154, 1.0),\n",
" (2360190, 1.0),\n",
" (2358618, 1.0),\n",
" (463674, 38.0),\n",
" (1703021, 1.0),\n",
" (850864, 1.0),\n",
" (2361383, 1.0),\n",
" (2363051, 1.0),\n",
" (515051, 1.0),\n",
" (506340, 1.0),\n",
" (147281, 3.0),\n",
" (2359145, 6.0),\n",
" (2361831, 1.0),\n",
" (1307142, 1.0),\n",
" (2362005, 1.0),\n",
" (2362907, 1.0),\n",
" (2363086, 1.0),\n",
" (1780184, 1.0),\n",
" (2359967, 3.0),\n",
" (2341709, 1.0),\n",
" (2361449, 1.0),\n",
" (2195068, 1.0),\n",
" (1550538, 1.0),\n",
" (2359930, 1.0),\n",
" (2358556, 1.0),\n",
" (2359028, 1.0),\n",
" (2362000, 1.0),\n",
" (477488, 1.0),\n",
" (1325934, 1.0),\n",
" (2358872, 1.0),\n",
" (532439, 1.0),\n",
" (2359331, 1.0),\n",
" (2359288, 1.0),\n",
" (526473, 1.0),\n",
" (786352, 1.0),\n",
" (2362121, 1.0),\n",
" (29473, 2.0),\n",
" (2363837, 1.0),\n",
" (2364991, 1.0),\n",
" (2364888, 1.0),\n",
" (902377, 1.0),\n",
" (2363525, 1.0),\n",
" (2364401, 1.0),\n",
" (2365986, 1.0),\n",
" (2361401, 1.0),\n",
" (2365266, 1.0),\n",
" (1713272, 1.0),\n",
" (2359931, 1.0),\n",
" (506213, 1.0),\n",
" (2361843, 1.0),\n",
" (1694972, 1.0),\n",
" (590807, 1.0),\n",
" (2363469, 1.0),\n",
" (510679, 1.0),\n",
" (794150, 1.0),\n",
" (519092, 2.0),\n",
" (1733, 18.0),\n",
" (3061, 2.0),\n",
" (1585972, 1.0),\n",
" (742843, 1.0),\n",
" (520505, 1.0),\n",
" (2360506, 1.0),\n",
" (2364047, 1.0),\n",
" (2363234, 1.0),\n",
" (987, 5.0),\n",
" (509404, 1.0),\n",
" (1522832, 2.0),\n",
" (2359095, 1.0),\n",
" (1436961, 2.0),\n",
" (1201089, 2.0),\n",
" (2361240, 1.0),\n",
" (2362356, 1.0),\n",
" (2365630, 1.0),\n",
" (1602420, 1.0),\n",
" (2362337, 1.0),\n",
" (2364139, 1.0),\n",
" (2362046, 1.0),\n",
" (504418, 1.0),\n",
" (2152668, 1.0),\n",
" (2362102, 2.0),\n",
" (8096, 2.0),\n",
" (228091, 1.0),\n",
" (2365067, 1.0),\n",
" (2362173, 1.0),\n",
" (1521046, 3.0),\n",
" (2361475, 1.0),\n",
" (13387, 1.0),\n",
" (2364137, 1.0),\n",
" (2359308, 1.0),\n",
" (2360943, 1.0),\n",
" (1658105, 4.0),\n",
" (494569, 1.0),\n",
" (94, 1.0),\n",
" (55639, 2.0),\n",
" (2777, 2.0),\n",
" (418077, 1.0),\n",
" (62608, 1.0),\n",
" (2361594, 1.0),\n",
" (2358806, 1.0),\n",
" (482756, 1.0),\n",
" (2361127, 1.0),\n",
" (2364255, 1.0),\n",
" (2329826, 2.0),\n",
" (2361084, 2.0),\n",
" (2360560, 1.0),\n",
" (623059, 1.0),\n",
" (2445, 3.0),\n",
" (81429, 2.0),\n",
" (1179801, 1.0),\n",
" (2362862, 1.0),\n",
" (2361703, 1.0),\n",
" (2359312, 1.0),\n",
" (9826, 2.0),\n",
" (2364379, 1.0),\n",
" (527741, 1.0),\n",
" (2364189, 1.0),\n",
" (2359316, 2.0),\n",
" (584752, 9.0),\n",
" (1641794, 1.0),\n",
" (2365861, 1.0),\n",
" (1208011, 1.0),\n",
" (20970, 1.0),\n",
" (1937, 5.0),\n",
" (5359, 4.0),\n",
" (1752091, 1.0),\n",
" (1375448, 1.0),\n",
" (595143, 1.0),\n",
" (2364461, 1.0),\n",
" (5682, 1.0),\n",
" (2362063, 1.0),\n",
" (21879, 1.0),\n",
" (2360701, 1.0),\n",
" (2358977, 1.0),\n",
" (2361154, 1.0),\n",
" (2362340, 1.0),\n",
" (1785700, 1.0),\n",
" (2362842, 1.0),\n",
" (2359448, 1.0),\n",
" (457564, 1.0),\n",
" (8397, 13.0),\n",
" (2361431, 1.0),\n",
" (2365743, 1.0),\n",
" (1589760, 3.0),\n",
" (535634, 1.0),\n",
" (442566, 1.0),\n",
" (542422, 2.0),\n",
" (2362697, 1.0),\n",
" (439327, 4.0),\n",
" (1479888, 1.0),\n",
" (2363995, 1.0),\n",
" (2035, 8.0),\n",
" (20992, 2.0),\n",
" (2362680, 1.0),\n",
" (2362363, 1.0),\n",
" (2360139, 1.0),\n",
" (1767285, 1.0),\n",
" (676959, 5.0),\n",
" (2359645, 1.0),\n",
" (595179, 1.0),\n",
" (10269, 1.0),\n",
" (2359685, 1.0),\n",
" (2361384, 1.0),\n",
" (2364845, 1.0),\n",
" (2359606, 1.0),\n",
" (913230, 2.0),\n",
" (2361786, 1.0),\n",
" (2364482, 1.0),\n",
" (2358728, 1.0),\n",
" (1780966, 1.0),\n",
" (2358622, 1.0),\n",
" (2359594, 1.0),\n",
" (2360310, 1.0),\n",
" (455269, 1.0),\n",
" (2361842, 1.0),\n",
" (2358852, 1.0),\n",
" (2361900, 1.0),\n",
" (2358908, 1.0),\n",
" (2365963, 1.0),\n",
" (2359772, 1.0),\n",
" (2360319, 1.0),\n",
" (1317685, 1.0),\n",
" (2361684, 3.0),\n",
" (2363498, 1.0),\n",
" (2359707, 1.0),\n",
" (2364188, 1.0),\n",
" (2143737, 2.0),\n",
" (2362457, 1.0),\n",
" (512968, 1.0),\n",
" (2880, 2.0),\n",
" (2360412, 1.0),\n",
" (2361277, 1.0),\n",
" (1390970, 1.0),\n",
" (2365974, 1.0),\n",
" (2361896, 1.0),\n",
" (725235, 1.0),\n",
" (2362316, 1.0),\n",
" (2364158, 1.0),\n",
" (2365037, 1.0),\n",
" (502824, 1.0),\n",
" (2363295, 2.0),\n",
" (2363599, 1.0),\n",
" (2364585, 1.0),\n",
" (2365786, 1.0),\n",
" (536579, 2.0),\n",
" (2359141, 2.0),\n",
" (2359301, 1.0),\n",
" (2365386, 1.0),\n",
" (3009, 3.0),\n",
" (2364890, 1.0),\n",
" (59339, 1.0),\n",
" (2362906, 1.0),\n",
" (2119440, 1.0),\n",
" (2361640, 1.0),\n",
" (2364210, 1.0),\n",
" (2359236, 1.0),\n",
" (493981, 1.0),\n",
" (622177, 1.0),\n",
" (2365989, 1.0),\n",
" (1456511, 3.0),\n",
" (112504, 1.0),\n",
" (2363967, 2.0),\n",
" (2363633, 1.0),\n",
" (1513182, 1.0),\n",
" (2365117, 1.0),\n",
" (5332, 6.0),\n",
" (2360334, 1.0),\n",
" (2360666, 1.0),\n",
" (1642133, 4.0),\n",
" (2363528, 1.0),\n",
" (830264, 1.0),\n",
" (1509930, 1.0),\n",
" (7608, 1.0),\n",
" (2363558, 1.0),\n",
" (1435699, 1.0),\n",
" (2360637, 1.0),\n",
" (2360856, 1.0),\n",
" (2359505, 1.0),\n",
" (2363393, 1.0),\n",
" (3599, 1.0),\n",
" (11037, 1.0),\n",
" (578835, 1.0),\n",
" (2362787, 1.0),\n",
" (2363423, 1.0),\n",
" (2359353, 1.0),\n",
" (2362875, 1.0),\n",
" (2359700, 1.0),\n",
" (2165377, 1.0),\n",
" (2361553, 1.0),\n",
" (2363307, 5.0),\n",
" (2365987, 1.0),\n",
" (850295, 1.0),\n",
" (2365369, 1.0),\n",
" (2363897, 1.0),\n",
" (4825, 1.0),\n",
" (2251432, 1.0),\n",
" (456369, 1.0),\n",
" (2359058, 6.0),\n",
" (912625, 1.0),\n",
" (2359848, 1.0),\n",
" (2360533, 1.0),\n",
" (2156267, 1.0),\n",
" (2364731, 1.0),\n",
" (1416113, 1.0),\n",
" (2365228, 1.0),\n",
" (2361806, 1.0),\n",
" (2363276, 2.0),\n",
" (2364251, 1.0),\n",
" (2364515, 1.0),\n",
" (2359615, 2.0),\n",
" (2361776, 1.0),\n",
" (182859, 1.0),\n",
" (2363194, 1.0),\n",
" (2365020, 1.0),\n",
" (2364838, 1.0),\n",
" (2365848, 1.0),\n",
" (1641124, 1.0),\n",
" (2365690, 2.0),\n",
" (534591, 1.0),\n",
" (72938, 29.0),\n",
" (661363, 1.0),\n",
" (8973, 4.0),\n",
" (311226, 1.0),\n",
" (2359475, 1.0),\n",
" (829015, 1.0),\n",
" (2361777, 1.0),\n",
" (615301, 1.0),\n",
" (2362397, 1.0),\n",
" (509336, 1.0),\n",
" (603785, 1.0),\n",
" (610033, 1.0),\n",
" (2362519, 1.0),\n",
" (2360994, 1.0),\n",
" (1500460, 1.0),\n",
" (1587560, 2.0),\n",
" (2362004, 1.0),\n",
" (2365875, 1.0),\n",
" (2362539, 2.0),\n",
" (2363704, 1.0),\n",
" (2364974, 1.0),\n",
" (2361217, 1.0),\n",
" (2361682, 1.0),\n",
" (62444, 1.0),\n",
" (2360507, 1.0),\n",
" (2360515, 1.0),\n",
" (1891144, 1.0),\n",
" (2361650, 1.0),\n",
" (2363585, 1.0),\n",
" (8861, 1.0),\n",
" (669920, 2.0),\n",
" (2364078, 1.0),\n",
" (2363179, 1.0),\n",
" (2364103, 1.0),\n",
" (2360001, 2.0),\n",
" (1553516, 2.0),\n",
" (13863, 3.0),\n",
" (606638, 1.0),\n",
" (7123, 2.0),\n",
" (2360375, 1.0),\n",
" (846902, 2.0),\n",
" (1426631, 2.0),\n",
" (2364606, 1.0),\n",
" (56567, 2.0),\n",
" (2362827, 1.0),\n",
" (3774, 2.0),\n",
" (1640013, 1.0),\n",
" (2362743, 1.0),\n",
" (1373633, 1.0),\n",
" (2359834, 2.0),\n",
" (507624, 3.0),\n",
" (221550, 1.0),\n",
" (603246, 1.0),\n",
" (495367, 1.0),\n",
" (2361515, 2.0),\n",
" (2359822, 1.0),\n",
" (1737286, 3.0),\n",
" (2364808, 1.0),\n",
" (2365725, 1.0),\n",
" (2361772, 1.0),\n",
" (1651902, 1.0),\n",
" (2363306, 1.0),\n",
" (619, 1.0),\n",
" (1629163, 1.0),\n",
" (1504097, 1.0),\n",
" (2362986, 1.0),\n",
" (2364864, 1.0),\n",
" (2360673, 1.0),\n",
" (2362113, 1.0),\n",
" (2359830, 1.0),\n",
" (2361568, 1.0),\n",
" (2364434, 1.0),\n",
" (1458249, 7.0),\n",
" (2360311, 1.0),\n",
" (529246, 1.0),\n",
" (1488668, 1.0),\n",
" (2363642, 1.0),\n",
" (2360653, 1.0),\n",
" (1559068, 1.0),\n",
" (2365321, 1.0),\n",
" (1457684, 1.0),\n",
" (438646, 1.0),\n",
" (2365810, 1.0),\n",
" (2365732, 1.0),\n",
" (1412614, 1.0),\n",
" (2359828, 1.0),\n",
" (2361086, 1.0),\n",
" (481165, 1.0),\n",
" (1415000, 4.0),\n",
" (2361620, 1.0),\n",
" (1519582, 1.0),\n",
" (495913, 1.0),\n",
" (571277, 1.0),\n",
" (929616, 1.0),\n",
" (1496975, 1.0),\n",
" (2364259, 1.0),\n",
" (720411, 1.0),\n",
" (590431, 1.0),\n",
" (2360442, 1.0),\n",
" (10332, 16.0),\n",
" (229, 6.0),\n",
" (2364741, 1.0),\n",
" (2362709, 1.0),\n",
" (2364303, 1.0),\n",
" (849430, 1.0),\n",
" (2282498, 1.0),\n",
" (2359863, 1.0),\n",
" (2364492, 1.0),\n",
" (2362132, 1.0),\n",
" (2361029, 1.0),\n",
" (2360359, 1.0),\n",
" (2365821, 1.0),\n",
" (2361837, 1.0),\n",
" (2364649, 1.0),\n",
" (477731, 1.0),\n",
" (2365708, 1.0),\n",
" (520153, 1.0),\n",
" (721226, 1.0),\n",
" (1507049, 1.0),\n",
" (2359250, 1.0),\n",
" (1444, 2.0),\n",
" (2359380, 2.0),\n",
" (2358611, 1.0),\n",
" (2365631, 1.0),\n",
" (2358674, 1.0),\n",
" (498799, 2.0),\n",
" (518187, 2.0),\n",
" (1882294, 1.0),\n",
" (2364641, 1.0),\n",
" (2364180, 1.0),\n",
" (2358754, 1.0),\n",
" (22225, 1.0),\n",
" (1605044, 1.0),\n",
" (2365651, 1.0),\n",
" (1778186, 2.0),\n",
" (561922, 2.0),\n",
" (17401, 5.0),\n",
" (136897, 1.0),\n",
" (2365808, 1.0),\n",
" (2360158, 2.0),\n",
" (2361616, 1.0),\n",
" (2362954, 1.0),\n",
" (2364321, 1.0),\n",
" (2362764, 1.0),\n",
" (2361022, 1.0),\n",
" (2361951, 1.0),\n",
" (582950, 1.0),\n",
" (589092, 5.0),\n",
" (2362133, 1.0),\n",
" (2363691, 2.0),\n",
" (2364517, 1.0),\n",
" (60812, 4.0),\n",
" (2360940, 1.0),\n",
" (7581, 1.0),\n",
" (2364208, 1.0),\n",
" (2363030, 2.0),\n",
" (2360667, 1.0),\n",
" (16074, 3.0),\n",
" (2359460, 2.0),\n",
" (1212403, 1.0),\n",
" (2361133, 1.0),\n",
" (1307614, 1.0),\n",
" (2363300, 1.0),\n",
" (676195, 2.0),\n",
" (1386896, 1.0),\n",
" (2362905, 1.0),\n",
" (460493, 1.0),\n",
" (1754392, 1.0),\n",
" (2365403, 2.0),\n",
" (2361743, 1.0),\n",
" (1536985, 1.0),\n",
" (2359239, 1.0),\n",
" (2362454, 1.0),\n",
" (2364031, 1.0),\n",
" (2364967, 1.0),\n",
" (2363483, 1.0),\n",
" (531152, 1.0),\n",
" (628079, 1.0),\n",
" (2364775, 1.0),\n",
" (2360912, 1.0),\n",
" (2362164, 1.0),\n",
" (2361361, 1.0),\n",
" (2364337, 1.0),\n",
" (2360479, 1.0),\n",
" (1636750, 1.0),\n",
" (2362756, 1.0),\n",
" (6776, 28.0),\n",
" (2359728, 1.0),\n",
" (1509353, 1.0),\n",
" (2363718, 2.0),\n",
" (2360247, 1.0),\n",
" (14320, 3.0),\n",
" (2362270, 1.0),\n",
" (2358695, 2.0),\n",
" (2364486, 1.0),\n",
" (622987, 1.0),\n",
" (2359037, 3.0),\n",
" (2365803, 2.0),\n",
" (2360945, 1.0),\n",
" (670095, 2.0),\n",
" (1868827, 1.0),\n",
" (854430, 1.0),\n",
" (886740, 1.0),\n",
" (2363007, 1.0),\n",
" (2365356, 1.0),\n",
" (2361581, 1.0),\n",
" (1891808, 1.0),\n",
" (2364560, 1.0),\n",
" (2358659, 2.0),\n",
" (2361679, 1.0),\n",
" (1399977, 1.0),\n",
" (2362470, 1.0),\n",
" (2362535, 1.0),\n",
" (7889, 3.0),\n",
" (2360679, 1.0),\n",
" (509662, 2.0),\n",
" (2362156, 1.0),\n",
" (2364667, 1.0),\n",
" (2362033, 1.0),\n",
" (2362283, 1.0),\n",
" (2364063, 1.0),\n",
" (2361375, 1.0),\n",
" (1475626, 1.0),\n",
" (1521047, 2.0),\n",
" (511427, 1.0),\n",
" (111326, 1.0),\n",
" (2360707, 1.0),\n",
" (505637, 2.0),\n",
" (1488943, 3.0),\n",
" (2359877, 1.0),\n",
" (2360900, 1.0),\n",
" (18972, 1.0),\n",
" (16443, 13.0),\n",
" (2363138, 1.0),\n",
" (2365566, 1.0),\n",
" (2362384, 1.0),\n",
" (2360423, 1.0),\n",
" (1493576, 2.0),\n",
" (514804, 3.0),\n",
" (2364588, 1.0),\n",
" (2363799, 1.0),\n",
" (921794, 1.0),\n",
" (453389, 1.0),\n",
" (60324, 2.0),\n",
" (2358664, 2.0),\n",
" (2365322, 1.0),\n",
" (2364109, 1.0),\n",
" (2361234, 4.0),\n",
" (4098, 1.0),\n",
" (2362380, 1.0),\n",
" (20, 38.0),\n",
" (16087, 8.0),\n",
" (1424352, 1.0),\n",
" (1651793, 4.0),\n",
" (483093, 1.0),\n",
" (497620, 1.0),\n",
" (545734, 1.0),\n",
" (44231, 1.0),\n",
" (2364363, 1.0),\n",
" (2360246, 2.0),\n",
" (2362424, 1.0),\n",
" (587390, 1.0),\n",
" (2363417, 1.0),\n",
" (2362446, 1.0),\n",
" (2364057, 1.0),\n",
" (1533417, 1.0),\n",
" (2362937, 1.0),\n",
" (514879, 9.0),\n",
" (851674, 1.0),\n",
" (2362473, 1.0),\n",
" (116746, 1.0),\n",
" (2358853, 1.0),\n",
" (495626, 1.0),\n",
" (4606, 9.0),\n",
" (504650, 3.0),\n",
" (2358915, 1.0),\n",
" (2361056, 1.0),\n",
" (2414, 1.0),\n",
" (2359008, 1.0),\n",
" (920101, 1.0),\n",
" (1468842, 1.0),\n",
" (2363359, 1.0),\n",
" (507170, 31.0),\n",
" (2358637, 1.0),\n",
" (84521, 1.0),\n",
" (2359052, 1.0),\n",
" (2360379, 1.0),\n",
" (2363611, 1.0),\n",
" (1919361, 2.0),\n",
" (732322, 1.0),\n",
" (501422, 1.0),\n",
" (2365077, 2.0),\n",
" (2360409, 1.0),\n",
" (2362312, 1.0),\n",
" (1434485, 1.0),\n",
" (522828, 1.0),\n",
" (2364297, 1.0),\n",
" (1397378, 1.0),\n",
" (2364626, 1.0),\n",
" (2363767, 1.0),\n",
" (459137, 7.0),\n",
" (1523910, 1.0),\n",
" (1623256, 1.0),\n",
" (2365326, 1.0),\n",
" (2360470, 1.0),\n",
" (2363199, 1.0),\n",
" (2363020, 1.0),\n",
" (2365890, 1.0),\n",
" (2363754, 1.0),\n",
" (2365492, 1.0),\n",
" (916266, 1.0),\n",
" (239869, 5.0),\n",
" (2363263, 1.0),\n",
" (14041, 1.0),\n",
" (2359252, 1.0),\n",
" (2362123, 1.0),\n",
" (2358623, 1.0),\n",
" (1400803, 1.0),\n",
" (2363090, 1.0),\n",
" (2363484, 1.0),\n",
" (589395, 1.0),\n",
" (2362003, 1.0),\n",
" (2359407, 1.0),\n",
" (1406864, 1.0),\n",
" (2362701, 1.0),\n",
" (882, 1.0),\n",
" (2362047, 1.0),\n",
" (2365866, 1.0),\n",
" (2365640, 1.0),\n",
" (2365581, 1.0),\n",
" (616044, 2.0),\n",
" (791952, 1.0),\n",
" (2364175, 1.0),\n",
" (692211, 2.0),\n",
" (12131, 2.0),\n",
" (2359642, 3.0),\n",
" (1414828, 1.0),\n",
" ...]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from gargantext.util.toolchain.metric_tfidf import compute_occs\n",
"\n",
"corpus= session.query(CorpusNode).get(corpus_id)\n",
"\n",
"occ_id = session.query(OccurrencesNode.id).filter(OccurrencesNode.parent_id == corpus_id).first()\n",
"group_id = session.query(GrouplistNode.id).filter(GrouplistNode.parent_id == corpus_id).first()\n",
"Occurrences = aliased(NodeNodeNgram)\n",
"MapTerms = aliased(NodeNgram)\n",
"Documents = aliased(DocumentNode)\n",
"\n",
"compute_occs(corpus, interactiv=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"mapTermsOcc = (session.query(Occurrences).join( MapTerms, MapTerms.ngram_id == Occurrences.ngram_id)\n",
" .filter(MapTerms.node_id == map_id)\n",
" \n",
" .join(Documents, Documents.id == Occurrences.node2_id)\n",
" .filter(Documents.parent_id == corpus_id)\n",
" \n",
" .filter(Occurrences.node1_id == occ_id)\n",
" \n",
" #.group_by(Occurrences.ngram_id)\n",
" .all()\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(303698)"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"group_id"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mapTermsOcc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cooccurrences of MapTerms by Year"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from graph.cooccurrences import countCooccurrences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
" (cooc_id, cooc_matrix) = countCooccurrences( corpus_id=corpus_id, cooc_id= \n",
" , field1=field1, field2=field2 \n",
" , start=start , end =end \n",
" , mapList_id=mapList_id , groupList_id=groupList_id \n",
" , isMonopartite=True , threshold = threshold \n",
" , distance=distance , bridgeness=bridgeness \n",
" , save_on_db = True , reset = reset \n",
" ) "
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GRAPH #303869 Filtering the matrix with Map and Group Lists.\n",
"WeightedMatrix bulk_insert start\n",
"WeightedMatrix bulk_insert stop\n",
"GRAPH #303869 ... Node Cooccurrence Matrix saved\n",
"GRAPH #303869 ... Parameters saved in Node.\n"
]
}
],
"source": [
"#countCooccurrences(corpus_id, save_on_db=False, start=\"2000-01-01\", end=\"2017-12-31\")\n",
"(cooc_id, cooc_matrix) = countCooccurrences( corpus_id = corpus_id\n",
" , cooc_id = None\n",
" , field1=\"ngrams\", field2 = \"ngrams\"\n",
" \n",
" , mapList_id = map_id\n",
" , groupList_id = group_id\n",
" \n",
" , isMonopartite =True , threshold = 2 \n",
" #, distance =None , bridgeness=bridgeness\n",
" \n",
" , save_on_db = True\n",
" , reset = True\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(float, {})"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cooc_matrix.items"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Number of Documents per year"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Date DateValue\n",
"Date \n",
"1954 1954 2\n",
"1956 1956 1\n",
"1957 1957 1\n",
"1958 1958 5\n",
"1960 1960 3\n",
"1961 1961 5\n",
"1962 1962 2\n",
"1963 1963 11\n",
"1964 1964 5\n",
"1965 1965 3\n"
]
}
],
"source": [
"# To get all the documents:\n",
"docs = documents(corpus_id)\n",
"# If I want to count:\n",
"myChart = chart(docs, \"publication_year\")\n",
"print(myChart[:10])"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"source": [
"# Other examples"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"139\n",
"LSTM 1000\n",
"Downloading page 0 to 100 results\n",
"Downloading page 100 to 100 results\n",
"CORPUS #303703\n",
"PARSING\n",
"Loading available PARSERS:\n",
"\t- EuropresseParser\n",
"\t- RISParser\n",
"\t- PubmedParser\n",
"\t- RISParser\n",
"\t- ISIParser\n",
"\t- RISParser\n",
"\t- CSVParser\n",
"\t- ISTexParser\n",
"\t- CernParser\n",
"\t- MultivacParser\n",
"\t- HalParser\n",
"\t- IsidoreParser\n",
"0 docs skipped\n",
"139 parsed\n",
"#MAIN language of the CORPUS __unknown__\n",
"CORPUS #303703: parsed 139\n",
"#TAGGERS LOADED: {'__unknown__': <gargantext.util.taggers.NltkTagger.NltkTagger object at 0x7f03064496a0>}\n",
"#SUPPORTED TAGGER LANGS ['__unknown__']\n",
"INTEGRATE\n",
"INTEGRATE\n",
"INTEGRATE\n",
"CORPUS #303703: extracted ngrams\n",
"CORPUS #303703: indexed hyperdata\n",
"CORPUS #303703: [2017-10-10_09:34:23] new favorites node #303843\n",
"CORPUS #303703: [2017-10-10_09:34:23] starting ngram lists computation\n",
"CORPUS #303703: [2017-10-10_09:34:24] new stoplist node #303844\n",
"# STEMMERS LOADED {'__unknown__': <nltk.stem.snowball.SnowballStemmer object at 0x7f0306497cf8>}\n",
"#SUPPORTED STEMMERS LANGS []\n",
"CORPUS #303703: [2017-10-10_09:34:25] new grouplist node #303845\n",
"CORPUS #303703: [2017-10-10_09:34:25] new occs node #303846\n",
"compute_ti_ranking\n",
"2017-10-10_09:34:25 : Starting Query tf_nd_query\n",
"2017-10-10_09:34:26 : End Query tf_nd_quer\n",
"2017-10-10_09:34:26 : tfidfsum\n",
"CORPUS #303703: [2017-10-10_09:34:26] new ti ranking node #303847\n",
"MAINLIST: keeping 3295 ngrams out of 4393\n",
"CORPUS #303703: [2017-10-10_09:34:26] new mainlist node #303848\n",
"Compute TFIDF local\n",
"CORPUS #303703: [2017-10-10_09:34:26] new localtfidf node #303849\n",
"COOCS: NEW matrix shape [215x361]\n",
"CORPUS #303703: [2017-10-10_09:34:32] computed mainlist coocs for specif rank\n",
"SPECIFICITY: computing on 209 ngrams\n",
"CORPUS #303703: [2017-10-10_09:34:32] new spec-clusion node #303853\n",
"CORPUS #303703: [2017-10-10_09:34:32] new gen-clusion node #303854\n",
"MAPLIST quotas: {'topgen': {'multigrams': 168, 'monograms': 42}, 'topspec': {'multigrams': 112, 'monograms': 28}}\n",
"MAPLIST: top_spec_monograms = 28\n",
"MAPLIST: top_spec_multigrams = 55\n",
"MAPLIST: top_gen_monograms = 42\n",
"MAPLIST: top_gen_multigrams = 0\n",
"MAPLIST: kept 125 ngrams in total \n",
"CORPUS #303703: [2017-10-10_09:34:32] new maplist node #303855\n",
"CORPUS #303703: [2017-10-10_09:34:32] FINISHED ngram lists computation\n"
]
}
],
"source": [
"#project = myProject_fromUrl(\"http://imt.gargantext.org/projects/300535\")\n",
"project = myProject_fromUrl(\"http://localhost:8000/projects/301096\")\n",
"corpus = newCorpus(project, source=\"hal\", name=\"Machine learning\", query=\"LSTM\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"6"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# search full text (english by default) in the corpus\n",
"scan_gargantext(corpus.id, \"machine | learning & deep\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# search full text (english by default) in the corpus\n",
"scan_gargantext(corpus.id, \"machine | learning & deep\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"6"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# search full text (english by default) and DELETE in the corpus\n",
"scan_gargantext_and_delete(corpus.id, \"machine | learning & deep\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"for doc in new_docs:\n",
" new_doc = (Node( user_id = project.user_id\n",
" , parent_id = corpus.id\n",
" , typename= 'DOCUMENT'\n",
" , name=doc[\"title\"][:50]\n",
" , hyperdata=doc)\n",
" )\n",
" session.add(new_doc)\n",
"session.commit()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L'identifiant du corpus est : 254749\n"
]
}
],
"source": [
"# Copy/paste the URL of the corpus to work on (including http://)\n",
"corpus_url = \"http://gargantext.org/projects/251737/corpora/254749\"\n",
"\n",
"corpus_id = corpus_url.split(\"/\")[6]\n",
"\n",
"print(\"L\\'identifiant du corpus est : %s\" % corpus_id)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# To get all the documents:\n",
"docs = documents(corpus_id)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"'Towards big data science in the decade ahead from ten years of InCoB and the 1st ISCB-Asia Joint Conference.'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the title of the first document \n",
"# [0] indicates the index of the first document\n",
"docs[0].hyperdata['title']"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"\"The 2011 International Conference on Bioinformatics (InCoB) conference, which is the annual scientific conference of the Asia-Pacific Bioinformatics Network (APBioNet), is hosted by Kuala Lumpur, Malaysia, is co-organized with the first ISCB-Asia conference of the International Society for Computational Biology (ISCB). InCoB and the sequencing of the human genome are both celebrating their tenth anniversaries and InCoB's goalposts for the next decade, implementing standards in bioinformatics and globally distributed computational networks, will be discussed and adopted at this conference. Of the 49 manuscripts (selected from 104 submissions) accepted to BMC Genomics and BMC Bioinformatics conference supplements, 24 are featured in this issue, covering software tools, genome/proteome analysis, systems biology (networks, pathways, bioimaging) and drug discovery and design.\""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the abstract of the first document (0)\n",
"docs[0].hyperdata['abstract']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"'Shoba Ranganathan, Christian Schönbach, Janet Kelso, Burkhard Rost, Sheila Nathan, Tin Wee Tan'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the authors of the first document (0)\n",
"docs[0].hyperdata['authors']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"'BMC bioinformatics'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the source of the first document (0)\n",
"docs[0].hyperdata['source']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# If I want to count:\n",
"myChart = chart(docs, \"publication_year\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f48069c5208>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEZCAYAAACZwO5kAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGl1JREFUeJzt3X+QVOWd7/H3hx+KURcUhymKQQYTNJorksnEYLnmF/E3\nFaxEXciWoMUuNzcSje7NOnvvrZuyyruFdVOyGvd6Q0IiWhvBsFHZRBMI0U1WLyoS8BdRJ4phKIUR\nlSiK8uN7/+gH0pIZupvpnp5++Lyqpvqc5zzn9Le7Tn84POd0H0UEZmaWr0H1LsDMzGrLQW9mljkH\nvZlZ5hz0ZmaZc9CbmWXOQW9mlrmygl7SNZKekfS0pLskDZM0XtKjkjolLZF0WOp7eJrvTMtba/kC\nzMzswEoGvaQxwFVAe0T8J2AwMB24EZgfER8B3gBmp1VmA2+k9vmpn5mZ1Um5QzdDgCMkDQE+BLwC\nfB5YmpYvAi5K09PSPGn5FEmqTrlmZlapIaU6RMQmSd8G/gC8CywHngDejIhdqVsXMCZNjwE2pnV3\nSdoGjARe6+05jjvuuGhtbT3Y12Bmdkh64oknXouIplL9Sga9pGMoHKWPB94Efgyc19cCJc0B5gAc\nf/zxrF69uq+bNDM7pEh6uZx+5QzdfAF4KSK6I2In8BPgTGBEGsoBaAE2pelNwNhUxBBgOLB1/41G\nxIKIaI+I9qamkv8gmZnZQSon6P8ATJb0oTTWPgV4FngQuDj1mQXcl6aXpXnS8l+FfznNzKxuSgZ9\nRDxK4aTqGuCptM4C4DrgWkmdFMbgF6ZVFgIjU/u1QEcN6jYzszJpIBxst7e3h8fozRrbzp076erq\nYseOHfUuJTvDhg2jpaWFoUOHfqBd0hMR0V5q/ZInY83MytHV1cXRRx9Na2srvqK6eiKCrVu30tXV\nxfjx4w9qG/4JBDOrih07djBy5EiHfJVJYuTIkX36n5KD3syqxiFfG319Xx30ZmaZ8xi9Vay142c1\n3f6GeRfWdPvWP6q9n5SzXwwePJhTTz2VnTt3MmTIEGbOnMk111zDoEG9H9Nu2LCBRx55hK985SsH\n3PYJJ5zAAw88wEknnbSv7Rvf+AajR4/muuuu63XbU6dO5emnny5Zey35iN7MsnHEEUewdu1annnm\nGVasWMEDDzzA9ddff8B1NmzYwI9+9KOS254+fTqLFy/eN79nzx6WLl3K9OnT+1x3rTnozSxLo0aN\nYsGCBdx6661EBBs2bOCss86ira2NtrY2HnnkEQA6Ojr4zW9+w6RJk5g/fz67d+/mm9/8Jp/85CeZ\nOHEi3/3udwGYMWMGS5Ys2bf9X//614wbN45x48b1uu1it99+O3Pnzt03P3XqVB566CEAli9fzhln\nnEFbWxuXXHIJb7/9dlXfCwe9mWXrhBNOYPfu3WzZsoVRo0axYsUK1qxZw5IlS7jqqqsAmDdvHmed\ndRZr167lmmuuYeHChQwfPpzHH3+cxx9/nO9973u89NJLnHrqqQwaNIh169YBsHjxYmbMmAHQ67bL\n8dprr3HDDTfwy1/+kjVr1tDe3s5NN91U1ffBY/RmdkjYuXMnc+fOZe3atQwePJjnn3++x37Lly/n\nySefZOnSwq+wb9u2jRdeeIHx48czY8YMFi9ezMc+9jHuvffefcNC5W67J6tWreLZZ5/lzDPPBOD9\n99/njDPO6OOr/SAHvZll68UXX2Tw4MGMGjWK66+/nubmZtatW8eePXsYNmxYj+tEBN/5znc499xz\n/2zZ9OnTOeecc/jMZz7DxIkTaW5uBmD+/Pkltz1kyBD27Nmzb37vdfERwdlnn81dd91VjZfcIw/d\nmFmWuru7+epXv8rcuXORxLZt2xg9ejSDBg3izjvvZPfu3QAcffTRvPXWW/vWO/fcc7ntttvYuXMn\nAM8//zzbt28H4MMf/jDHHXccHR0d+4ZtgF63
Xay1tZW1a9eyZ88eNm7cyGOPPQbA5MmTefjhh+ns\n7ARg+/btFf2PoBw+ojezmqjHZbLvvvsukyZN2nd55WWXXca1114LwNe+9jW+/OUvc8cdd3Deeedx\n5JFHAjBx4kQGDx7MaaedxuWXX87VV1/Nhg0baGtrIyJoamri3nvv3fccM2bMoKOjgy996Uv72nrb\ndrEzzzyT8ePHc8opp3DyySfT1tYGQFNTE7fffjszZszgvffeA+CGG27gxBNPrNr74h81s4r5Onrr\nyfr16zn55JPrXUa2enp/y/1RMw/dmJllzkFvZpY5B72ZVc1AGArOUV/fVwe9mVXFsGHD2Lp1q8O+\nyvb+Hn1vl4OWw1fdmFlVtLS00NXVRXd3d71Lyc7eO0wdrJJBL+kkYElR0wnA/wTuSO2twAbg0oh4\nI91A/GbgAuAd4PKIWHPQFZpZQxg6dOhB3wHJaqucm4M/FxGTImIS8AkK4X0PhZt+r4yICcBK/nQT\n8POBCelvDnBbLQo3M7PyVDpGPwX4fUS8DEwDFqX2RcBFaXoacEcUrAJGSBpdlWrNzKxilQb9dGDv\nDzI0R8QrafpVoDlNjwE2Fq3TldrMzKwOyg56SYcBXwR+vP+yKJxmr+hUu6Q5klZLWu2TN2ZmtVPJ\nEf35wJqI2JzmN+8dkkmPW1L7JmBs0Xotqe0DImJBRLRHRHtTU1PllZuZWVkqCfoZ/GnYBmAZMCtN\nzwLuK2qfqYLJwLaiIR4zM+tnZV1HL+lI4GzgPxc1zwPuljQbeBm4NLXfT+HSyk4KV+hcUbVqzcys\nYmUFfURsB0bu17aVwlU4+/cN4MqqVGdmZn3mn0AwM8ucg97MLHMOejOzzDnozcwy56A3M8ucg97M\nLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3\nM8ucg97MLHMOejOzzJUV9JJGSFoq6XeS1ks6Q9KxklZIeiE9HpP6StItkjolPSmprbYvwczMDqTc\nI/qbgZ9HxEeB04D1QAewMiImACvTPMD5wIT0Nwe4raoVm5lZRUoGvaThwKeBhQAR8X5EvAlMAxal\nbouAi9L0NOCOKFgFjJA0uuqVm5lZWco5oh8PdAM/lPRbSd+XdCTQHBGvpD6vAs1pegywsWj9rtT2\nAZLmSFotaXV3d/fBvwIzMzugcoJ+CNAG3BYRHwe286dhGgAiIoCo5IkjYkFEtEdEe1NTUyWrmplZ\nBcoJ+i6gKyIeTfNLKQT/5r1DMulxS1q+CRhbtH5LajMzszooGfQR8SqwUdJJqWkK8CywDJiV2mYB\n96XpZcDMdPXNZGBb0RCPmZn1syFl9vs68C+SDgNeBK6g8I/E3ZJmAy8Dl6a+9wMXAJ3AO6mvmZnV\nSVlBHxFrgfYeFk3poW8AV/axLjMzqxJ/M9bMLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDno\nzcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMO\nejOzzJUV9JI2SHpK0lpJq1PbsZJWSHohPR6T2iXpFkmdkp6U1FbLF2BmZgdWyRH95yJiUkTsvXds\nB7AyIiYAK9M8wPnAhPQ3B7itWsWamVnl+jJ0Mw1YlKYXARcVtd8RBauAEZJG9+F5zMysD8oN+gCW\nS3pC0pzU1hwRr6TpV4HmND0G2Fi0bldq+wBJcyStlrS6u7v7IEo3M7NyDCmz319GxCZJo4AVkn5X\nvDAiQlJU8sQRsQBYANDe3l7RumZmVr6yjugjYlN63ALcA5wObN47JJMet6Tum4CxRau3pDYzM6uD\nkkEv6UhJR++dBs4BngaWAbNSt1nAfWl6GTAzXX0zGdhWNMRjZmb9rJyhm2bgHkl7+/8oIn4u6XHg\nbkmzgZeB
S1P/+4ELgE7gHeCKqldtZmZlKxn0EfEicFoP7VuBKT20B3BlVaozM7M+8zdjzcwy56A3\nM8ucg97MLHPlXkc/oLR2/Kym298w78Kabt/MrD/5iN7MLHMOejOzzDnozcwy56A3M8ucg97MLHMO\nejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzJUd9JIGS/qtpJ+m\n+fGSHpXUKWmJpMNS++FpvjMtb61N6WZmVo5KjuivBtYXzd8IzI+IjwBvALNT+2zgjdQ+P/UzM7M6\nKSvoJbUAFwLfT/MCPg8sTV0WARel6WlpnrR8SupvZmZ1UO4R/T8Bfw/sSfMjgTcjYlea7wLGpOkx\nwEaAtHxb6v8BkuZIWi1pdXd390GWb2ZmpZQMeklTgS0R8UQ1nzgiFkREe0S0NzU1VXPTZmZWpJx7\nxp4JfFHSBcAw4C+Am4ERkoako/YWYFPqvwkYC3RJGgIMB7ZWvXIzMytLySP6iPiHiGiJiFZgOvCr\niPhr4EHg4tRtFnBfml6W5knLfxURUdWqzcysbH25jv464FpJnRTG4Bem9oXAyNR+LdDRtxLNzKwv\nyhm62SciHgIeStMvAqf30GcHcEkVajMzsyrwN2PNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDcz\ny5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejN\nzDLnoDczy1zJoJc0TNJjktZJekbS9al9vKRHJXVKWiLpsNR+eJrvTMtba/sSzMzsQMo5on8P+HxE\nnAZMAs6TNBm4EZgfER8B3gBmp/6zgTdS+/zUz8zM6qRk0EfB22l2aPoL4PPA0tS+CLgoTU9L86Tl\nUySpahWbmVlFyhqjlzRY0lpgC7AC+D3wZkTsSl26gDFpegywESAt3waM7GGbcyStlrS6u7u7b6/C\nzMx6VVbQR8TuiJgEtACnAx/t6xNHxIKIaI+I9qampr5uzszMelHRVTcR8SbwIHAGMELSkLSoBdiU\npjcBYwHS8uHA1qpUa2ZmFSvnqpsmSSPS9BHA2cB6CoF/ceo2C7gvTS9L86Tlv4qIqGbRZmZWviGl\nuzAaWCRpMIV/GO6OiJ9KehZYLOkG4LfAwtR/IXCnpE7gdWB6Deo2M7MylQz6iHgS+HgP7S9SGK/f\nv30HcElVqjMzsz7zN2PNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejN\nzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy1w5NwcfK+lB\nSc9KekbS1an9WEkrJL2QHo9J7ZJ0i6ROSU9Kaqv1izAzs96Vc0S/C/i7iDgFmAxcKekUoANYGRET\ngJVpHuB8YEL6mwPcVvWqzcysbCWDPiJeiYg1afotYD0wBpgGLErdFgEXpelpwB1RsAoYIWl01Ss3\nM7OyDKmks6RW4OPAo0BzRLySFr0KNKfpMcDGotW6UtsrRW1ImkPhiJ/jjz++wrLNDk2tHT+r6fY3\nzLuwptu3+ij7ZKyko4B/Bb4REX8sXhYRAUQlTxwRCyKiPSLam5qaKlnVzMwqUFbQSxpKIeT/JSJ+\nkpo37x2SSY9bUvsmYGzR6i2pzczM6qCcq24ELATWR8RNRYuWAbPS9CzgvqL2menqm8nAtqIhHjMz\n62fljNGfCVwGPCVpbWr7b8A84G5Js4GXgUvTsvuBC4BO4B3giqpWbGZmFSkZ9BHxH4B6WTylh/4B\nXNnHuszMrEr8zVgzs8w56M3MMuegNzPLnIPezCxzDnozs8w56M3MMuegNzPLnIPezCxzDnozs8w5\n6M3MMuegNzPLnIPezCxzDnozs8w56M3MMuegNzPLXEU3Bzcz6wvf3Lw+fE
RvZpY5B72ZWebKuTn4\nDyRtkfR0UduxklZIeiE9HpPaJekWSZ2SnpTUVsvizcystHKO6G8HztuvrQNYGRETgJVpHuB8YEL6\nmwPcVp0yzczsYJUM+oj4NfD6fs3TgEVpehFwUVH7HVGwChghaXS1ijUzs8od7Bh9c0S8kqZfBZrT\n9BhgY1G/rtT2ZyTNkbRa0uru7u6DLMPMzErp88nYiAggDmK9BRHRHhHtTU1NfS3DzMx6cbBBv3nv\nkEx63JLaNwFji/q1pDYzM6uTgw36ZcCsND0LuK+ofWa6+mYysK1oiMfMzOqg5DdjJd0FfBY4TlIX\n8C1gHnC3pNnAy8Clqfv9wAVAJ/AOcEUNajYzswqUDPqImNHLoik99A3gyr4WZWZm1eNvxpqZZc5B\nb2aWOQe9mVnmHPRmZplz0JuZZc5Bb2aWOQe9mVnmHPRmZplz0JuZZc5Bb2aWOQe9mVnmSv7WjVlu\nWjt+VrNtb5h3Yc22bXawfERvZpY5B72ZWeYc9GZmmXPQm5llzkFvZpY5B72ZWeZqEvSSzpP0nKRO\nSR21eA4zMytP1YNe0mDgn4HzgVOAGZJOqfbzmJlZeWrxhanTgc6IeBFA0mJgGvBsDZ6rIdXyCzvg\nL+2Y1UqjfnYVEdXdoHQxcF5E/E2avwz4VETM3a/fHGBOmj0JeK6qhXzQccBrNdx+rbn++mnk2sH1\n11ut6x8XEU2lOtXtJxAiYgGwoD+eS9LqiGjvj+eqBddfP41cO7j+ehso9dfiZOwmYGzRfEtqMzOz\nOqhF0D8OTJA0XtJhwHRgWQ2ex8zMylD1oZuI2CVpLvALYDDwg4h4ptrPU6F+GSKqIddfP41cO7j+\nehsQ9Vf9ZKyZmQ0s/masmVnmHPRmZplz0JuZZc5Bb2aWuUMi6CXdUe8azPqDpNMlfTJNnyLpWkkX\n1Lsuq6/sbg4uaf9r9gV8TtIIgIj4Yv9XVR2SroiIH9a7jnJJ+ksKv330dEQsr3c95ZD0UWAM8GhE\nvF3Ufl5E/Lx+lZUm6VsUfkxwiKQVwKeAB4EOSR+PiP9V1wJLkPQpYH1E/FHSEUAH0Ebhd7L+MSK2\n1bXAEiRdBdwTERvrXcv+sru8UtIaCjvG94GgEPR3UfjiFhHx7/Wrrm8k/SEijq93Hb2R9FhEnJ6m\n/xa4ErgHOAf4t4iYV8/6Skkf1CuB9cAk4OqIuC8tWxMRbfWsrxRJT1Go+3DgVaClKDQfjYiJdS2w\nBEnPAKel7+IsAN4BlgJTUvuX6lpgCZK2AduB31PInB9HRHd9qyrI7ogeaAeuBv478M2IWCvp3UYJ\neElP9rYIaO7PWg7C0KLpOcDZEdEt6dvAKmBABz3wt8AnIuJtSa3AUkmtEXEzhfd/oNsVEbuBdyT9\nPiL+CBAR70raU+fayjEoInal6faif1j/Q9LaehVVgReBTwBfAP4KuF7SExRC/ycR8Va9Cssu6CNi\nDzBf0o/T42Ya63U2A+cCb+zXLuCR/i+nIoMkHUPh3I/2Hs1ExHZJuw686oAwaO9wTURskPRZCmE/\njsYI+vclfSgi3qEQOABIGg40QtA/XTQ8uU5Se0SslnQisLPexZUhUv4sB5ZLGkphKG0G8G2g5K9M\n1kojBWBFIqILuETShcAf611PBX4KHBURf3YEI+mh/i+nIsOBJyiEYkgaHRGvSDqKxgjKzZIm7X3v\n05H9VOAHwKn1La0sn46I92DfAc9eQ4FZ9SmpIn8D3Czpf1D4ad//J2kjsDEtG+g+sI9HxE4Kv/O1\nTNKH6lNSQXZj9Aci6ajiE2zWP9JO3hwRL9W7lgOR1EJh+OPVHpadGREP16GsqmikfV/SXwDjKRyI\ndkXE5jqXVBZJJ0bE8/WuoyeHWtAP6JOZpTTSh3V/jVw7ZFG/9/06qnf92Q3dSLq2t0XAUf1ZSw08\nCzTqh7WRa4cGqN/7/oBW1/qzC3rgH4
H/DfR08m/Af0GskT+sjVw7NH79eN+vq4Fcf45Bvwa4NyKe\n2H+BpEY4odPIH9ZGrh0av37v+/U1YOvPboxe0knA6z19UUFS80A/sSPpEeDrvXxYN0bE2B5WGxAa\nuXbIon7v+3U0kOvPLugbXSN/WBu5dmj8+htdo7//A7n+7II+fTnkH4CLgFEUfgZhC3AfMC8i3qxj\neWY1433fetMI416VupvCt0o/GxHHRsRI4HOp7e66VlYGScMlzZP0O0mvS9oqaX1qG1Hv+g6kkWuH\nxq8f7/t1NZDrzzHoWyPixuIvvUTEqxFxIzCujnWVq5E/rI1cOzR+/d7362vA1p/j0M1y4JfAor1j\nYpKagcsp/MjWF+pYXkmSnouIkypdNhA0cu2QRf3e9+toINef4xH9XwEjgX+X9Iak14GHgGOBS+tZ\nWJlelvT36QMKFD6skq6j8JsfA1kj1w6NX7/3/foasPVnF/QR8QbwQ2AuMDb9F+rkiLiOwk0wBrpG\n/rA2cu3Q4PV736+7AVt/jkM3DX3zCNh3l6MWYFU03l2OGrZ2aOz6ve/X34CtPyKy+gOeovAzvwCt\nwGoKOzzAb+tdXxn1XwU8B9wLbACmFS1bU+/6cq09k/q977v+Hv9y/AmERr95RCPf5aiRa4fGr9/7\nfn0N2PpzDPpGv3lEI39YG7l2aPz6ve/X14CtP7uTscBMCjdG3icidkXETODT9SmpIpslTdo7k3ac\nqcBxDPwPayPXDo1fv/f9+hqw9Wd3MrbRqYHvctTItUPj19/oGv39H8j1O+jNzDKX49CNmZkVcdCb\nmWXOQW+HJEm7Ja2V9IykdZL+TtIBPw+SWiV9pb9qNKsWB70dqt6NiEkR8THgbOB84Fsl1mkFHPTW\ncHwy1g5Jkt6OiKOK5k8AHqdwKdw44E7gyLR4bkQ8ImkVcDLwErAIuAWYB3wWOBz454j4br+9CLMy\nOejtkLR/0Ke2N4GTgLeAPRGxQ9IE4K6IaE9fgPmvETE19Z8DjIqIGyQdDjwMXBIRL/XrizErIcdv\nxpr11VDg1vTll93Aib30OweYKOniND8cmEDhiN9swHDQm7Fv6GY3hXusfgvYDJxG4TzWjt5WA74e\nEb/olyLNDpJPxtohT1IT8H+BW6MwljkceCUi9gCXAYNT17eAo4tW/QXwXyQNTds5UdKRmA0wPqK3\nQ9URktZSGKbZReHk601p2f8B/lXSTODnwPbU/iSwW9I64HbgZgpX4qyRJKAbuKi/XoBZuXwy1sws\ncx66MTPLnIPezCxzDnozs8w56M3MMuegNzPLnIPezCxzDnozs8z9f8zGHY6Yb9aNAAAAAElFTkSu\nQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f48069bd7b8>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"myChart.plot.bar()"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Title\n",
"\n",
"Here I can add some comments on the chart.\n",
"1. First point\n",
"2. Second point"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Lang Cleaning tools"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"detect_lang(\"Ceci est une phrase en français.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"detect_lang(\"This is an english sentence.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"chart(docs, \"language_iso2\").plot.bar()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"Counter({'de': 13,\n",
" 'en': 1547,\n",
" 'es': 5,\n",
" 'fi': 1,\n",
" 'fr': 4,\n",
" 'hu': 1,\n",
" 'it': 1,\n",
" 'ja': 5,\n",
" 'ko': 1,\n",
" 'ru': 3,\n",
" 'zh': 23})"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Counter([doc.hyperdata[\"language_iso2\"] for doc in docs])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Count the documents whose language is not the majority one (the actual deletion is commented out below)\n",
"def cleanCorpusWithLang(corpus_id, lang):\n",
" return (session.query(Node.id).filter(Node.parent_id == corpus_id)\n",
" .filter(Node.hyperdata[\"language_iso2\"].astext != lang)\n",
" .count()\n",
" #.delete()\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"57"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cleanCorpusWithLang(corpus_id, 'en')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(True, 'This is an english paragraph.\\n '),\n",
" (False, '\"This is an english paragraph.\\n\\nThis is an english paragraph.\\n ')]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"abstract0 = \"\"\"\"Ceci est un paragraphe en français.\n",
"\n",
"This is an english paragraph.\n",
" \"\"\"\n",
"\n",
"abstract1 = \"\"\"\"This is an english paragraph.\n",
"\n",
"This is an english paragraph.\n",
" \"\"\"\n",
"\n",
"def clean_lang_inText(lang, text):\n",
" \n",
" texts_before = nltk.tokenize.blankline_tokenize(text)\n",
" texts_after = '\\n\\n'.join([sentence \n",
" for sentence in texts_before\n",
" if detect_lang(sentence) == lang\n",
" ])\n",
" \n",
" return (len(texts_before) != len(nltk.tokenize.blankline_tokenize(texts_after)), texts_after)\n",
"\n",
"[clean_lang_inText('en', abstract) for abstract in [abstract0, abstract1]]\n",
"\n",
"# TODO update each document accordingly"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# TODO update all the abstracts with that function"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Measures IMT Tools"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"ename": "ConnectionError",
"evalue": "HTTPSConnectionPool(host='api.archives-ouvertes.fr', port=443): Max retries exceeded with url: /search?wt=json&q=machine+learning+AND+deep&fl=+title_s%0A+++++++++++++++%2C+abstract_s%0A+++++++++++++++%2C+submittedDate_s%0A+++++++++++++++%2C+journalDate_s%0A+++++++++++++++%2C+authFullName_s%0A+++++++++++++++%2C+uri_s%0A+++++++++++++++%2C+isbn_s%0A+++++++++++++++%2C+issue_s%0A+++++++++++++++%2C+docType_s%0A+++++++++++++++%2C+journalPublisher_s%0A+++++++++++++&start=1&rows=10 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mgaierror\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 141\u001b[0m conn = connection.create_connection(\n\u001b[0;32m--> 142\u001b[0;31m (self.host, self.port), self.timeout, **extra_kw)\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0merr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 67\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetaddrinfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mport\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSOCK_STREAM\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 68\u001b[0m \u001b[0maf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msocktype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcanonname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msa\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib/python3.5/socket.py\u001b[0m in \u001b[0;36mgetaddrinfo\u001b[0;34m(host, port, family, type, proto, flags)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0maddrlist\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 733\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32min\u001b[0m \u001b[0m_socket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetaddrinfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mport\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfamily\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 734\u001b[0m \u001b[0maf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msocktype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcanonname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msa\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mgaierror\u001b[0m: [Errno -3] Temporary failure in name resolution",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mNewConnectionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 577\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 578\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 579\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 350\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 351\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 352\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_validate_conn\u001b[0;34m(self, conn)\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sock'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# AppEngine might not have `.sock`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 814\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 815\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;31m# Add certificate verification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 150\u001b[0m raise NewConnectionError(\n\u001b[0;32m--> 151\u001b[0;31m self, \"Failed to establish a new connection: %s\" % e)\n\u001b[0m\u001b[1;32m 152\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNewConnectionError\u001b[0m: <requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mMaxRetryError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 402\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 403\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 404\u001b[0m )\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 622\u001b[0m retries = retries.increment(method, url, error=e, _pool=self,\n\u001b[0;32m--> 623\u001b[0;31m _stacktrace=sys.exc_info()[2])\n\u001b[0m\u001b[1;32m 624\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/util/retry.py\u001b[0m in \u001b[0;36mincrement\u001b[0;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnew_retry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_exhausted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 281\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mMaxRetryError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mResponseError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcause\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 282\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='api.archives-ouvertes.fr', port=443): Max retries exceeded with url: /search?wt=json&q=machine+learning+AND+deep&fl=+title_s%0A+++++++++++++++%2C+abstract_s%0A+++++++++++++++%2C+submittedDate_s%0A+++++++++++++++%2C+journalDate_s%0A+++++++++++++++%2C+authFullName_s%0A+++++++++++++++%2C+uri_s%0A+++++++++++++++%2C+isbn_s%0A+++++++++++++++%2C+issue_s%0A+++++++++++++++%2C+docType_s%0A+++++++++++++++%2C+journalPublisher_s%0A+++++++++++++&start=1&rows=10 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mConnectionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-16-b220cbbc8ecc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscan_hal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"machine learning AND deep\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/srv/gargantext/gargantext_notebook.py\u001b[0m in \u001b[0;36mscan_hal\u001b[0;34m(request)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mscan_hal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mhal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mHalCrawler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 62\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mhal\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscan_results\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 63\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mscan_gargantext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlang\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/srv/gargantext/gargantext/util/crawlers/HAL.py\u001b[0m in \u001b[0;36mscan_results\u001b[0;34m(self, query)\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresults_nb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 92\u001b[0;31m total = ( self._get(query)\n\u001b[0m\u001b[1;32m 93\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"response\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"numFound\"\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/srv/gargantext/gargantext/util/crawlers/HAL.py\u001b[0m in \u001b[0;36m_get\u001b[0;34m(self, query, fromPage, count, lang)\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mURL\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 71\u001b[0;31m \u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquerystring\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 72\u001b[0m )\n\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 57\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 473\u001b[0m }\n\u001b[1;32m 474\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 477\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 583\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 585\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 586\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mProxyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 466\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 467\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 468\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mClosedPoolError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mConnectionError\u001b[0m: HTTPSConnectionPool(host='api.archives-ouvertes.fr', port=443): Max retries exceeded with url: /search?wt=json&q=machine+learning+AND+deep&fl=+title_s%0A+++++++++++++++%2C+abstract_s%0A+++++++++++++++%2C+submittedDate_s%0A+++++++++++++++%2C+journalDate_s%0A+++++++++++++++%2C+authFullName_s%0A+++++++++++++++%2C+uri_s%0A+++++++++++++++%2C+isbn_s%0A+++++++++++++++%2C+issue_s%0A+++++++++++++++%2C+docType_s%0A+++++++++++++++%2C+journalPublisher_s%0A+++++++++++++&start=1&rows=10 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))"
]
}
],
"source": [
"scan_hal(\"machine learning AND deep\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Request syntax\n",
"# \"network analysis\" = network <-> analysis\n",
"# \"network OR analysis\" = network | analysis\n",
"# \"network AND analysis\" = network & analysis\n",
"\n",
"scan_gargantext(corpus_id, 'english', \"machine | learning & deep\")\n",
"\n",
"# \"network NOT analysis\" = @@ to_tsquery('network') !! to_tsquery('analysis')\n",
"# (the function needs to be changed if NOT has to be used)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Strengths / weaknesses of the IMT\n",
"# Hal Query Gargantext Query\n",
"queries = [ (\"network analysis\" , \"network <-> analysis\" )\n",
" , (\"big data AND something\" , \"(big <-> data) & something\")\n",
" ]\n",
"[(query[0], query[1]) for query in queries]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"def imt_vs_hal(corpus_id, queryHal, queryGarg):\n",
" return((scan_gargantext(corpus_id, 'english', queryGarg), scan_hal(queryHal)))\n",
" #return((scan_gargantext(corpus_id, 'english', queryGarg) *100 / scan_hal(queryHal)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Then chart it to see your strengths and weaknesses!\n",
"[imt_vs_hal(corpus_id, query[0], query[1]) for query in queries]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Graph generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# TODO Cooccurrences optimization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# TODO optimize the distributional distance"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# List Management"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Front end: add a checkbox to merge with or to overwrite the previous list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# optimize the list merge"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3rc1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Advanced Gargantext Tutorial (Python)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0, '/srv/gargantext')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# First import the library Gargantext Notebook\n",
"from gargantext_notebook import *\n",
"\n",
"# This enables drawing graphics inline later\n",
"%matplotlib inline "
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"source": [
"# Phylomemies"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Instantiate the corpus you are working on"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"My corpus id is : 302695.\n"
]
}
],
"source": [
"corpus_url = \"http://localhost:8000/projects/302694/corpora/302695/\"\n",
"corpus_id = corpus_url.split(\"/\")[6]\n",
"print(\"My corpus id is : %s.\" % corpus_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Getting the Map Terms "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(21, 'environment'), (42, 'development'), (184, 'examples'), (196, 'water'), (368, 'problem'), (576, 'work'), (654, 'technology'), (712, 'number'), (738, 'operation'), (817, 'experiments')]\n"
]
}
],
"source": [
"from gargantext.models import *\n",
"import csv\n",
"\n",
"map_id = session.query(MaplistNode.id).filter(MaplistNode.parent_id == corpus_id).first()\n",
"\n",
"mapTerms = (session.query(Ngram).join( NodeNgram, NodeNgram.ngram_id == Ngram.id)\n",
" .filter(NodeNgram.node_id == map_id)\n",
" .all()\n",
" )\n",
"\n",
"print([(m.id, m.terms) for m in mapTerms[:10]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save in CSV File"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"toPrint = [(m.id,m.terms) for m in mapTerms]\n",
"csvfile = \"./MapTerms.csv\"\n",
"\n",
"#Assuming res is a flat list\n",
"with open(csvfile, \"w\") as output:\n",
" writer = csv.writer(output, lineterminator='\\n')\n",
" for val in toPrint:\n",
" writer.writerow([val])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Next:\n",
"# You can access your CSV file in the home directory of your notebook!\n",
"# Click, rename, mv, delete in your Notebook\n",
"\n",
"#Assuming output is a list of lists\n",
"#with open(csvfile, \"w\") as output:\n",
"# writer = csv.writer(output, lineterminator='\\n')\n",
"# writer.writerows(res)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Occurrences of MapTerms by Year"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from gargantext.util.toolchain.metric_tfidf import compute_occs\n",
"\n",
"corpus= session.query(CorpusNode).get(corpus_id)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'abstract': 'The purpose of this paper is to develop a new fuzzy dynamic programming approach for solving hybrid multiobjective multistage decision-making problems. We first present a methodology of fuzzy evaluation and fuzzy optimization for hybrid multiobjective systems, in which the qualitative and quantitative objectives are synthetically considered. The qualitative objectives are evaluated by decision-makers with linguistic variables and the quantitative objectives are converted into proper dimensionless indices. After getting the marginal evaluations for each objective, a new aggregation method based on the principle of fuzzy pattern recognition is developed to get a global evaluation for all objectives. With the global evaluation obtained, a fuzzy optimization process is performed. Then we present a dynamic optimization algorithm by incorporating the fuzzy optimization process with the conventional dynamic programming technique to solve hybrid multiobjective multistage decision-making problems. A characteristic feature of the approach proposed is that various objectives are synthetically considered by the fuzzy systematic technique instead of the frequently employed weighted average method. Finally, an illustrative example is also given to clarify the developed approach and to demonstrate its effectiveness.',\n",
" 'authors': 'Lushu Li, K.K. Lai',\n",
" 'authorsRAW': [{'affiliations': ['Faculty of Administration, University of New Brunswick, Fredericton, N.B., Canada',\n",
" 'Corresponding author'],\n",
" 'name': 'Lushu Li'},\n",
" {'affiliations': ['Department of Management Science, City University of Hong Kong, Tat Chee Avenue, Kowloon, Hong Kong'],\n",
" 'name': 'K.K. Lai'}],\n",
" 'doi': '10.1016/S0165-0114(98)00423-0',\n",
" 'genre': ['research-article'],\n",
" 'id': '5E6CB638271D0121DB653AB9150D2F025346816A',\n",
" 'language_iso2': 'en',\n",
" 'language_iso3': 'eng',\n",
" 'language_name': 'English',\n",
" 'publication_date': '2001-01-01 00:00:00+00:00',\n",
" 'publication_day': 1,\n",
" 'publication_hour': 0,\n",
" 'publication_minute': 0,\n",
" 'publication_month': 1,\n",
" 'publication_second': 0,\n",
" 'publication_year': 2001,\n",
" 'source': 'Fuzzy Sets and Systems',\n",
" 'statuses': [],\n",
" 'title': 'Fuzzy dynamic programming approach to hybrid multiobjective multistage decision-making problems'}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get all the documents:\n",
"docs = documents(corpus_id)\n",
"docs[0].hyperdata"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1954, 2), (1956, 1), (1957, 1), (1958, 5), (1960, 3), (1961, 5), (1962, 2), (1963, 11), (1964, 5), (1965, 3), (1966, 1), (1967, 8), (1968, 17), (1969, 10), (1970, 8), (1971, 20), (1972, 12), (1973, 20), (1974, 16), (1975, 17), (1976, 8), (1977, 10), (1978, 14), (1979, 16), (1980, 28), (1981, 12), (1982, 14), (1983, 15), (1984, 19), (1985, 22), (1986, 27), (1987, 28), (1988, 24), (1989, 20), (1990, 26), (1991, 54), (1992, 48), (1993, 40), (1994, 40), (1995, 28), (1996, 32), (1997, 34), (1998, 30), (1999, 25), (2000, 37), (2001, 29), (2002, 13), (2003, 19), (2004, 17), (2005, 21), (2006, 17), (2007, 11), (2008, 10), (2009, 8), (2010, 9), (2011, 9), (2012, 12), (2013, 7)]\n"
]
}
],
"source": [
"pubsByYear = countByField(docs, \"publication_year\")\n",
"\n",
"print(pubsByYear)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1954, 1956, 1957, 1958, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013]\n"
]
}
],
"source": [
"years = [y for y in map(lambda x: x[0], pubsByYear)]\n",
"print(years)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# To Add the groups you need to get the Node\n",
"group_id = session.query(GrouplistNode.id).filter(GrouplistNode.parent_id == corpus_id).first()\n",
"\n",
"occByYear = list()\n",
"\n",
"# Not optimized yet, since one SQL request is launched for each year.\n",
"# We will use a GROUP BY if needed, depending on the size of the corpus.\n",
"# Clarity of the computation comes first here;\n",
"# optimization will be the next step.\n",
"for year in years:\n",
" listNgramOcc = compute_occs(corpus, groupings_id=group_id, year=year, interactiv=True)\n",
" listYearNgramOcc = [(year, ngram_id, occ) for (ngram_id, occ) in listNgramOcc]\n",
" occByYear.append(listYearNgramOcc)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[(1954, 5249, 1.0), (1954, 5366, 1.0), (1954, 7019, 1.0), (1954, 10524, 1.0), (1954, 121362, 1.0), (1954, 505775, 1.0)], [(1956, 7019, 1.0), (1956, 8604, 1.0), (1956, 755610, 1.0), (1956, 2361839, 1.0)]]\n"
]
}
],
"source": [
"\n",
"# Saving the results in file\n",
"toPrint = [(m.id,m.terms) for m in mapTerms]\n",
"csvfile = \"./MapTerms.csv\"\n",
"\n",
"#Assuming res is a flat list\n",
"with open(csvfile, \"w\") as output:\n",
" writer = csv.writer(output, lineterminator='\\n')\n",
" for val in toPrint:\n",
" writer.writerow([val])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"mapTermsOcc = (session.query(Occurrences).join( MapTerms, MapTerms.ngram_id == Occurrences.ngram_id)\n",
" .filter(MapTerms.node_id == map_id)\n",
" \n",
" .join(Documents, Documents.id == Occurrences.node2_id)\n",
" .filter(Documents.parent_id == corpus_id)\n",
" \n",
" .filter(Occurrences.node1_id == occ_id)\n",
" \n",
" #.group_by(Occurrences.ngram_id)\n",
" .all()\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(303698)"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"group_id"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mapTermsOcc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cooccurrences of MapTerms by Year"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from graph.cooccurrences import countCooccurrences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
" (cooc_id, cooc_matrix) = countCooccurrences( corpus_id=corpus_id, cooc_id= \n",
" , field1=field1, field2=field2 \n",
" , start=start , end =end \n",
" , mapList_id=mapList_id , groupList_id=groupList_id \n",
" , isMonopartite=True , threshold = threshold \n",
" , distance=distance , bridgeness=bridgeness \n",
" , save_on_db = True , reset = reset \n",
" ) "
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GRAPH #303869 Filtering the matrix with Map and Group Lists.\n",
"WeightedMatrix bulk_insert start\n",
"WeightedMatrix bulk_insert stop\n",
"GRAPH #303869 ... Node Cooccurrence Matrix saved\n",
"GRAPH #303869 ... Parameters saved in Node.\n"
]
}
],
"source": [
"#countCooccurrences(corpus_id, save_on_db=False, start=\"2000-01-01\", end=\"2017-12-31\")\n",
"(cooc_id, cooc_matrix) = countCooccurrences( corpus_id = corpus_id\n",
" , cooc_id = None\n",
" , field1=\"ngrams\", field2 = \"ngrams\"\n",
" \n",
" , mapList_id = map_id\n",
" , groupList_id = group_id\n",
" \n",
" , isMonopartite =True , threshold = 2 \n",
" #, distance =Non , bridgeness=bridgeness\n",
" \n",
" , save_on_db = True\n",
" , reset = True\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(float, {})"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cooc_matrix.items"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Number of Documents per year"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Date DateValue\n",
"Date \n",
"1954 1954 2\n",
"1956 1956 1\n",
"1957 1957 1\n",
"1958 1958 5\n",
"1960 1960 3\n",
"1961 1961 5\n",
"1962 1962 2\n",
"1963 1963 11\n",
"1964 1964 5\n",
"1965 1965 3\n"
]
}
],
"source": [
"# To get all the documents:\n",
"docs = documents(corpus_id)\n",
"# If I want to count:\n",
"myChart = chart(docs, \"publication_year\")\n",
"print(myChart[:10])"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"source": [
"# Other examples"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"139\n",
"LSTM 1000\n",
"Downloading page 0 to 100 results\n",
"Downloading page 100 to 100 results\n",
"CORPUS #303703\n",
"PARSING\n",
"Loading available PARSERS:\n",
"\t- EuropresseParser\n",
"\t- RISParser\n",
"\t- PubmedParser\n",
"\t- RISParser\n",
"\t- ISIParser\n",
"\t- RISParser\n",
"\t- CSVParser\n",
"\t- ISTexParser\n",
"\t- CernParser\n",
"\t- MultivacParser\n",
"\t- HalParser\n",
"\t- IsidoreParser\n",
"0 docs skipped\n",
"139 parsed\n",
"#MAIN language of the CORPUS __unknown__\n",
"CORPUS #303703: parsed 139\n",
"#TAGGERS LOADED: {'__unknown__': <gargantext.util.taggers.NltkTagger.NltkTagger object at 0x7f03064496a0>}\n",
"#SUPPORTED TAGGER LANGS ['__unknown__']\n",
"INTEGRATE\n",
"INTEGRATE\n",
"INTEGRATE\n",
"CORPUS #303703: extracted ngrams\n",
"CORPUS #303703: indexed hyperdata\n",
"CORPUS #303703: [2017-10-10_09:34:23] new favorites node #303843\n",
"CORPUS #303703: [2017-10-10_09:34:23] starting ngram lists computation\n",
"CORPUS #303703: [2017-10-10_09:34:24] new stoplist node #303844\n",
"# STEMMERS LOADED {'__unknown__': <nltk.stem.snowball.SnowballStemmer object at 0x7f0306497cf8>}\n",
"#SUPPORTED STEMMERS LANGS []\n",
"CORPUS #303703: [2017-10-10_09:34:25] new grouplist node #303845\n",
"CORPUS #303703: [2017-10-10_09:34:25] new occs node #303846\n",
"compute_ti_ranking\n",
"2017-10-10_09:34:25 : Starting Query tf_nd_query\n",
"2017-10-10_09:34:26 : End Query tf_nd_quer\n",
"2017-10-10_09:34:26 : tfidfsum\n",
"CORPUS #303703: [2017-10-10_09:34:26] new ti ranking node #303847\n",
"MAINLIST: keeping 3295 ngrams out of 4393\n",
"CORPUS #303703: [2017-10-10_09:34:26] new mainlist node #303848\n",
"Compute TFIDF local\n",
"CORPUS #303703: [2017-10-10_09:34:26] new localtfidf node #303849\n",
"COOCS: NEW matrix shape [215x361]\n",
"CORPUS #303703: [2017-10-10_09:34:32] computed mainlist coocs for specif rank\n",
"SPECIFICITY: computing on 209 ngrams\n",
"CORPUS #303703: [2017-10-10_09:34:32] new spec-clusion node #303853\n",
"CORPUS #303703: [2017-10-10_09:34:32] new gen-clusion node #303854\n",
"MAPLIST quotas: {'topgen': {'multigrams': 168, 'monograms': 42}, 'topspec': {'multigrams': 112, 'monograms': 28}}\n",
"MAPLIST: top_spec_monograms = 28\n",
"MAPLIST: top_spec_multigrams = 55\n",
"MAPLIST: top_gen_monograms = 42\n",
"MAPLIST: top_gen_multigrams = 0\n",
"MAPLIST: kept 125 ngrams in total \n",
"CORPUS #303703: [2017-10-10_09:34:32] new maplist node #303855\n",
"CORPUS #303703: [2017-10-10_09:34:32] FINISHED ngram lists computation\n"
]
}
],
"source": [
"#project = myProject_fromUrl(\"http://imt.gargantext.org/projects/300535\")\n",
"project = myProject_fromUrl(\"http://localhost:8000/projects/301096\")\n",
"corpus = newCorpus(project, source=\"hal\", name=\"Machine learning\", query=\"LSTM\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"6"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# search full text (english by default) in the corpus\n",
"scan_gargantext(corpus.id, \"machine | learning & deep\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# search full text (english by default) in the corpus\n",
"scan_gargantext(corpus.id, \"machine | learning & deep\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"6"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# search full text (english by default) and DELETE in the corpus\n",
"scan_gargantext_and_delete(corpus.id, \"machine | learning & deep\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"for doc in new_docs:\n",
" new_doc = (Node( user_id = project.user_id\n",
" , parent_id = corpus.id\n",
" , typename= 'DOCUMENT'\n",
" , name=doc[\"title\"][:50]\n",
" , hyperdata=doc)\n",
" )\n",
" session.add(new_doc)\n",
"session.commit()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L'identifiant du corpus est : 254749\n"
]
}
],
"source": [
"# Copy/paste the URL of the corpus you want to work on (including http://)\n",
"corpus_url = \"http://gargantext.org/projects/251737/corpora/254749\"\n",
"\n",
"corpus_id = corpus_url.split(\"/\")[6]\n",
"\n",
"print(\"L\\'identifiant du corpus est : %s\" % corpus_id)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# To get all the documents:\n",
"docs = documents(corpus_id)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"'Towards big data science in the decade ahead from ten years of InCoB and the 1st ISCB-Asia Joint Conference.'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the title of the first document \n",
"# [0] indicates the index of the first document\n",
"docs[0].hyperdata['title']"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"\"The 2011 International Conference on Bioinformatics (InCoB) conference, which is the annual scientific conference of the Asia-Pacific Bioinformatics Network (APBioNet), is hosted by Kuala Lumpur, Malaysia, is co-organized with the first ISCB-Asia conference of the International Society for Computational Biology (ISCB). InCoB and the sequencing of the human genome are both celebrating their tenth anniversaries and InCoB's goalposts for the next decade, implementing standards in bioinformatics and globally distributed computational networks, will be discussed and adopted at this conference. Of the 49 manuscripts (selected from 104 submissions) accepted to BMC Genomics and BMC Bioinformatics conference supplements, 24 are featured in this issue, covering software tools, genome/proteome analysis, systems biology (networks, pathways, bioimaging) and drug discovery and design.\""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the abstract of the first document (0)\n",
"docs[0].hyperdata['abstract']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"'Shoba Ranganathan, Christian Schönbach, Janet Kelso, Burkhard Rost, Sheila Nathan, Tin Wee Tan'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the authors of the first document (0)\n",
"docs[0].hyperdata['authors']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"'BMC bioinformatics'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the source of the first document (0)\n",
"docs[0].hyperdata['source']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# If I want to count:\n",
"myChart = chart(docs, \"publication_year\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f48069c5208>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEZCAYAAACZwO5kAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGl1JREFUeJzt3X+QVOWd7/H3hx+KURcUhymKQQYTNJorksnEYLnmF/E3\nFaxEXciWoMUuNzcSje7NOnvvrZuyyruFdVOyGvd6Q0IiWhvBsFHZRBMI0U1WLyoS8BdRJ4phKIUR\nlSiK8uN7/+gH0pIZupvpnp5++Lyqpvqc5zzn9Le7Tn84POd0H0UEZmaWr0H1LsDMzGrLQW9mljkH\nvZlZ5hz0ZmaZc9CbmWXOQW9mlrmygl7SNZKekfS0pLskDZM0XtKjkjolLZF0WOp7eJrvTMtba/kC\nzMzswEoGvaQxwFVAe0T8J2AwMB24EZgfER8B3gBmp1VmA2+k9vmpn5mZ1Um5QzdDgCMkDQE+BLwC\nfB5YmpYvAi5K09PSPGn5FEmqTrlmZlapIaU6RMQmSd8G/gC8CywHngDejIhdqVsXMCZNjwE2pnV3\nSdoGjARe6+05jjvuuGhtbT3Y12Bmdkh64oknXouIplL9Sga9pGMoHKWPB94Efgyc19cCJc0B5gAc\nf/zxrF69uq+bNDM7pEh6uZx+5QzdfAF4KSK6I2In8BPgTGBEGsoBaAE2pelNwNhUxBBgOLB1/41G\nxIKIaI+I9qamkv8gmZnZQSon6P8ATJb0oTTWPgV4FngQuDj1mQXcl6aXpXnS8l+FfznNzKxuSgZ9\nRDxK4aTqGuCptM4C4DrgWkmdFMbgF6ZVFgIjU/u1QEcN6jYzszJpIBxst7e3h8fozRrbzp076erq\nYseOHfUuJTvDhg2jpaWFoUOHfqBd0hMR0V5q/ZInY83MytHV1cXRRx9Na2srvqK6eiKCrVu30tXV\nxfjx4w9qG/4JBDOrih07djBy5EiHfJVJYuTIkX36n5KD3syqxiFfG319Xx30ZmaZ8xi9Vay142c1\n3f6GeRfWdPvWP6q9n5SzXwwePJhTTz2VnTt3MmTIEGbOnMk111zDoEG9H9Nu2LCBRx55hK985SsH\n3PYJJ5zAAw88wEknnbSv7Rvf+AajR4/muuuu63XbU6dO5emnny5Zey35iN7MsnHEEUewdu1annnm\nGVasWMEDDzzA9ddff8B1NmzYwI9+9KOS254+fTqLFy/eN79nzx6WLl3K9OnT+1x3rTnozSxLo0aN\nYsGCBdx6661EBBs2bOCss86ira2NtrY2HnnkEQA6Ojr4zW9+w6RJk5g/fz67d+/mm9/8Jp/85CeZ\nOHEi3/3udwGYMWMGS5Ys2bf9X//614wbN45x48b1uu1it99+O3Pnzt03P3XqVB566CEAli9fzhln\nnEFbWxuXXHIJb7/9dlXfCwe9mWXrhBNOYPfu3WzZsoVRo0axYsUK1qxZw5IlS7jqqqsAmDdvHmed\ndRZr167lmmuuYeHChQwfPpzHH3+cxx9/nO9973u89NJLnHrqqQwaNIh169YBsHjxYmbMmAHQ67bL\n8dprr3HDDTfwy1/+kjVr1tDe3s5NN91U1ffBY/RmdkjYuXMnc+fOZe3atQwePJjnn3++x37Lly/n\nySefZOnSwq+wb9u2jRdeeIHx48czY8YMFi9ezMc+9jHuvffefcNC5W67J6tWreLZZ5/lzDPPBOD9\n99/njDPO6OOr/SAHvZll68UXX2Tw4MGMGjWK66+/nubmZtatW8eePXsYNmxYj+tEBN/5znc499xz\n/2zZ9OnTOeecc/jMZz7DxIkTaW5uBmD+/Pkltz1kyBD27Nmzb37vdfERwdlnn81dd91VjZfcIw/d\nmFmWuru7+epXv8rcuXORxLZt2xg9ejSDBg3izjvvZPfu3QAcffTRvPXWW/vWO/fcc7ntttvYuXMn\nAM8//zzbt28H4MMf/jDHHXccHR0d+4ZtgF63
Xay1tZW1a9eyZ88eNm7cyGOPPQbA5MmTefjhh+ns\n7ARg+/btFf2PoBw+ojezmqjHZbLvvvsukyZN2nd55WWXXca1114LwNe+9jW+/OUvc8cdd3Deeedx\n5JFHAjBx4kQGDx7MaaedxuWXX87VV1/Nhg0baGtrIyJoamri3nvv3fccM2bMoKOjgy996Uv72nrb\ndrEzzzyT8ePHc8opp3DyySfT1tYGQFNTE7fffjszZszgvffeA+CGG27gxBNPrNr74h81s4r5Onrr\nyfr16zn55JPrXUa2enp/y/1RMw/dmJllzkFvZpY5B72ZVc1AGArOUV/fVwe9mVXFsGHD2Lp1q8O+\nyvb+Hn1vl4OWw1fdmFlVtLS00NXVRXd3d71Lyc7eO0wdrJJBL+kkYElR0wnA/wTuSO2twAbg0oh4\nI91A/GbgAuAd4PKIWHPQFZpZQxg6dOhB3wHJaqucm4M/FxGTImIS8AkK4X0PhZt+r4yICcBK/nQT\n8POBCelvDnBbLQo3M7PyVDpGPwX4fUS8DEwDFqX2RcBFaXoacEcUrAJGSBpdlWrNzKxilQb9dGDv\nDzI0R8QrafpVoDlNjwE2Fq3TldrMzKwOyg56SYcBXwR+vP+yKJxmr+hUu6Q5klZLWu2TN2ZmtVPJ\nEf35wJqI2JzmN+8dkkmPW1L7JmBs0Xotqe0DImJBRLRHRHtTU1PllZuZWVkqCfoZ/GnYBmAZMCtN\nzwLuK2qfqYLJwLaiIR4zM+tnZV1HL+lI4GzgPxc1zwPuljQbeBm4NLXfT+HSyk4KV+hcUbVqzcys\nYmUFfURsB0bu17aVwlU4+/cN4MqqVGdmZn3mn0AwM8ucg97MLHMOejOzzDnozcwy56A3M8ucg97M\nLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3\nM8ucg97MLHMOejOzzJUV9JJGSFoq6XeS1ks6Q9KxklZIeiE9HpP6StItkjolPSmprbYvwczMDqTc\nI/qbgZ9HxEeB04D1QAewMiImACvTPMD5wIT0Nwe4raoVm5lZRUoGvaThwKeBhQAR8X5EvAlMAxal\nbouAi9L0NOCOKFgFjJA0uuqVm5lZWco5oh8PdAM/lPRbSd+XdCTQHBGvpD6vAs1pegywsWj9rtT2\nAZLmSFotaXV3d/fBvwIzMzugcoJ+CNAG3BYRHwe286dhGgAiIoCo5IkjYkFEtEdEe1NTUyWrmplZ\nBcoJ+i6gKyIeTfNLKQT/5r1DMulxS1q+CRhbtH5LajMzszooGfQR8SqwUdJJqWkK8CywDJiV2mYB\n96XpZcDMdPXNZGBb0RCPmZn1syFl9vs68C+SDgNeBK6g8I/E3ZJmAy8Dl6a+9wMXAJ3AO6mvmZnV\nSVlBHxFrgfYeFk3poW8AV/axLjMzqxJ/M9bMLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDno\nzcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMO\nejOzzJUV9JI2SHpK0lpJq1PbsZJWSHohPR6T2iXpFkmdkp6U1FbLF2BmZgdWyRH95yJiUkTsvXds\nB7AyIiYAK9M8wPnAhPQ3B7itWsWamVnl+jJ0Mw1YlKYXARcVtd8RBauAEZJG9+F5zMysD8oN+gCW\nS3pC0pzU1hwRr6TpV4HmND0G2Fi0bldq+wBJcyStlrS6u7v7IEo3M7NyDCmz319GxCZJo4AVkn5X\nvDAiQlJU8sQRsQBYANDe3l7RumZmVr6yjugjYlN63ALcA5wObN47JJMet6Tum4CxRau3pDYzM6uD\nkkEv6UhJR++dBs4BngaWAbNSt1nAfWl6GTAzXX0zGdhWNMRjZmb9rJyhm2bgHkl7+/8oIn4u6XHg\nbkmzgZeB
S1P/+4ELgE7gHeCKqldtZmZlKxn0EfEicFoP7VuBKT20B3BlVaozM7M+8zdjzcwy56A3\nM8ucg97MLHPlXkc/oLR2/Kym298w78Kabt/MrD/5iN7MLHMOejOzzDnozcwy56A3M8ucg97MLHMO\nejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzJUd9JIGS/qtpJ+m\n+fGSHpXUKWmJpMNS++FpvjMtb61N6WZmVo5KjuivBtYXzd8IzI+IjwBvALNT+2zgjdQ+P/UzM7M6\nKSvoJbUAFwLfT/MCPg8sTV0WARel6WlpnrR8SupvZmZ1UO4R/T8Bfw/sSfMjgTcjYlea7wLGpOkx\nwEaAtHxb6v8BkuZIWi1pdXd390GWb2ZmpZQMeklTgS0R8UQ1nzgiFkREe0S0NzU1VXPTZmZWpJx7\nxp4JfFHSBcAw4C+Am4ERkoako/YWYFPqvwkYC3RJGgIMB7ZWvXIzMytLySP6iPiHiGiJiFZgOvCr\niPhr4EHg4tRtFnBfml6W5knLfxURUdWqzcysbH25jv464FpJnRTG4Bem9oXAyNR+LdDRtxLNzKwv\nyhm62SciHgIeStMvAqf30GcHcEkVajMzsyrwN2PNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDcz\ny5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejN\nzDLnoDczy1zJoJc0TNJjktZJekbS9al9vKRHJXVKWiLpsNR+eJrvTMtba/sSzMzsQMo5on8P+HxE\nnAZMAs6TNBm4EZgfER8B3gBmp/6zgTdS+/zUz8zM6qRk0EfB22l2aPoL4PPA0tS+CLgoTU9L86Tl\nUySpahWbmVlFyhqjlzRY0lpgC7AC+D3wZkTsSl26gDFpegywESAt3waM7GGbcyStlrS6u7u7b6/C\nzMx6VVbQR8TuiJgEtACnAx/t6xNHxIKIaI+I9qampr5uzszMelHRVTcR8SbwIHAGMELSkLSoBdiU\npjcBYwHS8uHA1qpUa2ZmFSvnqpsmSSPS9BHA2cB6CoF/ceo2C7gvTS9L86Tlv4qIqGbRZmZWviGl\nuzAaWCRpMIV/GO6OiJ9KehZYLOkG4LfAwtR/IXCnpE7gdWB6Deo2M7MylQz6iHgS+HgP7S9SGK/f\nv30HcElVqjMzsz7zN2PNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejN\nzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy1w5NwcfK+lB\nSc9KekbS1an9WEkrJL2QHo9J7ZJ0i6ROSU9Kaqv1izAzs96Vc0S/C/i7iDgFmAxcKekUoANYGRET\ngJVpHuB8YEL6mwPcVvWqzcysbCWDPiJeiYg1afotYD0wBpgGLErdFgEXpelpwB1RsAoYIWl01Ss3\nM7OyDKmks6RW4OPAo0BzRLySFr0KNKfpMcDGotW6UtsrRW1ImkPhiJ/jjz++wrLNDk2tHT+r6fY3\nzLuwptu3+ij7ZKyko4B/Bb4REX8sXhYRAUQlTxwRCyKiPSLam5qaKlnVzMwqUFbQSxpKIeT/JSJ+\nkpo37x2SSY9bUvsmYGzR6i2pzczM6qCcq24ELATWR8RNRYuWAbPS9CzgvqL2menqm8nAtqIhHjMz\n62fljNGfCVwGPCVpbWr7b8A84G5Js4GXgUvTsvuBC4BO4B3giqpWbGZmFSkZ9BHxH4B6WTylh/4B\nXNnHuszMrEr8zVgzs8w56M3MMuegNzPLnIPezCxzDnozs8w56M3MMuegNzPLnIPezCxzDnozs8w5\n6M3MMuegNzPLnIPezCxzDnozs8w56M3MMuegNzPLXEU3Bzcz6wvf3Lw+fE
RvZpY5B72ZWebKuTn4\nDyRtkfR0UduxklZIeiE9HpPaJekWSZ2SnpTUVsvizcystHKO6G8HztuvrQNYGRETgJVpHuB8YEL6\nmwPcVp0yzczsYJUM+oj4NfD6fs3TgEVpehFwUVH7HVGwChghaXS1ijUzs8od7Bh9c0S8kqZfBZrT\n9BhgY1G/rtT2ZyTNkbRa0uru7u6DLMPMzErp88nYiAggDmK9BRHRHhHtTU1NfS3DzMx6cbBBv3nv\nkEx63JLaNwFji/q1pDYzM6uTgw36ZcCsND0LuK+ofWa6+mYysK1oiMfMzOqg5DdjJd0FfBY4TlIX\n8C1gHnC3pNnAy8Clqfv9wAVAJ/AOcEUNajYzswqUDPqImNHLoik99A3gyr4WZWZm1eNvxpqZZc5B\nb2aWOQe9mVnmHPRmZplz0JuZZc5Bb2aWOQe9mVnmHPRmZplz0JuZZc5Bb2aWOQe9mVnmSv7WjVlu\nWjt+VrNtb5h3Yc22bXawfERvZpY5B72ZWeYc9GZmmXPQm5llzkFvZpY5B72ZWeZqEvSSzpP0nKRO\nSR21eA4zMytP1YNe0mDgn4HzgVOAGZJOqfbzmJlZeWrxhanTgc6IeBFA0mJgGvBsDZ6rIdXyCzvg\nL+2Y1UqjfnYVEdXdoHQxcF5E/E2avwz4VETM3a/fHGBOmj0JeK6qhXzQccBrNdx+rbn++mnk2sH1\n11ut6x8XEU2lOtXtJxAiYgGwoD+eS9LqiGjvj+eqBddfP41cO7j+ehso9dfiZOwmYGzRfEtqMzOz\nOqhF0D8OTJA0XtJhwHRgWQ2ex8zMylD1oZuI2CVpLvALYDDwg4h4ptrPU6F+GSKqIddfP41cO7j+\nehsQ9Vf9ZKyZmQ0s/masmVnmHPRmZplz0JuZZc5Bb2aWuUMi6CXdUe8azPqDpNMlfTJNnyLpWkkX\n1Lsuq6/sbg4uaf9r9gV8TtIIgIj4Yv9XVR2SroiIH9a7jnJJ+ksKv330dEQsr3c95ZD0UWAM8GhE\nvF3Ufl5E/Lx+lZUm6VsUfkxwiKQVwKeAB4EOSR+PiP9V1wJLkPQpYH1E/FHSEUAH0Ebhd7L+MSK2\n1bXAEiRdBdwTERvrXcv+sru8UtIaCjvG94GgEPR3UfjiFhHx7/Wrrm8k/SEijq93Hb2R9FhEnJ6m\n/xa4ErgHOAf4t4iYV8/6Skkf1CuB9cAk4OqIuC8tWxMRbfWsrxRJT1Go+3DgVaClKDQfjYiJdS2w\nBEnPAKel7+IsAN4BlgJTUvuX6lpgCZK2AduB31PInB9HRHd9qyrI7ogeaAeuBv478M2IWCvp3UYJ\neElP9rYIaO7PWg7C0KLpOcDZEdEt6dvAKmBABz3wt8AnIuJtSa3AUkmtEXEzhfd/oNsVEbuBdyT9\nPiL+CBAR70raU+fayjEoInal6faif1j/Q9LaehVVgReBTwBfAP4KuF7SExRC/ycR8Va9Cssu6CNi\nDzBf0o/T42Ya63U2A+cCb+zXLuCR/i+nIoMkHUPh3I/2Hs1ExHZJuw686oAwaO9wTURskPRZCmE/\njsYI+vclfSgi3qEQOABIGg40QtA/XTQ8uU5Se0SslnQisLPexZUhUv4sB5ZLGkphKG0G8G2g5K9M\n1kojBWBFIqILuETShcAf611PBX4KHBURf3YEI+mh/i+nIsOBJyiEYkgaHRGvSDqKxgjKzZIm7X3v\n05H9VOAHwKn1La0sn46I92DfAc9eQ4FZ9SmpIn8D3Czpf1D4ad//J2kjsDEtG+g+sI9HxE4Kv/O1\nTNKH6lNSQXZj9Aci6ajiE2zWP9JO3hwRL9W7lgOR1EJh+OPVHpadGREP16GsqmikfV/SXwDjKRyI\ndkXE5jqXVBZJJ0bE8/WuoyeHWtAP6JOZpTTSh3V/jVw7ZFG/9/06qnf92Q3dSLq2t0XAUf1ZSw08\nCzTqh7WRa4cGqN/7/oBW1/qzC3rgH4
H/DfR08m/Af0GskT+sjVw7NH79eN+vq4Fcf45Bvwa4NyKe\n2H+BpEY4odPIH9ZGrh0av37v+/U1YOvPboxe0knA6z19UUFS80A/sSPpEeDrvXxYN0bE2B5WGxAa\nuXbIon7v+3U0kOvPLugbXSN/WBu5dmj8+htdo7//A7n+7II+fTnkH4CLgFEUfgZhC3AfMC8i3qxj\neWY1433fetMI416VupvCt0o/GxHHRsRI4HOp7e66VlYGScMlzZP0O0mvS9oqaX1qG1Hv+g6kkWuH\nxq8f7/t1NZDrzzHoWyPixuIvvUTEqxFxIzCujnWVq5E/rI1cOzR+/d7362vA1p/j0M1y4JfAor1j\nYpKagcsp/MjWF+pYXkmSnouIkypdNhA0cu2QRf3e9+toINef4xH9XwEjgX+X9Iak14GHgGOBS+tZ\nWJlelvT36QMKFD6skq6j8JsfA1kj1w6NX7/3/foasPVnF/QR8QbwQ2AuMDb9F+rkiLiOwk0wBrpG\n/rA2cu3Q4PV736+7AVt/jkM3DX3zCNh3l6MWYFU03l2OGrZ2aOz6ve/X34CtPyKy+gOeovAzvwCt\nwGoKOzzAb+tdXxn1XwU8B9wLbACmFS1bU+/6cq09k/q977v+Hv9y/AmERr95RCPf5aiRa4fGr9/7\nfn0N2PpzDPpGv3lEI39YG7l2aPz6ve/X14CtP7uTscBMCjdG3icidkXETODT9SmpIpslTdo7k3ac\nqcBxDPwPayPXDo1fv/f9+hqw9Wd3MrbRqYHvctTItUPj19/oGv39H8j1O+jNzDKX49CNmZkVcdCb\nmWXOQW+HJEm7Ja2V9IykdZL+TtIBPw+SWiV9pb9qNKsWB70dqt6NiEkR8THgbOB84Fsl1mkFHPTW\ncHwy1g5Jkt6OiKOK5k8AHqdwKdw44E7gyLR4bkQ8ImkVcDLwErAIuAWYB3wWOBz454j4br+9CLMy\nOejtkLR/0Ke2N4GTgLeAPRGxQ9IE4K6IaE9fgPmvETE19Z8DjIqIGyQdDjwMXBIRL/XrizErIcdv\nxpr11VDg1vTll93Aib30OweYKOniND8cmEDhiN9swHDQm7Fv6GY3hXusfgvYDJxG4TzWjt5WA74e\nEb/olyLNDpJPxtohT1IT8H+BW6MwljkceCUi9gCXAYNT17eAo4tW/QXwXyQNTds5UdKRmA0wPqK3\nQ9URktZSGKbZReHk601p2f8B/lXSTODnwPbU/iSwW9I64HbgZgpX4qyRJKAbuKi/XoBZuXwy1sws\ncx66MTPLnIPezCxzDnozs8w56M3MMuegNzPLnIPezCxzDnozs8z9f8zGHY6Yb9aNAAAAAElFTkSu\nQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f48069bd7b8>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"myChart.plot.bar()"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Title\n",
"\n",
"Here I can add some comments on the chart.\n",
"1. First point\n",
"2. Second point"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Lang Cleaning tools"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"detect_lang(\"Ceci est une phrase en français.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"detect_lang(\"This is an english sentence.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"chart(docs, \"language_iso2\").plot.bar()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"Counter({'de': 13,\n",
" 'en': 1547,\n",
" 'es': 5,\n",
" 'fi': 1,\n",
" 'fr': 4,\n",
" 'hu': 1,\n",
" 'it': 1,\n",
" 'ja': 5,\n",
" 'ko': 1,\n",
" 'ru': 3,\n",
" 'zh': 23})"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Counter([doc.hyperdata[\"language_iso2\"] for doc in docs])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Count the documents whose language is not `lang` (the deletion itself is still commented out)\n",
"def cleanCorpusWithLang(corpus_id, lang):\n",
" return (session.query(Node.id).filter(Node.parent_id == corpus_id)\n",
" .filter(Node.hyperdata[\"language_iso2\"].astext != lang)\n",
" .count()\n",
" #.delete()\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"57"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cleanCorpusWithLang(corpus_id, 'en')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(True, 'This is an english paragraph.\\n '),\n",
" (False, '\"This is an english paragraph.\\n\\nThis is an english paragraph.\\n ')]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"abstract0 = \"\"\"\"Ceci est un paragraphe en français.\n",
"\n",
"This is an english paragraph.\n",
" \"\"\"\n",
"\n",
"abstract1 = \"\"\"\"This is an english paragraph.\n",
"\n",
"This is an english paragraph.\n",
" \"\"\"\n",
"\n",
"def clean_lang_inText(lang, text):\n",
" \n",
" texts_before = nltk.tokenize.blankline_tokenize(text)\n",
" texts_after = '\\n\\n'.join([sentence \n",
" for sentence in texts_before\n",
" if detect_lang(sentence) == lang\n",
" ])\n",
" \n",
" return (len(texts_before) != len(nltk.tokenize.blankline_tokenize(texts_after)), texts_after)\n",
"\n",
"[clean_lang_inText('en', abstract) for abstract in [abstract0, abstract1]]\n",
"\n",
"# TODO update each document accordingly"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# TODO: update all the abstracts with that function"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Measures IMT Tools"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"ename": "ConnectionError",
"evalue": "HTTPSConnectionPool(host='api.archives-ouvertes.fr', port=443): Max retries exceeded with url: /search?wt=json&q=machine+learning+AND+deep&fl=+title_s%0A+++++++++++++++%2C+abstract_s%0A+++++++++++++++%2C+submittedDate_s%0A+++++++++++++++%2C+journalDate_s%0A+++++++++++++++%2C+authFullName_s%0A+++++++++++++++%2C+uri_s%0A+++++++++++++++%2C+isbn_s%0A+++++++++++++++%2C+issue_s%0A+++++++++++++++%2C+docType_s%0A+++++++++++++++%2C+journalPublisher_s%0A+++++++++++++&start=1&rows=10 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mgaierror\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 141\u001b[0m conn = connection.create_connection(\n\u001b[0;32m--> 142\u001b[0;31m (self.host, self.port), self.timeout, **extra_kw)\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0merr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 67\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetaddrinfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mport\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSOCK_STREAM\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 68\u001b[0m \u001b[0maf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msocktype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcanonname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msa\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib/python3.5/socket.py\u001b[0m in \u001b[0;36mgetaddrinfo\u001b[0;34m(host, port, family, type, proto, flags)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0maddrlist\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 733\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32min\u001b[0m \u001b[0m_socket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetaddrinfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mport\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfamily\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 734\u001b[0m \u001b[0maf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msocktype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcanonname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msa\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mgaierror\u001b[0m: [Errno -3] Temporary failure in name resolution",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mNewConnectionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 577\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 578\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 579\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 350\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 351\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 352\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_validate_conn\u001b[0;34m(self, conn)\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sock'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# AppEngine might not have `.sock`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 814\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 815\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;31m# Add certificate verification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 150\u001b[0m raise NewConnectionError(\n\u001b[0;32m--> 151\u001b[0;31m self, \"Failed to establish a new connection: %s\" % e)\n\u001b[0m\u001b[1;32m 152\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNewConnectionError\u001b[0m: <requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mMaxRetryError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 402\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 403\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 404\u001b[0m )\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 622\u001b[0m retries = retries.increment(method, url, error=e, _pool=self,\n\u001b[0;32m--> 623\u001b[0;31m _stacktrace=sys.exc_info()[2])\n\u001b[0m\u001b[1;32m 624\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/util/retry.py\u001b[0m in \u001b[0;36mincrement\u001b[0;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnew_retry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_exhausted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 281\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mMaxRetryError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mResponseError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcause\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 282\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='api.archives-ouvertes.fr', port=443): Max retries exceeded with url: /search?wt=json&q=machine+learning+AND+deep&fl=+title_s%0A+++++++++++++++%2C+abstract_s%0A+++++++++++++++%2C+submittedDate_s%0A+++++++++++++++%2C+journalDate_s%0A+++++++++++++++%2C+authFullName_s%0A+++++++++++++++%2C+uri_s%0A+++++++++++++++%2C+isbn_s%0A+++++++++++++++%2C+issue_s%0A+++++++++++++++%2C+docType_s%0A+++++++++++++++%2C+journalPublisher_s%0A+++++++++++++&start=1&rows=10 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mConnectionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-16-b220cbbc8ecc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscan_hal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"machine learning AND deep\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/srv/gargantext/gargantext_notebook.py\u001b[0m in \u001b[0;36mscan_hal\u001b[0;34m(request)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mscan_hal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mhal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mHalCrawler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 62\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mhal\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscan_results\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 63\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mscan_gargantext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlang\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/srv/gargantext/gargantext/util/crawlers/HAL.py\u001b[0m in \u001b[0;36mscan_results\u001b[0;34m(self, query)\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresults_nb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 92\u001b[0;31m total = ( self._get(query)\n\u001b[0m\u001b[1;32m 93\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"response\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"numFound\"\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/srv/gargantext/gargantext/util/crawlers/HAL.py\u001b[0m in \u001b[0;36m_get\u001b[0;34m(self, query, fromPage, count, lang)\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mURL\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 71\u001b[0;31m \u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquerystring\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 72\u001b[0m )\n\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 57\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 473\u001b[0m }\n\u001b[1;32m 474\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 477\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 583\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 585\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 586\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mProxyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 466\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 467\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 468\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mClosedPoolError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mConnectionError\u001b[0m: HTTPSConnectionPool(host='api.archives-ouvertes.fr', port=443): Max retries exceeded with url: /search?wt=json&q=machine+learning+AND+deep&fl=+title_s%0A+++++++++++++++%2C+abstract_s%0A+++++++++++++++%2C+submittedDate_s%0A+++++++++++++++%2C+journalDate_s%0A+++++++++++++++%2C+authFullName_s%0A+++++++++++++++%2C+uri_s%0A+++++++++++++++%2C+isbn_s%0A+++++++++++++++%2C+issue_s%0A+++++++++++++++%2C+docType_s%0A+++++++++++++++%2C+journalPublisher_s%0A+++++++++++++&start=1&rows=10 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))"
]
}
],
"source": [
"scan_hal(\"machine learning AND deep\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Request syntax\n",
"# \"network analysis\" = network <-> analysis\n",
"# \"network OR analysis\" = network | analysis\n",
"# \"network AND analysis\" = network & analysis\n",
"\n",
"scan_gargantext(corpus_id, 'english', \"machine | learning & deep\")\n",
"\n",
"# \"network NOT analysis\" = @@ to_tsquery('network') !! to_tsquery('analysis')\n",
"# (the function needs to be changed if NOT has to be supported)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Forces / Faiblesses de l'IMT\n",
"# Hal Query Gargantext Query\n",
"queries = [ (\"network analysis\" , \"network <-> analysis\" )\n",
" , (\"big data AND something\" , \"(big <-> data) & something\")\n",
" ]\n",
"[(query[0], query[1]) for query in queries]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"def imt_vs_hal(corpus_id, queryHal, queryGarg):\n",
" return((scan_gargantext(corpus_id, 'english', queryGarg), scan_hal(queryHal)))\n",
" #return((scan_gargantext(corpus_id, 'english', queryGarg) *100 / scan_hal(queryHal)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Then chart it to see your strengths and weaknesses!\n",
"[imt_vs_hal(corpus_id, query[0], query[1]) for query in queries]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Graph generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# TODO Cooccurrences optimization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# TODO optimize the distributional distance"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# List Management"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Front end: add a checkbox to choose between merging with and overwriting the previous list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# optimize the list merge"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3rc1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment