Commit e79dfe73 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'stable-notebook' into stable-imt-notebook

parents 96c36213 916b6b85
......@@ -2,7 +2,7 @@
## Community
* [http://gargantext.org/about](http://gargantext.org/about)
* IRC Chat: (OFTC/FreeNode) #gargantex
* IRC Chat: (OFTC/FreeNode) #gargantext
## Tools
* gogs
......
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import relationship
from sqlalchemy.orm import relationship, validates
from sqlalchemy.types import TypeDecorator, \
Integer, Float, Boolean, DateTime, String, Text
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
......@@ -7,6 +7,7 @@ from sqlalchemy.ext.mutable import MutableDict, MutableList
from sqlalchemy.ext.declarative import declarative_base
__all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
"validates", "ValidatorMixin",
"Integer", "Float", "Boolean", "DateTime", "String", "Text",
"TypeDecorator",
"JSONB", "Double",
......@@ -18,6 +19,25 @@ __all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
# all tables handled by Alembic migration scripts.
Base = declarative_base()
# To be used by tables already handled by Django ORM, such as User model. We
# separate them in order to keep those out of Alembic sight.
DjangoBase = declarative_base()
class ValidatorMixin(object):
    """Mixin offering helpers meant to be called from ``@validates`` hooks."""

    def enforce_length(self, key, value):
        """Truncate a string according to its column length

        The limit is read from the ``length`` of the type of the first column
        mapped to attribute ``key`` on the model class. A column whose type
        declares no length (``length`` is ``None``) is treated as unbounded.

        Usage example:

        .. code-block:: python

            @validates('some_column')
            def validate_some_column(self, key, value):
                return self.enforce_length(key, value)

        :param key: name of the mapped attribute being validated
        :param value: candidate value; falsy values are returned untouched
        :return: ``value``, cut to the column length when it is longer
        """
        max_len = getattr(self.__class__, key).prop.columns[0].type.length
        # No declared length means nothing to enforce; comparing an int with
        # None would raise TypeError.
        if max_len is not None and value and len(value) > max_len:
            return value[:max_len]
        return value
......@@ -9,7 +9,7 @@ from datetime import datetime
from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
Integer, Float, String, DateTime, JSONB, \
MutableList, MutableDict
MutableList, MutableDict, validates, ValidatorMixin
from .users import User
__all__ = ['Node', 'NodeNode', 'CorpusNode']
......@@ -26,7 +26,7 @@ class NodeType(TypeDecorator):
return NODETYPES[typeindex]
class Node(Base):
class Node(ValidatorMixin, Base):
"""This model can fit many purposes:
myFirstCorpus = session.query(CorpusNode).first()
......@@ -112,6 +112,10 @@ class Node(Base):
'user_id={0.user_id}, parent_id={0.parent_id}, ' \
'name={0.name!r}, date={0.date})>'.format(self)
@validates('name')
def validate_name(self, key, value):
    """Validation hook for ``name``: hand back whatever ``enforce_length`` yields."""
    checked = self.enforce_length(key, value)
    return checked
@property
def ngrams(self):
"""Pseudo-attribute allowing to retrieve a node's ngrams.
......
......@@ -120,7 +120,7 @@ class HalCrawler(Crawler):
msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
, QUERY_SIZE_N_MAX
)
print("ERROR (scrap: Multivac d/l ): " , msg)
print("ERROR (scrap: HAL d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
#for page in range(1, trunc(self.query_max / 100) + 2):
......
......@@ -73,7 +73,8 @@ from rest_framework.views import APIView
from gargantext.util.json import json_encoder
def JsonHttpResponse(data, status=200):
return HttpResponse(
content = json_encoder.encode(data),
content = data.encode('utf-8') if isinstance(data, str) else \
json_encoder.encode(data),
content_type = 'application/json; charset=utf-8',
status = status
)
......
......@@ -95,19 +95,30 @@ def query_list(list_id,
else:
# NB: score can be undefined (eg ex-subform that now became free)
# ==> we need outerjoin
# and the filter needs to have scoring_metric_id so we do it before
NNN = NodeNodeNgram
ScoresTable = (session
.query(NodeNodeNgram.score, NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == scoring_metric_id)
.subquery()
)
query = (session
.query(Ngram.id, Ngram.terms, NNN.score)
# Ngrams must be related to our list <Node(id=list_id)>
.join(NodeNgram, (NodeNgram.ngram_id == Ngram.id) &
(NodeNgram.node_id == list_id))
# Select by metric <Node(id=scoring_metric_id)>
.outerjoin(NNN, (NNN.ngram_id == Ngram.id) &
(NNN.node1_id == scoring_metric_id))
# Sort by descending score
.order_by(NNN.score.desc())
.query(
NodeNgram.ngram_id,
Ngram.terms,
ScoresTable.c.score
)
.join(Ngram, NodeNgram.ngram_id == Ngram.id)
# main filter ----------------------
.filter(NodeNgram.node_id == list_id)
# scores if possible
.outerjoin(ScoresTable,
ScoresTable.c.ngram_id == NodeNgram.ngram_id)
.order_by(desc(ScoresTable.c.score))
)
if pagination_limit:
......
......@@ -11,17 +11,8 @@ from datetime import datetime
import json
class HalParser(Parser):
def parse(self, filebuf):
'''
parse :: FileBuff -> [Hyperdata]
'''
contents = filebuf.read().decode("UTF-8")
data = json.loads(contents)
def _parse(self, json_docs):
filebuf.close()
json_docs = data
hyperdata_list = []
hyperdata_path = { "id" : "isbn_s"
......@@ -83,3 +74,13 @@ class HalParser(Parser):
hyperdata_list.append(hyperdata)
return hyperdata_list
def parse(self, filebuf):
'''
parse :: FileBuff -> [Hyperdata]
'''
contents = filebuf.read().decode("UTF-8")
data = json.loads(contents)
return self._parse(data)
......@@ -16,7 +16,7 @@ sudo docker run \
--env POSTGRES_HOST=localhost \
-v /srv/gargantext:/srv/gargantext \
-it garg-notebook:latest \
/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /srv/gargantext/ && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser'"
/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /home/notebooks && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser'"
# #&& jupyter nbextension enable --py widgetsnbextension --sys-prefix
#/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /srv/gargantext/ && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser --notebook-dir=/home/notebooks/'"
......
......@@ -78,32 +78,8 @@ RUN . /env_3-5/bin/activate && pip3 install -r requirements.txt
#RUN ./psql_configure.sh
#RUN ./django_configure.sh
RUN chown notebooks:notebooks -R /env_3-5
########################################################################
### Notebook IHaskell and IPYTHON ENVIRONNEMENT
########################################################################
#RUN apt-get update && apt-get install -y \
# libtinfo-dev \
# libzmq3-dev \
# libcairo2-dev \
# libpango1.0-dev \
# libmagic-dev \
# libblas-dev \
# liblapack-dev
#RUN curl -sSL https://get.haskellstack.org/ | sh
#RUN stack setup
#RUN git clone https://github.com/gibiansky/IHaskell
#RUN . /env_3-5/bin/activate \
# && cd IHaskell \
# && stack install gtk2hs-buildtools \
# && stack install --fast \
# && /root/.local/bin/ihaskell install --stack
#
#
########################################################################
### POSTGRESQL DATA (as ROOT)
########################################################################
......@@ -115,3 +91,32 @@ RUN chown notebooks:notebooks -R /env_3-5
EXPOSE 5432 8899

########################################################################
### Notebook IHaskell and IPYTHON ENVIRONMENT
########################################################################
RUN apt-get update && apt-get install -y \
    libtinfo-dev    \
    libzmq3-dev     \
    libcairo2-dev   \
    libpango1.0-dev \
    libmagic-dev    \
    libblas-dev     \
    liblapack-dev

USER notebooks

# NOTE(review): the stack installer is run here as the unprivileged user;
# confirm it does not need root to install the `stack` binary.
# `stack install` is run as `notebooks`, so the ihaskell binary is looked up
# under that user's $HOME instead of /root (not readable by this user).
RUN cd /home/notebooks \
    && curl -sSL https://get.haskellstack.org/ | sh \
    && stack setup \
    && git clone https://github.com/gibiansky/IHaskell \
    && . /env_3-5/bin/activate \
    && cd IHaskell \
    && stack install gtk2hs-buildtools \
    && stack install --fast \
    && "$HOME/.local/bin/ihaskell" install --stack

# Declared last on purpose: build steps that modify a path after it has been
# declared as a VOLUME have their changes discarded, and the RUN above writes
# into /home/notebooks.
VOLUME ["/srv/","/home/notebooks/"]
#!/usr/bin/env python
"""
Gargantext Software Copyright (c) 2016-2017 CNRS ISC-PIF -
http://iscpif.fr
......@@ -6,45 +7,29 @@ http://gitlab.iscpif.fr/humanities/gargantext/blob/stable/LICENSE )
- In France : a CECILL variant affero compliant
- GNU aGPLV3 for all other countries
"""
#!/usr/bin/env python
import sys
import os
import os
import django
# Django settings
dirname = os.path.dirname(os.path.realpath(__file__))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext.settings")
# initialize Django application
from django.core.wsgi import get_wsgi_application
application = get_wsgi_application()
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django.setup()
from gargantext.util.toolchain.main import parse_extract_indexhyperdata
from gargantext.util.db import *
from gargantext.models import Node
from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from gargantext.models import ProjectNode, DocumentNode, UserNode, User
from gargantext.util.db import session, get_engine
from collections import Counter
import importlib
from django.http import Http404
from nltk.tokenize import wordpunct_tokenize
from gargantext.models import *
from nltk.tokenize import word_tokenize
import nltk as nltk
from statistics import mean
from math import log
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import datetime
class NotebookError(Exception):
    """Error raised by the notebook helpers when they are given invalid input."""
from collections import Counter
from langdetect import detect as detect_lang
def documents(corpus_id):
return (session.query(Node).filter( Node.parent_id==corpus_id
, Node.typename=="DOCUMENT"
)
# .order_by(Node.hyperdata['publication_date'])
.all()
)
return (session.query(DocumentNode).filter_by(parent_id=corpus_id)
#.order_by(Node.hyperdata['publication_date'])
.all())
#import seaborn as sns
......@@ -63,13 +48,134 @@ def scan_hal(request):
hal = HalCrawler()
return hal.scan_results(request)
def scan_gargantext(corpus_id, lang, request):
    """Count the nodes of a corpus whose text matches a full-text query.

    :param corpus_id: id compared against ``nodes.parent_id``
    :param lang: text search configuration handed to ``to_tsvector``
    :param request: query string handed to ``to_tsquery``
    :return: the ``count(n.id)`` value of the single result row
    """
    # TODO add some sugar the request (ideally request should be the same for hal and garg)
    # NOTE(review): `hyperdata ->> 'abstract' || 'title'` looks like it appends
    # the literal string 'title' to the abstract rather than the title field;
    # kept as is -- confirm the intent.
    # Values are passed as bound parameters instead of being interpolated
    # into the SQL text, so a quote in `request` cannot break the statement.
    query = """select count(n.id) from nodes n
               where to_tsvector(%s, hyperdata ->> 'abstract' || 'title')
                     @@ to_tsquery(%s)
                 AND n.parent_id = %s;"""
    connection = get_engine().connect()
    try:
        rows = connection.execute(query, (lang, request, corpus_id))
        return rows.fetchone()[0]
    finally:
        # Previously placed after the `return`, hence never executed.
        connection.close()
def myProject_fromUrl(url):
    """
    myProject :: String -> Project

    The project id is taken as the fifth "/"-separated piece of ``url``
    (e.g. ``http://host/projects/<id>``) and looked up as a ProjectNode.
    """
    segments = url.split("/")
    return session.query(ProjectNode).get(segments[4])
def newCorpus(project, source, name=None, query=None):
    """Harvest documents for ``query`` from ``source`` into a new corpus.

    :param project: ProjectNode that will own the corpus
    :param source: resource id (int) or source name (str)
    :param name: corpus name; defaults to ``query`` when omitted
    :param query: query string submitted to the source
    :return: whatever ``run_moissonneur`` returns (the corpus, or None when
             the source reports no result)
    :raises NotebookError: on invalid arguments or unknown source
    """
    if name is None:
        name = query

    # Report the first invalid argument, in signature order.
    if not isinstance(project, ProjectNode):
        error = "a valid project"
    elif not isinstance(source, (int, str)):
        error = "a valid source identifier: id or name"
    elif not isinstance(query, str):
        error = "a valid query"
    elif not isinstance(name, str):
        error = "a valid name"
    else:
        error = None

    if error:
        raise NotebookError("Please provide %s." % error)

    if isinstance(source, int):
        resource = get_resource(source)
    else:
        resource = get_resource_by_name(source)

    if resource:
        moissonneur_name = get_moissonneur_name(resource)
    elif isinstance(source, str):
        # Unknown resource name: try it directly as a moissonneur name
        moissonneur_name = source.lower()
    else:
        # Unknown numeric id: there is no name to fall back on (calling
        # .lower() on an int used to raise AttributeError here)
        raise NotebookError("Invalid source identifier: %r" % source)

    try:
        moissonneur = get_moissonneur(moissonneur_name)
    except ImportError:
        raise NotebookError("Invalid source identifier: %r" % source)

    return run_moissonneur(moissonneur, project, name, query)
def get_moissonneur_name(ident):
    """ Return moissonneur module name from RESOURCETYPE or crawler name

    :param ident: a mapping with a 'crawler' entry, or a string
    :return: the lower-cased crawler class name without its 'Crawler'
             suffix; the string itself when it has no such suffix; None
             when no string could be obtained
    """
    # Does it quack like a RESOURCETYPE ?
    if hasattr(ident, 'get'):
        ident = ident.get('crawler')

    if not isinstance(ident, str):
        return None

    # Extract name from crawler class name...
    if ident.endswith('Crawler'):
        return ident[:-len('Crawler')].lower()

    # ...otherwise assume ident is already a moissonneur name (this case used
    # to fall off the end of the function and yield None).
    return ident
def get_moissonneur(name):
    """ Import ``moissonneurs.<name>`` and return the module.

    ``name`` must be an all-lowercase string, otherwise NotebookError is
    raised. The returned module gets a ``name`` attribute set to ``name``.
    """
    acceptable = isinstance(name, str) and name.islower()
    if not acceptable:
        raise NotebookError("Invalid moissonneur name: %r" % name)

    moissonneur = importlib.import_module('moissonneurs.%s' % name)
    moissonneur.name = name
    return moissonneur
def run_moissonneur(moissonneur, project, name, query):
    """ Drive a moissonneur with a hand-made request object.

    Calls ``moissonneur.query`` to learn how many results exist; when there
    are some, calls ``moissonneur.save`` and returns the corpus it gives back
    (renamed to ``name`` if needed). Returns None when nothing was found.
    """
    import json

    # XXX Uber-kludge with gory details. Spaghetti rulezzzzz!
    # Bare attribute holder standing in for the request and its user.
    class Stub(object):
        pass

    kind = moissonneur.name

    # XXX 'string' only have effect on moissonneurs.pubmed; its value is added
    #     when processing request client-side, take a deep breath and see
    #     templates/projects/project.html for more details.
    post = {'string': name,
            'query': query,
            'N': QUERY_SIZE_N_MAX}

    if kind == 'istex':
        # Replace ALL spaces by plus signs
        post['query'] = '+'.join(word for word in query.split(' ') if word)

    user = Stub()
    user.id = project.user_id
    user.is_authenticated = lambda: True

    request = Stub()
    request.method = 'POST'
    request.path = 'nowhere'
    request.META = {}
    request.POST = post
    request.user = user

    # ValueError / Http404 raised below simply propagate to the caller.
    response = moissonneur.query(request)
    raw_json = response.content.decode('utf-8')
    data = json.loads(raw_json)

    if kind == 'pubmed':
        count = sum(entry['count'] for entry in data)
        post['query'] = raw_json
    elif kind == 'istex':
        count = data.get('total', 0)
    else:
        count = data.get('results_nb', 0)

    if not count > 0:
        return None

    corpus = moissonneur.save(request, project.id, return_corpus=True)

    # Sometimes strange things happens...
    if corpus.name != name:
        corpus.name = name
        session.commit()

    return corpus
......@@ -30,7 +30,7 @@ def query( request):
#ids = crawlerbot.get_ids(query)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
......@@ -101,6 +101,9 @@ def save(request, project_id):
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
......
......@@ -33,7 +33,7 @@ def query( request):
print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
......@@ -103,6 +103,9 @@ def save(request, project_id):
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
......
......@@ -29,7 +29,7 @@ def query( request):
#ids = crawlerbot.get_ids(query)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
......@@ -100,6 +100,9 @@ def save(request, project_id):
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
......
......@@ -52,7 +52,7 @@ def query( request ):
def save(request , project_id):
def save(request , project_id, return_corpus=False):
print("testISTEX:")
print(request.method)
alist = ["bar","foo"]
......@@ -171,6 +171,9 @@ def save(request , project_id):
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
......
......@@ -33,7 +33,7 @@ def query( request):
print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
......@@ -104,6 +104,9 @@ def save(request, project_id):
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
......
......@@ -69,7 +69,7 @@ def query( request ):
return JsonHttpResponse(data)
def save( request , project_id ) :
def save( request , project_id, return_corpus=False ) :
# implicit global session
# do we have a valid project id?
try:
......@@ -164,6 +164,10 @@ def save( request , project_id ) :
session.rollback()
# --------------------------------------------
sleep(1)
if return_corpus:
return corpus
return HttpResponseRedirect('/projects/' + str(project_id))
data = alist
......
......@@ -2,11 +2,38 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Advanced Gargantext Tutorial (Python)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "TypeError",
"evalue": "'list' object is not callable",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-a8e3501c9a54>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/srv/gargantext'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: 'list' object is not callable"
]
}
],
"source": [
"import sys\n",
"sys.pa"
]
},
{
"cell_type": "code",
"execution_count": 1,
......@@ -28,7 +55,9 @@
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Advanced Gargantext Tutorial (Python)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0, '/srv/gargantext')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# First import the library Gargantext Notebook\n",
"from gargantext_notebook import *\n",
"\n",
"# This enables to draw graphics later\n",
"%matplotlib inline "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"105\n",
"LSTM 1000\n",
"Downloading page 0 to 100 results\n",
"Downloading page 100 to 100 results\n",
"CORPUS #17058\n",
"PARSING\n",
"Loading available PARSERS:\n",
"\t- EuropresseParser\n",
"\t- RISParser\n",
"\t- PubmedParser\n",
"\t- RISParser\n",
"\t- ISIParser\n",
"\t- RISParser\n",
"\t- CSVParser\n",
"\t- ISTexParser\n",
"\t- CernParser\n",
"\t- MultivacParser\n",
"\t- HalParser\n",
"\t- IsidoreParser\n",
"0 docs skipped\n",
"105 parsed\n",
"#MAIN language of the CORPUS __unknown__\n",
"CORPUS #17058: parsed 105\n",
"INTEGRATE\n",
"INTEGRATE\n",
"CORPUS #17058: extracted ngrams\n",
"CORPUS #17058: indexed hyperdata\n",
"CORPUS #17058: [2017-08-11_11:21:18] new favorites node #17164\n",
"CORPUS #17058: [2017-08-11_11:21:18] starting ngram lists computation\n",
"CORPUS #17058: [2017-08-11_11:21:18] new stoplist node #17165\n",
"# STEMMERS LOADED {'__unknown__': <nltk.stem.snowball.SnowballStemmer object at 0x7fb811588e48>}\n",
"#SUPPORTED STEMMERS LANGS []\n",
"CORPUS #17058: [2017-08-11_11:21:20] new grouplist node #17166\n",
"CORPUS #17058: [2017-08-11_11:21:20] new occs node #17167\n",
"compute_ti_ranking\n",
"2017-08-11_11:21:20 : Starting Query tf_nd_query\n",
"2017-08-11_11:21:21 : End Query tf_nd_quer\n",
"2017-08-11_11:21:21 : tfidfsum\n",
"CORPUS #17058: [2017-08-11_11:21:21] new ti ranking node #17168\n",
"MAINLIST: keeping 2908 ngrams out of 3878\n",
"CORPUS #17058: [2017-08-11_11:21:21] new mainlist node #17169\n",
"Compute TFIDF local\n",
"CORPUS #17058: [2017-08-11_11:21:22] new localtfidf node #17170\n",
"COOCS: NEW matrix shape [220x807]\n",
"CORPUS #17058: [2017-08-11_11:21:23] computed mainlist coocs for specif rank\n",
"SPECIFICITY: computing on 154 ngrams\n",
"CORPUS #17058: [2017-08-11_11:21:23] new spec-clusion node #17171\n",
"CORPUS #17058: [2017-08-11_11:21:23] new gen-clusion node #17172\n",
"MAPLIST quotas: {'topgen': {'multigrams': 168, 'monograms': 42}, 'topspec': {'multigrams': 112, 'monograms': 28}}\n",
"MAPLIST: top_spec_monograms = 28\n",
"MAPLIST: top_spec_multigrams = 41\n",
"MAPLIST: top_gen_monograms = 42\n",
"MAPLIST: top_gen_multigrams = 0\n",
"MAPLIST: kept 111 ngrams in total \n",
"CORPUS #17058: [2017-08-11_11:21:23] new maplist node #17173\n",
"CORPUS #17058: [2017-08-11_11:21:23] FINISHED ngram lists computation\n"
]
}
],
"source": [
"#project = myProject_fromUrl(\"http://imt.gargantext.org/projects/300535\")\n",
"project = myProject_fromUrl(\"http://localhost:8000/projects/2\")\n",
"corpus = newCorpus(project, source=\"hal\", name=\"Machine learning\", query=\"LSTM\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"session.query(Node.hyperdata[\"\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for doc in new_docs:\n",
" new_doc = (Node( user_id = project.user_id\n",
" , parent_id = corpus.id\n",
" , typename= 'DOCUMENT'\n",
" , name=doc[\"title\"][:50]\n",
" , hyperdata=doc)\n",
" )\n",
" session.add(new_doc)\n",
"session.commit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L'identifiant du corpus est : 254749\n"
]
}
],
"source": [
"# Copier/coller l'url du corpus (avec http://): sur lequel travailler\n",
"corpus_url = \"http://gargantext.org/projects/251737/corpora/254749\"\n",
"\n",
"corpus_id = corpus_url.split(\"/\")[6]\n",
"\n",
"print(\"L\\'identifiant du corpus est : %s\" % corpus_id)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# To get all the documents:\n",
"docs = documents(corpus_id)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Towards big data science in the decade ahead from ten years of InCoB and the 1st ISCB-Asia Joint Conference.'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the title of the first document \n",
"# [0] indicates the index of the first document\n",
"docs[0].hyperdata['title']"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"The 2011 International Conference on Bioinformatics (InCoB) conference, which is the annual scientific conference of the Asia-Pacific Bioinformatics Network (APBioNet), is hosted by Kuala Lumpur, Malaysia, is co-organized with the first ISCB-Asia conference of the International Society for Computational Biology (ISCB). InCoB and the sequencing of the human genome are both celebrating their tenth anniversaries and InCoB's goalposts for the next decade, implementing standards in bioinformatics and globally distributed computational networks, will be discussed and adopted at this conference. Of the 49 manuscripts (selected from 104 submissions) accepted to BMC Genomics and BMC Bioinformatics conference supplements, 24 are featured in this issue, covering software tools, genome/proteome analysis, systems biology (networks, pathways, bioimaging) and drug discovery and design.\""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the abstract of the first document (0)\n",
"docs[0].hyperdata['abstract']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Shoba Ranganathan, Christian Schönbach, Janet Kelso, Burkhard Rost, Sheila Nathan, Tin Wee Tan'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the authors of the first document (0)\n",
"docs[0].hyperdata['authors']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'BMC bioinformatics'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get the source of the first document (0)\n",
"docs[0].hyperdata['source']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# If I want to count:\n",
"myChart = chart(docs, \"publication_year\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f48069c5208>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEZCAYAAACZwO5kAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGl1JREFUeJzt3X+QVOWd7/H3hx+KURcUhymKQQYTNJorksnEYLnmF/E3\nFaxEXciWoMUuNzcSje7NOnvvrZuyyruFdVOyGvd6Q0IiWhvBsFHZRBMI0U1WLyoS8BdRJ4phKIUR\nlSiK8uN7/+gH0pIZupvpnp5++Lyqpvqc5zzn9Le7Tn84POd0H0UEZmaWr0H1LsDMzGrLQW9mljkH\nvZlZ5hz0ZmaZc9CbmWXOQW9mlrmygl7SNZKekfS0pLskDZM0XtKjkjolLZF0WOp7eJrvTMtba/kC\nzMzswEoGvaQxwFVAe0T8J2AwMB24EZgfER8B3gBmp1VmA2+k9vmpn5mZ1Um5QzdDgCMkDQE+BLwC\nfB5YmpYvAi5K09PSPGn5FEmqTrlmZlapIaU6RMQmSd8G/gC8CywHngDejIhdqVsXMCZNjwE2pnV3\nSdoGjARe6+05jjvuuGhtbT3Y12Bmdkh64oknXouIplL9Sga9pGMoHKWPB94Efgyc19cCJc0B5gAc\nf/zxrF69uq+bNDM7pEh6uZx+5QzdfAF4KSK6I2In8BPgTGBEGsoBaAE2pelNwNhUxBBgOLB1/41G\nxIKIaI+I9qamkv8gmZnZQSon6P8ATJb0oTTWPgV4FngQuDj1mQXcl6aXpXnS8l+FfznNzKxuSgZ9\nRDxK4aTqGuCptM4C4DrgWkmdFMbgF6ZVFgIjU/u1QEcN6jYzszJpIBxst7e3h8fozRrbzp076erq\nYseOHfUuJTvDhg2jpaWFoUOHfqBd0hMR0V5q/ZInY83MytHV1cXRRx9Na2srvqK6eiKCrVu30tXV\nxfjx4w9qG/4JBDOrih07djBy5EiHfJVJYuTIkX36n5KD3syqxiFfG319Xx30ZmaZ8xi9Vay142c1\n3f6GeRfWdPvWP6q9n5SzXwwePJhTTz2VnTt3MmTIEGbOnMk111zDoEG9H9Nu2LCBRx55hK985SsH\n3PYJJ5zAAw88wEknnbSv7Rvf+AajR4/muuuu63XbU6dO5emnny5Zey35iN7MsnHEEUewdu1annnm\nGVasWMEDDzzA9ddff8B1NmzYwI9+9KOS254+fTqLFy/eN79nzx6WLl3K9OnT+1x3rTnozSxLo0aN\nYsGCBdx6661EBBs2bOCss86ira2NtrY2HnnkEQA6Ojr4zW9+w6RJk5g/fz67d+/mm9/8Jp/85CeZ\nOHEi3/3udwGYMWMGS5Ys2bf9X//614wbN45x48b1uu1it99+O3Pnzt03P3XqVB566CEAli9fzhln\nnEFbWxuXXHIJb7/9dlXfCwe9mWXrhBNOYPfu3WzZsoVRo0axYsUK1qxZw5IlS7jqqqsAmDdvHmed\ndRZr167lmmuuYeHChQwfPpzHH3+cxx9/nO9973u89NJLnHrqqQwaNIh169YBsHjxYmbMmAHQ67bL\n8dprr3HDDTfwy1/+kjVr1tDe3s5NN91U1ffBY/RmdkjYuXMnc+fOZe3atQwePJjnn3++x37Lly/n\nySefZOnSwq+wb9u2jRdeeIHx48czY8YMFi9ezMc+9jHuvffefcNC5W67J6tWreLZZ5/lzDPPBOD9\n99/njDPO6OOr/SAHvZll68UXX2Tw4MGMGjWK66+/nubmZtatW8eePXsYNmxYj+tEBN/5znc499xz\n/2zZ9OnTOeecc/jMZz7DxIkTaW5uBmD+/Pkltz1kyBD27Nmzb37vdfERwdlnn81dd91VjZfcIw/d\nmFmWuru7+epXv8rcuXORxLZt2xg9ejSDBg3izjvvZPfu3QAcffTRvPXWW/vWO/fcc7ntttvYuXMn\nAM8//zzbt28H4MMf/jDHHXccHR0d+4ZtgF63
Xay1tZW1a9eyZ88eNm7cyGOPPQbA5MmTefjhh+ns\n7ARg+/btFf2PoBw+ojezmqjHZbLvvvsukyZN2nd55WWXXca1114LwNe+9jW+/OUvc8cdd3Deeedx\n5JFHAjBx4kQGDx7MaaedxuWXX87VV1/Nhg0baGtrIyJoamri3nvv3fccM2bMoKOjgy996Uv72nrb\ndrEzzzyT8ePHc8opp3DyySfT1tYGQFNTE7fffjszZszgvffeA+CGG27gxBNPrNr74h81s4r5Onrr\nyfr16zn55JPrXUa2enp/y/1RMw/dmJllzkFvZpY5B72ZVc1AGArOUV/fVwe9mVXFsGHD2Lp1q8O+\nyvb+Hn1vl4OWw1fdmFlVtLS00NXVRXd3d71Lyc7eO0wdrJJBL+kkYElR0wnA/wTuSO2twAbg0oh4\nI91A/GbgAuAd4PKIWHPQFZpZQxg6dOhB3wHJaqucm4M/FxGTImIS8AkK4X0PhZt+r4yICcBK/nQT\n8POBCelvDnBbLQo3M7PyVDpGPwX4fUS8DEwDFqX2RcBFaXoacEcUrAJGSBpdlWrNzKxilQb9dGDv\nDzI0R8QrafpVoDlNjwE2Fq3TldrMzKwOyg56SYcBXwR+vP+yKJxmr+hUu6Q5klZLWu2TN2ZmtVPJ\nEf35wJqI2JzmN+8dkkmPW1L7JmBs0Xotqe0DImJBRLRHRHtTU1PllZuZWVkqCfoZ/GnYBmAZMCtN\nzwLuK2qfqYLJwLaiIR4zM+tnZV1HL+lI4GzgPxc1zwPuljQbeBm4NLXfT+HSyk4KV+hcUbVqzcys\nYmUFfURsB0bu17aVwlU4+/cN4MqqVGdmZn3mn0AwM8ucg97MLHMOejOzzDnozcwy56A3M8ucg97M\nLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3\nM8ucg97MLHMOejOzzJUV9JJGSFoq6XeS1ks6Q9KxklZIeiE9HpP6StItkjolPSmprbYvwczMDqTc\nI/qbgZ9HxEeB04D1QAewMiImACvTPMD5wIT0Nwe4raoVm5lZRUoGvaThwKeBhQAR8X5EvAlMAxal\nbouAi9L0NOCOKFgFjJA0uuqVm5lZWco5oh8PdAM/lPRbSd+XdCTQHBGvpD6vAs1pegywsWj9rtT2\nAZLmSFotaXV3d/fBvwIzMzugcoJ+CNAG3BYRHwe286dhGgAiIoCo5IkjYkFEtEdEe1NTUyWrmplZ\nBcoJ+i6gKyIeTfNLKQT/5r1DMulxS1q+CRhbtH5LajMzszooGfQR8SqwUdJJqWkK8CywDJiV2mYB\n96XpZcDMdPXNZGBb0RCPmZn1syFl9vs68C+SDgNeBK6g8I/E3ZJmAy8Dl6a+9wMXAJ3AO6mvmZnV\nSVlBHxFrgfYeFk3poW8AV/axLjMzqxJ/M9bMLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDno\nzcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMO\nejOzzJUV9JI2SHpK0lpJq1PbsZJWSHohPR6T2iXpFkmdkp6U1FbLF2BmZgdWyRH95yJiUkTsvXds\nB7AyIiYAK9M8wPnAhPQ3B7itWsWamVnl+jJ0Mw1YlKYXARcVtd8RBauAEZJG9+F5zMysD8oN+gCW\nS3pC0pzU1hwRr6TpV4HmND0G2Fi0bldq+wBJcyStlrS6u7v7IEo3M7NyDCmz319GxCZJo4AVkn5X\nvDAiQlJU8sQRsQBYANDe3l7RumZmVr6yjugjYlN63ALcA5wObN47JJMet6Tum4CxRau3pDYzM6uD\nkkEv6UhJR++dBs4BngaWAbNSt1nAfWl6GTAzXX0zGdhWNMRjZmb9rJyhm2bgHkl7+/8oIn4u6XHg\nbkmzgZeB
S1P/+4ELgE7gHeCKqldtZmZlKxn0EfEicFoP7VuBKT20B3BlVaozM7M+8zdjzcwy56A3\nM8ucg97MLHPlXkc/oLR2/Kym298w78Kabt/MrD/5iN7MLHMOejOzzDnozcwy56A3M8ucg97MLHMO\nejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzJUd9JIGS/qtpJ+m\n+fGSHpXUKWmJpMNS++FpvjMtb61N6WZmVo5KjuivBtYXzd8IzI+IjwBvALNT+2zgjdQ+P/UzM7M6\nKSvoJbUAFwLfT/MCPg8sTV0WARel6WlpnrR8SupvZmZ1UO4R/T8Bfw/sSfMjgTcjYlea7wLGpOkx\nwEaAtHxb6v8BkuZIWi1pdXd390GWb2ZmpZQMeklTgS0R8UQ1nzgiFkREe0S0NzU1VXPTZmZWpJx7\nxp4JfFHSBcAw4C+Am4ERkoako/YWYFPqvwkYC3RJGgIMB7ZWvXIzMytLySP6iPiHiGiJiFZgOvCr\niPhr4EHg4tRtFnBfml6W5knLfxURUdWqzcysbH25jv464FpJnRTG4Bem9oXAyNR+LdDRtxLNzKwv\nyhm62SciHgIeStMvAqf30GcHcEkVajMzsyrwN2PNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDcz\ny5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejN\nzDLnoDczy1zJoJc0TNJjktZJekbS9al9vKRHJXVKWiLpsNR+eJrvTMtba/sSzMzsQMo5on8P+HxE\nnAZMAs6TNBm4EZgfER8B3gBmp/6zgTdS+/zUz8zM6qRk0EfB22l2aPoL4PPA0tS+CLgoTU9L86Tl\nUySpahWbmVlFyhqjlzRY0lpgC7AC+D3wZkTsSl26gDFpegywESAt3waM7GGbcyStlrS6u7u7b6/C\nzMx6VVbQR8TuiJgEtACnAx/t6xNHxIKIaI+I9qampr5uzszMelHRVTcR8SbwIHAGMELSkLSoBdiU\npjcBYwHS8uHA1qpUa2ZmFSvnqpsmSSPS9BHA2cB6CoF/ceo2C7gvTS9L86Tlv4qIqGbRZmZWviGl\nuzAaWCRpMIV/GO6OiJ9KehZYLOkG4LfAwtR/IXCnpE7gdWB6Deo2M7MylQz6iHgS+HgP7S9SGK/f\nv30HcElVqjMzsz7zN2PNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejN\nzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy1w5NwcfK+lB\nSc9KekbS1an9WEkrJL2QHo9J7ZJ0i6ROSU9Kaqv1izAzs96Vc0S/C/i7iDgFmAxcKekUoANYGRET\ngJVpHuB8YEL6mwPcVvWqzcysbCWDPiJeiYg1afotYD0wBpgGLErdFgEXpelpwB1RsAoYIWl01Ss3\nM7OyDKmks6RW4OPAo0BzRLySFr0KNKfpMcDGotW6UtsrRW1ImkPhiJ/jjz++wrLNDk2tHT+r6fY3\nzLuwptu3+ij7ZKyko4B/Bb4REX8sXhYRAUQlTxwRCyKiPSLam5qaKlnVzMwqUFbQSxpKIeT/JSJ+\nkpo37x2SSY9bUvsmYGzR6i2pzczM6qCcq24ELATWR8RNRYuWAbPS9CzgvqL2menqm8nAtqIhHjMz\n62fljNGfCVwGPCVpbWr7b8A84G5Js4GXgUvTsvuBC4BO4B3giqpWbGZmFSkZ9BHxH4B6WTylh/4B\nXNnHuszMrEr8zVgzs8w56M3MMuegNzPLnIPezCxzDnozs8w56M3MMuegNzPLnIPezCxzDnozs8w5\n6M3MMuegNzPLnIPezCxzDnozs8w56M3MMuegNzPLXEU3Bzcz6wvf3Lw+fE
RvZpY5B72ZWebKuTn4\nDyRtkfR0UduxklZIeiE9HpPaJekWSZ2SnpTUVsvizcystHKO6G8HztuvrQNYGRETgJVpHuB8YEL6\nmwPcVp0yzczsYJUM+oj4NfD6fs3TgEVpehFwUVH7HVGwChghaXS1ijUzs8od7Bh9c0S8kqZfBZrT\n9BhgY1G/rtT2ZyTNkbRa0uru7u6DLMPMzErp88nYiAggDmK9BRHRHhHtTU1NfS3DzMx6cbBBv3nv\nkEx63JLaNwFji/q1pDYzM6uTgw36ZcCsND0LuK+ofWa6+mYysK1oiMfMzOqg5DdjJd0FfBY4TlIX\n8C1gHnC3pNnAy8Clqfv9wAVAJ/AOcEUNajYzswqUDPqImNHLoik99A3gyr4WZWZm1eNvxpqZZc5B\nb2aWOQe9mVnmHPRmZplz0JuZZc5Bb2aWOQe9mVnmHPRmZplz0JuZZc5Bb2aWOQe9mVnmSv7WjVlu\nWjt+VrNtb5h3Yc22bXawfERvZpY5B72ZWeYc9GZmmXPQm5llzkFvZpY5B72ZWeZqEvSSzpP0nKRO\nSR21eA4zMytP1YNe0mDgn4HzgVOAGZJOqfbzmJlZeWrxhanTgc6IeBFA0mJgGvBsDZ6rIdXyCzvg\nL+2Y1UqjfnYVEdXdoHQxcF5E/E2avwz4VETM3a/fHGBOmj0JeK6qhXzQccBrNdx+rbn++mnk2sH1\n11ut6x8XEU2lOtXtJxAiYgGwoD+eS9LqiGjvj+eqBddfP41cO7j+ehso9dfiZOwmYGzRfEtqMzOz\nOqhF0D8OTJA0XtJhwHRgWQ2ex8zMylD1oZuI2CVpLvALYDDwg4h4ptrPU6F+GSKqIddfP41cO7j+\nehsQ9Vf9ZKyZmQ0s/masmVnmHPRmZplz0JuZZc5Bb2aWuUMi6CXdUe8azPqDpNMlfTJNnyLpWkkX\n1Lsuq6/sbg4uaf9r9gV8TtIIgIj4Yv9XVR2SroiIH9a7jnJJ+ksKv330dEQsr3c95ZD0UWAM8GhE\nvF3Ufl5E/Lx+lZUm6VsUfkxwiKQVwKeAB4EOSR+PiP9V1wJLkPQpYH1E/FHSEUAH0Ebhd7L+MSK2\n1bXAEiRdBdwTERvrXcv+sru8UtIaCjvG94GgEPR3UfjiFhHx7/Wrrm8k/SEijq93Hb2R9FhEnJ6m\n/xa4ErgHOAf4t4iYV8/6Skkf1CuB9cAk4OqIuC8tWxMRbfWsrxRJT1Go+3DgVaClKDQfjYiJdS2w\nBEnPAKel7+IsAN4BlgJTUvuX6lpgCZK2AduB31PInB9HRHd9qyrI7ogeaAeuBv478M2IWCvp3UYJ\neElP9rYIaO7PWg7C0KLpOcDZEdEt6dvAKmBABz3wt8AnIuJtSa3AUkmtEXEzhfd/oNsVEbuBdyT9\nPiL+CBAR70raU+fayjEoInal6faif1j/Q9LaehVVgReBTwBfAP4KuF7SExRC/ycR8Va9Cssu6CNi\nDzBf0o/T42Ya63U2A+cCb+zXLuCR/i+nIoMkHUPh3I/2Hs1ExHZJuw686oAwaO9wTURskPRZCmE/\njsYI+vclfSgi3qEQOABIGg40QtA/XTQ8uU5Se0SslnQisLPexZUhUv4sB5ZLGkphKG0G8G2g5K9M\n1kojBWBFIqILuETShcAf611PBX4KHBURf3YEI+mh/i+nIsOBJyiEYkgaHRGvSDqKxgjKzZIm7X3v\n05H9VOAHwKn1La0sn46I92DfAc9eQ4FZ9SmpIn8D3Czpf1D4ad//J2kjsDEtG+g+sI9HxE4Kv/O1\nTNKH6lNSQXZj9Aci6ajiE2zWP9JO3hwRL9W7lgOR1EJh+OPVHpadGREP16GsqmikfV/SXwDjKRyI\ndkXE5jqXVBZJJ0bE8/WuoyeHWtAP6JOZpTTSh3V/jVw7ZFG/9/06qnf92Q3dSLq2t0XAUf1ZSw08\nCzTqh7WRa4cGqN/7/oBW1/qzC3rgH4
H/DfR08m/Af0GskT+sjVw7NH79eN+vq4Fcf45Bvwa4NyKe\n2H+BpEY4odPIH9ZGrh0av37v+/U1YOvPboxe0knA6z19UUFS80A/sSPpEeDrvXxYN0bE2B5WGxAa\nuXbIon7v+3U0kOvPLugbXSN/WBu5dmj8+htdo7//A7n+7II+fTnkH4CLgFEUfgZhC3AfMC8i3qxj\neWY1433fetMI416VupvCt0o/GxHHRsRI4HOp7e66VlYGScMlzZP0O0mvS9oqaX1qG1Hv+g6kkWuH\nxq8f7/t1NZDrzzHoWyPixuIvvUTEqxFxIzCujnWVq5E/rI1cOzR+/d7362vA1p/j0M1y4JfAor1j\nYpKagcsp/MjWF+pYXkmSnouIkypdNhA0cu2QRf3e9+toINef4xH9XwEjgX+X9Iak14GHgGOBS+tZ\nWJlelvT36QMKFD6skq6j8JsfA1kj1w6NX7/3/foasPVnF/QR8QbwQ2AuMDb9F+rkiLiOwk0wBrpG\n/rA2cu3Q4PV736+7AVt/jkM3DX3zCNh3l6MWYFU03l2OGrZ2aOz6ve/X34CtPyKy+gOeovAzvwCt\nwGoKOzzAb+tdXxn1XwU8B9wLbACmFS1bU+/6cq09k/q977v+Hv9y/AmERr95RCPf5aiRa4fGr9/7\nfn0N2PpzDPpGv3lEI39YG7l2aPz6ve/X14CtP7uTscBMCjdG3icidkXETODT9SmpIpslTdo7k3ac\nqcBxDPwPayPXDo1fv/f9+hqw9Wd3MrbRqYHvctTItUPj19/oGv39H8j1O+jNzDKX49CNmZkVcdCb\nmWXOQW+HJEm7Ja2V9IykdZL+TtIBPw+SWiV9pb9qNKsWB70dqt6NiEkR8THgbOB84Fsl1mkFHPTW\ncHwy1g5Jkt6OiKOK5k8AHqdwKdw44E7gyLR4bkQ8ImkVcDLwErAIuAWYB3wWOBz454j4br+9CLMy\nOejtkLR/0Ke2N4GTgLeAPRGxQ9IE4K6IaE9fgPmvETE19Z8DjIqIGyQdDjwMXBIRL/XrizErIcdv\nxpr11VDg1vTll93Aib30OweYKOniND8cmEDhiN9swHDQm7Fv6GY3hXusfgvYDJxG4TzWjt5WA74e\nEb/olyLNDpJPxtohT1IT8H+BW6MwljkceCUi9gCXAYNT17eAo4tW/QXwXyQNTds5UdKRmA0wPqK3\nQ9URktZSGKbZReHk601p2f8B/lXSTODnwPbU/iSwW9I64HbgZgpX4qyRJKAbuKi/XoBZuXwy1sws\ncx66MTPLnIPezCxzDnozs8w56M3MMuegNzPLnIPezCxzDnozs8z9f8zGHY6Yb9aNAAAAAElFTkSu\nQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f48069bd7b8>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"myChart.plot.bar()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Title\n",
"\n",
"Here I can add some comments on the chart.\n",
"1. First point\n",
"2. Second point"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lang Cleaning tools"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"detect_lang(\"Ceci est une phrase en français.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"detect_lang(\"This is an english sentence.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"chart(docs, \"language_iso2\").plot.bar()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Counter({'de': 13,\n",
" 'en': 1547,\n",
" 'es': 5,\n",
" 'fi': 1,\n",
" 'fr': 4,\n",
" 'hu': 1,\n",
" 'it': 1,\n",
" 'ja': 5,\n",
" 'ko': 1,\n",
" 'ru': 3,\n",
" 'zh': 23})"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Counter([doc.hyperdata[\"language_iso2\"] for doc in docs])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Count the documents to delete: those whose language is not `lang` (the .delete() call is still commented out)\n",
"def cleanCorpusWithLang(corpus_id, lang):\n",
" return (session.query(Node.id).filter(Node.parent_id == corpus_id)\n",
" .filter(Node.hyperdata[\"language_iso2\"].astext != lang)\n",
" .count()\n",
" #.delete()\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"57"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cleanCorpusWithLang(corpus_id, 'en')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(True, 'This is an english paragraph.\\n '),\n",
" (False, '\"This is an english paragraph.\\n\\nThis is an english paragraph.\\n ')]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"abstract0 = \"\"\"\"Ceci est un paragraphe en français.\n",
"\n",
"This is an english paragraph.\n",
" \"\"\"\n",
"\n",
"abstract1 = \"\"\"\"This is an english paragraph.\n",
"\n",
"This is an english paragraph.\n",
" \"\"\"\n",
"\n",
"def clean_lang_inText(lang, text):\n",
" \n",
" texts_before = nltk.tokenize.blankline_tokenize(text)\n",
" texts_after = '\\n\\n'.join([sentence \n",
" for sentence in texts_before\n",
" if detect_lang(sentence) == lang\n",
" ])\n",
" \n",
" return (len(texts_before) != len(nltk.tokenize.blankline_tokenize(texts_after)), texts_after)\n",
"\n",
"[clean_lang_inText('en', abstract) for abstract in [abstract0, abstract1]]\n",
"\n",
"# TODO update each document accordingly"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# TODO update all the abstracts with that function"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Measures IMT Tools"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"ename": "ConnectionError",
"evalue": "HTTPSConnectionPool(host='api.archives-ouvertes.fr', port=443): Max retries exceeded with url: /search?wt=json&q=machine+learning+AND+deep&fl=+title_s%0A+++++++++++++++%2C+abstract_s%0A+++++++++++++++%2C+submittedDate_s%0A+++++++++++++++%2C+journalDate_s%0A+++++++++++++++%2C+authFullName_s%0A+++++++++++++++%2C+uri_s%0A+++++++++++++++%2C+isbn_s%0A+++++++++++++++%2C+issue_s%0A+++++++++++++++%2C+docType_s%0A+++++++++++++++%2C+journalPublisher_s%0A+++++++++++++&start=1&rows=10 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mgaierror\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 141\u001b[0m conn = connection.create_connection(\n\u001b[0;32m--> 142\u001b[0;31m (self.host, self.port), self.timeout, **extra_kw)\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0merr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 67\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetaddrinfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mport\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSOCK_STREAM\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 68\u001b[0m \u001b[0maf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msocktype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcanonname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msa\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib/python3.5/socket.py\u001b[0m in \u001b[0;36mgetaddrinfo\u001b[0;34m(host, port, family, type, proto, flags)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0maddrlist\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 733\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32min\u001b[0m \u001b[0m_socket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetaddrinfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mport\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfamily\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 734\u001b[0m \u001b[0maf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msocktype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcanonname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msa\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mgaierror\u001b[0m: [Errno -3] Temporary failure in name resolution",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mNewConnectionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 577\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 578\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 579\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 350\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 351\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 352\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_validate_conn\u001b[0;34m(self, conn)\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sock'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# AppEngine might not have `.sock`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 814\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 815\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;31m# Add certificate verification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 150\u001b[0m raise NewConnectionError(\n\u001b[0;32m--> 151\u001b[0;31m self, \"Failed to establish a new connection: %s\" % e)\n\u001b[0m\u001b[1;32m 152\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNewConnectionError\u001b[0m: <requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mMaxRetryError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 402\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 403\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 404\u001b[0m )\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 622\u001b[0m retries = retries.increment(method, url, error=e, _pool=self,\n\u001b[0;32m--> 623\u001b[0;31m _stacktrace=sys.exc_info()[2])\n\u001b[0m\u001b[1;32m 624\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/packages/urllib3/util/retry.py\u001b[0m in \u001b[0;36mincrement\u001b[0;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnew_retry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_exhausted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 281\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mMaxRetryError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mResponseError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcause\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 282\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='api.archives-ouvertes.fr', port=443): Max retries exceeded with url: /search?wt=json&q=machine+learning+AND+deep&fl=+title_s%0A+++++++++++++++%2C+abstract_s%0A+++++++++++++++%2C+submittedDate_s%0A+++++++++++++++%2C+journalDate_s%0A+++++++++++++++%2C+authFullName_s%0A+++++++++++++++%2C+uri_s%0A+++++++++++++++%2C+isbn_s%0A+++++++++++++++%2C+issue_s%0A+++++++++++++++%2C+docType_s%0A+++++++++++++++%2C+journalPublisher_s%0A+++++++++++++&start=1&rows=10 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mConnectionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-16-b220cbbc8ecc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscan_hal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"machine learning AND deep\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/srv/gargantext/gargantext_notebook.py\u001b[0m in \u001b[0;36mscan_hal\u001b[0;34m(request)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mscan_hal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mhal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mHalCrawler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 62\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mhal\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscan_results\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 63\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mscan_gargantext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlang\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/srv/gargantext/gargantext/util/crawlers/HAL.py\u001b[0m in \u001b[0;36mscan_results\u001b[0;34m(self, query)\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresults_nb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 92\u001b[0;31m total = ( self._get(query)\n\u001b[0m\u001b[1;32m 93\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"response\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"numFound\"\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/srv/gargantext/gargantext/util/crawlers/HAL.py\u001b[0m in \u001b[0;36m_get\u001b[0;34m(self, query, fromPage, count, lang)\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mURL\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 71\u001b[0;31m \u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquerystring\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 72\u001b[0m )\n\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 57\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 473\u001b[0m }\n\u001b[1;32m 474\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 477\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 583\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 585\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 586\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mProxyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 466\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 467\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 468\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mClosedPoolError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mConnectionError\u001b[0m: HTTPSConnectionPool(host='api.archives-ouvertes.fr', port=443): Max retries exceeded with url: /search?wt=json&q=machine+learning+AND+deep&fl=+title_s%0A+++++++++++++++%2C+abstract_s%0A+++++++++++++++%2C+submittedDate_s%0A+++++++++++++++%2C+journalDate_s%0A+++++++++++++++%2C+authFullName_s%0A+++++++++++++++%2C+uri_s%0A+++++++++++++++%2C+isbn_s%0A+++++++++++++++%2C+issue_s%0A+++++++++++++++%2C+docType_s%0A+++++++++++++++%2C+journalPublisher_s%0A+++++++++++++&start=1&rows=10 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f48069d1f98>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))"
]
}
],
"source": [
"scan_hal(\"machine learning AND deep\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Request syntax\n",
"# \"network analysis\" = network <-> analysis\n",
"# \"network OR analysis\" = network | analysis\n",
"# \"network AND analysis\" = network & analysis\n",
"\n",
"scan_gargantext(corpus_id, 'english', \"machine | learning & deep\")\n",
"\n",
"# \"network NOT analysis\" = @@ to_tsquery('network') !! to_tsquery('analysis')\n",
"# (the function needs to be changed if NOT has to be used)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Strengths / weaknesses of IMT\n",
"# Hal Query Gargantext Query\n",
"queries = [ (\"network analysis\" , \"network <-> analysis\" )\n",
" , (\"big data AND something\" , \"(big <-> data) & something\")\n",
" ]\n",
"[(query[0], query[1]) for query in queries]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def imt_vs_hal(corpus_id, queryHal, queryGarg):\n",
" return((scan_gargantext(corpus_id, 'english', queryGarg), scan_hal(queryHal)))\n",
" #return((scan_gargantext(corpus_id, 'english', queryGarg) *100 / scan_hal(queryHal)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Then chart it to see your strengths and weaknesses!\n",
"[imt_vs_hal(corpus_id, query[0], query[1]) for query in queries]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Graph generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# TODO Cooccurrences optimization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# TODO optimize the distributional distance"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# List Management"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Front end: add a checkbox to choose between merging with and overwriting the previous list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# optimize the list merge"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -57,7 +57,7 @@
<center id="corpus" class="help">
<a data-toggle="modal" href="#addcorpus" >
<button
type="button"
......@@ -532,7 +532,7 @@
$("#submit_thing").html("Process a {{ query_size }} sample!")
thequeries = data
var N=0,k=0;
var N=0;
for(var i in thequeries) N += thequeries[i].count
if( N>0) {
......@@ -571,12 +571,11 @@
$("#submit_thing").html("Process a {{ query_size }} sample!")
thequeries = data
var N=data.length,k=0;
// for(var i in thequeries) N += thequeries[i].count
if( N>1) {
var total = JSON.parse(data).total
console.log("N: "+total)
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+total+" publications.</i><br>")
var N = data.total;
if (N > 0) {
console.log("N: "+N)
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications.</i><br>")
$('#submit_thing').prop('disabled', false);
} else {
$("#theresults").html("<i> <b>"+data[0]+"</b></i><br>")
......@@ -661,7 +660,7 @@
console.log(data)
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
......@@ -721,7 +720,7 @@
console.log(data)
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
......@@ -781,7 +780,7 @@
console.log(data)
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
......@@ -876,12 +875,12 @@
console.log("selected:", selectedId);
// by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN
if ( selectedId == "3"
|| selectedId == "8"
|| selectedId == "9"
|| selectedId == "10"
|| selectedId == "11"
|| selectedId == "12"
if ( selectedId == "3"
|| selectedId == "8"
|| selectedId == "9"
|| selectedId == "10"
|| selectedId == "11"
|| selectedId == "12"
) {
console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible");
......@@ -1019,16 +1018,16 @@
function saveMultivac(query, N){
console.log("In Multivac")
if(!query || query=="") return;
console.log(query)
//var origQuery = query
var data = { "query" : query , "N": N };
// Replace all the slashes
var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
console.log(data)
$.ajax({
dataType: 'json',
......@@ -1066,16 +1065,16 @@
function save(query, N, urlGarg){
console.log("In Gargantext")
if(!query || query=="") return;
console.log(query)
//var origQuery = query
var data = { "query" : query , "N": N };
// Replace all the slashes
var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
console.log(data)
$.ajax({
dataType: 'json',
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment