From fc15b8bc286141fd7fbd6ec068412c4ed87c95ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20Delano=C3=AB?= <devel+git@delanoe.org> Date: Mon, 7 Aug 2017 07:13:24 +0200 Subject: [PATCH] [FEAT] need to fix the crawler. --- AdvancedTutorial.ipynb | 764 ------------------------ gargantext/util/crawlers/HAL.py | 2 +- gargantext/util/parsers/HAL.py | 21 +- install/notebook.run | 2 +- install/notebook/gargantext_notebook.py | 54 +- 5 files changed, 64 insertions(+), 779 deletions(-) delete mode 100644 AdvancedTutorial.ipynb diff --git a/AdvancedTutorial.ipynb b/AdvancedTutorial.ipynb deleted file mode 100644 index c453e873..00000000 --- a/AdvancedTutorial.ipynb +++ /dev/null @@ -1,764 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Advanced Gargantext Tutorial (Python)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# First import the library Gargantext Notebook\n", - "from gargantext_notebook import *\n", - "\n", - "# This enables to draw graphics later\n", - "%matplotlib inline " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "L'identifiant du corpus est : 254749\n" - ] - } - ], - "source": [ - "# Copier/coller l'url du corpus (avec http://): sur lequel travailler\n", - "corpus_url = \"http://gargantext.org/projects/251737/corpora/254749\"\n", - "\n", - "corpus_id = corpus_url.split(\"/\")[6]\n", - "\n", - "print(\"L\\'identifiant du corpus est : %s\" % corpus_id)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# To get all the documents:\n", - "docs = documents(corpus_id)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Towards big data science in the decade ahead from ten years of InCoB and the 1st ISCB-Asia Joint Conference.'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# To get the title of the first document \n", - "# [0] indicates the index of the first document\n", - "docs[0].hyperdata['title']" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "\"The 2011 International Conference on Bioinformatics (InCoB) conference, which is the annual scientific conference of the Asia-Pacific Bioinformatics Network (APBioNet), is hosted by Kuala Lumpur, Malaysia, is co-organized with the first ISCB-Asia conference of the International Society for Computational Biology (ISCB). InCoB and the sequencing of the human genome are both celebrating their tenth anniversaries and InCoB's goalposts for the next decade, implementing standards in bioinformatics and globally distributed computational networks, will be discussed and adopted at this conference. Of the 49 manuscripts (selected from 104 submissions) accepted to BMC Genomics and BMC Bioinformatics conference supplements, 24 are featured in this issue, covering software tools, genome/proteome analysis, systems biology (networks, pathways, bioimaging) and drug discovery and design.\"" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# To get the abstract of the first document (0)\n", - "docs[0].hyperdata['abstract']" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Shoba Ranganathan, Christian Schönbach, Janet Kelso, Burkhard Rost, Sheila Nathan, Tin Wee Tan'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# To get the authors of the first document (0)\n", - "docs[0].hyperdata['authors']" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'BMC bioinformatics'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# To get the source of the first document (0)\n", - "docs[0].hyperdata['source']" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# If I want to count:\n", - "myChart = chart(docs, \"publication_year\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "<matplotlib.axes._subplots.AxesSubplot at 0x7fc48a3da128>" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEZCAYAAACZwO5kAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGjxJREFUeJzt3X2QVfWd5/H3hwfFqAOKTRdFo40JGs2KpNMxWI55Ij5b\ngUrUgWwJWsyw2Ug0OpuxZ3drU1a5U6Q2JaNx1g0JiWhNRMNEYRNNIEYnGV1UJO0jUTuKoSmFFpEo\nPvHw3T/uD3JlgHsvfS+n74/Pq6rrnvM7v3Pu9946/enTv3vOPYoIzMwsX4OKLsDMzBrLQW9mljkH\nvZlZ5hz0ZmaZc9CbmWXOQW9mlrmqgl7S1ZKekfS0pDskDZM0TtIjknok3SnpkNT30DTfk5a3N/IF\nmJnZvlUMekljgCuBzoj4D8BgYBrwbWBeRHwE2ATMSqvMAjal9nmpn5mZFaTaoZshwGGShgAfAl4B\nPg8sTssXAlPT9JQ0T1o+WZLqU66ZmdVqSKUOEbFO0neAPwLvAMuAx4E3ImJb6tYLjEnTY4C1ad1t\nkjYDI4HX9vYcxxxzTLS3t+/vazAzOyg9/vjjr0VES6V+FYNe0lGUjtLHAW8APwHO7W+BkmYDswGO\nPfZYVq5c2d9NmpkdVCS9XE2/aoZuvgC8FBF9EbEV+ClwBjAiDeUAtAHr0vQ6YGwqYggwHNi4+0Yj\nYn5EdEZEZ0tLxT9IZma2n6oJ+j8CkyR9KI21TwaeBR4ALkp9ZgJL0vTSNE9a/uvwN6eZmRWmYtBH\nxCOUPlRdBTyV1pkPXAtcI6mH0hj8grTKAmBkar8G6GpA3WZmViUNhIPtzs7O2H2MfuvWrfT29vLu\nu+8WVFW+hg0bRltbG0OHDi26FDPrB0mPR0RnpX4VP4wtSm9vL0ceeSTt7e347Mz6iQg2btxIb28v\n48aNK7ocMzsABuxXILz77ruMHDnSIV9nkhg5cqT/UzI7iAzYoAcc8g3i99Xs4DKgg97MzPpvwI7R\n76696+d13d6auRdU7DN48GBOOeUUtm7dypAhQ5gxYwZXX301gwbt/e/jmjVrePjhh/nKV76yz20f\nf/zx3HfffZx44om72r7xjW8wevRorr322r1u+8ILL+Tpp5+uWLvZQFTv3+PdVfN7fTDyEf0+HHbY\nYXR3d/PMM8+wfPly7rvvPq677rp9rrNmzRp+/OMfV9z2tGnTWLRo0a75HTt2sHjxYqZNm9bvus3M\nyjnoqzRq1Cjmz5/PzTffTESwZs0azjzzTDo6Oujo6ODhhx8GoKuri9/+9rdMnDiRefPmsX37dr75\nzW/yyU9+kgkTJvC9730PgOnTp3PnnXfu2v5vfvMbjjvuOI477ri9brvcrbfeypw5c3bNX3jhhTz4\n4IMALFu2jNNPP52Ojg4uvvhi3nrrrQa+M2Y20Dnoa3D88cezfft2NmzYwKhRo1i+fDmrVq3izjvv\n5MorrwRg7ty5nHnmmXR3d3P11VezYMEChg8fzmOPPcZjjz3G97//fV566SVOOeUUBg0axBNPPAHA\nokWLmD59OsBet12N1157jeuvv55f/epXrFq1is7OTm644Yb6vxlm1jSaZox+oNm6dStz5syhu7ub\nwYMH8/zzz++x37Jly3jyySdZvLj0jc6bN2/mhRdeYNy4cUyfPp1FixbxsY99jHvuuWfXsFC1296T\nFStW8Oyzz3LGGWcA8P7773P66af389WaWTNz0NfgxRdfZPDgwYwaNYrrrruO1tZWnnjiCXbs2MGw\nYcP2uE5E8N3vfpdzzjnn3y2bNm0aZ599Np/5zGeYMGECra2tAMybN6/itocMGcKOHTt2ze88Lz4i\nOOuss7jjjjvq8ZLNLAMeuqlSX18fX/3qV5kzZw6S2Lx5M6NHj2bQoEHcfvvtbN++HYAjjzySN998\nc9d655xzDrfccgtbt24F4Pnnn2fLli0AfPjDH+aYY46hq6tr17ANsNdtl2tvb6e7u5sdO3awdu1a\nHn30UQAmTZrEQw89RE9PDwBbtmyp6T8CM8tP0xzRF3Ha1DvvvMPEiRN3nV556aWXcs011wDwta99\njS9/+cvcdtttnHvuuRx++OEATJgwgcGDB3Pqqady2WWXcdVVV7FmzRo6OjqICFpaWrjnnnt2Pcf0\n6dPp6uriS1/60q62vW273BlnnMG4ceM4+eSTOemkk+jo6ACgpaWFW2+9lenTp/Pee+8BcP3113PC\nCSc07H0ys4FtwH6p2erVqznppJMKqih/fn+tCD6Pvr6q/VIzD92YmWXOQW9mlrkBHfQDYVgpR35f\nzQ4uAzbohw0bxsaNGx1Kdbbz++j3djqomeVnwJ5109bWRm9vL319fUWXkp2dd5gys4NDxaCXdCJw\nZ1nT8cD/AG5L7e3AGuCSiNiUbiB+I3A+8DZwWUSsqrWwoUOH+g5IZmZ1UM3NwZ+LiIkRMRH4BKXw\nvpvSTb/vj4jxwP38+Sbg5wHj089s4JZGFG5mZtWpdYx+MvCHiHgZmAIsTO0LgalpegpwW5SsAEZI\nGl2Xas3MrGa1Bv00YOeXqLRGxCtp+lWgNU2PAdaWrdOb2szMrABVB72kQ4AvAj/ZfVmUTo2p6fQY\nSbMlrZS00h+4mpk1Ti1H9OcBqyJifZpfv3NIJj1uSO3rgLFl67Wltg+IiPkR0RkRnS0tLbVXbmZm\nVakl6Kfz52EbgKXAzDQ9E1hS1j5DJZOAzWVDPGZmdoBVdR69pMOBs4D/VNY8F7hL0izgZeCS1H4v\npVMreyidoXN53ao1M7OaVRX0EbEFGLlb20ZKZ+Hs3jeAK+pSnZmZ9duA/QoEMzOrDwe9mVnmHPRm\nZplz0JuZZc5Bb2aWOQe9mVnmHPRmZplz0JuZZc5Bb2aWOQe9mVnmHPRmZplz0JuZZc5Bb2aWOQe9\nmVnmHPRmZplz0JuZZc5Bb2aWOQe9mVnmHPRmZpmrKugljZC0WNLvJa2WdLqkoyUtl/RCejwq9ZWk\nmyT1SHpSUkdjX4KZme1LtUf0NwK/iIiPAqcCq4Eu4P6IGA/cn+YBzgPGp5/ZwC11rdjMzGpSMegl\nDQc+DSwAiIj3I+INYAqwMHVbCExN01OA26JkBTBC0ui6V25mZlWp5oh+HNAH/EjS7yT9QNLhQGtE\nvJL6vAq0pukxwNqy9XtT2wdImi1ppaSVfX19+/8KzMxsn6oJ+iFAB3BLRHwc2MKfh2kAiIgAopYn\njoj5EdEZEZ0tLS21rGpmZjWoJuh7gd6IeCTNL6YU/Ot3Dsmkxw1p+TpgbNn6banNzMwKUDHoI+JV\nYK2kE1PTZOBZYCkwM7XNBJak6aXAjHT2zSRgc9kQj5mZHWBDquz3deCfJR0CvAhcTumPxF2SZgEv\nA5ekvvcC5wM9wNupr5mZFaSqoI+IbqBzD4sm76FvAFf0sy4zM6sTXxlrZpY5B72ZWeYc9GZmmXPQ\nm5llzkFvZpY5B72ZWeYc9GZmmXPQm5llzkFvZpY5B72ZWeYc9GZmmXPQm5llzkFvZpY5B72ZWeYc\n9GZmmXPQm5llzkFvZpY5B72ZWeaqCnpJayQ9Jalb0srUdrSk5ZJeSI9HpXZJuklSj6QnJXU08gWY\nmdm+1XJE/7mImBgRO+8d2wXcHxHjgfvTPMB5wPj0Mxu4pV7FmplZ7fozdDMFWJimFwJTy9pvi5IV\nwAhJo/vxPGZm1g/VBn0AyyQ9Lml2amuNiFfS9KtAa5oeA6wtW7c3tX2ApNmSVkpa2dfXtx+lm5lZ\nNYZU2e8vI2KdpFHAckm/L18YESEpanniiJgPzAfo7OysaV0zM6teVUf0EbEuPW4A7gZOA9bvHJJJ\njxtS93XA2LLV21KbmZkVoGLQSzpc0pE7p4GzgaeBpcDM1G0msCRNLwVmpLNvJgGby4Z4zMzsAKtm\n6KYVuFvSzv4/johfSHoMuEvSLOBl4JLU/17gfKAHeBu4vO5Vm5lZ1SoGfUS8CJy6h/aNwOQ9tAdw\nRV2qMzOzfvOVsWZmmXPQm5llzkFvZpa5as+jN8tGe9fPG7btNXMvaNi2zfaXj+jNzDLnoDczy5yD\n3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejNzDLnoDczy5yD3swscw56M7PMOejNzDLn\noDczy1zVQS9psKTfSfpZmh8n6RFJPZLulHRIaj80zfek5e2NKd3MzKpRyxH9VcDqsvlvA/Mi4iPA\nJmBWap8FbErt81I/MzMrSFVBL6kNuAD4QZoX8HlgceqyEJiapqekedLyyam/mZkVoNoj+n8E/g7Y\nkeZHAm9ExLY03wuMSdNjgLUAafnm1P8DJM2WtFLSyr6+vv0s38zMKqkY9JIuBDZExOP1fOKImB8R\nnRHR2dLSUs9Nm5lZmWruGXsG8EVJ5wPDgL8AbgRGSBqSjtrbgHWp/zpgLNAraQgwHNhY98rNzKwq\nFY/oI+LvI6ItItqBacCvI+I/Ag8AF6VuM4ElaXppmict/3VERF2rNjOzqvXnPPprgWsk9VAag1+Q\n2hcAI1P7NUBX/0o0M7P+qGboZpeIeBB4ME2/CJy2hz7vAhfXoTYzM6sDXxlrZpY5B72ZWeYc9GZm\nmXPQm5llzkFvZpY5B72ZWeYc9GZmmXPQm5llzkFvZpY5B72ZWeYc9GZmmXPQm5llzkFvZpY5B72Z\nWeYc9GZmmXPQm5llzkFvZpY5B72ZWeYqBr2kYZIelfSEpGckXZfax0l6RFKPpDslHZLaD03zPWl5\ne2NfgpmZ7Us1R/TvAZ+PiFOBicC5kiYB3wbmRcRHgE3ArNR/FrAptc9L/czMrCAVgz5K3kqzQ9NP\nAJ8HFqf2hcDUND0lzZOWT5akulVsZmY1qWqMXtJgSd3ABmA58AfgjYjYlrr0AmPS9BhgLUBavhkY\nuYdtzpa0UtLKvr6+/r0KMzPbq6qCPiK2R8REoA04Dfhof584IuZHRGdEdLa0tPR3c2Zmthc1nXUT\nEW8ADwCnAyMkDUmL2oB1aXodMBYgLR8ObKxLtWZmVrNqzrppkTQiTR8GnAWsphT4F6VuM4ElaXpp\nmict/3VERD2LNjOz6g2p3IXRwEJJgyn9YbgrIn4m6VlgkaTrgd8BC1L/BcDtknqA14FpDajbzMyq\nVDHoI+JJ4ON7aH+R0nj97u3vAhfXpTozM+s3XxlrZpY5B72ZWeYc9GZmmXPQm5llzkFvZpY5B72Z\nWeYc9GZmmXPQm5llzkFvZpY5B72ZWeYc9GZmmXPQm5llzkFvZpY5B72ZWeYc9GZmmXPQm5llzkFv\nZpY5B72ZWeaquTn4WEkPSHpW0jOSrkrtR0taLumF9HhUapekmyT1SHpSUkejX4SZme1dNUf024C/\njYiTgUnAFZJOBrqA+yNiPHB/mgc4DxiffmYDt9S9ajMzq1rFoI+IVyJiVZp+E1gNjAGmAAtTt4XA\n1DQ9BbgtSlYAIySNrnvlZmZWlSG1dJbUDnwceARojYhX0qJXgdY0PQZYW7Zab2p7pawNSbMpHfFz\n7LHH1lR0e9fPa+pfqzVzL2jo9s3MDqSqP4yVdATwL8A3IuJP5csiIoCo5YkjYn5EdEZEZ0tLSy2r\nmplZDaoKeklDKYX8P0fET1Pz+p1DMulxQ2pfB4wtW70ttZmZWQGqOetGwAJgdUTcULZoKTAzTc8E\nlpS1z0hn30wCNpcN8ZiZ2QFWzRj9GcClwFOSulPbfwXmAndJmgW8DFySlt0LnA/0AG8Dl9e1YjMz\nq0nFoI+IfwO0l8WT99A/gCv6WZeZmdWJr4w1M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMO\nejOzzDnozcwy56A3M8ucg97MLHMOejOzzDnozcwy56A3M8ucg97MLHMOejOzzNV0c3Az8M3ZzZqN\nj+jNzDLnoDczy1w1Nwf/oaQNkp4uazta0nJJL6THo1K7JN0kqUfSk5I6Glm8mZlVVs0R/a3Aubu1\ndQH3R8R44P40D3AeMD79zAZuqU+ZZma2vyoGfUT8Bnh9t+YpwMI0vRCYWtZ+W5SsAEZIGl2vYs3M\nrHb7O0bfGhGvpOlXgdY0PQZYW9avN7X9O5JmS1opaWVfX99+lmFmZpX0+8PYiAgg9mO9+RHRGRGd\nLS0t/S3DzMz2Yn+Dfv3OIZn0uCG1rwPGlvVrS21mZlaQ/Q36pcDMND0TWFLWPiOdfTMJ2Fw2xGNm\nZgWoeGWspDuAzwLHSOoFvgXMBe6SNAt4Gbgkdb8XOB/oAd4GLm9AzWZmVoOKQR8R0/eyaPIe+gZw\nRX+LMjOz+vGVsWZmmXPQm5llzkFvZpY5B72ZWeYc9GZmmXPQm5llzkFvZpY5B72ZWeYc9GZmmXPQ\nm5llzkFvZpa5it91Y/XX3vXzhm5/zdwLGrp9M2suPqI3M8ucg97MLHMOejOzzDnozcwy56A3M8uc\ng97MLHMNCXpJ50p6TlKPpK5GPIeZmVWn7kEvaTDwT8B5wMnAdEkn1/t5zMysOo24YOo0oCciXgSQ\ntAiYAjzbgOcyO6j4YrtiNev7r4io7wali4BzI+Kv0/ylwKciYs5u/WYDs9PsicBzdS3kg44BXmvg\n9hvN9RenmWsH11+0Rtd/XES0VOpU2FcgRMR8YP6BeC5JKyOi80A8VyO4/uI0c+3g+os2UOpvxIex\n64CxZfNtqc3MzArQiKB/DBgvaZykQ4BpwNIGPI+ZmVWh7kM3EbFN0hzgl8Bg4IcR8Uy9n6dGB2SI\nqIFcf3GauXZw/UUbEPXX/cNYMzMbWHxlrJlZ5hz0ZmaZc9CbmWXOQW9mlrksg17SaZI+maZPlnSN\npPOLrsuaj6Tbiq7BrL+yuzm4pG9R+kK1IZKWA58CHgC6JH08Iv5noQVWQdJHgTHAIxHxVln7uRHx\ni+Iq2zdJnwJWR8SfJB0GdAEdlL7n6B8iYnOhBVYgaffrPQR8TtIIgIj44oGvqj4kXR4RPyq6jlpI\n+ktK3531dEQsK7qeSiRdCdwdEWuLrmV32Z1eKekpYCJwKPAq0FYWPI9ExIRCC6wg7SxXAKspvY6r\nImJJWrYqIjqKrG9fJD0DnJqupZgPvA0sBian9i8VWmAFklZR+qP0AyAoBf0dlC76IyL+tbjq+kfS\nHyPi2KLr2BdJj0bEaWn6byj9HtwNnA3834iYW2R9lUjaDGwB/kBpv/lJRPQVW1VJdkf0wLaI2A68\nLekPEfEngIh4R9KOgmurxt8An4iItyS1A4sltUfEjZSCZyAbFBHb0nRn2R+lf5PUXVRRNegErgL+\nG/DNiOiW9E6zBLykJ/e2CGg9kLXsp6Fl07OBsyKiT9J3gBXAgA564EXgE8AXgL8CrpP0OKXQ/2lE\nvFlUYTkG/fuSPhQRb1N60wGQNBxohqAftHO4JiLWSPospbA/joEf9E+XDRE8IakzIlZKOgHYWnRx\nlUTEDmCepJ+kx/U01+9IK3AOsGm3dgEPH/hyajZI0lGUPjvUzqPhiNgiadu+Vx0QIu1Dy4BlkoZS\nGkaeDnwHqPgtk43STDtxtT4dEe/Brl/cnYYCM4spqSbrJU2MiG6AdGR/IfBD4JRiS6vor4EbJf13\nSl/N+v8krQXWpmVNISJ6gYslXQD8qeh6avAz4Iid+045SQ8e+HJqNhx4nNIfppA0OiJekXQEA/8g\nB3arMSK2Uvqer6WSPlRMSSXZjdHvi6Qjyj/cHIgktVEafnp1D8vOiIiHCiirJpL+AhhH6UCiNyLW\nF1xSvzXDvpOrFJKtEfFS0bXsi6QTIuL5ouvYk4Mt6Af8B1L70sxh08y1g/edorn+/slu6EbSNXtb\nBBxxIGtpgGeBZg2bAV+7950BzfX3Q3ZBD/wD8L+APX14M+AvEGvmsGnm2hPvOwVy/Y2TY9CvAu6J\niMd3XyCpGT4QbOawaebawftO0Vx/g2Q3Ri/pROD1PV2oIKl1oH8wKOlh4Ot7CZu1ETF2D6sNCM1c\nO3jfKZrrb5zsgr7ZNXPYNHPtOWj299/1N052QZ8ujPp7YCowitKl7BuAJcDciHijwPJsAPO+Y7lq\nhnGvWt1F6crAz0bE0RExEvhcarur0MqqIGm4pLmSfi/pdUkbJa1ObSOKrm9fmrn2xPtOgVx/4+QY\n9O0R8e3yC44i4tWI+DZwXIF1VauZw6aZawfvO0Vz/Q2S49DNMuBXwMKdY2KSWoHLKH1J0hcKLK8i\nSc9FxIm1LhsImrl28L5TNNffODke0f8VMBL4V0mbJL0OPAgcDVxSZGFVelnS36WAAUphI+laSt8Z\nM5A1c+3gfadorr9Bsgv6iNgE/AiYA4xN/0KdFBHXUrqJwUDXzGHTzLV73yme62+QHIdumvbGHTup\ndIepNmBFNNEdpqDpa/e+UzDX3yARkdUP8BSlr2oFaAdWUvqFBfhd0fVVUf+VwHPAPcAaYErZslVF\n15dr7d53iv9x/Y37yfErEJr5xh3Q3HeYaubawftO0Vx/g+QY9M184w5o7rBp5trB+07RXH+jCivy\nyRtkBqWbgu8SEdsiYgbw6WJKqsl6SRN3zqQd50LgGAZ+2DRz7eB9p2iuv0Gy+zC22amJ7zDVzLXn\noNnff9ffOA56M7PM5Th0Y2ZmZRz0ZmaZc9DbQUnSdkndkp6R9ISkv5W0z98HSe2SvnKgajSrFwe9\nHazeiYiJEfEx4CzgPOBbFdZpBxz01nT8YawdlCS9FRFHlM0fDzxG6VS444DbgcPT4jkR8bCkFcBJ\nwEvAQuAmYC7wWeBQ4J8i4nsH7EWYVclBbwel3YM+tb0BnAi8CeyIiHcljQfuiIjOdAHMf4mIC1P/\n2cCoiLhe0qHAQ8DFEfHSAX0xZhXkeGWsWX8NBW5OF79sB07YS7+zgQmSLkrzw4HxlI74zQYMB70Z\nu4ZutlO6R+y3gPXAqZQ+x3p3b6sBX4+IXx6QIs32kz+MtYOepBbg/wA3R2ksczjwSkTsAC4FBqeu\nbwJHlq36S+A/SxqatnOCpMMxG2B8RG8Hq8MkdVMaptlG6cPXG9Ky/w38i6QZwC+ALan9SWC7pCeA\nW4EbKZ2Js0qSgD5g6oF6AWbV8oexZmaZ89CNmVnmHPRmZplz0JuZZc5Bb2aWOQe9mVnmHPRmZplz\n0JuZZe7/A6+cHY7zduzoAAAAAElFTkSuQmCC\n", - "text/plain": [ - "<matplotlib.figure.Figure at 0x7fc48a441a58>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "myChart.plot.bar()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "## Title\n", - "\n", - "Here I can add some comments on the cart.\n", - "1. First point\n", - "2. Second point" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "# Lang Cleaning tools" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'fr'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "detect_lang(\"Ceci est une phrase en français.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'en'" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "detect_lang(\"This is an english sentence.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "<matplotlib.axes._subplots.AxesSubplot at 0x7fc487e01e80>" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAENCAYAAAAG6bK5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGuhJREFUeJzt3X+8VXWd7/HXm4OKmqMoB64Beg4NOiGBnTkqXMfJMkHL\nG95u9uDYD6YsKiX7dSuamcf12tTjYc0oZRYTJoFeA31YKbecUTQbTa8KElAq6rmIcbgax1+EFsmP\nz/1jfQ9ujucXe5+9t/B9Px+P82Cv7/qu9fluOOz3Xt+19l6KCMzMLD9D6j0AMzOrDweAmVmmHABm\nZplyAJiZZcoBYGaWKQeAmVmmHABmZplyAJiZZcoBYGaWqX4DQNJCSZsl/bZb+6clrZP0sKRvlrR/\nRVK7pMckTS9pPyu1tUuaO7hPw8zM9pb6+yoISX8LvARcGxETU9vbgX8A3h0Rf5Y0MiI2S5oALAFO\nBt4I3AEcl3b1OHAm0AGsANoi4pG+ao8YMSKamprKfW5mZll66KGHno2Ixv76De2vQ0TcLampW/On\ngMsi4s+pz+bUPgNYmtqflNROEQYA7RGxHkDS0tS3zwBoampi5cqV/Q3RzMxKSHpqIP3KPQdwHHCa\npAck/Yekk1L7aGBjSb+O1NZbu5mZ1Um/RwB9bHckMAU4CbhR0rjBGJCk2cBsgGOOOWYwdmlmZj0o\n9wigA/hJFB4EdgEjgE3A2JJ+Y1Jbb+2vERELIqI1IlobG/udwjIzszKVewRwM/B24C5JxwEHAs8C\ny4AfSbqC4iTweOBBQMB4Sc0UL/wzgfMrHLuZ7QO2b99OR0cH27Ztq/dQ9jvDhg1jzJgxHHDAAWVt\n328ASFoCnA6MkNQBXAIsBBamS0NfAWZFcTnRw5JupDi5uwO4KCJ2pv3MAW4DGoCFEfFwWSM2s31K\nR0cHhx12GE1NTUiq93D2GxHBc889R0dHB83NzWXtYyBXAbX1suqDvfT/OvD1HtpvBW7dq9GZ2T5v\n27ZtfvGvAkkcddRRdHZ2lr0PfxLYzKrOL/7VUenfqwPAzPZ7DQ0NnHjiiZxwwglMnjyZyy+/nF27\ndvW5zYYNG/jRj37U777HjRvHY489tkfbZz/7Wb7xjW/0ue+JEycObPBVVO5JYKuTprk/L3vbDZe9\nexBHYlaeSn6HezKQ3+uDDz6Y1atXA7B582bOP/98/vCHP3DppZf2vt8UAOef3/f1KjNnzmTp0qVc\ncsklAOzatYubbrqJe++9dy+eRX34CMDMsjJy5EgWLFjAVVddRUSwYcMGTjvtNFpaWmhpaeG+++4D\nYO7cudxzzz2ceOKJzJs3j507d/LFL36Rk046iUmTJvH9738fgLa2Nm644Ybd+7/77rs59thjOfbY\nY3vdd6lFixYxZ86c3cvnnHMOv/zlLwG4/fbbmTp1Ki0tLZx33nm89NJLg/p34QAws+yMGzeOnTt3\nsnnzZkaOHMny5ctZtWoVN9xwAxdffDEAl112GaeddhqrV6/mc5/7HNdccw2HH344K1asYMWKFVx9\n9dU8+eSTvOUtb2HIkCGsWbMGgKVLl9LWVlw709u+B+LZZ5/la1/7GnfccQerVq2itbWVK664YlD/\nHjwFZGZZ2759O3PmzGH16tU0NDTw+OOP99jv9ttvZ+3atdx0000AbNmyhSeeeILm5mba2tpYunQp\nJ5xwAjfffPPuqaWB7rsn999/P4888ginnnoqAK+88gpTp06t8NnuyQFgZtlZv349DQ0NjBw5kksv\nvZRRo0axZs0adu3axbBhw3rcJiL4zne+w/Tp01+zbubMmUybNo23ve1tTJo0iVGjRgEwb968fvc9\ndOjQPU5Id31gLiI488wzWbJkyWA85R55CsjMstLZ2cknP/lJ5syZgyS2bNnC0UcfzZAhQ7juuuvY\nuXMnAIcddhhbt27dvd306dOZP38+27dvB+Dxxx/n5ZdfBuBNb3oTI0aMYO7cubunf4Be912qqamJ\n1atXs2vXLjZu3MiDDz4IwJQpU7j33ntpb28H4OWXX96rI4iBcACY2X7vT3/60+7LQN/5zncybdq0\n3VftXHjhhSxevJjJkyezbt06Dj30UAAmTZpEQ0MDkydPZt68eXzsYx9jwoQJtLS0MHHiRD7xiU+w\nY8eO3TXa2tpYt24d733ve3e39bbvUqeeeirNzc1MmDCBiy++mJaWFgAaGxtZtGgRbW1tTJo0ialT\np7Ju3bpB/Xvp94Yw9dTa2hq+H8CefBmo7WseffRR3vzmN9d7GPutnv5+JT0UEa39besjADOzTDkA\nzMwy5QAwM8uUA8DMqu71fK5xX1bp36sDwMyqatiwYTz33HMOgUHWdT+A3j63MBD+IJiZVdWYMWPo\n6Oio6HvrrWdddwQrlwPAzKrqgAMOKPuOVVZdngIyM8tUvwEgaaGkzen+v93XfUFSSBqRliXpSknt\nktZKainpO0vSE+ln1uA+DTMz21sDOQJYBJzVvVHSWGAa8LuS5rOB8elnNjA/9T2S4mbypwAnA5dI\nGl7JwM3MrDL9BkBE3A0838OqecCXgNJT+zOAa6NwP3CEpKOB6cDyiHg+Il4AltNDqJiZWe2UdQ5A\n0gxgU0Ss6bZqNLCxZLkjtfXWbmZmdbLXVwFJOgT4e4rpn0EnaTbF9BHHHHNMNUqYmRnlHQG8CWgG\n1kjaAIwBVkn6T8AmYGxJ3zGprbf214iIBRHRGhGtjY2NZQzPzMwGYq8DICJ+ExEjI6IpIpoopnNa\nIuIZYBnw4XQ10BRgS0Q8DdwGTJM0PJ38nZbazMysTgZyGegS4P8Ax0vqkHRBH91vBdYD7cDVwIUA\nEfE88E/AivTz1dRmZmZ10u85gIho62d9U8njAC7qpd9CYOFejs/MzKrEnwQ2M8uUA8DMLFMOADOz\nTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DM\nLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMDuSfwQkmbJf22pO2fJa2TtFbSTyUdUbLuK5LaJT0maXpJ\n+1mprV3S3MF/KmZmtjcGcgSwCDirW9tyYGJETAIeB74CIGkCMBM4IW3zPUkNkhqA7wJnAxOAttTX\nzMzqpN8AiIi7gee7td0eETvS4v3AmPR4BrA0Iv4cEU8C7cDJ6ac9ItZHxCvA0tTXzMzqZDDOAXwU\n+Lf0eDSwsWRdR2rrrf01JM2WtFLSys7OzkEYnpmZ9aSiAJD0D8AO4PrBGQ5ExIKIaI2I1sbGxsHa\nrZmZdTO03A0l/R1wDnBGRERq3gSMLek2JrXRR7uZmdVBWUcAks4CvgS8JyL+WLJqGTBT0kGSmoHx\nwIPACmC8pGZJB1KcKF5W2dDNzKwS/R4BSFoCnA6MkNQBXEJx1c9BwHJJAPdHxCcj4mFJNwKPUEwN\nXRQRO9N+5gC3AQ3Awoh4uArPx8zMBqjfAIiIth6ar+mj/9eBr/fQfitw616NzszMqsafBDYzy5QD\nwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLl\nADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy1S/ASBpoaTNkn5b0nakpOWSnkh/Dk/tknSlpHZJayW1\nlGwzK/V/QtKs6jwdMzMbqIEcASwCzurWNhe4MyLGA3emZYCzKW4EPx6YDcyHIjAo7iV8CnAycElX\naJiZWX30GwARcTfwfLfmGcDi9HgxcG5J+7VRuB84QtLRwHRgeUQ8HxEvAMt5baiYmVkNlXsOYFRE\nPJ0ePwOMSo9HAxtL+nWktt7azcysTio+CRwRAcQgjAUASbMlrZS0srOzc7B2a2Zm3ZQbAL9PUzuk\nPzen9k3A2JJ+Y1Jbb+2vERELIqI1IlobGxvLHJ6ZmfWn3ABYBnRdyTMLuKWk/cPpaqApwJY0VXQb\nME3S8HTyd1pqMzOzOhnaXwdJS4DTgRGSOiiu5rkMuFHSBcBTwPtT91uBdwHtwB+BjwBExPOS/glY\nkfp9NSK6n1g2M7Ma6jcAIqKtl1Vn9NA3gIt62c9CYOFejc7MzKrGnwQ2M8uUA8DMLFMOADOzTDkA\nzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMO\nADOzTDkAzMwy5QAwM8uUA8DMLFMVBYCkz0l6WNJvJS2RNExSs6QHJLVLukHSganvQWm5Pa1vGown\nYGZm5Sk7ACSNBi4GWiNiItAAzAS+AcyLiL8EXgAuSJtcALyQ2uelfmZmVieVTgENBQ6WNBQ4BHga\neAdwU1q/GDg3PZ6Rlknrz5CkCuubmVmZyg6AiNgE/AvwO4oX/i3AQ8CLEbEjdesARqfHo4GNadsd\nqf9R5dY3M7PKVDIFNJziXX0z8EbgUOCsSgckabaklZJWdnZ2Vro7MzPrRSVTQO8EnoyIzojYDvwE\nOBU4Ik0JAYwBNqXHm4CxAGn94cBz3XcaEQsiojUiWhsbGysYnpmZ9aWSAPgdMEXSIWku/wzgEeAu\n4H2pzyzglvR4WVomrf9FREQF9c3MrAKVnAN4gOJk7irgN2lfC4AvA5+X1E4xx39N2uQa4KjU/nlg\nbgXjNjOzCg3tv0vvIuIS4JJuzeuBk3vouw04r5J6ZmY2ePxJYDOzTDkAzMwy5QAwM8uUA8DMLFMO\nADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uU\nA8DMLFMOADOzTDkAzMwyVVEASDpC0k2S1kl6VNJUSUdKWi7pifTn8NRXkq6U1C5praSWwXkKZmZW\njkqPAL4N/HtE/BUwGXiU4mbvd0bEeOBOXr35+9nA+PQzG5hfYW0zM6tA2QEg6XDgb4FrACLilYh4\nEZgBLE7dFgPnpsczgGujcD9whKSjyx65mZlVpJIjgGagE/ihpF9L+oGkQ4FREfF06vMMMCo9Hg1s\nLNm+I7WZmVkdVBIAQ4EWYH5EvBV4mVenewCIiABib3YqabaklZJWdnZ2VjA8MzPrSyUB0AF0RMQD\nafkmikD4fdfUTvpzc1q/CRhbsv2Y1LaHiFgQEa0R0drY2FjB8MzMrC9lB0BEPANslHR8ajoDeARY\nBsxKbbOAW9LjZcCH09VAU4AtJVNFZmZWY0Mr3P7TwPWSDgTWAx+hCJUbJV0APAW8P/W9FXgX0A78\nMfU1M7M6qSgAImI10NrDqjN66BvARZXUMzOzweNPApuZZcoBYGaWKQeAmVmmHABmZplyAJiZZcoB\nYGaWKQeAmVmmHABmZplyAJiZZcoBYGaWKQeAmVmmHABmZplyAJiZZcoBYGaWKQeAmVmmHABmZply\nAJiZZcoBYGaWqYoDQFKDpF9L+llabpb0gKR2STek+wUj6aC03J7WN1Va28zMyjcYRwCfAR4tWf4G\nMC8i/hJ4AbggtV8AvJDa56V+ZmZWJxUFgKQxwLuBH6RlAe8AbkpdFgPnpscz0jJp/Rmpv5mZ1UGl\nRwDfAr4E7ErLRwEvRsSOtNwBjE6PRwMbAdL6Lam/mZnVQdkBIOkcYHNEPDSI40HSbEkrJa3s7Owc\nzF2bmVmJSo4ATgXeI2kDsJRi6ufbwBGShqY+Y4BN6fEmYCxAWn848Fz3nUbEgohojYjWxsbGCoZn\nZmZ9KTsAIuIrETEmIpqAmcAvIuIDwF3A+1K3WcAt6fGytExa/4uIiHLrm5lZZarxOYAvA5+X1E4x\nx39Nar8GOCq1fx6YW4XaZmY2QEP779K/iPgl8Mv0eD1wcg99tgHnDUY9MzOrnD8JbGaWKQeAmVmm\nHABmZplyAJiZZcoBYGaWKQeAmVmmHABmZplyAJiZZcoBYGaWKQeAmVmmHABmZplyAJiZZcoBYGaW\nKQeAmVmmHABmZplyAJiZZcoBYGaWqbIDQNJYSXdJekTSw5I+k9qPlLRc0hPpz+GpXZKulNQuaa2k\nlsF6EmZmtvcqOQLYAXwhIiYAU4CLJE2guNfvnRExHriTV+/9ezYwPv3MBuZXUNvMzCpUdgBExNMR\nsSo93go8CowGZgCLU7fFwLnp8Qzg2ijcDxwh6eiyR25mZhUZlHMAkpqAtwIPAKMi4um06hlgVHo8\nGthYsllHajMzszqoOAAkvQH4MfDZiPhD6bqICCD2cn+zJa2UtLKzs7PS4ZmZWS8qCgBJB1C8+F8f\nET9Jzb/vmtpJf25O7ZuAsSWbj0lte4iIBRHRGhGtjY2NlQzPzMz6UMlVQAKuAR6NiCtKVi0DZqXH\ns4BbSto/nK4GmgJsKZkqMjOzGhtawbanAh8CfiNpdWr7e+Ay4EZJFwBPAe9P624F3gW0A38EPlJB\nbTMzq1DZARARvwLUy+ozeugfwEXl1jMzs8HlTwKbmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBm\nlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCY\nmWXKAWBmlqmaB4CksyQ9Jqld0txa1zczs0IlN4Xfa5IagO8CZwIdwApJyyLikVqOw/Ze09yfV7T9\nhsvePUgjMbPBUtMAAE4G2iNiPYCkpcAMwAFgZnVRyZubff2NTa0DYDSwsWS5AzilxmOomN8N11bO\n/0HNqkkRUbti0vuAsyLiY2n5Q8ApETGnpM9sYHZaPB54rIKSI4BnK9h+X6tbz9q51a1nbT/nPGpX\nUvfYiGjsr1OtjwA2AWNLlsektt0iYgGwYDCKSVoZEa2Dsa99oW49a+dWt561/ZzzqF2LurW+CmgF\nMF5Ss6QDgZnAshqPwczMqPERQETskDQHuA1oABZGxMO1HIOZmRVqPQVERNwK3FqjcoMylbQP1a1n\n7dzq1rO2n3Metatet6Yngc3M7PXDXwVhZpYpB4CZWaZqfg5gfyZpJDCsazkiflfH4ZiZ9clHAINA\n0nskPQE8CfwHsAH4t7oOyvY7kg6UNDH9HFDDup8ZSNsg1muQdH219m+v2i9OAkv6UkR8U9J3gNc8\noYi4uMr11wDvAO6IiLdKejvwwYi4oJp1U+1G4ONAEyVHdBHx0f2xbqr9GeCHwFbgB8BbgbkRcXsN\naj9Jz79j46pc93RgMcWbC1F8oHJWRNxdzbqp9qqIaOnW9uuIeGsVa/4KeEdEvFKtGn3UPg6YD4yK\niImSJgHviYiv1aD2D+n596sq/6/2lymgLwPfBP4v8EId6m+PiOckDZE0JCLukvStGtW+BbgHuAPY\nWaOa9awL8NGI+Lak6cBw4EPAdUDVAwAo/WTmMOA84Mga1L0cmBYRj8HuF6klwF9Xq6CkNuB8oFlS\n6Qc2DwOer1bdZD1wb6r7cldjRFxR5boAVwNfBL6faq6V9COg6gEA/Kzk8TDgvwL/r1rF9pcA+L2k\nNwIfAU6neIdUSy9KegPFC+L1kjZT8ktbZYdExJdrVOv1UBde/fd9N3BdRDwsqSb/5hHxXLemb0l6\nCPgfVS59QNeLfxrH4zWYBroPeJriO2kuL2nfCqytRkFJ10XEh4D3APMopqkPq0atPhwSEQ92+5Xa\nUYvCEfHj0mVJS4BfVave/hIA84E7gXHAQyXtojicqurhOcUv6zbgM8AHgb8ALq1yzS4/k/Su9AG7\nWqpXXYCHJN1G8e86V9JhwK5aFJZUOhUyhOKIoBb/j1ZK+gHwv9LyBym+WqVqIuIp4ClgajXrdPPX\n6c3c74Dv1LBuqWclvYk0FZO+xPLpOo1lPDCyWjvfL84BdJE0PyI+VcN6v4qIv5G0lVfn7breNuyi\nOEz+54j4XhXHsBU4BHgF2J7qR0T8RRXrdT3XN6S6r1S7brcxDAH+ERgeEZ+TdAzFtx/eU4Pad/Hq\n899BMSf/LxHxeJXr/g3F/TT+JjXdAzwRET/rfauKa/b0+w1V/LeWdDHwKaCZPac+umpW+80cksZR\nfAr3P1NMKT8JfCAFYjXrimI69aWS5meAr3Q/Mhi0mvtTALzeSDoKuC8ijq9ijSHAB4DmiPhqejE8\nOiIeqFbNVPcXwOUR8fOStqsj4uPVrJvqzKcI2HdExJslDQduj4iTalB7GPDf2PPkd0TEV6tcdxXF\nSd/fpOU24LMRsc/dT2Mgav1mLtX8fLemgymO8l6G2px/kPTbiJhY7TpdfBloFaX54tOrXOa7wBSg\nLS1vBa6qck0oXgC/JKl07rtqJyS7OSUiLqKYdiMiXgAOrFHtm4H/QnG09VL6qcX5nvcBiyQdL+nj\nwIXAtBrUrYtav/gnh6WfVoqjkOHAEcAngZY+thtMD0mq+huZLvvLOYDXrYio9tzhKRHRIunXqd4L\n6au2q+1F4AzgSkn/m2JOula2p/tLd83RNlKjcwDAmIg4q0a1douI9eld/80U8+PTIuJPtR7H/iwi\nLgWQdDfQEhFb0/L/BCq7DeDAnQJ8QNJTFG8suqa+JlWjmANg31evF0NFxA7gQkl/R3GlwvAa1AW4\nEvgpMFLS1yneHf9jjWrfJ+ktXVMx1SbpN+w5/34kxVepPyCJar0wZG4UxXmtLq+ktlqYXqM6gANg\nf1CvF8N/7XoQEYvSC9VFNahLRFyfLr08g+Id0rkR8Wg1a5a8EA8FPiJpPfBnqvwODTinSvu13l0L\nPCjpp2n5XGBRLQpX+0Rzdz4JvB+Q9Fe8+mJ4Z7VfDHMk6di+1tf6P65VV7rc97S0eHdE/Lqe46kW\nB4CZWaZ8FZCZWaYcAGZmmXIAmJWQtFPSakkPS1oj6Qvpw3Z9bdMk6fxajdFssDgAzPb0p4g4MSJO\nAM4EzgYu6WebJopvzTTbp/gksFkJSS9FxBtKlsdRfOnaCOBYiq+dPjStnhMR90m6H3gzxXfGLKa4\nNPcyik+BHwR8NyK+X7MnYTZADgCzEt0DILW9CBxP8TUbuyJim6TxwJKIaE03a/nvEXFO6j8bGBkR\nX5N0EHAvcF5EPFnTJ2PWD38QzGzgDgCuknQixbc2HtdLv2nApPQ1wgCHU3ytrwPAXlccAGZ9SFNA\nO4HNFOcCfg9Mpjh/tq23zYBPR8RtNRmkWZl8EtisF+l7lf4VuCqKudLDgacjYhfFbSgbUtet7HnX\nqtuAT3XdsUvScZIOxex1xkcAZns6WNJqiumeHRQnfbu+B/57wI8lfRj4d179Gui1wE5Jayi+M+bb\nFFcGrUo3+eik+D4Zs9cVnwQ2M8uUp4DMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMws\nUw4AM7NM/X+WgGyX8gJ3yQAAAABJRU5ErkJggg==\n", - "text/plain": [ - "<matplotlib.figure.Figure at 0x7fc4cc6ea6d8>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "chart(docs, \"language_iso2\").plot.bar()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "Counter({'de': 13,\n", - " 'en': 1547,\n", - " 'es': 5,\n", - " 'fi': 1,\n", - " 'fr': 4,\n", - " 'hu': 1,\n", - " 'it': 1,\n", - " 'ja': 5,\n", - " 'ko': 1,\n", - " 'ru': 3,\n", - " 'zh': 23})" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Counter([doc.hyperdata[\"language_iso2\"] for doc in docs])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# Deleting language that is not in majority\n", - "def cleanCorpusWithLang(corpus_id, lang):\n", - " return (session.query(Node.id).filter(Node.parent_id == corpus_id)\n", - " .filter(Node.hyperdata[\"language_iso2\"].astext != lang)\n", - " .count()\n", - " #.delete()\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "57" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cleanCorpusWithLang(corpus_id, 'en')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[(True, 'This is an english paragraph.\\n '),\n", - " (False, '\"This is an english paragraph.\\n\\nThis is an english paragraph.\\n ')]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "abstract0 = \"\"\"\"Ceci est un paragraphe en français.\n", - "\n", - "This is an english paragraph.\n", - " \"\"\"\n", - "\n", - "abstract1 = \"\"\"\"This is an english paragraph.\n", - "\n", - "This is an english paragraph.\n", - " \"\"\"\n", - "\n", - "def clean_lang_inText(lang, text):\n", - " \n", - " texts_before = nltk.tokenize.blankline_tokenize(text)\n", - " texts_after = '\\n\\n'.join([sentence \n", - " for sentence in texts_before\n", - " if detect_lang(sentence) == lang\n", - " ])\n", - " \n", - " return (len(texts_before) != len(nltk.tokenize.blankline_tokenize(texts_after)), texts_after)\n", - "\n", - "[clean_lang_inText('en', abstract) for abstract in [abstract0, abstract1]]\n", - "\n", - "# TODO update each document accordingly" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# TODO update all the abstract with That function" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "# Measures IMT Tools" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "154" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scan_hal(\"machine learning AND deep\")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "90" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Request syntax\n", - "# \"network analysis\" = network <-> analysis\n", - "# \"network OR analysis\" = network | analysis\n", - "# \"network AND analysis\" = network & analysis\n", - "\n", - "scan_gargantext(corpus_id, 'english', \"machine | learning & deep\")\n", - "\n", - "# \"network NOT analysis\" = @@ to_tsquery('network') !! to_tsquery('analysis')\n", - "# (need to change the function if not has to be used)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[('network analysis', 'network <-> analysis'),\n", - " ('big data AND something', '(big <-> data) & something')]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Forces / Faiblesses de l'IMT\n", - "# Hal Query Gargantext Query\n", - "queries = [ (\"network analysis\" , \"network <-> analysis\" )\n", - " , (\"big data AND something\" , \"(big <-> data) & something\")\n", - " ]\n", - "[(query[0], query[1]) for query in queries]" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "def imt_vs_hal(corpus_id, queryHal, queryGarg):\n", - " return((scan_gargantext(corpus_id, 'english', queryGarg), scan_hal(queryHal)))\n", - " #return((scan_gargantext(corpus_id, 'english', queryGarg) *100 / scan_hal(queryHal)))" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[(5, 10649), (0, 5)]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Then chart it to see your strenght and weakness!\n", - "[imt_vs_hal(corpus_id, query[0], query[1]) for query in queries]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "# Graph generation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# TODO Cooccurrences optimization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# TODO optimize the distributional distance" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "# List Management" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# Front End add a check box to merge or to overwrite previous list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# optimize the list merge" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.3rc1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/gargantext/util/crawlers/HAL.py b/gargantext/util/crawlers/HAL.py index b920b1e6..df929459 100644 --- a/gargantext/util/crawlers/HAL.py +++ b/gargantext/util/crawlers/HAL.py @@ -113,7 +113,7 @@ class HalCrawler(Crawler): msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max , QUERY_SIZE_N_MAX ) - print("ERROR (scrap: Multivac d/l ): " , msg) + print("ERROR (scrap: HAL d/l ): " , msg) self.query_max = QUERY_SIZE_N_MAX #for page in range(1, trunc(self.query_max / 100) + 2): diff --git a/gargantext/util/parsers/HAL.py b/gargantext/util/parsers/HAL.py index e92228cf..d869bbeb 100644 --- a/gargantext/util/parsers/HAL.py +++ b/gargantext/util/parsers/HAL.py @@ -11,17 +11,8 @@ from datetime import datetime import json class HalParser(Parser): - - def parse(self, filebuf): - ''' - parse :: FileBuff -> [Hyperdata] - ''' - contents = filebuf.read().decode("UTF-8") - data = json.loads(contents) + def _parse(self, json_docs): - filebuf.close() - - json_docs = data hyperdata_list = [] hyperdata_path = { "id" : "isbn_s" @@ -73,3 +64,13 @@ class HalParser(Parser): hyperdata_list.append(hyperdata) return hyperdata_list + + def parse(self, filebuf): + ''' + parse :: FileBuff -> [Hyperdata] + ''' + contents = filebuf.read().decode("UTF-8") + data = json.loads(contents) + + return self._parse(data) + diff --git a/install/notebook.run b/install/notebook.run index ac999fb5..1f6c6bca 100755 --- a/install/notebook.run +++ b/install/notebook.run @@ -16,7 +16,7 @@ sudo docker run \ --env POSTGRES_HOST=localhost \ -v /srv/gargantext:/srv/gargantext \ -it garg-notebook:latest \ - /bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /srv/gargantext/ && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser'" + /bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /home/notebooks && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser'" # #&& jupyter nbextension enable --py widgetsnbextension --sys-prefix #/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /srv/gargantext/ && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser --notebook-dir=/home/notebooks/'" diff --git a/install/notebook/gargantext_notebook.py b/install/notebook/gargantext_notebook.py index f24fef71..b3b564a8 100644 --- a/install/notebook/gargantext_notebook.py +++ b/install/notebook/gargantext_notebook.py @@ -22,7 +22,7 @@ application = get_wsgi_application() from gargantext.util.toolchain.main import parse_extract_indexhyperdata from gargantext.util.db import * from gargantext.models import Node - +from gargantext.util.toolchain.main import parse_extract_indexhyperdata from nltk.tokenize import wordpunct_tokenize from gargantext.models import * @@ -56,9 +56,7 @@ def chart(docs, field): frame1 = pd.DataFrame(year_publis, columns=['Date', 'DateValue'], index=frame0.Date) return frame1 - from gargantext.util.crawlers.HAL import HalCrawler - def scan_hal(request): hal = HalCrawler() return hal.scan_results(request) @@ -73,3 +71,53 @@ def scan_gargantext(corpus_id, lang, request): return [i for i in connection.execute(query)][0][0] connection.close() + +def myProject_fromUrl(url): + """ + myProject :: String -> Project + """ + project_id = url.split("/")[4] + project = session.query(Node).filter(Node.id == project_id).first() + return project + + +def newCorpus(project, resourceName=11, name="Machine learning", query="LSTM"): + print("Corpus \"%s\" in project \"%s\" created" % (name, project.name)) + + corpus = project.add_child(name="Corpus name", typename='CORPUS') + corpus.hyperdata["resources"] = [{"extracted" : "true", "type" : 11}] + corpus.hyperdata["statuses"] = [{"action" : "notebook", "complete" : "true"}] + # [TODO] Add informations needed to get buttons on the Project view. + session.add(corpus) + session.commit() + + hal = HalCrawler() + max_result = hal.scan_results(query) + paging = 100 + for page in range(0, max_result, paging): + print("%s documents downloaded / %s." % (str( paging * (page +1)), str(max_result) )) + docs = (hal._get(query, fromPage=page, count=paging) + .get("response", {}) + .get("docs", []) + ) + + from gargantext.util.parsers.HAL import HalParser + # [TODO] fix boilerplate for docs here + new_docs = HalParser(docs)._parse(docs) + + for doc in new_docs: + new_doc = (corpus.add_child( name = doc["title"][:255] + , typename = 'DOCUMENT') + ) + new_doc["hyperdata"] = doc + session.add(new_doc) + session.commit() + + print("Extracting the ngrams") + parse_extract_indexhyperdata(corpus) + + print("Corpus is ready to explore:") + print("http://imt.gargantext.org/projects/%s/corpora/%s/" % (project.id, corpus.id)) + + return corpus + -- 2.21.0