{
 "metadata": {
  "name": "",
  "signature": "sha256:2afae28d08bbb0945aaca44a5b704550048c5dc193cc3d81cb11a551fcc03864"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cd .."
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "/home/alexandre/projets/gargantext.py\n"
       ]
      }
     ],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import gargantext_core as gargantext"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "cd gargantext_web/"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "/home/alexandre/projets/gargantext.py/gargantext_web\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import documents"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import nltk\n",
      "from nltk.stem.snowball import EnglishStemmer\n",
      "stemmer = EnglishStemmer()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from collections import defaultdict"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Imporation\n",
      "## Europresse"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "c = gargantext.bdd.Europresse()\n",
      "c.add(\"/home/alexandre/projets/abeilles/documents/Europresse/html/\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "for doc in c:\n",
      "    d = documents.models.Document()\n",
      "    d.project_id = \"1\"\n",
      "    d.corpus_id = \"1\"\n",
      "    d.analyst_id = \"1\"\n",
      "    try:\n",
      "        d.uniqu_id = doc[\"object_id\"]\n",
      "        d.date = doc[\"date\"]\n",
      "        d.title = doc[\"title\"]\n",
      "        d.authors = doc[\"authors\"]\n",
      "        d.text = doc[\"text\"]\n",
      "        d.source = doc[\"source\"]\n",
      "        d.save()\n",
      "    except:\n",
      "        pass"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 88
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## ISI (todo)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "projects = documents.models.Project.objects.all()\n",
      "for p in projects:\n",
      "    corpora = documents.models.Corpus.objects.filter(project_id=p.id)\n",
      "    print(p.id, p.title)\n",
      "    for c in corpora:\n",
      "        print(\"|_\", c.id,\":\", c.title)\n",
      "    print(\"\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "3 Hola Ebola\n",
        "\n",
        "2 Fukushima again\n",
        "|_ 7 : Test\n",
        "\n",
        "4 Thanks anthrax\n",
        "\n",
        "1 Bees swarm\n",
        "|_ 9 : Bees tweets\n",
        "|_ 4 : Health academic publications\n",
        "|_ 2 : bees and (pesticides or chemicals or neocotinoids)\n",
        "|_ 1 : Quand les abeilles meurent, les articles sont compt\u00e9s\n",
        "\n",
        "6 CIRDEM\n",
        "|_ 8 : Zotero fichier du 9 sept.\n",
        "\n"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def extractNgrams(corpus_pk=1):\n",
      "    corpus = documents.models.Corpus.objects.get(pk=1)\n",
      "    data = gargantext.Corpus()\n",
      "    docs = data.query('''select *  from documents_document\n",
      "                        where corpus_id = %d\n",
      "                        limit 90;''' % corpus_pk)\n",
      "    words = gargantext.Ngrams()\n",
      "    words.get(docs, key='text', unique_id=\"unique_id\")\n",
      "    return(words)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 29
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data = gargantext.Corpus()\n",
      "docs = data.query('''select *  from documents_document\n",
      "                        where corpus_id = %d\n",
      "                        limit 9;''' % 1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def ngram(terms):\n",
      "    stems = stemmer.stem(terms)\n",
      "    n = len(stems.split(\" \"))\n",
      "    ngram = documents.models.Ngram.objects.get_or_create(terms = terms,\\\n",
      "                                                 stem = stems,\\\n",
      "                                                 n= n)\n",
      "    return(ngram[0])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "doc = documents.models.Document.objects.get(pk = 9103)\n",
      "doc.title"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "'France/R\u00e9gent TS: contr\u00f4le judiciaire annul\u00e9 pour BASF, confirm\u00e9 pour Bayer'"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "gram = ngram(\"de\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 41
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def addNgram2doc(gram, doc):\n",
      "    ngramDoc = documents.models.NgramDocument.objects.get_or_create(terms=gram,\\\n",
      "                                                           document = doc,\\\n",
      "                                                           defaults={'occurrences':0})[0]\n",
      "    ngramDoc.occurrences = F('occurrences') + 1\n",
      "    ngramDoc.save()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 40
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Insert ngrams"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "l = set()\n",
      "d = defaultdict(lambda : defaultdict(lambda: defaultdict(lambda: defaultdict(int))))\n",
      "\n",
      "docs = documents.models.Document.objects.all().filter(project_id=1)[:1000] \n",
      "\n",
      "for doc in docs:\n",
      "    sentences = nltk.sent_tokenize(doc.text)\n",
      "    for sentence in sentences:\n",
      "        words = nltk.wordpunct_tokenize(sentence)\n",
      "        #print(len(words))\n",
      "        for word in words:\n",
      "            stems = stemmer.stem(word)\n",
      "            new = (word, stems, len(stems.split(\" \")))\n",
      "            l.add(new)\n",
      "            \n",
      "            d[word][doc.id]['count'] = d[word][doc.id].get('count', 0) + 1\n",
      "            \n",
      "\n",
      "new_grams = [documents.models.Ngram(terms=x[0], stem=x[1], n=x[2]) for x in l]\n",
      "new_gramDoc = [ documents.models.NgramDocumentTemporary(terms=k, document=pk, occurrences=d[k][pk]['count']) \\\n",
      "               for k in d.keys() \\\n",
      "               for pk in d[k].keys()\\\n",
      "               ]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "documents.models.NgramTemporary.objects.bulk_create(new_grams)\n",
      "documents.models.NgramDocumentTemporary.objects.bulk_create(new_gramDoc)\n",
      "\n",
      "from django.db import connection\n",
      "cursor = connection.cursor()\n",
      "# LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;\n",
      "query_string = \"\"\"\n",
      "                 INSERT INTO documents_ngram \n",
      "                 SELECT * FROM documents_ngramtemporary WHERE NOT EXISTS \n",
      "                 ( SELECT 1 FROM documents_ngram WHERE \n",
      "                 documents_ngram.terms = documents_ngramtemporary.terms);\n",
      "                 \n",
      "                 delete from documents_ngramtemporary;\n",
      "                 \n",
      "                 INSERT INTO \n",
      "                 documents_ngramdocument (terms_id, document_id, occurrences)\n",
      "                 SELECT \n",
      "                 GT.id, DT.id, NDT.occurrences \n",
      "                 FROM \n",
      "                 documents_ngramdocumenttemporary as NDT \n",
      "                 INNER JOIN documents_document AS DT ON DT.id = NDT.document \n",
      "                 INNER JOIN documents_ngram AS GT ON GT.terms = NDT.terms ;\n",
      "                 \n",
      "                 delete from documents_ngramdocumenttemporary;\n",
      "             \"\"\"\n",
      "cursor.execute(query_string)\n",
      "\n",
      "try:\n",
      "    while True:\n",
      "        row = cursor.fetchone()\n",
      "        if row is None:\n",
      "            break\n",
      "        print(row)\n",
      "except:\n",
      "    pass"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Insert NgramsDoc"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "\n",
      "SELECT t1.terms_id, t2.terms_id, COUNT(*) AS c, t3.project_id\n",
      "FROM documents_ngramdocument AS t1\n",
      "\n",
      "INNER JOIN documents_ngramdocument AS t2\n",
      "ON t1.document_id = t2.document_id\n",
      "\n",
      "INNER JOIN documents_corpus\n",
      "\n",
      "GROUP BY t1.terms_id, t2.terms_id;\n",
      "\n",
      "# add corpus_id in column !"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 20
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#Coocurrences"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 75
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 76
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "curs.execute(\"select * from documents_project;\")\n",
      "curs.fetchone()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 77,
       "text": [
        "(3, datetime.date(2014, 9, 8), 1, 'Hola Ebola', 'Dance with the risks', {})"
       ]
      }
     ],
     "prompt_number": 77
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# anything can be used as a file if it has .read() and .readline() methods\n",
      "import io\n",
      "data = io.StringIO()\n",
      "data.write('\\n'.join(['Test\\tretest\\t2',\n",
      "                  'Madonna\\tMado\\t45',\n",
      "                  'Federico\\tDi Gregorio\\t3']))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 81,
       "text": [
        "52"
       ]
      }
     ],
     "prompt_number": 81
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data.seek(0)\n",
      "curs.copy_from(data, 'documents_ngramtemporary', columns=('terms', 'stem', 'n'))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "ename": "InternalError",
       "evalue": "ERREUR:  la transaction est annul\u00e9e, les commandes sont ignor\u00e9es jusqu'\u00e0 la fin du bloc\nde la transaction\n",
       "output_type": "pyerr",
       "traceback": [
        "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mInternalError\u001b[0m                             Traceback (most recent call last)",
        "\u001b[1;32m<ipython-input-82-a9cf2ba7718f>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mcurs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy_from\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'documents_ngramtemporary'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'terms'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'stem'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'n'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
        "\u001b[1;31mInternalError\u001b[0m: ERREUR:  la transaction est annul\u00e9e, les commandes sont ignor\u00e9es jusqu'\u00e0 la fin du bloc\nde la transaction\n"
       ]
      }
     ],
     "prompt_number": 82
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 57
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 57
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 57
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 0
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 69
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 20
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 73
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 58
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 61
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 64
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 65
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 78
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 91
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}