{
 "metadata": {
  "name": "",
  "signature": "sha256:2afae28d08bbb0945aaca44a5b704550048c5dc193cc3d81cb11a551fcc03864"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Gargantext: corpus import and n-gram extraction\n",
      "\n",
      "Scratch notebook: imports Europresse documents through the `documents` Django models, then extracts n-grams and stores them in the database.\n",
      "\n",
      "**Run order matters.** Start the kernel in a direct sub-directory of the project root and run the cells from top to bottom: the two `%cd` cells below decide which packages can be imported."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%cd .."
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import gargantext_core as gargantext"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%cd gargantext_web/"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import documents"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Remaining imports, gathered in one place: stdlib, then third-party.\n",
      "import io\n",
      "from collections import defaultdict\n",
      "\n",
      "import nltk\n",
      "from django.db import connection, transaction\n",
      "from django.db.models import F\n",
      "from nltk.stem.snowball import EnglishStemmer"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Configuration: everything a reader may need to adapt lives here.\n",
      "# TODO: machine-specific absolute path, point it at your own Europresse export.\n",
      "EUROPRESSE_DIR = '/home/alexandre/projets/abeilles/documents/Europresse/html/'\n",
      "PROJECT_ID = 1\n",
      "CORPUS_ID = 1\n",
      "ANALYST_ID = 1\n",
      "SAMPLE_DOCUMENT_PK = 9103  # document displayed as a sanity check further down\n",
      "MAX_DOCUMENTS = 1000       # upper bound on the documents tokenized in one run\n",
      "\n",
      "stemmer = EnglishStemmer()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Importation\n",
      "## Europresse"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "europresse = gargantext.bdd.Europresse()\n",
      "europresse.add(EUROPRESSE_DIR)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Save every parsed Europresse document as a Document row.\n",
      "import_errors = []\n",
      "for parsed in europresse:\n",
      "    document = documents.models.Document()\n",
      "    document.project_id = PROJECT_ID\n",
      "    document.corpus_id = CORPUS_ID\n",
      "    document.analyst_id = ANALYST_ID\n",
      "    try:\n",
      "        # 'unique_id' is the name extractNgrams() reads back; it was misspelled\n",
      "        # 'uniqu_id' before, so the identifier was never stored.\n",
      "        document.unique_id = parsed['object_id']\n",
      "        document.date = parsed['date']\n",
      "        document.title = parsed['title']\n",
      "        document.authors = parsed['authors']\n",
      "        document.text = parsed['text']\n",
      "        document.source = parsed['source']\n",
      "        document.save()\n",
      "    except Exception as error:\n",
      "        # Best effort: one bad document must not abort the import,\n",
      "        # but the failure is recorded instead of being silently dropped.\n",
      "        import_errors.append(error)\n",
      "\n",
      "print(len(import_errors), 'document(s) skipped')\n",
      "for error in import_errors[:5]:\n",
      "    print(repr(error))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## ISI (todo)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Overview of the projects and of the corpora attached to each of them.\n",
      "for project in documents.models.Project.objects.all():\n",
      "    print(project.id, project.title)\n",
      "    for corpus in documents.models.Corpus.objects.filter(project_id=project.id):\n",
      "        print('|_', corpus.id, ':', corpus.title)\n",
      "    print('')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def extractNgrams(corpus_pk=1, limit=90):\n",
      "    \"\"\"Extract the n-grams of the first documents of a corpus.\n",
      "\n",
      "    Args:\n",
      "        corpus_pk: primary key of the corpus whose documents are read.\n",
      "        limit: maximum number of documents fetched (default 90).\n",
      "\n",
      "    Returns:\n",
      "        The gargantext.Ngrams object after its get() ran over the 'text'\n",
      "        field of the fetched rows.\n",
      "\n",
      "    Raises:\n",
      "        documents.models.Corpus.DoesNotExist: no corpus has this primary key.\n",
      "    \"\"\"\n",
      "    # Fail early on an unknown corpus. The lookup used to be hard-coded to\n",
      "    # pk=1, whatever corpus_pk was.\n",
      "    documents.models.Corpus.objects.get(pk=corpus_pk)\n",
      "    # %d only accepts numbers, so nothing but integers can reach the SQL text.\n",
      "    rows = gargantext.Corpus().query('''select * from documents_document\n",
      "        where corpus_id = %d\n",
      "        limit %d;''' % (corpus_pk, limit))\n",
      "    words = gargantext.Ngrams()\n",
      "    words.get(rows, key='text', unique_id='unique_id')\n",
      "    return words"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sample_rows = gargantext.Corpus().query('''select * from documents_document\n",
      "    where corpus_id = %d\n",
      "    limit 9;''' % CORPUS_ID)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def ngram(terms):\n",
      "    \"\"\"Return the Ngram row for `terms`, creating it when it is missing.\n",
      "\n",
      "    The stem comes from the notebook-level Snowball `stemmer`; `n` is the\n",
      "    number of space-separated tokens of that stem.\n",
      "    \"\"\"\n",
      "    stems = stemmer.stem(terms)\n",
      "    n = len(stems.split(' '))\n",
      "    # get_or_create returns (object, created); only the object is of interest.\n",
      "    gram, _created = documents.models.Ngram.objects.get_or_create(\n",
      "        terms=terms, stem=stems, n=n)\n",
      "    return gram"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sample_doc = documents.models.Document.objects.get(pk=SAMPLE_DOCUMENT_PK)\n",
      "sample_doc.title"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "gram = ngram('de')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def addNgram2doc(gram, doc):\n",
      "    \"\"\"Count one more occurrence of `gram` in `doc`.\n",
      "\n",
      "    The NgramDocument row linking both is created with 0 occurrences when it\n",
      "    is missing, then incremented through an F() expression, i.e. by the\n",
      "    database rather than from a value read in Python.\n",
      "    \"\"\"\n",
      "    link, _created = documents.models.NgramDocument.objects.get_or_create(\n",
      "        terms=gram, document=doc, defaults={'occurrences': 0})\n",
      "    link.occurrences = F('occurrences') + 1\n",
      "    link.save()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Insert ngrams"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Tokenize the documents of the project and prepare the rows to bulk insert.\n",
      "ngram_triples = set()          # distinct (word, stem, n) triples\n",
      "occurrences = defaultdict(int) # (word, document id) -> number of occurrences\n",
      "\n",
      "# Explicit ordering so that the slice selects the same documents on every run.\n",
      "docs = documents.models.Document.objects.filter(\n",
      "    project_id=PROJECT_ID).order_by('id')[:MAX_DOCUMENTS]\n",
      "\n",
      "for document in docs:\n",
      "    for sentence in nltk.sent_tokenize(document.text):\n",
      "        for word in nltk.wordpunct_tokenize(sentence):\n",
      "            stems = stemmer.stem(word)\n",
      "            ngram_triples.add((word, stems, len(stems.split(' '))))\n",
      "            occurrences[(word, document.id)] += 1\n",
      "\n",
      "# These rows go through the NgramTemporary manager in the next cell, so they\n",
      "# are built as NgramTemporary instances (they used to be Ngram instances).\n",
      "new_grams = [\n",
      "    documents.models.NgramTemporary(terms=terms, stem=stem, n=n)\n",
      "    for terms, stem, n in ngram_triples\n",
      "]\n",
      "new_gramDoc = [\n",
      "    documents.models.NgramDocumentTemporary(\n",
      "        terms=word, document=document_id, occurrences=count)\n",
      "    for (word, document_id), count in occurrences.items()\n",
      "]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Merge the temporary tables into the real ones, one statement at a time.\n",
      "# LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;\n",
      "MERGE_STATEMENTS = [\n",
      "    # Copy only the n-grams documents_ngram does not know yet. Columns are\n",
      "    # listed explicitly so that the id of the temporary row is not copied.\n",
      "    # NOTE(review): assumes terms, stem and n are the only required columns\n",
      "    # of documents_ngram - confirm against the model.\n",
      "    '''INSERT INTO documents_ngram (terms, stem, n)\n",
      "       SELECT DISTINCT terms, stem, n FROM documents_ngramtemporary WHERE NOT EXISTS\n",
      "           ( SELECT 1 FROM documents_ngram WHERE\n",
      "             documents_ngram.terms = documents_ngramtemporary.terms);''',\n",
      "    'DELETE FROM documents_ngramtemporary;',\n",
      "    # Resolve the temporary (terms, document) pairs to real foreign keys.\n",
      "    '''INSERT INTO\n",
      "           documents_ngramdocument (terms_id, document_id, occurrences)\n",
      "       SELECT\n",
      "           GT.id, DT.id, NDT.occurrences\n",
      "       FROM\n",
      "           documents_ngramdocumenttemporary AS NDT\n",
      "           INNER JOIN documents_document AS DT ON DT.id = NDT.document\n",
      "           INNER JOIN documents_ngram AS GT ON GT.terms = NDT.terms;''',\n",
      "    'DELETE FROM documents_ngramdocumenttemporary;',\n",
      "]\n",
      "\n",
      "# All or nothing: a failure in any step rolls back the temporary rows too.\n",
      "# None of these statements returns rows, so there is nothing to fetch.\n",
      "with transaction.atomic():\n",
      "    documents.models.NgramTemporary.objects.bulk_create(new_grams)\n",
      "    documents.models.NgramDocumentTemporary.objects.bulk_create(new_gramDoc)\n",
      "    cursor = connection.cursor()\n",
      "    for statement in MERGE_STATEMENTS:\n",
      "        cursor.execute(statement)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Insert NgramsDoc"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Draft co-occurrence query, kept as a string: it is not executed here.\n",
      "# Each counted row is one document linked to both n-grams of the pair.\n",
      "# NOTE(review): the first draft joined documents_corpus without an ON clause\n",
      "# and selected from an undefined alias t3. Joining documents_document, which\n",
      "# carries project_id and corpus_id, is assumed to be the intent - confirm.\n",
      "COOCCURRENCES_QUERY = '''\n",
      "SELECT t1.terms_id, t2.terms_id, COUNT(*) AS c, t3.project_id, t3.corpus_id\n",
      "FROM documents_ngramdocument AS t1\n",
      "\n",
      "INNER JOIN documents_ngramdocument AS t2\n",
      "ON t1.document_id = t2.document_id\n",
      "\n",
      "INNER JOIN documents_document AS t3\n",
      "ON t3.id = t1.document_id\n",
      "\n",
      "GROUP BY t1.terms_id, t2.terms_id, t3.project_id, t3.corpus_id;\n",
      "'''"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Cooccurrences"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# The cursor is created here explicitly instead of relying on a leftover\n",
      "# kernel variable.\n",
      "curs = connection.cursor()\n",
      "curs.execute('select * from documents_project;')\n",
      "curs.fetchone()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# anything can be used as a file if it has .read() and .readline() methods\n",
      "ngram_rows = io.StringIO('\\n'.join(['Test\\tretest\\t2',\n",
      "                                   'Madonna\\tMado\\t45',\n",
      "                                   'Federico\\tDi Gregorio\\t3']))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Rewind so that the cell can be re-run after the buffer was consumed.\n",
      "ngram_rows.seek(0)\n",
      "curs.copy_from(ngram_rows, 'documents_ngramtemporary', columns=('terms', 'stem', 'n'))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}