{
 "metadata": {
  "name": "",
  "signature": "sha256:33c2f41e3ea5983e768350b4012544242c5df9b394091647362f00929812a921"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from node.models import (\n",
      "    Node, NodeType,\n",
      "    Project, Corpus, Document,\n",
      "    Ngram, Node_Ngram, NodeNgramNgram,\n",
      "    User, Language, ResourceType,\n",
      ")\n",
      "\n",
      "from parsing.Caches import NgramsCache\n",
      "\n",
      "from django.db import connection\n",
      "\n",
      "# Raw database cursor shared by the SQL cells below.\n",
      "cursor = connection.cursor()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import json\n",
      "\n",
      "import networkx as nx\n",
      "from networkx.readwrite import json_graph"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import csv"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "help(csv.writer)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Help on built-in function writer in module _csv:\n",
        "\n",
        "writer(...)\n",
        " csv_writer = csv.writer(fileobj [, dialect='excel']\n",
        " [optional keyword args])\n",
        " for row in sequence:\n",
        " csv_writer.writerow(row)\n",
        " \n",
        " [or]\n",
        " \n",
        " csv_writer = csv.writer(fileobj [, dialect='excel']\n",
        " [optional keyword args])\n",
        " csv_writer.writerows(rows)\n",
        " \n",
        " The \"fileobj\" argument can be any object that supports the file API.\n",
        "\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# The with-block closes the handle even if the write fails, so re-running\n",
      "# this cell no longer leaves an unclosed file behind.\n",
      "with open('/tmp/test.graph', 'w') as test_file:\n",
      "    test_file.write('ici il fait beau')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 46
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "type(test_file)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 52,
       "text": [
        "_io.TextIOWrapper"
       ]
      }
     ],
     "prompt_number": 52
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "G = nx.complete_graph(30)\n",
      "\n",
      "# node_link_data() turns a graph into a JSON-serialisable dict;\n",
      "# node_link_graph() is the opposite direction (dict -> graph) and raises\n",
      "# AttributeError when it is handed a Graph.\n",
      "with open(\"graph.json\", \"w\") as graph_file:\n",
      "    json.dump(json_graph.node_link_data(G), graph_file)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# List every node whose type is 'Corpus'.\n",
      "typeCorpus = NodeType.objects.get(name='Corpus')\n",
      "for corpus in Node.objects.filter(type=typeCorpus):\n",
      "    print(\"#%d - %s\" % (corpus.id, corpus))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "#7 - Presse corpus\n"
       ]
      }
     ],
     "prompt_number": 33
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Fetch the user, creating it only when it does not exist yet; any other\n",
      "# error (e.g. a database failure) is left to propagate.\n",
      "try:\n",
      "    me = User.objects.get(username='alexandre')\n",
      "except User.DoesNotExist:\n",
      "    me = User(username='alexandre')\n",
      "    me.save()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 34
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# corpus = Node.objects.filter(type=typeCorpus).first()\n",
      "CORPUS_ID = 44338  # id of the corpus node queried by the SQL cells below\n",
      "corpus = Node.objects.get(id=CORPUS_ID)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Occurrences"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Number of child nodes of the corpus per publication date (first 20 dates).\n",
      "cursor.execute(\"\"\"\n",
      "    SELECT\n",
      "        metadata -> 'publication_year' as year,\n",
      "        metadata -> 'publication_month' as month,\n",
      "        metadata -> 'publication_day' as day,\n",
      "        COUNT(*)\n",
      "    FROM\n",
      "        node_node AS n\n",
      "    WHERE\n",
      "        n.parent_id = %s\n",
      "    GROUP BY\n",
      "        day, month, year\n",
      "    ORDER BY\n",
      "        year, month, day ASC\n",
      "    LIMIT\n",
      "        20\n",
      "\"\"\", [corpus.id])\n",
      "\n",
      "# iter(callable, sentinel) calls fetchone() until it returns None.\n",
      "for row in iter(cursor.fetchone, None):\n",
      "    print(\"'{}/{}/{}','{}'\".format(*row))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "'2004/01/02','2'\n",
        "'2004/01/03','1'\n",
        "'2004/01/06','8'\n",
        "'2004/01/07','5'\n",
        "'2004/01/08','8'\n",
        "'2004/01/09','1'\n",
        "'2004/01/10','3'\n",
        "'2004/01/12','2'\n",
        "'2004/01/13','6'\n",
        "'2004/01/15','2'\n",
        "'2004/01/16','1'\n",
        "'2004/01/17','5'\n",
        "'2004/01/19','2'\n",
        "'2004/01/20','2'\n",
        "'2004/01/21','7'\n",
        "'2004/01/23','1'\n",
        "'2004/01/24','4'\n",
        "'2004/01/25','4'\n",
        "'2004/01/26','5'\n",
        "'2004/01/27','2'\n"
       ]
      }
     ],
     "prompt_number": 37
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# The 20 most frequent ngram terms among the child nodes of the corpus.\n",
      "cursor.execute(\"\"\"\n",
      "    SELECT\n",
      "        COUNT(*) AS occurrences,\n",
      "        ngX.terms\n",
      "    FROM\n",
      "        node_node AS n\n",
      "    INNER JOIN\n",
      "        node_node_ngram AS nngX ON nngX.node_id = n.id\n",
      "    INNER JOIN\n",
      "        node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
      "    WHERE\n",
      "        n.parent_id = %s\n",
      "    GROUP BY\n",
      "        ngX.terms\n",
      "    ORDER BY\n",
      "        occurrences DESC\n",
      "    LIMIT\n",
      "        20\n",
      "\"\"\", [corpus.id])\n",
      "\n",
      "for row in iter(cursor.fetchone, None):\n",
      "    print(row)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(196, 'patients')\n",
        "(135, 'voice')\n",
        "(129, 'study')\n",
        "(111, 'disease')\n",
        "(69, 'treatment')\n",
        "(66, 'life')\n",
        "(58, 'patient')\n",
        "(53, 'quality')\n",
        "(49, 'care')\n",
        "(45, 'use')\n",
        "(44, 'Patients')\n",
        "(43, 'people')\n",
        "(41, 'development')\n",
        "(41, 'purpose')\n",
        "(40, 's disease')\n",
        "(39, 's')\n",
        "(38, 'results')\n",
        "(37, 'diagnosis')\n",
        "(36, 'years')\n",
        "(34, 'women')\n"
       ]
      }
     ],
     "prompt_number": 104
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Occurrences par ann\u00e9e"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# The 20 most frequent (publication year, term) pairs in the corpus.\n",
      "cursor.execute(\"\"\"\n",
      "    SELECT\n",
      "        COUNT(*) AS occurrences,\n",
      "        n.metadata->'publication_year' AS year,\n",
      "        ngX.terms\n",
      "    FROM\n",
      "        node_node AS n\n",
      "    INNER JOIN\n",
      "        node_node_ngram AS nngX ON nngX.node_id = n.id\n",
      "    INNER JOIN\n",
      "        node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
      "    WHERE\n",
      "        n.parent_id = %s\n",
      "    GROUP BY\n",
      "        terms,\n",
      "        year\n",
      "    ORDER BY\n",
      "        occurrences DESC\n",
      "    LIMIT\n",
      "        20\n",
      "\"\"\", [corpus.id])\n",
      "\n",
      "for row in iter(cursor.fetchone, None):\n",
      "    print(row)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(24, '2014', 'patients')\n",
        "(22, '2005', 'patients')\n",
        "(18, '2005', 'study')\n",
        "(15, '2014', 'voice')\n",
        "(14, '2002', 'disease')\n",
        "(14, '2013', 'patients')\n",
        "(14, '2006', 'study')\n",
        "(13, '2014', 'treatment')\n",
        "(12, '2011', 'patients')\n",
        "(12, '2004', 'voice')\n",
        "(12, '2012', 'patients')\n",
        "(12, '2003', 'patients')\n",
        "(12, '2005', 'voice')\n",
        "(11, '2002', 'patients')\n",
        "(11, '2014', 'study')\n",
        "(10, '2007', 'patients')\n",
        "(10, '2006', 'patients')\n",
        "(10, '2004', 'study')\n",
        "(10, '2001', 'patients')\n",
        "(10, '2014', 'phase')\n"
       ]
      }
     ],
     "prompt_number": 105
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Cr\u00e9ation d'une liste de synonymes"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "ngramsCache = NgramsCache(Language.objects.get(iso2='fr'))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "synonymePairs = [\n",
      "    ['danger', 'risques'],\n",
      "    ['risque', 'risques'],\n",
      "    ['R\u00e9gent', 'R\u00e9gent TS']\n",
      "]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Fetch the 'Synonyme' node type, creating it only when it is missing.\n",
      "try:\n",
      "    typeSynonyme = NodeType.objects.get(name='Synonyme')\n",
      "except NodeType.DoesNotExist:\n",
      "    typeSynonyme = NodeType(name='Synonyme')\n",
      "    typeSynonyme.save()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 23
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "listSynonymes = Node(name='Syno abeilles', type=typeSynonyme, user=me)\n",
      "listSynonymes.save()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 24
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Store each synonym pair as a scored ngram-to-ngram link on the list node.\n",
      "for synonymePair in synonymePairs:\n",
      "    NodeNgramNgram(\n",
      "        ngramx = ngramsCache[synonymePair[0]],\n",
      "        ngramy = ngramsCache[synonymePair[1]],\n",
      "        node = listSynonymes,\n",
      "        score = 1.\n",
      "    ).save()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 25
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "listSynonymes.id"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 26,
       "text": [
        "6"
       ]
      }
     ],
     "prompt_number": 26
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Occurrences avec synonymes"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# NOTE(review): the last join matches on nngy.id, whereas every other query\n",
      "# in this notebook joins node_node_ngram through its ngram_id column --\n",
      "# presumably nngy.ngram_id was intended. The result is also not restricted\n",
      "# to listSynonymes.id yet. TODO confirm before relying on this query.\n",
      "cursor.execute(\"\"\"\n",
      "    SELECT\n",
      "        n.id\n",
      "    FROM\n",
      "        node_node AS n\n",
      "    INNER JOIN\n",
      "        node_node_ngram AS nngx ON nngx.node_id = n.id\n",
      "    INNER JOIN\n",
      "        node_nodengramngram AS nngng ON nngng.ngramx_id = nngx.ngram_id\n",
      "    INNER JOIN\n",
      "        node_node_ngram AS nngy ON nngy.id = nngng.ngramy_id\n",
      "    WHERE\n",
      "        n.parent_id = %s\n",
      "\"\"\", [corpus.id])\n",
      "\n",
      "for row in iter(cursor.fetchone, None):\n",
      "    print(row)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 47
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Cooccurrences"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# The 20 ngram pairs attached most often to the same child node of the\n",
      "# corpus. The ngram_id inequality drops self-pairs and keeps only one\n",
      "# ordering of each pair.\n",
      "cursor.execute(\"\"\"\n",
      "    SELECT\n",
      "        COUNT(*) AS cooccurrences,\n",
      "        ngX.terms,\n",
      "        ngY.terms\n",
      "    FROM\n",
      "        node_node AS n\n",
      "\n",
      "    INNER JOIN\n",
      "        node_node_ngram AS nngX ON nngX.node_id = n.id\n",
      "    INNER JOIN\n",
      "        node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
      "\n",
      "    INNER JOIN\n",
      "        node_node_ngram AS nngY ON nngY.node_id = n.id\n",
      "    INNER JOIN\n",
      "        node_ngram AS ngY ON ngY.id = nngY.ngram_id\n",
      "\n",
      "    WHERE\n",
      "        n.parent_id = %s\n",
      "    AND\n",
      "        nngX.ngram_id > nngY.ngram_id\n",
      "\n",
      "    GROUP BY\n",
      "        ngX.id,\n",
      "        ngX.terms,\n",
      "        ngY.id,\n",
      "        ngY.terms\n",
      "    ORDER BY\n",
      "        cooccurrences DESC\n",
      "    LIMIT\n",
      "        20\n",
      "\"\"\", [corpus.id])\n",
      "\n",
      "for row in iter(cursor.fetchone, None):\n",
      "    print(row)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(98, 'patients', 'study')\n",
        "(88, 'patients', 'disease')\n",
        "(78, 'voice', 'patients')\n",
        "(76, 'Parkinson', 's disease')\n",
        "(64, 'life', 'patients')\n",
        "(62, 'life', 'quality')\n",
        "(60, 'treatment', 'patients')\n",
        "(56, 'patient', 'patients')\n",
        "(56, 'voice', 'study')\n",
        "(54, 'Patients', 'patients')\n",
        "(54, 'purpose', 'study')\n",
        "(54, 'voice', 'disease')\n",
        "(52, 'study', 'disease')\n",
        "(48, 'voice', 'treatment')\n",
        "(46, 'treatment', 'disease')\n",
        "(42, 'quality', 'patients')\n",
        "(42, 'life', 'study')\n",
        "(40, 'care', 'patients')\n",
        "(40, 'PD', 'Parkinson')\n",
        "(40, 'PD', 's disease')\n"
       ]
      }
     ],
     "prompt_number": 108
    }
   ],
   "metadata": {}
  }
 ]
}