Commit 4eab6f6f authored by Administrator's avatar Administrator

[TESTS] SQL tests.

parent 2efb0a0d
{
"metadata": {
"name": "",
"signature": "sha256:077d042952acacfaf160a1c3655fadac325f85beb77437b97247043bf738b3a5"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Import the ORM models used throughout this notebook.\n",
"# Parenthesized import (PEP 8) instead of backslash line continuations.\n",
"from node.models import (Node, NodeType,\n",
"                         Project, Corpus, Document,\n",
"                         Ngram, Node_Ngram,\n",
"                         User, Language, ResourceType)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pycountry\n",
"\n",
"# Seed the Language table from ISO codes; only 'en' and 'fr' are flagged\n",
"# as implemented. Some pycountry entries lack an alpha2 code, so this is\n",
"# deliberately best-effort -- but 'except Exception' (instead of a bare\n",
"# except) no longer swallows KeyboardInterrupt/SystemExit.\n",
"for language in pycountry.languages:\n",
"    try:\n",
"        implemented = 1 if language.alpha2 in ['en', 'fr'] else 0\n",
"        Language(iso2=language.alpha2, iso3=language.terminology, fullname=language.name, implemented=implemented).save()\n",
"    except Exception:\n",
"        pass  # entry without alpha2/terminology code, or already present"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Fetch the two languages flagged as implemented in the previous cell.\n",
"english, french = (Language.objects.get(iso2=code) for code in ('en', 'fr'))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Fetch or lazily create the test user. get_or_create is the Django idiom\n",
"# for this pattern and, unlike the bare except it replaces, does not mask\n",
"# unrelated database errors.\n",
"me, _created = User.objects.get_or_create(username='alexandre')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Ensure the three NodeType rows exist. get_or_create replaces three\n",
"# copy-pasted try/except blocks and stays idempotent across re-runs.\n",
"typeProject = NodeType.objects.get_or_create(name='Project')[0]\n",
"typeCorpus = NodeType.objects.get_or_create(name='Corpus')[0]\n",
"typeDoc = NodeType.objects.get_or_create(name='Document')[0]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Ensure the four ResourceType rows exist, one get_or_create per type.\n",
"# BUG FIX: the original caught one exception for all four lookups, so a\n",
"# single missing type caused ALL four to be re-created, duplicating the\n",
"# rows that already existed.\n",
"typePubmed = ResourceType.objects.get_or_create(name='pubmed')[0]\n",
"typeIsi = ResourceType.objects.get_or_create(name='isi')[0]\n",
"typeRis = ResourceType.objects.get_or_create(name='ris')[0]\n",
"typePresse = ResourceType.objects.get_or_create(name='europress')[0]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# WARNING: destructive reset -- removes every Node (projects, corpora,\n",
"# documents) so the test run starts from a clean tree.\n",
"Node.objects.all().delete()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Fetch-or-create the root project node; 'defaults' only apply when the\n",
"# row is created, and real database errors are no longer hidden by a\n",
"# bare except.\n",
"project, _created = Node.objects.get_or_create(name='Bees project', defaults={'type': typeProject, 'user': me})"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 21
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pubmed"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Fetch-or-create the PubMed corpus under the project (idempotent,\n",
"# replaces the bare try/except pattern).\n",
"corpus_pubmed, _created = Node.objects.get_or_create(name='PubMed corpus', defaults={'parent': project, 'type': typeCorpus, 'user': me})"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 22
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# NOTE(review): hardcoded absolute server path; assumes /srv/gargantext_lib\n",
"# is present on this machine -- confirm before running elsewhere.\n",
"corpus_pubmed.add_resource(type=typePubmed, file='/srv/gargantext_lib/data_samples/pubmedBig.zip')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 23,
"text": [
"<Resource: Resource object>"
]
}
],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#corpus_abeille.add_resource(file='/srv/gargantext_lib/data_samples/pubmed.zip', type=typePubmed)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Parse the uploaded PubMed archive into child Document nodes, then\n",
"# display how many documents were created.\n",
"corpus_pubmed.parse_resources()\n",
"corpus_pubmed.children.count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 25,
"text": [
"1200"
]
}
],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"corpus_pubmed.id"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 26,
"text": [
"698"
]
}
],
"prompt_number": 26
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Extract ngrams from every PubMed document's title and abstract.\n",
"corpus_pubmed.children.all().extract_ngrams(['title', 'abstract'])\n",
"#Node_Ngram.objects.filter(node=corpus_pubmed.children.all()[0]).count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 27
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### RIS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"try:\n",
" corpus_ris = Node.objects.get(name='RIS corpus')\n",
"except:\n",
" corpus_ris = Node(parent=project, name='RIS corpus', type=typeCorpus, user=me)\n",
" corpus_ris.save()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"corpus_ris.add_resource(file='/srv/gargantext_lib/data_samples/risUnix.zip', type=typeRis)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"corpus_ris.parse_resources()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"corpus_ris.children.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"corpus_ris.children.all()[15].metadata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"corpus_ris.name = \"ZOTERO CORPUS (CIRDEM)\"\n",
"corpus_ris.save()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Science"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"try:\n",
" science = Node.objects.get(name='WOS corpus')\n",
"except:\n",
" science = Node(parent=project, name='WOS corpus', type=typeCorpus, user=me)\n",
" science.save()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"science.add_resource(file='/srv/gargantext_lib/data_samples/isi.zip', type=typeIsi)\n",
"science.parse_resources()\n",
"science.children.count()"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#[n.metadata for n in science.children.all()]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"science.children.all().extract_ngrams(['title',])\n",
"Node_Ngram.objects.filter(node=science.children.all()[0]).count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Press"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Fetch-or-create the French press corpus under the project (idempotent,\n",
"# replaces the bare try/except pattern).\n",
"presse, _created = Node.objects.get_or_create(name='Presse corpus', defaults={'parent': project, 'type': typeCorpus, 'user': me})"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Mark the press corpus as French. Reuses the Language object already\n",
"# fetched near the top of the notebook instead of re-querying the table.\n",
"presse.language = french\n",
"presse.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# NOTE(review): hardcoded absolute server path; assumes /srv/gargantext_lib\n",
"# is present on this machine -- confirm before running elsewhere.\n",
"presse.add_resource(type=typePresse, file='/srv/gargantext_lib/data_samples/html/html_french.zip')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
"<Resource: Resource object>"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Parse the uploaded press archive into child Document nodes.\n",
"presse.parse_resources()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"presse.children.count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 16,
"text": [
"88"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Extract title ngrams document-by-document, logging each title as we go.\n",
"for doc in presse.children.all():\n",
"    print(doc.metadata['title'])\n",
"    doc.extract_ngrams(['title',])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Conf\u00e9d\u00e9ration paysanne : \" retrait imm\u00e9diat \" du R\u00e9gent\n",
"defaultdict(<class 'float'>, {'retrait imm\u00e9diat': 1.0, 'R\u00e9gent': 1.0, 'Conf\u00e9d\u00e9ration paysanne': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Gaucho, R\u00e9gent : la mobilisation continue\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, 'mobilisation continue': 1.0, 'Gaucho': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=44 mode='rb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"WARNING:py.warnings:/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=44 mode='rb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=43 mode='wb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"WARNING:py.warnings:/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=43 mode='wb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=46 mode='rb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"WARNING:py.warnings:/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=46 mode='rb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"GB/rapport: \"oui mais\" au ma\u00efs OGM, \"non mais\" pour colza et betterave\n",
"defaultdict(<class 'float'>, {'betterave': 1.0, 'ma\u00efs': 1.0, 'GB rapport': 1.0, 'colza': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=47 mode='rb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"WARNING:py.warnings:/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=47 mode='rb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=45 mode='wb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"WARNING:py.warnings:/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=45 mode='wb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=49 mode='rb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"WARNING:py.warnings:/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=49 mode='rb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Rapport: \"oui mais\" au ma\u00efs OGM, \"non mais\" pour colza et betterave \u00e0 sucre\n",
"defaultdict(<class 'float'>, {'ma\u00efs': 1.0, 'betterave': 1.0, 'Rapport': 1.0, 'sucre': 1.0, 'colza': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=48 mode='rb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"WARNING:py.warnings:/usr/lib/python3.4/subprocess.py:473: ResourceWarning: unclosed file <_io.FileIO name=48 mode='rb'>\n",
" for inst in _active[:]:\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Rapport: \"oui mais\" au ma\u00efs OGM, \"non mais\" pour colza et betterave \u00e0 sucre\n",
"defaultdict(<class 'float'>, {'ma\u00efs': 1.0, 'betterave': 1.0, 'Rapport': 1.0, 'sucre': 1.0, 'colza': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration exige le retrait du R\u00e9gent\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, 'Conf\u00e9d\u00e9ration exige': 1.0, 'retrait': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration exige le retrait du R\u00e9gent\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, 'Conf\u00e9d\u00e9ration exige': 1.0, 'retrait': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration paysanne demande le retrait du R\u00e9gent\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, 'retrait': 1.0, 'Conf\u00e9d\u00e9ration paysanne demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Enqu\u00eate R\u00e9gent: BASF demande le statut de \"t\u00e9moin assist\u00e9\"\n",
"defaultdict(<class 'float'>, {'t\u00e9moin assist\u00e9': 1.0, 'statut': 1.0, 'Enqu\u00eate R\u00e9gent': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Enqu\u00eate R\u00e9gent: BASF demande le statut de \"t\u00e9moin assist\u00e9\"\n",
"defaultdict(<class 'float'>, {'t\u00e9moin assist\u00e9': 1.0, 'statut': 1.0, 'Enqu\u00eate R\u00e9gent': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un film-enqu\u00eate\n",
"defaultdict(<class 'float'>, {'film-enqu\u00eate': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration Paysanne demande le \"retrait imm\u00e9diat\" du R\u00e9gent TS\n",
"defaultdict(<class 'float'>, {'retrait imm\u00e9diat': 1.0, 'R\u00e9gent TS': 1.0, 'Conf\u00e9d\u00e9ration Paysanne demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration Paysanne demande le \"retrait imm\u00e9diat\" du R\u00e9gent TS\n",
"defaultdict(<class 'float'>, {'retrait imm\u00e9diat': 1.0, 'R\u00e9gent TS': 1.0, 'Conf\u00e9d\u00e9ration Paysanne demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Insecticide R\u00e9gent TS: un juge souhaite enqu\u00eater sur la mise en danger d'autrui\n",
"defaultdict(<class 'float'>, {'juge souhaite enqu\u00eater': 1.0, 'mise': 1.0, 'Insecticide R\u00e9gent TS': 1.0, 'danger d': 1.0, 'autrui': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Sous-estimation des risques li\u00e9s \u00e0 l'utilisation du R\u00e9gent TS\n",
"defaultdict(<class 'float'>, {'l': 1.0, 'utilisation': 1.0, 'Sous-estimation': 1.0, 'risques li\u00e9s': 1.0, 'R\u00e9gent TS': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"L'affaire de l'insecticide rebondit\n",
"defaultdict(<class 'float'>, {'l': 1.0, 'affaire': 1.0, 'insecticide rebondit': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Coup d'oeil sur 2003 : les faits marquants\n",
"defaultdict(<class 'float'>, {'faits marquants': 1.0, 'Coup d': 1.0, 'oeil': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration exige le retrait du R\u00e9gent\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, 'Conf\u00e9d\u00e9ration exige': 1.0, 'retrait': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Le juge veut enqu\u00eater sur la mise en danger d'autrui\n",
"defaultdict(<class 'float'>, {'mise': 1.0, 'juge veut enqu\u00eater': 1.0, 'danger d': 1.0, 'autrui': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration paysanne demande le retrait du R\u00e9gent TS"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'R\u00e9gent TS': 1.0, 'retrait': 1.0, 'Conf\u00e9d\u00e9ration paysanne demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Maires anti-Gaucho devant le tribunal\n",
"defaultdict(<class 'float'>, {'Maires anti-Gaucho': 1.0, 'tribunal': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"D\u00e9chets m\u00e9nagers, abeilles, OGM... Nature Avenir fait le point\n",
"defaultdict(<class 'float'>, {'D\u00e9chets m\u00e9nagers': 1.0, 'point': 1.0, 'abeilles': 1.0, 'Nature Avenir fait': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"D\u00e9chets m\u00e9nagers, abeilles, OGM... Nature Avenir fait le point\n",
"defaultdict(<class 'float'>, {'D\u00e9chets m\u00e9nagers': 1.0, 'point': 1.0, 'abeilles': 1.0, 'Nature Avenir fait': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La banlieue par la bande\n",
"defaultdict(<class 'float'>, {'banlieue': 1.0, 'bande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Insecticide R\u00e9gent TS .\n",
"defaultdict(<class 'float'>, {'Insecticide R\u00e9gent TS': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Chimie : une nouvelle expertise affirme la toxicit\u00e9 de l'insecticide R\u00e9gent TS\n",
"defaultdict(<class 'float'>, {'nouvelle expertise affirme': 1.0, 'insecticide R\u00e9gent TS': 1.0, 'Chimie': 1.0, 'l': 1.0, 'toxicit\u00e9': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"[Une expertise judiciaire affirme que les risques pour l'homme et pour l'environnement li\u00e9s \u00e0 l'utilisation de l'insecticide R\u00e9gent TS ont \u00e9t\u00e9 sous-estim\u00e9s.]\n",
"defaultdict(<class 'float'>, {'insecticide R\u00e9gent TS ont \u00e9t\u00e9 sous-estim\u00e9s': 1.0, 'expertise judiciaire affirme': 1.0, 'utilisation': 1.0, 'l': 4.0, 'environnement li\u00e9s': 1.0, 'risques': 1.0, 'homme': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un insecticide \u00e0 risque\n",
"defaultdict(<class 'float'>, {'risque': 1.0, 'insecticide': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La cuv\u00e9e des miels 2003 est plus que rare, s\u00e9cheresse oblige\n",
"defaultdict(<class 'float'>, {'miels': 1.0, 's\u00e9cheresse oblige': 1.0, 'cuv\u00e9e': 1.0, 'rare': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Les semences Gaucho, des d\u00e9chets banals\u00a0?\n",
"defaultdict(<class 'float'>, {'semences Gaucho': 1.0, 'd\u00e9chets banals': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Inqui\u00e9tudes des apiculteurs finist\u00e9riens (Lire en page 8)\n",
"defaultdict(<class 'float'>, {'Inqui\u00e9tudes': 1.0, 'Lire': 1.0, 'apiculteurs finist\u00e9riens': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Inqui\u00e9tudes des apiculteurs finist\u00e9riens\n",
"defaultdict(<class 'float'>, {'Inqui\u00e9tudes': 1.0, 'apiculteurs finist\u00e9riens': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"2003 dans le r\u00e9tro\n",
"defaultdict(<class 'float'>, {'r\u00e9tro': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"A. David, apiculteur : \u00ab Rien ne change \u00bb\n",
"defaultdict(<class 'float'>, {'David': 1.0, 'Rien': 1.0, 'apiculteur': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: selon une nouvelle expertise, les risques ont \u00e9t\u00e9 sous-estim\u00e9s\n",
"defaultdict(<class 'float'>, {'nouvelle expertise': 1.0, 'R\u00e9gent TS': 1.0, 'risques ont \u00e9t\u00e9 sous-estim\u00e9s': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: selon une nouvelle expertise, les risques ont \u00e9t\u00e9 sous-estim\u00e9s"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'nouvelle expertise': 1.0, 'R\u00e9gent TS': 1.0, 'risques ont \u00e9t\u00e9 sous-estim\u00e9s': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: les risques pour l'homme auraient \u00e9t\u00e9 sous-estim\u00e9s (expertise)\n",
"defaultdict(<class 'float'>, {'l': 1.0, 'expertise': 1.0, 'risques': 1.0, 'R\u00e9gent TS': 1.0, 'homme auraient \u00e9t\u00e9 sous-estim\u00e9s': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: les risques pour l'homme auraient \u00e9t\u00e9 sous-estim\u00e9s (expertise)\n",
"defaultdict(<class 'float'>, {'l': 1.0, 'expertise': 1.0, 'risques': 1.0, 'R\u00e9gent TS': 1.0, 'homme auraient \u00e9t\u00e9 sous-estim\u00e9s': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: un troisi\u00e8me expert \u00e9voque des risques pour la sant\u00e9 humaine\n",
"defaultdict(<class 'float'>, {'expert \u00e9voque': 1.0, 'risques': 1.0, 'R\u00e9gent TS': 1.0, 'sant\u00e9 humaine': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: un troisi\u00e8me expert \u00e9voque des risques pour la sant\u00e9 humaine\n",
"defaultdict(<class 'float'>, {'expert \u00e9voque': 1.0, 'risques': 1.0, 'R\u00e9gent TS': 1.0, 'sant\u00e9 humaine': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Les vrais ennemis des abeilles\n",
"defaultdict(<class 'float'>, {'abeilles': 1.0, 'vrais ennemis': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un expert d\u00e9nonce les dangers d'un pesticide\n",
"defaultdict(<class 'float'>, {'expert d\u00e9nonce': 1.0, 'pesticide': 1.0, 'dangers d': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Huissier ind\u00e9sirable : le maire de Murs-\u00c9rign\u00e9 \u00e9crit au ministre de l'Int\u00e9rieur\n",
"defaultdict(<class 'float'>, {'Murs-\u00c9rign\u00e9 \u00e9crit': 1.0, 'l': 1.0, 'Huissier ind\u00e9sirable': 1.0, 'Int\u00e9rieur': 1.0, 'maire': 1.0, 'ministre': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS : nouvelles accusations\n",
"defaultdict(<class 'float'>, {'nouvelles accusations': 1.0, 'R\u00e9gent TS': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un cocotier dans votre salon ?\n",
"defaultdict(<class 'float'>, {'cocotier': 1.0, 'salon': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Pratiques commerciales: L'autre malbouffe\n",
"defaultdict(<class 'float'>, {'Pratiques commerciales': 1.0, 'autre malbouffe': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Quel avenir pour le XXIe si\u00e8cle ?"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'avenir': 1.0, 'si\u00e8cle': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Des abeilles, du miel et du pain d'\u00e9pice\n",
"defaultdict(<class 'float'>, {'pain d': 1.0, 'abeilles': 1.0, 'miel': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Abeilles et fipronil : un dossier \" empoisonnant \"\n",
"defaultdict(<class 'float'>, {'fipronil': 1.0, 'dossier': 1.0, 'Abeilles': 1.0, 'empoisonnant': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Des apiculteurs manifestent \u00e0 Angers contre \"une tentative d'intimidation\"\n",
"defaultdict(<class 'float'>, {'tentative d': 1.0, 'apiculteurs manifestent': 1.0, 'Angers': 1.0, 'intimidation': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Insecticides : manifestation apr\u00e8s \" l'intimidation \"\n",
"defaultdict(<class 'float'>, {'l': 1.0, 'manifestation': 1.0, 'Insecticides': 1.0, 'intimidation': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Apiculteurs : non \u00e0 l'atteinte aux libert\u00e9s"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'libert\u00e9s': 1.0, 'l': 1.0, 'Apiculteurs': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Une ruche politique... consensuelle\n",
"defaultdict(<class 'float'>, {'ruche politique': 1.0, 'consensuelle': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Les apiculteurs manifestent\n",
"defaultdict(<class 'float'>, {'apiculteurs manifestent': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"[Les apiculteurs locaux invitent la population vendredi 13 f\u00e9vrier \u00e0 20 heures \u00e0 la mairie pour assister \u00e0 une projection de cassettes vid\u00e9os sur la vie des abeilles et les cons\u00e9quences de l'utilisation de certains insecticides.]\n",
"defaultdict(<class 'float'>, {'vie': 1.0, 'abeilles': 1.0, 'heures': 1.0, 'l': 1.0, 'f\u00e9vrier': 1.0, 'cassettes vid\u00e9os': 1.0, 'insecticides': 1.0, 'population vendredi': 1.0, 'projection': 1.0, 'mairie': 1.0, 'utilisation': 1.0, 'cons\u00e9quences': 1.0, 'apiculteurs locaux invitent': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"\" Les abeilles du coche \""
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'abeilles': 1.0, 'coche': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Gaucho : une r\u00e9action de Philippe Bodard\n",
"defaultdict(<class 'float'>, {'Philippe Bodard': 1.0, 'r\u00e9action': 1.0, 'Gaucho': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Baisse de production des apiculteurs fran\u00e7ais\n",
"defaultdict(<class 'float'>, {'apiculteurs fran\u00e7ais': 1.0, 'production': 1.0, 'Baisse': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Une esp\u00e8ce en danger\n",
"defaultdict(<class 'float'>, {'danger': 1.0, 'esp\u00e8ce': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Progression du taux de mortalit\u00e9 chez les abeilles\n",
"defaultdict(<class 'float'>, {'mortalit\u00e9': 1.0, 'abeilles': 1.0, 'Progression': 1.0, 'taux': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un huissier chez des apiculteurs Visite impromptue, vendredi, dans le Maine-et-Loire lors d'une r\u00e9union.\n",
"defaultdict(<class 'float'>, {'apiculteurs Visite impromptue': 1.0, 'huissier': 1.0, 'Maine-et-Loire': 1.0, 'vendredi': 1.0, 'd': 1.0, 'r\u00e9union': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un huissier chez des apiculteurs Visite surprise \u00e0 Blaison-Gohier (49) lors d'une r\u00e9union sur les insecticides.\n",
"defaultdict(<class 'float'>, {'Blaison-Gohier': 1.0, 'huissier': 1.0, 'insecticides': 1.0, 'r\u00e9union': 1.0, 'apiculteurs Visite surprise': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un huissier chez des apiculteurs Visite impromptue \u00e0 Blaison-Gohier, lors d'une r\u00e9union sur les insecticides.\n",
"defaultdict(<class 'float'>, {'Blaison-Gohier': 1.0, 'huissier': 1.0, 'insecticides': 1.0, 'apiculteurs Visite impromptue': 1.0, 'r\u00e9union': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un huissier chez des apiculteurs Visite impromptue, pr\u00e8s d'Angers, lors d'une r\u00e9union sur les insecticides.\n",
"defaultdict(<class 'float'>, {'r\u00e9union': 1.0, 'huissier': 1.0, 'insecticides': 1.0, 'apiculteurs Visite impromptue': 1.0, 'Angers': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Intimidation d'un huissier apr\u00e8s une r\u00e9union d'information\n",
"defaultdict(<class 'float'>, {'Intimidation d': 1.0, 'huissier': 1.0, 'r\u00e9union d': 1.0, 'information': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Les apiculteurs interpellent les citoyens\n",
"defaultdict(<class 'float'>, {'citoyens': 1.0, 'apiculteurs interpellent': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Insecticides : \" T\u00e9moins g\u00eanants \", huissier g\u00ean\u00e9\n",
"defaultdict(<class 'float'>, {'T\u00e9moins g\u00eanants': 1.0, 'huissier g\u00ean\u00e9': 1.0, 'Insecticides': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Professeur Narbonne : \" Certaines personnes n'ont pas bien fait leur boulot \"\n",
"defaultdict(<class 'float'>, {'boulot': 1.0, 'personnes n': 1.0, 'Professeur Narbonne': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Homologation bloqu\u00e9e pour le Regent .\n",
"defaultdict(<class 'float'>, {'Homologation bloqu\u00e9e': 1.0, 'Regent': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Insecticides : un film saisi pr\u00e8s d'Angers\n",
"defaultdict(<class 'float'>, {'film saisi': 1.0, 'Angers': 1.0, 'Insecticides': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un huissier saisit un film sur des insecticides lors d'une r\u00e9union publique\n",
"defaultdict(<class 'float'>, {'huissier saisit': 1.0, 'film': 1.0, 'insecticides': 1.0, 'r\u00e9union publique': 1.0, 'd': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Les apiculteurs ont le bourdon\n",
"defaultdict(<class 'float'>, {'apiculteurs ont': 1.0, 'bourdon': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent : BASF demande \u00e0 \u00eatre t\u00e9moin assist\u00e9\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, '\u00eatre t\u00e9moin assist\u00e9': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent : BASF demande \u00e0 \u00eatre t\u00e9moin assist\u00e9\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, '\u00eatre t\u00e9moin assist\u00e9': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent : BASF demande \u00e0 \u00eatre t\u00e9moin assist\u00e9\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, '\u00eatre t\u00e9moin assist\u00e9': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent : BASF demande \u00e0 \u00eatre t\u00e9moin assist\u00e9\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, '\u00eatre t\u00e9moin assist\u00e9': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"BASF demande le statut de \" t\u00e9moin assist\u00e9 \"\n",
"defaultdict(<class 'float'>, {'t\u00e9moin assist\u00e9': 1.0, 'statut': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Le Gaucho et le R\u00e9gent TS toujours en accusation\n",
"defaultdict(<class 'float'>, {'R\u00e9gent TS': 1.0, 'Gaucho': 1.0, 'accusation': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Attention, abeilles en danger"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'danger': 1.0, 'Attention': 1.0, 'abeilles': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent : BASF demande \u00e0 \u00eatre t\u00e9moin assist\u00e9\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, '\u00eatre t\u00e9moin assist\u00e9': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"firstchild = presse.children.first()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for n in Node_Ngram.objects.filter(node=firstchild):\n",
" print(n.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"liste_ordered = collections.OrderedDict(sorted(liste.items(), key=lambda t: t[1]))"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#liste_ordered"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cr\u00e9ation des Listes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import collections"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"liste = collections.defaultdict(int)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" whitelist_type = NodeType.objects.get(name='WhiteList')\n",
" blacklist_type = NodeType.objects.get(name='BlackList')\n",
"except:\n",
" whitelist_type = NodeType(name='WhiteList')\n",
" whitelist_type.save()\n",
" \n",
" blacklist_type = NodeType(name='BlackList')\n",
" blacklist_type.save()\n",
"\n",
"white_node = Node.objects.create(name='WhiteList Pubmed', user=me, parent=corpus_pubmed, type=whitelist_type)\n",
"black_node = Node.objects.create(name='BlackList Pubmed', user=me, parent=corpus_pubmed, type=blacklist_type)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node_Ngram.objects.filter(node=white_node).count()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cr\u00e9ation de la white list"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with transaction.atomic():\n",
" for node in presse.children.all():\n",
" for node_ngram in Node_Ngram.objects.filter(node=node):\n",
" if node_ngram.ngram.n > 1:\n",
" #liste[node_ngram.ngram.terms] += node_ngram.weight\n",
" Node_Ngram.objects.create(node=white_node, ngram=node_ngram.ngram, weight=1)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"white_node.pk"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node_Ngram.objects.filter(node=white_node).count()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cr\u00e9ation de la black list"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with transaction.atomic():\n",
" for node_ngram_object in Node_Ngram.objects.all()[101:150]:\n",
"        Node_Ngram.objects.create(node=black_node, ngram=node_ngram_object.ngram, weight=1)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node_Ngram.objects.filter(node=black_node)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cr\u00e9ation des synonymes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"syno_type = NodeType.objects.get(name='Synonyme')\n",
"syno_node = Node.objects.create(name='Syno Pubmed',\n",
"                               user=user,  # NOTE(review): 'user' and 'corpus' look undefined in this notebook (earlier cells use 'me' / 'corpus_pubmed') -- confirm\n",
" parent=corpus, \n",
" type=syno_type)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"synonyme1, synonyme2 = Node_Ngram.objects.filter(node=white_node)[3:5]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"NodeNgramNgram.objects.create(node=syno_node, ngramX=synonyme1.ngram, ngramY=synonyme2.ngram)  # NOTE(review): field casing differs from later usage (ngramx/ngramy) and no 'score' is passed -- verify against the NodeNgramNgram model"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cooccurrence"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"white_node.children.count()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"black_node.pk"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" cooc_type = NodeType.objects.get(name='Cooccurrence')\n",
"except:\n",
" cooc_type = NodeType(name='Cooccurrence')\n",
" cooc_type.save()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cooc = Node.objects.create(user=me,\\\n",
" parent=corpus_pubmed,\\\n",
" type=cooc_type,\\\n",
" name=\"Cooccurrences calcul Alpha\")"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cooc.pk"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"white_node.children.all().delete()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from django.db import connection\n",
"cursor = connection.cursor()\n",
"# LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;\n",
"query_string = \"\"\"\n",
"INSERT INTO node_nodengramngram (node_id, \"ngramX_id\", \"ngramY_id\", score)\n",
"\n",
"SELECT \n",
"%d as node_id, x.ngram_id, y.ngram_id, COUNT(*) AS score\n",
"\n",
"FROM\n",
"node_node_ngram AS x\n",
"\n",
"INNER JOIN \n",
"node_node_ngram AS y \n",
"ON x.node_id = y.node_id\n",
"\n",
"\n",
"WHERE\n",
"x.id in (select id from node_node_ngram WHERE node_id = %d )\n",
"AND\n",
"y.id in (select id from node_node_ngram WHERE node_id = %d )\n",
"AND\n",
"x.ngram_id <> y.ngram_id\n",
"\n",
"\n",
"GROUP BY\n",
"x.ngram_id, y.ngram_id\n",
"\n",
"HAVING count(*) > 1\n",
"\n",
"ORDER BY score -- NOTE(review): ascending order keeps the 300 weakest pairs under LIMIT; confirm DESC was not intended\n",
"\n",
"LIMIT 300\n",
"\n",
" \"\"\" % (cooc.pk, white_node.pk, white_node.pk)\n",
"\n",
"cursor.execute(query_string)\n",
"\n",
"try:\n",
" while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)\n",
"except:\n",
" pass"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from copy import copy\n",
"import numpy as np\n",
"import pandas as pd\n",
"import networkx as nx\n",
"from collections import defaultdict\n",
"from analysis.louvain import *\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix = \"\""
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix = defaultdict(lambda : defaultdict(float))\n",
"for cooccurrence in NodeNgramNgram.objects.filter(node=cooc):\n",
" if cooccurrence.score > 1 :\n",
" #print(x.ngramX.terms, x.ngramY.terms)\n",
" matrix[cooccurrence.ngramX.terms][cooccurrence.ngramY.terms] = cooccurrence.score\n",
" matrix[cooccurrence.ngramY.terms][cooccurrence.ngramX.terms] = cooccurrence.score"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = pd.DataFrame(matrix).T.fillna(0)\n",
"x = copy(df.values)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"x = np.where((x.sum(axis=1) > x.shape[0] / 2), 0, x )  # NOTE(review): the (n,) row-sum mask broadcasts across columns, zeroing columns rather than rows -- use [:, None] if row filtering was intended\n",
"x = np.where((x.sum(axis=1) > x.shape[0] / 10), 0, x )  # NOTE(review): the n/10 threshold triggers strictly more often than n/2 above, making the previous line redundant -- confirm intent"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"x = x / x.sum(axis=1)  # NOTE(review): the (n,) divisor broadcasts along columns; use x.sum(axis=1, keepdims=True) for row-wise normalization"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix_filtered = np.where(x > .4, 1, 0)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix_filtered"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"G = nx.from_numpy_matrix(matrix_filtered)\n",
"G = nx.relabel_nodes(G, dict(enumerate(df.columns)))"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nx.draw(G, with_labels=True)\n",
"plt.show()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"partition = best_partition(G)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#partition"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pos = nx.spring_layout(G)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"count = 0.0\n",
"node_min = 3\n",
"for com in set(partition.values()) :\n",
" count = count + 1\n",
" list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]\n",
" \n",
" if len(list_nodes) > node_min:\n",
" nx.draw_networkx_nodes(G, pos, list_nodes, node_size = 20, with_labels=True)#, node_color = str(count / size))\n",
" nx.draw_networkx_edges(G, pos, alpha=0.5)\n",
" plt.title(\"Clique \" + str(count))\n",
" \n",
" for node in list_nodes: \n",
" print(node)\n",
" plt.show()\n",
" print(\"-\" * 30)\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
{
"metadata": {
"name": "",
"signature": "sha256:a5bd8dfc6ee8fc121f5aec3f45e591fc715917cc12bb133015d3f91f7337cbc7"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from node.models import Node, NodeType,\\\n",
" Project, Corpus, Document,\\\n",
" Ngram, Node_Ngram,\\\n",
" User, Language, ResourceType\n",
" \n",
"from parsing.Caches import NgramsCache\n",
" \n",
"from django.db import connection\n",
"cursor = connection.cursor()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 30
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"typeCorpus = NodeType.objects.get(name='Corpus')\n",
"for corpus in Node.objects.filter(type=typeCorpus):\n",
" print(\"#%d - %s\" % (corpus.id, corpus))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"#7 - Presse corpus\n"
]
}
],
"prompt_number": 33
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" me = User.objects.get(username='alexandre')\n",
"except:\n",
" me = User(username='alexandre')\n",
" me.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#\u00a0corpus = Node.objects.filter(type=typeCorpus).first()\n",
"corpus = Node.objects.get(id=698)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 59
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Occurences"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cursor.execute(\"\"\"\n",
" SELECT\n",
" COUNT(*) AS occurrences,\n",
" ngX.terms\n",
" FROM\n",
" node_node AS n\n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" WHERE\n",
" n.parent_id = %s\n",
" GROUP BY\n",
" ngX.terms\n",
" ORDER BY\n",
" occurrences DESC\n",
" LIMIT\n",
" 20\n",
"\"\"\", [corpus.id])\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(196, 'patients')\n",
"(135, 'voice')\n",
"(129, 'study')\n",
"(111, 'disease')\n",
"(69, 'treatment')\n",
"(66, 'life')\n",
"(58, 'patient')\n",
"(53, 'quality')\n",
"(49, 'care')\n",
"(45, 'use')\n",
"(44, 'Patients')\n",
"(43, 'people')\n",
"(41, 'development')\n",
"(41, 'purpose')\n",
"(40, 's disease')\n",
"(39, 's')\n",
"(38, 'results')\n",
"(37, 'diagnosis')\n",
"(36, 'years')\n",
"(34, 'women')\n"
]
}
],
"prompt_number": 104
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Cooccurrences par ann\u00e9e"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cursor.execute(\"\"\"\n",
" SELECT\n",
" COUNT(*) AS occurrences,\n",
" n.metadata->'publication_year' AS year,\n",
" ngX.terms\n",
" FROM\n",
" node_node AS n\n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" WHERE\n",
" n.parent_id = %s\n",
" GROUP BY\n",
" terms,\n",
" year\n",
" ORDER BY\n",
" occurrences DESC\n",
" LIMIT\n",
" 20\n",
"\"\"\", [corpus.id])\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(24, '2014', 'patients')\n",
"(22, '2005', 'patients')\n",
"(18, '2005', 'study')\n",
"(15, '2014', 'voice')\n",
"(14, '2002', 'disease')\n",
"(14, '2013', 'patients')\n",
"(14, '2006', 'study')\n",
"(13, '2014', 'treatment')\n",
"(12, '2011', 'patients')\n",
"(12, '2004', 'voice')\n",
"(12, '2012', 'patients')\n",
"(12, '2003', 'patients')\n",
"(12, '2005', 'voice')\n",
"(11, '2002', 'patients')\n",
"(11, '2014', 'study')\n",
"(10, '2007', 'patients')\n",
"(10, '2006', 'patients')\n",
"(10, '2004', 'study')\n",
"(10, '2001', 'patients')\n",
"(10, '2014', 'phase')\n"
]
}
],
"prompt_number": 105
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Cr\u00e9ation d'une liste de synonymes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ngramsCache = NgramsCache(Language.objects.get(iso2='fr'))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"synonymePairs = [\n",
" ['danger', 'risques'],\n",
" ['risque', 'risques'],\n",
" ['R\u00e9gent', 'R\u00e9gent TS']\n",
"]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 22
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" typeSynonyme = NodeType.objects.get(name='Synonyme')\n",
"except:\n",
" typeSynonyme = NodeType(name='Synonyme')\n",
" typeSynonyme.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"listSynonymes = Node(name='Syno abeilles', type=typeSynonyme, user=me)\n",
"listSynonymes.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for synonymePair in synonymePairs:\n",
" NodeNgramNgram(\n",
" ngramx = ngramsCache[synonymePair[0]],\n",
" ngramy = ngramsCache[synonymePair[1]],\n",
" node = listSynonymes,\n",
" score = 1.\n",
" ).save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"listSynonymes.id"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 26,
"text": [
"6"
]
}
],
"prompt_number": 26
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Occurrences avec synonymes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"'''cursor.execute(\"\"\"\n",
" SELECT\n",
" COUNT(*) AS occurrences,\n",
" ngx.terms\n",
" FROM\n",
" node_node AS n\n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" WHERE\n",
" n.parent_id = %s\n",
" GROUP BY\n",
" ngX.terms\n",
" ORDER BY\n",
" occurrences DESC\n",
" LIMIT\n",
" 20\n",
"\"\"\")'''\n",
"cursor.execute(\"\"\"\n",
" SELECT\n",
" n.id\n",
" FROM\n",
" node_node AS n\n",
" INNER JOIN\n",
" node_node_ngram AS nngx ON nngx.node_id = n.id\n",
" INNER JOIN\n",
" node_nodengramngram AS nngng ON nngng.ngramx_id = nngx.ngram_id\n",
" INNER JOIN\n",
" node_node_ngram AS nngy ON nngy.id = nngng.ngramy_id\n",
" WHERE\n",
" n.parent_id = %s\n",
"\"\"\", [corpus.id])\n",
"#\u00a0\"\"\" % [listSynonymes.id])\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 47
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Cooccurrences"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cursor.execute(\"\"\"\n",
" SELECT\n",
" COUNT(*) AS cooccurrences,\n",
" ngX.terms,\n",
" ngY.terms\n",
" FROM\n",
" node_node AS n\n",
" \n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" \n",
" INNER JOIN\n",
" node_node_ngram AS nngY ON nngY.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngY ON ngY.id = nngY.ngram_id\n",
" \n",
" WHERE\n",
" n.parent_id = %s\n",
" AND\n",
" nngX.ngram_id > nngY.ngram_id\n",
" \n",
" GROUP BY\n",
" ngX.id,\n",
" ngX.terms,\n",
" ngY.id,\n",
" ngY.terms\n",
" ORDER BY\n",
" cooccurrences DESC\n",
" LIMIT\n",
" 20\n",
"\"\"\", [corpus.id])\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(98, 'patients', 'study')\n",
"(88, 'patients', 'disease')\n",
"(78, 'voice', 'patients')\n",
"(76, 'Parkinson', 's disease')\n",
"(64, 'life', 'patients')\n",
"(62, 'life', 'quality')\n",
"(60, 'treatment', 'patients')\n",
"(56, 'patient', 'patients')\n",
"(56, 'voice', 'study')\n",
"(54, 'Patients', 'patients')\n",
"(54, 'purpose', 'study')\n",
"(54, 'voice', 'disease')\n",
"(52, 'study', 'disease')\n",
"(48, 'voice', 'treatment')\n",
"(46, 'treatment', 'disease')\n",
"(42, 'quality', 'patients')\n",
"(42, 'life', 'study')\n",
"(40, 'care', 'patients')\n",
"(40, 'PD', 'Parkinson')\n",
"(40, 'PD', 's disease')\n"
]
}
],
"prompt_number": 108
}
],
"metadata": {}
}
]
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment