Commit de3fe8db authored by Mathieu Rodic's avatar Mathieu Rodic

[GIT] Resolved conflicts in url.py

[CODE] Import paths are now relative instead of absolute in parsing/
parent f5443d84
__pycache__/
parsing/Taggers/treetagger/
.ipynb_checkpoints/
*.pyc
{
"metadata": {
"name": "",
"signature": "sha256:0e63832a6b33d476c8b284b72b0740bd9ade357e5ebb1f73bdc399bbd2824a16"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from node.models import Node, NodeType,\\\n",
" Project, Corpus, Document,\\\n",
" Ngram, Node_Ngram,\\\n",
" User, Language, ResourceType"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pycountry\n",
"\n",
"for language in pycountry.languages:\n",
" try:\n",
" implemented = 1 if language.alpha2 in ['en', 'fr'] else 0\n",
" Language(iso2=language.alpha2, iso3=language.terminology, fullname=language.name, implemented=implemented).save()\n",
" except:\n",
" pass"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"english = Language.objects.get(iso2='en')\n",
"french = Language.objects.get(iso2='fr')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" me = User.objects.get(username='alexandre')\n",
"except:\n",
" me = User(username='alexandre')\n",
" me.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" typeProject = NodeType.objects.get(name='Project')\n",
"except Exception as error:\n",
" print(error)\n",
" typeProject = NodeType(name='Project')\n",
" typeProject.save() \n",
"\n",
"try:\n",
" typeCorpus = NodeType.objects.get(name='Corpus')\n",
"except Exception as error:\n",
" print(error)\n",
" typeCorpus = NodeType(name='Corpus')\n",
" typeCorpus.save()\n",
" \n",
"try:\n",
" typeDoc = NodeType.objects.get(name='Document')\n",
"except Exception as error:\n",
" print(error)\n",
" typeDoc = NodeType(name='Document')\n",
" typeDoc.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" typePubmed = ResourceType.objects.get(name='pubmed')\n",
" typeIsi = ResourceType.objects.get(name='isi')\n",
" typeRis = ResourceType.objects.get(name='ris')\n",
" typePresseFrench = ResourceType.objects.get(name='europress_french')\n",
" typePresseEnglish = ResourceType.objects.get(name='europress_english')\n",
"\n",
"except Exception as error:\n",
" print(error)\n",
" \n",
" typePubmed = ResourceType(name='pubmed')\n",
" typePubmed.save() \n",
" \n",
" typeIsi = ResourceType(name='isi')\n",
" typeIsi.save()\n",
" \n",
" typeRis = ResourceType(name='ris')\n",
" typeRis.save()\n",
" \n",
" typePresseFrench = ResourceType(name='europress_french')\n",
" typePresseFrench.save()\n",
" \n",
" typePresseEnglish = ResourceType(name='europress_english')\n",
" typePresseEnglish.save()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"get() returned more than one ResourceType -- it returned 2!\n"
]
}
],
"prompt_number": 33
},
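{
"cell_type": "markdown",
"metadata": {},
"source": [
"The try/except blocks above race between `get()` and `save()`, and the error output shows what happens once duplicate rows exist. A minimal sketch of the same setup with Django's `get_or_create` (assuming the duplicate rows have been cleaned up first):"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sketch: same effect as the try/except blocks above, in one call per type\n",
"typePubmed, _ = ResourceType.objects.get_or_create(name='pubmed')\n",
"typeIsi, _ = ResourceType.objects.get_or_create(name='isi')\n",
"typeRis, _ = ResourceType.objects.get_or_create(name='ris')\n",
"typePresseFrench, _ = ResourceType.objects.get_or_create(name='europress_french')\n",
"typePresseEnglish, _ = ResourceType.objects.get_or_create(name='europress_english')"
],
"language": "python",
"metadata": {},
"outputs": []
},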
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" project = Node.objects.get(name='Bees project')\n",
"except:\n",
" project = Node(name='Bees project', type=typeProject, user=me)\n",
" project.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pubmed"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" corpus_pubmed = Node.objects.get(name='PubMed corpus')\n",
"except:\n",
" corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)\n",
" corpus_pubmed.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"corpus_pubmed.add_resource(file='/srv/gargantext_lib/data_samples/pubmedBig.zip', type=typePubmed)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
"<Resource: Resource object>"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#corpus_abeille.add_resource(file='/srv/gargantext_lib/data_samples/pubmed.zip', type=typePubmed)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"corpus_pubmed.parse_resources()\n",
"corpus_pubmed.children.count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": [
"600"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"corpus_pubmed.id"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
"3131"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"corpus_pubmed.children.all().extract_ngrams(['title', 'abstract'])\n",
"#Node_Ngram.objects.filter(node=corpus_pubmed.children.all()[0]).count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"...\n"
]
}
],
"prompt_number": 15
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### RIS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"try:\n",
" corpus_ris = Node.objects.get(name='RIS corpus')\n",
"except:\n",
" corpus_ris = Node(parent=project, name='RIS corpus', type=typeCorpus, user=me)\n",
" corpus_ris.save()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"corpus_ris.add_resource(file='/srv/gargantext_lib/data_samples/risUnix.zip', type=typeRis)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"corpus_ris.parse_resources()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"corpus_ris.children.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"corpus_ris.children.all()[15].metadata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"corpus_ris.name = \"ZOTERO CORPUS (CIRDEM)\"\n",
"corpus_ris.save()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Science"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"try:\n",
" science = Node.objects.get(name='WOS corpus')\n",
"except:\n",
" science = Node(parent=project, name='WOS corpus', type=typeCorpus, user=me)\n",
" science.save()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"science.add_resource(file='/srv/gargantext_lib/data_samples/isi.zip', type=typeIsi)\n",
"science.parse_resources()\n",
"science.children.count()"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#[n.metadata for n in science.children.all()]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"science.children.all().extract_ngrams(['title',])\n",
"Node_Ngram.objects.filter(node=science.children.all()[0]).count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Press"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" presse = Node.objects.get(name='Presse corpus')\n",
"except:\n",
" presse = Node(parent=project, name='Presse corpus', type=typeCorpus, user=me)\n",
" presse.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"presse.language = Language.objects.get(iso2='fr')\n",
"presse.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"presse.add_resource(file='/srv/gargantext_lib/data_samples/html/html_french.zip', type=typePresse)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
"<Resource: Resource object>"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"presse.parse_resources()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"presse.children.count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 16,
"text": [
"88"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for child in presse.children.all():\n",
" print(child.metadata['title'])\n",
" child.extract_ngrams(['title',])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Conf\u00e9d\u00e9ration paysanne : \" retrait imm\u00e9diat \" du R\u00e9gent\n",
"defaultdict(<class 'float'>, {'retrait imm\u00e9diat': 1.0, 'R\u00e9gent': 1.0, 'Conf\u00e9d\u00e9ration paysanne': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Gaucho, R\u00e9gent : la mobilisation continue\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, 'mobilisation continue': 1.0, 'Gaucho': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"GB/rapport: \"oui mais\" au ma\u00efs OGM, \"non mais\" pour colza et betterave\n",
"defaultdict(<class 'float'>, {'betterave': 1.0, 'ma\u00efs': 1.0, 'GB rapport': 1.0, 'colza': 1.0})"
]
},
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Rapport: \"oui mais\" au ma\u00efs OGM, \"non mais\" pour colza et betterave \u00e0 sucre\n",
"defaultdict(<class 'float'>, {'ma\u00efs': 1.0, 'betterave': 1.0, 'Rapport': 1.0, 'sucre': 1.0, 'colza': 1.0})"
]
},
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Rapport: \"oui mais\" au ma\u00efs OGM, \"non mais\" pour colza et betterave \u00e0 sucre\n",
"defaultdict(<class 'float'>, {'ma\u00efs': 1.0, 'betterave': 1.0, 'Rapport': 1.0, 'sucre': 1.0, 'colza': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration exige le retrait du R\u00e9gent\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, 'Conf\u00e9d\u00e9ration exige': 1.0, 'retrait': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration exige le retrait du R\u00e9gent\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, 'Conf\u00e9d\u00e9ration exige': 1.0, 'retrait': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration paysanne demande le retrait du R\u00e9gent\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, 'retrait': 1.0, 'Conf\u00e9d\u00e9ration paysanne demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Enqu\u00eate R\u00e9gent: BASF demande le statut de \"t\u00e9moin assist\u00e9\"\n",
"defaultdict(<class 'float'>, {'t\u00e9moin assist\u00e9': 1.0, 'statut': 1.0, 'Enqu\u00eate R\u00e9gent': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Enqu\u00eate R\u00e9gent: BASF demande le statut de \"t\u00e9moin assist\u00e9\"\n",
"defaultdict(<class 'float'>, {'t\u00e9moin assist\u00e9': 1.0, 'statut': 1.0, 'Enqu\u00eate R\u00e9gent': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un film-enqu\u00eate\n",
"defaultdict(<class 'float'>, {'film-enqu\u00eate': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration Paysanne demande le \"retrait imm\u00e9diat\" du R\u00e9gent TS\n",
"defaultdict(<class 'float'>, {'retrait imm\u00e9diat': 1.0, 'R\u00e9gent TS': 1.0, 'Conf\u00e9d\u00e9ration Paysanne demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration Paysanne demande le \"retrait imm\u00e9diat\" du R\u00e9gent TS\n",
"defaultdict(<class 'float'>, {'retrait imm\u00e9diat': 1.0, 'R\u00e9gent TS': 1.0, 'Conf\u00e9d\u00e9ration Paysanne demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Insecticide R\u00e9gent TS: un juge souhaite enqu\u00eater sur la mise en danger d'autrui\n",
"defaultdict(<class 'float'>, {'juge souhaite enqu\u00eater': 1.0, 'mise': 1.0, 'Insecticide R\u00e9gent TS': 1.0, 'danger d': 1.0, 'autrui': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Sous-estimation des risques li\u00e9s \u00e0 l'utilisation du R\u00e9gent TS\n",
"defaultdict(<class 'float'>, {'l': 1.0, 'utilisation': 1.0, 'Sous-estimation': 1.0, 'risques li\u00e9s': 1.0, 'R\u00e9gent TS': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"L'affaire de l'insecticide rebondit\n",
"defaultdict(<class 'float'>, {'l': 1.0, 'affaire': 1.0, 'insecticide rebondit': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Coup d'oeil sur 2003 : les faits marquants\n",
"defaultdict(<class 'float'>, {'faits marquants': 1.0, 'Coup d': 1.0, 'oeil': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration exige le retrait du R\u00e9gent\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, 'Conf\u00e9d\u00e9ration exige': 1.0, 'retrait': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Le juge veut enqu\u00eater sur la mise en danger d'autrui\n",
"defaultdict(<class 'float'>, {'mise': 1.0, 'juge veut enqu\u00eater': 1.0, 'danger d': 1.0, 'autrui': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration paysanne demande le retrait du R\u00e9gent TS"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'R\u00e9gent TS': 1.0, 'retrait': 1.0, 'Conf\u00e9d\u00e9ration paysanne demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Maires anti-Gaucho devant le tribunal\n",
"defaultdict(<class 'float'>, {'Maires anti-Gaucho': 1.0, 'tribunal': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"D\u00e9chets m\u00e9nagers, abeilles, OGM... Nature Avenir fait le point\n",
"defaultdict(<class 'float'>, {'D\u00e9chets m\u00e9nagers': 1.0, 'point': 1.0, 'abeilles': 1.0, 'Nature Avenir fait': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"D\u00e9chets m\u00e9nagers, abeilles, OGM... Nature Avenir fait le point\n",
"defaultdict(<class 'float'>, {'D\u00e9chets m\u00e9nagers': 1.0, 'point': 1.0, 'abeilles': 1.0, 'Nature Avenir fait': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La banlieue par la bande\n",
"defaultdict(<class 'float'>, {'banlieue': 1.0, 'bande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Insecticide R\u00e9gent TS .\n",
"defaultdict(<class 'float'>, {'Insecticide R\u00e9gent TS': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Chimie : une nouvelle expertise affirme la toxicit\u00e9 de l'insecticide R\u00e9gent TS\n",
"defaultdict(<class 'float'>, {'nouvelle expertise affirme': 1.0, 'insecticide R\u00e9gent TS': 1.0, 'Chimie': 1.0, 'l': 1.0, 'toxicit\u00e9': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"[Une expertise judiciaire affirme que les risques pour l'homme et pour l'environnement li\u00e9s \u00e0 l'utilisation de l'insecticide R\u00e9gent TS ont \u00e9t\u00e9 sous-estim\u00e9s.]\n",
"defaultdict(<class 'float'>, {'insecticide R\u00e9gent TS ont \u00e9t\u00e9 sous-estim\u00e9s': 1.0, 'expertise judiciaire affirme': 1.0, 'utilisation': 1.0, 'l': 4.0, 'environnement li\u00e9s': 1.0, 'risques': 1.0, 'homme': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un insecticide \u00e0 risque\n",
"defaultdict(<class 'float'>, {'risque': 1.0, 'insecticide': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La cuv\u00e9e des miels 2003 est plus que rare, s\u00e9cheresse oblige\n",
"defaultdict(<class 'float'>, {'miels': 1.0, 's\u00e9cheresse oblige': 1.0, 'cuv\u00e9e': 1.0, 'rare': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Les semences Gaucho, des d\u00e9chets banals\u00a0?\n",
"defaultdict(<class 'float'>, {'semences Gaucho': 1.0, 'd\u00e9chets banals': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Inqui\u00e9tudes des apiculteurs finist\u00e9riens (Lire en page 8)\n",
"defaultdict(<class 'float'>, {'Inqui\u00e9tudes': 1.0, 'Lire': 1.0, 'apiculteurs finist\u00e9riens': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Inqui\u00e9tudes des apiculteurs finist\u00e9riens\n",
"defaultdict(<class 'float'>, {'Inqui\u00e9tudes': 1.0, 'apiculteurs finist\u00e9riens': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"2003 dans le r\u00e9tro\n",
"defaultdict(<class 'float'>, {'r\u00e9tro': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"A. David, apiculteur : \u00ab Rien ne change \u00bb\n",
"defaultdict(<class 'float'>, {'David': 1.0, 'Rien': 1.0, 'apiculteur': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: selon une nouvelle expertise, les risques ont \u00e9t\u00e9 sous-estim\u00e9s\n",
"defaultdict(<class 'float'>, {'nouvelle expertise': 1.0, 'R\u00e9gent TS': 1.0, 'risques ont \u00e9t\u00e9 sous-estim\u00e9s': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: selon une nouvelle expertise, les risques ont \u00e9t\u00e9 sous-estim\u00e9s"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'nouvelle expertise': 1.0, 'R\u00e9gent TS': 1.0, 'risques ont \u00e9t\u00e9 sous-estim\u00e9s': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: les risques pour l'homme auraient \u00e9t\u00e9 sous-estim\u00e9s (expertise)\n",
"defaultdict(<class 'float'>, {'l': 1.0, 'expertise': 1.0, 'risques': 1.0, 'R\u00e9gent TS': 1.0, 'homme auraient \u00e9t\u00e9 sous-estim\u00e9s': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: les risques pour l'homme auraient \u00e9t\u00e9 sous-estim\u00e9s (expertise)\n",
"defaultdict(<class 'float'>, {'l': 1.0, 'expertise': 1.0, 'risques': 1.0, 'R\u00e9gent TS': 1.0, 'homme auraient \u00e9t\u00e9 sous-estim\u00e9s': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: un troisi\u00e8me expert \u00e9voque des risques pour la sant\u00e9 humaine\n",
"defaultdict(<class 'float'>, {'expert \u00e9voque': 1.0, 'risques': 1.0, 'R\u00e9gent TS': 1.0, 'sant\u00e9 humaine': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS: un troisi\u00e8me expert \u00e9voque des risques pour la sant\u00e9 humaine\n",
"defaultdict(<class 'float'>, {'expert \u00e9voque': 1.0, 'risques': 1.0, 'R\u00e9gent TS': 1.0, 'sant\u00e9 humaine': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Les vrais ennemis des abeilles\n",
"defaultdict(<class 'float'>, {'abeilles': 1.0, 'vrais ennemis': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un expert d\u00e9nonce les dangers d'un pesticide\n",
"defaultdict(<class 'float'>, {'expert d\u00e9nonce': 1.0, 'pesticide': 1.0, 'dangers d': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Huissier ind\u00e9sirable : le maire de Murs-\u00c9rign\u00e9 \u00e9crit au ministre de l'Int\u00e9rieur\n",
"defaultdict(<class 'float'>, {'Murs-\u00c9rign\u00e9 \u00e9crit': 1.0, 'l': 1.0, 'Huissier ind\u00e9sirable': 1.0, 'Int\u00e9rieur': 1.0, 'maire': 1.0, 'ministre': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent TS : nouvelles accusations\n",
"defaultdict(<class 'float'>, {'nouvelles accusations': 1.0, 'R\u00e9gent TS': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un cocotier dans votre salon ?\n",
"defaultdict(<class 'float'>, {'cocotier': 1.0, 'salon': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Pratiques commerciales: L'autre malbouffe\n",
"defaultdict(<class 'float'>, {'Pratiques commerciales': 1.0, 'autre malbouffe': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Quel avenir pour le XXIe si\u00e8cle ?"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'avenir': 1.0, 'si\u00e8cle': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Des abeilles, du miel et du pain d'\u00e9pice\n",
"defaultdict(<class 'float'>, {'pain d': 1.0, 'abeilles': 1.0, 'miel': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Abeilles et fipronil : un dossier \" empoisonnant \"\n",
"defaultdict(<class 'float'>, {'fipronil': 1.0, 'dossier': 1.0, 'Abeilles': 1.0, 'empoisonnant': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Des apiculteurs manifestent \u00e0 Angers contre \"une tentative d'intimidation\"\n",
"defaultdict(<class 'float'>, {'tentative d': 1.0, 'apiculteurs manifestent': 1.0, 'Angers': 1.0, 'intimidation': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Insecticides : manifestation apr\u00e8s \" l'intimidation \"\n",
"defaultdict(<class 'float'>, {'l': 1.0, 'manifestation': 1.0, 'Insecticides': 1.0, 'intimidation': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Apiculteurs : non \u00e0 l'atteinte aux libert\u00e9s"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'libert\u00e9s': 1.0, 'l': 1.0, 'Apiculteurs': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Une ruche politique... consensuelle\n",
"defaultdict(<class 'float'>, {'ruche politique': 1.0, 'consensuelle': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Les apiculteurs manifestent\n",
"defaultdict(<class 'float'>, {'apiculteurs manifestent': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"[Les apiculteurs locaux invitent la population vendredi 13 f\u00e9vrier \u00e0 20 heures \u00e0 la mairie pour assister \u00e0 une projection de cassettes vid\u00e9os sur la vie des abeilles et les cons\u00e9quences de l'utilisation de certains insecticides.]\n",
"defaultdict(<class 'float'>, {'vie': 1.0, 'abeilles': 1.0, 'heures': 1.0, 'l': 1.0, 'f\u00e9vrier': 1.0, 'cassettes vid\u00e9os': 1.0, 'insecticides': 1.0, 'population vendredi': 1.0, 'projection': 1.0, 'mairie': 1.0, 'utilisation': 1.0, 'cons\u00e9quences': 1.0, 'apiculteurs locaux invitent': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"\" Les abeilles du coche \""
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'abeilles': 1.0, 'coche': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Gaucho : une r\u00e9action de Philippe Bodard\n",
"defaultdict(<class 'float'>, {'Philippe Bodard': 1.0, 'r\u00e9action': 1.0, 'Gaucho': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Baisse de production des apiculteurs fran\u00e7ais\n",
"defaultdict(<class 'float'>, {'apiculteurs fran\u00e7ais': 1.0, 'production': 1.0, 'Baisse': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Une esp\u00e8ce en danger\n",
"defaultdict(<class 'float'>, {'danger': 1.0, 'esp\u00e8ce': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Progression du taux de mortalit\u00e9 chez les abeilles\n",
"defaultdict(<class 'float'>, {'mortalit\u00e9': 1.0, 'abeilles': 1.0, 'Progression': 1.0, 'taux': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un huissier chez des apiculteurs Visite impromptue, vendredi, dans le Maine-et-Loire lors d'une r\u00e9union.\n",
"defaultdict(<class 'float'>, {'apiculteurs Visite impromptue': 1.0, 'huissier': 1.0, 'Maine-et-Loire': 1.0, 'vendredi': 1.0, 'd': 1.0, 'r\u00e9union': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un huissier chez des apiculteurs Visite surprise \u00e0 Blaison-Gohier (49) lors d'une r\u00e9union sur les insecticides.\n",
"defaultdict(<class 'float'>, {'Blaison-Gohier': 1.0, 'huissier': 1.0, 'insecticides': 1.0, 'r\u00e9union': 1.0, 'apiculteurs Visite surprise': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un huissier chez des apiculteurs Visite impromptue \u00e0 Blaison-Gohier, lors d'une r\u00e9union sur les insecticides.\n",
"defaultdict(<class 'float'>, {'Blaison-Gohier': 1.0, 'huissier': 1.0, 'insecticides': 1.0, 'apiculteurs Visite impromptue': 1.0, 'r\u00e9union': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un huissier chez des apiculteurs Visite impromptue, pr\u00e8s d'Angers, lors d'une r\u00e9union sur les insecticides.\n",
"defaultdict(<class 'float'>, {'r\u00e9union': 1.0, 'huissier': 1.0, 'insecticides': 1.0, 'apiculteurs Visite impromptue': 1.0, 'Angers': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Intimidation d'un huissier apr\u00e8s une r\u00e9union d'information\n",
"defaultdict(<class 'float'>, {'Intimidation d': 1.0, 'huissier': 1.0, 'r\u00e9union d': 1.0, 'information': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Les apiculteurs interpellent les citoyens\n",
"defaultdict(<class 'float'>, {'citoyens': 1.0, 'apiculteurs interpellent': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Insecticides : \" T\u00e9moins g\u00eanants \", huissier g\u00ean\u00e9\n",
"defaultdict(<class 'float'>, {'T\u00e9moins g\u00eanants': 1.0, 'huissier g\u00ean\u00e9': 1.0, 'Insecticides': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Professeur Narbonne : \" Certaines personnes n'ont pas bien fait leur boulot \"\n",
"defaultdict(<class 'float'>, {'boulot': 1.0, 'personnes n': 1.0, 'Professeur Narbonne': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Homologation bloqu\u00e9e pour le Regent .\n",
"defaultdict(<class 'float'>, {'Homologation bloqu\u00e9e': 1.0, 'Regent': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Insecticides : un film saisi pr\u00e8s d'Angers\n",
"defaultdict(<class 'float'>, {'film saisi': 1.0, 'Angers': 1.0, 'Insecticides': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Un huissier saisit un film sur des insecticides lors d'une r\u00e9union publique\n",
"defaultdict(<class 'float'>, {'huissier saisit': 1.0, 'film': 1.0, 'insecticides': 1.0, 'r\u00e9union publique': 1.0, 'd': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Les apiculteurs ont le bourdon\n",
"defaultdict(<class 'float'>, {'apiculteurs ont': 1.0, 'bourdon': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent : BASF demande \u00e0 \u00eatre t\u00e9moin assist\u00e9\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, '\u00eatre t\u00e9moin assist\u00e9': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent : BASF demande \u00e0 \u00eatre t\u00e9moin assist\u00e9\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, '\u00eatre t\u00e9moin assist\u00e9': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent : BASF demande \u00e0 \u00eatre t\u00e9moin assist\u00e9\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, '\u00eatre t\u00e9moin assist\u00e9': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent : BASF demande \u00e0 \u00eatre t\u00e9moin assist\u00e9\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, '\u00eatre t\u00e9moin assist\u00e9': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"BASF demande le statut de \" t\u00e9moin assist\u00e9 \"\n",
"defaultdict(<class 'float'>, {'t\u00e9moin assist\u00e9': 1.0, 'statut': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Le Gaucho et le R\u00e9gent TS toujours en accusation\n",
"defaultdict(<class 'float'>, {'R\u00e9gent TS': 1.0, 'Gaucho': 1.0, 'accusation': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Attention, abeilles en danger"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"defaultdict(<class 'float'>, {'danger': 1.0, 'Attention': 1.0, 'abeilles': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"R\u00e9gent : BASF demande \u00e0 \u00eatre t\u00e9moin assist\u00e9\n",
"defaultdict(<class 'float'>, {'R\u00e9gent': 1.0, '\u00eatre t\u00e9moin assist\u00e9': 1.0, 'BASF demande': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"La Conf\u00e9d\u00e9ration interpelle l'agriculture raisonn\u00e9e\n",
"defaultdict(<class 'float'>, {'agriculture raisonn\u00e9e': 1.0, 'Conf\u00e9d\u00e9ration interpelle l': 1.0})"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"firstchild = presse.children.first()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for n in Node_Ngram.objects.filter(node=firstchild):\n",
" print(n.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"liste_ordered = collections.OrderedDict(sorted(liste.items()), key=lambda t: t[1])"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#liste_ordered"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cr\u00e9ation des Listes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import collections"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"liste = collections.defaultdict(int)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" whitelist_type = NodeType.objects.get(name='WhiteList')\n",
" blacklist_type = NodeType.objects.get(name='BlackList')\n",
"except:\n",
" whitelist_type = NodeType(name='WhiteList')\n",
" whitelist_type.save()\n",
" \n",
" blacklist_type = NodeType(name='BlackList')\n",
" blacklist_type.save()\n",
"\n",
"white_node = Node.objects.create(name='WhiteList Pubmed', user=me, parent=corpus_pubmed, type=whitelist_type)\n",
"black_node = Node.objects.create(name='BlackList Pubmed', user=me, parent=corpus_pubmed, type=blacklist_type)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node_Ngram.objects.filter(node=white_node).count()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cr\u00e9ation de la white list"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with transaction.atomic():\n",
" for node in presse.children.all():\n",
" for node_ngram in Node_Ngram.objects.filter(node=node):\n",
" if node_ngram.ngram.n > 1:\n",
" #liste[node_ngram.ngram.terms] += node_ngram.weight\n",
" Node_Ngram.objects.create(node=white_node, ngram=node_ngram.ngram, weight=1)"
],
"language": "python",
"metadata": {},
"outputs": []
},
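{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loop above issues one INSERT per n-gram. A sketch of the same fill using `bulk_create`, which sends the links in a single batched query:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sketch: batch the whitelist links instead of one INSERT per row\n",
"links = [Node_Ngram(node=white_node, ngram=node_ngram.ngram, weight=1)\n",
"         for node in presse.children.all()\n",
"         for node_ngram in Node_Ngram.objects.filter(node=node)\n",
"         if node_ngram.ngram.n > 1]\n",
"Node_Ngram.objects.bulk_create(links)"
],
"language": "python",
"metadata": {},
"outputs": []
},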
{
"cell_type": "code",
"collapsed": false,
"input": [
"white_node.pk"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node_Ngram.objects.filter(node=white_node).count()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cr\u00e9ation de la black list"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with transaction.atomic():\n",
" for node_ngram_object in Node_Ngram.objects.all()[101:150]:\n",
" Node_Ngram.objects.create(node=black_node, ngram=node_ngram_object.ngram, occurences=1)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node_Ngram.objects.filter(node=black_node)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cr\u00e9ation des synonymes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"syno_type = NodeType.objects.get(name='Synonyme')\n",
"syno_node = Node.objects.create(name='Syno Pubmed',\n",
" user=user, \n",
" parent=corpus, \n",
" type=syno_type)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"synonyme1, synonyme2 = Node_Ngram.objects.filter(node=white_node)[3:5]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"NodeNgramNgram.objects.create(node=syno_node, ngramX=synonyme1.ngram, ngramY=synonyme2.ngram)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cooccurrence"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"white_node.children.count()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"black_node.pk"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" cooc_type = NodeType.objects.get(name='Cooccurrence')\n",
"except:\n",
" cooc_type = NodeType(name='Cooccurrence')\n",
" cooc_type.save()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cooc = Node.objects.create(user=me,\\\n",
" parent=corpus_pubmed,\\\n",
" type=cooc_type,\\\n",
" name=\"Cooccurrences calcul Alpha\")"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cooc.pk"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"white_node.children.all().delete()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from django.db import connection\n",
"cursor = connection.cursor()\n",
"# LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;\n",
"query_string = \"\"\"\n",
"INSERT INTO node_nodengramngram (node_id, \"ngramX_id\", \"ngramY_id\", score)\n",
"\n",
"SELECT \n",
"%d as node_id, x.ngram_id, y.ngram_id, COUNT(*) AS score\n",
"\n",
"FROM\n",
"node_node_ngram AS x\n",
"\n",
"INNER JOIN \n",
"node_node_ngram AS y \n",
"ON x.node_id = y.node_id\n",
"\n",
"\n",
"WHERE\n",
"x.id in (select id from node_node_ngram WHERE node_id = %d )\n",
"AND\n",
"y.id in (select id from node_node_ngram WHERE node_id = %d )\n",
"AND\n",
"x.ngram_id <> y.ngram_id\n",
"\n",
"\n",
"GROUP BY\n",
"x.ngram_id, y.ngram_id\n",
"\n",
"HAVING count(*) > 1\n",
"\n",
"ORDER BY score\n",
"\n",
"LIMIT 300\n",
"\n",
" \"\"\" % (cooc.pk, white_node.pk, white_node.pk)\n",
"\n",
"cursor.execute(query_string)\n",
"\n",
"try:\n",
" while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)\n",
"except:\n",
" pass"
],
"language": "python",
"metadata": {},
"outputs": []
},
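{
"cell_type": "markdown",
"metadata": {},
"source": [
"Interpolating the ids with `%` works here because they are integers, but letting the cursor bind the parameters is the safer habit. A sketch of the same cooccurrence query with placeholders (the `x.id IN (SELECT ...)` subqueries reduce to plain `node_id` filters):"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sketch: same self-join cooccurrence count, with bound parameters\n",
"params_query = \"\"\"\n",
"INSERT INTO node_nodengramngram (node_id, \"ngramX_id\", \"ngramY_id\", score)\n",
"SELECT %s, x.ngram_id, y.ngram_id, COUNT(*) AS score\n",
"FROM node_node_ngram AS x\n",
"INNER JOIN node_node_ngram AS y ON x.node_id = y.node_id\n",
"WHERE x.node_id = %s AND y.node_id = %s AND x.ngram_id <> y.ngram_id\n",
"GROUP BY x.ngram_id, y.ngram_id\n",
"HAVING COUNT(*) > 1\n",
"ORDER BY score\n",
"LIMIT 300\n",
"\"\"\"\n",
"cursor.execute(params_query, [cooc.pk, white_node.pk, white_node.pk])"
],
"language": "python",
"metadata": {},
"outputs": []
},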
{
"cell_type": "code",
"collapsed": false,
"input": [
"from copy import copy\n",
"import numpy as np\n",
"import pandas as pd\n",
"import networkx as nx\n",
"from collections import defaultdict\n",
"from analysis.louvain import *\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix = \"\""
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix = defaultdict(lambda : defaultdict(float))\n",
"for cooccurrence in NodeNgramNgram.objects.filter(node=cooc):\n",
" if cooccurrence.score > 1 :\n",
" #print(x.ngramX.terms, x.ngramY.terms)\n",
" matrix[cooccurrence.ngramX.terms][cooccurrence.ngramY.terms] = cooccurrence.score\n",
" matrix[cooccurrence.ngramY.terms][cooccurrence.ngramX.terms] = cooccurrence.score"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = pd.DataFrame(matrix).T.fillna(0)\n",
"x = copy(df.values)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"x = np.where((x.sum(axis=1) > x.shape[0] / 2), 0, x )\n",
"x = np.where((x.sum(axis=1) > x.shape[0] / 10), 0, x )"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"x = x / x.sum(axis=1)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix_filtered = np.where(x > .4, 1, 0)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix_filtered"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"G = nx.from_numpy_matrix(matrix_filtered)\n",
"G = nx.relabel_nodes(G, dict(enumerate(df.columns)))"
],
"language": "python",
"metadata": {},
"outputs": []
},
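{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch for exporting the term graph as JSON, e.g. for a javascript client. Note that `json_graph.node_link_data` produces the dict (its counterpart `node_link_graph` reads one back); the target path is hypothetical:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import json\n",
"\n",
"from networkx.readwrite import json_graph\n",
"\n",
"# sketch: node_link_data returns a JSON-serializable dict of nodes and links\n",
"with open('/tmp/graph.json', 'w') as f:\n",
"    json.dump(json_graph.node_link_data(G), f)"
],
"language": "python",
"metadata": {},
"outputs": []
},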
{
"cell_type": "code",
"collapsed": false,
"input": [
"nx.draw(G, with_labels=True)\n",
"plt.show()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"partition = best_partition(G)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#partition"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pos = nx.spring_layout(G)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"count = 0.0\n",
"node_min = 3\n",
"for com in set(partition.values()) :\n",
" count = count + 1\n",
" list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]\n",
" \n",
" if len(list_nodes) > node_min:\n",
" nx.draw_networkx_nodes(G, pos, list_nodes, node_size = 20, with_labels=True)#, node_color = str(count / size))\n",
" nx.draw_networkx_edges(G, pos, alpha=0.5)\n",
" plt.title(\"Clique \" + str(count))\n",
" \n",
" for node in list_nodes: \n",
" print(node)\n",
" plt.show()\n",
" print(\"-\" * 30)\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Asynchrone"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from celerytest.tasks import add"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"add."
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"<AsyncResult: c7df5232-b80a-4dd4-b615-432a6fb206e4>"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from celerytest.tasks import Test"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"t = Test()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res = t.addition.delay((2,2))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res = t.addition.apply_async((2,2), countdown=2)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res.get()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "addition() takes 2 positional arguments but 3 were given",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-5-8bb969b0b8af>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mres\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/celery/result.py\u001b[0m in \u001b[0;36mget\u001b[1;34m(self, timeout, propagate, interval, no_ack, follow_parents, EXCEPTION_STATES, PROPAGATE_STATES)\u001b[0m\n\u001b[0;32m 173\u001b[0m \u001b[0mstatus\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmeta\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'status'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 174\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mPROPAGATE_STATES\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mpropagate\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 175\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbackend\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexception_to_python\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmeta\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'result'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 176\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mEXCEPTION_STATES\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 177\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbackend\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexception_to_python\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmeta\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'result'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: addition() takes 2 positional arguments but 3 were given"
]
}
],
"prompt_number": 5
},
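{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `TypeError` above is the args-unpacking pitfall: `apply_async((2, 2), countdown=2)` spreads the tuple into two positional arguments, whereas `delay((2, 2))` passed it as one. A sketch of both spellings, assuming `addition` expects a single tuple:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sketch: apply_async takes an args *tuple*, so wrap the payload explicitly\n",
"res = t.addition.delay((2, 2))                             # addition(self, (2, 2))\n",
"res = t.addition.apply_async(args=((2, 2),), countdown=2)  # same call, delayed 2s\n",
"res.get()"
],
"language": "python",
"metadata": {},
"outputs": []
},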
{
"cell_type": "code",
"collapsed": false,
"input": [
"from celery.contrib.methods import current_app"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"app."
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"False"
]
}
],
"prompt_number": 6
}
],
"metadata": {}
}
]
}
\ No newline at end of file
{
"metadata": {
"name": "",
"signature": "sha256:01c37f613503c408d979ba9eb9172cbd9b6b3be2ff0d7d35089d705cebc989c2"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from node.models import Node, NodeType,\\\n",
" Project, Corpus, Document,\\\n",
" Ngram, Node_Ngram,\\\n",
" User, Language, ResourceType\n",
" \n",
"from parsing.Caches import NgramsCache\n",
" \n",
"from django.db import connection\n",
"cursor = connection.cursor()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"me = User.objects.get(username='alexandre')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import networkx as nx\n",
"from networkx.readwrite import json_graph"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import csv"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"help(csv.writer)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Help on built-in function writer in module _csv:\n",
"\n",
"writer(...)\n",
" csv_writer = csv.writer(fileobj [, dialect='excel']\n",
" [optional keyword args])\n",
" for row in sequence:\n",
" csv_writer.writerow(row)\n",
" \n",
" [or]\n",
" \n",
" csv_writer = csv.writer(fileobj [, dialect='excel']\n",
" [optional keyword args])\n",
" csv_writer.writerows(rows)\n",
" \n",
" The \"fileobj\" argument can be any object that supports the file API.\n",
"\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"type(x)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 52,
"text": [
"_io.TextIOWrapper"
]
}
],
"prompt_number": 52
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"file = open('/tmp/test.graph', 'w')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"-c:1: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/test.graph' mode='w' encoding='UTF-8'>\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"WARNING:py.warnings:-c:1: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/test.graph' mode='w' encoding='UTF-8'>\n",
"\n"
]
}
],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"file.write('ici il fait beau')\n",
"file.close()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 46
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"G = nx.complete_graph(30)\n",
"f = open(\"graph.json\", \"w\")\n",
"f.write(json_graph.node_link_graph(G))\n",
"f.close()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"-c:2: ResourceWarning: unclosed file <_io.TextIOWrapper name='graph.json' mode='w' encoding='UTF-8'>\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"WARNING:py.warnings:-c:2: ResourceWarning: unclosed file <_io.TextIOWrapper name='graph.json' mode='w' encoding='UTF-8'>\n",
"\n"
]
},
{
"ename": "AttributeError",
"evalue": "'Graph' object has no attribute 'get'",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-17-7d4aa550fd32>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mG\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcomplete_graph\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m30\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"graph.json\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"w\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjson_graph\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnode_link_graph\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mG\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/networkx/readwrite/json_graph/node_link.py\u001b[0m in \u001b[0;36mnode_link_graph\u001b[1;34m(data, directed, multigraph, attrs)\u001b[0m\n\u001b[0;32m 134\u001b[0m \u001b[0mnode_link_data\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0madjacency_data\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtree_data\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 135\u001b[0m \"\"\"\n\u001b[1;32m--> 136\u001b[1;33m \u001b[0mmultigraph\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'multigraph'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmultigraph\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 137\u001b[0m \u001b[0mdirected\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'directed'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdirected\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 138\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mmultigraph\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mAttributeError\u001b[0m: 'Graph' object has no attribute 'get'"
]
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"typeCorpus = NodeType.objects.get(name='Corpus')\n",
"for corpus in Node.objects.filter(type=typeCorpus):\n",
" print(\"#%d - %s\" % (corpus.id, corpus))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"#7 - Presse corpus\n"
]
}
],
"prompt_number": 33
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" me = User.objects.get(username='alexandre')\n",
"except:\n",
" me = User(username='alexandre')\n",
" me.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#\u00a0corpus = Node.objects.filter(type=typeCorpus).first()\n",
"corpus = Node.objects.get(id=13064)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Occurences"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query_date = \"\"\"\n",
" SELECT\n",
" id,\n",
" metadata -> 'publication_year' as year,\n",
" metadata -> 'publication_month' as month, \n",
" metadata -> 'publication_day' as day,\n",
" metadata -> 'title',\n",
" FROM\n",
" node_node AS n\n",
" WHERE\n",
" n.parent_id = %d\n",
" ORDER BY\n",
" year, month, day ASC\n",
" LIMIT\n",
" 20\n",
" OFFSET\n",
" %d\n",
"\"\"\" % (corpus.id, 0)\n",
"\n",
"cursor.execute(query_date)\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(13066, '1954', '11', '18', 'TOXICITY OF PESTICIDE DUSTS TO HONEYBEES', None)\n",
"(13065, '1954', '11', '18', 'EQUIPMENT AND TECHNIQUE USED IN LABORATORY EVALUATION OF PESTICIDE DUSTS IN TOXICOLOGICAL STUDIES WITH HONEYBEES', None)\n",
"(13067, '1958', '11', '18', 'TOXICITY OF PESTICIDES TO HONEY BEES IN LABORATORY AND FIELD TESTS IN SOUTHERN CALIFORNIA, 1955-1956', None)\n",
"(13068, '1959', '11', '18', 'THE EFFECTS OF FIELD APPLICATIONS OF SOME OF THE NEWER PESTICIDES ON HONEY BEES', None)\n",
"(13069, '1968', '11', '18', 'PROTECTING HONEYBEES FROM PESTICIDES', None)\n",
"(13071, '1969', '11', '18', 'PESTICIDE TOXICITY AND HONEY BEES', None)\n",
"(13070, '1969', '11', '18', 'POLLEN GATHERING OF HONEY BEES REDUCED BY PESTICIDE SPRAYS', None)\n",
"(13072, '1971', '11', '18', 'NEWER PESTICIDES DONT HARM ENVIRONMENT, BUT WHERE HAVE ALL BEES GONE', None)\n",
"(13075, '1971', '11', '18', 'HONEYBEES, PESTICIDES AND LAW .3.', None)\n",
"(13074, '1971', '11', '18', 'HONEYBEES, PESTICIDES AND LAW .2.', None)\n",
"(13073, '1971', '11', '18', 'HONEYBEES, PESTICIDES AND LAW .1.', None)\n",
"(13076, '1972', '11', '18', 'RICE FIELD MOSQUITO-CONTROL STUDIES WITH LOW VOLUME DURSBAN SPRAYS IN COLUSA COUNTY, CALIFORNIA .5. EFFECTS UPON HONEY BEES', None)\n",
"(13078, '1974', '11', '18', 'MICROSOMAL OXIDASES IN HONEY BEE, APIS-MELLIFERA (L)', None)\n",
"(13077, '1974', '11', '18', 'ISOLATED HONEY BEE ABDOMENS FOR MONITORING EFFECTS OF STRESS IN AMERICAN COCKROACH', None)\n",
"(13079, '1975', '11', '18', 'INHIBITOR OF MICROSOMAL OXIDATION FROM GUT TISSUES OF HONEY BEE (APIS-MELLIFERA)', None)\n",
"(13080, '1975', '11', '18', 'REPELLENT ADDITIVES TO REDUCE PESTICIDE HAZARDS TO HONEY BEES HYMENOPTERA-APIDAE, APIS-MELLIFERA-L - FIELD-TESTS', None)\n",
"(13081, '1975', '11', '18', 'HONEYBEE ABDOMEN ASSAYS OF HEMOLYMPH FROM STRESSED AND EXTERNALLY POISONED AMERICAN COCKROACHES', None)\n",
"(13082, '1976', '11', '18', 'PROBLEM OF PESTICIDES NOT DANGEROUS TO BEES', None)\n",
"(13084, '1977', '11', '18', 'EFFECT OF SOME PESTICIDES ON A SOLITARY BEE (MEGACHILE-PACIFICA-PANZ) - (HYMENOPTERA, MEGACHILIDAE)', None)\n",
"(13085, '1977', '11', '18', 'METHOD FOR TESTING PESTICIDE TOXICITY WHICH IS SUITABLE FOR SOLITARY BEES AND ESPECIALLY FOR MEGACHILE-PACIFICA-PANZ - (HYMENOPTERA, MEGACHILIDAE)', None)\n"
]
}
],
"prompt_number": 35
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query_date = \"\"\"\n",
" SELECT\n",
" metadata -> 'publication_year' as year,\n",
" metadata -> 'publication_month' as month, \n",
" metadata -> 'publication_day' as day,\n",
" COUNT(*)\n",
" FROM\n",
" node_node AS n\n",
" WHERE\n",
" n.parent_id = %d\n",
" GROUP BY\n",
" day, month, year\n",
" ORDER BY\n",
" year, month, day ASC\n",
" LIMIT\n",
" 20\n",
" OFFSET\n",
" %d\n",
"\"\"\" % (corpus.id, 0)\n",
"\n",
"cursor.execute(query_date)\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print('\\'' + row[0] + '/' + row[1] + '/' + row[2] + '\\'' \n",
" + ',' + '\\'' + str(row[3]) + '\\'' )"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"'1954/11/18','2'\n",
"'1958/11/18','1'\n",
"'1959/11/18','1'\n",
"'1968/11/18','1'\n",
"'1969/11/18','2'\n",
"'1971/11/18','4'\n",
"'1972/11/18','1'\n",
"'1974/11/18','2'\n",
"'1975/11/18','3'\n",
"'1976/11/18','1'\n",
"'1977/11/18','6'\n",
"'1978/11/18','11'\n",
"'1979/11/18','9'\n",
"'1980/11/18','6'\n",
"'1981/11/18','4'\n",
"'1982/11/18','7'\n",
"'1983/11/18','14'\n",
"'1984/11/18','17'\n",
"'1985/11/18','18'\n",
"'1986/02/21','1'\n"
]
}
],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cursor.execute(\"\"\"\n",
" SELECT\n",
" COUNT(*) AS occurrences,\n",
" ngX.terms\n",
" FROM\n",
" node_node AS n\n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" WHERE\n",
" n.parent_id = %s\n",
" AND\n",
" ngX.n >= 2\n",
" GROUP BY\n",
" ngX.terms\n",
" Having\n",
" COUNT(*) > 7\n",
" ORDER BY\n",
" occurrences DESC\n",
" LIMIT\n",
" 100\n",
" \n",
"\"\"\", [corpus.id])\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(138, 'honey bees')\n",
"(132, 'apis mellifera')\n",
"(69, 'honey bee')\n",
"(66, 'apis mellifera l')\n",
"(45, 'pesticide residues')\n",
"(39, 'gas chromatography')\n",
"(36, 'varroa destructor')\n",
"(36, 'honey bee colonies')\n",
"(30, 'sublethal effects')\n",
"(27, 'apidae )')\n",
"(21, 'neonicotinoid insecticides')\n",
"(21, 'honey bee ( hymenoptera')\n",
"(18, 'bee products')\n",
"(18, 'megachile rotundata')\n",
"(18, 'solid-phase extraction')\n",
"(18, 'simultaneous determination')\n",
"(18, 'mass spectrometric')\n",
"(15, 'case study')\n",
"(15, 'honey samples')\n",
"(15, 'liquid chromatography')\n",
"(15, 'high performance liquid chromatography')\n",
"(15, 'varroa mites')\n",
"(12, 'organochlorine pesticides')\n",
"(12, 'gas chromatography-mass spectrometry')\n",
"(12, 'liquid chromatography-mass spectrometry')\n",
"(12, 'colony health')\n",
"(12, 'gas chromatographic')\n",
"(12, 'colony collapse disorder')\n",
"(12, 'bumble bees')\n",
"(12, 'varroa jacobsoni')\n",
"(9, 'chemiluminescent elisa')\n",
"(9, 'diversionary plantings for reduction of pesticide related bee mortality')\n",
"(9, 'pesticides and law')\n",
"(9, 'plant protection products')\n",
"(9, 'nomia melanderi')\n",
"(9, 'electron-capture detection')\n",
"(9, 'managed pollinator cap coordinated agricultural project a national research')\n",
"(9, 'apis florea f')\n",
"(9, 'solid-phase microextraction')\n",
"(9, 'extension initiative')\n",
"(9, 'crop pollination')\n",
"(9, 'non-apis bees')\n",
"(9, 'honey bees ( apis mellifera')\n",
"(9, 'liquid chromatography-tandem mass spectrometry')\n",
"(9, 'bee pollen')\n",
"(9, 'foraging behavior')\n",
"(9, 'biological control')\n",
"(9, 'nosema ceranae')\n",
"(9, 'organophosphorus pesticides')\n",
"(9, 'field conditions')\n",
"(9, 'honey bee apis mellifera l')\n",
"(9, 'laboratory tests')\n",
"(9, 'beauveria bassiana')\n",
"(9, 'comparative toxicity')\n",
"(9, 'high levels')\n",
"(9, 'pesticide exposure')\n",
"(9, 'fluvalinate residues')\n",
"(9, 'insecticide residues')\n",
"(9, 'osmia lignaria')\n",
"(9, 'bombus impatiens')\n",
"(9, 'honey bee health')\n",
"(9, 'agricultural landscape')\n",
"(9, 'dispersive liquid-liquid microextraction')\n",
"(9, 'matrix solid-phase dispersion')\n"
]
}
],
"prompt_number": 28
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Cooccurrences par ann\u00e9e"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cursor.execute(\"\"\"\n",
" SELECT\n",
" COUNT(*) AS occurrences,\n",
" n.metadata->'publication_year' AS year,\n",
" ngX.terms\n",
" FROM\n",
" node_node AS n\n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" WHERE\n",
" n.parent_id = %s\n",
" GROUP BY\n",
" terms,\n",
" year\n",
" ORDER BY\n",
" occurrences DESC\n",
" LIMIT\n",
" 20\n",
"\"\"\", [corpus.id])\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(24, '2014', 'patients')\n",
"(22, '2005', 'patients')\n",
"(18, '2005', 'study')\n",
"(15, '2014', 'voice')\n",
"(14, '2002', 'disease')\n",
"(14, '2013', 'patients')\n",
"(14, '2006', 'study')\n",
"(13, '2014', 'treatment')\n",
"(12, '2011', 'patients')\n",
"(12, '2004', 'voice')\n",
"(12, '2012', 'patients')\n",
"(12, '2003', 'patients')\n",
"(12, '2005', 'voice')\n",
"(11, '2002', 'patients')\n",
"(11, '2014', 'study')\n",
"(10, '2007', 'patients')\n",
"(10, '2006', 'patients')\n",
"(10, '2004', 'study')\n",
"(10, '2001', 'patients')\n",
"(10, '2014', 'phase')\n"
]
}
],
"prompt_number": 105
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Cr\u00e9ation d'une liste de synonymes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ngramsCache = NgramsCache(Language.objects.get(iso2='fr'))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"synonymePairs = [\n",
" ['danger', 'risques'],\n",
" ['risque', 'risques'],\n",
" ['R\u00e9gent', 'R\u00e9gent TS']\n",
"]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" typeSynonyme = NodeType.objects.get(name='Synonyme')\n",
"except:\n",
" typeSynonyme = NodeType(name='Synonyme')\n",
" typeSynonyme.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"listSynonymes = Node(name='Syno abeilles', type=typeSynonyme, user=me)\n",
"listSynonymes.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 22
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for synonymePair in synonymePairs:\n",
" NodeNgramNgram(\n",
" ngramx = ngramsCache[synonymePair[0]],\n",
" ngramy = ngramsCache[synonymePair[1]],\n",
" node = listSynonymes,\n",
" score = 1.\n",
" ).save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"listSynonymes.id"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 24,
"text": [
"61297"
]
}
],
"prompt_number": 24
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Occurrences avec synonymes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"'''cursor.execute(\"\"\"\n",
" SELECT\n",
" COUNT(*) AS occurrences,\n",
" ngx.terms\n",
" FROM\n",
" node_node AS n\n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" WHERE\n",
" n.parent_id = %s\n",
" GROUP BY\n",
" ngX.terms\n",
" ORDER BY\n",
" occurrences DESC\n",
" LIMIT\n",
" 20\n",
"\"\"\")'''\n",
"cursor.execute(\"\"\"\n",
" SELECT\n",
" n.id\n",
" FROM\n",
" node_node AS n\n",
" INNER JOIN\n",
" node_node_ngram AS nngx ON nngx.node_id = n.id\n",
" INNER JOIN\n",
" node_nodengramngram AS nngng ON nngng.ngramx_id = nngx.ngram_id\n",
" INNER JOIN\n",
" node_node_ngram AS nngy ON nngy.id = nngng.ngramy_id\n",
" WHERE\n",
" n.parent_id = %s\n",
"\"\"\", [corpus.id])\n",
"#\u00a0\"\"\" % [listSynonymes.id])\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 26
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Cooccurrences"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cursor.execute(\"\"\"\n",
" SELECT\n",
" COUNT(*) AS cooccurrences,\n",
" ngX.terms,\n",
" ngY.terms\n",
" FROM\n",
" node_node AS n\n",
" \n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" \n",
" INNER JOIN\n",
" node_node_ngram AS nngY ON nngY.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngY ON ngY.id = nngY.ngram_id\n",
" \n",
" WHERE\n",
" n.parent_id = %s\n",
" AND\n",
" nngX.ngram_id in (select id from node_node_ngram WHERE node_id = 61298 )\n",
" AND\n",
" nngY.ngram_id in (select id from node_node_ngram WHERE node_id = 61298 )\n",
" AND\n",
" nngX.ngram_id <> nngY.ngram_id\n",
" \n",
" GROUP BY\n",
" ngX.id,\n",
" ngX.terms,\n",
" ngY.id,\n",
" ngY.terms\n",
" ORDER BY\n",
" cooccurrences DESC\n",
" LIMIT\n",
" 200\n",
"\"\"\", [corpus.id])\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cursor.execute(\"\"\"\n",
" SELECT\n",
" COUNT(*) AS cooccurrences,\n",
" ngX.terms,\n",
" ngY.terms\n",
" FROM\n",
" node_node AS n\n",
" \n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" \n",
" INNER JOIN\n",
" node_node_ngram AS nngY ON nngY.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngY ON ngY.id = nngY.ngram_id\n",
"\n",
" WHERE\n",
" n.parent_id = %s\n",
" AND\n",
" nngX.ngram_id <> nngY.ngram_id\n",
" \n",
" GROUP BY\n",
" ngX.id,\n",
" ngX.terms,\n",
" ngY.id,\n",
" ngY.terms\n",
" ORDER BY\n",
" cooccurrences DESC\n",
" LIMIT\n",
" 20\n",
"\"\"\", [corpus.id])\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"ERROR: An unexpected error occurred while tokenizing input\n",
"The following traceback may be corrupted or invalid\n",
"The error message is: ('EOF in multi-line string', (1, 0))\n",
"\n"
]
},
{
"ename": "OperationalError",
"evalue": "arr\u00eat des connexions suite \u00e0 la demande de l'administrateur\nSSL connection has been closed unexpectedly\n",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mOperationalError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-11-752593da5735>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 31\u001b[0m \u001b[0mLIMIT\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[1;36m20\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 33\u001b[1;33m \"\"\", [corpus.id])\n\u001b[0m\u001b[0;32m 34\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/backends/util.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, sql, params)\u001b[0m\n\u001b[0;32m 67\u001b[0m \u001b[0mstart\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 68\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 69\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mCursorDebugWrapper\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 70\u001b[0m \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 71\u001b[0m \u001b[0mstop\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/backends/util.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, sql, params)\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 52\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 53\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 54\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mexecutemany\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msql\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparam_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/utils.py\u001b[0m in \u001b[0;36m__exit__\u001b[1;34m(self, exc_type, exc_value, traceback)\u001b[0m\n\u001b[0;32m 97\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdj_exc_type\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mDataError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mIntegrityError\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 98\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwrapper\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merrors_occurred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 99\u001b[1;33m \u001b[0msix\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreraise\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdj_exc_type\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdj_exc_value\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtraceback\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 100\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/utils/six.py\u001b[0m in \u001b[0;36mreraise\u001b[1;34m(tp, value, tb)\u001b[0m\n\u001b[0;32m 547\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mreraise\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtp\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 548\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__traceback__\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mtb\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 549\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 550\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 551\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/backends/util.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, sql, params)\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 52\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 53\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 54\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mexecutemany\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msql\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparam_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mOperationalError\u001b[0m: arr\u00eat des connexions suite \u00e0 la demande de l'administrateur\nSSL connection has been closed unexpectedly\n"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
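On the second notebook's AttributeError ("'Graph' object has no attribute 'get'"): json_graph.node_link_graph goes in the opposite direction, parsing node-link data into a graph; serializing a graph is json_graph.node_link_data, and the result is a dict that still needs json.dump rather than file.write. A corrected sketch, assuming networkx 1.9 as pinned in requirements.txt:

```python
import json

import networkx as nx
from networkx.readwrite import json_graph

G = nx.complete_graph(30)
data = json_graph.node_link_data(G)  # graph -> dict; node_link_graph is dict -> graph
with open('graph.json', 'w') as f:   # context manager also avoids the
    json.dump(data, f)               # ResourceWarning seen in the notebook
```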
......@@ -11,7 +11,7 @@ def create_blacklist(user, corpus):
def create_synonymes(user, corpus):
pass
def create_whitelist(user, corpus):
def create_whitelist(user, corpus, size=100):
cursor = connection.cursor()
try:
......@@ -51,7 +51,7 @@ def create_whitelist(user, corpus):
AND
n.type_id = %d
AND
ngX.n >= 1
ngX.n >= 2
GROUP BY
ngX.id
......@@ -60,16 +60,16 @@ def create_whitelist(user, corpus):
ORDER BY
occurrences DESC
LIMIT
100
%d
;
""" % (white_list.id, corpus.id, type_document.id)
""" % (white_list.id, corpus.id, type_document.id, size)
cursor.execute(query_whitelist)
return white_list
#def create_cooc(user, corpus, whitelist, blacklist, synonymes):
def create_cooc(user=None, corpus=None, whitelist=None):
def create_cooc(user=None, corpus=None, whitelist=None, size=150):
cursor = connection.cursor()
try:
......@@ -127,11 +127,117 @@ def create_cooc(user=None, corpus=None, whitelist=None):
ORDER BY
score DESC
LIMIT
150
""" % (cooc.pk, corpus.id, whitelist.id, whitelist.id)
%d
""" % (cooc.pk, corpus.id, whitelist.id, whitelist.id, size)
cursor.execute(query_cooc)
return cooc
def get_cooc(request=None, corpus_id=None, cooc_id=None, type="node_link"):
import pandas as pd
from copy import copy
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
from gargantext_web.api import JsonHttpResponse
from analysis.louvain import best_partition
matrix = defaultdict(lambda : defaultdict(float))
labels = dict()
weight = dict()
corpus = Node.objects.get(id=corpus_id)
type_cooc = NodeType.objects.get(name="Cooccurrence")
if Node.objects.filter(type=type_cooc, parent=corpus).first() is None:
print("Coocurrences do not exist yet, create it.")
if type == "node_link":
n = 150
elif type == "adjacency":
n = 50
whitelist = create_whitelist(request.user, corpus, size=n)
cooccurrence_node = create_cooc(user=request.user, corpus=corpus, whitelist=whitelist, size=n)
print(cooccurrence_node.id, "Cooc created")
else:
cooccurrence_node = Node.objects.filter(type=type_cooc, parent=corpus).first()
for cooccurrence in NodeNgramNgram.objects.filter(node=cooccurrence_node):
labels[cooccurrence.ngramx.id] = cooccurrence.ngramx.terms
labels[cooccurrence.ngramy.id] = cooccurrence.ngramy.terms
matrix[cooccurrence.ngramx.id][cooccurrence.ngramy.id] = cooccurrence.score
matrix[cooccurrence.ngramy.id][cooccurrence.ngramx.id] = cooccurrence.score
weight[cooccurrence.ngramy.terms] = weight.get(cooccurrence.ngramy.terms, 0) + cooccurrence.score
weight[cooccurrence.ngramx.terms] = weight.get(cooccurrence.ngramx.terms, 0) + cooccurrence.score
df = pd.DataFrame(matrix).T.fillna(0)
x = copy(df.values)
x = x / x.sum(axis=1)
# Removing unconnected nodes
threshold = min(x.max(axis=1))
matrix_filtered = np.where(x >= threshold, 1, 0)
#matrix_filtered = np.where(x > threshold, x, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
# G.remove_nodes_from(to_remove)
partition = best_partition(G)
if type == "node_link":
for node in G.nodes():
try:
#node,type(labels[node])
G.node[node]['label'] = node
G.node[node]['name'] = node
G.node[node]['size'] = weight[node]
G.node[node]['group'] = partition[node]
#G.add_edge(node, partition[node], weight=3)
# G.node[node]['color'] = '19,180,300'
except Exception as error:
print(error)
data = json_graph.node_link_data(G)
elif type == "adjacency":
for node in G.nodes():
try:
#node,type(labels[node])
#G.node[node]['label'] = node
G.node[node]['name'] = node
#G.node[node]['size'] = weight[node]
G.node[node]['group'] = partition[node]
#G.add_edge(node, partition[node], weight=3)
# G.node[node]['color'] = '19,180,300'
except Exception as error:
print(error)
data = json_graph.node_link_data(G)
# data = json_graph.node_link_data(G, attrs={\
# 'source':'source',\
# 'target':'target',\
# 'weight':'weight',\
# #'label':'label',\
# #'color':'color',\
# 'id':'id',})
#print(data)
return data
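The matrix filtering inside get_cooc above is terse: cooccurrence scores are normalized by row totals, then only cells at or above the smallest row maximum are kept, so every term retains at least one edge before Louvain partitioning. A standalone sketch of just that step, assuming `matrix` is the {ngram_id: {ngram_id: score}} dict built in the loop above (the keepdims form is used here to make the row-wise division explicit; the diff divides by x.sum(axis=1) directly):

```python
import numpy as np
import pandas as pd
import networkx as nx  # networkx 1.9, as pinned in requirements.txt

df = pd.DataFrame(matrix).T.fillna(0)                 # square ngram x ngram frame
x = df.values / df.values.sum(axis=1, keepdims=True)  # row-normalize scores
threshold = x.max(axis=1).min()                       # smallest row maximum
adjacency = np.where(x >= threshold, 1, 0)            # keep at least one edge per node
G = nx.from_numpy_matrix(adjacency)                   # then relabel and partition as above
```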
......@@ -11,12 +11,13 @@ import gargantext_web.api
admin.autodiscover()
urlpatterns = patterns('',
# url(r'^$', 'gargantext_web.views.home', name='home'),
# url(r'^blog/', include('blog.urls')),
# Admin views
url(r'^admin/', include(admin.site.urls)),
url(r'^login/', include(admin.site.urls)),
url(r'^grappelli/', include('grappelli.urls')),
# User views
url(r'^$', views.home),
url(r'^projects/$', views.projects),
......@@ -27,26 +28,35 @@ urlpatterns = patterns('',
url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
url(r'^project/(\d+)/corpus/(\d+)/delete/$', views.delete_corpus),
# Visualizations
url(r'^corpus/(\d+)/explorer$', views.explorer_graph),
url(r'^chart$', views.explorer_chart),
url(r'^matrix$', views.explorer_matrix),
#url(r'^exploration$', views.exploration),
url(r'^corpus/(\d+)/matrix$', views.explorer_matrix),
# Getting data [which?]
url(r'^chart/corpus/(\d+)/data.csv$', views.send_csv),
url(r'^corpus/(\d+)/node_link.json$', views.node_link),
url(r'^corpus/(\d+)/adjancy_matrix$', views.node_link),
url(r'^corpus/(\d+)/adjacency.json$', views.adjacency),
"""RESTful API
These URLs allow operations on the database in a RESTful way.
"""
url(r'^api$', gargantext_web.api.Root),
# retrieve all the metadata from a given node's children
url(r'^api/nodes/(\d+)/children/metadata$', gargantext_web.api.NodesChildrenMetatadata.as_view()),
# retrieve the ngrams from a given node's children
url(r'^api/nodes/(\d+)/ngrams$', gargantext_web.api.CorpusController.ngrams),
# perform a query on a given node's children
url(r'^api/nodes/(\d+)/children/queries$', gargantext_web.api.NodesChildrenQueries.as_view()),
# get all the nodes
url(r'^api/nodes$', gargantext_web.api.NodesController.get),
url(r'^api/nodes/(\d+)/ngrams$', gargantext_web.api.CorpusController.ngrams),
url(r'^api/nodes/(\d+)/data$', gargantext_web.api.CorpusController.data),
url(r'^graph-it$', views.graph_it),
url(r'^ngrams$', views.ngrams),
# other (DEPRECATED, TO BE REMOVED)
url(r'^api/nodes$', gargantext_web.api.NodesController.get),
url(r'^api/corpus/(\d+)/ngrams$', gargantext_web.api.CorpusController.ngrams),
url(r'^api/corpus/(\d+)/metadata$', gargantext_web.api.CorpusController.metadata),
url(r'^api/corpus/(\d+)/data$', gargantext_web.api.CorpusController.data),
)
......
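With the routes above in place, the two graph endpoints can be smoke-tested over HTTP. A hypothetical client-side check, assuming a dev server on localhost:8000 and the corpus id used in the notebooks:

```python
import requests  # hypothetical client; not part of this repo's requirements

base = 'http://localhost:8000'
corpus_id = 13064  # corpus id borrowed from the notebook above

node_link = requests.get('%s/corpus/%d/node_link.json' % (base, corpus_id)).json()
adjacency = requests.get('%s/corpus/%d/adjacency.json' % (base, corpus_id)).json()
print(len(node_link['nodes']), len(node_link['links']), len(adjacency['links']))
```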
......@@ -26,7 +26,6 @@ from collections import defaultdict
from parsing.FileParsers import *
# SOME FUNCTIONS
def query_to_dicts(query_string, *query_args):
......@@ -243,6 +242,7 @@ def project(request, project_id):
# async
corpus.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
#corpus.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
except Exception as error:
print(error)
......@@ -367,7 +367,6 @@ def corpus(request, project_id, corpus_id):
return HttpResponse(html)
def delete_project(request, node_id):
Node.objects.filter(id=node_id).all().delete()
return HttpResponseRedirect('/projects/')
......@@ -390,14 +389,16 @@ def explorer_graph(request, corpus_id):
return HttpResponse(html)
def explorer_matrix(request):
def explorer_matrix(request, corpus_id):
t = get_template('matrix.html')
user = request.user
date = datetime.datetime.now()
corpus = Node.objects.get(id=corpus_id)
html = t.render(Context({\
'user': user,\
'date': date,\
'corpus': corpus,\
}))
return HttpResponse(html)
......@@ -469,78 +470,27 @@ def send_csv(request, corpus_id):
return response
def node_link(request, corpus_id):
'''
Create the HttpResponse object with the graph dataset.
'''
import pandas as pd
from copy import copy
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
from gargantext_web.api import JsonHttpResponse
from analysis.louvain import best_partition
from analysis.functions import create_whitelist, create_cooc
matrix = defaultdict(lambda : defaultdict(float))
labels = dict()
corpus = Node.objects.get(id=corpus_id)
type_cooc = NodeType.objects.get(name="Cooccurrence")
if Node.objects.filter(type=type_cooc, parent=corpus).first() is None:
print("Coocurrences do not exist yet, create it.")
whitelist = create_whitelist(request.user, corpus)
cooc = create_cooc(user=request.user, corpus=corpus, whitelist=whitelist)
print(cooc.id, "Cooc created")
else:
cooc = Node.objects.filter(type=type_cooc, parent=corpus).first()
for cooccurrence in NodeNgramNgram.objects.filter(node=cooc):
labels[cooccurrence.ngramx.id] = cooccurrence.ngramx.terms
labels[cooccurrence.ngramy.id] = cooccurrence.ngramy.terms
matrix[cooccurrence.ngramx.id][cooccurrence.ngramy.id] = cooccurrence.score
matrix[cooccurrence.ngramy.id][cooccurrence.ngramx.id] = cooccurrence.score
# To get the data
from gargantext_web.api import JsonHttpResponse
from analysis.functions import get_cooc
df = pd.DataFrame(matrix).T.fillna(0)
x = copy(df.values)
x = x / x.sum(axis=1)
# Removing unconnected nodes
threshold = min(x.max(axis=1))
matrix_filtered = np.where(x > threshold, 1, 0)
#matrix_filtered = np.where(x > threshold, x, 0)
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[x] for x in list(df.columns)])))
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
# G.remove_nodes_from(to_remove)
def node_link(request, corpus_id):
'''
Create the HttpResponse object with the node_link dataset.
'''
partition = best_partition(G)
for node in G.nodes():
try:
#node,type(labels[node])
G.node[node]['label'] = node
# G.node[node]['color'] = '19,180,300'
except Exception as error:
print(error)
data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
return JsonHttpResponse(data)
data = json_graph.node_link_data(G)
# data = json_graph.node_link_data(G, attrs={\
# 'source':'source',\
# 'target':'target',\
# 'weight':'weight',\
# #'label':'label',\
# #'color':'color',\
# 'id':'id',})
#print(data)
def adjacency(request, corpus_id):
'''
Create the HttpResponse object with the adjacency dataset.
'''
data = get_cooc(request=request, corpus_id=corpus_id, type="adjacency")
return JsonHttpResponse(data)
......@@ -555,7 +505,6 @@ def graph_it(request):
}))
return HttpResponse(html)
def ngrams(request):
'''The ngrams list.'''
t = get_template('ngrams.html')
......
......@@ -4,23 +4,30 @@ Install the requirements
1) Install all the Debian packages listed in dependances.deb
(also: sudo apt-get install postgresql-contrib)
2) Create a virtual environment with pyvenv: apt-get install python-virtualenv
2) Create a Python virtual environment
3) Type: source [your virtual environment directory]/bin/activate
4) Do your work!
On Debian:
---------
sudo apt-get install python3.4-venv
pyvenv3 /srv/gargantext_env
5) Type: deactivate
On Ubuntu:
---------
sudo apt-get install python-pip
sudo pip install -U pip
sudo pip install -U virtualenv
then you can create your virtualenv in the working directory or at a
location of your choice:
Configure stuff
---------------
sudo virtualenv -p python3 /srv/gargantext_env
1) ln -s [the project folder] /srv/gargantext
3) Type: source [your virtual environment directory]/bin/activate
2) ln -s [your folder for tree tagger] [the project folder]/parsing/Tagger/treetagger
4) sudo chown -R user:user /srv/gargantext_env
pip install -r /srv/gargantext/init/requirements.txt
Warning: for ln, paths have to be absolute!
5) Type: deactivate
In PostreSQL
......@@ -52,6 +59,36 @@ Populate the database
python manage.py syncdb
Last steps of configuration:
----------------------------
1) If your project is not in /srv/gargantext:
ln -s [the project folder] /srv/gargantext
2) build gargantext_lib
wget http://docs.delanoe.org/gargantext_lib.tar.bz2
cd /srv/
sudo tar xvjf gargantext_lib.tar.bz2
sudo chown user:user /srv/gargantext_lib
3) Explorer:
cd /srv/gargantext_lib/js
git clone git@github.com:PkSM3/garg.git
4) Adapt all symlinks:
ln -s [your folder for tree tagger] [the project folder]/parsing/Tagger/treetagger
Warning: for ln, paths have to be absolute!
5) patch CTE
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/cte_tree.models.diff
6) init nodetypes and main variables
/srv/gargantext/manage.py shell < /srv/gargantext/init/init.py
Extras:
======
Start the Python Notebook server
--------------------------------
......
sudo apt-get install postgresql
sudo apt-get install postgresql-contrib
sudo apt-get install python-virtualenv
sudo apt-cache search libpng
sudo apt-get install libpng12-dev
sudo apt-get install libpng-dev
apt-cache search freetype
apt-cache search freetype | grep dev
sudo apt-cache search freetype
sudo apt-get install libfreetype6-dev
sudo apt-cache search python-dev
sudo apt-get install python-dev
sudo apt-get install libpq-dev
sudo apt-get install postgresql-contrib
sudo apt-get install libpq-dev
postgresql-contrib
libpq-dev
# add david
#
#
# To get all the dependencies of matplotlib (it's dirty, find
sudo apt-get build-dep python-matplotlib
# Debian packages to install
# easy_install -U distribute (matplotlib)
#lxml
libffi-dev
libxml2-dev
libxslt1-dev
sudo apt-get install libffi-dev
sudo apt-get install libxml2-dev
sudo apt-get install libxslt1-dev
# ipython readline
libncurses5-dev
pandoc
sudo apt-get install libncurses5-dev
sudo apt-get install pandoc
# scipy:
gfortran
libopenblas-dev
liblapack-dev
sudo apt-get install gfortran
sudo apt-get install libopenblas-dev
sudo apt-get install liblapack-dev
......@@ -82,6 +82,13 @@ except Exception as error:
typeDoc = NodeType(name='Synonyme')
typeDoc.save()
try:
typeDoc = NodeType.objects.get(name='Cooccurrence')
except Exception as error:
print(error)
typeDoc = NodeType(name='Cooccurrence')
typeDoc.save()
# In[33]:
......
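The try/except bootstrap used here for the Cooccurrence NodeType (and for the other NodeTypes in the notebooks) can be written more compactly with Django's get_or_create; a minimal equivalent sketch:

```python
# Fetches the NodeType, creating it on first run; replaces the
# try/except-around-get() pattern above.
typeCooc, created = NodeType.objects.get_or_create(name='Cooccurrence')
if created:
    print('Cooccurrence NodeType created')
```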
......@@ -4,13 +4,16 @@ Jinja2==2.7.3
MarkupSafe==0.23
Pillow==2.5.3
Pygments==1.6
SQLAlchemy==0.9.8
South==1.0
aldjemy==0.3.51
amqp==1.4.6
anyjson==0.3.3
billiard==3.3.0.18
celery==3.1.15
certifi==14.05.14
cffi==0.8.6
chardet==2.3.0
cryptography==0.6
decorator==3.4.0
django-autoslug==1.7.2
......@@ -27,9 +30,9 @@ graphviz==0.4
ipython==2.2.0
kombu==3.0.23
lxml==3.3.6
matplotlib==1.4.0
#matplotlib==1.4.0
networkx==1.9
nltk==3.0a4
#nltk==3.0a4
nose==1.3.4
numpy==1.8.2
pandas==0.14.1
......
......@@ -200,6 +200,7 @@ class Node(CTENode):
self.node_resource.update(parsed=True)
@current_app.task(filter=task_method)
def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
# if there is no cache...
if ngramsextractorscache is None:
......
import re
import locale
from lxml import etree
from datetime import datetime, date
from parsing.FileParsers.FileParser import FileParser
from parsing.NgramsExtractors import *
from .FileParser import FileParser
from ..NgramsExtractors import *
......
......@@ -3,7 +3,7 @@ import dateutil.parser
import zipfile
import chardet
from parsing.Caches import LanguagesCache
from ..Caches import LanguagesCache
class FileParser:
......
from parsing.FileParsers.RisFileParser import RisFileParser
from .RisFileParser import RisFileParser
class IsiFileParser(RisFileParser):
......
from django.db import transaction
from lxml import etree
from parsing.FileParsers.FileParser import FileParser
from parsing.NgramsExtractors import *
from .FileParser import FileParser
from ..NgramsExtractors import *
class PubmedFileParser(FileParser):
......
from django.db import transaction
from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
class RisFileParser(FileParser):
......
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
from parsing.Taggers import NltkTagger
from .NgramsExtractor import NgramsExtractor
from ..Taggers import NltkTagger
class EnglishNgramsExtractor(NgramsExtractor):
......
from parsing.Taggers.Tagger import Tagger
from .Tagger import Tagger
import nltk
......
from parsing.Taggers.Tagger import Tagger
from .Tagger import Tagger
import subprocess
import threading
......
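All the parsing/ hunks above apply one mechanical change, matching this commit's message: absolute imports rooted at parsing/ become package-relative. The payoff, sketched with the modules already shown above:

```python
# parsing/NgramsExtractors/EnglishNgramsExtractor.py after this commit:
from .NgramsExtractor import NgramsExtractor  # sibling module, same package
from ..Taggers import NltkTagger              # sibling subpackage of parsing/

# Relative imports resolve against the containing package, so parsing/
# keeps working if it is moved, renamed, or vendored elsewhere.
```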
# from NltkTagger import NltkTagger
# tagger = NltkTagger()
# text0 = "Forman Brown (1901–1996) was one of the world's leaders in puppet theatre in his day, as well as an important early gay novelist. He was a member of the Yale Puppeteers and the driving force behind Turnabout Theatre. He was born in Otsego, Michigan, in 1901 and died in 1996, two days after his 95th birthday. Brown briefly taught at North Carolina State College, followed by an extensive tour of Europe."
# text1 = "James Patrick (born c. 1940) is the pseudonym of a Scottish sociologist, which he used to publish a book A Glasgow Gang Observed. It attracted some attention in Scotland when it was published in 1973. It was based on research he had done in 1966, when he was aged 26. At that time he was working as a teacher in an Approved School, a Scottish reformatory. One gang member in the school, \"Tim Malloy\" (born 1950, also a pseudonym and a generic term for a Glasgow Catholic), agreed to infiltrate him into his gang in Maryhill in Glasgow. Patrick spent four months as a gang member, observing their behaviour."
from TreeTagger import TreeTagger
tagger = TreeTagger()
text0 = "La saison 1921-1922 du Foot-Ball Club Juventus est la vingtième de l'histoire du club, créé vingt-cinq ans plus tôt en 1897. La société turinoise qui fête cette année son 25e anniversaire prend part à l'édition du championnat dissident d'Italie de la CCI (appelé alors la Première division), la dernière édition d'une compétition annuelle de football avant l'ère fasciste de Mussolini."
text1 = "Le terme oblong désigne une forme qui est plus longue que large et dont les angles sont arrondis. En langage bibliographique, oblong signifie un format dont la largeur excède la hauteur. Ce qui correspond au format paysage en termes informatiques et \"à l'italienne\", pour l'imprimerie."
text2 = "Les sanglots longs des violons de l'automne bercent mon coeur d'une langueur monotone."
print()
print(tagger.tag_text(text0))
print()
print(tagger.tag_text(text1))
print()
print(tagger.tag_text(text2))
print()
\ No newline at end of file
/* Copyright 2013 Michael Bostock. All rights reserved. Do not copy. */
/*@import url(http://fonts.googleapis.com/css?family=PT+Serif|PT+Serif:b|PT+Serif:i|PT+Sans|PT+Sans:b);*/
html {
min-width: 1040px;
}
.ocks-org body {
background: #fcfcfa;
color: #333;
font-family: "PT Serif", serif;
margin: 1em auto 4em auto;
position: relative;
width: 960px;
}
.ocks-org header,
.ocks-org footer,
.ocks-org aside,
.ocks-org h1,
.ocks-org h2,
.ocks-org h3,
.ocks-org h4 {
font-family: "PT Sans", sans-serif;
}
.ocks-org h1,
.ocks-org h2,
.ocks-org h3,
.ocks-org h4 {
color: #000;
}
.ocks-org header,
.ocks-org footer {
color: #636363;
}
h1 {
font-size: 64px;
font-weight: 300;
letter-spacing: -2px;
margin: .3em 0 .1em 0;
}
h2 {
margin-top: 2em;
}
h1, h2 {
text-rendering: optimizeLegibility;
}
h2 a[name],
h2 a[id] {
color: #ccc;
right: 100%;
padding: 0 .3em;
position: absolute;
}
header,
footer {
font-size: small;
}
.ocks-org header aside,
.ocks-org footer aside {
float: left;
margin-right: .5em;
}
.ocks-org header aside:after,
.ocks-org footer aside:after {
padding-left: .5em;
content: "/";
}
footer {
margin-top: 8em;
}
h1 ~ aside {
font-size: small;
right: 0;
position: absolute;
width: 180px;
}
.attribution {
font-size: small;
margin-bottom: 2em;
}
body > p, li > p {
line-height: 1.5em;
}
body > p {
width: 720px;
}
body > blockquote {
width: 640px;
}
blockquote q {
display: block;
font-style: oblique;
}
ul {
padding: 0;
}
li {
width: 690px;
margin-left: 30px;
}
a {
color: steelblue;
}
a:not(:hover) {
text-decoration: none;
}
pre, code, textarea {
font-family: "Menlo", monospace;
}
code {
line-height: 1em;
}
textarea {
font-size: 100%;
}
body > pre {
border-left: solid 2px #ccc;
padding-left: 18px;
margin: 2em 0 2em -20px;
}
.html .value,
.javascript .string,
.javascript .regexp {
color: #756bb1;
}
.html .tag,
.css .tag,
.javascript .keyword {
color: #3182bd;
}
.comment {
color: #636363;
}
.html .doctype,
.javascript .number {
color: #31a354;
}
.html .attribute,
.css .attribute,
.javascript .class,
.javascript .special {
color: #e6550d;
}
svg {
font: 10px sans-serif;
}
.axis path, .axis line {
fill: none;
stroke: #000;
shape-rendering: crispEdges;
}
sup, sub {
line-height: 0;
}
q:before {
content: "“";
}
q:after {
content: "”";
}
blockquote q {
line-height: 1.5em;
display: inline;
}
blockquote q:before,
blockquote q:after {
content: "";
}
......@@ -127,8 +127,8 @@
<div class="jumbotron">
<h3><a href="/corpus/{{ corpus.id }}/explorer">Visualizations</a></h3>
<ol>
<li>Matrix</li>
<li>Static maps</li>
<li><a href="/corpus/{{ corpus.id }}/matrix">Adjacency matrix</a></li>
<li><a href="/corpus/{{ corpus.id }}/explorer">Static maps</a></li>
<li>Dynamics maps</li>
</ol>
</div>
......
......@@ -120,22 +120,19 @@
</ul>
<!--
<ul id="colorGraph" class="nav navbar-nav navbar-right">
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">
<img title="Set Colors" src="{% static "js/libs/img2/colors.png" %}" width="20px"><b class="caret"></b></img>
</a>
<ul class="dropdown-menu">
<li><a href="#" onclick='clustersBy("country")'>By Country</a></li>
<li><a href="#" onclick='clustersBy("acronym")'>By Acronym</a></li>
<li><a href="#" onclick='clustersBy("group")'>By Group</a></li>
<li><a href="#" onclick='clustersBy("default")'>By Default</a></li>
<li class="divider"></li>
<li><a href="#"> <span class="glyphicon glyphicon-repeat"></span> <strike>Properties</strike></a></li>
</ul>
</li>
</ul>
-->
<!---->
......@@ -361,7 +358,7 @@
<script type="text/javascript" src="{% static "js/libs/jquery/jquery.easytabs.min.js" %}"></script>
<script src="{% static "js/libs/bootstrap/js/bootstrap.min.js" %}"></script>
<!--<script src="{% static "js/libs/bootstrap/js/bootstrap.min.js" %}"></script>-->
<script src="{% static "js/libs/bootstrap/js/bootstrap-modal.js" %}" type="text/javascript"></script>
<script src="{% static "js/libs/bootstrap/js/bootstrap-hover-dropdown.min.js" %}" type="text/javascript"></script>
......
......@@ -5,6 +5,22 @@
{% load staticfiles %}
<link rel="stylesheet" href="{% static "css/bootstrap.css" %}">
<link rel="stylesheet" href="{% static "css/bootstrap-theme.min.css" %}">
<link rel="stylesheet" href="{% static "css/d3matrix.css" %}">
<style>
.background {
fill: #eee;
}
line {
stroke: #fff;
}
text.active {
fill: red;
}
</style>
{% endblock %}
......@@ -17,7 +33,11 @@
</div>
</div>
<script src="{% static "js/d3/d3.v2.min.js" %}></script>
<div id="graphid" style="visibility: hidden;">/corpus/{{ corpus.id }}/adjacency.json</div>
<script src="{% static "js/jquery/jquery.min.js" %}" type="text/javascript"></script>
<script src="{% static "js/d3/d3.v2.min.js" %}"></script>
<p>Order: <select id="order">
<option value="name">by Name</option>
......@@ -39,11 +59,12 @@ var x = d3.scale.ordinal().rangeBands([0, width]),
var svg = d3.select("body").append("svg")
.attr("width", width + margin.left + margin.right)
.attr("height", height + margin.top + margin.bottom)
.style("margin-left", -margin.left + "px")
//.style("margin-left", -margin.left + "px")
.append("g")
.attr("transform", "translate(" + margin.left + "," + margin.top + ")");
d3.json("{% static "img/miserables.json" %}, function(miserables) {
var filename = document.getElementById("graphid").innerHTML
d3.json(filename, function(miserables) {
var matrix = [],
nodes = miserables.nodes,
n = nodes.length;
......@@ -57,12 +78,12 @@ var svg = d3.select("body").append("svg")
// Convert links to matrix; count character occurrences.
miserables.links.forEach(function(link) {
matrix[link.source][link.target].z += link.value;
matrix[link.target][link.source].z += link.value;
matrix[link.source][link.source].z += link.value;
matrix[link.target][link.target].z += link.value;
nodes[link.source].count += link.value;
nodes[link.target].count += link.value;
matrix[link.source][link.target].z += link.weight;
matrix[link.target][link.source].z += link.weight;
matrix[link.source][link.source].z += link.weight;
matrix[link.target][link.target].z += link.weight;
nodes[link.source].count += link.weight;
nodes[link.target].count += link.weight;
});
// Precompute the orders.
......
......@@ -146,7 +146,7 @@
data: [
{% if donut %}
{% for part in donut %}
{label: '{{ part.source }}, {{part.count}} docs', value: {{ part.part }} },
{label: '{{ part.source }}', value: {{ part.part }} },
{% endfor %}
{% endif %}
......
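The value-to-weight rename in the matrix template above matches what networkx emits: json_graph.node_link_data copies edge attributes verbatim into each link object, so a weighted graph yields link.weight rather than the link.value used by the original miserables.json example. A quick sketch:

```python
import networkx as nx
from networkx.readwrite import json_graph

G = nx.Graph()
G.add_edge(0, 1, weight=3.0)
# prints a link like {'source': 0, 'target': 1, 'weight': 3.0},
# which is why the d3 code above now reads link.weight
print(json_graph.node_link_data(G)['links'])
```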