Commit 6b0e0d41 authored by Administrator's avatar Administrator

[TEST DEV] Towards the graphs.

parent 339886ab
{
"metadata": {
"name": "",
"signature": "sha256:33c2f41e3ea5983e768350b4012544242c5df9b394091647362f00929812a921"
"signature": "sha256:65f487ee62067486e4f832ed088fe02ede3daa27052c9dcaf58b3edffa169245"
},
"nbformat": 3,
"nbformat_minor": 0,
......@@ -27,6 +27,17 @@
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"me = User.objects.get(username='alexandre')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
......@@ -37,7 +48,7 @@
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
"prompt_number": 2
},
{
"cell_type": "code",
......@@ -48,7 +59,7 @@
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
"prompt_number": 3
},
{
"cell_type": "code",
......@@ -227,12 +238,12 @@
"collapsed": false,
"input": [
"#\u00a0corpus = Node.objects.filter(type=typeCorpus).first()\n",
"corpus = Node.objects.get(id=44338)"
"corpus = Node.objects.get(id=13064)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
"prompt_number": 3
},
{
"cell_type": "heading",
......@@ -279,30 +290,30 @@
"output_type": "stream",
"stream": "stdout",
"text": [
"'2004/01/02','2'\n",
"'2004/01/03','1'\n",
"'2004/01/06','8'\n",
"'2004/01/07','5'\n",
"'2004/01/08','8'\n",
"'2004/01/09','1'\n",
"'2004/01/10','3'\n",
"'2004/01/12','2'\n",
"'2004/01/13','6'\n",
"'2004/01/15','2'\n",
"'2004/01/16','1'\n",
"'2004/01/17','5'\n",
"'2004/01/19','2'\n",
"'2004/01/20','2'\n",
"'2004/01/21','7'\n",
"'2004/01/23','1'\n",
"'2004/01/24','4'\n",
"'2004/01/25','4'\n",
"'2004/01/26','5'\n",
"'2004/01/27','2'\n"
"'1954/11/18','2'\n",
"'1958/11/18','1'\n",
"'1959/11/18','1'\n",
"'1968/11/18','1'\n",
"'1969/11/18','2'\n",
"'1971/11/18','4'\n",
"'1972/11/18','1'\n",
"'1974/11/18','2'\n",
"'1975/11/18','3'\n",
"'1976/11/18','1'\n",
"'1977/11/18','6'\n",
"'1978/11/18','11'\n",
"'1979/11/18','9'\n",
"'1980/11/18','6'\n",
"'1981/11/18','4'\n",
"'1982/11/18','7'\n",
"'1983/11/18','14'\n",
"'1984/11/18','17'\n",
"'1985/11/18','18'\n",
"'1986/02/21','1'\n"
]
}
],
"prompt_number": 37
"prompt_number": 6
},
{
"cell_type": "code",
......@@ -320,12 +331,17 @@
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" WHERE\n",
" n.parent_id = %s\n",
" AND\n",
" ngX.n >= 2\n",
" GROUP BY\n",
" ngX.terms\n",
" Having\n",
" COUNT(*) > 7\n",
" ORDER BY\n",
" occurrences DESC\n",
" LIMIT\n",
" 20\n",
" 100\n",
" \n",
"\"\"\", [corpus.id])\n",
"\n",
"while True:\n",
......@@ -341,30 +357,74 @@
"output_type": "stream",
"stream": "stdout",
"text": [
"(196, 'patients')\n",
"(135, 'voice')\n",
"(129, 'study')\n",
"(111, 'disease')\n",
"(69, 'treatment')\n",
"(66, 'life')\n",
"(58, 'patient')\n",
"(53, 'quality')\n",
"(49, 'care')\n",
"(45, 'use')\n",
"(44, 'Patients')\n",
"(43, 'people')\n",
"(41, 'development')\n",
"(41, 'purpose')\n",
"(40, 's disease')\n",
"(39, 's')\n",
"(38, 'results')\n",
"(37, 'diagnosis')\n",
"(36, 'years')\n",
"(34, 'women')\n"
"(138, 'honey bees')\n",
"(132, 'apis mellifera')\n",
"(69, 'honey bee')\n",
"(66, 'apis mellifera l')\n",
"(45, 'pesticide residues')\n",
"(39, 'gas chromatography')\n",
"(36, 'varroa destructor')\n",
"(36, 'honey bee colonies')\n",
"(30, 'sublethal effects')\n",
"(27, 'apidae )')\n",
"(21, 'neonicotinoid insecticides')\n",
"(21, 'honey bee ( hymenoptera')\n",
"(18, 'bee products')\n",
"(18, 'megachile rotundata')\n",
"(18, 'solid-phase extraction')\n",
"(18, 'simultaneous determination')\n",
"(18, 'mass spectrometric')\n",
"(15, 'case study')\n",
"(15, 'honey samples')\n",
"(15, 'liquid chromatography')\n",
"(15, 'high performance liquid chromatography')\n",
"(15, 'varroa mites')\n",
"(12, 'organochlorine pesticides')\n",
"(12, 'gas chromatography-mass spectrometry')\n",
"(12, 'liquid chromatography-mass spectrometry')\n",
"(12, 'colony health')\n",
"(12, 'gas chromatographic')\n",
"(12, 'colony collapse disorder')\n",
"(12, 'bumble bees')\n",
"(12, 'varroa jacobsoni')\n",
"(9, 'chemiluminescent elisa')\n",
"(9, 'diversionary plantings for reduction of pesticide related bee mortality')\n",
"(9, 'pesticides and law')\n",
"(9, 'plant protection products')\n",
"(9, 'nomia melanderi')\n",
"(9, 'electron-capture detection')\n",
"(9, 'managed pollinator cap coordinated agricultural project a national research')\n",
"(9, 'apis florea f')\n",
"(9, 'solid-phase microextraction')\n",
"(9, 'extension initiative')\n",
"(9, 'crop pollination')\n",
"(9, 'non-apis bees')\n",
"(9, 'honey bees ( apis mellifera')\n",
"(9, 'liquid chromatography-tandem mass spectrometry')\n",
"(9, 'bee pollen')\n",
"(9, 'foraging behavior')\n",
"(9, 'biological control')\n",
"(9, 'nosema ceranae')\n",
"(9, 'organophosphorus pesticides')\n",
"(9, 'field conditions')\n",
"(9, 'honey bee apis mellifera l')\n",
"(9, 'laboratory tests')\n",
"(9, 'beauveria bassiana')\n",
"(9, 'comparative toxicity')\n",
"(9, 'high levels')\n",
"(9, 'pesticide exposure')\n",
"(9, 'fluvalinate residues')\n",
"(9, 'insecticide residues')\n",
"(9, 'osmia lignaria')\n",
"(9, 'bombus impatiens')\n",
"(9, 'honey bee health')\n",
"(9, 'agricultural landscape')\n",
"(9, 'dispersive liquid-liquid microextraction')\n",
"(9, 'matrix solid-phase dispersion')\n"
]
}
],
"prompt_number": 104
"prompt_number": 28
},
{
"cell_type": "heading",
......@@ -455,7 +515,7 @@
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 21
"prompt_number": 17
},
{
"cell_type": "code",
......@@ -470,7 +530,7 @@
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 22
"prompt_number": 18
},
{
"cell_type": "code",
......@@ -485,7 +545,7 @@
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 23
"prompt_number": 19
},
{
"cell_type": "code",
......@@ -497,7 +557,7 @@
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 24
"prompt_number": 22
},
{
"cell_type": "code",
......@@ -514,7 +574,7 @@
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 25
"prompt_number": 23
},
{
"cell_type": "code",
......@@ -528,13 +588,13 @@
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 26,
"prompt_number": 24,
"text": [
"6"
"61297"
]
}
],
"prompt_number": 26
"prompt_number": 24
},
{
"cell_type": "heading",
......@@ -592,7 +652,7 @@
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 47
"prompt_number": 26
},
{
"cell_type": "heading",
......@@ -627,7 +687,60 @@
" WHERE\n",
" n.parent_id = %s\n",
" AND\n",
" nngX.ngram_id > nngY.ngram_id\n",
" nngX.ngram_id in (select id from node_node_ngram WHERE node_id = 61298 )\n",
" AND\n",
" nngY.ngram_id in (select id from node_node_ngram WHERE node_id = 61298 )\n",
" AND\n",
" nngX.ngram_id <> nngY.ngram_id\n",
" \n",
" GROUP BY\n",
" ngX.id,\n",
" ngX.terms,\n",
" ngY.id,\n",
" ngY.terms\n",
" ORDER BY\n",
" cooccurrences DESC\n",
" LIMIT\n",
" 200\n",
"\"\"\", [corpus.id])\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cursor.execute(\"\"\"\n",
" SELECT\n",
" COUNT(*) AS cooccurrences,\n",
" ngX.terms,\n",
" ngY.terms\n",
" FROM\n",
" node_node AS n\n",
" \n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" \n",
" INNER JOIN\n",
" node_node_ngram AS nngY ON nngY.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngY ON ngY.id = nngY.ngram_id\n",
"\n",
" WHERE\n",
" n.parent_id = %s\n",
" AND\n",
" nngX.ngram_id <> nngY.ngram_id\n",
" \n",
" GROUP BY\n",
" ngX.id,\n",
......@@ -651,32 +764,39 @@
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"stream": "stderr",
"text": [
"(98, 'patients', 'study')\n",
"(88, 'patients', 'disease')\n",
"(78, 'voice', 'patients')\n",
"(76, 'Parkinson', 's disease')\n",
"(64, 'life', 'patients')\n",
"(62, 'life', 'quality')\n",
"(60, 'treatment', 'patients')\n",
"(56, 'patient', 'patients')\n",
"(56, 'voice', 'study')\n",
"(54, 'Patients', 'patients')\n",
"(54, 'purpose', 'study')\n",
"(54, 'voice', 'disease')\n",
"(52, 'study', 'disease')\n",
"(48, 'voice', 'treatment')\n",
"(46, 'treatment', 'disease')\n",
"(42, 'quality', 'patients')\n",
"(42, 'life', 'study')\n",
"(40, 'care', 'patients')\n",
"(40, 'PD', 'Parkinson')\n",
"(40, 'PD', 's disease')\n"
"ERROR: An unexpected error occurred while tokenizing input\n",
"The following traceback may be corrupted or invalid\n",
"The error message is: ('EOF in multi-line string', (1, 0))\n",
"\n"
]
},
{
"ename": "OperationalError",
"evalue": "arr\u00eat des connexions suite \u00e0 la demande de l'administrateur\nSSL connection has been closed unexpectedly\n",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mOperationalError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-11-752593da5735>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 31\u001b[0m \u001b[0mLIMIT\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[1;36m20\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 33\u001b[1;33m \"\"\", [corpus.id])\n\u001b[0m\u001b[0;32m 34\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/backends/util.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, sql, params)\u001b[0m\n\u001b[0;32m 67\u001b[0m \u001b[0mstart\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 68\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 69\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mCursorDebugWrapper\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 70\u001b[0m \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 71\u001b[0m \u001b[0mstop\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/backends/util.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, sql, params)\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 52\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 53\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 54\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mexecutemany\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msql\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparam_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/utils.py\u001b[0m in \u001b[0;36m__exit__\u001b[1;34m(self, exc_type, exc_value, traceback)\u001b[0m\n\u001b[0;32m 97\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdj_exc_type\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mDataError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mIntegrityError\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 98\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwrapper\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merrors_occurred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 99\u001b[1;33m \u001b[0msix\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreraise\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdj_exc_type\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdj_exc_value\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtraceback\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 100\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/utils/six.py\u001b[0m in \u001b[0;36mreraise\u001b[1;34m(tp, value, tb)\u001b[0m\n\u001b[0;32m 547\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mreraise\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtp\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 548\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__traceback__\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mtb\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 549\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 550\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 551\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/backends/util.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, sql, params)\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 52\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 53\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 54\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mexecutemany\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msql\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparam_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mOperationalError\u001b[0m: arr\u00eat des connexions suite \u00e0 la demande de l'administrateur\nSSL connection has been closed unexpectedly\n"
]
}
],
"prompt_number": 108
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
......
{
"metadata": {
"name": "",
"signature": "sha256:225dd3e49010ec98ad91be828e48d40276edd9a4242811d192ef4667286a0611"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from node.models import Node, NodeType,\\\n",
" Project, Corpus, Document,\\\n",
" Ngram, Node_Ngram,\\\n",
" User, Language, ResourceType\n",
"from django.db import connection\n",
"cursor = connection.cursor()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"me = User.objects.get(username='alexandre')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(me.id)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"corpus = Corpus(id=13064)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"document = NodeType.objects.get(name='Document')\n",
"# note cr\u00e9er un type Document_Poubelle (et une fonction vider la poubelle)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"corpus.children.count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
"916"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node_Ngram.objects.all().delete()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 27
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"corpus.children.filter(type_id=document.pk).extract_ngrams(keys=['title',])\n",
"\n",
"# bug\n",
"#corpus.children.filter(type_id=document.pk).extract_ngrams(keys=['title','abstract'])\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"children = corpus.children.all()[:10]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 46
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for child in children:\n",
" for n in Node_Ngram.objects.filter(node=child):\n",
" print(n.ngram.terms)\n",
" print(\"=\" * 10)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"equipment\n",
"technique used in laboratory evaluation of pesticide dusts in toxicological studies with honeybees\n",
"==========\n",
"pesticide dusts to honeybees\n",
"toxicity\n",
"==========\n",
"pesticides to honey bees in laboratory and field tests in southern california\n",
"toxicity\n",
"==========\n",
"field applications of some of the newer pesticides on honey bees\n",
"effects\n",
"==========\n",
"protecting honeybees\n",
"pesticides\n",
"==========\n",
"pollen gathering of honey bees reduced by pesticide sprays\n",
"==========\n",
"pesticide toxicity and honey bees\n",
"==========\n",
"newer pesticides dont harm environment\n",
"where have all bees gone\n",
"==========\n",
"honeybees\n",
"pesticides and law\n",
"==========\n",
"honeybees\n",
"pesticides and law\n",
"==========\n"
]
}
],
"prompt_number": 48
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#liste_ordered = collections.OrderedDict(sorted(liste.items()), key=lambda t: t[1])"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#Cr\u00e9ation des listes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import collections\n",
"liste = collections.defaultdict(int)\n",
"try:\n",
" whitelist_type = NodeType.objects.get(name='WhiteList')\n",
" blacklist_type = NodeType.objects.get(name='BlackList')\n",
"except:\n",
" whitelist_type = NodeType(name='WhiteList')\n",
" whitelist_type.save()\n",
" \n",
" blacklist_type = NodeType(name='BlackList')\n",
" blacklist_type.save()\n",
"\n",
"white_list = Node.objects.create(name='WhiteList Pubmed', user=me, parent=corpus, type=whitelist_type)\n",
"black_list = Node.objects.create(name='BlackList Pubmed', user=me, parent=corpus, type=blacklist_type)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 37
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node_Ngram.objects.filter(node=white_list).count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 38,
"text": [
"0"
]
}
],
"prompt_number": 38
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cr\u00e9ation de la white list (simple sans syn ni black plus tard...)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# delete avant pour \u00e9viter les doublons\n",
"Node_Ngram.objects.filter(node=white_list).all().delete()\n",
"cursor.execute(\"\"\"\n",
" SELECT\n",
" ngX.id,\n",
" ngX.terms,\n",
" COUNT(*) AS occurrences\n",
" FROM\n",
" node_node AS n\n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" WHERE\n",
" n.parent_id = %s\n",
" AND\n",
" ngX.n >= 1\n",
" GROUP BY\n",
" ngX.id\n",
" Having\n",
" COUNT(*) >= 3\n",
" ORDER BY\n",
" occurrences DESC\n",
" LIMIT\n",
" 100\n",
" \n",
"\"\"\", [corpus.id])\n",
"\n",
"# \u00e0 optimiser avec un insert dans la requ\u00eate SQL\n",
"with transaction.atomic():\n",
" while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" #print(row)\n",
" Node_Ngram.objects.create(node=white_list, ngram=Ngram.objects.get(id=row[0]), weight=row[2])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 43
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node_Ngram.objects.filter(node=white_list).count()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 44,
"text": [
"100"
]
}
],
"prompt_number": 44
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"white_list.id"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 45,
"text": [
"61305"
]
}
],
"prompt_number": 45
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cursor.execute(\"\"\"\n",
" SELECT\n",
" COUNT(*) AS cooccurrences,\n",
" ngX.terms,\n",
" ngY.terms\n",
" FROM\n",
" node_node AS n\n",
" \n",
" INNER JOIN\n",
" node_node_ngram AS nngX ON nngX.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngX ON ngX.id = nngX.ngram_id\n",
" \n",
" INNER JOIN\n",
" node_node_ngram AS nngY ON nngY.node_id = n.id\n",
" INNER JOIN\n",
" node_ngram AS ngY ON ngY.id = nngY.ngram_id\n",
" \n",
" WHERE\n",
" n.parent_id = %s\n",
" AND\n",
" nngX.ngram_id in (select id from node_node_ngram WHERE node_id = %s )\n",
" AND\n",
" nngY.ngram_id in (select id from node_node_ngram WHERE node_id = %s )\n",
" AND\n",
" nngX.ngram_id <> nngY.ngram_id\n",
" \n",
" GROUP BY\n",
" ngX.id,\n",
" ngX.terms,\n",
" ngY.id,\n",
" ngY.terms\n",
" ORDER BY\n",
" cooccurrences DESC\n",
" LIMIT\n",
" 200\n",
"\"\"\", [corpus.id, white_list.id, white_list.id])\n",
"\n",
"while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 42
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cr\u00e9ation de la black list"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with transaction.atomic():\n",
" for node_ngram_object in Node_Ngram.objects.all()[101:150]:\n",
" Node_Ngram.objects.create(node=black_node, ngram=node_ngram_object.ngram, score=1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'score' is an invalid keyword argument for this function",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-24-b9a499432911>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mtransaction\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0matomic\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mnode_ngram_object\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mNode_Ngram\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobjects\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m101\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m150\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mNode_Ngram\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobjects\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mblack_node\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mngram\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnode_ngram_object\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mngram\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscore\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/models/manager.py\u001b[0m in \u001b[0;36mcreate\u001b[1;34m(self, **kwargs)\u001b[0m\n\u001b[0;32m 155\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 156\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mcreate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 157\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_queryset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 158\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mbulk_create\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/models/query.py\u001b[0m in \u001b[0;36mcreate\u001b[1;34m(self, **kwargs)\u001b[0m\n\u001b[0;32m 318\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mreturning\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcreated\u001b[0m \u001b[0mobject\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 319\u001b[0m \"\"\"\n\u001b[1;32m--> 320\u001b[1;33m \u001b[0mobj\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 321\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_for_write\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 322\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mforce_insert\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0musing\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/models/base.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 415\u001b[0m \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 416\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 417\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"'%s' is an invalid keyword argument for this function\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 418\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mModel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 419\u001b[0m \u001b[0msignals\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpost_init\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msender\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minstance\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: 'score' is an invalid keyword argument for this function"
]
}
],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node_Ngram.objects.filter(node=black_node)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cr\u00e9ation des synonymes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"syno_type = NodeType.objects.get(name='Synonyme')\n",
"syno_node = Node.objects.create(name='Syno Pubmed',\n",
" user=user, \n",
" parent=corpus, \n",
" type=syno_type)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"synonyme1, synonyme2 = Node_Ngram.objects.filter(node=white_node)[3:5]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"NodeNgramNgram.objects.create(node=syno_node, ngramX=synonyme1.ngram, ngramY=synonyme2.ngram)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cooccurrence"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 25,
"text": [
"0"
]
}
],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"black_node.pk"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 18,
"text": [
"61295"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"try:\n",
" cooc_type = NodeType.objects.get(name='Cooccurrence')\n",
"except:\n",
" cooc_type = NodeType(name='Cooccurrence')\n",
" cooc_type.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 26
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cooc = Node.objects.create(user=me,\\\n",
" parent=corpus,\\\n",
" type=cooc_type,\\\n",
" name=\"Cooccurrences calcul Alpha\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 27
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cooc.pk"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 28,
"text": [
"61300"
]
}
],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"white_node.pk"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 42,
"text": [
"61298"
]
}
],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from django.db import connection\n",
"cursor = connection.cursor()\n",
"# LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;\n",
"query_string = \"\"\"\n",
"\n",
"INSERT INTO node_nodengramngram (node_id, \"ngramx_id\", \"ngramy_id\", score)\n",
"\n",
"SELECT \n",
"%d as node_id, x.ngram_id, y.ngram_id, COUNT(*) AS score\n",
"\n",
"FROM\n",
"node_node_ngram AS x\n",
"\n",
"INNER JOIN \n",
"node_node_ngram AS y ON x.node_id = y.node_id\n",
"\n",
"WHERE\n",
" x.id in (select id from node_node_ngram WHERE node_id = %d )\n",
"AND\n",
" y.id in (select id from node_node_ngram WHERE node_id = %d )\n",
"AND\n",
" x.id > y.id\n",
"\n",
"\n",
"GROUP BY\n",
"x.ngram_id, y.ngram_id\n",
"\n",
"HAVING count(*) > 1\n",
"\n",
"ORDER BY score\n",
"\n",
"LIMIT 300\n",
"\n",
" \"\"\" % (cooc.pk, white_node.pk, white_node.pk)\n",
"\n",
"cursor.execute(query_string)\n",
"\n",
"try:\n",
" while True:\n",
" row = cursor.fetchone()\n",
" if row is None:\n",
" break\n",
" print(row)\n",
"except:\n",
" pass"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 76
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from copy import copy\n",
"import numpy as np\n",
"import pandas as pd\n",
"import networkx as nx\n",
"from collections import defaultdict\n",
"from analysis.louvain import *\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 44
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix = \"\""
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 57
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix = defaultdict(lambda : defaultdict(float))\n",
"for cooccurrence in NodeNgramNgram.objects.filter(node=cooc):\n",
" if cooccurrence.score >= 1 :\n",
" #print(x.ngramX.terms, x.ngramY.terms)\n",
" matrix[cooccurrence.ngramx.terms][cooccurrence.ngramx.terms] = cooccurrence.score\n",
" matrix[cooccurrence.ngramy.terms][cooccurrence.ngramx.terms] = cooccurrence.score"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 58
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = pd.DataFrame(matrix).T.fillna(0)\n",
"x = copy(df.values)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 59
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"x = np.where((x.sum(axis=1) > x.shape[0] / 2), 0, x )\n",
"x = np.where((x.sum(axis=1) > x.shape[0] / 10), 0, x )"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "shape mismatch: objects cannot be broadcast to a single shape",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-39-28332fbcc3a3>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwhere\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwhere\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;36m10\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: shape mismatch: objects cannot be broadcast to a single shape"
]
}
],
"prompt_number": 39
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"x = x / x.sum(axis=1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "operands could not be broadcast together with shapes (87,74) (87,) ",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-60-044d62562031>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;31mValueError\u001b[0m: operands could not be broadcast together with shapes (87,74) (87,) "
]
}
],
"prompt_number": 60
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix_filtered = np.where(x > .4, 1, 0)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 50
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"matrix_filtered"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 51,
"text": [
"array([[1, 0, 0, ..., 0, 0, 0],\n",
" [0, 1, 0, ..., 0, 0, 0],\n",
" [0, 0, 1, ..., 0, 0, 0],\n",
" ..., \n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 1, 0],\n",
" [0, 0, 0, ..., 0, 0, 1]])"
]
}
],
"prompt_number": 51
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"G = nx.from_numpy_matrix(matrix_filtered)\n",
"G = nx.relabel_nodes(G, dict(enumerate(df.columns)))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "NetworkXError",
"evalue": "('Adjacency matrix is not square.', 'nx,ny=(87, 74)')",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mNetworkXError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-52-a97ac71e981e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mG\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_numpy_matrix\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmatrix_filtered\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mG\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrelabel_nodes\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mG\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0menumerate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/networkx/convert_matrix.py\u001b[0m in \u001b[0;36mfrom_numpy_matrix\u001b[1;34m(A, create_using)\u001b[0m\n\u001b[0;32m 267\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mn\u001b[0m\u001b[1;33m!=\u001b[0m\u001b[0mm\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 268\u001b[0m raise nx.NetworkXError(\"Adjacency matrix is not square.\",\n\u001b[1;32m--> 269\u001b[1;33m \"nx,ny=%s\"%(A.shape,))\n\u001b[0m\u001b[0;32m 270\u001b[0m \u001b[0mdt\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mA\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 271\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNetworkXError\u001b[0m: ('Adjacency matrix is not square.', 'nx,ny=(87, 74)')"
]
}
],
"prompt_number": 52
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"nx.draw(G, with_labels=True)\n",
"plt.show()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"partition = best_partition(G)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#partition"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pos = nx.spring_layout(G)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"count = 0.0\n",
"node_min = 3\n",
"for com in set(partition.values()) :\n",
" count = count + 1\n",
" list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]\n",
" \n",
" if len(list_nodes) > node_min:\n",
" nx.draw_networkx_nodes(G, pos, list_nodes, node_size = 20, with_labels=True)#, node_color = str(count / size))\n",
" nx.draw_networkx_edges(G, pos, alpha=0.5)\n",
" plt.title(\"Clique \" + str(count))\n",
" \n",
" for node in list_nodes: \n",
" print(node)\n",
" plt.show()\n",
" print(\"-\" * 30)\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 145
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 146
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 147
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 0
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 0
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 0
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 0
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 0
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment