Commit b811a2bb authored by Administrator

Merge branch 'master' into alex

Master integration
parents f76b6997 845d37bb
{
"metadata": {
"name": "",
"signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
}
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
{
"metadata": {
"name": "",
"signature": "sha256:eac7c9b22e240bb0ef6d0aeec21261194d84a3f0ba53cd02af69f80d30ec5a17"
"signature": "sha256:70c2c8a4c8089e61195ee9da9232043152cf5e6c658a32115c0dcf990c2e98af"
},
"nbformat": 3,
"nbformat_minor": 0,
@@ -122,17 +122,34 @@
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"d = dateutil.parser.parse(\"2014 OCT 11 1:2:3\")"
"import locale\n",
"locale.setlocale(locale.LC_ALL, \"fr_FR\")\n",
"d = dateutil.parser.parse(\"20 janvier 2004\")"
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [
{
"ename": "TypeError",
"evalue": "'NoneType' object is not iterable",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-2-0756678732db>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mlocale\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mlocale\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msetlocale\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlocale\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mLC_ALL\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"fr_FR\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0md\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdateutil\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"20 janvier 2004\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/home/mat/projects/gargantext/myvenv/lib/python3.4/site-packages/dateutil/parser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(timestr, parserinfo, **kwargs)\u001b[0m\n\u001b[0;32m 746\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparserinfo\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimestr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 747\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 748\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mDEFAULTPARSER\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimestr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 749\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 750\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/mat/projects/gargantext/myvenv/lib/python3.4/site-packages/dateutil/parser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(self, timestr, default, ignoretz, tzinfos, **kwargs)\u001b[0m\n\u001b[0;32m 308\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 309\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 310\u001b[1;33m \u001b[0mres\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mskipped_tokens\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_parse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimestr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 311\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 312\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mres\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: 'NoneType' object is not iterable"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
@@ -142,7 +159,17 @@
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"'2014-02-02 00:00:00'"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
@@ -152,7 +179,8 @@
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
......
{
"metadata": {
"name": "",
"signature": "sha256:0acf5fbcb496d74a2d9016459305d1064eb9955ccb3380a26a1d183784e57f49"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'language' is not defined",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-5-973a6eb3747b>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mfileparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Ok!'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/mat/projects/gargantext/gargantext/parsing/FileParsers/FileParser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(self, parentNode, file)\u001b[0m\n\u001b[0;32m 127\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mZipFile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mzipArchive\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 128\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mfilename\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mzipArchive\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnamelist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 129\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparentNode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mzipArchive\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"r\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 130\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 131\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_parse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparentNode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfile\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/mat/projects/gargantext/gargantext/parsing/FileParsers/FileParser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(self, parentNode, file)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparentNode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mzipArchive\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"r\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 130\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 131\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_parse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparentNode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfile\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 132\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 133\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mextract\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparentNode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeys\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/mat/projects/gargantext/gargantext/parsing/FileParsers/PubmedFileParser.py\u001b[0m in \u001b[0;36m_parse\u001b[1;34m(self, parentNode, file)\u001b[0m\n\u001b[0;32m 40\u001b[0m \u001b[0mparentNode\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparentNode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 41\u001b[0m \u001b[0mtitle\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"title\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 42\u001b[1;33m \u001b[0mmetadata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 43\u001b[0m \u001b[1;31m#guid = metadata[\"doi\"],\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 44\u001b[0m )\n",
"\u001b[1;32m/home/mat/projects/gargantext/gargantext/parsing/FileParsers/FileParser.py\u001b[0m in \u001b[0;36mcreate_document\u001b[1;34m(self, parentNode, title, metadata, guid)\u001b[0m\n\u001b[0;32m 100\u001b[0m \u001b[0mtype\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_document_nodetype\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[0mname\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtitle\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 102\u001b[1;33m \u001b[0mlanguage\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlanguage\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 103\u001b[0m \u001b[0mmetadata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 104\u001b[0m \u001b[1;31m#resource = resource,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'language' is not defined"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"furthermore\n",
"pathogen host shifts\n",
"negative impact\n",
"arthropod hosts\n",
"tobacco ringspot\n",
"common ancestor\n",
"distinct lineage\n",
"high mutation rates"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"gradual decline\n",
"present study\n",
"animal kingdoms\n",
"entire body\n",
"bee hemolymph\n",
"in\n",
"humans"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"tree topology\n",
"infections\n",
"health\n",
"viruses\n",
"result\n",
"trsvs\n",
"winter colony collapse\n",
"bees"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"domesticates\n",
"unique example\n",
"infected colonies\n",
"host ranges\n",
"systemic invasion\n",
"varroa\n",
"monophyletic clade\n",
"honeybees"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"diseases\n",
"spread\n",
"prevalence\n",
"infectious diseases\n",
"instances\n",
"plant hosts\n",
"phylogenetic analysis"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"plant-pathogenic rna virus\n",
"apis mellifera\n",
"plant\n",
"significant source\n",
"transkingdom host alteration\n",
"gastric cecum\n",
"feed\n",
"host populations\n",
"threat\n",
"varroa mites"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"honeybee hosts\n",
"ectoparasitic varroa\n",
"virus\n",
"colony survival\n",
"intracellular life cycle\n",
"species-level genetic variation\n",
"other bee viruses\n",
"study\n",
"virions"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"colonies\n",
"rna viruses\n",
"pathogenesis\n",
"winter\n",
"trsv\n",
"conjunction\n",
"trsv-infected individuals\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"corpus.children.first().metadata"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"{'journal': 'mBio',\n",
" 'publication_year': '2014',\n",
" 'language_iso3': 'eng',\n",
" 'publication_month': '01',\n",
" 'title': 'Systemic spread and propagation of a plant-pathogenic virus in European honeybees, Apis mellifera.',\n",
" 'doi': '',\n",
" 'publication_day': '22',\n",
" 'abstract': 'Emerging and reemerging diseases that result from pathogen host shifts are a threat to the health of humans and their domesticates. RNA viruses have extremely high mutation rates and thus represent a significant source of these infectious diseases. In the present study, we showed that a plant-pathogenic RNA virus, tobacco ringspot virus (TRSV), could replicate and produce virions in honeybees, Apis mellifera, resulting in infections that were found throughout the entire body. Additionally, we showed that TRSV-infected individuals were continually present in some monitored colonies. While intracellular life cycle, species-level genetic variation, and pathogenesis of the virus in honeybee hosts remain to be determined, the increasing prevalence of TRSV in conjunction with other bee viruses from spring toward winter in infected colonies was associated with gradual decline of host populations and winter colony collapse, suggesting the negative impact of the virus on colony survival. Furthermore, we showed that TRSV was also found in ectoparasitic Varroa mites that feed on bee hemolymph, but in those instances the virus was restricted to the gastric cecum of Varroa mites, suggesting that Varroa mites may facilitate the spread of TRSV in bees but do not experience systemic invasion. Finally, our phylogenetic analysis revealed that TRSV isolates from bees, bee pollen, and Varroa mites clustered together, forming a monophyletic clade. The tree topology indicated that the TRSVs from arthropod hosts shared a common ancestor with those from plant hosts and subsequently evolved as a distinct lineage after transkingdom host alteration. This study represents a unique example of viruses with host ranges spanning both the plant and animal kingdoms.'}"
]
}
],
"prompt_number": 7
}
],
"metadata": {}
}
]
}
\ No newline at end of file
from django.db import models
from django.utils import timezone
from django.contrib.auth.models import User
from django_hstore import hstore
from cte_tree.models import CTENode, Manager
#from cte_tree.fields import DepthField, PathField, OrderingField
from parsing.Caches import LanguagesCache, NgramsExtractorsCache, NgramsCaches
from parsing.FileParsers import *
from time import time
from collections import defaultdict
from django.contrib.auth.models import User
from collections import defaultdict
# Some useful functions
# TODO: start the function name with an underscore (private)
def upload_to(instance, filename):
return 'corpora/%s/%s' % (instance.user.username, filename)
#return 'corpora/%s/%f/%s' % (instance.user.username, time(), filename)
@@ -28,35 +31,50 @@ class Language(models.Model):
def __str__(self):
return self.fullname
class DatabaseType(models.Model):
class ResourceType(models.Model):
name = models.CharField(max_length=255)
def __str__(self):
return self.name
class Ngram(models.Model):
language = models.ForeignKey(Language, blank=True, null=True, on_delete=models.SET_NULL)
n = models.IntegerField()
terms = models.CharField(max_length=255)
class Resource(models.Model):
user = models.ForeignKey(User)
guid = models.CharField(max_length=255)
bdd_type = models.ForeignKey(DatabaseType, blank=True, null=True)
type = models.ForeignKey(ResourceType, blank=True, null=True)
file = models.FileField(upload_to=upload_to, blank=True)
def __str__(self):
return "%s => %s" % (self.bdd_type, self.file)
digest = models.CharField(max_length=32) # MD5 digest
class NodeType(models.Model):
name = models.CharField(max_length=200)
def __str__(self):
return self.name
class Ngram(models.Model):
language = models.ForeignKey(Language, blank=True, null=True, on_delete=models.SET_NULL)
n = models.IntegerField()
terms = models.CharField(max_length=255)
def __str__(self):
return "[%d] %s" % (self.pk, self.terms)
class NodeQuerySet(models.query.QuerySet):
"""Methods available from Node querysets."""
def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
if ngramsextractorscache is None:
ngramsextractorscache = NgramsExtractorsCache()
if ngramscaches is None:
ngramscaches = NgramsCaches()
for node in self:
node.extract_ngrams(keys, ngramsextractorscache, ngramscaches)
class NodeManager(models.Manager):
"""Methods available from Node.object."""
def get_queryset(self):
return NodeQuerySet(self.model)
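# delegate unknown attribute lookups to the queryset, so that custom
# queryset methods (such as extract_ngrams) are also available on Node.objects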
def __getattr__(self, name, *args):
if name.startswith("_"):
raise AttributeError
return getattr(self.get_queryset(), name, *args)
class Node(CTENode):
objects = Manager()
"""The node."""
objects = NodeManager()
user = models.ForeignKey(User)
type = models.ForeignKey(NodeType)
@@ -66,30 +84,104 @@ class Node(CTENode):
date = models.DateField(default=timezone.now, blank=True)
metadata = hstore.DictionaryField(blank=True)
resource = models.ManyToManyField(Resource, blank=True)
ngrams = models.ManyToManyField(Ngram, blank=True, help_text="Hold down")
# TODO: remove the three following fields
fichier = models.FileField(upload_to=upload_to, blank=True)
#resource = models.ForeignKey(Resource, blank=True, null=True)
#ngrams = models.ManyToManyField(NGrams)
def __str__(self):
return self.name
def liste(self, user):
for noeud in Node.objects.filter(user=user):
print(noeud.depth * " " + "[%d] %s" % (noeud.pk, noeud.name))
def add_resource(self, **kwargs):
resource = Resource(**kwargs)
# TODO: check whether all these 'save' calls are really necessary
resource.save()
node_resource = Node_Resource(
node = self,
resource = resource
)
node_resource.save()
return resource
def parse_resources(self):
# parse all resources into a list of metadata
metadata_list = []
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
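# pick the parser class from the resource type's name,
# falling back to the generic FileParser for unknown types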
parser = defaultdict(lambda:FileParser.FileParser, {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
})[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
# insert the new documents into the database!
type = NodeType.objects.get(name='Document')
langages_cache = LanguagesCache()
Node.objects.bulk_create([
Node(
user = self.user,
type = type,
name = metadata['title'] if 'title' in metadata else '',
parent = self,
language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
metadata = metadata,
)
for metadata in metadata_list
])
# mark the resources as parsed for this node
self.node_resource.update(parsed=True)
def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
# if there is no cache...
if ngramsextractorscache is None:
ngramsextractorscache = NgramsExtractorsCache()
if ngramscaches is None:
ngramscaches = NgramsCaches()
# what do we want from the cache?
extractor = ngramsextractorscache[self.language]
ngrams = ngramscaches[self.language]
# find & count all the occurrences
associations = defaultdict(float) # float or int?
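# 'keys' is either a dict mapping metadata keys to weights,
# or a plain list of keys, each counted with a weight of 1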
if isinstance(keys, dict):
for key, weight in keys.items():
for ngram in extractor.extract_ngrams(self.metadata[key]):
terms = ' '.join([token for token, tag in ngram])
associations[terms] += weight
else:
for key in keys:
for ngram in extractor.extract_ngrams(self.metadata[key]):
terms = ' '.join([token for token, tag in ngram])
associations[terms] += 1
# insert the occurrences in the database
Node_Ngram.objects.bulk_create([
Node_Ngram(
node = self,
ngram = ngrams[ngram_text],
weight = weight
)
for ngram_text, weight in associations.items()
])
class Node_Resource(models.Model):
node = models.ForeignKey(Node, related_name='node_resource')
resource = models.ForeignKey(Resource)
parsed = models.BooleanField(default=False)
class Node_Ngram(models.Model):
node = models.ForeignKey(Node)
ngram = models.ForeignKey(Ngram)
weight = models.FloatField()
class Project(Node):
class Meta:
proxy=True
class CorpusManager(models.Manager):
def get_query_set(self):
corpus_type = NodeType.objects.get(name='Corpus')
return super(CorpusManager, self).get_query_set().filter(type=corpus_type)
class Corpus(Node):
objects = CorpusManager()
class Meta:
proxy=True
verbose_name_plural = 'Corpora'
@@ -98,28 +190,4 @@ class Document(Node):
class Meta:
proxy=True
############################
# NGRAMS
############################
class Node_Ngram(models.Model):
node = models.ForeignKey(Node, on_delete=models.CASCADE)
ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
occurences = models.IntegerField()
def __str__(self):
return "%s: %s" % (self.node.name, self.ngram.terms)
class NodeNgramNgram(models.Model):
node = models.ForeignKey(Node)
ngramX = models.ForeignKey(Ngram, related_name="nodengramngramx", on_delete=models.CASCADE)
ngramY = models.ForeignKey(Ngram, related_name="nodengramngramy", on_delete=models.CASCADE)
score = models.FloatField(default=0)
def __str__(self):
return "%s: %s / %s" % (self.node.name, self.ngramX.terms, self.ngramY.terms)
import node.models
from parsing.NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
from collections import defaultdict
class NgramsCache(defaultdict):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time.
This class is language-specific."""
def __init__(self, language):
"""The cache only works with one language,
which is the required parameter of the constructor."""
self.language = language
def __missing__(self, terms):
"""If the terms are not yet present in the dictionary,
retrieve the corresponding ngram from the database, or create it."""
try:
ngram = node.models.Ngram.objects.get(terms=terms, language=self.language)
except:
ngram = node.models.Ngram(terms=terms, n=len(terms.split()), language=self.language)
ngram.save()
self[terms] = ngram
return self[terms]
class NgramsCaches(defaultdict):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time."""
def __missing__(self, language):
"""If the cache for this language is not reachable,
add id to the dictionary."""
self[language] = NgramsCache(language)
return self[language]
class NgramsExtractorsCache(defaultdict):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time."""
def __missing__(self, key):
"""If the ngrams extractor is not instancianted yet
for the given language, do it!"""
# format the language
if isinstance(key, str):
language = key.strip().lower()
elif key:
language = key.iso2
else:
language = None
# find the proper extractor
if language in ["en", "eng", "english"]:
Extractor = EnglishNgramsExtractor
elif language in ["fr", "fra", "fre", "french"]:
Extractor = FrenchNgramsExtractor
else:
Extractor = NgramsExtractor
# try to see if already instantiated with another key
found = False
for extractor in self.values():
if type(extractor) == Extractor:
self[key] = extractor
found = True
break
# well if not, let's instantiate it...
if not found:
self[key] = Extractor()
# return the proper extractor
return self[key]
class LanguagesCache(defaultdict):
def __missing__(self, key):
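# on the first miss, fill the cache with every language in the database,
# indexed by its lowercased iso2 code, iso3 code and full name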
if len(self) == 0:
for language in node.models.Language.objects.all():
self[str(language.iso2.lower())] = language
self[str(language.iso3.lower())] = language
self[str(language.fullname.lower())] = language
betterKey = key.strip().lower()
self[key] = self[betterKey] if betterKey in self.keys() else None
return self[key]
class Caches:
"""This is THE cache of the caches.
See NgramsCaches and NgramsExtractorsCache for better understanding."""
def __init__(self):
self.ngrams = NgramsCaches()
self.extractors = NgramsExtractorsCache()
self.languages = LanguagesCache()
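# A minimal usage sketch (assumes the Language table is already populated;
# 'honeybee hosts' is just an example term, any string of terms works):
#     caches = Caches()
#     english = caches.languages['en']
#     ngram = caches.ngrams[english]['honeybee hosts']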
from node.models import Node, NodeType, Language, Ngram, Node_Ngram
from parsing.NgramsExtractors import *
import collections
import dateutil.parser
import zipfile
class NgramCache:
"""
This allows the fast retrieval of ngram ids
from the cache instead of using the database for every call
"""
def __init__(self, language):
self._cache = dict()
self._language = language
def __getitem__(self, terms):
terms = terms.strip().lower()
if terms not in self._cache:
try:
ngram = Ngram.objects.get(terms=terms, language=self._language)
except:
ngram = Ngram(terms=terms, n=len(terms.split()), language=self._language)
ngram.save()
self._cache[terms] = ngram
return self._cache[terms]
class NgramCaches(collections.defaultdict):
def __missing__(self, language):
self[language] = NgramCache(language)
return self[language]
from parsing.Caches import LanguagesCache
"""Base class for performing files parsing depending on their type.
"""
class FileParser:
def __init__(self, file=None, filepath="", encoding="utf8"):
# ...get the file item...
if file is None:
self._file = open(filepath, "rb")
else:
self._file = file
# cache for ngrams
self._ngramcaches = NgramCaches()
# extractors
self._extractors = dict()
self._document_nodetype = NodeType.objects.get(name='Document')
languages = Language.objects.all()
self._languages_fullname = {language.fullname.lower(): language for language in languages}
self._languages_iso2 = {language.iso2.lower(): language for language in languages}
self._languages_iso3 = {language.iso3.lower(): language for language in languages}
#self.parse()
"""Extract the ngrams from a given text.
"""Base class for performing files parsing depending on their type.
"""
def extract_ngrams(self, text, language):
# Get the appropriate ngrams extractor, if it exists
if language not in self._extractors:
extractor = None
if language.iso2 == 'en':
extractor = EnglishNgramsExtractor()
elif language.iso2 == 'fr':
extractor = FrenchNgramsExtractor()
self._extractors[language] = extractor
else:
extractor = self._extractors[language]
# Extract the ngrams
if extractor:
tokens = []
for ngram in extractor.extract_ngrams(text):
ngram_text = ' '.join([token for token, tag in ngram])
tokens.append(ngram_text)
return collections.Counter(
# [token for token, tag in extractor.extract_ngrams(text)]
tokens
)
else:
return dict()
def __init__(self, language_cache=None):
self._languages_cache = LanguagesCache() if language_cache is None else language_cache
#TODO
# * make it possible to tag and parse separately
# * only tag some data (only titles, titles & abstracts, some chapters...)
"""Add a document to the database.
"""
def create_document(self, parentNode, title, contents, language, metadata, guid=None):
metadata = self.format_metadata(metadata)
# create or retrieve a resource for that document, based on its user id
# if guid is None:
# resource = Resource(guid=guid)
# else:
# try:
# resource = Resource.get(guid=guid)
# except:
# resource = Resource(guid=guid)
# # If the parent node already has a child with this resource, pass
# # (is it a good thing?)
# if parentNode.descendants().filter(resource=resource).exists():
# return None
# create the document itself
if len(title) > 200:
title = title[:200]
childNode = Node(
user = parentNode.user,
type = self._document_nodetype,
name = title,
language = language,
metadata = metadata,
#resource = resource,
parent = parentNode
)
childNode.save()
# parse it!
ngrams = self.extract_ngrams(contents, language)
# we are already in a transaction, so no use doing another one (or is there?)
ngramcache = self._ngramcaches[language]
for terms, occurences in ngrams.items():
ngram = ngramcache[terms]
Node_Ngram(
node = childNode,
ngram = ngram,
occurences = occurences
).save()
# return the created document
return childNode
"""Useful method to detect the document encoding.
Not sure it should be here actually.
"""
def detect_encoding(self, string):
# see the chardet library
"""Useful method to detect the document encoding.
"""
pass
"""Parse the data.
This method shall be overridden by inherited classes.
"""
def parse(self):
return list()
def format_metadata_dates(self, metadata):
"""Format the dates found in the metadata.
Example: {"publication_date": "2014-10-23 09:57:42"} -> {...}
Examples:
{"publication_date": "2014-10-23 09:57:42"}
-> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
"""
# First, check the split dates...
@@ -187,10 +62,49 @@ class FileParser:
metadata[prefix + "_minute"] = date.strftime("%M")
metadata[prefix + "_second"] = date.strftime("%S")
# finally, return the result!
# finally, return the transformed result!
return metadata
def format_metadata_languages(self, metadata):
"""format the languages found in the metadata."""
language = None
for key in ["fullname", "iso3", "iso2"]:
language_key = "language_" + key
if language_key in metadata:
language_symbol = metadata[language_key]
language = self._languages_cache[language_symbol]
if language:
break
if language:
metadata["language_iso2"] = language.iso2
metadata["language_iso3"] = language.iso3
metadata["language_fullname"] = language.fullname
return metadata
def format_metadata(self, metadata):
"""Format the metadata."""
metadata = self.format_metadata_dates(metadata)
metadata = self.format_metadata_languages(metadata)
return metadata
def _parse(self, file):
"""This method shall be overriden by inherited classes."""
return list()
def parse(self, file):
"""Parse the file, and its children files found in the file.
"""
# initialize the list of metadata
metadata_list = []
# if the file is a ZIP archive, recurse on each of its files...
if zipfile.is_zipfile(file):
zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist():
metadata_list += self.parse(zipArchive.open(filename, "r"))
# ...otherwise, let's parse it directly!
else:
metadata_list += self._parse(file)
# return the list of formatted metadata
return map(self.format_metadata, metadata_list)
@@ -2,54 +2,36 @@ from django.db import transaction
from lxml import etree
from parsing.FileParsers.FileParser import FileParser
from parsing.NgramsExtractors import *
import zipfile
import datetime
class PubmedFileParser(FileParser):
def parse(self, parentNode=None, tag=True):
def _parse(self, file):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
documents = []
with transaction.atomic():
with zipfile.ZipFile(self._file) as zipFile:
for filename in zipFile.namelist():
file = zipFile.open(filename, "r")
xml = etree.parse(file, parser=xml_parser)
# parse all the articles, one by one
# all database operations should be performed within one transaction
xml_articles = xml.findall('PubmedArticle')
for xml_article in xml_articles:
# extract data from the document
metadata = {}
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[type=doi]',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
}
for key, path in metadata_path.items():
try:
node = xml_article.find(path)
metadata[key] = node.text
except:
metadata[key] = ""
contents = metadata["abstract"]
# create the document in the database
document = self.create_document(
parentNode = parentNode,
title = metadata["title"],
contents = contents,
language = self._languages_iso3[metadata["language_iso3"].lower()],
metadata = metadata,
#guid = metadata["doi"],
)
if document:
documents.append(document)
return documents
xml = etree.parse(file, parser=xml_parser)
xml_articles = xml.findall('PubmedArticle')
# initialize the list of metadata
metadata_list = []
# parse all the articles, one by one
for xml_article in xml_articles:
# extract data from the document
metadata = {}
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
}
for key, path in metadata_path.items():
try:
node = xml_article.find(path)
metadata[key] = node.text
except:
metadata[key] = ""
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
@@ -7,33 +7,24 @@ class RisFileParser(FileParser):
_parameters = {
}
def _parse(self, parentNode, file):
def _parse(self, file):
metadata_list = []
metadata = {}
last_key = None
last_values = []
with transaction.atomic():
for line in self._file:
if len(line) > 2:
parameter_key = line[:2]
if parameter_key != b' ' and parameter_key != last_key:
if last_key in self._parameters:
parameter = self._parameters[last_key]
if parameter["type"] == "metadata":
separator = parameter["separator"] if "separator" in parameter else ""
metadata[parameter["key"]] = separator.join(last_values)
elif parameter["type"] == "delimiter":
language = self._languages_fullname[metadata["language"].lower()]
self.create_document(
parentNode = parentNode,
title = metadata["title"],
metadata = metadata,
guid = metadata["doi"]
)
# print(self.format_metadata(metadata))
# print()
metadata = {}
last_key = parameter_key
last_values = []
last_values.append(line[3:-1].decode())
self._file.close()
for line in self._file:
if len(line) > 2:
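# the first two bytes of a line hold the field key;
# the value is read from the fourth byte up to the trailing newline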
parameter_key = line[:2]
if parameter_key != b' ' and parameter_key != last_key:
if last_key in self._parameters:
parameter = self._parameters[last_key]
if parameter["type"] == "metadata":
separator = parameter["separator"] if "separator" in parameter else ""
metadata[parameter["key"]] = separator.join(last_values)
elif parameter["type"] == "delimiter":
language = self._languages_fullname[metadata["language"].lower()]
metadata_list.append(metadata)
metadata = {}
last_key = parameter_key
last_values = []
last_values.append(line[3:-1].decode())
return metadata_list
@@ -7,3 +7,4 @@ class EnglishNgramsExtractor(NgramsExtractor):
def start(self):
self.tagger = NltkTagger()
\ No newline at end of file
#from NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
#from NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from parsing.NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
@@ -46,7 +46,7 @@ Shall be used for french texts.
"""
class TreeTagger(Tagger):
def start(self, treeTaggerPath = "./Taggers/treetagger"):
def start(self, treeTaggerPath = "./parsing/Taggers/treetagger"):
binaryFile = "%s/bin/tree-tagger" % treeTaggerPath
tagcmdlist = [
binaryFile,
......
from parsing.Taggers.Tagger import Tagger
from parsing.Taggers.NltkTagger import NltkTagger
from parsing.Taggers.TreeTagger import TreeTagger
#from .Taggers import *
#from .NgramsExtractors import *
from .FileParsers import *
from node.models import Node, NodeType
import zipfile
import collections
# import chardet
class Parser:
def __init__(self):
pass
def parse_file(self, file):
# CHECK THE GUID!!!!!!!!!!!!!!!!!!!!!!!!!!!!
pass
def parse_node_fichier(self, node):
if node.fichier and zipfile.is_zipfile(node.fichier):
with zipfile.ZipFile(node.fichier, "r") as zipFile:
node_type = NodeType.objects.get(name="Document")
for filename in zipFile.namelist():
file = zipFile.open(filename, "r")
Node.objects.create(
parent = node,
type = node_type,
user = node.user,
)
def parse_node(self, node):
for resource in node.resources:
if resource.file and zipfile.is_zipfile(resource.file):
with zipfile.ZipFile(resource.file, "r") as zipFile:
for filename in zipFile.namelist():
file = zipFile.open(filename, "r")
Node.objects.create(
parent = node,
type = NodeType.objects.get(name="Document"),
user = node.user,
)
def parse_node_recursively(self, node):
self.parse_node(node)
for descendant in node.get_descendants():
self.parse_node(descendant)
from node.models import Node, NodeType, User, Language, ResourceType
from parsing.Caches import Caches
try:
me = User.objects.get(username='Mat')
except:
me = User(username='Mat')
me.save()
try:
typePubmed = ResourceType.objects.get(name='pubmed')
except:
typePubmed = ResourceType(name='pubmed')
typePubmed.save()
try:
typeCorpus = NodeType.objects.get(name='corpus')
typeDoc = NodeType.objects.get(name='document')
except:
typeCorpus = NodeType(name='corpus')
typeCorpus.save()
typeDoc = NodeType(name='document')
typeDoc.save()
english = Language.objects.get(iso2='en')
Node.objects.all().delete()
try:
corpus = Node.objects.get(name='My first corpus')
except:
corpus = Node(name='My first corpus', type=typeCorpus, user=me)
corpus.save()
print('Remove previously existing children of the corpus...')
corpus.children.all().delete()
print('Adding a resource to the corpus...')
corpus.add_resource(file='./data_samples/pubmed.zip', type=typePubmed)
print('Adding the corpus resources...')
corpus.parse_resources()
print('Extracting ngrams from the documents...')
corpus.children.all().extract_ngrams(['title', 'abstract'])