Commit 86bbf12a authored by Mathieu Rodic

[FEATURE] Dates in parsing metadata - All the dates are being formatted in FileParser

parent 5036bc48
{
"metadata": {
"name": "",
"signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
}
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
This diff is collapsed.
...@@ -89,6 +89,7 @@ class FileParser: ...@@ -89,6 +89,7 @@ class FileParser:
"""Add a document to the database. """Add a document to the database.
""" """
def create_document(self, parentNode, title, contents, language, metadata, guid=None): def create_document(self, parentNode, title, contents, language, metadata, guid=None):
metadata = self.format_metadata(metadata)
# create or retrieve a resource for that document, based on its user id # create or retrieve a resource for that document, based on its user id
# if guid is None: # if guid is None:
# resource = Resource(guid=guid) # resource = Resource(guid=guid)
......
...@@ -16,7 +16,6 @@ class PubmedFileParser(FileParser): ...@@ -16,7 +16,6 @@ class PubmedFileParser(FileParser):
with zipfile.ZipFile(self._file) as zipFile: with zipfile.ZipFile(self._file) as zipFile:
for filename in zipFile.namelist(): for filename in zipFile.namelist():
file = zipFile.open(filename, "r") file = zipFile.open(filename, "r")
# print(file.read())
xml = etree.parse(file, parser=xml_parser) xml = etree.parse(file, parser=xml_parser)
# parse all the articles, one by one # parse all the articles, one by one
...@@ -24,19 +23,17 @@ class PubmedFileParser(FileParser): ...@@ -24,19 +23,17 @@ class PubmedFileParser(FileParser):
xml_articles = xml.findall('PubmedArticle') xml_articles = xml.findall('PubmedArticle')
for xml_article in xml_articles: for xml_article in xml_articles:
# extract data from the document # extract data from the document
date_year = int(xml_article.find('MedlineCitation/DateCreated/Year').text) metadata = {}
date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
metadata = {
"date_pub": '%s-%s-%s' % (date_year, date_month, date_day),
}
metadata_path = { metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title', "journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle', "title" : 'MedlineCitation/Article/ArticleTitle',
"language_iso3" : 'MedlineCitation/Article/Language', "language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[type=doi]', "doi" : 'PubmedData/ArticleIdList/ArticleId[type=doi]',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText' "abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
} "publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
}
for key, path in metadata_path.items(): for key, path in metadata_path.items():
try: try:
node = xml_article.find(path) node = xml_article.find(path)
......
{
"metadata": {
"name": "",
"signature": "sha256:71dcc854ee670084dd2d3795a96e0faa7d3feb1f1958d41b08c32fe1a0d70be9"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Ok!"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"plant-pathogenic rna virus\n",
"significant source\n",
"result\n",
"host populations\n",
"in\n",
"arthropod hosts\n",
"unique example\n",
"spread\n",
"tobacco ringspot\n",
"colony survival\n",
"apis mellifera\n",
"other bee viruses"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"negative impact\n",
"threat\n",
"honeybees\n",
"varroa mites\n",
"intracellular life cycle\n",
"virus\n",
"conjunction\n",
"honeybee hosts\n",
"bee hemolymph\n",
"distinct lineage\n",
"transkingdom host alteration"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"monophyletic clade\n",
"prevalence\n",
"winter\n",
"pathogen host shifts\n",
"furthermore\n",
"species-level genetic variation\n",
"trsvs\n",
"diseases\n",
"gradual decline\n",
"domesticates\n",
"systemic invasion"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"high mutation rates\n",
"pathogenesis\n",
"entire body\n",
"humans\n",
"plant hosts\n",
"infections\n",
"virions\n",
"plant\n",
"varroa"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"infectious diseases\n",
"winter colony collapse\n",
"infected colonies\n",
"rna viruses\n",
"gastric cecum"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"trsv-infected individuals\n",
"instances\n",
"host ranges\n",
"health\n",
"viruses\n",
"study\n",
"bees\n",
"ectoparasitic varroa\n",
"present study\n",
"tree topology"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"animal kingdoms\n",
"phylogenetic analysis\n",
"colonies\n",
"feed\n",
"common ancestor\n",
"trsv\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
}
],
"metadata": {}
}
]
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment