[FEATURE] Dates in parsing metadata - All the dates are being formatted in FileParser

86bbf12a · Mathieu Rodic · 5036bc48 · 86bbf12a · 5036bc48 · 86bbf12a
Commit 86bbf12a authored Oct 23, 2014 by Mathieu Rodic
6 changed files
--- a/.ipynb_checkpoints/Test ISI parsing-checkpoint.ipynb
+++ b/.ipynb_checkpoints/Test ISI parsing-checkpoint.ipynb
--- a/.ipynb_checkpoints/test_parser_ngramextractor (Mat)-checkpoint.ipynb
+++ b/.ipynb_checkpoints/test_parser_ngramextractor (Mat)-checkpoint.ipynb
-{
- "metadata": {
-  "name": "",
-  "signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
- },
- "nbformat": 3,
- "nbformat_minor": 0,
- "worksheets": [
-  {
-   "cells": [
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "from pprint import pprint\n",
-      "from node.models import Node, NodeType, Language, Ngram\n",
-      "from django.contrib.auth.models import User\n",
-      "import parsing\n",
-      "from parsing.FileParsers import *"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 1
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "# Define user\n",
-      "try:\n",
-      "    user = User.objects.get(username='Mat')\n",
-      "except:\n",
-      "    user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
-      "    user.save()\n",
-      "\n",
-      "# Define document types\n",
-      "nodetypes = {}\n",
-      "for name in ['Corpus', 'Document']:\n",
-      "    try:\n",
-      "        nodetypes[name] = NodeType.objects.get(name=name)\n",
-      "    except:\n",
-      "        nodetypes[name] = NodeType(name=name)\n",
-      "        nodetypes[name].save()"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 2
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "Node.objects.all().delete()\n",
-      "corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
-      "corpus.save()"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 3
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": []
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "fileparser.parse(corpus)\n",
-      "print('Ok!')"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "Warning: parsing empty text\n",
-        "Warning: parsing empty text\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Warning: parsing empty text\n",
-        "Warning: parsing empty text"
-       ]
-      }
-     ]
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "for node_ngram in corpus.children.first().node_ngram_set.all():\n",
-      "    print(node_ngram.ngram.terms)"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": []
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [],
-     "language": "python",
-     "metadata": {},
-     "outputs": []
-    }
-   ],
-   "metadata": {}
-  }
- ]
-}
\ No newline at end of file
--- a/Test ISI parsing.ipynb
+++ b/Test ISI parsing.ipynb
--- a/parsing/FileParsers/FileParser.py
+++ b/parsing/FileParsers/FileParser.py
@@ -89,6 +89,7 @@ class FileParser:
    """Add a document to the database.
    """
    def create_document(self, parentNode, title, contents, language, metadata, guid=None):
+        metadata = self.format_metadata(metadata)
        # create or retrieve a resource for that document, based on its user id
 #        if guid is None:
 #            resource = Resource(guid=guid)

--- a/parsing/FileParsers/PubmedFileParser.py
+++ b/parsing/FileParsers/PubmedFileParser.py
@@ -16,7 +16,6 @@ class PubmedFileParser(FileParser):
            with zipfile.ZipFile(self._file) as zipFile:
                for filename in zipFile.namelist():
                    file = zipFile.open(filename, "r")
-#                    print(file.read())
                    xml = etree.parse(file, parser=xml_parser)
                    # parse all the articles, one by one
@@ -24,19 +23,17 @@ class PubmedFileParser(FileParser):
                    xml_articles = xml.findall('PubmedArticle')
                    for xml_article in xml_articles:
                        # extract data from the document
-                        date_year   = int(xml_article.find('MedlineCitation/DateCreated/Year').text)
+                        metadata = {}
-                        date_month  = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
-                        date_day    = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
-                        metadata = {
-                                "date_pub":      '%s-%s-%s' % (date_year, date_month, date_day),
-                                }
                        metadata_path = {
-                                "journal" : 'MedlineCitation/Article/Journal/Title',
+                            "journal"           : 'MedlineCitation/Article/Journal/Title',
-                                "title" : 'MedlineCitation/Article/ArticleTitle',
+                            "title"             : 'MedlineCitation/Article/ArticleTitle',
-                                "language_iso3" : 'MedlineCitation/Article/Language',
+                            "language_iso3"     : 'MedlineCitation/Article/Language',
-                                "doi" : 'PubmedData/ArticleIdList/ArticleId[type=doi]',
+                            "doi"               : 'PubmedData/ArticleIdList/ArticleId[type=doi]',
-                                "abstract" : 'MedlineCitation/Article/Abstract/AbstractText'
+                            "abstract"          : 'MedlineCitation/Article/Abstract/AbstractText',
-                                }
+                            "publication_year"  : 'MedlineCitation/DateCreated/Year',
+                            "publication_month" : 'MedlineCitation/DateCreated/Month',
+                            "publication_day"   : 'MedlineCitation/DateCreated/Day',
+                        }
                        for key, path in metadata_path.items():
                            try:
                                node = xml_article.find(path)

--- a/test_parser_ngramextractor (Mat).ipynb
+++ b/test_parser_ngramextractor (Mat).ipynb
-{
- "metadata": {
-  "name": "",
-  "signature": "sha256:71dcc854ee670084dd2d3795a96e0faa7d3feb1f1958d41b08c32fe1a0d70be9"
- },
- "nbformat": 3,
- "nbformat_minor": 0,
- "worksheets": [
-  {
-   "cells": [
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "from pprint import pprint\n",
-      "from node.models import Node, NodeType, Language, Ngram\n",
-      "from django.contrib.auth.models import User\n",
-      "import parsing\n",
-      "from parsing.FileParsers import *"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 1
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "# Define user\n",
-      "try:\n",
-      "    user = User.objects.get(username='Mat')\n",
-      "except:\n",
-      "    user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
-      "    user.save()\n",
-      "\n",
-      "# Define document types\n",
-      "nodetypes = {}\n",
-      "for name in ['Corpus', 'Document']:\n",
-      "    try:\n",
-      "        nodetypes[name] = NodeType.objects.get(name=name)\n",
-      "    except:\n",
-      "        nodetypes[name] = NodeType(name=name)\n",
-      "        nodetypes[name].save()"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 2
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "Node.objects.all().delete()\n",
-      "corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
-      "corpus.save()"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 3
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 4
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "fileparser.parse(corpus)\n",
-      "print('Ok!')"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "Warning: parsing empty text\n",
-        "Warning: parsing empty text\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Warning: parsing empty text\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Warning: parsing empty text\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Warning: parsing empty text"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "Ok!"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n"
-       ]
-      }
-     ],
-     "prompt_number": 5
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "for node_ngram in corpus.children.first().node_ngram_set.all():\n",
-      "    print(node_ngram.ngram.terms)"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "plant-pathogenic rna virus\n",
-        "significant source\n",
-        "result\n",
-        "host populations\n",
-        "in\n",
-        "arthropod hosts\n",
-        "unique example\n",
-        "spread\n",
-        "tobacco ringspot\n",
-        "colony survival\n",
-        "apis mellifera\n",
-        "other bee viruses"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "negative impact\n",
-        "threat\n",
-        "honeybees\n",
-        "varroa mites\n",
-        "intracellular life cycle\n",
-        "virus\n",
-        "conjunction\n",
-        "honeybee hosts\n",
-        "bee hemolymph\n",
-        "distinct lineage\n",
-        "transkingdom host alteration"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "monophyletic clade\n",
-        "prevalence\n",
-        "winter\n",
-        "pathogen host shifts\n",
-        "furthermore\n",
-        "species-level genetic variation\n",
-        "trsvs\n",
-        "diseases\n",
-        "gradual decline\n",
-        "domesticates\n",
-        "systemic invasion"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "high mutation rates\n",
-        "pathogenesis\n",
-        "entire body\n",
-        "humans\n",
-        "plant hosts\n",
-        "infections\n",
-        "virions\n",
-        "plant\n",
-        "varroa"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "infectious diseases\n",
-        "winter colony collapse\n",
-        "infected colonies\n",
-        "rna viruses\n",
-        "gastric cecum"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "trsv-infected individuals\n",
-        "instances\n",
-        "host ranges\n",
-        "health\n",
-        "viruses\n",
-        "study\n",
-        "bees\n",
-        "ectoparasitic varroa\n",
-        "present study\n",
-        "tree topology"
-       ]
-      },
-      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "\n",
-        "animal kingdoms\n",
-        "phylogenetic analysis\n",
-        "colonies\n",
-        "feed\n",
-        "common ancestor\n",
-        "trsv\n"
-       ]
-      }
-     ],
-     "prompt_number": 7
-    },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 6
-    }
-   ],
-   "metadata": {}
-  }
- ]
-}
\ No newline at end of file