[FEATURE] RIS & ISI file parsers - Now ready to go!

...just need to add some elements in RisFileParser._parameters and IsiFileParser._parameters.

[FEATURE] RIS & ISI file parsers - Now ready to go!
...just need to add some elements in RisFileParser._parameters and IsiFileParser._parameters.
5036bc48 · Mathieu Rodic · a8532659 · 5036bc48 · 5036bc48 · 5036bc48
Commit 5036bc48 authored Oct 23, 2014 by Mathieu Rodic
5 changed files
--- a/.ipynb_checkpoints/Test ISI parsing-checkpoint.ipynb
+++ b/.ipynb_checkpoints/Test ISI parsing-checkpoint.ipynb
 {
 "metadata": {
  "name": "",
-  "signature": "sha256:7c80ed9f4b088e13444efb451a1ee46e5727247be14aaf30ddf0236a49ac461b"
+  "signature": "sha256:fdea95172a1e0072cc1f2a8f601b8abdd8aed5fbec5b600f2b29e57009dc8ef6"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
- "worksheets": []
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from parsing.FileParsers import *"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print(\"RE abcdefgh\\n\"[3:-1])\n",
+      "print(b\"english\".decode())"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "abcdefgh\n",
+        "english\n"
+       ]
+      }
+     ],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "parser = IsiFileParser(filepath='/home/mat/projects/gargantext/data_samples/isi.txt')"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 3
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "parser.parse()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "ename": "NameError",
+       "evalue": "name 'value' is not defined",
+       "output_type": "pyerr",
+       "traceback": [
+        "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+        "\u001b[1;32m<ipython-input-4-785d3def061e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+        "\u001b[1;32m/home/mat/projects/gargantext/gargantext/parsing/FileParsers/RisFileParser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(self, parentNode, tag)\u001b[0m\n\u001b[0;32m     32\u001b[0m                                     \u001b[1;31m# guid        = metadata[\"guid\"]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     33\u001b[0m                                 \u001b[1;31m# )\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 34\u001b[1;33m                                 \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat_metadata_dates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     35\u001b[0m                                 \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     36\u001b[0m                                 \u001b[0mmetadata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+        "\u001b[1;32m/home/mat/projects/gargantext/gargantext/parsing/FileParsers/FileParser.py\u001b[0m in \u001b[0;36mformat_metadata_dates\u001b[1;34m(self, metadata)\u001b[0m\n\u001b[0;32m    155\u001b[0m                 \u001b[0mkey\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mprefix\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\"_month\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    156\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 157\u001b[1;33m                     \u001b[0mdate_string\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;34m\" \"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    158\u001b[0m                     \u001b[0mkey\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mprefix\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\"_day\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    159\u001b[0m                     \u001b[1;32mif\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+        "\u001b[1;31mNameError\u001b[0m: name 'value' is not defined"
+       ]
+      }
+     ],
+     "prompt_number": 4
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print(\"publication_date\"[-5:])\n",
+      "print(\"publication_date\"[:-5])"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "_date\n",
+        "publication\n"
+       ]
+      }
+     ],
+     "prompt_number": 10
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import dateutil.parser"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 11
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "d = dateutil.parser.parse(\"2014  OCT 11 1:2:3\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 25
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "d.strftime(\"%Y-%m-%d %H:%M:%S\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 33,
+       "text": [
+        "'2014-10-11 01:02:03'"
+       ]
+      }
+     ],
+     "prompt_number": 33
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "t = d.timetuple()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 34
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "d.strftime(\"%H\")\n"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 36,
+       "text": [
+        "'01'"
+       ]
+      }
+     ],
+     "prompt_number": 36
+    }
+   ],
+   "metadata": {}
+  }
+ ]
 }
\ No newline at end of file
--- a/Test ISI parsing.ipynb
+++ b/Test ISI parsing.ipynb
--- a/init/README.rst
+++ b/init/README.rst
@@ -55,7 +55,8 @@ python manage.py syncdb
 Start the Python Notebook server
 --------------------------------

-1)  In Pyvenv: python manage.py shell_plus --notebook
+1)  In Pyvenv:
+    python manage.py shell_plus --notebook

 2)  Work from your browser!

@@ -63,4 +64,5 @@ Start the Python Notebook server
 Start the Django server
 -----------------------

+In Pyvenv:
 python manage.py runserver
\ No newline at end of file
--- a/parsing/FileParsers/FileParser.py
+++ b/parsing/FileParsers/FileParser.py
-import collections
 from node.models import Node, NodeType, Language, Ngram, Node_Ngram
 from parsing.NgramsExtractors import *

+import collections
+import dateutil.parser
+
+
 class NgramCache:
    """
    This allows the fast retrieval of ngram ids
@@ -138,3 +141,51 @@ class FileParser:
    def parse(self):
        return list()

+
+    def format_metadata_dates(self, metadata):
+        """Normalise the date fields found in the metadata (modified in place).
+
+        First pass: for every "<prefix>_year" key, the "<prefix>_month",
+        "_day", "_hour", "_minute" and "_second" values are appended in that
+        order, stopping at the first one that is missing. The resulting
+        string is parsed and, on success, stored under "<prefix>_date" as
+        "%Y-%m-%d %H:%M:%S".
+
+        Second pass: every "<prefix>_date" value is parsed and split into
+        "<prefix>_year", "_month", "_day", "_hour", "_minute" and "_second"
+        strings.
+
+        A value that cannot be parsed is skipped and left as it was.
+
+        Example: {"publication_date": "2014-10-23 09:57:42"} gains
+        "publication_year": "2014", "publication_month": "10",
+        "publication_day": "23", "publication_hour": "09",
+        "publication_minute": "57" and "publication_second": "42".
+
+        Returns the same dictionary that was passed in.
+        """
+        # Exceptions dateutil raises for input it cannot turn into a date.
+        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
+        parse_errors = (ValueError, OverflowError, TypeError)
+
+        # Optional components, each with the separator that precedes it.
+        optional_parts = (
+            ("_month", " "),
+            ("_day", " "),
+            ("_hour", " "),
+            ("_minute", ":"),
+            ("_second", ":"),
+        )
+        # Fields written by the second pass, with their strftime codes.
+        split_formats = (
+            ("_year", "%Y"),
+            ("_month", "%m"),
+            ("_day", "%d"),
+            ("_hour", "%H"),
+            ("_minute", "%M"),
+            ("_second", "%S"),
+        )
+
+        # NOTE(review): dateutil fills every field absent from the string
+        # from its `default` argument (today at midnight when not given), so
+        # a year-only entry ends up with today's month and day. Consider
+        # passing an explicit `default=` if that is not wanted.
+
+        # First, check the split dates...
+        # (the key list is built up front because the loop adds new keys)
+        prefixes = [key[:-5] for key in metadata.keys() if key[-5:] == "_year"]
+        for prefix in prefixes:
+            date_string = metadata[prefix + "_year"]
+            for suffix, separator in optional_parts:
+                key = prefix + suffix
+                if key not in metadata:
+                    # a finer-grained part is meaningless without this one
+                    break
+                date_string += separator + metadata[key]
+            try:
+                parsed = dateutil.parser.parse(date_string)
+                metadata[prefix + "_date"] = parsed.strftime("%Y-%m-%d %H:%M:%S")
+            except parse_errors:
+                # best effort: keep the split fields, do not create "_date"
+                pass
+
+        # ...then parse all the "date" fields, to split them into separate elements
+        prefixes = [key[:-5] for key in metadata.keys() if key[-5:] == "_date"]
+        for prefix in prefixes:
+            try:
+                date = dateutil.parser.parse(metadata[prefix + "_date"])
+            except parse_errors:
+                # one malformed date must not abort the whole record
+                continue
+            for suffix, date_format in split_formats:
+                metadata[prefix + suffix] = date.strftime(date_format)
+
+        # finally, return the result!
+        return metadata
+    
+    def format_metadata(self, metadata):
+        """Return ``metadata`` after applying every formatting step.
+
+        At present the only step is the date normalisation performed by
+        ``format_metadata_dates``.
+        """
+        return self.format_metadata_dates(metadata)
\ No newline at end of file
--- a/parsing/FileParsers/IsiFileParser.py
+++ b/parsing/FileParsers/IsiFileParser.py
-from django.db import transaction
-from parsing.FileParsers.FileParser import FileParser
+from parsing.FileParsers.RisFileParser import RisFileParser


-class IsiFileParser(FileParser):
+class IsiFileParser(RisFileParser):
+    """Parser for ISI files.
+
+    Inherits everything from RisFileParser and only supplies its own
+    ``_parameters`` tag table.
+    """
    
+    # Maps the two-byte tag at the start of a line to how that line is handled:
+    #   "delimiter" marks the end of a record;
+    #   "metadata" stores the value under "key", joining multi-line values
+    #   with "separator" when one is given.
+    # NOTE(review): the code consuming this table lives in RisFileParser,
+    # which is not shown here -- confirm the semantics against it.
    _parameters = {
        b"ER":  {"type": "delimiter"},
-        b"TI":  {"type": "metadata", "key": "title", "separator": b" "},
-        b"AU":  {"type": "metadata", "key": "authors", "separator": b", "},
+        b"TI":  {"type": "metadata", "key": "title", "separator": " "},
+        b"AU":  {"type": "metadata", "key": "authors", "separator": ", "},
        b"DI":  {"type": "metadata", "key": "doi"},
+        b"PY":  {"type": "metadata", "key": "publication_year"},
+        b"PD":  {"type": "metadata", "key": "publication_month"},
        b"LA":  {"type": "metadata", "key": "language"},
-        b"AB":  {"type": "metadata", "key": "abstract", "separator": b" "},
+        b"AB":  {"type": "metadata", "key": "abstract", "separator": " "},
+        b"WC":  {"type": "metadata", "key": "fields"},
    }
-    
-    def parse(self, parentNode=None, tag=True):
-        metadata = {}
-        last_key = None
-        last_values = []
-        for line in self._file:
-            if len(line) > 2:
-                parameter_key = line[:2]
-                if parameter_key != b'  ' and parameter_key != last_key:
-                    if last_key in self._parameters:
-                        parameter = self._parameters[last_key]
-                        if parameter["type"] == "metadata":
-                            separator = parameter["separator"] if "separator" in parameter else b""
-                            metadata[parameter["key"]] = separator.join(last_values)
-                        elif parameter["type"] == "delimiter":
-                            language = self._languages_fullname[metadata["language"].lower().decode()]
-                            # self.create_document(
-                                # parentNode  = parentNode,
-                                # title       = metadata["title"],
-                                # contents    = metadata["abstract"],
-                                # language    = language,
-                                # metadata    = metadata,
-                                # guid        = metadata["guid"]
-                            # )
-                            print(metadata)
-                            print()
-                            metadata = {}
-                    last_key = parameter_key
-                    last_values = []
-                last_values.append(line[3:-1])
-        self._file.close()
-