Commit 5036bc48 authored by Mathieu Rodic's avatar Mathieu Rodic

[FEATURE] RIS & ISI file parsers - Now ready to go!

...just need to add some elements in RisFileParser._parameters and IsiFileParser._parameters.
parent a8532659
{ {
"metadata": { "metadata": {
"name": "", "name": "",
"signature": "sha256:7c80ed9f4b088e13444efb451a1ee46e5727247be14aaf30ddf0236a49ac461b" "signature": "sha256:fdea95172a1e0072cc1f2a8f601b8abdd8aed5fbec5b600f2b29e57009dc8ef6"
}, },
"nbformat": 3, "nbformat": 3,
"nbformat_minor": 0, "nbformat_minor": 0,
"worksheets": [] "worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(\"RE abcdefgh\\n\"[3:-1])\n",
"print(b\"english\".decode())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"abcdefgh\n",
"english\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"parser = IsiFileParser(filepath='/home/mat/projects/gargantext/data_samples/isi.txt')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"parser.parse()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'value' is not defined",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-4-785d3def061e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/home/mat/projects/gargantext/gargantext/parsing/FileParsers/RisFileParser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(self, parentNode, tag)\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[1;31m# guid = metadata[\"guid\"]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 33\u001b[0m \u001b[1;31m# )\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 34\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat_metadata_dates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 35\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 36\u001b[0m \u001b[0mmetadata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/mat/projects/gargantext/gargantext/parsing/FileParsers/FileParser.py\u001b[0m in \u001b[0;36mformat_metadata_dates\u001b[1;34m(self, metadata)\u001b[0m\n\u001b[0;32m 155\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mprefix\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\"_month\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 156\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 157\u001b[1;33m \u001b[0mdate_string\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;34m\" \"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 158\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mprefix\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\"_day\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'value' is not defined"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(\"publication_date\"[-5:])\n",
"print(\"publication_date\"[:-5])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"_date\n",
"publication\n"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import dateutil.parser"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"d = dateutil.parser.parse(\"2014 OCT 11 1:2:3\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"d.strftime(\"%Y-%m-%d %H:%M:%S\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 33,
"text": [
"'2014-10-11 01:02:03'"
]
}
],
"prompt_number": 33
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"t = d.timetuple()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"d.strftime(\"%H\")\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 36,
"text": [
"'01'"
]
}
],
"prompt_number": 36
}
],
"metadata": {}
}
]
} }
\ No newline at end of file
This diff is collapsed.
...@@ -55,7 +55,8 @@ python manage.py syncdb ...@@ -55,7 +55,8 @@ python manage.py syncdb
Start the Python Notebook server Start the Python Notebook server
-------------------------------- --------------------------------
1) In Pyvenv: python manage.py shell_plus --notebook 1) In Pyvenv:
python manage.py shell_plus --notebook
2) Work from your browser! 2) Work from your browser!
...@@ -63,4 +64,5 @@ Start the Python Notebook server ...@@ -63,4 +64,5 @@ Start the Python Notebook server
Start the Django server Start the Django server
----------------------- -----------------------
In Pyvenv:
python manage.py runserver python manage.py runserver
\ No newline at end of file
import collections
from node.models import Node, NodeType, Language, Ngram, Node_Ngram from node.models import Node, NodeType, Language, Ngram, Node_Ngram
from parsing.NgramsExtractors import * from parsing.NgramsExtractors import *
import collections
import dateutil.parser
class NgramCache: class NgramCache:
""" """
This allows the fast retrieval of ngram ids This allows the fast retrieval of ngram ids
...@@ -138,3 +141,51 @@ class FileParser: ...@@ -138,3 +141,51 @@ class FileParser:
def parse(self): def parse(self):
return list() return list()
def format_metadata_dates(self, metadata):
    """Normalise the date fields of ``metadata`` in place and return it.

    Two passes are made over the keys:

    1. For every ``<prefix>_year`` key, the split components (``_year``,
       ``_month``, ``_day``, ``_hour``, ``_minute``, ``_second``) are joined
       into one string, parsed, and stored under ``<prefix>_date`` using the
       format ``"%Y-%m-%d %H:%M:%S"``. Components are appended in that
       order and assembly stops at the first missing one, so a minute is
       never attached without an hour.
    2. For every ``<prefix>_date`` key (including those produced by the
       first pass), the value is parsed and written back as the six
       zero-padded split components.

    Example: ``{"publication_date": "2014-10-23 09:57:42"}`` gains
    ``"publication_year": "2014"``, ``"publication_month": "10"``,
    ``"publication_day": "23"``, ``"publication_hour": "09"``,
    ``"publication_minute": "57"`` and ``"publication_second": "42"``.

    A value that cannot be parsed is skipped and the keys it would have
    produced are left as they were; other fields are still processed.

    :param metadata: dict of metadata fields; it is modified in place.
    :return: the same ``metadata`` dict.
    """
    import datetime  # local import: only needed for the parsing default

    # Fields absent from a date string are taken from this fixed value.
    # Without an explicit default, dateutil fills them in from today's
    # date, so "2014" alone would yield a different month/day on every run.
    fallback = datetime.datetime(1900, 1, 1)
    # Errors raised for malformed, out-of-range or non-string input.
    parse_errors = (ValueError, OverflowError, TypeError)
    # Optional components, in assembly order, with the text that precedes
    # each of them in the string handed to the parser.
    components = (
        ("_month", " "),
        ("_day", " "),
        ("_hour", " "),
        ("_minute", ":"),
        ("_second", ":"),
    )

    # First, check the split dates...
    # The prefix list is built up front because the loop adds keys.
    year_prefixes = [key[:-5] for key in metadata if key.endswith("_year")]
    for prefix in year_prefixes:
        # str() lets numeric components (e.g. an int year) be concatenated.
        date_string = str(metadata[prefix + "_year"])
        for suffix, separator in components:
            value = metadata.get(prefix + suffix)
            if value is None:
                break
            date_string += separator + str(value)
        try:
            parsed = dateutil.parser.parse(date_string, default=fallback)
        except parse_errors:
            # Best effort: leave this prefix without a "_date" entry.
            continue
        metadata[prefix + "_date"] = parsed.strftime("%Y-%m-%d %H:%M:%S")

    # ...then parse all the "date" fields, to split them into elements.
    date_prefixes = [key[:-5] for key in metadata if key.endswith("_date")]
    for prefix in date_prefixes:
        try:
            parsed = dateutil.parser.parse(
                metadata[prefix + "_date"], default=fallback
            )
        except parse_errors:
            # One bad date must not abort the formatting of the record.
            continue
        metadata[prefix + "_year"] = parsed.strftime("%Y")
        metadata[prefix + "_month"] = parsed.strftime("%m")
        metadata[prefix + "_day"] = parsed.strftime("%d")
        metadata[prefix + "_hour"] = parsed.strftime("%H")
        metadata[prefix + "_minute"] = parsed.strftime("%M")
        metadata[prefix + "_second"] = parsed.strftime("%S")

    # finally, return the result!
    return metadata
def format_metadata(self, metadata):
    """Return ``metadata`` after applying every formatting step.

    The only step at present is the date normalisation performed by
    ``format_metadata_dates``; its result is returned unchanged.
    """
    return self.format_metadata_dates(metadata)
\ No newline at end of file
from django.db import transaction from parsing.FileParsers.RisFileParser import RisFileParser
from parsing.FileParsers.FileParser import FileParser
class IsiFileParser(FileParser): class IsiFileParser(RisFileParser):
_parameters = { _parameters = {
b"ER": {"type": "delimiter"}, b"ER": {"type": "delimiter"},
b"TI": {"type": "metadata", "key": "title", "separator": b" "}, b"TI": {"type": "metadata", "key": "title", "separator": " "},
b"AU": {"type": "metadata", "key": "authors", "separator": b", "}, b"AU": {"type": "metadata", "key": "authors", "separator": ", "},
b"DI": {"type": "metadata", "key": "doi"}, b"DI": {"type": "metadata", "key": "doi"},
b"PY": {"type": "metadata", "key": "publication_year"},
b"PD": {"type": "metadata", "key": "publication_month"},
b"LA": {"type": "metadata", "key": "language"}, b"LA": {"type": "metadata", "key": "language"},
b"AB": {"type": "metadata", "key": "abstract", "separator": b" "}, b"AB": {"type": "metadata", "key": "abstract", "separator": " "},
b"WC": {"type": "metadata", "key": "fields"},
} }
def parse(self, parentNode=None, tag=True):
    """Read the tagged records of ``self._file`` and print each of them.

    Each line longer than two bytes is split into a two-byte field tag
    (``line[:2]``) and a value (``line[3:-1]``, i.e. without the byte that
    follows the tag and without the final byte of the line). A line whose
    tag is blank, or equal to the tag of the previous line, adds another
    value to the field being read. When a different tag shows up, the
    finished field is looked up in ``self._parameters``:

    * ``"metadata"`` entries store the collected values, joined with the
      entry's ``"separator"`` (``b""`` when absent), under the entry's
      ``"key"``;
    * ``"delimiter"`` entries end the current record, which is printed and
      then reset.

    NOTE: a field is only handled once a later line with a different tag
    has been read, so whatever is pending when the file ends is dropped.

    :param parentNode: not used yet; only the disabled ``create_document``
        call below refers to it.
    :param tag: not used by this method.
    :return: ``None``. ``self._file`` is closed on exit, even on error.
    """
    # Must be exactly as wide as ``line[:2]``: a shorter literal can never
    # compare equal, which would make every continuation line start a new
    # (unknown) field and lose the values of the field it continues.
    blank_tag = b"  "

    metadata = {}
    last_key = None
    last_values = []
    try:
        for line in self._file:
            if len(line) <= 2:
                continue
            parameter_key = line[:2]
            if parameter_key != blank_tag and parameter_key != last_key:
                # A new field starts here: store the one just completed.
                parameter = self._parameters.get(last_key)
                if parameter is not None:
                    if parameter["type"] == "metadata":
                        separator = parameter.get("separator", b"")
                        metadata[parameter["key"]] = separator.join(last_values)
                    elif parameter["type"] == "delimiter":
                        # Only needed by the disabled call below, so a
                        # record without a (known) language must not abort
                        # the whole parse.
                        try:
                            language = self._languages_fullname[
                                metadata["language"].lower().decode()
                            ]
                        except KeyError:
                            language = None
                        # self.create_document(
                        #     parentNode = parentNode,
                        #     title = metadata["title"],
                        #     contents = metadata["abstract"],
                        #     language = language,
                        #     metadata = metadata,
                        #     guid = metadata["guid"]
                        # )
                        print(metadata)
                        print()
                        metadata = {}
                last_key = parameter_key
                last_values = []
            last_values.append(line[3:-1])
    finally:
        self._file.close()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment