Commit f6122f1c, authored Oct 23, 2014 by Mathieu Rodic
[FEATURE] file parsers - separate parsing from extraction
Work in progress...
Parent: 86bbf12a
Showing 6 changed files with 605 additions and 97 deletions.
.ipynb_checkpoints/Test Pubmed parsing-checkpoint.ipynb  +127 -0
Test ISI parsing.ipynb                                   +34  -6
Test Pubmed parsing.ipynb                                +265 -0
parsing/FileParsers/FileParser.py                        +102 -46
parsing/FileParsers/PubmedFileParser.py                  +38  -45
parsing/FileParsers/RisFileParser.py                     +39  -0
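The new signatures in the diffs below split file handling into two steps: parse(parentNode, file=None) reads the source (recursing into zip archives) and creates one Document node per record, while extract(parentNode, keys) later walks those documents and extracts ngrams from the named metadata fields. A minimal usage sketch, reusing the corpus node and sample archive from the test notebook; the extract() call is an assumption about the intended API, which is still unfinished in this commit:

    # Sketch of the intended two-step workflow (still work in progress in this commit)
    fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')
    fileparser.parse(corpus)                           # step 1: create Document nodes under `corpus`
    fileparser.extract(corpus, ["title", "abstract"])  # step 2: extract ngrams from those fields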
.ipynb_checkpoints/Test Pubmed parsing-checkpoint.ipynb (new file, mode 100644)
{
"metadata": {
"name": "",
"signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
}
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
Test ISI parsing.ipynb
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:eac7c9b22e240bb0ef6d0aeec21261194d84a3f0ba53cd02af69f80d30ec5a17"
+  "signature": "sha256:70c2c8a4c8089e61195ee9da9232043152cf5e6c658a32115c0dcf990c2e98af"
  },
  "nbformat": 3,
  "nbformat_minor": 0,
...
@@ -122,17 +122,34 @@
     ],
     "language": "python",
     "metadata": {},
-    "outputs": []
+    "outputs": [],
+    "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
-     "d = dateutil.parser.parse(\"2014 OCT 11 1:2:3\")"
+     "import locale\n",
+     "locale.setlocale(locale.LC_ALL, \"fr_FR\")\n",
+     "d = dateutil.parser.parse(\"20 janvier 2004\")"
     ],
     "language": "python",
     "metadata": {},
-    "outputs": []
+    "outputs": [
+     {
+      "ename": "TypeError",
+      "evalue": "'NoneType' object is not iterable",
+      "output_type": "pyerr",
+      "traceback": [
+       "---------------------------------------------------------------------------",
+       "TypeError                                 Traceback (most recent call last)",
+       "<ipython-input-2-0756678732db> in <module>()",
+       "      1 import locale",
+       "      2 locale.setlocale(locale.LC_ALL, \"fr_FR\")",
+       "----> 3 d = dateutil.parser.parse(\"20 janvier 2004\")",
+       "/home/mat/projects/gargantext/myvenv/lib/python3.4/site-packages/dateutil/parser.py in parse(timestr, parserinfo, **kwargs)",
+       "    746         return parser(parserinfo).parse(timestr, **kwargs)",
+       "    747     else:",
+       "--> 748         return DEFAULTPARSER.parse(timestr, **kwargs)",
+       "/home/mat/projects/gargantext/myvenv/lib/python3.4/site-packages/dateutil/parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)",
+       "--> 310         res, skipped_tokens = self._parse(timestr, **kwargs)",
+       "    312         if res is None:",
+       "TypeError: 'NoneType' object is not iterable"
+      ]
+     }
+    ],
+    "prompt_number": 2
    },
    {
    "cell_type": "code",
...
@@ -142,7 +159,17 @@
     ],
     "language": "python",
     "metadata": {},
-    "outputs": []
+    "outputs": [
+     {
+      "metadata": {},
+      "output_type": "pyout",
+      "prompt_number": 7,
+      "text": [
+       "'2014-02-02 00:00:00'"
+      ]
+     }
+    ],
+    "prompt_number": 7
    },
    {
    "cell_type": "code",
...
@@ -152,7 +179,8 @@
     ],
     "language": "python",
     "metadata": {},
-    "outputs": []
+    "outputs": [],
+    "prompt_number": 8
    },
    {
    "cell_type": "code",
...
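The TypeError above is how older versions of dateutil fail when a string cannot be parsed at all: locale.setlocale() has no effect on dateutil.parser, which only knows English month names by default, so "20 janvier 2004" produces no result and the internal _parse returns None. A sketch of the usual workaround, supplying French month names through a parserinfo subclass (illustrative only, not part of this commit):

    # French month names via a custom parserinfo; locale settings are ignored by dateutil.
    import dateutil.parser

    class FrenchParserInfo(dateutil.parser.parserinfo):
        MONTHS = [('jan', 'janvier'), ('fév', 'février'), ('mar', 'mars'),
                  ('avr', 'avril'), ('mai', 'mai'), ('juin', 'juin'),
                  ('juil', 'juillet'), ('aoû', 'août'), ('sep', 'septembre'),
                  ('oct', 'octobre'), ('nov', 'novembre'), ('déc', 'décembre')]

    d = dateutil.parser.parse("20 janvier 2004", parserinfo=FrenchParserInfo())
    print(d)  # 2004-01-20 00:00:00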
Test Pubmed parsing.ipynb (new file, mode 100644)
This diff is collapsed.
parsing/FileParsers/FileParser.py
...
@@ -3,12 +3,12 @@ from parsing.NgramsExtractors import *
 import collections
 import dateutil.parser
+import zipfile

 class NgramCache:
-    """
-    This allows the fast retrieval of ngram ids
-    from the cache instead of using the database for every call
-    """
+    """This allows the fast retrieval of ngram ids
+    from a cache instead of calling the database every time
+    """

     def __init__(self, language):
...
@@ -35,9 +35,9 @@ class NgramCaches(collections.defaultdict):

-"""Base class for performing files parsing depending on their type.
-"""
 class FileParser:
+    """Base class for performing files parsing depending on their type.
+    """
     def __init__(self, file=None, filepath="", encoding="utf8"):
         # ...get the file item...
...
@@ -54,11 +54,10 @@ class FileParser:
         self._languages_fullname = {language.fullname.lower(): language for language in languages}
         self._languages_iso2 = {language.iso2.lower(): language for language in languages}
         self._languages_iso3 = {language.iso3.lower(): language for language in languages}
-        #self.parse()

-    """Extract the ngrams from a given text.
-    """
     def extract_ngrams(self, text, language):
+        """Extract the ngrams from a given text.
+        """
         # Get the appropriate ngrams extractor, if it exists
         if language not in self._extractors:
             extractor = None
...
@@ -75,20 +74,13 @@ class FileParser:
             for ngram in extractor.extract_ngrams(text):
                 ngram_text = ' '.join([token for token, tag in ngram])
                 tokens.append(ngram_text)
             return collections.Counter(
                 tokens
             )
-            # [token for token, tag in extractor.extract_ngrams(text)]
         else:
             return dict()

-    #TODO
-    # * make it possible to tag and parse separately
-    # * only tags some data (only titles, titles & abstracts, some chapters...)
-    """Add a document to the database.
-    """
-    def create_document(self, parentNode, title, metadata, guid=None):
+    def create_document(self, parentNode, title, contents, language, metadata, guid=None):
+        """Add a document to the database.
+        """
         metadata = self.format_metadata(metadata)
         # create or retrieve a resource for that document, based on its user id
         # if guid is None:
...
@@ -103,6 +95,10 @@ class FileParser:
         # if parentNode.descendants().filter(resource=resource).exists():
         #     return None
         # create the document itself
+        try:
+            language = self._languages_iso3[metadata["language_iso3"]]
+        except:
+            language = None
         childNode = Node(
             user = parentNode.user,
             type = self._document_nodetype,
...
@@ -113,39 +109,74 @@ class FileParser:
             parent = parentNode
         )
         childNode.save()
-        # parse it!
-        ngrams = self.extract_ngrams(contents, language)
-        # we are already in a transaction, so no use doing another one (or is there?)
-        ngramcache = self._ngramcaches[language]
-        for terms, occurences in ngrams.items():
-            ngram = ngramcache[terms]
-            Node_Ngram(node=childNode, ngram=ngram, occurences=occurences).save()
-        # return the created document
         return childNode

-    """Useful method to detect the document encoding.
-    Not sure it should be here actually.
-    """
     def detect_encoding(self, string):
+        """Useful method to detect the document encoding.
+        """
+        # see the chardet library
         pass

-    """Parse the data.
-    This method shall be overriden by inherited classes.
-    """
-    def parse(self):
+    def _parse(self, parentNode, file):
+        """This method shall be overriden by inherited classes."""
         return list()

+    def parse(self, parentNode, file=None):
+        """Parse the files found in the file.
+        This method shall be overriden by inherited classes.
+        """
+        if file is None:
+            with transaction.atomic():
+                self.parse(parentNode, self._file)
+        if zipfile.is_zipfile(file):
+            with zipfile.ZipFile(file) as zipArchive:
+                for filename in zipArchive.namelist():
+                    self.parse(parentNode, zipArchive.open(filename, "r"))
+        else:
+            self._parse(parentNode, file)
+
+    def extract(self, parentNode, keys):
+        """Extract ngrams from the child nodes, given a list of field names."""
+        # get all the descendants of type "document"
+        childNodes = parentNode.descendants().filter(type=self._document_nodetype)
+        with transaction.atomic():
+            for childNode in childNodes:
+                # most importantly...
+                metadata = childNode.metadata
+                # which extractor shall we use?
+                if language not in self._extractors:
+                    extractor = None
+                    if language.iso2 == 'en':
+                        # use English
+                        extractor = EnglishNgramsExtractor()
+                    elif language.iso2 == 'fr':
+                        # use French
+                        extractor = FrenchNgramsExtractor()
+                    else:
+                        # no recognized language has been specified...
+                        continue
+                    self._extractors[language] = extractor
+                # extract ngrams from every field, find the id, count them
+                ngrams = collections.defaultdict(int)
+                ngramscache = self._ngramcaches[language]
+                for key in keys:
+                    for ngram in extractor.extract_ngrams(text):
+                        ngram_text = ' '.join([token for token, tag in ngram])
+                        ngram_id = ngramscache[ngramtext].id
+                        ngrams[ngram_id] += 1
+                # insert node/ngram associations in the database
+                for ngram_id, occurences in ngrams.items():
+                    Node_Ngram(node_id=childNode.id, ngram_id=ngram_id, occurences=occurences).save()
+
     def format_metadata_dates(self, metadata):
         """Format the dates found in the metadata.
-        Example: {"publication_date": "2014-10-23 09:57:42"} -> {...}
+        Examples:
+            {"publication_date": "2014-10-23 09:57:42"}
+            -> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014"}
         """
         # First, check the split dates...
...
@@ -185,8 +216,33 @@ class FileParser:
         # finally, return the result!
         return metadata

+    def format_metadata_languages(self, metadata):
+        """format the languages found in the metadata."""
+        try:
+            if "language_fullname" in metadata:
+                language = self._languages_fullname[metadata["language_fullname"].lower()]
+            elif "language_iso3" in metadata:
+                language = self._languages_iso3[metadata["language_iso3"].lower()]
+            elif "language_iso2" in metadata:
+                language = self._languages_iso2[metadata["language_iso2"].lower()]
+            else:
+                return metadata
+        except KeyError:
+            # the language has not been found
+            for key in ["language_fullname", "language_iso3", "language_iso2"]:
+                try:
+                    metadata.pop(key)
+                except:
+                    continue
+            return metadata
+        metadata["language_iso2"] = language.iso2
+        metadata["language_iso3"] = language.iso3
+        metadata["language_fullname"] = language.fullname
+        return metadata
+
     def format_metadata(self, metadata):
         """Format the metadata."""
         metadata = self.format_metadata_dates(metadata)
-        return metadata
\ No newline at end of file
+        metadata = self.format_metadata_languages(metadata)
+        return metadata
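The new extract() above is clearly still work in progress: it reads language, text and ngramtext without ever assigning them. A sketch of how its inner loop might be wired once finished, inside the for childNode loop; both lookups are assumptions based on the helpers elsewhere in this file, not something this commit defines:

    # Sketch only (assumptions, not in this commit):
    language = self._languages_iso2[metadata["language_iso2"].lower()]  # assumed source of `language`
    for key in keys:
        text = metadata.get(key, "")                                    # assumed source of `text`
        for ngram in extractor.extract_ngrams(text):
            ngram_text = ' '.join(token for token, tag in ngram)
            ngram_id = ngramscache[ngram_text].id  # `ngramtext` is presumably a typo for `ngram_text`
            ngrams[ngram_id] += 1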
parsing/FileParsers/PubmedFileParser.py
...
@@ -2,54 +2,47 @@ from django.db import transaction
 from lxml import etree
 from parsing.FileParsers.FileParser import FileParser
 from parsing.NgramsExtractors import *
-import zipfile
+import datetime

 class PubmedFileParser(FileParser):
-    def parse(self, parentNode=None, tag=True):
+    def _parse(self, parentNode, file):
         # open the file as XML
         xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
-        with transaction.atomic():
-            with zipfile.ZipFile(self._file) as zipFile:
-                # initialize the list of documents
-                for filename in zipFile.namelist():
-                    documents = []
-                    file = zipFile.open(filename, "r")
-                    xml = etree.parse(file, parser=xml_parser)
-                    # parse all the articles, one by one
-                    # all database operations should be performed within one transaction
-                    xml_articles = xml.findall('PubmedArticle')
-                    for xml_article in xml_articles:
-                        # extract data from the document
-                        metadata = {}
-                        metadata_path = {
-                            "journal": 'MedlineCitation/Article/Journal/Title',
-                            "title": 'MedlineCitation/Article/ArticleTitle',
-                            "language_iso3": 'MedlineCitation/Article/Language',
-                            "doi": 'PubmedData/ArticleIdList/ArticleId[type=doi]',
-                            "abstract": 'MedlineCitation/Article/Abstract/AbstractText',
-                            "publication_year": 'MedlineCitation/DateCreated/Year',
-                            "publication_month": 'MedlineCitation/DateCreated/Month',
-                            "publication_day": 'MedlineCitation/DateCreated/Day',
-                        }
-                        for key, path in metadata_path.items():
-                            try:
-                                node = xml_article.find(path)
-                                metadata[key] = node.text
-                            except:
-                                metadata[key] = ""
-                        contents = metadata["abstract"]
-                        # create the document in the database
-                        document = self.create_document(
-                            parentNode = parentNode,
-                            title = metadata["title"],
-                            metadata = metadata,
-                            #guid = metadata["doi"],
-                        )
-                        if document:
-                            documents.append(document)
-        # return the list of documents
-        return documents
+        documents = []
+        xml = etree.parse(file, parser=xml_parser)
+        xml_articles = xml.findall('PubmedArticle')
+        with transaction.atomic():
+            # parse all the articles, one by one
+            # all database operations should be performed within one transaction
+            for xml_article in xml_articles:
+                # extract data from the document
+                metadata = {}
+                metadata_path = {
+                    "journal": 'MedlineCitation/Article/Journal/Title',
+                    "title": 'MedlineCitation/Article/ArticleTitle',
+                    "language_iso3": 'MedlineCitation/Article/Language',
+                    "doi": 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
+                    "abstract": 'MedlineCitation/Article/Abstract/AbstractText',
+                    "publication_year": 'MedlineCitation/DateCreated/Year',
+                    "publication_month": 'MedlineCitation/DateCreated/Month',
+                    "publication_day": 'MedlineCitation/DateCreated/Day',
+                }
+                for key, path in metadata_path.items():
+                    try:
+                        node = xml_article.find(path)
+                        metadata[key] = node.text
+                    except:
+                        metadata[key] = ""
+                contents = metadata["abstract"]
+                # create the document in the database
+                document = self.create_document(
+                    parentNode = parentNode,
+                    title = metadata["title"],
+                    contents = contents,
+                    language = self._languages_iso3[metadata["language_iso3"].lower()],
+                    metadata = metadata,
+                    #guid = metadata["doi"],
+                )
+                if document:
+                    documents.append(document)
        return documents
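For reference, a minimal standalone sketch of the metadata_path approach on a PubMed-like record; the record shape here is an assumption. Note that in real PubMed XML the attribute is named IdType and ElementPath attribute predicates need quotes, so the committed '[@type=doi]' would not match anything, and the try/except silently turns that into an empty string:

    # Standalone sketch of XPath-based metadata extraction with lxml (illustrative only).
    from lxml import etree

    xml = etree.fromstring(b"""
    <PubmedArticle>
      <MedlineCitation>
        <Article>
          <ArticleTitle>A title</ArticleTitle>
          <Language>eng</Language>
        </Article>
      </MedlineCitation>
      <PubmedData>
        <ArticleIdList><ArticleId IdType="doi">10.1000/xyz123</ArticleId></ArticleIdList>
      </PubmedData>
    </PubmedArticle>
    """)

    metadata_path = {
        "title": 'MedlineCitation/Article/ArticleTitle',
        "language_iso3": 'MedlineCitation/Article/Language',
        "doi": 'PubmedData/ArticleIdList/ArticleId[@IdType="doi"]',  # quoted attribute predicate
    }
    metadata = {key: xml.find(path).text for key, path in metadata_path.items()}
    print(metadata)  # {'title': 'A title', 'language_iso3': 'eng', 'doi': '10.1000/xyz123'}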
parsing/FileParsers/RisFileParser.py (new file, mode 100644)
from django.db import transaction
from parsing.FileParsers.FileParser import FileParser


class RisFileParser(FileParser):

    _parameters = {
    }

    def _parse(self, parentNode, file):
        metadata = {}
        last_key = None
        last_values = []
        with transaction.atomic():
            for line in self._file:
                if len(line) > 2:
                    parameter_key = line[:2]
                    if parameter_key != b'  ' and parameter_key != last_key:
                        if last_key in self._parameters:
                            parameter = self._parameters[last_key]
                            if parameter["type"] == "metadata":
                                separator = parameter["separator"] if "separator" in parameter else ""
                                metadata[parameter["key"]] = separator.join(last_values)
                            elif parameter["type"] == "delimiter":
                                language = self._languages_fullname[metadata["language"].lower()]
                                # self.create_document(
                                #     parentNode = parentNode,
                                #     title = metadata["title"],
                                #     metadata = metadata,
                                #     guid = metadata["doi"]
                                # )
                                print(self.format_metadata(metadata))
                                print()
                                metadata = {}
                        last_key = parameter_key
                        last_values = []
                    last_values.append(line[3:-1].decode())
        self._file.close()
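The _parameters table is still empty here; the lookup logic in _parse (a two-byte tag mapped to "type", "key" and an optional "separator") suggests entries along these lines. A hypothetical illustration for a few standard RIS tags, not something this commit defines:

    # Hypothetical _parameters mapping for standard RIS tags (assumption, not in this commit).
    # Keys are the two-byte tags read from line[:2]; a "delimiter" entry flushes one record.
    _parameters = {
        b"ER": {"type": "delimiter"},                                     # end of record
        b"TI": {"type": "metadata", "key": "title", "separator": " "},
        b"AU": {"type": "metadata", "key": "authors", "separator": ", "},
        b"PY": {"type": "metadata", "key": "publication_year"},
        b"LA": {"type": "metadata", "key": "language"},                   # needed by the "delimiter" branch
        b"DO": {"type": "metadata", "key": "doi"},
    }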