Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
852f71b6
Commit
852f71b6
authored
May 12, 2015
by
Administrator
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FIX] Adding zotero parser
parent
69333ff9
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
52 additions
and
18 deletions
+52
-18
FileParser.py
parsing/FileParsers/FileParser.py
+14
-12
RisFileParser.py
parsing/FileParsers/RisFileParser.py
+12
-4
ZoteroFileParser.py
parsing/FileParsers/ZoteroFileParser.py
+23
-0
__init__.py
parsing/FileParsers/__init__.py
+1
-0
parsers_config.py
parsing/parsers_config.py
+2
-2
No files found.
parsing/FileParsers/FileParser.py
View file @
852f71b6
...
...
@@ -4,21 +4,21 @@ import zipfile
import
chardet
from
..Caches
import
LanguagesCache
class
FileParser
:
"""Base class for performing files parsing depending on their type.
"""
def __init__(self, language_cache=None):
    """Initialise the parser with a languages cache.

    Args:
        language_cache: Cache object to store on the instance. When it
            is None, a fresh ``LanguagesCache`` is created instead.
    """
    if language_cache is None:
        language_cache = LanguagesCache()
    self._languages_cache = language_cache
def detect_encoding(self, string):
    """Guess the character encoding of a raw document.

    Args:
        string: The raw content handed to ``chardet.detect``.

    Returns:
        The encoding name reported by chardet, or ``'UTF-8'`` when
        chardet could not determine one.
    """
    detected = chardet.detect(string)
    # chardet reports an undetectable input as {"encoding": None, ...}:
    # the key is present, so a ``.get()`` default would never be used.
    # Fall back on any falsy value instead.
    return detected.get('encoding') or 'UTF-8'
def
format_hyperdata_dates
(
self
,
hyperdata
):
"""Format the dates found in the hyperdata.
Examples:
...
...
@@ -27,7 +27,7 @@ class FileParser:
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
"""
# First, check the split dates...
prefixes
=
[
key
[:
-
5
]
for
key
in
hyperdata
.
keys
()
if
key
[
-
5
:]
==
"_year"
]
for
prefix
in
prefixes
:
...
...
@@ -51,21 +51,23 @@ class FileParser:
hyperdata
[
prefix
+
"_date"
]
=
dateutil
.
parser
.
parse
(
date_string
)
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
except
:
pass
# ...then parse all the "date" fields, to parse it into separate elements
prefixes
=
[
key
[:
-
5
]
for
key
in
hyperdata
.
keys
()
if
key
[
-
5
:]
==
"_date"
]
for
prefix
in
prefixes
:
date
=
dateutil
.
parser
.
parse
(
hyperdata
[
prefix
+
"_date"
])
print
(
'date'
)
hyperdata
[
prefix
+
"_year"
]
=
date
.
strftime
(
"
%
Y"
)
hyperdata
[
prefix
+
"_month"
]
=
date
.
strftime
(
"
%
m"
)
hyperdata
[
prefix
+
"_day"
]
=
date
.
strftime
(
"
%
d"
)
hyperdata
[
prefix
+
"_hour"
]
=
date
.
strftime
(
"
%
H"
)
hyperdata
[
prefix
+
"_minute"
]
=
date
.
strftime
(
"
%
M"
)
hyperdata
[
prefix
+
"_second"
]
=
date
.
strftime
(
"
%
S"
)
# finally, return the transformed result!
return
hyperdata
def
format_hyperdata_languages
(
self
,
hyperdata
):
"""format the languages found in the hyperdata."""
language
=
None
...
...
@@ -81,18 +83,18 @@ class FileParser:
hyperdata
[
"language_iso3"
]
=
language
.
iso3
hyperdata
[
"language_fullname"
]
=
language
.
fullname
return
hyperdata
def format_hyperdata(self, hyperdata):
    """Apply every hyperdata formatter and return the result.

    The date formatter runs first; its output is then passed to the
    language formatter.
    """
    with_dates = self.format_hyperdata_dates(hyperdata)
    return self.format_hyperdata_languages(with_dates)
def
_parse
(
self
,
file
):
"""This method shall be overriden by inherited classes."""
return
list
()
def
parse
(
self
,
file
):
"""Parse the file, and its children files found in the file.
"""
...
...
parsing/FileParsers/RisFileParser.py
View file @
852f71b6
...
...
@@ -3,15 +3,17 @@ from .FileParser import FileParser
from
..Caches
import
LanguagesCache
from
admin.utils
import
PrintException
class
RisFileParser
(
FileParser
):
def
__init__
(
self
,
language_cache
=
None
):
super
(
FileParser
,
self
)
.
__init__
()
self
.
_languages_cache
=
LanguagesCache
()
if
language_cache
is
None
else
language_cache
self
.
_begin
=
6
self
.
_parameters
=
{
b
"ER"
:
{
"type"
:
"delimiter"
},
b
"TI"
:
{
"type"
:
"hyperdata"
,
"key"
:
"title"
,
"separator"
:
" "
},
...
...
@@ -24,7 +26,7 @@ class RisFileParser(FileParser):
b
"AB"
:
{
"type"
:
"hyperdata"
,
"key"
:
"abstract"
,
"separator"
:
" "
},
b
"WC"
:
{
"type"
:
"hyperdata"
,
"key"
:
"fields"
},
}
def
_parse
(
self
,
file
):
hyperdata
=
{}
...
...
@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
print
(
error
)
# if a hyperdata object is left in memory, yield it as well
if
hyperdata
:
# try:
# if hyperdata['date_to_parse']:
# print(hyperdata['date_to_parse'])
# except:
# pass
#
#print(hyperdata['title'])
yield
hyperdata
parsing/FileParsers/ZoteroFileParser.py
0 → 100644
View file @
852f71b6
from
.RisFileParser
import
RisFileParser
from
..Caches
import
LanguagesCache
class ZoteroFileParser(RisFileParser):
    """RIS parser configured with the tag table used for Zotero exports."""

    def __init__(self, language_cache=None):
        """Initialise the parser and install the Zotero tag table.

        Args:
            language_cache: Optional languages cache forwarded to the
                parent constructor, which creates a new one when this
                is None.
        """
        # Name this class (not the parent) in super() so that
        # RisFileParser.__init__ is not skipped in the MRO.
        super(ZoteroFileParser, self).__init__(language_cache)

        # NOTE(review): presumably the offset at which a field value
        # starts on an RIS line -- confirm against RisFileParser._parse.
        self._begin = 6

        # Maps each RIS tag to the way its value is stored.
        self._parameters = {
            b"ER": {"type": "delimiter"},
            b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
            b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
            # NOTE(review): the "UR" tag is stored under the "doi" key --
            # confirm this is intended.
            b"UR": {"type": "hyperdata", "key": "doi"},
            b"DA": {"type": "hyperdata", "key": "publication_date"},
            b"PY": {"type": "hyperdata", "key": "publication_year"},
            b"PD": {"type": "hyperdata", "key": "publication_month"},
            b"LA": {"type": "hyperdata", "key": "language_iso2"},
            b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
            b"WC": {"type": "hyperdata", "key": "fields"},
        }
parsing/FileParsers/__init__.py
View file @
852f71b6
from
.RisFileParser
import
RisFileParser
from
.IsiFileParser
import
IsiFileParser
from
.JstorFileParser
import
JstorFileParser
from
.ZoteroFileParser
import
ZoteroFileParser
from
.PubmedFileParser
import
PubmedFileParser
from
.EuropressFileParser
import
EuropressFileParser
from
.ISText
import
ISText
parsing/parsers_config.py
View file @
852f71b6
...
...
@@ -4,11 +4,11 @@ parsers = {
'Pubmed (xml format)'
:
PubmedFileParser
,
'Web of Science (ISI format)'
:
IsiFileParser
,
'Scopus (RIS format)'
:
RisFileParser
,
'Zotero (RIS format)'
:
Jstor
FileParser
,
'Zotero (RIS format)'
:
Zotero
FileParser
,
'Jstor (RIS format)'
:
JstorFileParser
,
#'Europress' : EuropressFileParser,
'Europress (French)'
:
EuropressFileParser
,
'Europress (English)'
:
EuropressFileParser
,
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment