Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
f197ccb4
Commit
f197ccb4
authored
May 12, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[CERN XML PARSER] OK
parent
4454aa47
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
61 additions
and
65 deletions
+61
-65
Cern.py
gargantext/util/parsers/Cern.py
+53
-60
_Parser.py
gargantext/util/parsers/_Parser.py
+8
-5
No files found.
gargantext/util/parsers/Cern.py
View file @
f197ccb4
from
._Parser
import
Parser
from
datetime
import
datetime
from
bs4
import
BeautifulSoup
#from io import BytesIO
from
io
import
StringIO
import
json
from
lxml
import
etree
class
CernParser
(
Parser
):
...
...
@@ -11,90 +8,86 @@ class CernParser(Parser):
# Mapping of MARC21 datafield tags to {subfield code: hyperdata key}.
# Only the tags/codes listed here are extracted by parse().
MARC21 = {
    # Main author (tag 100).
    "100": {
        "a": "authors",
        "v": "authors_affiliations",
        "w": "authors_countries",
        "m": "authors_mails",
    },
    # Co-authors (tag 700): merged into the same lists as the main author,
    # which is kept at index 0.
    "700": {
        "a": "authors",
        "v": "authors_affiliations",
        "w": "authors_countries",
    },
    "773": {
        "c": "pages",
        "n": "issue",
        "p": "journal",
        "v": "volume",
        "y": "publication_year",
    },
    # Tag 024 carries both the DOI ("a") and the date matching the query
    # ("t"). They must live in ONE entry: a repeated "024" key in a dict
    # literal silently keeps only the last value, which dropped the DOI.
    "024": {
        "a": "doi",
        "t": "realdate_full_",
    },
    "245": {"a": "title"},
    "520": {"a": "abstract"},
    "260": {
        "b": "publisher",
        "c": "publication_date",
    },
    "856": {"u": "pdf_source"},
    # Mappings currently disabled:
    # "037": {"a": "arxiv"},
    # "022": {"a": "isbn"},
    # "540": {"a": "licence"},
    # "653": {"a": "keywords"},
}
#~ hyperdata_item = {
#~ "journal" : '',
#~ "title" : '',
#~ "abstract" : '',
#~ "title" : '',
#~ "language_iso2" : 'en',
#~ "doi" : '',
#~ "realdate_full_" : '',
#~ "realdate_year_" : '',
#~ "realdate_month_" : '',
#~ "realdate_day_" : '',
#~ "publication_year" : '',
#~ "publication_month" : '',
#~ "publication_day" : '',
#~ "authors" : '',
#~ "authors_countries" : '',
#~ "authors_affiliations": '',
#~ "publisher": '',
#~ }
def format_date(self, hyperdata):
    """Expand ``hyperdata["publication_date"]`` into its date components.

    The incoming value must be a ``YYYY-MM-DD`` string. The dict is updated
    in place with ``publication_year``, ``_month``, ``_day``, ``_hour``,
    ``_minute`` and ``_second`` (all zero-padded strings), and
    ``publication_date`` itself is rewritten as ``YYYY-MM-DD HH:MM:SS``.

    Args:
        hyperdata: dict holding at least the ``publication_date`` key.

    Returns:
        The same ``hyperdata`` dict, updated.

    Raises:
        KeyError: if ``publication_date`` is absent.
        ValueError: if the value does not match ``%Y-%m-%d``.
    """
    prefix = "publication"
    date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
    hyperdata[prefix + "_year"] = date.strftime("%Y")
    hyperdata[prefix + "_month"] = date.strftime("%m")
    hyperdata[prefix + "_day"] = date.strftime("%d")
    # Directives are case-sensitive: %H/%M/%S are hour/minute/second,
    # whereas %m is the month and %h/%s are non-portable extensions.
    hyperdata[prefix + "_hour"] = date.strftime("%H")
    hyperdata[prefix + "_minute"] = date.strftime("%M")
    hyperdata[prefix + "_second"] = date.strftime("%S")
    hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
    return hyperdata
def parse(self, file):
    """Parse a MARC21 XML export into a list of hyperdata dicts.

    Args:
        file: either a filesystem path (``str``) or an already opened
            file object. Bytes content is decoded as UTF-8.

    Returns:
        A list with one dict per ``<record>`` element. Author-related
        fields (the values of ``MARC21["100"]``) are comma-joined strings
        with the main author (tag 100) first; other mapped subfields are
        stored as plain text. Each dict is passed through ``format_date``.

    Raises:
        KeyError / ValueError: propagated from ``format_date`` when a
            record has no usable ``publication_date``.
    """
    # Only close the handle when it was opened here; a caller-supplied
    # file object stays under the caller's control.
    opened_here = isinstance(file, str)
    if opened_here:
        file = open(file, 'rb')
    try:
        doc = file.read()
    finally:
        if opened_here:
            file.close()
    # Accept text-mode handles as well as binary ones.
    if isinstance(doc, bytes):
        doc = doc.decode("utf-8")
    soup = BeautifulSoup(doc, "lxml")

    author_keys = self.MARC21["100"]
    hyperdata_list = []
    for record in soup.find_all("record"):
        hyperdata = {key: [] for key in author_keys.values()}
        # Search within the current record, not the whole document:
        # querying `soup` here gave every record the first record's uid
        # and the datafields of all records.
        hyperdata["uid"] = record.find("controlfield").text
        hyperdata["language_iso2"] = "en"

        for data in record.find_all("datafield"):
            tag = data.get("tag")
            if tag not in self.MARC21:
                continue
            for sub in data.find_all("subfield"):
                code = sub.get("code")
                if code not in self.MARC21[tag]:
                    continue
                if tag in ("100", "700"):
                    # Main author and co-authors share the same lists.
                    key = author_keys[code]
                    values = hyperdata.get(key)
                    if not isinstance(values, list):
                        values = hyperdata[key] = []
                    if tag == "100":
                        # The main author always comes first.
                        values.insert(0, sub.text)
                    else:
                        values.append(sub.text)
                else:
                    hyperdata[self.MARC21[tag][code]] = sub.text

        # Flatten the author-related lists to comma-separated strings.
        for key in author_keys.values():
            hyperdata[key] = ",".join(hyperdata[key])

        hyperdata = self.format_date(hyperdata)
        hyperdata_list.append(hyperdata)
    return hyperdata_list
if
__name__
==
"__main__"
:
pass
#~ e = CernParser()
#~ hyperdata = e.parse(str(sys.argv[1]))
#~ for h in hyperdata:
#~ try:
#~ print(h['journal'], ":", h['publication_date'])
#~ except:
#~ pass
#~ break
gargantext/util/parsers/_Parser.py
View file @
f197ccb4
...
...
@@ -2,7 +2,6 @@ import datetime
import
dateutil.parser
import
zipfile
import
re
import
dateparser
as
date_parser
from
gargantext.util.languages
import
languages
...
...
@@ -23,8 +22,12 @@ class Parser:
def __del__(self):
    """Close the parser's file handle when the instance is collected."""
    # __del__ is also invoked for instances whose construction failed
    # before `_file` was assigned, so the attribute may be missing.
    handle = getattr(self, "_file", None)
    if handle is not None:
        handle.close()
def detect_format(self, afile, a_formats):
    """Placeholder for file-format detection.

    Currently only announces that detection was requested; neither
    argument is inspected yet.

    Args:
        afile: the file whose format should be detected (unused).
        a_formats: the accepted formats (unused).

    Returns:
        None.
    """
    # A single definition is kept: an earlier one-argument variant was
    # shadowed by this one and sliced `self._file`.
    # TODO: real detection, e.g. with python-magic:
    #   import magic
    #   print(magic.from_file(afile))
    print("Detecting format")
    return None
def
detect_encoding
(
self
,
string
):
"""Useful method to detect the encoding of a document.
...
...
@@ -110,10 +113,10 @@ class Parser:
hyperdata
[
prefix
+
"_hour"
]
=
date
.
strftime
(
"
%
H"
)
hyperdata
[
prefix
+
"_minute"
]
=
date
.
strftime
(
"
%
M"
)
hyperdata
[
prefix
+
"_second"
]
=
date
.
strftime
(
"
%
S"
)
print
(
hyperdata
[
'publication_date'
])
# finally, return the transformed result!
return
hyperdata
print
(
hyperdata
[
'publication_date'
])
def
format_hyperdata_languages
(
self
,
hyperdata
):
"""format the languages found in the hyperdata."""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment