Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
54dae3c9
Commit
54dae3c9
authored
May 12, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[CERN XML PARSER] OK
parent
dc7d6860
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
61 additions
and
65 deletions
+61
-65
Cern.py
gargantext/util/parsers/Cern.py
+53
-60
_Parser.py
gargantext/util/parsers/_Parser.py
+8
-5
No files found.
gargantext/util/parsers/Cern.py
View file @
54dae3c9
from
._Parser
import
Parser
from
._Parser
import
Parser
from
datetime
import
datetime
from
datetime
import
datetime
from
bs4
import
BeautifulSoup
from
bs4
import
BeautifulSoup
#from io import BytesIO
from
io
import
StringIO
import
json
from
lxml
import
etree
from
lxml
import
etree
class
CernParser
(
Parser
):
class
CernParser
(
Parser
):
...
@@ -11,90 +8,86 @@ class CernParser(Parser):
...
@@ -11,90 +8,86 @@ class CernParser(Parser):
MARC21
=
{
MARC21
=
{
#here main author
#here main author
"100"
:{
"100"
:{
"a"
:
"author
_name
"
,
"a"
:
"author
s
"
,
"v"
:
"author
_affiliation
"
,
"v"
:
"author
s_affiliations
"
,
"w"
:
"author
_country
"
,
"w"
:
"author
s_countries
"
,
"m"
:
"author
_mail
"
,
"m"
:
"author
s_mails
"
,
},
},
#here cooauthor
#here cooauthor
mais rappatrié vers la list l'auteur avec main author [0]
"700"
:
{
"700"
:
{
"a"
:
"authors
_name
"
,
"a"
:
"authors"
,
"v"
:
"authors_affiliation"
,
"v"
:
"authors_affiliation
s
"
,
"w"
:
"authors_countr
y
"
,
"w"
:
"authors_countr
ies
"
,
},
},
"773"
:{
"773"
:{
"c"
:
"pages"
,
"c"
:
"pages"
,
"n"
:
"issue"
,
"n"
:
"issue"
,
"p"
:
"journal"
,
"p"
:
"journal"
,
"v"
:
"volume"
,
"v"
:
"volume"
,
"y"
:
"year"
"y"
:
"
publication_
year"
},
},
"024"
:
{
"a"
:
"doi"
},
"024"
:
{
"a"
:
"doi"
},
"037"
:
{
"a"
:
"arxiv"
},
#
"037": {"a":"arxiv"},
"022"
:
{
"a"
:
"isbn"
},
#
"022": {"a":"isbn"},
"245"
:
{
"a"
:
"title"
},
"245"
:
{
"a"
:
"title"
},
"520"
:
{
"a"
:
"abstract"
},
"520"
:
{
"a"
:
"abstract"
},
"260"
:
{
"b"
:
"publisher"
,
"c"
:
"pubdate"
},
"260"
:
{
"b"
:
"publisher"
,
"c"
:
"pub
lication_
date"
},
#"024": {"t":"date"},
"024"
:
{
"t"
:
"realdate_full_"
},
#correspond to query date
#"540": {"a":"licence"},
#"540": {"a":"licence"},
#"653": {"a":"keywords"},
#"653": {"a":"keywords"},
#
"856": {"u":"pdf_source"},
"856"
:
{
"u"
:
"pdf_source"
},
}
}
#~ hyperdata_item = {
#~ "journal" : '',
def
format_date
(
self
,
hyperdata
):
#~ "title" : '',
'''formatting pubdate'''
#~ "abstract" : '',
prefix
=
"publication"
#~ "title" : '',
date
=
datetime
.
strptime
(
hyperdata
[
prefix
+
"_date"
],
"
%
Y-
%
m-
%
d"
)
#~ "language_iso2" : 'en',
hyperdata
[
prefix
+
"_year"
]
=
date
.
strftime
(
'
%
Y'
)
#~ "doi" : '',
hyperdata
[
prefix
+
"_month"
]
=
date
.
strftime
(
"
%
m"
)
#~ "realdate_full_" : '',
hyperdata
[
prefix
+
"_day"
]
=
date
.
strftime
(
"
%
d"
)
#~ "realdate_year_" : '',
hyperdata
[
prefix
+
"_hour"
]
=
date
.
strftime
(
"
%
h"
)
#~ "realdate_month_" : '',
hyperdata
[
prefix
+
"_minute"
]
=
date
.
strftime
(
"
%
m"
)
#~ "realdate_day_" : '',
hyperdata
[
prefix
+
"_second"
]
=
date
.
strftime
(
"
%
s"
)
#~ "publication_year" : '',
hyperdata
[
prefix
+
"_date"
]
=
date
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
#~ "publication_month" : '',
return
hyperdata
#~ "publication_day" : '',
#~ "authors" : '',
#~ "authors_countries" : '',
#~ "authors_affiliations": '',
#~ "publisher": '',
#~ }
def
parse
(
self
,
file
):
def
parse
(
self
,
file
):
if
isinstance
(
file
,
str
):
hyperdata_list
=
[]
file
=
open
(
file
,
'rb'
)
doc
=
file
.
read
()
doc
=
etree
.
parse
(
file
.
read
())
soup
=
BeautifulSoup
(
doc
.
decode
(
"utf-8"
),
"lxml"
)
tree
=
etree
.
tostring
(
doc
)
#parser = etree.XMLParser()
hyperdata_list
=
[]
soup
=
BeautifulSoup
(
tree
,
"lxml"
)
for
record
in
soup
.
find_all
(
"record"
):
for
record
in
soup
.
find_all
(
"record"
):
r
=
{
v
:[]
for
v
in
self
.
MARC21
[
"700"
]
.
values
()}
hyperdata
=
{
v
:[]
for
v
in
self
.
MARC21
[
"100"
]
.
values
()}
r
[
"uid"
]
=
soup
.
find
(
"controlfield"
)
.
text
hyperdata
[
"uid"
]
=
soup
.
find
(
"controlfield"
)
.
text
hyperdata
[
"language_iso2"
]
=
"en"
for
data
in
soup
.
find_all
(
"datafield"
):
for
data
in
soup
.
find_all
(
"datafield"
):
tag
=
data
.
get
(
"tag"
)
tag
=
data
.
get
(
"tag"
)
if
tag
in
self
.
MARC21
.
keys
():
if
tag
in
self
.
MARC21
.
keys
():
for
sub
in
data
.
find_all
(
"subfield"
):
for
sub
in
data
.
find_all
(
"subfield"
):
code
=
sub
.
get
(
"code"
)
code
=
sub
.
get
(
"code"
)
if
code
in
self
.
MARC21
[
tag
]
.
keys
():
if
code
in
self
.
MARC21
[
tag
]
.
keys
():
if
tag
==
"700"
:
r
[
self
.
MARC21
[
tag
][
code
]]
.
append
(
sub
.
text
)
if
tag
==
"100"
:
if
tag
==
"100"
:
r
[
self
.
MARC21
[
"700"
][
code
]]
.
insert
(
0
,
sub
.
text
)
try
:
hyperdata
[
self
.
MARC21
[
"100"
][
code
]]
.
insert
(
0
,
sub
.
text
)
except
AttributeError
:
hyperdata
[
self
.
MARC21
[
"100"
][
code
]]
=
[
sub
.
text
]
#print ("1", self.MARC21["100"][code], hyperdata[self.MARC21["100"][code]])
elif
tag
==
"700"
:
#print ("7", self.MARC21["100"][code], hyperdata[self.MARC21["100"][code]])
try
:
hyperdata
[
self
.
MARC21
[
"100"
][
code
]]
.
append
(
sub
.
text
)
except
AttributeError
:
hyperdata
[
self
.
MARC21
[
"100"
][
code
]]
=
[
sub
.
text
]
else
:
else
:
r
[
self
.
MARC21
[
tag
][
code
]]
=
sub
.
text
hyperdata
[
self
.
MARC21
[
tag
][
code
]]
=
sub
.
text
print
(
r
)
#hyperdata_list.append(r["uid.decode('utf-8'))
hyperdata
[
"authors_countries"
]
=
(
","
)
.
join
(
hyperdata
[
"authors_countries"
])
break
hyperdata
[
"authors_affiliations"
]
=
(
","
)
.
join
(
hyperdata
[
"authors_affiliations"
])
hyperdata
[
"authors"
]
=
(
","
)
.
join
(
hyperdata
[
"authors"
])
hyperdata
[
"authors_mails"
]
=
(
","
)
.
join
(
hyperdata
[
"authors_mails"
])
hyperdata
=
self
.
format_date
(
hyperdata
)
hyperdata_list
.
append
(
hyperdata
)
return
hyperdata_list
return
hyperdata_list
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
pass
pass
#~ e = CernParser()
#~ hyperdata = e.parse(str(sys.argv[1]))
#~ for h in hyperdata:
#~ try:
#~ print(h['journal'], ":", h['publication_date'])
#~ except:
#~ pass
#~ break
gargantext/util/parsers/_Parser.py
View file @
54dae3c9
...
@@ -2,7 +2,6 @@ import datetime
...
@@ -2,7 +2,6 @@ import datetime
import
dateutil.parser
import
dateutil.parser
import
zipfile
import
zipfile
import
re
import
re
import
dateparser
as
date_parser
import
dateparser
as
date_parser
from
gargantext.util.languages
import
languages
from
gargantext.util.languages
import
languages
...
@@ -23,8 +22,12 @@ class Parser:
...
@@ -23,8 +22,12 @@ class Parser:
def
__del__
(
self
):
def
__del__
(
self
):
self
.
_file
.
close
()
self
.
_file
.
close
()
def
detect_format
(
self
,
accepted_format
):
def
detect_format
(
self
,
afile
,
a_formats
):
print
(
self
.
_file
[:
1000
])
#import magic
print
(
"Detecting format"
)
#print(magic.from_file(afile))
return
def
detect_encoding
(
self
,
string
):
def
detect_encoding
(
self
,
string
):
"""Useful method to detect the encoding of a document.
"""Useful method to detect the encoding of a document.
...
@@ -110,10 +113,10 @@ class Parser:
...
@@ -110,10 +113,10 @@ class Parser:
hyperdata
[
prefix
+
"_hour"
]
=
date
.
strftime
(
"
%
H"
)
hyperdata
[
prefix
+
"_hour"
]
=
date
.
strftime
(
"
%
H"
)
hyperdata
[
prefix
+
"_minute"
]
=
date
.
strftime
(
"
%
M"
)
hyperdata
[
prefix
+
"_minute"
]
=
date
.
strftime
(
"
%
M"
)
hyperdata
[
prefix
+
"_second"
]
=
date
.
strftime
(
"
%
S"
)
hyperdata
[
prefix
+
"_second"
]
=
date
.
strftime
(
"
%
S"
)
print
(
hyperdata
[
'publication_date'
])
# finally, return the transformed result!
# finally, return the transformed result!
return
hyperdata
return
hyperdata
print
(
hyperdata
[
'publication_date'
])
def
format_hyperdata_languages
(
self
,
hyperdata
):
def
format_hyperdata_languages
(
self
,
hyperdata
):
"""format the languages found in the hyperdata."""
"""format the languages found in the hyperdata."""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment