Commit 4454aa47 authored by c24b

en cours: parsers

parent 9a0747ad
......@@ -7,42 +7,68 @@ import json
from lxml import etree
class CernParser(Parser):
#mapping MARC21 ==> hyperdata
MARC21 = {
"100":{"a": "author_name",
"v": "author_affiliation",
"w": "author_country",
"m": "author_mail",
},
"700": {"a": "coauthor_name",
"v": "coauthor_affiliation",
"w": "coauthor_country",
#here main author
"100":{
"a": "author_name",
"v": "author_affiliation",
"w": "author_country",
"m": "author_mail",
},
"773":{ "c": "pages",
#here coauthors
"700": {
"a": "authors_name",
"v": "authors_affiliation",
"w": "authors_country",
},
"773":{
"c": "pages",
"n": "issue",
"p": "journal",
"v": "volume",
"y": "year"
},
"024": {"a":"DOI"},
"037": {"a":"ArXiv"},
"022": {"a":"ISSN"},
"245": {"a":"Title"},
"520": {"a":"Abstract"},
"260": {"b":"Publisher","c":"Pubdate"},
"024": {"t":"Date"},
"540": {"a":"Licence"},
"653": {"a":"keywords"},
"856": {"u":"pdf_source"},
"024": {"a":"doi"},
"037": {"a":"arxiv"},
"022": {"a":"isbn"},
"245": {"a":"title"},
"520": {"a":"abstract"},
"260": {"b":"publisher","c":"pubdate"},
#"024": {"t":"date"},
#"540": {"a":"licence"},
#"653": {"a":"keywords"},
#"856": {"u":"pdf_source"},
}
#~ hyperdata_item = {
#~ "journal" : '',
#~ "title" : '',
#~ "abstract" : '',
#~ "title" : '',
#~ "language_iso2" : 'en',
#~ "doi" : '',
#~ "realdate_full_" : '',
#~ "realdate_year_" : '',
#~ "realdate_month_" : '',
#~ "realdate_day_" : '',
#~ "publication_year" : '',
#~ "publication_month" : '',
#~ "publication_day" : '',
#~ "authors" : '',
#~ "authors_countries" : '',
#~ "authors_affiliations": '',
#~ "publisher": '',
#~ }
def parse(self, filebuf):
doc = etree.parse(filebuf)
def parse(self, file):
if isinstance(file, str):
file = open(file, 'rb')
doc = etree.parse(file.read())
tree = etree.tostring(doc)
#parser = etree.XMLParser()
hyperdata_list =[]
soup = BeautifulSoup(tree, "lxml")
for record in soup.find_all("record"):
r = {v:[] for v in self.MARC21["700"].values()}
r["uid"] = soup.find("controlfield").text
for data in soup.find_all("datafield"):
......@@ -53,8 +79,22 @@ class CernParser(Parser):
if code in self.MARC21[tag].keys():
if tag == "700":
r[self.MARC21[tag][code]].append(sub.text)
if tag == "100":
r[self.MARC21["700"][code]].insert(0,sub.text)
else:
r[self.MARC21[tag][code]] = sub.text
records.append(r.decode('utf-8'))
print(r)
#hyperdata_list.append(r["uid.decode('utf-8'))
break
return hyperdata_list
if __name__ == "__main__":
pass
#~ e = CernParser()
#~ hyperdata = e.parse(str(sys.argv[1]))
#~ for h in hyperdata:
#~ try:
#~ print(h['journal'], ":", h['publication_date'])
#~ except:
#~ pass
#~ break
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment