Commit f197ccb4 authored by c24b

[CERN XML PARSER] OK

parent 4454aa47
from ._Parser import Parser
from datetime import datetime
from bs4 import BeautifulSoup
#from io import BytesIO
from io import StringIO
import json
from lxml import etree
class CernParser(Parser):
@@ -11,90 +8,86 @@ class CernParser(Parser):
MARC21 = {
#here main author
"100":{
"a": "author_name",
"v": "author_affiliation",
"w": "author_country",
"m": "author_mail",
"a": "authors",
"v": "authors_affiliations",
"w": "authors_countries",
"m": "authors_mails",
},
-#here cooauthor
+#here co-author; merged back into the authors list, with the main author kept at index [0]
"700": {
"a": "authors_name",
"v": "authors_affiliation",
"w": "authors_country",
"a": "authors",
"v": "authors_affiliations",
"w": "authors_countries",
},
"773":{
"c": "pages",
"n": "issue",
"p": "journal",
"v": "volume",
"y": "year"
"y": "publication_year"
},
"024": {"a":"doi"},
"037": {"a":"arxiv"},
"022": {"a":"isbn"},
#"037": {"a":"arxiv"},
#"022": {"a":"isbn"},
"245": {"a":"title"},
"520": {"a":"abstract"},
"260": {"b":"publisher","c":"pubdate"},
#"024": {"t":"date"},
"260": {"b":"publisher","c":"publication_date"},
"024": {"t":"realdate_full_"}, #correspond to query date
#"540": {"a":"licence"},
#"653": {"a":"keywords"},
#"856": {"u":"pdf_source"},
"856": {"u":"pdf_source"},
}
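#For reference, the mapping above is applied to MARCXML datafields of roughly
#this shape (illustrative values only, not taken from a real CERN export):
#  <record>
#    <controlfield tag="001">12345</controlfield>
#    <datafield tag="100">
#      <subfield code="a">Doe, J.</subfield>
#      <subfield code="v">Example University</subfield>
#      <subfield code="w">XX</subfield>
#    </datafield>
#    <datafield tag="700">
#      <subfield code="a">Smith, A.</subfield>
#    </datafield>
#  </record>
#Tag "100" (main author) and tag "700" (co-authors) feed the same "authors*"
#lists in parse() below, with the main author kept at index 0, so this record
#would yield authors == "Doe, J.,Smith, A." after the final join.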
#~ hyperdata_item = {
#~ "journal" : '',
#~ "title" : '',
#~ "abstract" : '',
#~ "title" : '',
#~ "language_iso2" : 'en',
#~ "doi" : '',
#~ "realdate_full_" : '',
#~ "realdate_year_" : '',
#~ "realdate_month_" : '',
#~ "realdate_day_" : '',
#~ "publication_year" : '',
#~ "publication_month" : '',
#~ "publication_day" : '',
#~ "authors" : '',
#~ "authors_countries" : '',
#~ "authors_affiliations": '',
#~ "publisher": '',
#~ }
def format_date(self, hyperdata):
'''Split publication_date into separate year/month/day/hour/minute/second fields and normalize it.'''
prefix = "publication"
date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
hyperdata[prefix + "_year"] = date.strftime('%Y')
hyperdata[prefix + "_month"] = date.strftime("%m")
hyperdata[prefix + "_day"] = date.strftime("%d")
hyperdata[prefix + "_hour"] = date.strftime("%h")
hyperdata[prefix + "_minute"] = date.strftime("%m")
hyperdata[prefix + "_second"] = date.strftime("%s")
hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
return hyperdata
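#For example (assuming the "260" $c subfield held "2015-07-02"), format_date
#turns {"publication_date": "2015-07-02"} into
#  publication_year "2015", publication_month "07", publication_day "02",
#  publication_hour/minute/second "00", and
#  publication_date "2015-07-02 00:00:00".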
def parse(self, file):
if isinstance(file, str):
file = open(file, 'rb')
-doc = etree.parse(file.read())
-tree = etree.tostring(doc)
-#parser = etree.XMLParser()
-hyperdata_list =[]
-soup = BeautifulSoup(tree, "lxml")
+hyperdata_list = []
+doc = file.read()
+soup = BeautifulSoup(doc.decode("utf-8"), "lxml")
for record in soup.find_all("record"):
-r = {v:[] for v in self.MARC21["700"].values()}
-r["uid"] = soup.find("controlfield").text
+hyperdata = {v:[] for v in self.MARC21["100"].values()}
+hyperdata["uid"] = soup.find("controlfield").text
+hyperdata["language_iso2"] = "en"
for data in soup.find_all("datafield"):
tag = data.get("tag")
if tag in self.MARC21.keys():
for sub in data.find_all("subfield"):
code = sub.get("code")
if code in self.MARC21[tag].keys():
if tag == "700":
r[self.MARC21[tag][code]].append(sub.text)
if tag == "100":
r[self.MARC21["700"][code]].insert(0,sub.text)
try:
hyperdata[self.MARC21["100"][code]].insert(0,sub.text)
except AttributeError:
hyperdata[self.MARC21["100"][code]] = [sub.text]
#print ("1", self.MARC21["100"][code], hyperdata[self.MARC21["100"][code]])
elif tag == "700":
#print ("7", self.MARC21["100"][code], hyperdata[self.MARC21["100"][code]])
try:
hyperdata[self.MARC21["100"][code]].append(sub.text)
except AttributeError:
hyperdata[self.MARC21["100"][code]] = [sub.text]
else:
-r[self.MARC21[tag][code]] = sub.text
-print(r)
-#hyperdata_list.append(r["uid.decode('utf-8'))
-break
+hyperdata[self.MARC21[tag][code]] = sub.text
+hyperdata["authors_countries"] = ",".join(hyperdata["authors_countries"])
+hyperdata["authors_affiliations"] = ",".join(hyperdata["authors_affiliations"])
+hyperdata["authors"] = ",".join(hyperdata["authors"])
+hyperdata["authors_mails"] = ",".join(hyperdata["authors_mails"])
+hyperdata = self.format_date(hyperdata)
+hyperdata_list.append(hyperdata)
return hyperdata_list
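#Roughly, each dict appended to hyperdata_list looks like this (field values
#are illustrative, not real CERN data):
#  {"uid": "12345", "language_iso2": "en",
#   "authors": "Doe, J.,Smith, A.", "authors_affiliations": "Example University,Other Lab",
#   "authors_countries": "XX,YY", "authors_mails": "",
#   "title": "...", "journal": "...", "publisher": "...",
#   "publication_date": "2015-07-02 00:00:00", "publication_year": "2015", ...}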
if __name__ == "__main__":
pass
#~ e = CernParser()
#~ hyperdata = e.parse(str(sys.argv[1]))
#~ for h in hyperdata:
#~ try:
#~ print(h['journal'], ":", h['publication_date'])
#~ except:
#~ pass
#~ break
@@ -2,7 +2,6 @@ import datetime
import dateutil.parser
import zipfile
import re
-import dateparser as date_parser
from gargantext.util.languages import languages
@@ -23,8 +22,12 @@ class Parser:
def __del__(self):
self._file.close()
-def detect_format(self, accepted_format):
-print(self._file[:1000])
+def detect_format(self, afile, a_formats):
+#import magic
+print("Detecting format")
+#print(magic.from_file(afile))
+return
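#A possible way to fill in this stub, assuming the python-magic package were
#added as a dependency (it is not one here; this is only a sketch):
#  import magic
#  def detect_format(self, afile, a_formats):
#      mime = magic.from_file(afile, mime=True)  # e.g. "text/xml" or "application/zip"
#      return mime if mime in a_formats else None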
def detect_encoding(self, string):
"""Useful method to detect the encoding of a document.
@@ -110,10 +113,10 @@ class Parser:
hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S")
+print(hyperdata['publication_date'])
# finally, return the transformed result!
return hyperdata
-print(hyperdata['publication_date'])
def format_hyperdata_languages(self, hyperdata):
"""format the languages found in the hyperdata."""