Commit f197ccb4 authored by c24b

[CERN XML PARSER] OK

parent 4454aa47
from ._Parser import Parser
from datetime import datetime
from bs4 import BeautifulSoup
#from io import BytesIO
from io import StringIO
import json
from lxml import etree

class CernParser(Parser):
@@ -11,90 +8,86 @@ class CernParser(Parser):
    MARC21 = {
        #here main author
        "100": {
            "a": "authors",
            "v": "authors_affiliations",
            "w": "authors_countries",
            "m": "authors_mails",
        },
        #here co-authors, merged into the same lists after the main author [0]
        "700": {
            "a": "authors",
            "v": "authors_affiliations",
            "w": "authors_countries",
        },
        "773": {
            "c": "pages",
            "n": "issue",
            "p": "journal",
            "v": "volume",
            "y": "publication_year"
        },
"024": {"a":"doi"}, "024": {"a":"doi"},
"037": {"a":"arxiv"}, #"037": {"a":"arxiv"},
"022": {"a":"isbn"}, #"022": {"a":"isbn"},
"245": {"a":"title"}, "245": {"a":"title"},
"520": {"a":"abstract"}, "520": {"a":"abstract"},
"260": {"b":"publisher","c":"pubdate"}, "260": {"b":"publisher","c":"publication_date"},
#"024": {"t":"date"}, "024": {"t":"realdate_full_"}, #correspond to query date
#"540": {"a":"licence"}, #"540": {"a":"licence"},
#"653": {"a":"keywords"}, #"653": {"a":"keywords"},
#"856": {"u":"pdf_source"}, "856": {"u":"pdf_source"},
} }
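    # Illustrative sketch (not part of the commit): the mapping above is applied
    # to MARCXML records of roughly this shape (all field values hypothetical):
    #
    #   <record>
    #     <controlfield tag="001">1234567</controlfield>
    #     <datafield tag="100">
    #       <subfield code="a">Doe, J.</subfield>
    #       <subfield code="v">CERN</subfield>
    #       <subfield code="w">CH</subfield>
    #     </datafield>
    #     <datafield tag="245">
    #       <subfield code="a">A hypothetical title</subfield>
    #     </datafield>
    #   </record>
    #
    # datafield@tag selects an entry of MARC21 and subfield@code selects the
    # hyperdata key, e.g. tag "100" / code "a" -> "authors".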
    def format_date(self, hyperdata):
        '''Expand the publication date into year/month/day/time fields.'''
        prefix = "publication"
        date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
        hyperdata[prefix + "_year"] = date.strftime("%Y")
        hyperdata[prefix + "_month"] = date.strftime("%m")
        hyperdata[prefix + "_day"] = date.strftime("%d")
        hyperdata[prefix + "_hour"] = date.strftime("%H")
        hyperdata[prefix + "_minute"] = date.strftime("%M")
        hyperdata[prefix + "_second"] = date.strftime("%S")
        hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
        return hyperdata
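    # Illustrative sketch (not part of the commit): assuming `parser` is a
    # CernParser instance and a record carried 260$c == "2015-06-18"
    # (hypothetical value), format_date expands it as follows:
    #
    #   hyperdata = parser.format_date({"publication_date": "2015-06-18"})
    #   hyperdata["publication_year"]   # "2015"
    #   hyperdata["publication_month"]  # "06"
    #   hyperdata["publication_day"]    # "18"
    #   hyperdata["publication_date"]   # "2015-06-18 00:00:00"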
    def parse(self, file):
        hyperdata_list = []
        doc = file.read()
        soup = BeautifulSoup(doc.decode("utf-8"), "lxml")
        for record in soup.find_all("record"):
            hyperdata = {v: [] for v in self.MARC21["100"].values()}
            # look up fields within the current record only, so that
            # multi-record files do not mix documents
            hyperdata["uid"] = record.find("controlfield").text
            hyperdata["language_iso2"] = "en"
            for data in record.find_all("datafield"):
                tag = data.get("tag")
                if tag in self.MARC21.keys():
                    for sub in data.find_all("subfield"):
                        code = sub.get("code")
                        if code in self.MARC21[tag].keys():
                            if tag == "100":
                                # main author goes to the head of the list
                                try:
                                    hyperdata[self.MARC21["100"][code]].insert(0, sub.text)
                                except AttributeError:
                                    hyperdata[self.MARC21["100"][code]] = [sub.text]
                            elif tag == "700":
                                # co-authors are appended after the main author
                                try:
                                    hyperdata[self.MARC21["100"][code]].append(sub.text)
                                except AttributeError:
                                    hyperdata[self.MARC21["100"][code]] = [sub.text]
                            else:
                                hyperdata[self.MARC21[tag][code]] = sub.text
            hyperdata["authors_countries"] = ",".join(hyperdata["authors_countries"])
            hyperdata["authors_affiliations"] = ",".join(hyperdata["authors_affiliations"])
            hyperdata["authors"] = ",".join(hyperdata["authors"])
            hyperdata["authors_mails"] = ",".join(hyperdata["authors_mails"])
            hyperdata = self.format_date(hyperdata)
            hyperdata_list.append(hyperdata)
        return hyperdata_list
if __name__ == "__main__":
    pass
#~ import sys
#~ e = CernParser()
#~ with open(sys.argv[1], "rb") as f:
#~     hyperdata = e.parse(f)
#~ for h in hyperdata:
#~     try:
#~         print(h['journal'], ":", h['publication_date'])
#~     except KeyError:
#~         pass
#~     break
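# Illustrative sketch (not part of the commit): each entry of the list returned
# by parse() is a flat dict; with hypothetical values it looks roughly like:
#
#   {
#       "uid": "1234567",
#       "language_iso2": "en",
#       "authors": "Doe, J.,Smith, A.",
#       "authors_affiliations": "CERN,MIT",
#       "authors_countries": "CH,US",
#       "authors_mails": "jdoe@example.org",
#       "title": "A hypothetical title",
#       "journal": "JHEP",
#       "publication_date": "2015-06-18 00:00:00",
#       "publication_year": "2015",
#       "publication_month": "06",
#       "publication_day": "18",
#   }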
@@ -2,7 +2,6 @@ import datetime
import dateutil.parser
import zipfile
import re
import dateparser as date_parser
from gargantext.util.languages import languages
@@ -23,8 +22,12 @@ class Parser:
    def __del__(self):
        self._file.close()
    def detect_format(self, afile, a_formats):
        #import magic
        print("Detecting format")
        #print(magic.from_file(afile))
        return
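    # Illustrative sketch (not part of the commit): one way to fill in the stub
    # above is the python-magic package hinted at by the commented lines. This
    # assumes a_formats is a collection of accepted MIME types:
    #
    #   import magic
    #   def detect_format(self, afile, a_formats):
    #       mime = magic.from_file(afile, mime=True)
    #       return mime if mime in a_formats else None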
    def detect_encoding(self, string):
        """Useful method to detect the encoding of a document.
@@ -110,10 +113,10 @@ class Parser:
hyperdata[prefix + "_hour"] = date.strftime("%H") hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M") hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S") hyperdata[prefix + "_second"] = date.strftime("%S")
        print(hyperdata['publication_date'])
        # finally, return the transformed result!
        return hyperdata
    def format_hyperdata_languages(self, hyperdata):
        """format the languages found in the hyperdata."""