Commit b41241e5 authored by PkSM3

istex parser added

parent 76298b45
from django.db import transaction
from lxml import etree
from .FileParser import FileParser
from ..NgramsExtractors import *
from datetime import datetime
from io import BytesIO
import json
class ISText(FileParser):
    """Parser for ISTEX JSON result files.

    The input file is a JSON document whose ``"hits"`` entry holds one
    record per document; each record is flattened into a plain metadata
    dict (see ``_parse``).
    """

    # Output metadata key -> key read from each record of ``data["hits"]``.
    _METADATA_PATH = {
        "id": "id",
        "source": "corpusName",
        "title": "title",
        "genre": "genre",
        "doi": "doi",
        "host": "host",
        "publication_date": "pubdate",
        "authorsRAW": "author",
        "keywords": "keywords",
    }

    @staticmethod
    def _resolve_publication_date(own_date, host_date, copyright_date):
        """Pick the publication date of one record.

        Args:
            own_date: the record's own ``pubdate`` or ``None`` when absent.
            host_date: the ``pubdate`` of the record's host or ``None``.
            copyright_date: the record's ``copyrightdate`` or ``None``.

        Returns:
            The chosen date, or ``None`` when no date is available at all.
        """
        if own_date is not None and host_date is not None:
            if len(own_date) == len(host_date):
                return host_date
            # NOTE(review): the slicing assumes a compact "YYYYMMDD"-like
            # host date (possibly truncated) -- confirm against real data.
            pieces = (host_date[:4], host_date[4:6], host_date[6:8])
            return "-".join(piece for piece in pieces if piece)
        # Fewer than two dates to compare: the copyright date wins, then
        # whichever single date is known.
        if copyright_date is not None:
            return copyright_date
        return own_date if own_date is not None else host_date

    def _parse(self, thefile):
        """Read an ISTEX JSON file and return one metadata dict per hit.

        Args:
            thefile: path of the JSON file to read (passed to ``open``).

        Returns:
            A list of dicts, one per record of ``data["hits"]``. Each dict
            holds the fields of ``_METADATA_PATH`` that the record provides
            (except ``host``, which is consumed), plus ``authors`` (names
            joined with ", ") and ``journal`` (host title) when available.

        Raises:
            KeyError: if the file has no ``"hits"`` entry, or if a keyword
                lacks ``"value"`` or an author lacks ``"name"``.
        """
        # The context manager closes the file even if json.load() raises.
        with open(thefile, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)

        metadata_list = []
        for json_doc in data["hits"]:
            # A fresh dict per record: sharing a single dict across the
            # loop would make every list entry alias the last record and
            # leak fields from one record into the next.
            metadata = {
                key: json_doc[path]
                for key, path in self._METADATA_PATH.items()
                if path in json_doc
            }

            # Keep only the first DOI of a DOI list; drop an empty list.
            doi = metadata.get("doi")
            if isinstance(doi, list):
                if doi:
                    metadata["doi"] = doi[0]
                else:
                    del metadata["doi"]

            if "keywords" in metadata:
                metadata["keywords"] = ", ".join(
                    keyword["value"] for keyword in metadata["keywords"]
                )

            # The host record is only a source of fallback values and is
            # not part of the returned metadata.
            host = metadata.pop("host", None)
            if not isinstance(host, dict):
                host = {}

            # An empty own genre is replaced by the host's genre when that
            # one is non-empty, and removed otherwise.
            if "genre" in metadata and not metadata["genre"]:
                if host.get("genre"):
                    metadata["genre"] = host["genre"]
                else:
                    del metadata["genre"]

            if "title" in host:
                metadata["journal"] = host["title"]

            if "authorsRAW" in metadata:
                metadata["authors"] = ", ".join(
                    author["name"] for author in metadata["authorsRAW"]
                )

            publication_date = self._resolve_publication_date(
                metadata.get("publication_date"),
                host.get("pubdate"),
                json_doc.get("copyrightdate"),
            )
            if publication_date is None:
                metadata.pop("publication_date", None)
            else:
                metadata["publication_date"] = publication_date

            # .get(): a record without a title must not abort the parsing.
            print("||", metadata.get("title"))
            metadata_list.append(metadata)
            print("=============================")

        print("\nlen list:", len(metadata_list))
        return metadata_list
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment