from django.db import transaction from lxml import etree from .FileParser import FileParser from ..NgramsExtractors import * from datetime import datetime from io import BytesIO import json class ISText(FileParser): def _parse(self, thefile): json_data=open(thefile,"r") data = json.load(json_data) json_data.close() json_docs = data["hits"] metadata_list = [] metadata_path = { "id" : "id", "source" : 'corpusName', "title" : 'title', "genre" : "genre", # "language_iso3" : 'MedlineCitation/Article/Language', "doi" : 'doi', "host" : 'host', "publication_date" : 'pubdate', # "authors" : 'author', "authorsRAW" : 'author', "keywords" : "keywords" } metadata = {} import pprint import datetime for json_doc in json_docs: for key, path in metadata_path.items(): try: # print(path," ==> ",len(json_doc[path])) metadata[key] = json_doc[path] except: pass # print("|",metadata["publication_date"]) if "doi" in metadata: metadata["doi"] = metadata["doi"][0] keywords = [] if "keywords" in metadata: for keyw in metadata["keywords"]: keywords.append(keyw["value"] ) metadata["keywords"] = ", ".join( keywords ) moredate=False moresource=False if "host" in metadata: if "genre" in metadata["host"] and len(metadata["host"]["genre"])>0: if "genre" in metadata and len(metadata["genre"])==0: metadata["genre"] = metadata["host"]["genre"] # print(metadata["host"]) if "pubdate" in metadata["host"]: onebuffer = metadata["publication_date"] metadata["publication_date"] = [] metadata["publication_date"].append(onebuffer) metadata["publication_date"].append( metadata["host"]["pubdate"] ) if "title" in metadata["host"]: metadata["journal"] = metadata["host"]["title"] authors=False if "authorsRAW" in metadata: names = [] for author in metadata["authorsRAW"]: names.append(author["name"]) metadata["authors"] = ", ".join(names) if "host" in metadata: metadata.pop("host") if "genre" in metadata: if len(metadata["genre"])==0: metadata.pop("genre") if "publication_date" in metadata and isinstance(metadata["publication_date"], list): if len(metadata["publication_date"])>1: d1 = metadata["publication_date"][0] d2 = metadata["publication_date"][1] # print("date1:",d1) # print("date2:",d2) if len(d1)==len(d2): metadata["publication_date"] = d2 # if int(d1)>int(d2): metadata["publication_date"] = d2 else: fulldate = "" year = d2[:4] fulldate+=year if len(d2)>4: month = d2[4:6] fulldate+="-"+month if len(d2)>6: day = d2[6:8] fulldate+="-"+day metadata["publication_date"] = fulldate else: if "copyrightdate" in json_doc: metadata["publication_date"] = json_doc["copyrightdate"] else: if "copyrightdate" in json_doc: metadata["publication_date"] = json_doc["copyrightdate"] print("||",metadata["title"]) metadata_list.append(metadata) print("=============================") print("\nlen list:",len(metadata_list)) return metadata_list