from django.db import transaction from lxml import etree from .FileParser import FileParser from ..NgramsExtractors import * from datetime import datetime from io import BytesIO import json class ISTex(FileParser): def _parse(self, thefile): json_data=open(thefile,"r") data = json.load(json_data) json_data.close() json_docs = data["hits"] hyperdata_list = [] hyperdata_path = { "id" : "id", "source" : 'corpusName', "title" : 'title', "genre" : "genre", "language_iso3" : 'language', "doi" : 'doi', "host" : 'host', "publication_date" : 'publicationDate', # "authors" : 'author', "authorsRAW" : 'author', "keywords" : "keywords" } suma = 0 for json_doc in json_docs: hyperdata = {} for key, path in hyperdata_path.items(): try: # print(path," ==> ",len(json_doc[path])) hyperdata[key] = json_doc[path] except: pass # print("|",hyperdata["language_iso3"]) if "doi" in hyperdata: hyperdata["doi"] = hyperdata["doi"][0] keywords = [] if "keywords" in hyperdata: for keyw in hyperdata["keywords"]: keywords.append(keyw["value"] ) hyperdata["keywords"] = ", ".join( keywords ) moredate=False moresource=False if "host" in hyperdata: if "genre" in hyperdata["host"] and len(hyperdata["host"]["genre"])>0: if "genre" in hyperdata and len(hyperdata["genre"])==0: hyperdata["genre"] = hyperdata["host"]["genre"] # print(hyperdata["host"]) if "pubdate" in hyperdata["host"]: onebuffer = hyperdata["publication_date"] hyperdata["publication_date"] = [] hyperdata["publication_date"].append(onebuffer) hyperdata["publication_date"].append( hyperdata["host"]["pubdate"] ) if "title" in hyperdata["host"]: hyperdata["journal"] = hyperdata["host"]["title"] authors=False if "authorsRAW" in hyperdata: names = [] for author in hyperdata["authorsRAW"]: names.append(author["name"]) hyperdata["authors"] = ", ".join(names) if "host" in hyperdata: hyperdata.pop("host") if "genre" in hyperdata: if len(hyperdata["genre"])==0: hyperdata.pop("genre") if "language_iso3" in hyperdata: if len(hyperdata["language_iso3"])>0: hyperdata["language_iso3"] = hyperdata["language_iso3"][0] else: hyperdata["language_iso3"] = "eng" if "publication_date" in hyperdata: RealDate = hyperdata["publication_date"] if "publication_date" in hyperdata: hyperdata.pop("publication_date") if isinstance(RealDate, list): RealDate = RealDate[0] # print( RealDate ," | length:",len(RealDate)) Decision="" if len(RealDate)>4: if len(RealDate)>8: try: Decision = datetime.strptime(RealDate, '%Y-%b-%d').date() except: try: Decision = datetime.strptime(RealDate, '%Y-%m-%d').date() except: Decision=False else: try: Decision = datetime.strptime(RealDate, '%Y-%b').date() except: try: Decision = datetime.strptime(RealDate, '%Y-%m').date() except: Decision=False else: try: Decision = datetime.strptime(RealDate, '%Y').date() except: Decision=False if Decision!=False: hyperdata["publication_year"] = str(Decision.year) hyperdata["publication_month"] = str(Decision.month) hyperdata["publication_day"] = str(Decision.day) hyperdata_list.append(hyperdata) # print("\t||",hyperdata["title"]) # print("\t\t",Decision) # print("=============================") # else: # suma+=1 # if "pubdate" in json_doc: # print ("\tfail pubdate:",json_doc["pubdate"]) # print ("nb_hits:",len(json_docs)) # print("\t - nb_fails:",suma) # print(" -- - - - - - -- - -") return hyperdata_list