Commit b0ed631c authored by delanoe

[FEAT] Parsers: RIS ok, ISI ok, Istex and CSV need to be tested.

parent c362aed0
@@ -37,7 +37,7 @@ RESOURCETYPES = [
         'default_language': 'fr',
     },
     {   'name': 'Jstor (RIS format)',
-        # 'parser': RISParser,
+        'parser': RISParser,
         'default_language': 'en',
     },
     {   'name': 'Pubmed (XML format)',
@@ -45,15 +45,15 @@ RESOURCETYPES = [
         'default_language': 'en',
     },
     {   'name': 'Scopus (RIS format)',
-        # 'parser': RISParser,
+        'parser': RISParser,
         'default_language': 'en',
     },
     {   'name': 'Web of Science (ISI format)',
-        # 'parser': ISIParser,
+        'parser': ISIParser,
         'default_language': 'fr',
     },
     {   'name': 'Zotero (RIS format)',
-        # 'parser': RISParser,
+        'parser': RISParser,
         'default_language': 'en',
     },
     # { 'name': 'CSV',
...
@@ -94,7 +94,7 @@ WSGI_APPLICATION = 'gargantext.wsgi.application'
 DATABASES = {
     'default': {
         'ENGINE': 'django.db.backends.postgresql_psycopg2',
-        'NAME': 'garganfacto',
+        'NAME': 'gargandb',
         'USER': 'gargantua',
         'PASSWORD': 'C8kdcUrAQy66U',
         'HOST': '127.0.0.1',
...
@@ -47,3 +47,5 @@ for pycountry_language in pycountry.languages:
 # because PubMed has weird language codes:
 languages['fre'] = languages['fr']
 languages['ger'] = languages['de']
+languages['Français'] = languages['fr']
+languages['en_US'] = languages['en']
...
CSV.py (new file)

from ._Parser import Parser
# from ..NgramsExtractors import *
import sys
import csv
csv.field_size_limit(sys.maxsize)
import numpy as np
import os

class CSVParser(Parser):

    def CSVsample(self, filename, delim):
        ifile = open(filename, "r")
        reader = csv.reader(ifile, delimiter=delim)
        Freqs = []
        for row in reader:
            Freqs.append(len(row))
        ifile.close()
        return Freqs

    def parse(self, filename):
        sample_size = 10
        sample_file = filename.replace(".csv", "_sample.csv")
        hyperdata_list = []
        command_for_sample = "cat '" + filename + "' | head -n " + str(sample_size) + " > '" + sample_file + "'"
        os.system(command_for_sample)  # you just created a *_sample.csv

        # = = = = [ Getting delimiters frequency ] = = = = #
        PossibleDelimiters = [',', ' ', '\t', ';', '|', ':']
        AllDelimiters = {}
        for delim in PossibleDelimiters:
            AllDelimiters[delim] = self.CSVsample(sample_file, delim)
        # = = = = [ / Getting delimiters frequency ] = = = = #
        # OUTPUT example:
        #   AllDelimiters = {
        #     '\t': [1, 1, 1, 1, 1],
        #     ' ': [1, 13, 261, 348, 330],
        #     ',': [15, 15, 15, 15, 15],
        #     ';': [1, 1, 1, 1, 1],
        #     '|': [1, 1, 1, 1, 1]
        #   }

        # = = = = [ Stand.Dev=0 & Sum of delimiters ] = = = = #
        Delimiters = []
        for d in AllDelimiters:
            freqs = AllDelimiters[d]
            suma = np.sum(freqs)
            if suma > 0:
                std = np.std(freqs)
                # print([d, suma, len(freqs), std])
                if std == 0:
                    Delimiters.append([d, suma, len(freqs), std])
        # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
        # OUTPUT example:
        #   Delimiters = [
        #     ['\t', 5, 5, 0.0],
        #     [',', 75, 5, 0.0],
        #     ['|', 5, 5, 0.0]
        #   ]

        # = = = = [ Delimiter selection ] = = = = #
        Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
        HighestDelim = Sorted_Delims[0][0]
        # print("selected delimiter:", [HighestDelim])
        # = = = = [ / Delimiter selection ] = = = = #

        # = = = = [ First data coordinate ] = = = = #
        Coords = {
            "row": -1,
            "column": -1
        }
        ifile = open(sample_file, "r")
        reader = csv.reader(ifile, delimiter=HighestDelim)
        for rownum, tokens in enumerate(reader):
            joined_tokens = "".join(tokens)
            if Coords["row"] < 0 and len(joined_tokens) > 0:
                Coords["row"] = rownum
                for columnum in range(len(tokens)):
                    t = tokens[columnum]
                    if len(t) > 0:
                        Coords["column"] = columnum
                        break
        ifile.close()
        # = = = = [ / First data coordinate ] = = = = #

        # = = = = [ Setting Headers ] = = = = #
        Headers_Int2Str = {}
        ifile = open(sample_file, "r")
        reader = csv.reader(ifile, delimiter=HighestDelim)
        for rownum, tokens in enumerate(reader):
            if rownum >= Coords["row"]:
                for columnum in range(Coords["column"], len(tokens)):
                    t = tokens[columnum]
                    Headers_Int2Str[columnum] = t
                break
        ifile.close()
        # = = = = [ / Setting Headers ] = = = = #
        # OUTPUT example:
        #   Headers_Int2Str = {
        #     0: 'publication_date',
        #     1: 'publication_month',
        #     2: 'publication_second',
        #     3: 'abstract'
        #   }

        # = = = = [ Reading the whole CSV and saving ] = = = = #
        hyperdata_list = []
        ifile = open(filename, "r")
        reader = csv.reader(ifile, delimiter=HighestDelim)
        for rownum, tokens in enumerate(reader):
            if rownum > Coords["row"]:
                RecordDict = {}
                for columnum in range(Coords["column"], len(tokens)):
                    data = tokens[columnum]
                    RecordDict[Headers_Int2Str[columnum]] = data
                hyperdata_list.append(RecordDict)
        ifile.close()
        # = = = = [ / Reading the whole CSV and saving ] = = = = #
        return hyperdata_list
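
A minimal usage sketch for the new parser, assuming the module lands at gargantext.util.parsers.CSV (the CSV import is still commented out in the package __init__ further below), that the base Parser class needs no constructor arguments, and using an illustrative file name:

# Hedged sketch: import path and file name are assumptions.
from gargantext.util.parsers.CSV import CSVParser

records = CSVParser().parse("corpus.csv")
# 'records' is a list of dicts keyed by the detected header row
for record in records[:3]:
    print(record)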
...
ISTex.py (new file)

from lxml import etree
from ._Parser import Parser
from datetime import datetime
from io import BytesIO
import json

class ISTex(Parser):

    def parse(self, thefile):
        json_data = open(thefile, "r")
        data = json.load(json_data)
        json_data.close()
        json_docs = data["hits"]
        hyperdata_list = []
        hyperdata_path = {
            "id"               : "id",
            "source"           : 'corpusName',
            "title"            : 'title',
            "genre"            : "genre",
            "language_iso3"    : 'language',
            "doi"              : 'doi',
            "host"             : 'host',
            "publication_date" : 'publicationDate',
            "abstract"         : 'abstract',
            # "authors"        : 'author',
            "authorsRAW"       : 'author',
            "keywords"         : "keywords"
        }
        suma = 0
        for json_doc in json_docs:
            hyperdata = {}
            for key, path in hyperdata_path.items():
                try:
                    # print(path, " ==> ", len(json_doc[path]))
                    hyperdata[key] = json_doc[path]
                except:
                    pass
            # print("|", hyperdata["language_iso3"])
            if "doi" in hyperdata:
                hyperdata["doi"] = hyperdata["doi"][0]

            keywords = []
            if "keywords" in hyperdata:
                for keyw in hyperdata["keywords"]:
                    keywords.append(keyw["value"])
                hyperdata["keywords"] = ", ".join(keywords)

            moredate = False
            moresource = False
            if "host" in hyperdata:
                if "genre" in hyperdata["host"] and len(hyperdata["host"]["genre"]) > 0:
                    if "genre" in hyperdata and len(hyperdata["genre"]) == 0:
                        hyperdata["genre"] = hyperdata["host"]["genre"]
                # print(hyperdata["host"])
                if "pubdate" in hyperdata["host"]:
                    onebuffer = hyperdata["publication_date"]
                    hyperdata["publication_date"] = []
                    hyperdata["publication_date"].append(onebuffer)
                    hyperdata["publication_date"].append(hyperdata["host"]["pubdate"])
                if "title" in hyperdata["host"]:
                    hyperdata["journal"] = hyperdata["host"]["title"]

            authors = False
            if "authorsRAW" in hyperdata:
                names = []
                for author in hyperdata["authorsRAW"]:
                    names.append(author["name"])
                hyperdata["authors"] = ", ".join(names)

            if "host" in hyperdata:
                hyperdata.pop("host")
            if "genre" in hyperdata:
                if len(hyperdata["genre"]) == 0:
                    hyperdata.pop("genre")
            if "language_iso3" in hyperdata:
                # retrieve lang if lang != [] and lang != ["unknown"]
                # ---------------------------------------------------
                if len(hyperdata["language_iso3"]) > 0 and hyperdata["language_iso3"][0] != "unknown":
                    hyperdata["language_iso3"] = hyperdata["language_iso3"][0]
                # default value = eng
                # possibly even better: langid.classify(abstract)
                else:
                    # NB: 97% of ISTEX docs are eng, hence the default
                    # ------------------------------------------------
                    hyperdata["language_iso3"] = "eng"
                    # (cf. api.istex.fr/document/?q=*&facet=language
                    #  and langid tests on the language=["unknown"] docs)

            if "publication_date" in hyperdata:
                RealDate = hyperdata["publication_date"]
                hyperdata.pop("publication_date")
                if isinstance(RealDate, list):
                    RealDate = RealDate[0]
                # print(RealDate, " | length:", len(RealDate))
                Decision = ""
                if len(RealDate) > 4:
                    if len(RealDate) > 8:
                        try:
                            Decision = datetime.strptime(RealDate, '%Y-%b-%d').date()
                        except:
                            try:
                                Decision = datetime.strptime(RealDate, '%Y-%m-%d').date()
                            except:
                                Decision = False
                    else:
                        try:
                            Decision = datetime.strptime(RealDate, '%Y-%b').date()
                        except:
                            try:
                                Decision = datetime.strptime(RealDate, '%Y-%m').date()
                            except:
                                Decision = False
                else:
                    try:
                        Decision = datetime.strptime(RealDate, '%Y').date()
                    except:
                        Decision = False
                if Decision != False:
                    hyperdata["publication_year"] = str(Decision.year)
                    hyperdata["publication_month"] = str(Decision.month)
                    hyperdata["publication_day"] = str(Decision.day)
                    hyperdata_list.append(hyperdata)
                    # print("\t||", hyperdata["title"])
                    # print("\t\t", Decision)
                    # print("=============================")
                # else:
                #     suma += 1
                #     if "pubdate" in json_doc:
                #         print("\tfail pubdate:", json_doc["pubdate"])
        # print("nb_hits:", len(json_docs))
        # print("\t - nb_fails:", suma)
        # print(" -- - - - - - -- - -")
        return hyperdata_list
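
Since Istex still needs testing, it may help to pin down the input shape the parser expects: a JSON file with a "hits" array whose entries use the keys mapped in hyperdata_path. A minimal sketch, with all values invented:

# Invented ISTEX-style payload, written to disk because ISTex.parse()
# takes a file path. 'language', 'doi' and 'keywords' are lists,
# matching how the parser unwraps them above.
import json

sample = {
    "hits": [{
        "id": "ABC123",
        "corpusName": "elsevier",
        "title": "An example title",
        "language": ["eng"],
        "doi": ["10.1000/xyz"],
        "publicationDate": "2014-06-01",
        "abstract": "An example abstract.",
        "author": [{"name": "J. Doe"}],
        "keywords": [{"value": "example"}],
    }]
}
with open("istex_sample.json", "w") as f:
    json.dump(sample, f)

docs = ISTex().parse("istex_sample.json")
# each doc now has authors and keywords joined into strings, and
# publication_year / publication_month / publication_day split out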
...
Isi.py (new file)

from .Ris import RISParser

class ISIParser(RISParser):

    _begin = 3
    _parameters = {
        b"ER": {"type": "delimiter"},
        b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
        b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
        b"DI": {"type": "hyperdata", "key": "doi"},
        b"SO": {"type": "hyperdata", "key": "journal"},
        b"PY": {"type": "hyperdata", "key": "publication_year"},
        b"PD": {"type": "hyperdata", "key": "publication_month"},
        b"LA": {"type": "hyperdata", "key": "language_fullname"},
        b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
        b"WC": {"type": "hyperdata", "key": "fields"},
    }
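
To illustrate what this mapping targets: ISI export lines carry a two-character tag followed by a space, so the value starts at offset 3 (hence _begin = 3, versus 6 for RIS lines shaped like "TI  - ..."), and "ER" terminates each record. A fabricated example, assuming the base Parser class needs no constructor arguments:

# Fabricated ISI record; only the layout matters here.
sample_isi = (
    b"TI An example article title\n"
    b"AU Doe, J.\n"
    b"SO An Example Journal\n"
    b"PY 2015\n"
    b"ER \n"
)
for doc in ISIParser().parse(sample_isi.splitlines(keepends=True)):
    print(doc)  # {'title': ..., 'authors': ..., 'journal': ..., 'publication_year': ...}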
...
Ris.py (new file)

from ._Parser import Parser
from gargantext.util.languages import languages
# from admin.utils import PrintException

class RISParser(Parser):

    # def __init__(self, language_cache=None):
    #     # super(Parser, self).__init__()
    #     self._languages_cache = LanguagesCache() if language_cache is None else language_cache

    _begin = 6
    _parameters = {
        b"ER": {"type": "delimiter"},
        b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
        b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
        b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
        b"T2": {"type": "hyperdata", "key": "journal"},
        b"UR": {"type": "hyperdata", "key": "doi"},
        b"PY": {"type": "hyperdata", "key": "publication_year"},
        b"PD": {"type": "hyperdata", "key": "publication_month"},
        b"LA": {"type": "hyperdata", "key": "language_iso2"},
        b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
        b"WC": {"type": "hyperdata", "key": "fields"},
    }

    def parse(self, file):
        hyperdata = {}
        last_key = None
        last_values = []
        # browse every line of the file
        for line in file:
            if len(line) > 2:
                # extract the parameter key
                parameter_key = line[:2]
                if parameter_key != b'  ' and parameter_key != last_key:
                    if last_key in self._parameters:
                        # translate the parameter key
                        parameter = self._parameters[last_key]
                        if parameter["type"] == "hyperdata":
                            separator = parameter["separator"] if "separator" in parameter else ""
                            hyperdata[parameter["key"]] = separator.join(last_values)
                        elif parameter["type"] == "delimiter":
                            if 'language_fullname' not in hyperdata.keys():
                                if 'language_iso3' not in hyperdata.keys():
                                    if 'language_iso2' not in hyperdata.keys():
                                        hyperdata['language_iso2'] = 'en'
                            yield hyperdata
                            hyperdata = {}
                    last_key = parameter_key
                    last_values = []
                try:
                    last_values.append(line[self._begin:-1].decode())
                except Exception as error:
                    print(error)
        # if a hyperdata object is left in memory, yield it as well
        if hyperdata:
            # try:
            #     if hyperdata['date_to_parse']:
            #         print(hyperdata['date_to_parse'])
            # except:
            #     pass
            # print(hyperdata['title'])
            yield hyperdata
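
One detail worth noting for callers: the tag keys are bytes literals and each value is taken with line[self._begin:-1].decode(), so the input file must be opened in binary mode. A usage sketch, with the import path and file name assumed:

# Hedged sketch: RISParser.parse() is a generator over a binary file.
from gargantext.util.parsers.Ris import RISParser

with open("zotero_export.ris", "rb") as f:
    for hyperdata in RISParser().parse(f):
        print(hyperdata.get("title"), hyperdata.get("publication_year"))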
...
__init__.py

-# from .Ris import RisParser
+from .Ris import RISParser
-# from .Isi import IsiParser
+from .Isi import ISIParser
 # from .Jstor import JstorParser
 # from .Zotero import ZoteroParser
 from .Pubmed import PubmedParser
 # 2015-12-08: 2-in-1 parser
 from .Europress import EuropressParser
 # from .ISTex import ISTexParser
 # from .CSV import CSVParser