Commit 82341b15 authored by c24b

PARSER and CRAWLER changed with NEW NAMING CONVENTION

parent 6c5d252b
# Scrapers config
QUERY_SIZE_N_MAX = 1000
from gargantext.constants import get_resource
from gargantext.util.scheduling import scheduled
from gargantext.util.db import session
from requests_futures.sessions import FuturesSession
import requests
from gargantext.models.nodes import Node
#needed by Crawler.create_corpus below
from gargantext.util.toolchain import parse_extract_indexhyperdata
from datetime import date
class Crawler:
"""Base class for performing search and add corpus file depending on the type
"""
def __init__(self, record):
#the name of the corpus that will be built in case of internal file parsing
self.record = record
self.name = record["corpus_name"]
self.project_id = record["project_id"]
self.user_id = record["user_id"]
self.resource = record["source"]
self.type = get_resource(self.resource)
self.query = record["query"]
#format the sampling
self.n_last_years = 5
self.YEAR = date.today().year
#not pretty, but the easy version: zero-pad the month to two digits
self.MONTH = str(date.today().month).zfill(2)
self.MAX_RESULTS = 1000
try:
self.results_nb = int(record["count"])
except KeyError:
#not set yet
self.results_nb = 0
try:
self.webEnv = record["webEnv"]
self.queryKey = record["queryKey"]
self.retMax = record["retMax"]
except KeyError:
#not set yet
self.queryKey = None
self.webEnv = None
self.retMax = 1
self.status = [None]
self.path = "/tmp/results.txt"
def tmp_file(self):
'''store the results in a temporary file, depending on the format type'''
raise NotImplementedError
def parse_query(self):
'''parse the query parameters, depending on the resource type,
and retrieve the set of activated search options
'''
raise NotImplementedError
def fetch(self):
if self.download():
self.create_corpus()
return self.corpus_id
def get_sampling_dates(self):
'''Create a list of (min, max) date pairs based on YEAR and MONTH
for the N last years.'''
dates = []
for i in range(self.n_last_years):
maxyear = self.YEAR - i
mindate = str(maxyear-1)+"/"+str(self.MONTH)
maxdate = str(maxyear)+"/"+str(self.MONTH)
print(mindate,"-",maxdate)
dates.append((mindate, maxdate))
return dates
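# Hedged sketch of the expected output, assuming today is May 2016
# (YEAR=2016, MONTH="05") and n_last_years=5:
# >>> self.get_sampling_dates()
# [('2015/05', '2016/05'), ('2014/05', '2015/05'), ('2013/05', '2014/05'),
#  ('2012/05', '2013/05'), ('2011/05', '2012/05')]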
def create_corpus(self):
#create a corpus node
corpus = Node(
name = self.query,
user_id = self.user_id,
parent_id = self.project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scraping data",
"language_id" : self.type["default_language"],
}
)
#attach the downloaded file(s) as resources
if len(self.paths) > 0:
for path in self.paths:
corpus.add_resource(
type = self.type["type"],
name = self.type["name"],
path = path
)
else:
corpus.add_resource(
type = self.type["type"],
name = self.type["name"],
path = self.path
)
session.add(corpus)
session.commit()
#the corpus id is only available once the node has been committed
self.corpus_id = corpus.id
scheduled(parse_extract_indexhyperdata(corpus.id))
return corpus
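# Illustrative (hypothetical) usage of a Crawler subclass, based on the
# record keys read in __init__ above; the concrete values and the use of
# ISTexCrawler here are assumptions, not part of this module:
#
# record = {"corpus_name": "my corpus", "project_id": 1, "user_id": 1,
#           "source": 8, "query": "graphene", "count": 100}
# crawler = ISTexCrawler(record)
# if crawler.scan_results() > 0:
#     corpus_id = crawler.fetch()   # download() then create_corpus()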
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** CERN Scraper *****
# ****************************
# Author:c24b
# Date: 27/05/2015
from ._Crawler import Crawler
import hmac, hashlib
import requests
import os
import random
import urllib.parse as uparse
from lxml import etree
from gargantext.settings import API_TOKENS
#from gargantext.util.files import build_corpus_path
from gargantext.util.db import session
from gargantext.models import Node
#needed by CernCrawler.create_corpus below
from gargantext.util.toolchain import parse_extract_indexhyperdata
class CernCrawler(Crawler):
'''CERN SCOAP3 API Interaction'''
def __generate_signature__(self, url):
'''create the request signature'''
#hmac-sha1 salted with the secret key
return hmac.new(self.secret, url, hashlib.sha1).hexdigest()
def __format_query__(self, query, of="xm", fields=None):
'''build the query parameters;
for the filter params see https://scoap3.org/scoap3-repository/xml-api/
'''
#dict_q = uparse.parse_qs(query)
dict_q = {}
#by default: search by pattern
dict_q["p"] = query
if fields is not None and isinstance(fields, list):
fields = ",".join(fields)
dict_q["f"] = fields
#outputformat: "xm", "xmt", "h", "html"
dict_q["of"]= of
return dict_q
def __format_url__(self, dict_q):
'''format the url with the encoded query'''
#add the apikey
dict_q["apikey"] = self.apikey
#quote each whole value (not just its first character)
params = "&".join([str(k)+"="+uparse.quote(str(v)) for k, v in sorted(dict_q.items())])
return self.BASE_URL+params
def sign_url(self, dict_q):
'''add signature'''
API = API_TOKENS["CERN"]
self.apikey = API["APIKEY"]
self.secret = API["APISECRET"].encode("utf-8")
self.BASE_URL = u"http://api.scoap3.org/search?"
url = self.__format_url__(dict_q)
return url+"&signature="+self.__generate_signature__(url.encode("utf-8"))
def create_corpus(self):
#create a corpus
corpus = Node(
name = self.query,
#user_id = self.user_id,
parent_id = self.project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : self.type["default_language"]
}
)
#add the resource
corpus.add_resource(
type = self.type["type"],
name = self.type["name"],
path = self.path)
try:
print("PARSING")
# p = eval(self.type["parser"])()
session.add(corpus)
session.commit()
self.corpus_id = corpus.id
parse_extract_indexhyperdata(corpus.id)
return self
except Exception as error:
print('WORKFLOW ERROR')
print(error)
session.rollback()
return self
def download(self):
import time
self.path = "/tmp/results.xml"
query = self.__format_query__(self.query)
url = self.sign_url(query)
start = time.time()
r = requests.get(url, stream=True)
downloaded = False
#the long part
with open(self.path, 'wb') as f:
print("Downloading file")
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
#print("===")
f.write(chunk)
downloaded = True
end = time.time()
#print (">>>>>>>>>>LOAD results", end-start)
return downloaded
def scan_results(self):
'''scan the number of results by fetching a single record
(only the author, page 1); the total is read from the comment
at the top of the returned page
'''
import time
self.results_nb = 0
query = self.__format_query__(self.query, of="hb")
query["ot"] = "100"
query["jrec"]='1'
query["rg"]='1'
url = self.sign_url(query)
print(url)
#start = time.time()
r = requests.get(url)
#end = time.time()
#print (">>>>>>>>>>LOAD results_nb", end-start)
if r.status_code == 200:
self.results_nb = int(r.text.split("-->")[0].split(': ')[-1][:-1])
return self.results_nb
else:
raise ValueError(r.status_code)
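# Sketch of the signed URL produced by sign_url above (values are placeholders):
# query "graphene" with of="xm" gives the sorted params
#   apikey=<APIKEY>&of=xm&p=graphene
# appended to http://api.scoap3.org/search?, and the final URL is
#   <url>&signature=<hex HMAC-SHA1 of <url>, keyed with APISECRET>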
from ._Crawler import *
import json
class ISTexCrawler(Crawler):
"""
ISTEX Crawler
"""
def __format_query__(self, query=None):
'''format the query (simple '+' encoding; urllib quoting could be used instead)'''
if query is not None:
query = query.replace(" ", "+")
return query
else:
self.query = self.query.replace(" ", "+")
return self.query
def scan_results(self):
#get the number of results
self.results_nb = 0
self.query = self.__format_query__()
_url = "http://api.istex.fr/document/?q="+self.query+"&size=0"
#"&output=id,title,abstract,pubdate,corpusName,authors,language"
r = requests.get(_url)
print(r)
if r.status_code == 200:
self.results_nb = int(r.json()["total"])
self.status.append("fetching results")
return self.results_nb
else:
self.status.append("error")
raise ValueError(r.status_code)
def download(self):
'''fetching items'''
downloaded = False
def get_hits(future):
'''here we directly get the result hits'''
response = future.result()
if response.status_code == 200:
return response.json()["hits"]
else:
return None
#session = FuturesSession()
#self.path = "/tmp/results.json"
self.status.append("fetching results")
paging = 100
self.query_max = self.results_nb
if self.query_max > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (self.query_max, QUERY_SIZE_N_MAX)
print("ERROR (scrap: istex d/l ): ",msg)
self.query_max = QUERY_SIZE_N_MAX
#urlreqs = []
with open(self.path, 'wb') as f:
for i in range(0, self.query_max, paging):
url_base = "http://api.istex.fr/document/?q="+self.query+"&output=*&from=%i&size=%i" %(i, paging)
r = requests.get(url_base)
if r.status_code == 200:
downloaded = True
f.write(r.text.encode("utf-8"))
else:
downloaded = False
self.status.insert(0, "error fetching ISTEX " + str(r.status_code))
break
return downloaded
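# Sketch of the paged requests issued above, e.g. for results_nb = 250
# (capped at QUERY_SIZE_N_MAX) and paging = 100:
#   http://api.istex.fr/document/?q=<query>&output=*&from=0&size=100
#   http://api.istex.fr/document/?q=<query>&output=*&from=100&size=100
#   http://api.istex.fr/document/?q=<query>&output=*&from=200&size=100
# each response body being appended to self.path as UTF-8 JSON.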
from gargantext.util.files import download
import sys
import time
import threading
from queue import Queue
from lxml import etree
if sys.version_info >= (3, 0):
from urllib.request import urlopen
else:
from urllib import urlopen
class Scraper :
def __init__(self):
self.queue_size = 8
self.q = Queue()
self.firstResults = []
self.lock = threading.Lock() # lock to serialize console output
self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
self.pubMedDB = 'Pubmed'
self.reportType = 'medline'
# medlineEsearch returns the global results:
# - count : number of results for the query
# - queryKey : key for reusing the result set on the history server
# - webEnv : web environment token used by subsequent efetch calls
def medlineEsearch(self, query):
# print ("MedlineFetcher::medlineEsearch :")
"""Get the number of results for 'query' in variable 'count',
plus 'queryKey' and 'webEnv', which are used by 'medlineEfetch'."""
# print(query)
origQuery = query
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
% ( self.pubMedEutilsURL, self.pubMedDB, query )
try:
eSearchResult = urlopen(eSearch)
data = eSearchResult.read()
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
count = findcount(root)[0]
findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
queryKey = findquerykey(root)[0]
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
except Exception as Error:
print(Error)
count = 0
queryKey = False
webEnv = False
origQuery = False
values = { "query" : origQuery
, "count" : int(count)
, "queryKey" : queryKey
, "webEnv" : webEnv
}
return values
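# Sketch of a call to medlineEsearch (count/keys below are placeholders):
# >>> self.medlineEsearch("2015[dp] brain")
# builds .../esearch.fcgi?db=Pubmed&retmax=1&usehistory=y&term=2015[dp]%20brain
# and returns e.g.
# {"query": "2015[dp] brain", "count": 12345,
#  "queryKey": "1", "webEnv": "NCID_1_..."}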
# RETMAX:
# Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
# maximum of 100,000 records
def medlineEfetchRAW(self, fullquery):
"""Build the efetch URL for query 'query', saving results to file every 'retmax' articles."""
query = fullquery["string"]
retmax = fullquery["retmax"]
count = fullquery["count"]
queryKey = fullquery["queryKey"]
webEnv = fullquery["webEnv"]
queryNoSpace = query.replace(' ', '') # no space in directory and file names, avoids errors
# print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
retstart = 0
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
return eFetch
# generic!
def download(self, url):
print(url)
filename = download(url)
with self.lock:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def do_work(self,item):
# time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item)
with self.lock:
# print(threading.current_thread().name, item)
return returnvalue
# The worker thread pulls an item from the queue and processes it
def worker(self):
while True:
item = self.q.get()
self.firstResults.append(self.do_work(item))
self.q.task_done()
def worker2(self):
while True:
item = self.q.get()
try:
result = self.download(item)
except Exception as error:
print(error)
result = False
self.firstResults.append(result)
self.q.task_done()
def chunks(self , l , n):
print("chunks:")
for i in range(0, len(l), n):
yield l[i:i+n]
# GLOBALLIMIT:
# I will retrieve exactly this amount of publications.
# The number of publications retrieved for a given year is:
# (k/N) * GlobalLimit
# \_ this is used as RETMAX
# - k : number of publications for year x (according to PubMed)
# - N : sum of every k (total number of publications according to PubMed)
# - GlobalLimit : number of publications I want
# (see the worked example after serialFetcher below)
def serialFetcher(self , yearsNumber , query, globalLimit):
# Create the queue and thread pool.
for i in range(self.queue_size):
t = threading.Thread(target=self.worker)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
start = time.perf_counter()
N = 0
# print ("MedlineFetcher::serialFetcher :")
thequeries = []
globalresults = []
for i in range(yearsNumber):
year = str(2015 - i) # NB: hardcoded reference year
# print ('YEAR ' + year)
# print ('---------\n')
pubmedquery = str(year) + '[dp] '+query
self.q.put( pubmedquery ) #put task in the queue
self.q.join()
print('time:',time.perf_counter() - start)
Total = 0
Fails = 0
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
Total += 1
if globalresults["queryKey"]==False:
Fails += 1
if globalresults["count"] > 0 :
N+=globalresults["count"]
queryhyperdata = { "string" : globalresults["query"]
, "count" : globalresults["count"]
, "queryKey" : globalresults["queryKey"]
, "webEnv" : globalresults["webEnv"]
, "retmax" : 0
}
thequeries.append ( queryhyperdata )
print("Total Number:", N,"publications")
print("And i want just:",globalLimit,"publications")
print("---------------------------------------\n")
for i,query in enumerate(thequeries):
k = query["count"]
proportion = k/float(N)
retmax_forthisyear = int(round(globalLimit*proportion))
query["retmax"] = retmax_forthisyear
if query["retmax"] == 0 : query["retmax"]+=1
print(query["string"],"\t[",k,">",query["retmax"],"]")
if ((Fails+1)/(Total+1)) == 1 : # for identifying the epic fail or connection error
thequeries = [False]
return thequeries
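# Worked example of the proportional allocation above (illustrative numbers):
# with N = 1000 publications over all years and globalLimit = 200,
# a year with k = 400 gets retmax = round(200 * 400/1000) = 80,
# and a year with k = 2 gets round(200 * 2/1000) = 0, bumped to 1.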
@@ -2,6 +2,8 @@ from ._Parser import Parser
 from datetime import datetime
 from bs4 import BeautifulSoup
 from lxml import etree
+#import asyncio
+#q = asyncio.Queue(maxsize=0)
 class CernParser(Parser):
 #mapping MARC21 ==> hyperdata
@@ -52,10 +54,15 @@ class CernParser(Parser):
 print("Date", hyperdata["publication_date"])
 return hyperdata
+#@asyncio.coroutine
 def parse(self, file):
+print("PARSING")
 hyperdata_list = []
 doc = file.read()
-soup = BeautifulSoup(doc.decode("utf-8"), "lxml")
+print(doc[:35])
+soup = BeautifulSoup(doc, "lxml")
+#print(soup.find("record"))
 for record in soup.find_all("record"):
 hyperdata = {v:[] for v in self.MARC21["100"].values()}
 hyperdata["uid"] = soup.find("controlfield").text
...
from ._Parser import Parser
# from ..NgramsExtractors import *
import sys
import csv
csv.field_size_limit(sys.maxsize)
import numpy as np
import os
class CSVParser(Parser):
def CSVsample( self, small_contents , delim) :
reader = csv.reader(small_contents, delimiter=delim)
Freqs = []
for row in reader:
Freqs.append(len(row))
return Freqs
def parse(self, filebuf):
print("CSV: parsing (assuming UTF-8 and LF line endings)")
contents = filebuf.read().decode("UTF-8").split("\n")
sample_size = 10
sample_contents = contents[0:sample_size]
hyperdata_list = []
# # = = = = [ Getting delimiters frequency ] = = = = #
PossibleDelimiters = [ ',',' ','\t', ';', '|', ':' ]
AllDelimiters = {}
for delim in PossibleDelimiters:
AllDelimiters[delim] = self.CSVsample( sample_contents , delim )
# # = = = = [ / Getting delimiters frequency ] = = = = #
# # OUTPUT example:
# # AllDelimiters = {
# # '\t': [1, 1, 1, 1, 1],
# # ' ': [1, 13, 261, 348, 330],
# # ',': [15, 15, 15, 15, 15],
# # ';': [1, 1, 1, 1, 1],
# # '|': [1, 1, 1, 1, 1]
# # }
# # = = = = [ Stand.Dev=0 & Sum of delimiters ] = = = = #
Delimiters = []
for d in AllDelimiters:
freqs = AllDelimiters[d]
suma = np.sum( freqs )
if suma >0:
std = np.std( freqs )
# print [ d , suma , len(freqs) , std]
if std == 0:
Delimiters.append ( [ d , suma , len(freqs) , std] )
# # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
# # OUTPUT example:
# # Delimiters = [
# # ['\t', 5, 5, 0.0],
# # [',', 75, 5, 0.0],
# # ['|', 5, 5, 0.0]
# # ]
# # = = = = [ Delimiter selection ] = = = = #
Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
HighestDelim = Sorted_Delims[0][0]
# HighestDelim = ","
print("CSV selected delimiter:",[HighestDelim])
# # = = = = [ / Delimiter selection ] = = = = #
# # = = = = [ First data coordinate ] = = = = #
Coords = {
"row": -1,
"column": -1
}
reader = csv.reader(contents, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader):
if rownum % 250 == 0:
print("CSV row: ", rownum)
joined_tokens = "".join (tokens)
if Coords["row"]<0 and len( joined_tokens )>0 :
Coords["row"] = rownum
for columnum in range(len(tokens)):
t = tokens[columnum]
if len(t)>0:
Coords["column"] = columnum
break
# # = = = = [ / First data coordinate ] = = = = #
# # = = = = [ Setting Headers ] = = = = #
Headers_Int2Str = {}
reader = csv.reader(contents, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader):
if rownum>=Coords["row"]:
for columnum in range( Coords["column"],len(tokens) ):
t = tokens[columnum]
Headers_Int2Str[columnum] = t
break
# print("Headers_Int2Str")
# print(Headers_Int2Str)
# # = = = = [ / Setting Headers ] = = = = #
# # OUTPUT example:
# # Headers_Int2Str = {
# # 0: 'publication_date',
# # 1: 'publication_month',
# # 2: 'publication_second',
# # 3: 'abstract'
# # }
# # = = = = [ Reading the whole CSV and saving ] = = = = #
hyperdata_list = []
reader = csv.reader(contents, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader):
if rownum>Coords["row"]:
RecordDict = {}
for columnum in range( Coords["column"],len(tokens) ):
data = tokens[columnum]
RecordDict[ Headers_Int2Str[columnum] ] = data
if len(RecordDict.keys())>0:
hyperdata_list.append( RecordDict )
# # = = = = [ / Reading the whole CSV and saving ] = = = = #
return hyperdata_list
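# Minimal usage sketch (the file name and the bare constructor call are
# assumptions about the Parser base class, which is not shown in full here):
# with open("corpus.csv", "rb") as f:
#     records = CSVParser().parse(f)
# # 'records' is a list of dicts keyed by the detected header row.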
from ._Parser import Parser
from datetime import datetime
from io import BytesIO
import json
class ISTexParser(Parser):
def parse(self, filebuf):
contents = filebuf.read().decode("UTF-8")
data = json.loads(contents)
filebuf.close()
json_docs = data["hits"]
hyperdata_list = []
hyperdata_path = {
"id" : "id",
"source" : 'corpusName',
"title" : 'title',
"genre" : "genre",
"language_iso3" : 'language',
"doi" : 'doi',
"host" : 'host',
"publication_date" : 'publicationDate',
"abstract" : 'abstract',
# "authors" : 'author',
"authorsRAW" : 'author',
"keywords" : "keywords"
}
suma = 0
for json_doc in json_docs:
hyperdata = {}
for key, path in hyperdata_path.items():
try:
# print(path," ==> ",len(json_doc[path]))
hyperdata[key] = json_doc[path]
except KeyError:
pass
# print("|",hyperdata["language_iso3"])
if "doi" in hyperdata:
hyperdata["doi"] = hyperdata["doi"][0]
keywords = []
if "keywords" in hyperdata:
for keyw in hyperdata["keywords"]:
keywords.append(keyw["value"] )
hyperdata["keywords"] = ", ".join( keywords )
moredate=False
moresource=False
if "host" in hyperdata:
if "genre" in hyperdata["host"] and len(hyperdata["host"]["genre"])>0:
if "genre" in hyperdata and len(hyperdata["genre"])==0:
hyperdata["genre"] = hyperdata["host"]["genre"]
# print(hyperdata["host"])
if "pubdate" in hyperdata["host"]:
onebuffer = hyperdata["publication_date"]
hyperdata["publication_date"] = []
hyperdata["publication_date"].append(onebuffer)
hyperdata["publication_date"].append( hyperdata["host"]["pubdate"] )
if "title" in hyperdata["host"]:
hyperdata["journal"] = hyperdata["host"]["title"]
authors=False
if "authorsRAW" in hyperdata:
names = []
for author in hyperdata["authorsRAW"]:
names.append(author["name"])
hyperdata["authors"] = ", ".join(names)
if "host" in hyperdata: hyperdata.pop("host")
if "genre" in hyperdata:
if len(hyperdata["genre"])==0:
hyperdata.pop("genre")
if "language_iso3" in hyperdata:
# retrieve lang if lang != [] and lang != ["unknown"]
# ---------------------------------------------------
if len(hyperdata["language_iso3"])>0 and hyperdata["language_iso3"][0] != "unknown" :
hyperdata["language_iso3"] = hyperdata["language_iso3"][0]
# default value = eng
# (possibly even better: langid.classify(abstract))
else:
# NB: 97% of ISTEX docs are in English, hence the default
# ----------------------------------------------
hyperdata["language_iso3"] = "eng"
# (cf. api.istex.fr/document/?q=*&facet=language
# and langid tests on documents where language=["unknown"])
if "publication_date" in hyperdata:
RealDate = hyperdata["publication_date"]
if "publication_date" in hyperdata:
hyperdata.pop("publication_date")
if isinstance(RealDate, list):
RealDate = RealDate[0]
# print( RealDate ," | length:",len(RealDate))
Decision=""
if len(RealDate)>4:
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y-%b-%d').date()
except:
try: Decision = datetime.strptime(RealDate, '%Y-%m-%d').date()
except: Decision=False
else:
try: Decision = datetime.strptime(RealDate, '%Y-%b').date()
except:
try: Decision = datetime.strptime(RealDate, '%Y-%m').date()
except: Decision=False
else:
try: Decision = datetime.strptime(RealDate, '%Y').date()
except: Decision=False
if Decision!=False:
hyperdata["publication_year"] = str(Decision.year)
hyperdata["publication_month"] = str(Decision.month)
hyperdata["publication_day"] = str(Decision.day)
hyperdata_list.append(hyperdata)
# print("\t||",hyperdata["title"])
# print("\t\t",Decision)
# print("=============================")
# else:
# suma+=1
# if "pubdate" in json_doc:
# print ("\tfail pubdate:",json_doc["pubdate"])
# print ("nb_hits:",len(json_docs))
# print("\t - nb_fails:",suma)
# print(" -- - - - - - -- - -")
return hyperdata_list
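# Examples of the date fallback chain above (illustrative values):
#   "2015-03-17" -> %Y-%m-%d -> year 2015, month 3, day 17
#   "2015-03"    -> %Y-%m    -> year 2015, month 3, day 1
#   "2015"       -> %Y       -> year 2015, month 1, day 1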
@@ -31,6 +31,7 @@ class PubmedParser(Parser):
 if isinstance(file, bytes):
 file = BytesIO(file)
 xml = etree.parse(file, parser=self.xml_parser)
+#print(xml.find("PubmedArticle"))
 xml_articles = xml.findall('PubmedArticle')
 # initialize the list of hyperdata
 hyperdata_list = []
...
from ._Parser import Parser
from gargantext.util.languages import languages
#from admin.utils import PrintException
class RepecParser(Parser):
# def __init__(self, language_cache=None):
#
# #super(Parser, self).__init__()
# #super(Parser, self).__init__()
# self._languages_cache = LanguagesCache() if language_cache is None else language_cache
_begin = 6
_parameters = {
b"ER": {"type": "delimiter"},
b"T1": {"type": "hyperdata", "key": "title", "separator": " "},
b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
b"A1": {"type": "hyperdata", "key": "authors", "separator": "\n"},
b"JO": {"type": "hyperdata", "key": "journal"},
b"UR": {"type": "hyperdata", "key": "doi"},
b"Y1": {"type": "hyperdata", "key": "publication_year"},
b"PD": {"type": "hyperdata", "key": "publication_month"},
b"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
b"LA": {"type": "hyperdata", "key": "language_iso2"},
b"N2": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"},
}
def parse(self, file):
hyperdata = {}
last_key = None
last_values = []
# browse every line of the file
for line in file:
if len(line) > 2 :
# extract the parameter key
parameter_key = line[:2]
if parameter_key != b' ' and parameter_key != last_key:
if last_key in self._parameters:
# translate the parameter key
parameter = self._parameters[last_key]
if parameter["type"] == "hyperdata":
separator = parameter["separator"] if "separator" in parameter else ""
if parameter["key"] == "publication_year":
hyperdata[parameter["key"]] = separator.join(last_values)[:4]
else:
hyperdata[parameter["key"]] = separator.join(last_values)
elif parameter["type"] == "delimiter":
if 'language_fullname' not in hyperdata.keys():
if 'language_iso3' not in hyperdata.keys():
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
hyperdata = {}
last_key = parameter_key
last_values = []
try:
last_values.append(line[self._begin:-1].decode())
except Exception as error:
print(error)
# if a hyperdata object is left in memory, yield it as well
if hyperdata:
yield hyperdata
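# Illustrative input (RIS-like tagged lines, with values starting at index 6
# as implied by _begin = 6; the sample values are assumptions):
#   T1  - A title of the paper
#   A1  - Some Author
#   Y1  - 2015
#   LA  - en
#   ER  -
# yields {"title": "A title of the paper", "authors": "Some Author",
#         "publication_year": "2015", "language_iso2": "en"}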
@@ -20,14 +20,9 @@ class Parser:
 self._file = file
 def __del__(self):
-self._file.close()
+if hasattr(self, '_file'):
+self._file.close()
-def detect_format(self, afile, a_formats):
-#import magic
-print("Detecting format")
-#print(magic.from_file(afile))
-return
 def detect_encoding(self, string):
 """Useful method to detect the encoding of a document.
@@ -167,6 +162,8 @@ class Parser:
 def __iter__(self, file=None):
 """Parse the file, and its children files found in the file.
+C24B comment: the file storage/extraction should be done upstream,
+and this method is a bit obscure
 """
 if file is None:
 file = self._file
...
-from .Ris import RISParser
-from .Ris_repec import RepecParser
-from .Isi import ISIParser
-# from .Jstor import JstorParser
-# from .Zotero import ZoteroParser
-from .Pubmed import PubmedParser
-# # 2015-12-08: parser 2 en 1
-from .Europress import EuropressParser
-from .ISTex import ISTexParser
-from .CSV import CSVParser
-from .Cern import CernParser
+import importlib
+from gargantext.constants import RESOURCETYPES
+from gargantext.settings import DEBUG
+#if DEBUG:
+# print("Loading available PARSERS:")
+base_parser = "gargantext.util.parsers"
+for resource in RESOURCETYPES:
+if resource["parser"] is not None:
+#the parser file name is the parser name without "Parser"
+try:
+fname = resource["parser"].replace("Parser", "")
+#the parser file is formatted as a title
+module = base_parser+".%s" %(fname.title())
+#the parser module is as declared in constants
+parser = importlib.import_module(module)
+#if DEBUG:
+# print("\t-", resource["parser"])
+#getattr(parser,resource["parser"])
+except Exception as e:
+print("Check constants.py %s \nLANGUAGES declaration of taggers. Parser %s is not available" %(str(e), resource["parser"]))