Commit 5160d178 authored by PkSM3

[UPDATE] merge conflict

parents 0853ddca 2644e642
export PGPASSWORD=C8kdcUrAQy66U
psql -U gargantua -d gargandb -f drop_db.sql
ALTER TABLE node_node RENAME COLUMN metadata TO hyperdata;
ALTER TABLE node_metadata RENAME TO node_hyperdata;
ALTER TABLE node_node_metadata RENAME TO node_node_hyperdata;
ALTER TABLE node_node_hyperdata RENAME COLUMN metadata_id TO hyperdata_id;
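These renames should succeed or fail together. A minimal sketch of running them in one transaction, assuming a local PostgreSQL instance and the psycopg2 driver (both assumptions, mirroring the psql call above):

import psycopg2

# Illustrative only: connection details copied from the psql command above.
RENAMES = [
    "ALTER TABLE node_node RENAME COLUMN metadata TO hyperdata",
    "ALTER TABLE node_metadata RENAME TO node_hyperdata",
    "ALTER TABLE node_node_metadata RENAME TO node_node_hyperdata",
    "ALTER TABLE node_node_hyperdata RENAME COLUMN metadata_id TO hyperdata_id",
]

conn = psycopg2.connect(dbname="gargandb", user="gargantua", password="C8kdcUrAQy66U")
try:
    with conn:                          # commits on success, rolls back on error
        with conn.cursor() as cursor:
            for statement in RENAMES:
                cursor.execute(statement)
finally:
    conn.close()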
@@ -306,7 +306,6 @@ class Node(CTENode):
        self.hyperdata['Processing'] = 0
        self.save()
class Node_Hyperdata(models.Model):
    node = models.ForeignKey(Node, on_delete=models.CASCADE)
    hyperdata = models.ForeignKey(Hyperdata)
...
@@ -8,7 +8,7 @@ import dateutil.parser
from .FileParser import FileParser
from ..NgramsExtractors import *
+from admin.utils import PrintException
class EuropressFileParser(FileParser):
@@ -29,8 +29,8 @@ class EuropressFileParser(FileParser):
        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
-            except Exception as error:
-                print(error)
+            except:
+                PrintException()
            # try:
            #     contents = contents.decode(encoding, errors='replace').encode(codif)
            # except Exception as error:
@@ -40,7 +40,7 @@ class EuropressFileParser(FileParser):
        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
-        try:
+        try :
            format_europresse = 50
            html_articles = html.xpath('/html/body/table/tbody')
@@ -51,15 +51,15 @@ class EuropressFileParser(FileParser):
            if len(html_articles) < 1:
                format_europresse = 1
                html_articles = html.xpath('//div[@id="docContain"]')
-        except Exception as error:
-            print(error)
+        except :
+            PrintException()
-        if format_europresse == 50:
+        if format_europresse == 50 :
            name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
-            header_xpath = "//span[@class = 'DocHeader']"
+            header_xpath = "./tr/td/span[@class = 'DocHeader']"
            title_xpath = "string(./tr/td/span[@class = 'TitreArticleVisu'])"
            text_xpath = "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
-        elif format_europresse == 1:
+        elif format_europresse == 1 :
            name_xpath = "//span[@class = 'DocPublicationName']"
            header_xpath = "//span[@class = 'DocHeader']"
            title_xpath = "string(//div[@class = 'titreArticleVisu'])"
@@ -79,8 +79,8 @@ class EuropressFileParser(FileParser):
            doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
-        except Exception as error:
-            print(error)
+        except Exception as error :
+            PrintException()
        # parse all the articles, one by one
        try:
@@ -99,7 +99,18 @@ class EuropressFileParser(FileParser):
                else:
                    hyperdata['journal'] = name.text.encode(codif)
+                countbis = 0
                for header in html_article.xpath(header_xpath):
+                    # print(count)
+                    # countbis += 1
+                    # try:
+                    #     print('109', hyperdata['publication_date'])
+                    # except:
+                    #     print('no date yet')
+                    #     pass
                    try:
                        text = header.text
                        #print("header", text)
@@ -145,12 +156,10 @@ class EuropressFileParser(FileParser):
                            hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
                            # hyperdata['publication_date'] = dateutil.parser.parse(text)
                        except Exception as error:
-                            print(error)
-                            print(text)
+                            print(error, text)
                            pass
                    if test_date_en is not None:
                        localeEncoding = "en_GB.UTF-8"
                        locale.setlocale(locale.LC_ALL, localeEncoding)
@@ -168,6 +177,13 @@ class EuropressFileParser(FileParser):
                    if test_page is not None:
                        hyperdata['page'] = test_page.group(1).encode(codif)
+                # try:
+                #     print('183', hyperdata['publication_date'])
+                # except:
+                #     print('no date yet')
+                #     pass
+                #
                hyperdata['title'] = html_article.xpath(title_xpath).encode(codif)
                hyperdata['abstract'] = html_article.xpath(text_xpath)
@@ -215,7 +231,7 @@ class EuropressFileParser(FileParser):
                hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
-                hyperdata.pop('publication_date')
+                #hyperdata.pop('publication_date')
                if len(hyperdata['abstract'])>0 and format_europresse == 50:
                    hyperdata['doi'] = str(hyperdata['abstract'][-9])
...
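The Europress hunks above mostly swap print(error) for PrintException() and narrow header_xpath for the newer export format; the date extraction itself still relies on a locale-dependent strptime('%d %B %Y'), as in the hunk at line 145 above. A minimal standalone sketch of that step, assuming the fr_FR.UTF-8 and en_GB.UTF-8 locales are installed on the host (function name and sample strings are illustrative, not the parser's API; the parser in the diff sets LC_ALL rather than LC_TIME):

import locale
from datetime import datetime

def parse_header_date(text, locale_name):
    """Parse a '%d %B %Y' header date whose month name depends on the active locale."""
    previous = locale.setlocale(locale.LC_TIME)        # remember the current LC_TIME
    locale.setlocale(locale.LC_TIME, locale_name)
    try:
        return datetime.strptime(text, '%d %B %Y')
    finally:
        locale.setlocale(locale.LC_TIME, previous)     # restore the previous locale

print(parse_header_date('14 mars 2015', 'fr_FR.UTF-8'))    # -> 2015-03-14 00:00:00
print(parse_header_date('14 March 2015', 'en_GB.UTF-8'))   # -> 2015-03-14 00:00:00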
import collections
+import datetime
import dateutil.parser
import zipfile
import chardet
+import re
from ..Caches import LanguagesCache
+DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
class FileParser:
    """Base class for performing files parsing depending on their type.
    """
@@ -29,6 +34,21 @@ class FileParser:
        """
        # First, check the split dates...
+        date_string = hyperdata.get('publication_date_to_parse', None)
+        if date_string is not None:
+            date_string = re.sub(r'\/\/+', '', date_string)
+            date_string = re.sub(r'undefined', '', date_string)
+            try:
+                hyperdata['publication' + "_date"] = dateutil.parser.parse(
+                    date_string,
+                    default=DEFAULT_DATE
+                ).strftime("%Y-%m-%d %H:%M:%S")
+            except:
+                print('Parser Zotero, Date not parsed for:', date_string)
+                hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        elif hyperdata.get('publication_year', None) is not None:
            prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
            for prefix in prefixes:
                date_string = hyperdata[prefix + "_year"]
@@ -51,11 +71,15 @@ class FileParser:
                    hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
                except:
                    pass
+        else:
+            hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # ...then parse all the "date" fields, to parse it into separate elements
        prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
        for prefix in prefixes:
            date = dateutil.parser.parse(hyperdata[prefix + "_date"])
+            #print(date)
            hyperdata[prefix + "_year"] = date.strftime("%Y")
            hyperdata[prefix + "_month"] = date.strftime("%m")
            hyperdata[prefix + "_day"] = date.strftime("%d")
@@ -65,6 +89,7 @@ class FileParser:
        # finally, return the transformed result!
        return hyperdata
+        print(hyperdata['publication_date'])
    def format_hyperdata_languages(self, hyperdata):
        """format the languages found in the hyperdata."""
...
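The FileParser hunks above add a second path for dates: a raw 'publication_date_to_parse' string (as delivered by Zotero RIS exports) is cleaned and parsed with dateutil, the current time serves as a last resort, and every '*_date' field is then expanded into year/month/day parts. A minimal standalone sketch of that flow, assuming only datetime, re and dateutil (the function name and sample dict are illustrative, not the project's API):

import datetime
import re

import dateutil.parser

DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)

def normalize_dates(hyperdata):
    """Illustrative re-implementation of the date handling added in the diff above."""
    raw = hyperdata.get('publication_date_to_parse')
    if raw is not None:
        raw = re.sub(r'//+', '', raw)            # strip Zotero's '//' filler
        raw = re.sub(r'undefined', '', raw)
        try:
            parsed = dateutil.parser.parse(raw, default=DEFAULT_DATE)
        except (ValueError, OverflowError):
            parsed = datetime.datetime.now()     # unparsable date: fall back to "now"
        hyperdata['publication_date'] = parsed.strftime("%Y-%m-%d %H:%M:%S")
    elif hyperdata.get('publication_year') is not None:
        hyperdata['publication_date'] = dateutil.parser.parse(
            hyperdata['publication_year'], default=DEFAULT_DATE
        ).strftime("%Y-%m-%d %H:%M:%S")
    else:
        hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # expand every "*_date" field into separate year/month/day fields
    for prefix in [key[:-5] for key in list(hyperdata) if key.endswith("_date")]:
        date = dateutil.parser.parse(hyperdata[prefix + "_date"])
        hyperdata[prefix + "_year"] = date.strftime("%Y")
        hyperdata[prefix + "_month"] = date.strftime("%m")
        hyperdata[prefix + "_day"] = date.strftime("%d")
    return hyperdata

print(normalize_dates({'publication_date_to_parse': '2015//03//14'}))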
@@ -3,6 +3,8 @@ from .FileParser import FileParser
from ..Caches import LanguagesCache
+from admin.utils import PrintException
class RisFileParser(FileParser):
    def __init__(self, language_cache=None):
@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
                    print(error)
        # if a hyperdata object is left in memory, yield it as well
        if hyperdata:
+            # try:
+            #     if hyperdata['date_to_parse']:
+            #         print(hyperdata['date_to_parse'])
+            # except:
+            #     pass
+            #
            #print(hyperdata['title'])
            yield hyperdata
from .RisFileParser import RisFileParser
from ..Caches import LanguagesCache

class ZoteroFileParser(RisFileParser):
    def __init__(self):
        super(RisFileParser, self).__init__()
        self._begin = 6
        self._parameters = {
            b"ER": {"type": "delimiter"},
            b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
            b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
            b"UR": {"type": "hyperdata", "key": "doi"},
            b"DA": {"type": "hyperdata", "key": "publication_date_to_parse"},
            b"PY": {"type": "hyperdata", "key": "publication_year"},
            b"PD": {"type": "hyperdata", "key": "publication_month"},
            b"LA": {"type": "hyperdata", "key": "language_iso2"},
            b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
            b"WC": {"type": "hyperdata", "key": "fields"},
        }
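The new ZoteroFileParser only swaps in a Zotero-flavoured tag table: each two-letter RIS tag is mapped onto a hyperdata key, and the DA value is deliberately routed to publication_date_to_parse so the FileParser date normalization above can clean it up. A standalone sketch of how such a table can drive line-by-line RIS parsing (the helper below is illustrative and is not the project's RisFileParser implementation):

PARAMETERS = {
    b"ER": {"type": "delimiter"},
    b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
    b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
    b"DA": {"type": "hyperdata", "key": "publication_date_to_parse"},
}

def parse_ris(lines, begin=6):
    """Yield one hyperdata dict per RIS record, driven by the tag table above."""
    hyperdata = {}
    for line in lines:
        tag, value = line[:2], line[begin:].strip().decode('utf-8')
        parameter = PARAMETERS.get(tag)
        if parameter is None:
            continue
        if parameter["type"] == "delimiter":          # "ER" closes a record
            if hyperdata:
                yield hyperdata
                hyperdata = {}
        elif parameter["type"] == "hyperdata":
            key = parameter["key"]
            if key in hyperdata and "separator" in parameter:
                hyperdata[key] += parameter["separator"] + value
            else:
                hyperdata[key] = value
    if hyperdata:                                     # trailing record without "ER"
        yield hyperdata

sample = [b"TI  - A title", b"AU  - Doe, J.", b"DA  - 2015//03//01", b"ER  - "]
print(list(parse_ris(sample)))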
from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
+from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
...
@@ -4,12 +4,11 @@ parsers = {
    'Pubmed (xml format)'         : PubmedFileParser,
    'Web of Science (ISI format)' : IsiFileParser,
    'Scopus (RIS format)'         : RisFileParser,
-   'Zotero (RIS format)'         : JstorFileParser,
+   'Zotero (RIS format)'         : ZoteroFileParser,
    'Jstor (RIS format)'          : JstorFileParser,
    #'Europress'                  : EuropressFileParser,
    'Europress (French)'          : EuropressFileParser,
    'Europress (English)'         : EuropressFileParser,
    'CSVParser'                   : CSVParser,
}
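With the table fixed, a 'Zotero (RIS format)' resource now dispatches to the dedicated parser instead of JstorFileParser. A self-contained sketch of that dictionary dispatch (the stand-in classes are placeholders for the project's real parser classes):

class RisFileParser:                      # stand-in for the project's RisFileParser
    pass

class ZoteroFileParser(RisFileParser):    # stand-in for the new parser
    pass

parsers = {
    'Scopus (RIS format)': RisFileParser,
    'Zotero (RIS format)': ZoteroFileParser,
}

def get_parser(resourcetype):
    """Look up and instantiate the parser class registered for a resource type."""
    return parsers[resourcetype]()

print(type(get_parser('Zotero (RIS format)')).__name__)   # -> ZoteroFileParser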