Commit 5160d178 authored by PkSM3's avatar PkSM3

[UPDATE] merge conflict

parents 0853ddca 2644e642
export PGPASSWORD=C8kdcUrAQy66U
psql -U gargantua -d gargandb -f drop_db.sql
-- Rename "metadata" to "hyperdata" across the node tables:
-- the column on node_node, plus the two tables that carried
-- "metadata" in their name (and the FK column on the join table).
ALTER TABLE node_node RENAME metadata TO hyperdata ;
ALTER TABLE node_metadata RENAME TO node_hyperdata ;
ALTER TABLE node_node_metadata RENAME TO node_node_hyperdata ;
ALTER TABLE node_node_hyperdata RENAME metadata_id TO hyperdata_id ;
......@@ -43,13 +43,13 @@ class Language(models.Model):
iso3 = models.CharField(max_length=3, unique=True)
fullname = models.CharField(max_length=255, unique=True)
implemented = models.BooleanField(blank=True, default=True)
def __str__(self):
return self.fullname
class ResourceType(models.Model):
    """Lookup table naming the kinds of resources a Node can hold."""
    # Human-readable type label (e.g. a corpus/parser format name);
    # unique so each type exists exactly once.
    name = models.CharField(max_length=255, unique=True)

    def __str__(self):
        return self.name
......@@ -65,7 +65,7 @@ class Ngram(models.Model):
terms = models.CharField(max_length=255, unique=True)
nodes = models.ManyToManyField(through='Node_Ngram', to='Node')
tag = models.ManyToManyField(blank=True, null=True, through='NgramTag', to='Tag')
def __str__(self):
return self.terms
......@@ -120,21 +120,21 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
('value_'+hyperdata.type) : value,
}))
Node_Hyperdata.objects.bulk_create(data)
class NodeManager(CTENodeManager):
    """Methods available from Node.objects.

    Unknown attribute lookups are delegated to the queryset, so custom
    NodeQuerySet methods are reachable directly from the manager.
    """

    def get_queryset(self):
        self._ensure_parameters()
        return NodeQuerySet(self.model, using=self._db)

    def __getattr__(self, name, *args):
        # Never proxy private/dunder lookups: Django and pickle probe
        # such names, and forwarding them to get_queryset() can recurse.
        # (The source diff carried this condition twice; once suffices.)
        if name.startswith("_"):
            raise AttributeError(name)
        return getattr(self.get_queryset(), name, *args)
class Hyperdata(models.Model):
    """Declares a hyperdata field: its name and its value type.

    The ``type`` string selects which ``value_<type>`` column is used
    when rows are bulk-inserted into Node_Hyperdata (see the
    ``('value_' + hyperdata.type)`` key built in NodeQuerySet above).
    """
    # Unique field identifier, e.g. "title".
    name = models.CharField(max_length=32, unique=True)
    # Value-type tag; indexed because inserts/lookups key on it.
    type = models.CharField(max_length=16, db_index=True)
class Node(CTENode):
"""The node."""
objects = NodeManager()
......@@ -142,9 +142,9 @@ class Node(CTENode):
user = models.ForeignKey(User)
type = models.ForeignKey(NodeType)
name = models.CharField(max_length=255)
language = models.ForeignKey(Language, blank=True, null=True, on_delete=models.SET_NULL)
date = models.DateField(default=timezone.now, blank=True)
hyperdata = JsonBField(null=False, default={})
......@@ -152,7 +152,7 @@ class Node(CTENode):
def __str__(self):
return self.name
def get_resources(self):
return Resource.objects.select_related('node_resource').filter(node_resource__node = self)
......@@ -184,7 +184,7 @@ class Node(CTENode):
)
node_resource.save()
return resource
def parse_resources(self, verbose=False):
# parse all resources into a list of hyperdata
hyperdata_list = []
......@@ -204,7 +204,7 @@ class Node(CTENode):
# 'europress_french' : EuropressFileParser,
# 'europress_english' : EuropressFileParser,
# }
)[resource.type.name]()
hyperdata_list += parser.parse(str(resource.file))
type_id = NodeType.objects.get(name='Document').id
......@@ -293,7 +293,7 @@ class Node(CTENode):
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()")
start = time.time()
print("In workflow() do_tfidf()")
from analysis.functions import do_tfidf
......@@ -305,7 +305,6 @@ class Node(CTENode):
print("In workflow() END")
self.hyperdata['Processing'] = 0
self.save()
class Node_Hyperdata(models.Model):
node = models.ForeignKey(Node, on_delete=models.CASCADE)
......@@ -320,7 +319,7 @@ class Node_Resource(models.Model):
node = models.ForeignKey(Node, related_name='node_resource', on_delete=models.CASCADE)
resource = models.ForeignKey(Resource, on_delete=models.CASCADE)
parsed = models.BooleanField(default=False)
class Node_Ngram(models.Model):
node = models.ForeignKey(Node, on_delete=models.CASCADE)
ngram = models.ForeignKey(Ngram)
......@@ -355,7 +354,7 @@ class Document(Node):
class NodeNgramNgram(models.Model):
node = models.ForeignKey(Node, on_delete=models.CASCADE)
ngramx = models.ForeignKey(Ngram, related_name="nodengramngramx", on_delete=models.CASCADE)
ngramy = models.ForeignKey(Ngram, related_name="nodengramngramy", on_delete=models.CASCADE)
......@@ -368,7 +367,7 @@ class NodeNgramNgram(models.Model):
class NodeNodeNgram(models.Model):
nodex = models.ForeignKey(Node, related_name="nodex", on_delete=models.CASCADE)
nodey = models.ForeignKey(Node, related_name="nodey", on_delete=models.CASCADE)
ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
score = models.FloatField(default=0)
......@@ -387,7 +386,7 @@ class NgramNgram(models.Model):
class Group(models.Model):
'''
The creator of the group is a user who
The creator of the group is a user who
- is in it
- has all access by default
'''
......@@ -407,7 +406,7 @@ class UserGroup(models.Model):
user = models.ForeignKey(User)
group = models.ForeignKey(Group)
rights = models.CharField(max_length=1, unique=True)
def __str__(self):
    """Readable representation of the membership: user and group.

    Fix: the original ``return self.user, self.group`` returned a
    tuple, and ``__str__`` must return a str — ``str(instance)``
    would raise TypeError.
    """
    return '{} {}'.format(self.user, self.group)
......@@ -420,7 +419,7 @@ class NodeGroup(models.Model):
node = models.ForeignKey(Node)
group = models.ForeignKey(Group)
rights = models.CharField(max_length=1, unique=True)
def __str__(self):
    """Readable representation: node, group and rights.

    Fix: the original returned a tuple, and ``__str__`` must return
    a str — ``str(instance)`` would raise TypeError.
    """
    return '{} {} {}'.format(self.node, self.group, self.rights)
......
......@@ -8,10 +8,10 @@ import dateutil.parser
from .FileParser import FileParser
from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser(FileParser):
def _parse(self, file):
localeEncoding = "fr_FR"
......@@ -29,8 +29,8 @@ class EuropressFileParser(FileParser):
if encoding != "utf-8":
try:
contents = contents.decode("latin1", errors='replace').encode(codif)
except Exception as error:
print(error)
except:
PrintException()
# try:
# contents = contents.decode(encoding, errors='replace').encode(codif)
# except Exception as error:
......@@ -39,31 +39,31 @@ class EuropressFileParser(FileParser):
try:
html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
try:
try :
format_europresse = 50
html_articles = html.xpath('/html/body/table/tbody')
if len(html_articles) < 1:
html_articles = html.xpath('/html/body/table')
if len(html_articles) < 1:
format_europresse = 1
html_articles = html.xpath('//div[@id="docContain"]')
except Exception as error:
print(error)
if format_europresse == 50:
name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
header_xpath = "//span[@class = 'DocHeader']"
title_xpath = "string(./tr/td/span[@class = 'TitreArticleVisu'])"
text_xpath = "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
elif format_europresse == 1:
name_xpath = "//span[@class = 'DocPublicationName']"
header_xpath = "//span[@class = 'DocHeader']"
title_xpath = "string(//div[@class = 'titreArticleVisu'])"
text_xpath = "./descendant::*[\
except :
PrintException()
if format_europresse == 50 :
name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
header_xpath = "./tr/td/span[@class = 'DocHeader']"
title_xpath = "string(./tr/td/span[@class = 'TitreArticleVisu'])"
text_xpath = "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
elif format_europresse == 1 :
name_xpath = "//span[@class = 'DocPublicationName']"
header_xpath = "//span[@class = 'DocHeader']"
title_xpath = "string(//div[@class = 'titreArticleVisu'])"
text_xpath = "./descendant::*[\
not(\
self::div[@class='Doc-SourceText'] \
or self::span[@class='DocHeader'] \
......@@ -77,17 +77,17 @@ class EuropressFileParser(FileParser):
or self::td[@class='txtCertificat'] \
)]/text()"
doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
except Exception as error:
print(error)
except Exception as error :
PrintException()
# parse all the articles, one by one
try:
for html_article in html_articles:
hyperdata = {}
if len(html_article):
for name in html_article.xpath(name_xpath):
if name.text is not None:
......@@ -99,14 +99,25 @@ class EuropressFileParser(FileParser):
else:
hyperdata['journal'] = name.text.encode(codif)
countbis = 0
for header in html_article.xpath(header_xpath):
# print(count)
# countbis += 1
# try:
# print('109', hyperdata['publication_date'])
# except:
# print('no date yet')
# pass
try:
text = header.text
#print("header", text)
except Exception as error:
print(error)
if isinstance(text, bytes):
text = text.decode(encoding)
format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
......@@ -123,9 +134,9 @@ class EuropressFileParser(FileParser):
test_date_en = None
test_sect = None
test_page = None
if test_date_fr is not None:
self.localeEncoding = "fr_FR"
locale.setlocale(locale.LC_ALL, localeEncoding)
......@@ -145,12 +156,10 @@ class EuropressFileParser(FileParser):
hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
# hyperdata['publication_date'] = dateutil.parser.parse(text)
except Exception as error:
print(error)
print(text)
print(error, text)
pass
if test_date_en is not None:
localeEncoding = "en_GB.UTF-8"
locale.setlocale(locale.LC_ALL, localeEncoding)
......@@ -164,13 +173,20 @@ class EuropressFileParser(FileParser):
if test_sect is not None:
hyperdata['section'] = test_sect.group(1).encode(codif)
if test_page is not None:
hyperdata['page'] = test_page.group(1).encode(codif)
# try:
# print('183', hyperdata['publication_date'])
# except:
# print('no date yet')
# pass
#
hyperdata['title'] = html_article.xpath(title_xpath).encode(codif)
hyperdata['abstract'] = html_article.xpath(text_xpath)
line = 0
br_tag = 10
for i in html_articles[count].iter():
......@@ -189,13 +205,13 @@ class EuropressFileParser(FileParser):
hyperdata['authors'] = 'not found'
line = 0
br_tag = 10
try:
if hyperdata['publication_date'] is not None or hyperdata['publication_date'] != '':
try:
back = hyperdata['publication_date']
except Exception as e:
except Exception as e:
#print(e)
pass
else:
......@@ -210,14 +226,14 @@ class EuropressFileParser(FileParser):
#hyperdata['language_iso2'] = 'fr'
#elif lang == 'en':
# hyperdata['language_iso2'] = 'en'
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
hyperdata.pop('publication_date')
if len(hyperdata['abstract'])>0 and format_europresse == 50:
#hyperdata.pop('publication_date')
if len(hyperdata['abstract'])>0 and format_europresse == 50:
hyperdata['doi'] = str(hyperdata['abstract'][-9])
hyperdata['abstract'].pop()
# Here add separator for paragraphs
......@@ -229,15 +245,15 @@ class EuropressFileParser(FileParser):
# Here add separator for paragraphs
hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))
else:
else:
hyperdata['doi'] = "not found"
hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
hyperdata['length_letters'] = len(hyperdata['abstract'])
hyperdata['bdd'] = u'europresse'
hyperdata['url'] = u''
#hyperdata_str = {}
for key, value in hyperdata.items():
hyperdata[key] = value.decode() if isinstance(value, bytes) else value
......
import collections
import datetime
import dateutil.parser
import zipfile
import chardet
import re
from ..Caches import LanguagesCache
DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
class FileParser:
"""Base class for performing files parsing depending on their type.
"""
def __init__(self, language_cache=None):
    """Create a parser.

    language_cache -- optional shared LanguagesCache instance; a new
    one is built when not supplied, so several parsers can share a
    single language lookup cache.
    """
    self._languages_cache = LanguagesCache() if language_cache is None else language_cache
def detect_encoding(self, string):
    """Best-effort detection of a byte string's character encoding.

    Uses chardet and falls back to 'UTF-8'. Fix: chardet reports
    ``{'encoding': None}`` for undecidable input, and
    ``dict.get('encoding', 'UTF-8')`` does not cover a key that is
    present but None — ``or`` handles both the missing and the
    None/empty cases.
    """
    encoding = chardet.detect(string)
    return encoding.get('encoding') or 'UTF-8'
def format_hyperdata_dates(self, hyperdata):
"""Format the dates found in the hyperdata.
Examples:
......@@ -27,45 +32,65 @@ class FileParser:
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
"""
# First, check the split dates...
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
for prefix in prefixes:
date_string = hyperdata[prefix + "_year"]
key = prefix + "_month"
if key in hyperdata:
date_string += " " + hyperdata[key]
key = prefix + "_day"
date_string = hyperdata.get('publication_date_to_parse', None)
if date_string is not None:
date_string = re.sub(r'\/\/+', '', date_string)
date_string = re.sub(r'undefined', '', date_string)
try:
hyperdata['publication' + "_date"] = dateutil.parser.parse(
date_string,
default=DEFAULT_DATE
).strftime("%Y-%m-%d %H:%M:%S")
except:
print('Parser Zotero, Date not parsed for:', date_string)
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
elif hyperdata.get('publication_year', None) is not None:
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
for prefix in prefixes:
date_string = hyperdata[prefix + "_year"]
key = prefix + "_month"
if key in hyperdata:
date_string += " " + hyperdata[key]
key = prefix + "_hour"
key = prefix + "_day"
if key in hyperdata:
date_string += " " + hyperdata[key]
key = prefix + "_minute"
key = prefix + "_hour"
if key in hyperdata:
date_string += ":" + hyperdata[key]
key = prefix + "_second"
date_string += " " + hyperdata[key]
key = prefix + "_minute"
if key in hyperdata:
date_string += ":" + hyperdata[key]
try:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
except:
pass
key = prefix + "_second"
if key in hyperdata:
date_string += ":" + hyperdata[key]
try:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
except:
pass
else:
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# ...then parse all the "date" fields, to parse it into separate elements
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
for prefix in prefixes:
date = dateutil.parser.parse(hyperdata[prefix + "_date"])
#print(date)
hyperdata[prefix + "_year"] = date.strftime("%Y")
hyperdata[prefix + "_month"] = date.strftime("%m")
hyperdata[prefix + "_day"] = date.strftime("%d")
hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S")
# finally, return the transformed result!
return hyperdata
print(hyperdata['publication_date'])
def format_hyperdata_languages(self, hyperdata):
"""format the languages found in the hyperdata."""
language = None
......@@ -81,18 +106,18 @@ class FileParser:
hyperdata["language_iso3"] = language.iso3
hyperdata["language_fullname"] = language.fullname
return hyperdata
def format_hyperdata(self, hyperdata):
"""Format the hyperdata."""
hyperdata = self.format_hyperdata_dates(hyperdata)
hyperdata = self.format_hyperdata_languages(hyperdata)
return hyperdata
def _parse(self, file):
"""This method shall be overriden by inherited classes."""
return list()
def parse(self, file):
"""Parse the file, and its children files found in the file.
"""
......
......@@ -3,15 +3,17 @@ from .FileParser import FileParser
from ..Caches import LanguagesCache
from admin.utils import PrintException
class RisFileParser(FileParser):
def __init__(self, language_cache=None):
super(FileParser, self).__init__()
self._languages_cache = LanguagesCache() if language_cache is None else language_cache
self._begin = 6
self._parameters = {
b"ER": {"type": "delimiter"},
b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
......@@ -24,7 +26,7 @@ class RisFileParser(FileParser):
b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"},
}
def _parse(self, file):
hyperdata = {}
......@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
print(error)
# if a hyperdata object is left in memory, yield it as well
if hyperdata:
# try:
# if hyperdata['date_to_parse']:
# print(hyperdata['date_to_parse'])
# except:
# pass
#
#print(hyperdata['title'])
yield hyperdata
from .RisFileParser import RisFileParser
from ..Caches import LanguagesCache
class ZoteroFileParser(RisFileParser):
    """RIS-format parser variant for files exported from Zotero."""

    def __init__(self):
        # NOTE(review): super(RisFileParser, self) deliberately skips
        # RisFileParser.__init__ and runs FileParser.__init__ instead
        # (which sets _languages_cache); _begin and _parameters are
        # redefined below anyway. Confirm this was intentional before
        # changing it to a plain super().__init__().
        super(RisFileParser, self).__init__()
        # Number of leading characters before a tag's value on a line.
        self._begin = 6
        # RIS tag -> handling rule. "delimiter" ends a record;
        # "hyperdata" stores the value under "key" ("separator" is
        # presumably used to join repeated tags — verify in _parse).
        self._parameters = {
            b"ER": {"type": "delimiter"},
            b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
            b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
            b"UR": {"type": "hyperdata", "key": "doi"},
            b"DA": {"type": "hyperdata", "key": "publication_date_to_parse"},
            b"PY": {"type": "hyperdata", "key": "publication_year"},
            b"PD": {"type": "hyperdata", "key": "publication_month"},
            b"LA": {"type": "hyperdata", "key": "language_iso2"},
            b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
            b"WC": {"type": "hyperdata", "key": "fields"},
        }
from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
......
This diff is collapsed.
......@@ -4,12 +4,11 @@ parsers = {
'Pubmed (xml format)' : PubmedFileParser,
'Web of Science (ISI format)' : IsiFileParser,
'Scopus (RIS format)' : RisFileParser,
'Zotero (RIS format)' : JstorFileParser,
'Zotero (RIS format)' : ZoteroFileParser,
'Jstor (RIS format)' : JstorFileParser,
#'Europress' : EuropressFileParser,
'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser,
'CSVParser' : CSVParser,
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment