Commit 5160d178 authored by PkSM3

[UPDATE] merge conflict

parents 0853ddca 2644e642
export PGPASSWORD=C8kdcUrAQy66U
psql -U gargantua -d gargandb -f drop_db.sql
ALTER TABLE node_node RENAME metadata TO hyperdata ;
ALTER TABLE node_metadata RENAME TO node_hyperdata ;
ALTER TABLE node_node_metadata RENAME TO node_node_hyperdata ;
ALTER TABLE node_node_hyperdata RENAME metadata_id TO hyperdata_id ;
@@ -306,7 +306,6 @@ class Node(CTENode):
self.hyperdata['Processing'] = 0
self.save()
class Node_Hyperdata(models.Model):
node = models.ForeignKey(Node, on_delete=models.CASCADE)
hyperdata = models.ForeignKey(Hyperdata)
......
@@ -8,7 +8,7 @@ import dateutil.parser
from .FileParser import FileParser
from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser(FileParser):
@@ -29,8 +29,8 @@ class EuropressFileParser(FileParser):
if encoding != "utf-8":
try:
contents = contents.decode("latin1", errors='replace').encode(codif)
except Exception as error:
print(error)
except:
PrintException()
# try:
# contents = contents.decode(encoding, errors='replace').encode(codif)
# except Exception as error:
@@ -40,7 +40,7 @@ class EuropressFileParser(FileParser):
html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
try:
try :
format_europresse = 50
html_articles = html.xpath('/html/body/table/tbody')
@@ -51,15 +51,15 @@ class EuropressFileParser(FileParser):
if len(html_articles) < 1:
format_europresse = 1
html_articles = html.xpath('//div[@id="docContain"]')
except Exception as error:
print(error)
except :
PrintException()
if format_europresse == 50:
if format_europresse == 50 :
name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
header_xpath = "//span[@class = 'DocHeader']"
header_xpath = "./tr/td/span[@class = 'DocHeader']"
title_xpath = "string(./tr/td/span[@class = 'TitreArticleVisu'])"
text_xpath = "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
elif format_europresse == 1:
elif format_europresse == 1 :
name_xpath = "//span[@class = 'DocPublicationName']"
header_xpath = "//span[@class = 'DocHeader']"
title_xpath = "string(//div[@class = 'titreArticleVisu'])"
@@ -79,8 +79,8 @@ class EuropressFileParser(FileParser):
doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
except Exception as error:
print(error)
except Exception as error :
PrintException()
# parse all the articles, one by one
try:
@@ -99,7 +99,18 @@ class EuropressFileParser(FileParser):
else:
hyperdata['journal'] = name.text.encode(codif)
countbis = 0
for header in html_article.xpath(header_xpath):
# print(count)
# countbis += 1
# try:
# print('109', hyperdata['publication_date'])
# except:
# print('no date yet')
# pass
try:
text = header.text
#print("header", text)
@@ -145,12 +156,10 @@ class EuropressFileParser(FileParser):
hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
# hyperdata['publication_date'] = dateutil.parser.parse(text)
except Exception as error:
print(error)
print(text)
print(error, text)
pass
if test_date_en is not None:
localeEncoding = "en_GB.UTF-8"
locale.setlocale(locale.LC_ALL, localeEncoding)
@@ -168,6 +177,13 @@ class EuropressFileParser(FileParser):
if test_page is not None:
hyperdata['page'] = test_page.group(1).encode(codif)
# try:
# print('183', hyperdata['publication_date'])
# except:
# print('no date yet')
# pass
#
hyperdata['title'] = html_article.xpath(title_xpath).encode(codif)
hyperdata['abstract'] = html_article.xpath(text_xpath)
@@ -215,7 +231,7 @@ class EuropressFileParser(FileParser):
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
hyperdata.pop('publication_date')
#hyperdata.pop('publication_date')
if len(hyperdata['abstract'])>0 and format_europresse == 50:
hyperdata['doi'] = str(hyperdata['abstract'][-9])
......
import collections
import datetime
import dateutil.parser
import zipfile
import chardet
import re
from ..Caches import LanguagesCache
DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
class FileParser:
"""Base class for performing files parsing depending on their type.
"""
@@ -29,6 +34,21 @@ class FileParser:
"""
# First, check the split dates...
date_string = hyperdata.get('publication_date_to_parse', None)
if date_string is not None:
date_string = re.sub(r'\/\/+', '', date_string)
date_string = re.sub(r'undefined', '', date_string)
try:
hyperdata['publication' + "_date"] = dateutil.parser.parse(
date_string,
default=DEFAULT_DATE
).strftime("%Y-%m-%d %H:%M:%S")
except:
print('Parser Zotero, Date not parsed for:', date_string)
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
elif hyperdata.get('publication_year', None) is not None:
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
for prefix in prefixes:
date_string = hyperdata[prefix + "_year"]
@@ -51,11 +71,15 @@ class FileParser:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
except:
pass
else:
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# ...then parse all the "date" fields, to parse it into separate elements
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
for prefix in prefixes:
date = dateutil.parser.parse(hyperdata[prefix + "_date"])
#print(date)
hyperdata[prefix + "_year"] = date.strftime("%Y")
hyperdata[prefix + "_month"] = date.strftime("%m")
hyperdata[prefix + "_day"] = date.strftime("%d")
@@ -65,6 +89,7 @@ class FileParser:
# finally, return the transformed result!
return hyperdata
print(hyperdata['publication_date'])
def format_hyperdata_languages(self, hyperdata):
"""format the languages found in the hyperdata."""
......
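As an illustration of the date handling added to FileParser above (a standalone sketch, not part of this commit), a partial Zotero date such as "2014//" is stripped of the repeated slashes and then completed by dateutil using DEFAULT_DATE:

import datetime
import re
import dateutil.parser

DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)

date_string = "2014//"                                   # example value, invented
date_string = re.sub(r'\/\/+', '', date_string)          # -> "2014"
date_string = re.sub(r'undefined', '', date_string)
parsed = dateutil.parser.parse(date_string, default=DEFAULT_DATE)
print(parsed.strftime("%Y-%m-%d %H:%M:%S"))              # 2014-01-01 00:00:00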
@@ -3,6 +3,8 @@ from .FileParser import FileParser
from ..Caches import LanguagesCache
from admin.utils import PrintException
class RisFileParser(FileParser):
def __init__(self, language_cache=None):
@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
print(error)
# if a hyperdata object is left in memory, yield it as well
if hyperdata:
# try:
# if hyperdata['date_to_parse']:
# print(hyperdata['date_to_parse'])
# except:
# pass
#
#print(hyperdata['title'])
yield hyperdata
from .RisFileParser import RisFileParser
from ..Caches import LanguagesCache
class ZoteroFileParser(RisFileParser):
def __init__(self):
super(RisFileParser, self).__init__()
self._begin = 6
self._parameters = {
b"ER": {"type": "delimiter"},
b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
b"UR": {"type": "hyperdata", "key": "doi"},
b"DA": {"type": "hyperdata", "key": "publication_date_to_parse"},
b"PY": {"type": "hyperdata", "key": "publication_year"},
b"PD": {"type": "hyperdata", "key": "publication_month"},
b"LA": {"type": "hyperdata", "key": "language_iso2"},
b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"},
}
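As a rough illustration (field values invented, not part of this commit), the lines of a Zotero RIS export map to hyperdata keys through the _parameters table above; the value starts at offset 6, matching self._begin:

sample_record = [
    b"TI  - An example title",         # -> hyperdata['title']
    b"AU  - Doe, John",                # -> hyperdata['authors'] (joined with ", ")
    b"DA  - 2014//",                   # -> hyperdata['publication_date_to_parse']
    b"PY  - 2014",                     # -> hyperdata['publication_year']
    b"LA  - en",                       # -> hyperdata['language_iso2']
    b"AB  - An example abstract.",     # -> hyperdata['abstract'] (joined with " ")
    b"ER  - ",                         # record delimiter: the hyperdata dict is yielded
]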
from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
......
from collections import defaultdict
from datetime import datetime
from random import random
from hashlib import md5
from time import time
from math import log
from gargantext_web.db import *
from .parsers_config import parsers as _parsers
class DebugTime:
def __init__(self, prefix):
self.prefix = prefix
self.message = None
self.time = None
def __del__(self):
if self.message is not None and self.time is not None:
print('%s - %s: %.4f' % (self.prefix, self.message, time() - self.time))
def show(self, message):
self.__del__()
self.message = message
self.time = time()
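A hedged usage sketch of DebugTime (the label is invented): each call to show() prints the elapsed time of the previous step, then starts timing the next one.

dbg = DebugTime('Corpus #42 - demo')    # '#42' is a hypothetical corpus id
dbg.show('first step')                  # starts the first timer, prints nothing yet
# ... do some work ...
dbg.show('second step')                 # prints "Corpus #42 - demo - first step: <elapsed>"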
# keep all the parsers in a cache
class Parsers(defaultdict):
def __init__(self):
self._parsers = _parsers
def __missing__(self, key):
#print(self._parsers.keys())
if key not in self._parsers.keys():
raise NotImplementedError('No such parser: "%s"' % (key))
parser = self._parsers[key]()
self[key] = parser
return parser
parsers = Parsers()
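A minimal usage sketch of this cache (the key comes from parsers_config, shown at the end of this diff): the first lookup instantiates the parser through __missing__, later lookups return the cached instance.

parser = parsers['Pubmed (xml format)']           # built on first access via __missing__
assert parser is parsers['Pubmed (xml format)']   # second access hits the cache
# parsers['Unknown format'] would raise NotImplementedError('No such parser: ...')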
# resources management
def add_resource(corpus, **kwargs):
# only for tests
session = Session()
resource = Resource(guid=str(random()), **kwargs )
# User
if 'user_id' not in kwargs:
resource.user_id = corpus.user_id
# Compute the digest
h = md5()
f = open(str(resource.file), 'rb')
h.update(f.read())
f.close()
resource.digest = h.hexdigest()
# check if a resource on this node already has this hash
tmp_resource = (session
.query(Resource)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
.filter(Resource.digest == resource.digest)
.filter(Node_Resource.node_id == corpus.id)
).first()
if tmp_resource is not None:
return tmp_resource
else:
session.add(resource)
session.commit()
# link with the resource
node_resource = Node_Resource(
node_id = corpus.id,
resource_id = resource.id,
parsed = False,
)
session.add(node_resource)
session.commit()
# return result
return resource
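A hedged usage sketch (the file path and type_id value are hypothetical): add_resource hashes the file with md5 and, if a Resource with the same digest is already linked to this corpus node, returns it instead of inserting a duplicate.

resource = add_resource(corpus, file='/tmp/europresse_export.html', type_id=9)
duplicate = add_resource(corpus, file='/tmp/europresse_export.html', type_id=9)
# duplicate is the Resource created by the first call (same md5 digest on this node)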
def parse_resources(corpus, user=None, user_id=None):
dbg = DebugTime('Corpus #%d - parsing' % corpus.id)
session = Session()
corpus_id = corpus.id
type_id = cache.NodeType['Document'].id
if user_id is None and user is not None:
user_id = user.id
else:
user_id = corpus.user_id
# find resource of the corpus
resources_query = (session
.query(Resource, ResourceType)
.join(ResourceType, ResourceType.id == Resource.type_id)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
.filter(Node_Resource.node_id == corpus.id)
.filter(Node_Resource.parsed == False)
)
# make a new node for every parsed document of the corpus
dbg.show('analyze documents')
nodes = list()
for resource, resourcetype in resources_query:
parser = parsers[resourcetype.name]
for hyperdata_dict in parser.parse(resource.file):
# retrieve language ID from hyperdata
if 'language_iso2' in hyperdata_dict:
try:
language_id = cache.Language[hyperdata_dict['language_iso2']].id
except KeyError:
language_id = None
else:
language_id = None
# create new node
node = Node(
name = hyperdata_dict.get('title', '')[:200],
parent_id = corpus_id,
user_id = user_id,
type_id = type_id,
language_id = language_id,
hyperdata = hyperdata_dict,
date = datetime.utcnow(),
)
nodes.append(node)
#
# TODO: mark node-resources associations as parsed
#
dbg.show('insert %d documents' % len(nodes))
session.add_all(nodes)
session.commit()
# now, index the hyperdata
dbg.show('insert hyperdata')
node_hyperdata_lists = defaultdict(list)
hyperdata_types = {
hyperdata.name: hyperdata
for hyperdata in session.query(Hyperdata)
}
for node in nodes:
node_id = node.id
for hyperdata_key, hyperdata_value in node.hyperdata.items():
try:
hyperdata = hyperdata_types[hyperdata_key]
except KeyError:
# Why silent continue here ?
continue
if hyperdata.type == 'string':
hyperdata_value = hyperdata_value[:255]
node_hyperdata_lists[hyperdata.type].append((
node_id,
hyperdata.id,
hyperdata_value,
))
for key, values in node_hyperdata_lists.items():
bulk_insert(Node_Hyperdata, ['node_id', 'hyperdata_id', 'value_'+key], values)
# mark the corpus as parsed
corpus.parsed = True
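Together with the functions defined below, a hedged sketch of the full indexing chain for one corpus (the hyperdata keys passed to extract_ngrams are an assumption):

parse_resources(corpus)                          # one Document node per parsed record
extract_ngrams(corpus, ('title', 'abstract'))    # assumed keys; fills Node_Ngram
compute_tfidf(corpus)                            # fills NodeNodeNgram with tf-idf scores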
# ngrams extraction
from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
class NgramsExtractors(defaultdict):
def __init__(self):
# English
self['en'] = EnglishNgramsExtractor()
for key in ('eng', 'english'):
self[key] = self['en']
# French
self['fr'] = FrenchNgramsExtractor()
for key in ('fre', 'french'):
self[key] = self['fr']
# default
self['default'] = NgramsExtractor()
def __missing__(self, key):
formatted_key = key.strip().lower()
if formatted_key in self:
self[key] = self[formatted_key]
else:
self[key] = self['default']
# raise NotImplementedError
return self[key]
ngramsextractors = NgramsExtractors()
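A hedged sketch of the fallback behaviour implemented above: aliases are registered in __init__, and unknown or unnormalized keys are resolved by __missing__.

assert ngramsextractors['english'] is ngramsextractors['en']   # alias set in __init__
assert ngramsextractors[' FR '] is ngramsextractors['fr']      # __missing__ strips and lowercases
assert ngramsextractors['xx'] is ngramsextractors['default']   # unknown codes fall back to default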
def extract_ngrams(corpus, keys):
dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
# query the hyperdata associated with the given keys
columns = [Node.id, Node.language_id] + [Node.hyperdata[key] for key in keys]
hyperdata_query = (session
.query(*columns)
.filter(Node.parent_id == corpus.id)
.filter(Node.type_id == cache.NodeType['Document'].id)
)
# prepare data to be inserted
dbg.show('find ngrams')
languages_by_id = {
language.id: language.iso2
for language in session.query(Language)
}
ngrams_data = set()
ngrams_language_data = set()
ngrams_tag_data = set()
node_ngram_list = defaultdict(lambda: defaultdict(int))
for nodeinfo in hyperdata_query:
node_id = nodeinfo[0]
language_id = nodeinfo[1]
if language_id is None:
language_iso2 = default_language_iso2
else:
language_iso2 = languages_by_id.get(language_id, None)
if language_iso2 is None:
continue
ngramsextractor = ngramsextractors[language_iso2]
for text in nodeinfo[2:]:
if text is not None and len(text):
ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
for ngram in ngrams:
n = len(ngram)
terms = ' '.join([token for token, tag in ngram]).lower()
# TODO BUG here
if n == 1:
#tag_id = cache.Tag[ngram[0][1]].id
tag_id = 1
#print('tag_id', tag_id)
elif n > 1:
tag_id = 1
#tag_id = cache.Tag[ngram[0][1]].id
#tag_id = cache.Tag['NN'].id
#tag_id = 14
#print('tag_id_2', tag_id)
node_ngram_list[node_id][terms] += 1
ngrams_data.add((n, terms[:255]))
ngrams_language_data.add((terms, language_id))
ngrams_tag_data.add((terms, tag_id))
# insert ngrams to temporary table
dbg.show('find ids for the %d ngrams' % len(ngrams_data))
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__ngrams (
id INT,
n INT NOT NULL,
terms VARCHAR(255) NOT NULL
)
''')
bulk_insert('tmp__ngrams', ['n', 'terms'], ngrams_data, cursor=cursor)
# retrieve ngram ids from already inserted stuff
cursor.execute('''
UPDATE
tmp__ngrams
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngrams.terms
''' % (Ngram.__table__.name, ))
# insert, then get the ids back
cursor.execute('''
INSERT INTO
%s (n, terms)
SELECT
n, terms
FROM
tmp__ngrams
WHERE
id IS NULL
''' % (Ngram.__table__.name, ))
cursor.execute('''
UPDATE
tmp__ngrams
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngrams.terms
AND
tmp__ngrams.id IS NULL
''' % (Ngram.__table__.name, ))
# get all ids
ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngrams')
for row in cursor.fetchall():
ngram_ids[row[1]] = row[0]
#
dbg.show('insert associations')
node_ngram_data = list()
for node_id, ngrams in node_ngram_list.items():
for terms, weight in ngrams.items():
try:
ngram_id = ngram_ids[terms]
node_ngram_data.append((node_id, ngram_id, weight, ))
except Exception as e:
print("err01:",e)
bulk_insert(Node_Ngram, ['node_id', 'ngram_id', 'weight'], node_ngram_data, cursor=cursor)
dbg.message = 'insert %d associations' % len(node_ngram_data)
# commit to database
db.commit()
# tfidf calculation
def compute_tfidf(corpus):
dbg = DebugTime('Corpus #%d - tfidf' % corpus.id)
# compute terms frequency sum
dbg.show('calculate terms frequencies sums')
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__st (
node_id INT NOT NULL,
frequency DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__st (node_id, frequency)
SELECT
node_ngram.node_id,
SUM(node_ngram.weight) AS frequency
FROM
%s AS node
INNER JOIN
%s AS node_ngram ON node_ngram.node_id = node.id
WHERE
node.parent_id = %d
GROUP BY
node_ngram.node_id
''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
# compute normalized terms frequencies
dbg.show('normalize terms frequencies')
cursor.execute('''
CREATE TEMPORARY TABLE tmp__tf (
node_id INT NOT NULL,
ngram_id INT NOT NULL,
frequency DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__tf (node_id, ngram_id, frequency)
SELECT
node_ngram.node_id,
node_ngram.ngram_id,
(node_ngram.weight / node.frequency) AS frequency
FROM
%s AS node_ngram
INNER JOIN
tmp__st AS node ON node.node_id = node_ngram.node_id
''' % (Node_Ngram.__table__.name, ))
# show off
dbg.show('compute idf')
cursor.execute('''
CREATE TEMPORARY TABLE tmp__idf (
ngram_id INT NOT NULL,
idf DOUBLE PRECISION NOT NULL
)
''')
cursor.execute('''
INSERT INTO
tmp__idf(ngram_id, idf)
SELECT
node_ngram.ngram_id,
-ln(COUNT(*))
FROM
%s AS node
INNER JOIN
%s AS node_ngram ON node_ngram.node_id = node.id
WHERE
node.parent_id = %d
GROUP BY
node_ngram.ngram_id
''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
cursor.execute('SELECT COUNT(*) FROM tmp__st')
D = cursor.fetchone()[0]
if D>0:
lnD = log(D)
cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
# show off
dbg.show('insert tfidf for %d documents' % D)
cursor.execute('''
INSERT INTO
%s (nodex_id, nodey_id, ngram_id, score)
SELECT
%d AS nodex_id,
tf.node_id AS nodey_id,
tf.ngram_id AS ngram_id,
(tf.frequency * idf.idf) AS score
FROM
tmp__idf AS idf
INNER JOIN
tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
''' % (NodeNodeNgram.__table__.name, corpus.id, ))
# # show off
# cursor.execute('''
# SELECT
# node.name,
# ngram.terms,
# node_node_ngram.score AS tfidf
# FROM
# %s AS node_node_ngram
# INNER JOIN
# %s AS node ON node.id = node_node_ngram.nodey_id
# INNER JOIN
# %s AS ngram ON ngram.id = node_node_ngram.ngram_id
# WHERE
# node_node_ngram.nodex_id = %d
# ORDER BY
# score DESC
# ''' % (NodeNodeNgram.__table__.name, Node.__table__.name, Ngram.__table__.name, corpus.id, ))
# for row in cursor.fetchall():
# print(row)
# the end!
db.commit()
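As a hedged plain-Python restatement of what the SQL above stores (not part of this commit): for a document and an ngram, tf is the ngram's weight divided by the document's total weight, idf is ln(D / df) over the D documents of the corpus, and the recorded score is their product.

from math import log

def tfidf_score(weight, doc_total_weight, doc_count, docs_containing_ngram):
    tf = weight / doc_total_weight                  # tmp__tf above
    idf = log(doc_count / docs_containing_ngram)    # tmp__idf: -ln(df) + ln(D)
    return tf * idf                                 # value inserted into NodeNodeNgram.score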
@@ -4,12 +4,11 @@ parsers = {
'Pubmed (xml format)' : PubmedFileParser,
'Web of Science (ISI format)' : IsiFileParser,
'Scopus (RIS format)' : RisFileParser,
'Zotero (RIS format)' : JstorFileParser,
'Zotero (RIS format)' : ZoteroFileParser,
'Jstor (RIS format)' : JstorFileParser,
#'Europress' : EuropressFileParser,
'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser,
'CSVParser' : CSVParser,
}