Commit d37bdbd3 authored by Administrator

Merge branch 'testing' into prod-dev

parents a1b68438 407b96ab
......@@ -11,7 +11,7 @@ from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser(FileParser):
def _parse(self, file):
localeEncoding = "fr_FR"
......@@ -39,21 +39,21 @@ class EuropressFileParser(FileParser):
try:
html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
try :
format_europresse = 50
html_articles = html.xpath('/html/body/table/tbody')
if len(html_articles) < 1:
html_articles = html.xpath('/html/body/table')
if len(html_articles) < 1:
format_europresse = 1
html_articles = html.xpath('//div[@id="docContain"]')
except :
PrintException()
if format_europresse == 50 :
name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
header_xpath = "./tr/td/span[@class = 'DocHeader']"
......@@ -77,7 +77,7 @@ class EuropressFileParser(FileParser):
or self::td[@class='txtCertificat'] \
)]/text()"
doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
except Exception as error :
PrintException()
......@@ -85,9 +85,9 @@ class EuropressFileParser(FileParser):
# parse all the articles, one by one
try:
for html_article in html_articles:
hyperdata = {}
if len(html_article):
for name in html_article.xpath(name_xpath):
if name.text is not None:
......@@ -98,26 +98,26 @@ class EuropressFileParser(FileParser):
hyperdata['volume'] = test_journal.group(2)
else:
hyperdata['journal'] = name.text.encode(codif)
countbis = 0
for header in html_article.xpath(header_xpath):
# print(count)
# countbis += 1
# try:
# print('109', hyperdata['publication_date'])
# except:
# print('no date yet')
# pass
try:
text = header.text
#print("header", text)
except Exception as error:
print(error)
if isinstance(text, bytes):
text = text.decode(encoding)
format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
......@@ -134,9 +134,9 @@ class EuropressFileParser(FileParser):
test_date_en = None
test_sect = None
test_page = None
if test_date_fr is not None:
self.localeEncoding = "fr_FR"
locale.setlocale(locale.LC_ALL, localeEncoding)
......@@ -158,7 +158,7 @@ class EuropressFileParser(FileParser):
except Exception as error:
print(error, text)
pass
if test_date_en is not None:
localeEncoding = "en_GB.UTF-8"
......@@ -173,20 +173,20 @@ class EuropressFileParser(FileParser):
if test_sect is not None:
hyperdata['section'] = test_sect.group(1).encode(codif)
if test_page is not None:
hyperdata['page'] = test_page.group(1).encode(codif)
try:
print('183', hyperdata['publication_date'])
except:
print('no date yet')
pass
# try:
# print('183', hyperdata['publication_date'])
# except:
# print('no date yet')
# pass
#
hyperdata['title'] = html_article.xpath(title_xpath).encode(codif)
hyperdata['abstract'] = html_article.xpath(text_xpath)
line = 0
br_tag = 10
for i in html_articles[count].iter():
......@@ -205,13 +205,13 @@ class EuropressFileParser(FileParser):
hyperdata['authors'] = 'not found'
line = 0
br_tag = 10
try:
if hyperdata['publication_date'] is not None or hyperdata['publication_date'] != '':
try:
back = hyperdata['publication_date']
except Exception as e:
except Exception as e:
#print(e)
pass
else:
......@@ -226,14 +226,14 @@ class EuropressFileParser(FileParser):
#hyperdata['language_iso2'] = 'fr'
#elif lang == 'en':
# hyperdata['language_iso2'] = 'en'
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
#hyperdata.pop('publication_date')
if len(hyperdata['abstract'])>0 and format_europresse == 50:
if len(hyperdata['abstract'])>0 and format_europresse == 50:
hyperdata['doi'] = str(hyperdata['abstract'][-9])
hyperdata['abstract'].pop()
# Here add separator for paragraphs
......@@ -245,15 +245,15 @@ class EuropressFileParser(FileParser):
# Here add separator for paragraphs
hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))
else:
else:
hyperdata['doi'] = "not found"
hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
hyperdata['length_letters'] = len(hyperdata['abstract'])
hyperdata['bdd'] = u'europresse'
hyperdata['url'] = u''
#hyperdata_str = {}
for key, value in hyperdata.items():
hyperdata[key] = value.decode() if isinstance(value, bytes) else value
......
......@@ -4,21 +4,21 @@ import zipfile
import chardet
from ..Caches import LanguagesCache
class FileParser:
"""Base class for performing files parsing depending on their type.
"""
def __init__(self, language_cache=None):
    """Set up the parser, creating a fresh LanguagesCache unless one is shared in."""
    if language_cache is None:
        language_cache = LanguagesCache()
    self._languages_cache = language_cache
def detect_encoding(self, string):
    """Detect the character encoding of *string* (bytes) via chardet.

    Returns the detected encoding name, falling back to ``'UTF-8'`` when
    detection fails. Note: ``chardet.detect`` always includes an
    ``'encoding'`` key whose value is ``None`` on failure, so a plain
    ``dict.get(key, default)`` would return ``None`` instead of the
    default — hence the ``or`` fallback.
    """
    result = chardet.detect(string)
    return result.get('encoding') or 'UTF-8'
def format_hyperdata_dates(self, hyperdata):
"""Format the dates found in the hyperdata.
Examples:
......@@ -27,7 +27,7 @@ class FileParser:
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
"""
# First, check the split dates...
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
for prefix in prefixes:
......@@ -51,21 +51,23 @@ class FileParser:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
except:
pass
# ...then parse all the "date" fields, to parse it into separate elements
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
for prefix in prefixes:
date = dateutil.parser.parse(hyperdata[prefix + "_date"])
print('date')
hyperdata[prefix + "_year"] = date.strftime("%Y")
hyperdata[prefix + "_month"] = date.strftime("%m")
hyperdata[prefix + "_day"] = date.strftime("%d")
hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S")
# finally, return the transformed result!
return hyperdata
def format_hyperdata_languages(self, hyperdata):
"""format the languages found in the hyperdata."""
language = None
......@@ -81,18 +83,18 @@ class FileParser:
hyperdata["language_iso3"] = language.iso3
hyperdata["language_fullname"] = language.fullname
return hyperdata
def format_hyperdata(self, hyperdata):
    """Apply every hyperdata formatting pass (dates, then languages) and return the result."""
    for formatter in (self.format_hyperdata_dates, self.format_hyperdata_languages):
        hyperdata = formatter(hyperdata)
    return hyperdata
def _parse(self, file):
"""This method shall be overriden by inherited classes."""
return list()
def parse(self, file):
"""Parse the file, and its children files found in the file.
"""
......
......@@ -3,15 +3,17 @@ from .FileParser import FileParser
from ..Caches import LanguagesCache
from admin.utils import PrintException
class RisFileParser(FileParser):
def __init__(self, language_cache=None):
super(FileParser, self).__init__()
self._languages_cache = LanguagesCache() if language_cache is None else language_cache
self._begin = 6
self._parameters = {
b"ER": {"type": "delimiter"},
b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
......@@ -24,7 +26,7 @@ class RisFileParser(FileParser):
b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"},
}
def _parse(self, file):
hyperdata = {}
......@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
print(error)
# if a hyperdata object is left in memory, yield it as well
if hyperdata:
# try:
# if hyperdata['date_to_parse']:
# print(hyperdata['date_to_parse'])
# except:
# pass
#
#print(hyperdata['title'])
yield hyperdata
from .RisFileParser import RisFileParser
from ..Caches import LanguagesCache
class ZoteroFileParser(RisFileParser):
    """RIS parser configured for Zotero exports.

    Differs from the base RisFileParser only in the tag table: Zotero uses
    e.g. ``UR`` for the DOI/URL and ``DA``/``PY``/``PD`` for date parts.
    """

    def __init__(self, language_cache=None):
        # BUG FIX: the original called super(RisFileParser, self).__init__(),
        # which starts the MRO walk *after* RisFileParser and therefore skips
        # the direct parent's __init__ entirely. Call the direct parent so
        # its setup always runs; also forward language_cache (new optional
        # parameter, backward-compatible — default keeps old call sites valid).
        super().__init__(language_cache=language_cache)
        # Record-field offset: tag values start at column 6 in RIS lines.
        self._begin = 6
        # Tag table mapping RIS tags to hyperdata keys.
        self._parameters = {
            b"ER": {"type": "delimiter"},
            b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
            b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
            b"UR": {"type": "hyperdata", "key": "doi"},
            b"DA": {"type": "hyperdata", "key": "publication_date"},
            b"PY": {"type": "hyperdata", "key": "publication_year"},
            b"PD": {"type": "hyperdata", "key": "publication_month"},
            b"LA": {"type": "hyperdata", "key": "language_iso2"},
            b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
            b"WC": {"type": "hyperdata", "key": "fields"},
        }
from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
......@@ -128,7 +128,7 @@ def parse_resources(corpus, user=None, user_id=None):
nodes.append(node)
#
# TODO: mark node-resources associations as parsed
#
#
dbg.show('insert %d documents' % len(nodes))
session.add_all(nodes)
session.commit()
......@@ -205,7 +205,7 @@ def extract_ngrams(corpus, keys):
language.id: language.iso2
for language in session.query(Language)
}
ngrams_data = set()
ngrams_language_data = set()
ngrams_tag_data = set()
......@@ -241,7 +241,7 @@ def extract_ngrams(corpus, keys):
#tag_id = 14
#print('tag_id_2', tag_id)
node_ngram_list[node_id][terms] += 1
ngrams_data.add((n, terms))
ngrams_data.add((n, terms[:255]))
ngrams_language_data.add((terms, language_id))
ngrams_tag_data.add((terms, tag_id))
......@@ -268,7 +268,7 @@ def extract_ngrams(corpus, keys):
ngram.terms = tmp__ngrams.terms
''' % (Ngram.__table__.name, ))
# insert, then get the ids back
cursor.execute('''
INSERT INTO
%s (n, terms)
......@@ -279,8 +279,8 @@ def extract_ngrams(corpus, keys):
WHERE
id IS NULL
''' % (Ngram.__table__.name, ))
cursor.execute('''
UPDATE
tmp__ngrams
......@@ -293,14 +293,14 @@ def extract_ngrams(corpus, keys):
AND
tmp__ngrams.id IS NULL
''' % (Ngram.__table__.name, ))
# get all ids
ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngrams')
for row in cursor.fetchall():
ngram_ids[row[1]] = row[0]
#
#
dbg.show('insert associations')
node_ngram_data = list()
for node_id, ngrams in node_ngram_list.items():
......
......@@ -4,11 +4,11 @@ parsers = {
'Pubmed (xml format)' : PubmedFileParser,
'Web of Science (ISI format)' : IsiFileParser,
'Scopus (RIS format)' : RisFileParser,
'Zotero (RIS format)' : JstorFileParser,
'Zotero (RIS format)' : ZoteroFileParser,
'Jstor (RIS format)' : JstorFileParser,
#'Europress' : EuropressFileParser,
'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser,
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment