Commit d37bdbd3 authored by Administrator

Merge branch 'testing' into prod-dev

parents a1b68438 407b96ab
...@@ -11,7 +11,7 @@ from ..NgramsExtractors import * ...@@ -11,7 +11,7 @@ from ..NgramsExtractors import *
from admin.utils import PrintException from admin.utils import PrintException
class EuropressFileParser(FileParser): class EuropressFileParser(FileParser):
def _parse(self, file): def _parse(self, file):
localeEncoding = "fr_FR" localeEncoding = "fr_FR"
...@@ -39,21 +39,21 @@ class EuropressFileParser(FileParser): ...@@ -39,21 +39,21 @@ class EuropressFileParser(FileParser):
try: try:
html_parser = etree.HTMLParser(encoding=codif) html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser) html = etree.fromstring(contents, html_parser)
try : try :
format_europresse = 50 format_europresse = 50
html_articles = html.xpath('/html/body/table/tbody') html_articles = html.xpath('/html/body/table/tbody')
if len(html_articles) < 1: if len(html_articles) < 1:
html_articles = html.xpath('/html/body/table') html_articles = html.xpath('/html/body/table')
if len(html_articles) < 1: if len(html_articles) < 1:
format_europresse = 1 format_europresse = 1
html_articles = html.xpath('//div[@id="docContain"]') html_articles = html.xpath('//div[@id="docContain"]')
except : except :
PrintException() PrintException()
if format_europresse == 50 : if format_europresse == 50 :
name_xpath = "./tr/td/span[@class = 'DocPublicationName']" name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
header_xpath = "./tr/td/span[@class = 'DocHeader']" header_xpath = "./tr/td/span[@class = 'DocHeader']"
...@@ -77,7 +77,7 @@ class EuropressFileParser(FileParser): ...@@ -77,7 +77,7 @@ class EuropressFileParser(FileParser):
or self::td[@class='txtCertificat'] \ or self::td[@class='txtCertificat'] \
)]/text()" )]/text()"
doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()" doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
except Exception as error : except Exception as error :
PrintException() PrintException()
...@@ -85,9 +85,9 @@ class EuropressFileParser(FileParser): ...@@ -85,9 +85,9 @@ class EuropressFileParser(FileParser):
# parse all the articles, one by one # parse all the articles, one by one
try: try:
for html_article in html_articles: for html_article in html_articles:
hyperdata = {} hyperdata = {}
if len(html_article): if len(html_article):
for name in html_article.xpath(name_xpath): for name in html_article.xpath(name_xpath):
if name.text is not None: if name.text is not None:
...@@ -98,26 +98,26 @@ class EuropressFileParser(FileParser): ...@@ -98,26 +98,26 @@ class EuropressFileParser(FileParser):
hyperdata['volume'] = test_journal.group(2) hyperdata['volume'] = test_journal.group(2)
else: else:
hyperdata['journal'] = name.text.encode(codif) hyperdata['journal'] = name.text.encode(codif)
countbis = 0 countbis = 0
for header in html_article.xpath(header_xpath): for header in html_article.xpath(header_xpath):
# print(count) # print(count)
# countbis += 1 # countbis += 1
# try: # try:
# print('109', hyperdata['publication_date']) # print('109', hyperdata['publication_date'])
# except: # except:
# print('no date yet') # print('no date yet')
# pass # pass
try: try:
text = header.text text = header.text
#print("header", text) #print("header", text)
except Exception as error: except Exception as error:
print(error) print(error)
if isinstance(text, bytes): if isinstance(text, bytes):
text = text.decode(encoding) text = text.decode(encoding)
format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE) format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
...@@ -134,9 +134,9 @@ class EuropressFileParser(FileParser): ...@@ -134,9 +134,9 @@ class EuropressFileParser(FileParser):
test_date_en = None test_date_en = None
test_sect = None test_sect = None
test_page = None test_page = None
if test_date_fr is not None: if test_date_fr is not None:
self.localeEncoding = "fr_FR" self.localeEncoding = "fr_FR"
locale.setlocale(locale.LC_ALL, localeEncoding) locale.setlocale(locale.LC_ALL, localeEncoding)
...@@ -158,7 +158,7 @@ class EuropressFileParser(FileParser): ...@@ -158,7 +158,7 @@ class EuropressFileParser(FileParser):
except Exception as error: except Exception as error:
print(error, text) print(error, text)
pass pass
if test_date_en is not None: if test_date_en is not None:
localeEncoding = "en_GB.UTF-8" localeEncoding = "en_GB.UTF-8"
...@@ -173,20 +173,20 @@ class EuropressFileParser(FileParser): ...@@ -173,20 +173,20 @@ class EuropressFileParser(FileParser):
if test_sect is not None: if test_sect is not None:
hyperdata['section'] = test_sect.group(1).encode(codif) hyperdata['section'] = test_sect.group(1).encode(codif)
if test_page is not None: if test_page is not None:
hyperdata['page'] = test_page.group(1).encode(codif) hyperdata['page'] = test_page.group(1).encode(codif)
try: # try:
print('183', hyperdata['publication_date']) # print('183', hyperdata['publication_date'])
except: # except:
print('no date yet') # print('no date yet')
pass # pass
#
hyperdata['title'] = html_article.xpath(title_xpath).encode(codif) hyperdata['title'] = html_article.xpath(title_xpath).encode(codif)
hyperdata['abstract'] = html_article.xpath(text_xpath) hyperdata['abstract'] = html_article.xpath(text_xpath)
line = 0 line = 0
br_tag = 10 br_tag = 10
for i in html_articles[count].iter(): for i in html_articles[count].iter():
...@@ -205,13 +205,13 @@ class EuropressFileParser(FileParser): ...@@ -205,13 +205,13 @@ class EuropressFileParser(FileParser):
hyperdata['authors'] = 'not found' hyperdata['authors'] = 'not found'
line = 0 line = 0
br_tag = 10 br_tag = 10
try: try:
if hyperdata['publication_date'] is not None or hyperdata['publication_date'] != '': if hyperdata['publication_date'] is not None or hyperdata['publication_date'] != '':
try: try:
back = hyperdata['publication_date'] back = hyperdata['publication_date']
except Exception as e: except Exception as e:
#print(e) #print(e)
pass pass
else: else:
...@@ -226,14 +226,14 @@ class EuropressFileParser(FileParser): ...@@ -226,14 +226,14 @@ class EuropressFileParser(FileParser):
#hyperdata['language_iso2'] = 'fr' #hyperdata['language_iso2'] = 'fr'
#elif lang == 'en': #elif lang == 'en':
# hyperdata['language_iso2'] = 'en' # hyperdata['language_iso2'] = 'en'
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y') hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m') hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d') hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
#hyperdata.pop('publication_date') #hyperdata.pop('publication_date')
if len(hyperdata['abstract'])>0 and format_europresse == 50: if len(hyperdata['abstract'])>0 and format_europresse == 50:
hyperdata['doi'] = str(hyperdata['abstract'][-9]) hyperdata['doi'] = str(hyperdata['abstract'][-9])
hyperdata['abstract'].pop() hyperdata['abstract'].pop()
# Here add separator for paragraphs # Here add separator for paragraphs
...@@ -245,15 +245,15 @@ class EuropressFileParser(FileParser): ...@@ -245,15 +245,15 @@ class EuropressFileParser(FileParser):
# Here add separator for paragraphs # Here add separator for paragraphs
hyperdata['abstract'] = str(' '.join(hyperdata['abstract'])) hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))
else: else:
hyperdata['doi'] = "not found" hyperdata['doi'] = "not found"
hyperdata['length_words'] = len(hyperdata['abstract'].split(' ')) hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
hyperdata['length_letters'] = len(hyperdata['abstract']) hyperdata['length_letters'] = len(hyperdata['abstract'])
hyperdata['bdd'] = u'europresse' hyperdata['bdd'] = u'europresse'
hyperdata['url'] = u'' hyperdata['url'] = u''
#hyperdata_str = {} #hyperdata_str = {}
for key, value in hyperdata.items(): for key, value in hyperdata.items():
hyperdata[key] = value.decode() if isinstance(value, bytes) else value hyperdata[key] = value.decode() if isinstance(value, bytes) else value
......
...@@ -4,21 +4,21 @@ import zipfile ...@@ -4,21 +4,21 @@ import zipfile
import chardet import chardet
from ..Caches import LanguagesCache from ..Caches import LanguagesCache
class FileParser: class FileParser:
"""Base class for performing files parsing depending on their type. """Base class for performing files parsing depending on their type.
""" """
def __init__(self, language_cache=None): def __init__(self, language_cache=None):
self._languages_cache = LanguagesCache() if language_cache is None else language_cache self._languages_cache = LanguagesCache() if language_cache is None else language_cache
def detect_encoding(self, string): def detect_encoding(self, string):
"""Useful method to detect the document encoding. """Useful method to detect the document encoding.
""" """
encoding = chardet.detect(string) encoding = chardet.detect(string)
return encoding.get('encoding', 'UTF-8') return encoding.get('encoding', 'UTF-8')
def format_hyperdata_dates(self, hyperdata): def format_hyperdata_dates(self, hyperdata):
"""Format the dates found in the hyperdata. """Format the dates found in the hyperdata.
Examples: Examples:
...@@ -27,7 +27,7 @@ class FileParser: ...@@ -27,7 +27,7 @@ class FileParser:
{"publication_year": "2014"} {"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...} -> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
""" """
# First, check the split dates... # First, check the split dates...
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"] prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
for prefix in prefixes: for prefix in prefixes:
...@@ -51,21 +51,23 @@ class FileParser: ...@@ -51,21 +51,23 @@ class FileParser:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S") hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
except: except:
pass pass
# ...then parse all the "date" fields, to parse it into separate elements # ...then parse all the "date" fields, to parse it into separate elements
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"] prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
for prefix in prefixes: for prefix in prefixes:
date = dateutil.parser.parse(hyperdata[prefix + "_date"]) date = dateutil.parser.parse(hyperdata[prefix + "_date"])
print('date')
hyperdata[prefix + "_year"] = date.strftime("%Y") hyperdata[prefix + "_year"] = date.strftime("%Y")
hyperdata[prefix + "_month"] = date.strftime("%m") hyperdata[prefix + "_month"] = date.strftime("%m")
hyperdata[prefix + "_day"] = date.strftime("%d") hyperdata[prefix + "_day"] = date.strftime("%d")
hyperdata[prefix + "_hour"] = date.strftime("%H") hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M") hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S") hyperdata[prefix + "_second"] = date.strftime("%S")
# finally, return the transformed result! # finally, return the transformed result!
return hyperdata return hyperdata
def format_hyperdata_languages(self, hyperdata): def format_hyperdata_languages(self, hyperdata):
"""format the languages found in the hyperdata.""" """format the languages found in the hyperdata."""
language = None language = None
...@@ -81,18 +83,18 @@ class FileParser: ...@@ -81,18 +83,18 @@ class FileParser:
hyperdata["language_iso3"] = language.iso3 hyperdata["language_iso3"] = language.iso3
hyperdata["language_fullname"] = language.fullname hyperdata["language_fullname"] = language.fullname
return hyperdata return hyperdata
def format_hyperdata(self, hyperdata): def format_hyperdata(self, hyperdata):
"""Format the hyperdata.""" """Format the hyperdata."""
hyperdata = self.format_hyperdata_dates(hyperdata) hyperdata = self.format_hyperdata_dates(hyperdata)
hyperdata = self.format_hyperdata_languages(hyperdata) hyperdata = self.format_hyperdata_languages(hyperdata)
return hyperdata return hyperdata
def _parse(self, file): def _parse(self, file):
"""This method shall be overriden by inherited classes.""" """This method shall be overriden by inherited classes."""
return list() return list()
def parse(self, file): def parse(self, file):
"""Parse the file, and its children files found in the file. """Parse the file, and its children files found in the file.
""" """
......
...@@ -3,15 +3,17 @@ from .FileParser import FileParser ...@@ -3,15 +3,17 @@ from .FileParser import FileParser
from ..Caches import LanguagesCache from ..Caches import LanguagesCache
from admin.utils import PrintException
class RisFileParser(FileParser): class RisFileParser(FileParser):
def __init__(self, language_cache=None): def __init__(self, language_cache=None):
super(FileParser, self).__init__() super(FileParser, self).__init__()
self._languages_cache = LanguagesCache() if language_cache is None else language_cache self._languages_cache = LanguagesCache() if language_cache is None else language_cache
self._begin = 6 self._begin = 6
self._parameters = { self._parameters = {
b"ER": {"type": "delimiter"}, b"ER": {"type": "delimiter"},
b"TI": {"type": "hyperdata", "key": "title", "separator": " "}, b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
...@@ -24,7 +26,7 @@ class RisFileParser(FileParser): ...@@ -24,7 +26,7 @@ class RisFileParser(FileParser):
b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "}, b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"}, b"WC": {"type": "hyperdata", "key": "fields"},
} }
def _parse(self, file): def _parse(self, file):
hyperdata = {} hyperdata = {}
...@@ -57,5 +59,11 @@ class RisFileParser(FileParser): ...@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
print(error) print(error)
# if a hyperdata object is left in memory, yield it as well # if a hyperdata object is left in memory, yield it as well
if hyperdata: if hyperdata:
# try:
# if hyperdata['date_to_parse']:
# print(hyperdata['date_to_parse'])
# except:
# pass
#
#print(hyperdata['title']) #print(hyperdata['title'])
yield hyperdata yield hyperdata
from .RisFileParser import RisFileParser
from ..Caches import LanguagesCache
class ZoteroFileParser(RisFileParser):
    """RIS parser configured with the tag table used for Zotero exports.

    Only the constructor is overridden; all parsing behaviour is inherited
    from ``RisFileParser``.
    """

    def __init__(self, language_cache=None):
        """Initialise the parser and install the Zotero tag table.

        ``language_cache`` is forwarded to ``RisFileParser.__init__``, which
        builds a fresh ``LanguagesCache`` when it is ``None``. The parameter
        is optional, so existing ``ZoteroFileParser()`` calls keep working.
        """
        # Chain through the direct parent. The previous
        # ``super(RisFileParser, self)`` call skipped RisFileParser.__init__
        # and offered no way to hand in a shared language cache.
        super(ZoteroFileParser, self).__init__(language_cache)
        # NOTE(review): presumably the offset at which a line's value starts;
        # confirm against RisFileParser._parse.
        self._begin = 6
        # Tag -> handling rule. This replaces the table installed by the
        # parent constructor. Unlike the parent's visible entries, "UR" feeds
        # the "doi" key and "DA" feeds "publication_date".
        self._parameters = {
            b"ER": {"type": "delimiter"},
            b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
            b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
            b"UR": {"type": "hyperdata", "key": "doi"},
            b"DA": {"type": "hyperdata", "key": "publication_date"},
            b"PY": {"type": "hyperdata", "key": "publication_year"},
            b"PD": {"type": "hyperdata", "key": "publication_month"},
            b"LA": {"type": "hyperdata", "key": "language_iso2"},
            b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
            b"WC": {"type": "hyperdata", "key": "fields"},
        }
from .RisFileParser import RisFileParser from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser from .EuropressFileParser import EuropressFileParser
from .ISText import ISText from .ISText import ISText
...@@ -128,7 +128,7 @@ def parse_resources(corpus, user=None, user_id=None): ...@@ -128,7 +128,7 @@ def parse_resources(corpus, user=None, user_id=None):
nodes.append(node) nodes.append(node)
# #
# TODO: mark node-resources associations as parsed # TODO: mark node-resources associations as parsed
# #
dbg.show('insert %d documents' % len(nodes)) dbg.show('insert %d documents' % len(nodes))
session.add_all(nodes) session.add_all(nodes)
session.commit() session.commit()
...@@ -205,7 +205,7 @@ def extract_ngrams(corpus, keys): ...@@ -205,7 +205,7 @@ def extract_ngrams(corpus, keys):
language.id: language.iso2 language.id: language.iso2
for language in session.query(Language) for language in session.query(Language)
} }
ngrams_data = set() ngrams_data = set()
ngrams_language_data = set() ngrams_language_data = set()
ngrams_tag_data = set() ngrams_tag_data = set()
...@@ -241,7 +241,7 @@ def extract_ngrams(corpus, keys): ...@@ -241,7 +241,7 @@ def extract_ngrams(corpus, keys):
#tag_id = 14 #tag_id = 14
#print('tag_id_2', tag_id) #print('tag_id_2', tag_id)
node_ngram_list[node_id][terms] += 1 node_ngram_list[node_id][terms] += 1
ngrams_data.add((n, terms)) ngrams_data.add((n, terms[:255]))
ngrams_language_data.add((terms, language_id)) ngrams_language_data.add((terms, language_id))
ngrams_tag_data.add((terms, tag_id)) ngrams_tag_data.add((terms, tag_id))
...@@ -268,7 +268,7 @@ def extract_ngrams(corpus, keys): ...@@ -268,7 +268,7 @@ def extract_ngrams(corpus, keys):
ngram.terms = tmp__ngrams.terms ngram.terms = tmp__ngrams.terms
''' % (Ngram.__table__.name, )) ''' % (Ngram.__table__.name, ))
# insert, then get the ids back # insert, then get the ids back
cursor.execute(''' cursor.execute('''
INSERT INTO INSERT INTO
%s (n, terms) %s (n, terms)
...@@ -279,8 +279,8 @@ def extract_ngrams(corpus, keys): ...@@ -279,8 +279,8 @@ def extract_ngrams(corpus, keys):
WHERE WHERE
id IS NULL id IS NULL
''' % (Ngram.__table__.name, )) ''' % (Ngram.__table__.name, ))
cursor.execute(''' cursor.execute('''
UPDATE UPDATE
tmp__ngrams tmp__ngrams
...@@ -293,14 +293,14 @@ def extract_ngrams(corpus, keys): ...@@ -293,14 +293,14 @@ def extract_ngrams(corpus, keys):
AND AND
tmp__ngrams.id IS NULL tmp__ngrams.id IS NULL
''' % (Ngram.__table__.name, )) ''' % (Ngram.__table__.name, ))
# get all ids # get all ids
ngram_ids = dict() ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngrams') cursor.execute('SELECT id, terms FROM tmp__ngrams')
for row in cursor.fetchall(): for row in cursor.fetchall():
ngram_ids[row[1]] = row[0] ngram_ids[row[1]] = row[0]
# #
dbg.show('insert associations') dbg.show('insert associations')
node_ngram_data = list() node_ngram_data = list()
for node_id, ngrams in node_ngram_list.items(): for node_id, ngrams in node_ngram_list.items():
......
...@@ -4,11 +4,11 @@ parsers = { ...@@ -4,11 +4,11 @@ parsers = {
'Pubmed (xml format)' : PubmedFileParser, 'Pubmed (xml format)' : PubmedFileParser,
'Web of Science (ISI format)' : IsiFileParser, 'Web of Science (ISI format)' : IsiFileParser,
'Scopus (RIS format)' : RisFileParser, 'Scopus (RIS format)' : RisFileParser,
'Zotero (RIS format)' : JstorFileParser, 'Zotero (RIS format)' : ZoteroFileParser,
'Jstor (RIS format)' : JstorFileParser, 'Jstor (RIS format)' : JstorFileParser,
#'Europress' : EuropressFileParser, #'Europress' : EuropressFileParser,
'Europress (French)' : EuropressFileParser, 'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser, 'Europress (English)' : EuropressFileParser,
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment