Commit 85b37233 authored by Dalf's avatar Dalf

[mod] speed optimization

compile XPath only once
avoid redundant call to urlparse
get_locale(webapp.py): avoid useless call to request.accept_languages.best_match
parent 42d5e2c0
...@@ -18,7 +18,7 @@ from lxml import html ...@@ -18,7 +18,7 @@ from lxml import html
from searx import logger, utils from searx import logger, utils
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
from searx.url_utils import urlencode from searx.url_utils import urlencode
from searx.utils import match_language, gen_useragent from searx.utils import match_language, gen_useragent, eval_xpath
logger = logger.getChild('bing engine') logger = logger.getChild('bing engine')
...@@ -65,11 +65,11 @@ def response(resp): ...@@ -65,11 +65,11 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
# parse results # parse results
for result in dom.xpath('//div[@class="sa_cc"]'): for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
link = result.xpath('.//h3/a')[0] link = eval_xpath(result, './/h3/a')[0]
url = link.attrib.get('href') url = link.attrib.get('href')
title = extract_text(link) title = extract_text(link)
content = extract_text(result.xpath('.//p')) content = extract_text(eval_xpath(result, './/p'))
# append result # append result
results.append({'url': url, results.append({'url': url,
...@@ -77,11 +77,11 @@ def response(resp): ...@@ -77,11 +77,11 @@ def response(resp):
'content': content}) 'content': content})
# parse results again if nothing is found yet # parse results again if nothing is found yet
for result in dom.xpath('//li[@class="b_algo"]'): for result in eval_xpath(dom, '//li[@class="b_algo"]'):
link = result.xpath('.//h2/a')[0] link = eval_xpath(result, './/h2/a')[0]
url = link.attrib.get('href') url = link.attrib.get('href')
title = extract_text(link) title = extract_text(link)
content = extract_text(result.xpath('.//p')) content = extract_text(eval_xpath(result, './/p'))
# append result # append result
results.append({'url': url, results.append({'url': url,
...@@ -89,7 +89,7 @@ def response(resp): ...@@ -89,7 +89,7 @@ def response(resp):
'content': content}) 'content': content})
try: try:
result_len_container = "".join(dom.xpath('//span[@class="sb_count"]/text()')) result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]/text()'))
result_len_container = utils.to_string(result_len_container) result_len_container = utils.to_string(result_len_container)
if "-" in result_len_container: if "-" in result_len_container:
# Remove the part "from-to" for paginated request ... # Remove the part "from-to" for paginated request ...
...@@ -113,9 +113,9 @@ def response(resp): ...@@ -113,9 +113,9 @@ def response(resp):
def _fetch_supported_languages(resp): def _fetch_supported_languages(resp):
supported_languages = [] supported_languages = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
options = dom.xpath('//div[@id="limit-languages"]//input') options = eval_xpath(dom, '//div[@id="limit-languages"]//input')
for option in options: for option in options:
code = option.xpath('./@id')[0].replace('_', '-') code = eval_xpath(option, './@id')[0].replace('_', '-')
if code == 'nb': if code == 'nb':
code = 'no' code = 'no'
supported_languages.append(code) supported_languages.append(code)
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
import re import re
from lxml import html from lxml import html
from searx.utils import is_valid_lang from searx.utils import is_valid_lang, eval_xpath
from searx.url_utils import urljoin from searx.url_utils import urljoin
categories = ['general'] categories = ['general']
...@@ -47,14 +47,14 @@ def response(resp): ...@@ -47,14 +47,14 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
for k, result in enumerate(dom.xpath(results_xpath)[1:]): for k, result in enumerate(eval_xpath(dom, results_xpath)[1:]):
try: try:
from_result, to_results_raw = result.xpath('./td') from_result, to_results_raw = eval_xpath(result, './td')
except: except:
continue continue
to_results = [] to_results = []
for to_result in to_results_raw.xpath('./p/a'): for to_result in eval_xpath(to_results_raw, './p/a'):
t = to_result.text_content() t = to_result.text_content()
if t.strip(): if t.strip():
to_results.append(to_result.text_content()) to_results.append(to_result.text_content())
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
from lxml.html import fromstring from lxml.html import fromstring
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
from searx.utils import eval_xpath
from searx.url_utils import urlencode from searx.url_utils import urlencode
# engine dependent config # engine dependent config
...@@ -45,16 +46,16 @@ def response(resp): ...@@ -45,16 +46,16 @@ def response(resp):
# parse results # parse results
# Quickhits # Quickhits
for r in doc.xpath('//div[@class="search_quickresult"]/ul/li'): for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'):
try: try:
res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1] res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
except: except:
continue continue
if not res_url: if not res_url:
continue continue
title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title')) title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
# append result # append result
results.append({'title': title, results.append({'title': title,
...@@ -62,13 +63,13 @@ def response(resp): ...@@ -62,13 +63,13 @@ def response(resp):
'url': base_url + res_url}) 'url': base_url + res_url})
# Search results # Search results
for r in doc.xpath('//dl[@class="search_results"]/*'): for r in eval_xpath(doc, '//dl[@class="search_results"]/*'):
try: try:
if r.tag == "dt": if r.tag == "dt":
res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1] res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title')) title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
elif r.tag == "dd": elif r.tag == "dd":
content = extract_text(r.xpath('.')) content = extract_text(eval_xpath(r, '.'))
# append result # append result
results.append({'title': title, results.append({'title': title,
......
...@@ -18,7 +18,7 @@ from json import loads ...@@ -18,7 +18,7 @@ from json import loads
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
from searx.poolrequests import get from searx.poolrequests import get
from searx.url_utils import urlencode from searx.url_utils import urlencode
from searx.utils import match_language from searx.utils import match_language, eval_xpath
# engine dependent config # engine dependent config
categories = ['general'] categories = ['general']
...@@ -106,19 +106,19 @@ def response(resp): ...@@ -106,19 +106,19 @@ def response(resp):
doc = fromstring(resp.text) doc = fromstring(resp.text)
# parse results # parse results
for i, r in enumerate(doc.xpath(result_xpath)): for i, r in enumerate(eval_xpath(doc, result_xpath)):
if i >= 30: if i >= 30:
break break
try: try:
res_url = r.xpath(url_xpath)[-1] res_url = eval_xpath(r, url_xpath)[-1]
except: except:
continue continue
if not res_url: if not res_url:
continue continue
title = extract_text(r.xpath(title_xpath)) title = extract_text(eval_xpath(r, title_xpath))
content = extract_text(r.xpath(content_xpath)) content = extract_text(eval_xpath(r, content_xpath))
# append result # append result
results.append({'title': title, results.append({'title': title,
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
from lxml import html, etree from lxml import html, etree
import re import re
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
from searx.utils import eval_xpath
from searx.url_utils import quote, urljoin from searx.url_utils import quote, urljoin
from searx import logger from searx import logger
...@@ -52,9 +53,9 @@ def response(resp): ...@@ -52,9 +53,9 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
try: try:
number_of_results_string = re.sub('[^0-9]', '', dom.xpath( number_of_results_string =\
'//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0] re.sub('[^0-9]', '',
) eval_xpath(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0])
results.append({'number_of_results': int(number_of_results_string)}) results.append({'number_of_results': int(number_of_results_string)})
...@@ -62,12 +63,12 @@ def response(resp): ...@@ -62,12 +63,12 @@ def response(resp):
logger.debug("Couldn't read number of results.") logger.debug("Couldn't read number of results.")
pass pass
for result in dom.xpath('//section[not(contains(@class, "essay"))]'): for result in eval_xpath(dom, '//section[not(contains(@class, "essay"))]'):
try: try:
url = result.xpath('.//h2/a')[0].get('href') url = eval_xpath(result, './/h2/a')[0].get('href')
url = urljoin(base_url, url) url = urljoin(base_url, url)
title = result.xpath('string(.//h2/a)').strip() title = eval_xpath(result, 'string(.//h2/a)').strip()
content = extract_text(result.xpath('.//p')) content = extract_text(eval_xpath(result, './/p'))
# append result # append result
results.append({'url': url, results.append({'url': url,
'title': title, 'title': title,
......
...@@ -15,6 +15,7 @@ from json import loads ...@@ -15,6 +15,7 @@ from json import loads
from time import time from time import time
from lxml.html import fromstring from lxml.html import fromstring
from searx.url_utils import urlencode from searx.url_utils import urlencode
from searx.utils import eval_xpath
# engine dependent config # engine dependent config
categories = ['general'] categories = ['general']
...@@ -99,9 +100,9 @@ def response(resp): ...@@ -99,9 +100,9 @@ def response(resp):
def _fetch_supported_languages(resp): def _fetch_supported_languages(resp):
supported_languages = [] supported_languages = []
dom = fromstring(resp.text) dom = fromstring(resp.text)
links = dom.xpath('//span[@id="menu2"]/a') links = eval_xpath(dom, '//span[@id="menu2"]/a')
for link in links: for link in links:
href = link.xpath('./@href')[0].split('lang%3A') href = eval_xpath(link, './@href')[0].split('lang%3A')
if len(href) == 2: if len(href) == 2:
code = href[1].split('_') code = href[1].split('_')
if len(code) == 2: if len(code) == 2:
......
...@@ -14,7 +14,7 @@ from lxml import html, etree ...@@ -14,7 +14,7 @@ from lxml import html, etree
from searx.engines.xpath import extract_text, extract_url from searx.engines.xpath import extract_text, extract_url
from searx import logger from searx import logger
from searx.url_utils import urlencode, urlparse, parse_qsl from searx.url_utils import urlencode, urlparse, parse_qsl
from searx.utils import match_language from searx.utils import match_language, eval_xpath
logger = logger.getChild('google engine') logger = logger.getChild('google engine')
...@@ -156,7 +156,7 @@ def parse_url(url_string, google_hostname): ...@@ -156,7 +156,7 @@ def parse_url(url_string, google_hostname):
# returns extract_text on the first result selected by the xpath or None # returns extract_text on the first result selected by the xpath or None
def extract_text_from_dom(result, xpath): def extract_text_from_dom(result, xpath):
r = result.xpath(xpath) r = eval_xpath(result, xpath)
if len(r) > 0: if len(r) > 0:
return extract_text(r[0]) return extract_text(r[0])
return None return None
...@@ -226,21 +226,21 @@ def response(resp): ...@@ -226,21 +226,21 @@ def response(resp):
# convert the text to dom # convert the text to dom
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
instant_answer = dom.xpath('//div[@id="_vBb"]//text()') instant_answer = eval_xpath(dom, '//div[@id="_vBb"]//text()')
if instant_answer: if instant_answer:
results.append({'answer': u' '.join(instant_answer)}) results.append({'answer': u' '.join(instant_answer)})
try: try:
results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0] results_num = int(eval_xpath(dom, '//div[@id="resultStats"]//text()')[0]
.split()[1].replace(',', '')) .split()[1].replace(',', ''))
results.append({'number_of_results': results_num}) results.append({'number_of_results': results_num})
except: except:
pass pass
# parse results # parse results
for result in dom.xpath(results_xpath): for result in eval_xpath(dom, results_xpath):
try: try:
title = extract_text(result.xpath(title_xpath)[0]) title = extract_text(eval_xpath(result, title_xpath)[0])
url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname) url = parse_url(extract_url(eval_xpath(result, url_xpath), google_url), google_hostname)
parsed_url = urlparse(url, google_hostname) parsed_url = urlparse(url, google_hostname)
# map result # map result
...@@ -249,7 +249,7 @@ def response(resp): ...@@ -249,7 +249,7 @@ def response(resp):
continue continue
# if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start): # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
# print "yooooo"*30 # print "yooooo"*30
# x = result.xpath(map_near) # x = eval_xpath(result, map_near)
# if len(x) > 0: # if len(x) > 0:
# # map : near the location # # map : near the location
# results = results + parse_map_near(parsed_url, x, google_hostname) # results = results + parse_map_near(parsed_url, x, google_hostname)
...@@ -286,11 +286,11 @@ def response(resp): ...@@ -286,11 +286,11 @@ def response(resp):
continue continue
# parse suggestion # parse suggestion
for suggestion in dom.xpath(suggestion_xpath): for suggestion in eval_xpath(dom, suggestion_xpath):
# append suggestion # append suggestion
results.append({'suggestion': extract_text(suggestion)}) results.append({'suggestion': extract_text(suggestion)})
for correction in dom.xpath(spelling_suggestion_xpath): for correction in eval_xpath(dom, spelling_suggestion_xpath):
results.append({'correction': extract_text(correction)}) results.append({'correction': extract_text(correction)})
# return results # return results
...@@ -299,9 +299,9 @@ def response(resp): ...@@ -299,9 +299,9 @@ def response(resp):
def parse_images(result, google_hostname): def parse_images(result, google_hostname):
results = [] results = []
for image in result.xpath(images_xpath): for image in eval_xpath(result, images_xpath):
url = parse_url(extract_text(image.xpath(image_url_xpath)[0]), google_hostname) url = parse_url(extract_text(eval_xpath(image, image_url_xpath)[0]), google_hostname)
img_src = extract_text(image.xpath(image_img_src_xpath)[0]) img_src = extract_text(eval_xpath(image, image_img_src_xpath)[0])
# append result # append result
results.append({'url': url, results.append({'url': url,
...@@ -388,10 +388,10 @@ def attributes_to_html(attributes): ...@@ -388,10 +388,10 @@ def attributes_to_html(attributes):
def _fetch_supported_languages(resp): def _fetch_supported_languages(resp):
supported_languages = {} supported_languages = {}
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
options = dom.xpath('//*[@id="langSec"]//input[@name="lr"]') options = eval_xpath(dom, '//*[@id="langSec"]//input[@name="lr"]')
for option in options: for option in options:
code = option.xpath('./@value')[0].split('_')[-1] code = eval_xpath(option, './@value')[0].split('_')[-1]
name = option.xpath('./@data-name')[0].title() name = eval_xpath(option, './@data-name')[0].title()
supported_languages[code] = {"name": name} supported_languages[code] = {"name": name}
return supported_languages return supported_languages
...@@ -16,6 +16,7 @@ from datetime import datetime, timedelta ...@@ -16,6 +16,7 @@ from datetime import datetime, timedelta
import re import re
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
from searx.languages import language_codes from searx.languages import language_codes
from searx.utils import eval_xpath
# engine dependent config # engine dependent config
categories = ['general'] categories = ['general']
...@@ -70,8 +71,8 @@ def response(resp): ...@@ -70,8 +71,8 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
# parse results # parse results
for result in dom.xpath(results_xpath): for result in eval_xpath(dom, results_xpath):
links = result.xpath(link_xpath) links = eval_xpath(result, link_xpath)
if not links: if not links:
continue continue
link = links[0] link = links[0]
...@@ -87,8 +88,8 @@ def response(resp): ...@@ -87,8 +88,8 @@ def response(resp):
title = extract_text(link) title = extract_text(link)
if result.xpath(content_xpath): if eval_xpath(result, content_xpath):
content = extract_text(result.xpath(content_xpath)) content = extract_text(eval_xpath(result, content_xpath))
else: else:
content = '' content = ''
......
...@@ -16,7 +16,7 @@ from searx.poolrequests import get ...@@ -16,7 +16,7 @@ from searx.poolrequests import get
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode from searx.url_utils import urlencode
from searx.utils import match_language from searx.utils import match_language, eval_xpath
from json import loads from json import loads
from lxml.html import fromstring from lxml.html import fromstring
...@@ -57,22 +57,6 @@ language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator ...@@ -57,22 +57,6 @@ language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator
calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a' media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a'
# xpath_cache
xpath_cache = {}
def get_xpath(xpath_str):
result = xpath_cache.get(xpath_str, None)
if not result:
result = etree.XPath(xpath_str)
xpath_cache[xpath_str] = result
return result
def eval_xpath(element, xpath_str):
xpath = get_xpath(xpath_str)
return xpath(element)
def get_id_cache(result): def get_id_cache(result):
id_cache = {} id_cache = {}
......
from lxml import html from lxml import html
from lxml.etree import _ElementStringResult, _ElementUnicodeResult from lxml.etree import _ElementStringResult, _ElementUnicodeResult
from searx.utils import html_to_text from searx.utils import html_to_text, eval_xpath
from searx.url_utils import unquote, urlencode, urljoin, urlparse from searx.url_utils import unquote, urlencode, urljoin, urlparse
search_url = None search_url = None
...@@ -104,15 +104,15 @@ def response(resp): ...@@ -104,15 +104,15 @@ def response(resp):
results = [] results = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
if results_xpath: if results_xpath:
for result in dom.xpath(results_xpath): for result in eval_xpath(dom, results_xpath):
url = extract_url(result.xpath(url_xpath), search_url) url = extract_url(eval_xpath(result, url_xpath), search_url)
title = extract_text(result.xpath(title_xpath)) title = extract_text(eval_xpath(result, title_xpath))
content = extract_text(result.xpath(content_xpath)) content = extract_text(eval_xpath(result, content_xpath))
tmp_result = {'url': url, 'title': title, 'content': content} tmp_result = {'url': url, 'title': title, 'content': content}
# add thumbnail if available # add thumbnail if available
if thumbnail_xpath: if thumbnail_xpath:
thumbnail_xpath_result = result.xpath(thumbnail_xpath) thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath)
if len(thumbnail_xpath_result) > 0: if len(thumbnail_xpath_result) > 0:
tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url) tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)
...@@ -120,14 +120,14 @@ def response(resp): ...@@ -120,14 +120,14 @@ def response(resp):
else: else:
for url, title, content in zip( for url, title, content in zip(
(extract_url(x, search_url) for (extract_url(x, search_url) for
x in dom.xpath(url_xpath)), x in eval_xpath(dom, url_xpath)),
map(extract_text, dom.xpath(title_xpath)), map(extract_text, eval_xpath(dom, title_xpath)),
map(extract_text, dom.xpath(content_xpath)) map(extract_text, eval_xpath(dom, content_xpath))
): ):
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
if not suggestion_xpath: if not suggestion_xpath:
return results return results
for suggestion in dom.xpath(suggestion_xpath): for suggestion in eval_xpath(dom, suggestion_xpath):
results.append({'suggestion': extract_text(suggestion)}) results.append({'suggestion': extract_text(suggestion)})
return results return results
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
from lxml import html from lxml import html
from searx.engines.xpath import extract_text, extract_url from searx.engines.xpath import extract_text, extract_url
from searx.url_utils import unquote, urlencode from searx.url_utils import unquote, urlencode
from searx.utils import match_language from searx.utils import match_language, eval_xpath
# engine dependent config # engine dependent config
categories = ['general'] categories = ['general']
...@@ -109,21 +109,21 @@ def response(resp): ...@@ -109,21 +109,21 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
try: try:
results_num = int(dom.xpath('//div[@class="compPagination"]/span[last()]/text()')[0] results_num = int(eval_xpath(dom, '//div[@class="compPagination"]/span[last()]/text()')[0]
.split()[0].replace(',', '')) .split()[0].replace(',', ''))
results.append({'number_of_results': results_num}) results.append({'number_of_results': results_num})
except: except:
pass pass
# parse results # parse results
for result in dom.xpath(results_xpath): for result in eval_xpath(dom, results_xpath):
try: try:
url = parse_url(extract_url(result.xpath(url_xpath), search_url)) url = parse_url(extract_url(eval_xpath(result, url_xpath), search_url))
title = extract_text(result.xpath(title_xpath)[0]) title = extract_text(eval_xpath(result, title_xpath)[0])
except: except:
continue continue
content = extract_text(result.xpath(content_xpath)[0]) content = extract_text(eval_xpath(result, content_xpath)[0])
# append result # append result
results.append({'url': url, results.append({'url': url,
...@@ -131,7 +131,7 @@ def response(resp): ...@@ -131,7 +131,7 @@ def response(resp):
'content': content}) 'content': content})
# if no suggestion found, return results # if no suggestion found, return results
suggestions = dom.xpath(suggestion_xpath) suggestions = eval_xpath(dom, suggestion_xpath)
if not suggestions: if not suggestions:
return results return results
...@@ -148,9 +148,9 @@ def response(resp): ...@@ -148,9 +148,9 @@ def response(resp):
def _fetch_supported_languages(resp): def _fetch_supported_languages(resp):
supported_languages = [] supported_languages = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
options = dom.xpath('//div[@id="yschlang"]/span/label/input') options = eval_xpath(dom, '//div[@id="yschlang"]/span/label/input')
for option in options: for option in options:
code_parts = option.xpath('./@value')[0][5:].split('_') code_parts = eval_xpath(option, './@value')[0][5:].split('_')
if len(code_parts) == 2: if len(code_parts) == 2:
code = code_parts[0] + '-' + code_parts[1].upper() code = code_parts[0] + '-' + code_parts[1].upper()
else: else:
......
...@@ -67,8 +67,9 @@ def merge_two_infoboxes(infobox1, infobox2): ...@@ -67,8 +67,9 @@ def merge_two_infoboxes(infobox1, infobox2):
for url2 in infobox2.get('urls', []): for url2 in infobox2.get('urls', []):
unique_url = True unique_url = True
for url1 in infobox1.get('urls', []): parsed_url2 = urlparse(url2.get('url', ''))
if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))): for url1 in urls1:
if compare_urls(urlparse(url1.get('url', '')), parsed_url2):
unique_url = False unique_url = False
break break
if unique_url: if unique_url:
...@@ -188,8 +189,9 @@ class ResultContainer(object): ...@@ -188,8 +189,9 @@ class ResultContainer(object):
add_infobox = True add_infobox = True
infobox_id = infobox.get('id', None) infobox_id = infobox.get('id', None)
if infobox_id is not None: if infobox_id is not None:
parsed_url_infobox_id = urlparse(infobox_id)
for existingIndex in self.infoboxes: for existingIndex in self.infoboxes:
if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)): if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id):
merge_two_infoboxes(existingIndex, infobox) merge_two_infoboxes(existingIndex, infobox)
add_infobox = False add_infobox = False
......
...@@ -13,6 +13,7 @@ from numbers import Number ...@@ -13,6 +13,7 @@ from numbers import Number
from os.path import splitext, join from os.path import splitext, join
from io import open from io import open
from random import choice from random import choice
from lxml.etree import XPath
import sys import sys
import json import json
...@@ -51,6 +52,7 @@ ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE) ...@@ -51,6 +52,7 @@ ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__)) useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__))
+ "/data/useragents.json", 'r', encoding='utf-8').read()) + "/data/useragents.json", 'r', encoding='utf-8').read())
xpath_cache = dict()
lang_to_lc_cache = dict() lang_to_lc_cache = dict()
...@@ -450,3 +452,16 @@ def get_engine_from_settings(name): ...@@ -450,3 +452,16 @@ def get_engine_from_settings(name):
return engine return engine
return {} return {}
def get_xpath(xpath_str):
result = xpath_cache.get(xpath_str, None)
if result is None:
result = XPath(xpath_str)
xpath_cache[xpath_str] = result
return result
def eval_xpath(element, xpath_str):
xpath = get_xpath(xpath_str)
return xpath(element)
...@@ -154,20 +154,18 @@ outgoing_proxies = settings['outgoing'].get('proxies') or None ...@@ -154,20 +154,18 @@ outgoing_proxies = settings['outgoing'].get('proxies') or None
@babel.localeselector @babel.localeselector
def get_locale(): def get_locale():
locale = request.accept_languages.best_match(settings['locales'].keys()) if 'locale' in request.form\
and request.form['locale'] in settings['locales']:
if request.preferences.get_value('locale') != '': return request.form['locale']
locale = request.preferences.get_value('locale')
if 'locale' in request.args\ if 'locale' in request.args\
and request.args['locale'] in settings['locales']: and request.args['locale'] in settings['locales']:
locale = request.args['locale'] return request.args['locale']
if 'locale' in request.form\ if request.preferences.get_value('locale') != '':
and request.form['locale'] in settings['locales']: return request.preferences.get_value('locale')
locale = request.form['locale']
return locale return request.accept_languages.best_match(settings['locales'].keys())
# code-highlighter # code-highlighter
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment