Commit 6e0285b2 authored by Dalf's avatar Dalf

[fix] wikidata engine: faster processing, remove one HTTP redirection.

* Search URL is https://www.wikidata.org/w/index.php?{query}&ns0=1 (with ns0=1 at the end to avoid an HTTP redirection)
* url_detail: remove the disabletidy=1 deprecated parameter
* Add eval_xpath function: compile once for all xpath.
* Add get_id_cache: retrieve all HTML with an id, avoid the slow to process dynamic xpath '//div[@id="{propertyid}"]'.replace('{propertyid}')
* Create an etree.HTMLParser() instead of using the global one (see #1575)
parent cbc5e132
This diff is collapsed.
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from lxml.html import fromstring from lxml.html import fromstring
from lxml import etree
from collections import defaultdict from collections import defaultdict
import mock import mock
from searx.engines import wikidata from searx.engines import wikidata
...@@ -30,12 +31,12 @@ class TestWikidataEngine(SearxTestCase): ...@@ -30,12 +31,12 @@ class TestWikidataEngine(SearxTestCase):
wikidata.supported_languages = ['en', 'es'] wikidata.supported_languages = ['en', 'es']
wikidata.language_aliases = {} wikidata.language_aliases = {}
response = mock.Mock(text='<html></html>', search_params={"language": "en"}) response = mock.Mock(content='<html></html>'.encode("utf-8"), search_params={"language": "en"})
self.assertEqual(wikidata.response(response), []) self.assertEqual(wikidata.response(response), [])
def test_getDetail(self): def test_getDetail(self):
response = {} response = {}
results = wikidata.getDetail(response, "Q123", "en", "en-US") results = wikidata.getDetail(response, "Q123", "en", "en-US", etree.HTMLParser())
self.assertEqual(results, []) self.assertEqual(results, [])
title_html = '<div><div class="wikibase-title-label">Test</div></div>' title_html = '<div><div class="wikibase-title-label">Test</div></div>'
...@@ -52,7 +53,7 @@ class TestWikidataEngine(SearxTestCase): ...@@ -52,7 +53,7 @@ class TestWikidataEngine(SearxTestCase):
""" """
response = {"parse": {"displaytitle": title_html, "text": html}} response = {"parse": {"displaytitle": title_html, "text": html}}
results = wikidata.getDetail(response, "Q123", "en", "en-US") results = wikidata.getDetail(response, "Q123", "en", "en-US", etree.HTMLParser())
self.assertEqual(len(results), 1) self.assertEqual(len(results), 1)
self.assertEqual(results[0]['url'], 'https://en.wikipedia.org/wiki/Test') self.assertEqual(results[0]['url'], 'https://en.wikipedia.org/wiki/Test')
...@@ -92,7 +93,7 @@ class TestWikidataEngine(SearxTestCase): ...@@ -92,7 +93,7 @@ class TestWikidataEngine(SearxTestCase):
""" """
response = {"parse": {"displaytitle": title_html, "text": html}} response = {"parse": {"displaytitle": title_html, "text": html}}
results = wikidata.getDetail(response, "Q123", "yua", "yua_MX") results = wikidata.getDetail(response, "Q123", "yua", "yua_MX", etree.HTMLParser())
self.assertEqual(len(results), 2) self.assertEqual(len(results), 2)
self.assertEqual(results[0]['title'], 'Official website') self.assertEqual(results[0]['title'], 'Official website')
self.assertEqual(results[0]['url'], 'https://officialsite.com') self.assertEqual(results[0]['url'], 'https://officialsite.com')
...@@ -139,8 +140,8 @@ class TestWikidataEngine(SearxTestCase): ...@@ -139,8 +140,8 @@ class TestWikidataEngine(SearxTestCase):
</div> </div>
""" """
html_etree = fromstring(html) html_etree = fromstring(html)
id_cache = wikidata.get_id_cache(html_etree)
image_src = wikidata.add_image(html_etree) image_src = wikidata.add_image(id_cache)
self.assertEqual(image_src, self.assertEqual(image_src,
"https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500&height=400") "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500&height=400")
...@@ -197,8 +198,9 @@ class TestWikidataEngine(SearxTestCase): ...@@ -197,8 +198,9 @@ class TestWikidataEngine(SearxTestCase):
</div> </div>
""" """
html_etree = fromstring(html) html_etree = fromstring(html)
id_cache = wikidata.get_id_cache(html_etree)
image_src = wikidata.add_image(html_etree) image_src = wikidata.add_image(id_cache)
self.assertEqual(image_src, self.assertEqual(image_src,
"https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500&height=400") "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500&height=400")
...@@ -232,11 +234,12 @@ class TestWikidataEngine(SearxTestCase): ...@@ -232,11 +234,12 @@ class TestWikidataEngine(SearxTestCase):
""" """
attributes = [] attributes = []
html_etree = fromstring(html) html_etree = fromstring(html)
id_cache = wikidata.get_id_cache(html_etree)
wikidata.add_attribute(attributes, html_etree, "Fail") wikidata.add_attribute(attributes, id_cache, "Fail")
self.assertEqual(attributes, []) self.assertEqual(attributes, [])
wikidata.add_attribute(attributes, html_etree, "P27") wikidata.add_attribute(attributes, id_cache, "P27")
self.assertEqual(len(attributes), 1) self.assertEqual(len(attributes), 1)
self.assertEqual(attributes[0]["label"], "Country of citizenship") self.assertEqual(attributes[0]["label"], "Country of citizenship")
self.assertEqual(attributes[0]["value"], "United Kingdom") self.assertEqual(attributes[0]["value"], "United Kingdom")
...@@ -271,7 +274,8 @@ class TestWikidataEngine(SearxTestCase): ...@@ -271,7 +274,8 @@ class TestWikidataEngine(SearxTestCase):
""" """
attributes = [] attributes = []
html_etree = fromstring(html) html_etree = fromstring(html)
wikidata.add_attribute(attributes, html_etree, "P569", date=True) id_cache = wikidata.get_id_cache(html_etree)
wikidata.add_attribute(attributes, id_cache, "P569", date=True)
self.assertEqual(len(attributes), 1) self.assertEqual(len(attributes), 1)
self.assertEqual(attributes[0]["label"], "Date of birth") self.assertEqual(attributes[0]["label"], "Date of birth")
self.assertEqual(attributes[0]["value"], "27 January 1832") self.assertEqual(attributes[0]["value"], "27 January 1832")
...@@ -319,14 +323,16 @@ class TestWikidataEngine(SearxTestCase): ...@@ -319,14 +323,16 @@ class TestWikidataEngine(SearxTestCase):
""" """
attributes = [] attributes = []
html_etree = fromstring(html) html_etree = fromstring(html)
wikidata.add_attribute(attributes, html_etree, "P6") id_cache = wikidata.get_id_cache(html_etree)
wikidata.add_attribute(attributes, id_cache, "P6")
self.assertEqual(len(attributes), 1) self.assertEqual(len(attributes), 1)
self.assertEqual(attributes[0]["label"], "Head of government") self.assertEqual(attributes[0]["label"], "Head of government")
self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister") self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister")
attributes = [] attributes = []
html_etree = fromstring(html) html_etree = fromstring(html)
wikidata.add_attribute(attributes, html_etree, "P6", trim=True) id_cache = wikidata.get_id_cache(html_etree)
wikidata.add_attribute(attributes, id_cache, "P6", trim=True)
self.assertEqual(len(attributes), 1) self.assertEqual(len(attributes), 1)
self.assertEqual(attributes[0]["value"], "Actual Prime Minister") self.assertEqual(attributes[0]["value"], "Actual Prime Minister")
...@@ -357,12 +363,13 @@ class TestWikidataEngine(SearxTestCase): ...@@ -357,12 +363,13 @@ class TestWikidataEngine(SearxTestCase):
""" """
urls = [] urls = []
html_etree = fromstring(html) html_etree = fromstring(html)
wikidata.add_url(urls, html_etree, 'P856') id_cache = wikidata.get_id_cache(html_etree)
wikidata.add_url(urls, html_etree, id_cache, 'P856')
self.assertEquals(len(urls), 1) self.assertEquals(len(urls), 1)
self.assertIn({'title': 'Official website', 'url': 'https://searx.me/'}, urls) self.assertIn({'title': 'Official website', 'url': 'https://searx.me/'}, urls)
urls = [] urls = []
results = [] results = []
wikidata.add_url(urls, html_etree, 'P856', 'custom label', results=results) wikidata.add_url(urls, html_etree, id_cache, 'P856', 'custom label', results=results)
self.assertEquals(len(urls), 1) self.assertEquals(len(urls), 1)
self.assertEquals(len(results), 1) self.assertEquals(len(results), 1)
self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, urls) self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, urls)
...@@ -405,7 +412,8 @@ class TestWikidataEngine(SearxTestCase): ...@@ -405,7 +412,8 @@ class TestWikidataEngine(SearxTestCase):
""" """
urls = [] urls = []
html_etree = fromstring(html) html_etree = fromstring(html)
wikidata.add_url(urls, html_etree, 'P856') id_cache = wikidata.get_id_cache(html_etree)
wikidata.add_url(urls, html_etree, id_cache, 'P856')
self.assertEquals(len(urls), 2) self.assertEquals(len(urls), 2)
self.assertIn({'title': 'Official website', 'url': 'http://www.worldofwarcraft.com'}, urls) self.assertIn({'title': 'Official website', 'url': 'http://www.worldofwarcraft.com'}, urls)
self.assertIn({'title': 'Official website', 'url': 'http://eu.battle.net/wow/en/'}, urls) self.assertIn({'title': 'Official website', 'url': 'http://eu.battle.net/wow/en/'}, urls)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment