From 772c048d01c7585fd60afca1ce30a1914e6e5b4a Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Wed, 28 Feb 2018 22:30:48 -0600 Subject: refactor engine's search language handling Add match_language function in utils to match any user given language code with a list of engine's supported languages. Also add language_aliases dict on each engine to translate standard language codes into the custom codes used by the engine. --- searx/engines/wikidata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'searx/engines/wikidata.py') diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 1f31a1f88..1fdbc9869 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -16,6 +16,7 @@ from searx.poolrequests import get from searx.engines.xpath import extract_text from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url from searx.url_utils import urlencode +from searx.utils import match_language from json import loads from lxml.html import fromstring @@ -56,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' def request(query, params): - language = params['language'].split('-')[0] + language = match_language(params['language'], supported_languages).split('-')[0] params['url'] = url_search.format( query=urlencode({'label': query, 'language': language})) @@ -68,7 +69,7 @@ def response(resp): html = fromstring(resp.text) wikidata_ids = html.xpath(wikidata_ids_xpath) - language = resp.search_params['language'].split('-')[0] + language = match_language(resp.search_params['language'], supported_languages).split('-')[0] # TODO: make requests asynchronous to avoid timeout when result_count > 1 for wikidata_id in wikidata_ids[:result_count]: -- cgit v1.2.3 From b12857a70dd947a804e667d864ba56055b528ee0 Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Sun, 8 Apr 2018 21:17:00 -0500 Subject: [fix] make search requests on wikidata more accurate --- searx/engines/wikidata.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'searx/engines/wikidata.py') diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 1fdbc9869..fe53609c1 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -27,7 +27,7 @@ result_count = 1 # urls wikidata_host = 'https://www.wikidata.org' url_search = wikidata_host \ - + '/wiki/Special:ItemDisambiguation?{query}' + + '/w/index.php?{query}' wikidata_api = wikidata_host + '/w/api.php' url_detail = wikidata_api\ @@ -40,7 +40,7 @@ url_map = 'https://www.openstreetmap.org/'\ url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400' # xpaths -wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title' +wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href' title_xpath = '//*[contains(@class,"wikibase-title-label")]' description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]' property_xpath = '//div[@id="{propertyid}"]' @@ -57,22 +57,21 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' def request(query, params): - language = match_language(params['language'], supported_languages).split('-')[0] - params['url'] = url_search.format( - query=urlencode({'label': query, 'language': language})) + query=urlencode({'search': query})) return params def response(resp): results = [] html = fromstring(resp.text) - wikidata_ids = html.xpath(wikidata_ids_xpath) + search_results = html.xpath(wikidata_ids_xpath) language = match_language(resp.search_params['language'], supported_languages).split('-')[0] # TODO: make requests asynchronous to avoid timeout when result_count > 1 - for wikidata_id in wikidata_ids[:result_count]: + for search_result in search_results[:result_count]: + wikidata_id = search_result.split('/')[-1] url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language})) htmlresponse = get(url) jsonresponse = loads(htmlresponse.text) -- cgit v1.2.3 From acaef6600e34159d2edb7bf0ef6b5f34471136e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Bourrel?= Date: Thu, 5 Jul 2018 10:11:45 +0200 Subject: Update path to wikidata image --- searx/engines/wikidata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'searx/engines/wikidata.py') diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index fe53609c1..ffc1c8d0f 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -54,7 +54,7 @@ value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\ + '/*/div[contains(@class,"wikibase-snakview-value")]' language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]' calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' - +media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a' def request(query, params): params['url'] = url_search.format( @@ -313,7 +313,7 @@ def add_image(result): for property_id in property_ids: image = result.xpath(property_xpath.replace('{propertyid}', property_id)) if image: - image_name = image[0].xpath(value_xpath) + image_name = image[0].xpath(media_xpath) image_src = url_image.replace('{filename}', extract_text(image_name[0])) return image_src -- cgit v1.2.3 From 7a474db61bd9ba9a08111758b058f81cb5175db4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Bourrel?= Date: Fri, 6 Jul 2018 10:31:01 +0200 Subject: Fix formatting --- searx/engines/wikidata.py | 1 + 1 file changed, 1 insertion(+) (limited to 'searx/engines/wikidata.py') diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index ffc1c8d0f..c315b30da 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -56,6 +56,7 @@ language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a' + def request(query, params): params['url'] = url_search.format( query=urlencode({'search': query})) -- cgit v1.2.3