Diffstat (limited to 'searx/engines')
-rw-r--r--  searx/engines/__init__.py            |  6
-rw-r--r--  searx/engines/bandcamp.py            | 73
-rw-r--r--  searx/engines/dictzone.py            |  2
-rw-r--r--  searx/engines/duckduckgo.py          |  2
-rw-r--r--  searx/engines/duckduckgo_images.py   |  2
-rw-r--r--  searx/engines/elasticsearch.py       |  3
-rw-r--r--  searx/engines/gigablast.py           |  2
-rw-r--r--  searx/engines/google.py              |  7
-rw-r--r--  searx/engines/meilisearch.py         | 59
-rw-r--r--  searx/engines/pubmed.py              |  2
-rw-r--r--  searx/engines/qwant.py               |  2
-rw-r--r--  searx/engines/seznam.py              |  7
-rw-r--r--  searx/engines/sjp.py                 | 92
-rw-r--r--  searx/engines/soundcloud.py          |  2
-rw-r--r--  searx/engines/spotify.py             |  5
-rw-r--r--  searx/engines/stackoverflow.py       |  5
-rw-r--r--  searx/engines/wikidata.py            |  2
-rw-r--r--  searx/engines/wikipedia.py           |  2
-rw-r--r--  searx/engines/wolframalpha_noapi.py  |  2
-rw-r--r--  searx/engines/wordnik.py             | 77
-rw-r--r--  searx/engines/yacy.py                |  4
-rw-r--r--  searx/engines/yggtorrent.py          |  2
22 files changed, 329 insertions, 31 deletions
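
Every file below follows one theme: engine HTTP traffic moves from the requests-based searx.poolrequests module to the new httpx-based searx.network module, and per-engine proxy handling is replaced by the network initialization in __init__.py. As a rough sketch of what an engine-side fetch looks like after the change — assuming searx.network.get keeps the requests-like call shape; the URL and helper name are illustrative, not part of the commit:

    # illustrative only; not part of the commit
    from searx.network import get

    def fetch_language_list():
        # drop-in replacement for the old searx.poolrequests.get call
        resp = get('https://example.org/languages.json')
        return resp.json()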
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 2238ea1b9..95eda6dde 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -27,7 +27,7 @@ from searx import settings
 from searx import logger
 from searx.data import ENGINES_LANGUAGES
 from searx.exceptions import SearxEngineResponseException
-from searx.poolrequests import get, get_proxy_cycles
+from searx.network import get, initialize as initialize_network, set_context_network_name
 from searx.utils import load_module, match_language, get_engine_from_settings, gen_useragent
@@ -89,8 +89,6 @@ def load_engine(engine_data):
                 engine.categories = []
             else:
                 engine.categories = list(map(str.strip, param_value.split(',')))
-        elif param_name == 'proxies':
-            engine.proxies = get_proxy_cycles(param_value)
         else:
             setattr(engine, param_name, param_value)
@@ -289,9 +287,11 @@ def load_engines(engine_list):

 def initialize_engines(engine_list):
     load_engines(engine_list)
+    initialize_network(engine_list, settings['outgoing'])

     def engine_init(engine_name, init_fn):
         try:
+            set_context_network_name(engine_name)
             init_fn(get_engine_from_settings(engine_name))
         except SearxEngineResponseException as exc:
             logger.warn('%s engine: Fail to initialize // %s', engine_name, exc)
diff --git a/searx/engines/bandcamp.py b/searx/engines/bandcamp.py
new file mode 100644
index 000000000..dafb3ee16
--- /dev/null
+++ b/searx/engines/bandcamp.py
@@ -0,0 +1,73 @@
+"""
+Bandcamp (Music)
+
+@website     https://bandcamp.com/
+@provide-api no
+@results     HTML
+@parse       url, title, content, publishedDate, embedded, thumbnail
+"""
+
+from urllib.parse import urlencode, urlparse, parse_qs
+from dateutil.parser import parse as dateparse
+from lxml import html
+from searx.utils import extract_text
+
+categories = ['music']
+paging = True
+
+base_url = "https://bandcamp.com/"
+search_string = 'search?{query}&page={page}'
+embedded_url = '''<iframe width="100%" height="166"
+    scrolling="no" frameborder="no"
+    data-src="https://bandcamp.com/EmbeddedPlayer/{type}={result_id}/size=large/bgcol=ffffff/linkcol=0687f5/tracklist=false/artwork=small/transparent=true/"
+></iframe>'''
+
+
+def request(query, params):
+    '''pre-request callback
+
+    params<dict>:
+        method   : POST/GET
+        headers  : {}
+        data     : {}  # if method == POST
+        url      : ''
+        category : 'search category'
+        pageno   : 1  # number of the requested page
+    '''
+
+    search_path = search_string.format(
+        query=urlencode({'q': query}),
+        page=params['pageno'])
+
+    params['url'] = base_url + search_path
+
+    return params
+
+
+def response(resp):
+    '''post-response callback
+
+    resp: requests response object
+    '''
+    results = []
+    tree = html.fromstring(resp.text)
+    search_results = tree.xpath('//li[contains(@class, "searchresult")]')
+    for result in search_results:
+        link = result.xpath('.//div[@class="itemurl"]/a')[0]
+        result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0]
+        title = result.xpath('.//div[@class="heading"]/a/text()')
+        date = dateparse(result.xpath('.//div[@class="released"]/text()')[0].replace("released ", ""))
+        content = result.xpath('.//div[@class="subhead"]/text()')
+        new_result = {
+            "url": extract_text(link),
+            "title": extract_text(title),
+            "content": extract_text(content),
+            "publishedDate": date,
+        }
+        thumbnail = result.xpath('.//div[@class="art"]/img/@src')
+        if thumbnail:
+            new_result['thumbnail'] = thumbnail[0]
+        if "album" in result.classes:
+            new_result["embedded"] = embedded_url.format(type='album', result_id=result_id)
+        elif "track" in result.classes:
+            new_result["embedded"] = embedded_url.format(type='track', result_id=result_id)
+        results.append(new_result)
+    return results
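
The embedded-player id in bandcamp.py comes from the search_item_id query parameter of each result link. A standalone sketch of that extraction, using a made-up result URL for illustration:

    from urllib.parse import urlparse, parse_qs

    # hypothetical Bandcamp result link; real hrefs come from the itemurl div
    href = 'https://bandcamp.com/search_item?search_item_id=12345&search_item_type=a'
    result_id = parse_qs(urlparse(href).query)['search_item_id'][0]
    print(result_id)  # -> '12345'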
diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
index 2483c0805..eaa8b6ab4 100644
--- a/searx/engines/dictzone.py
+++ b/searx/engines/dictzone.py
@@ -52,7 +52,7 @@ def response(resp):
             to_results.append(to_result.text_content())

         results.append({
-            'url': urljoin(resp.url, '?%d' % k),
+            'url': urljoin(str(resp.url), '?%d' % k),
             'title': from_result.text_content(),
             'content': '; '.join(to_results)
         })
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index ae1e36686..3c086f81b 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -6,7 +6,7 @@
 from lxml.html import fromstring
 from json import loads
 from searx.utils import extract_text, match_language, eval_xpath, dict_subset
-from searx.poolrequests import get
+from searx.network import get

 # about
 about = {
diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py
index 305eb1ca1..0daaf41e9 100644
--- a/searx/engines/duckduckgo_images.py
+++ b/searx/engines/duckduckgo_images.py
@@ -8,7 +8,7 @@ from urllib.parse import urlencode
 from searx.exceptions import SearxEngineAPIException
 from searx.engines.duckduckgo import get_region_code
 from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
-from searx.poolrequests import get
+from searx.network import get

 # about
 about = {
diff --git a/searx/engines/elasticsearch.py b/searx/engines/elasticsearch.py
index da7f98074..db84a5c13 100644
--- a/searx/engines/elasticsearch.py
+++ b/searx/engines/elasticsearch.py
@@ -4,7 +4,6 @@
 """

 from json import loads, dumps
-from requests.auth import HTTPBasicAuth
 from searx.exceptions import SearxEngineAPIException

@@ -32,7 +31,7 @@ def request(query, params):
         return params

     if username and password:
-        params['auth'] = HTTPBasicAuth(username, password)
+        params['auth'] = (username, password)

     params['url'] = search_url
     params['method'] = 'GET'
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 248991df9..bbd9e20d2 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -8,7 +8,7 @@ import re
 from json import loads
 from urllib.parse import urlencode
 # from searx import logger
-from searx.poolrequests import get
+from searx.network import get

 # about
 about = {
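
The elasticsearch.py change relies on httpx accepting a plain (username, password) tuple for basic auth, so requests' HTTPBasicAuth wrapper is no longer needed. A sketch of the same call made directly with httpx; the host and credentials are placeholders:

    import httpx

    # httpx turns the tuple into a Basic authorization header
    resp = httpx.get('http://localhost:9200/my-index/_search',
                     auth=('elastic', 'changeme'))
    print(resp.status_code)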
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 8c20029a3..a4aee5c20 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -10,7 +10,7 @@ Definitions`_.

 # pylint: disable=invalid-name, missing-function-docstring

-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode
 from lxml import html
 from searx import logger
 from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
@@ -186,8 +186,7 @@ def get_lang_info(params, lang_list, custom_aliases):
     return ret_val

 def detect_google_sorry(resp):
-    resp_url = urlparse(resp.url)
-    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
+    if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
         raise SearxEngineCaptchaException()

@@ -243,7 +242,7 @@ def response(resp):
     if answer:
         results.append({'answer': ' '.join(answer)})
     else:
-        logger.debug("did not found 'answer'")
+        logger.debug("did not find 'answer'")

     # results --> number_of_results
     try:
diff --git a/searx/engines/meilisearch.py b/searx/engines/meilisearch.py
new file mode 100644
index 000000000..4e0ff15f3
--- /dev/null
+++ b/searx/engines/meilisearch.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+ Meilisearch
+"""
+
+# pylint: disable=global-statement, missing-function-docstring
+
+from json import loads, dumps
+
+
+base_url = 'http://localhost:7700'
+index = ''
+auth_key = ''
+facet_filters = list()
+_search_url = ''
+result_template = 'key-value.html'
+categories = ['general']
+paging = True
+
+
+def init(_):
+    if index == '':
+        raise ValueError('index cannot be empty')
+
+    global _search_url
+    _search_url = base_url + '/indexes/' + index + '/search'
+
+
+def request(query, params):
+    if auth_key != '':
+        params['headers']['X-Meili-API-Key'] = auth_key
+
+    params['headers']['Content-Type'] = 'application/json'
+    params['url'] = _search_url
+    params['method'] = 'POST'
+
+    data = {
+        'q': query,
+        'offset': 10 * (params['pageno'] - 1),
+        'limit': 10,
+    }
+    if len(facet_filters) > 0:
+        data['facetFilters'] = facet_filters
+
+    params['data'] = dumps(data)
+
+    return params
+
+
+def response(resp):
+    results = []
+
+    resp_json = loads(resp.text)
+    for result in resp_json['hits']:
+        r = {key: str(value) for key, value in result.items()}
+        r['template'] = result_template
+        results.append(r)
+
+    return results
diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py
index da02f91ca..5d88d398e 100644
--- a/searx/engines/pubmed.py
+++ b/searx/engines/pubmed.py
@@ -7,7 +7,7 @@ from flask_babel import gettext
 from lxml import etree
 from datetime import datetime
 from urllib.parse import urlencode
-from searx.poolrequests import get
+from searx.network import get

 # about
 about = {
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
index 13dcf1250..d01dc0acc 100644
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@@ -8,7 +8,7 @@ from json import loads
 from urllib.parse import urlencode
 from searx.utils import html_to_text, match_language
 from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
-from searx.raise_for_httperror import raise_for_httperror
+from searx.network import raise_for_httperror

 # about
 about = {
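
The new meilisearch.py engine POSTs a JSON body to the index's /search endpoint and pages by recomputing the offset from the page number (10 results per page). The equivalent request issued directly; the host, index name, and key here are placeholders:

    import httpx

    # page 2 -> offset 10, mirroring request() above
    resp = httpx.post('http://localhost:7700/indexes/my-index/search',
                      headers={'X-Meili-API-Key': 'my-key'},
                      json={'q': 'searx', 'offset': 10, 'limit': 10})
    print(resp.json().get('hits', []))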
diff --git a/searx/engines/seznam.py b/searx/engines/seznam.py
index faceb0550..042088dbe 100644
--- a/searx/engines/seznam.py
+++ b/searx/engines/seznam.py
@@ -3,9 +3,9 @@
 Seznam
 """

-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode
 from lxml import html
-from searx.poolrequests import get
+from searx.network import get
 from searx.exceptions import SearxEngineAccessDeniedException
 from searx.utils import (
     extract_text,
@@ -46,8 +46,7 @@ def request(query, params):

 def response(resp):
-    resp_url = urlparse(resp.url)
-    if resp_url.path.startswith('/verify'):
+    if resp.url.path.startswith('/verify'):
         raise SearxEngineAccessDeniedException()

     results = []
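
google.py, seznam.py, and stackoverflow.py all drop urlparse for the same reason: an httpx response exposes an already-parsed URL object, so host and path are attributes rather than something to parse out of a string. A sketch of the pattern, assuming resp.url behaves like httpx.URL:

    from httpx import URL

    url = URL('https://search.seznam.cz/verify?q=test')
    # replaces the old urlparse(resp.url).path check
    if url.path.startswith('/verify'):
        print('redirected to the verification page')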
diff --git a/searx/engines/sjp.py b/searx/engines/sjp.py
new file mode 100644
index 000000000..eff7b7092
--- /dev/null
+++ b/searx/engines/sjp.py
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Słownik Języka Polskiego (general)
+
+"""
+
+from lxml.html import fromstring
+from searx import logger
+from searx.utils import extract_text
+from searx.raise_for_httperror import raise_for_httperror
+
+logger = logger.getChild('sjp engine')
+
+# about
+about = {
+    "website": 'https://sjp.pwn.pl',
+    "wikidata_id": 'Q55117369',
+    "official_api_documentation": None,
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'HTML',
+}
+
+categories = ['general']
+paging = False
+
+URL = 'https://sjp.pwn.pl'
+SEARCH_URL = URL + '/szukaj/{query}.html'
+
+word_xpath = '//div[@class="query"]'
+dict_xpath = ['//div[@class="wyniki sjp-so-wyniki sjp-so-anchor"]',
+              '//div[@class="wyniki sjp-wyniki sjp-anchor"]',
+              '//div[@class="wyniki sjp-doroszewski-wyniki sjp-doroszewski-anchor"]']
+
+
+def request(query, params):
+    params['url'] = SEARCH_URL.format(query=query)
+    logger.debug(f"query_url --> {params['url']}")
+    return params
+
+
+def response(resp):
+    results = []
+
+    raise_for_httperror(resp)
+    dom = fromstring(resp.text)
+    word = extract_text(dom.xpath(word_xpath))
+
+    definitions = []
+
+    for dict_src in dict_xpath:
+        for src in dom.xpath(dict_src):
+            src_text = extract_text(src.xpath('.//span[@class="entry-head-title"]/text()')).strip()
+
+            src_defs = []
+            for def_item in src.xpath('.//div[contains(@class, "ribbon-element")]'):
+                if def_item.xpath('./div[@class="znacz"]'):
+                    sub_defs = []
+                    for def_sub_item in def_item.xpath('./div[@class="znacz"]'):
+                        def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ')
+                        sub_defs.append(def_sub_text)
+                    src_defs.append((word, sub_defs))
+                else:
+                    def_text = extract_text(def_item).strip()
+                    def_link = def_item.xpath('./span/a/@href')
+                    if 'doroszewski' in def_link[0]:
+                        def_text = f"<a href='{def_link[0]}'>{def_text}</a>"
+                    src_defs.append((def_text, ''))
+
+            definitions.append((src_text, src_defs))
+
+    if not definitions:
+        return results
+
+    infobox = ''
+    for src in definitions:
+        infobox += f"<div><small>{src[0]}</small>"
+        infobox += "<ul>"
+        for (def_text, sub_def) in src[1]:
+            infobox += f"<li>{def_text}</li>"
+            if sub_def:
+                infobox += "<ol>"
+                for sub_def_text in sub_def:
+                    infobox += f"<li>{sub_def_text}</li>"
+                infobox += "</ol>"
+        infobox += "</ul></div>"
+
+    results.append({
+        'infobox': word,
+        'content': infobox,
+    })
+
+    return results
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index b3e3383bd..a6f923855 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -9,7 +9,7 @@ from lxml import html
 from dateutil import parser
 from urllib.parse import quote_plus, urlencode
 from searx import logger
-from searx.poolrequests import get as http_get
+from searx.network import get as http_get

 # about
 about = {
diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py
index 0ad8bfe32..6816fe672 100644
--- a/searx/engines/spotify.py
+++ b/searx/engines/spotify.py
@@ -5,9 +5,10 @@

 from json import loads
 from urllib.parse import urlencode
-import requests
 import base64

+from searx.network import post as http_post
+
 # about
 about = {
     "website": 'https://www.spotify.com',
@@ -38,7 +39,7 @@ def request(query, params):

     params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset)

-    r = requests.post(
+    r = http_post(
         'https://accounts.spotify.com/api/token',
         data={'grant_type': 'client_credentials'},
         headers={'Authorization': 'Basic ' + base64.b64encode(
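
spotify.py keeps Spotify's standard client-credentials OAuth flow and only reroutes the token request through the shared network layer. The flow itself: base64-encode 'client_id:client_secret' into a Basic header and POST the grant type. A sketch with placeholder credentials:

    import base64
    import httpx

    client_id = 'my-client-id'          # placeholder
    client_secret = 'my-client-secret'  # placeholder
    token = base64.b64encode(f'{client_id}:{client_secret}'.encode()).decode()
    resp = httpx.post('https://accounts.spotify.com/api/token',
                      data={'grant_type': 'client_credentials'},
                      headers={'Authorization': 'Basic ' + token})
    # a successful answer carries an access_token field
    print(resp.json().get('access_token'))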
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
index 91eaa68e9..8fc2cdb3a 100644
--- a/searx/engines/stackoverflow.py
+++ b/searx/engines/stackoverflow.py
@@ -3,7 +3,7 @@
  Stackoverflow (IT)
 """

-from urllib.parse import urlencode, urljoin, urlparse
+from urllib.parse import urlencode, urljoin
 from lxml import html
 from searx.utils import extract_text
 from searx.exceptions import SearxEngineCaptchaException
@@ -41,8 +41,7 @@ def request(query, params):

 # get response from search-request
 def response(resp):
-    resp_url = urlparse(resp.url)
-    if resp_url.path.startswith('/nocaptcha'):
+    if resp.url.path.startswith('/nocaptcha'):
         raise SearxEngineCaptchaException()

     results = []
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index c8e4cfae6..ddcce9085 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -12,7 +12,7 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_format

 from searx import logger
 from searx.data import WIKIDATA_UNITS
-from searx.poolrequests import post, get
+from searx.network import post, get
 from searx.utils import match_language, searx_useragent, get_string_replaces_function
 from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index 3ad8748fb..5e34db9a7 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -7,7 +7,7 @@ from urllib.parse import quote
 from json import loads
 from lxml.html import fromstring
 from searx.utils import match_language, searx_useragent
-from searx.raise_for_httperror import raise_for_httperror
+from searx.network import raise_for_httperror

 # about
 about = {
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
index 8e427d575..1f2cfa4e6 100644
--- a/searx/engines/wolframalpha_noapi.py
+++ b/searx/engines/wolframalpha_noapi.py
@@ -7,7 +7,7 @@
 from json import loads
 from time import time
 from urllib.parse import urlencode
-from searx.poolrequests import get as http_get
+from searx.network import get as http_get

 # about
 about = {
diff --git a/searx/engines/wordnik.py b/searx/engines/wordnik.py
new file mode 100644
index 000000000..4bfeb4070
--- /dev/null
+++ b/searx/engines/wordnik.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Wordnik (general)
+
+"""
+
+from lxml.html import fromstring
+from searx import logger
+from searx.utils import extract_text
+from searx.network import raise_for_httperror
+
+logger = logger.getChild('Wordnik engine')
+
+# about
+about = {
+    "website": 'https://www.wordnik.com',
+    "wikidata_id": 'Q8034401',
+    "official_api_documentation": None,
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'HTML',
+}
+
+categories = ['general']
+paging = False
+
+URL = 'https://www.wordnik.com'
+SEARCH_URL = URL + '/words/{query}'
+
+
+def request(query, params):
+    params['url'] = SEARCH_URL.format(query=query)
+    logger.debug(f"query_url --> {params['url']}")
+    return params
+
+
+def response(resp):
+    results = []
+
+    raise_for_httperror(resp)
+    dom = fromstring(resp.text)
+    word = extract_text(dom.xpath('//*[@id="headword"]/text()'))
+
+    definitions = []
+    for src in dom.xpath('//*[@id="define"]//h3[@class="source"]'):
+        src_text = extract_text(src).strip()
+        if src_text.startswith('from '):
+            src_text = src_text[5:]
+
+        src_defs = []
+        for def_item in src.xpath('following-sibling::ul[1]/li'):
+            def_abbr = extract_text(def_item.xpath('.//abbr')).strip()
+            def_text = extract_text(def_item).strip()
+            if def_abbr:
+                def_text = def_text[len(def_abbr):].strip()
+            src_defs.append((def_abbr, def_text))
+
+        definitions.append((src_text, src_defs))
+
+    if not definitions:
+        return results
+
+    infobox = ''
+    for src_text, src_defs in definitions:
+        infobox += f"<small>{src_text}</small>"
+        infobox += "<ul>"
+        for def_abbr, def_text in src_defs:
+            if def_abbr:
+                def_abbr += ": "
+            infobox += f"<li><i>{def_abbr}</i> {def_text}</li>"
+        infobox += "</ul>"
+
+    results.append({
+        'infobox': word,
+        'content': infobox,
+    })
+
+    return results
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
index c194ca451..fbd99c47b 100644
--- a/searx/engines/yacy.py
+++ b/searx/engines/yacy.py
@@ -7,7 +7,7 @@ from json import loads
 from dateutil import parser
 from urllib.parse import urlencode
-from requests.auth import HTTPDigestAuth
+from httpx import DigestAuth

 from searx.utils import html_to_text

@@ -56,7 +56,7 @@ def request(query, params):
                                           search_type=search_type)

     if http_digest_auth_user and http_digest_auth_pass:
-        params['auth'] = HTTPDigestAuth(http_digest_auth_user, http_digest_auth_pass)
+        params['auth'] = DigestAuth(http_digest_auth_user, http_digest_auth_pass)

     # add language tag if specified
     if params['language'] != 'all':
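
yacy.py swaps requests' HTTPDigestAuth for httpx's DigestAuth; both are passed through the engine's params['auth'] unchanged. A sketch against a local YaCy instance; the URL and credentials are placeholders:

    import httpx

    auth = httpx.DigestAuth('admin', 'secret')  # placeholder credentials
    resp = httpx.get('http://localhost:8090/yacysearch.json?query=test',
                     auth=auth)
    print(resp.status_code)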
diff --git a/searx/engines/yggtorrent.py b/searx/engines/yggtorrent.py
index 8dfc0a0f2..f5af91f46 100644
--- a/searx/engines/yggtorrent.py
+++ b/searx/engines/yggtorrent.py
@@ -8,7 +8,7 @@ from operator import itemgetter
 from datetime import datetime
 from urllib.parse import quote
 from searx.utils import extract_text, get_torrent_size
-from searx.poolrequests import get as http_get
+from searx.network import get as http_get

 # about
 about = {