Diffstat (limited to 'searx/engines')
-rw-r--r--  searx/engines/__init__.py           |  6
-rw-r--r--  searx/engines/bandcamp.py           | 73
-rw-r--r--  searx/engines/dictzone.py           |  2
-rw-r--r--  searx/engines/duckduckgo.py         |  2
-rw-r--r--  searx/engines/duckduckgo_images.py  |  2
-rw-r--r--  searx/engines/elasticsearch.py      |  3
-rw-r--r--  searx/engines/gigablast.py          |  2
-rw-r--r--  searx/engines/google.py             |  7
-rw-r--r--  searx/engines/meilisearch.py        | 59
-rw-r--r--  searx/engines/pubmed.py             |  2
-rw-r--r--  searx/engines/qwant.py              |  2
-rw-r--r--  searx/engines/seznam.py             |  7
-rw-r--r--  searx/engines/sjp.py                | 92
-rw-r--r--  searx/engines/soundcloud.py         |  2
-rw-r--r--  searx/engines/spotify.py            |  5
-rw-r--r--  searx/engines/stackoverflow.py      |  5
-rw-r--r--  searx/engines/wikidata.py           |  2
-rw-r--r--  searx/engines/wikipedia.py          |  2
-rw-r--r--  searx/engines/wolframalpha_noapi.py |  2
-rw-r--r--  searx/engines/wordnik.py            | 77
-rw-r--r--  searx/engines/yacy.py               |  4
-rw-r--r--  searx/engines/yggtorrent.py         |  2
22 files changed, 329 insertions, 31 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 2238ea1b9..95eda6dde 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -27,7 +27,7 @@ from searx import settings
from searx import logger
from searx.data import ENGINES_LANGUAGES
from searx.exceptions import SearxEngineResponseException
-from searx.poolrequests import get, get_proxy_cycles
+from searx.network import get, initialize as initialize_network, set_context_network_name
from searx.utils import load_module, match_language, get_engine_from_settings, gen_useragent
@@ -89,8 +89,6 @@ def load_engine(engine_data):
engine.categories = []
else:
engine.categories = list(map(str.strip, param_value.split(',')))
- elif param_name == 'proxies':
- engine.proxies = get_proxy_cycles(param_value)
else:
setattr(engine, param_name, param_value)
@@ -289,9 +287,11 @@ def load_engines(engine_list):
def initialize_engines(engine_list):
load_engines(engine_list)
+ initialize_network(engine_list, settings['outgoing'])
def engine_init(engine_name, init_fn):
try:
+ set_context_network_name(engine_name)
init_fn(get_engine_from_settings(engine_name))
except SearxEngineResponseException as exc:
logger.warn('%s engine: Fail to initialize // %s', engine_name, exc)
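
The core of this change: per-engine outgoing-HTTP configuration (proxies, timeouts) moves out of per-engine requests sessions into a central searx.network module, initialized once from settings['outgoing']. A minimal sketch of the pattern, assuming only the searx.network API imported above (the wrapper function is illustrative, not searx code):

    from searx.network import set_context_network_name

    def run_engine_init(engine_name, init_fn, engine_settings):
        # Bind the current execution context to this engine's network, so any
        # HTTP request made inside init_fn() picks up that engine's proxies
        # and timeouts instead of the defaults.
        set_context_network_name(engine_name)
        init_fn(engine_settings)
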
diff --git a/searx/engines/bandcamp.py b/searx/engines/bandcamp.py
new file mode 100644
index 000000000..dafb3ee16
--- /dev/null
+++ b/searx/engines/bandcamp.py
@@ -0,0 +1,73 @@
+"""
+Bandcamp (Music)
+
+@website https://bandcamp.com/
+@provide-api no
+@results HTML
+@parse url, title, content, publishedDate, embedded, thumbnail
+"""
+
+from urllib.parse import urlencode, urlparse, parse_qs
+from dateutil.parser import parse as dateparse
+from lxml import html
+from searx.utils import extract_text
+
+categories = ['music']
+paging = True
+
+base_url = "https://bandcamp.com/"
+search_string = 'search?{query}&page={page}'
+embedded_url = '''<iframe width="100%" height="166"
+ scrolling="no" frameborder="no"
+ data-src="https://bandcamp.com/EmbeddedPlayer/{type}={result_id}/size=large/bgcol=ffffff/linkcol=0687f5/tracklist=false/artwork=small/transparent=true/"
+></iframe>'''
+
+
+def request(query, params):
+ '''pre-request callback
+ params<dict>:
+ method : POST/GET
+ headers : {}
+ data : {} # if method == POST
+ url : ''
+ category: 'search category'
+ pageno : 1 # number of the requested page
+ '''
+
+ search_path = search_string.format(
+ query=urlencode({'q': query}),
+ page=params['pageno'])
+
+ params['url'] = base_url + search_path
+
+ return params
+
+
+def response(resp):
+ '''post-response callback
+ resp: httpx response object
+ '''
+ results = []
+ tree = html.fromstring(resp.text)
+ search_results = tree.xpath('//li[contains(@class, "searchresult")]')
+ for result in search_results:
+ link = result.xpath('.//div[@class="itemurl"]/a')[0]
+ result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0]
+ title = result.xpath('.//div[@class="heading"]/a/text()')
+ date = dateparse(result.xpath('.//div[@class="released"]/text()')[0].replace("released ", ""))
+ content = result.xpath('.//div[@class="subhead"]/text()')
+ new_result = {
+ "url": extract_text(link),
+ "title": extract_text(title),
+ "content": extract_text(content),
+ "publishedDate": date,
+ }
+ thumbnail = result.xpath('.//div[@class="art"]/img/@src')
+ if thumbnail:
+ new_result['thumbnail'] = thumbnail[0]
+ if "album" in result.classes:
+ new_result["embedded"] = embedded_url.format(type='album', result_id=result_id)
+ elif "track" in result.classes:
+ new_result["embedded"] = embedded_url.format(type='track', result_id=result_id)
+ results.append(new_result)
+ return results
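
For illustration, the embedded_url template above expands as follows for a track result (the result_id value is a placeholder):

    # type comes from the result's CSS classes ("album" or "track"); result_id
    # comes from the search_item_id query parameter of the item URL.
    print(embedded_url.format(type='track', result_id='1234'))
    # <iframe width="100%" height="166"
    #     scrolling="no" frameborder="no"
    #     data-src="https://bandcamp.com/EmbeddedPlayer/track=1234/size=large/bgcol=ffffff/linkcol=0687f5/tracklist=false/artwork=small/transparent=true/"
    # ></iframe>
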
diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
index 2483c0805..eaa8b6ab4 100644
--- a/searx/engines/dictzone.py
+++ b/searx/engines/dictzone.py
@@ -52,7 +52,7 @@ def response(resp):
to_results.append(to_result.text_content())
results.append({
- 'url': urljoin(resp.url, '?%d' % k),
+ 'url': urljoin(str(resp.url), '?%d' % k),
'title': from_result.text_content(),
'content': '; '.join(to_results)
})
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index ae1e36686..3c086f81b 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -6,7 +6,7 @@
from lxml.html import fromstring
from json import loads
from searx.utils import extract_text, match_language, eval_xpath, dict_subset
-from searx.poolrequests import get
+from searx.network import get
# about
about = {
diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py
index 305eb1ca1..0daaf41e9 100644
--- a/searx/engines/duckduckgo_images.py
+++ b/searx/engines/duckduckgo_images.py
@@ -8,7 +8,7 @@ from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
from searx.engines.duckduckgo import get_region_code
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import
-from searx.poolrequests import get
+from searx.network import get
# about
about = {
diff --git a/searx/engines/elasticsearch.py b/searx/engines/elasticsearch.py
index da7f98074..db84a5c13 100644
--- a/searx/engines/elasticsearch.py
+++ b/searx/engines/elasticsearch.py
@@ -4,7 +4,6 @@
"""
from json import loads, dumps
-from requests.auth import HTTPBasicAuth
from searx.exceptions import SearxEngineAPIException
@@ -32,7 +31,7 @@ def request(query, params):
return params
if username and password:
- params['auth'] = HTTPBasicAuth(username, password)
+ params['auth'] = (username, password)
params['url'] = search_url
params['method'] = 'GET'
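
This works because httpx accepts a plain (username, password) tuple as the auth argument and treats it as HTTP basic auth, so the requests-specific HTTPBasicAuth helper is no longer needed. A standalone sketch (URL and credentials are placeholders):

    import httpx

    # A 2-tuple passed as auth= is shorthand for basic auth in httpx.
    resp = httpx.get('http://localhost:9200/my-index/_search',
                     auth=('elastic', 'changeme'))
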
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 248991df9..bbd9e20d2 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -8,7 +8,7 @@ import re
from json import loads
from urllib.parse import urlencode
# from searx import logger
-from searx.poolrequests import get
+from searx.network import get
# about
about = {
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 8c20029a3..a4aee5c20 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -10,7 +10,7 @@ Definitions`_.
# pylint: disable=invalid-name, missing-function-docstring
-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode
from lxml import html
from searx import logger
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
@@ -186,8 +186,7 @@ def get_lang_info(params, lang_list, custom_aliases):
return ret_val
def detect_google_sorry(resp):
- resp_url = urlparse(resp.url)
- if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
+ if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
raise SearxEngineCaptchaException()
@@ -243,7 +242,7 @@ def response(resp):
if answer:
results.append({'answer': ' '.join(answer)})
else:
- logger.debug("did not found 'answer'")
+ logger.debug("did not find 'answer'")
# results --> number_of_results
try:
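
The detect_google_sorry() simplification relies on httpx exposing resp.url as a structured httpx.URL object with host and path attributes, making the urlparse() round-trip unnecessary. For example:

    import httpx

    url = httpx.URL('https://sorry.google.com/index?continue=https://www.google.com/')
    assert url.host == 'sorry.google.com'
    assert url.path == '/index'
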
diff --git a/searx/engines/meilisearch.py b/searx/engines/meilisearch.py
new file mode 100644
index 000000000..4e0ff15f3
--- /dev/null
+++ b/searx/engines/meilisearch.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+ Meilisearch
+"""
+
+# pylint: disable=global-statement, missing-function-docstring
+
+from json import loads, dumps
+
+
+base_url = 'http://localhost:7700'
+index = ''
+auth_key = ''
+facet_filters = []
+_search_url = ''
+result_template = 'key-value.html'
+categories = ['general']
+paging = True
+
+
+def init(_):
+ if index == '':
+ raise ValueError('index cannot be empty')
+
+ global _search_url
+ _search_url = base_url + '/indexes/' + index + '/search'
+
+
+def request(query, params):
+ if auth_key != '':
+ params['headers']['X-Meili-API-Key'] = auth_key
+
+ params['headers']['Content-Type'] = 'application/json'
+ params['url'] = _search_url
+ params['method'] = 'POST'
+
+ data = {
+ 'q': query,
+ 'offset': 10 * (params['pageno'] - 1),
+ 'limit': 10,
+ }
+ if len(facet_filters) > 0:
+ data['facetFilters'] = facet_filters
+
+ params['data'] = dumps(data)
+
+ return params
+
+
+def response(resp):
+ results = []
+
+ resp_json = loads(resp.text)
+ for result in resp_json['hits']:
+ r = {key: str(value) for key, value in result.items()}
+ r['template'] = result_template
+ results.append(r)
+
+ return results
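
For reference, request() above produces a MeiliSearch search payload like the following for page 2 of the query "foo" (facetFilters is only included when facet_filters is configured):

    from json import dumps

    # pageno=2 -> offset 10; the engine fetches a fixed 10 results per page.
    print(dumps({'q': 'foo', 'offset': 10, 'limit': 10}))
    # {"q": "foo", "offset": 10, "limit": 10}
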
diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py
index da02f91ca..5d88d398e 100644
--- a/searx/engines/pubmed.py
+++ b/searx/engines/pubmed.py
@@ -7,7 +7,7 @@ from flask_babel import gettext
from lxml import etree
from datetime import datetime
from urllib.parse import urlencode
-from searx.poolrequests import get
+from searx.network import get
# about
about = {
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
index 13dcf1250..d01dc0acc 100644
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@@ -8,7 +8,7 @@ from json import loads
from urllib.parse import urlencode
from searx.utils import html_to_text, match_language
from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
-from searx.raise_for_httperror import raise_for_httperror
+from searx.network import raise_for_httperror
# about
about = {
diff --git a/searx/engines/seznam.py b/searx/engines/seznam.py
index faceb0550..042088dbe 100644
--- a/searx/engines/seznam.py
+++ b/searx/engines/seznam.py
@@ -3,9 +3,9 @@
Seznam
"""
-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode
from lxml import html
-from searx.poolrequests import get
+from searx.network import get
from searx.exceptions import SearxEngineAccessDeniedException
from searx.utils import (
extract_text,
@@ -46,8 +46,7 @@ def request(query, params):
def response(resp):
- resp_url = urlparse(resp.url)
- if resp_url.path.startswith('/verify'):
+ if resp.url.path.startswith('/verify'):
raise SearxEngineAccessDeniedException()
results = []
diff --git a/searx/engines/sjp.py b/searx/engines/sjp.py
new file mode 100644
index 000000000..eff7b7092
--- /dev/null
+++ b/searx/engines/sjp.py
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Słownik Języka Polskiego (general)
+
+"""
+
+from lxml.html import fromstring
+from searx import logger
+from searx.utils import extract_text
+from searx.network import raise_for_httperror
+
+logger = logger.getChild('sjp engine')
+
+# about
+about = {
+ "website": 'https://sjp.pwn.pl',
+ "wikidata_id": 'Q55117369',
+ "official_api_documentation": None,
+ "use_official_api": False,
+ "require_api_key": False,
+ "results": 'HTML',
+}
+
+categories = ['general']
+paging = False
+
+URL = 'https://sjp.pwn.pl'
+SEARCH_URL = URL + '/szukaj/{query}.html'
+
+word_xpath = '//div[@class="query"]'
+dict_xpath = ['//div[@class="wyniki sjp-so-wyniki sjp-so-anchor"]',
+ '//div[@class="wyniki sjp-wyniki sjp-anchor"]',
+ '//div[@class="wyniki sjp-doroszewski-wyniki sjp-doroszewski-anchor"]']
+
+
+def request(query, params):
+ params['url'] = SEARCH_URL.format(query=query)
+ logger.debug(f"query_url --> {params['url']}")
+ return params
+
+
+def response(resp):
+ results = []
+
+ raise_for_httperror(resp)
+ dom = fromstring(resp.text)
+ word = extract_text(dom.xpath(word_xpath))
+
+ definitions = []
+
+ for dict_src in dict_xpath:
+ for src in dom.xpath(dict_src):
+ src_text = extract_text(src.xpath('.//span[@class="entry-head-title"]/text()')).strip()
+
+ src_defs = []
+ for def_item in src.xpath('.//div[contains(@class, "ribbon-element")]'):
+ if def_item.xpath('./div[@class="znacz"]'):
+ sub_defs = []
+ for def_sub_item in def_item.xpath('./div[@class="znacz"]'):
+ def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ')
+ sub_defs.append(def_sub_text)
+ src_defs.append((word, sub_defs))
+ else:
+ def_text = extract_text(def_item).strip()
+ def_link = def_item.xpath('./span/a/@href')
+ if def_link and 'doroszewski' in def_link[0]:
+ def_text = f"<a href='{def_link[0]}'>{def_text}</a>"
+ src_defs.append((def_text, ''))
+
+ definitions.append((src_text, src_defs))
+
+ if not definitions:
+ return results
+
+ infobox = ''
+ for src in definitions:
+ infobox += f"<div><small>{src[0]}</small>"
+ infobox += "<ul>"
+ for (def_text, sub_def) in src[1]:
+ infobox += f"<li>{def_text}</li>"
+ if sub_def:
+ infobox += "<ol>"
+ for sub_def_text in sub_def:
+ infobox += f"<li>{sub_def_text}</li>"
+ infobox += "</ol>"
+ infobox += "</ul></div>"
+
+ results.append({
+ 'infobox': word,
+ 'content': infobox,
+ })
+
+ return results
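
Illustrative shape of the infobox fragment the loop above emits for one dictionary source with two numbered sub-definitions (the Polish strings are placeholder values):

    fragment = (
        '<div><small>Słownik języka polskiego PWN</small>'
        '<ul><li>słowo</li><ol><li>definicja 1</li><li>definicja 2</li></ol></ul></div>'
    )
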
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index b3e3383bd..a6f923855 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -9,7 +9,7 @@ from lxml import html
from dateutil import parser
from urllib.parse import quote_plus, urlencode
from searx import logger
-from searx.poolrequests import get as http_get
+from searx.network import get as http_get
# about
about = {
diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py
index 0ad8bfe32..6816fe672 100644
--- a/searx/engines/spotify.py
+++ b/searx/engines/spotify.py
@@ -5,9 +5,10 @@
from json import loads
from urllib.parse import urlencode
-import requests
import base64
+from searx.network import post as http_post
+
# about
about = {
"website": 'https://www.spotify.com',
@@ -38,7 +39,7 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset)
- r = requests.post(
+ r = http_post(
'https://accounts.spotify.com/api/token',
data={'grant_type': 'client_credentials'},
headers={'Authorization': 'Basic ' + base64.b64encode(
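
Note the token request itself now goes through searx.network.post, so it honors the instance's outgoing proxy and timeout settings. A standalone sketch of the client-credentials exchange (CLIENT_ID/CLIENT_SECRET are placeholders):

    import base64
    from searx.network import post as http_post

    credentials = base64.b64encode(b'CLIENT_ID:CLIENT_SECRET').decode()
    r = http_post('https://accounts.spotify.com/api/token',
                  data={'grant_type': 'client_credentials'},
                  headers={'Authorization': 'Basic ' + credentials})
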
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
index 91eaa68e9..8fc2cdb3a 100644
--- a/searx/engines/stackoverflow.py
+++ b/searx/engines/stackoverflow.py
@@ -3,7 +3,7 @@
Stackoverflow (IT)
"""
-from urllib.parse import urlencode, urljoin, urlparse
+from urllib.parse import urlencode, urljoin
from lxml import html
from searx.utils import extract_text
from searx.exceptions import SearxEngineCaptchaException
@@ -41,8 +41,7 @@ def request(query, params):
# get response from search-request
def response(resp):
- resp_url = urlparse(resp.url)
- if resp_url.path.startswith('/nocaptcha'):
+ if resp.url.path.startswith('/nocaptcha'):
raise SearxEngineCaptchaException()
results = []
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index c8e4cfae6..ddcce9085 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -12,7 +12,7 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_
from searx import logger
from searx.data import WIKIDATA_UNITS
-from searx.poolrequests import post, get
+from searx.network import post, get
from searx.utils import match_language, searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index 3ad8748fb..5e34db9a7 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -7,7 +7,7 @@ from urllib.parse import quote
from json import loads
from lxml.html import fromstring
from searx.utils import match_language, searx_useragent
-from searx.raise_for_httperror import raise_for_httperror
+from searx.network import raise_for_httperror
# about
about = {
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
index 8e427d575..1f2cfa4e6 100644
--- a/searx/engines/wolframalpha_noapi.py
+++ b/searx/engines/wolframalpha_noapi.py
@@ -7,7 +7,7 @@ from json import loads
from time import time
from urllib.parse import urlencode
-from searx.poolrequests import get as http_get
+from searx.network import get as http_get
# about
about = {
diff --git a/searx/engines/wordnik.py b/searx/engines/wordnik.py
new file mode 100644
index 000000000..4bfeb4070
--- /dev/null
+++ b/searx/engines/wordnik.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Wordnik (general)
+
+"""
+
+from lxml.html import fromstring
+from searx import logger
+from searx.utils import extract_text
+from searx.network import raise_for_httperror
+
+logger = logger.getChild('Wordnik engine')
+
+# about
+about = {
+ "website": 'https://www.wordnik.com',
+ "wikidata_id": 'Q8034401',
+ "official_api_documentation": None,
+ "use_official_api": False,
+ "require_api_key": False,
+ "results": 'HTML',
+}
+
+categories = ['general']
+paging = False
+
+URL = 'https://www.wordnik.com'
+SEARCH_URL = URL + '/words/{query}'
+
+
+def request(query, params):
+ params['url'] = SEARCH_URL.format(query=query)
+ logger.debug(f"query_url --> {params['url']}")
+ return params
+
+
+def response(resp):
+ results = []
+
+ raise_for_httperror(resp)
+ dom = fromstring(resp.text)
+ word = extract_text(dom.xpath('//*[@id="headword"]/text()'))
+
+ definitions = []
+ for src in dom.xpath('//*[@id="define"]//h3[@class="source"]'):
+ src_text = extract_text(src).strip()
+ if src_text.startswith('from '):
+ src_text = src_text[5:]
+
+ src_defs = []
+ for def_item in src.xpath('following-sibling::ul[1]/li'):
+ def_abbr = extract_text(def_item.xpath('.//abbr')).strip()
+ def_text = extract_text(def_item).strip()
+ if def_abbr:
+ def_text = def_text[len(def_abbr):].strip()
+ src_defs.append((def_abbr, def_text))
+
+ definitions.append((src_text, src_defs))
+
+ if not definitions:
+ return results
+
+ infobox = ''
+ for src_text, src_defs in definitions:
+ infobox += f"<small>{src_text}</small>"
+ infobox += "<ul>"
+ for def_abbr, def_text in src_defs:
+ if def_abbr:
+ def_abbr += ": "
+ infobox += f"<li><i>{def_abbr}</i> {def_text}</li>"
+ infobox += "</ul>"
+
+ results.append({
+ 'infobox': word,
+ 'content': infobox,
+ })
+
+ return results
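
The intermediate definitions structure built above is a list of (source, [(abbreviation, text), ...]) tuples, e.g. (values are illustrative):

    definitions = [
        ('The American Heritage Dictionary of the English Language',
         [('n.', 'A sound or combination of sounds ...'),
          ('v.', 'To express in words ...')]),
    ]
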
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
index c194ca451..fbd99c47b 100644
--- a/searx/engines/yacy.py
+++ b/searx/engines/yacy.py
@@ -7,7 +7,7 @@ from json import loads
from dateutil import parser
from urllib.parse import urlencode
-from requests.auth import HTTPDigestAuth
+from httpx import DigestAuth
from searx.utils import html_to_text
@@ -56,7 +56,7 @@ def request(query, params):
search_type=search_type)
if http_digest_auth_user and http_digest_auth_pass:
- params['auth'] = HTTPDigestAuth(http_digest_auth_user, http_digest_auth_pass)
+ params['auth'] = DigestAuth(http_digest_auth_user, http_digest_auth_pass)
# add language tag if specified
if params['language'] != 'all':
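
httpx ships its own digest-auth implementation as a drop-in replacement for requests' HTTPDigestAuth. A standalone sketch (host, port, and credentials are placeholders for a local YaCy instance):

    import httpx

    resp = httpx.get('http://localhost:8090/yacysearch.json?query=test',
                     auth=httpx.DigestAuth('admin', 'password'))
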
diff --git a/searx/engines/yggtorrent.py b/searx/engines/yggtorrent.py
index 8dfc0a0f2..f5af91f46 100644
--- a/searx/engines/yggtorrent.py
+++ b/searx/engines/yggtorrent.py
@@ -8,7 +8,7 @@ from operator import itemgetter
from datetime import datetime
from urllib.parse import quote
from searx.utils import extract_text, get_torrent_size
-from searx.poolrequests import get as http_get
+from searx.network import get as http_get
# about
about = {