summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--AUTHORS.rst5
-rw-r--r--Dockerfile23
-rw-r--r--Makefile1
-rwxr-xr-xdockerfiles/docker-entrypoint.sh3
-rw-r--r--docs/admin/installation-searx.rst2
-rw-r--r--docs/admin/settings.rst64
-rw-r--r--docs/blog/index.rst1
-rw-r--r--docs/blog/search-indexer-engines.rst114
-rwxr-xr-xmanage.sh2
-rw-r--r--requirements-dev.txt4
-rw-r--r--requirements.txt7
-rw-r--r--searx/autocomplete.py7
-rw-r--r--searx/engines/__init__.py6
-rw-r--r--searx/engines/bandcamp.py73
-rw-r--r--searx/engines/dictzone.py2
-rw-r--r--searx/engines/duckduckgo.py2
-rw-r--r--searx/engines/duckduckgo_images.py2
-rw-r--r--searx/engines/elasticsearch.py3
-rw-r--r--searx/engines/gigablast.py2
-rw-r--r--searx/engines/google.py7
-rw-r--r--searx/engines/meilisearch.py59
-rw-r--r--searx/engines/pubmed.py2
-rw-r--r--searx/engines/qwant.py2
-rw-r--r--searx/engines/seznam.py7
-rw-r--r--searx/engines/sjp.py92
-rw-r--r--searx/engines/soundcloud.py2
-rw-r--r--searx/engines/spotify.py5
-rw-r--r--searx/engines/stackoverflow.py5
-rw-r--r--searx/engines/wikidata.py2
-rw-r--r--searx/engines/wikipedia.py2
-rw-r--r--searx/engines/wolframalpha_noapi.py2
-rw-r--r--searx/engines/wordnik.py77
-rw-r--r--searx/engines/yacy.py4
-rw-r--r--searx/engines/yggtorrent.py2
-rw-r--r--searx/metrology/error_recorder.py18
-rw-r--r--searx/network/__init__.py189
-rw-r--r--searx/network/client.py214
-rw-r--r--searx/network/network.py302
-rw-r--r--searx/network/raise_for_httperror.py (renamed from searx/raise_for_httperror.py)0
-rw-r--r--searx/plugins/tracker_url_remover.py1
-rw-r--r--searx/poolrequests.py235
-rw-r--r--searx/search/checker/impl.py12
-rw-r--r--searx/search/processors/online.py42
-rw-r--r--searx/settings.yml43
-rw-r--r--searx/settings_loader.py2
-rw-r--r--searx/static/themes/oscar/img/icons/bandcamp.pngbin0 -> 919 bytes
-rw-r--r--searx/templates/__common__/opensearch.xml4
-rw-r--r--searx/templates/oscar/result_templates/default.html4
-rw-r--r--searx/testing.py4
-rw-r--r--searx/utils.py2
-rwxr-xr-xsearx/webapp.py143
-rwxr-xr-xsearx_extra/standalone_searx.py2
-rwxr-xr-xsearx_extra/update/update_engine_descriptions.py6
-rwxr-xr-xsearx_extra/update/update_external_bangs.py6
-rw-r--r--tests/unit/network/__init__.py0
-rw-r--r--tests/unit/network/test_network.py236
-rw-r--r--tests/unit/test_engines_init.py2
-rw-r--r--tests/unit/test_poolrequests.py89
-rw-r--r--tests/unit/test_query.py15
-rw-r--r--tests/unit/test_webapp.py7
-rwxr-xr-xutils/searx.sh2
61 files changed, 1661 insertions, 512 deletions
diff --git a/AUTHORS.rst b/AUTHORS.rst
index 34d2dc25e..57834514b 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -1,4 +1,4 @@
-Searx was created by Adam Tauber and is maintained by Adam Tauber, Alexandre Flament, Noémi Ványi, @pofilo, Gaspard d'Hautefeuille and Markus Heiser.
+Searx was created by Adam Tauber and is maintained by Adam Tauber, Noémi Ványi, @pofilo, Gaspard d'Hautefeuille and Émilien Devos.
Major contributing authors:
@@ -12,8 +12,9 @@ Major contributing authors:
- @pofilo
- Markus Heiser @return42
- Émilien Devos @unixfox
+- Alexandre Flament
-People who have submitted patches/translates, reported bugs, consulted features or
+People who have submitted patches/translations, reported bugs, consulted features or
generally made searx better:
- Laszlo Hammerl
diff --git a/Dockerfile b/Dockerfile
index 3894aa968..3fbf62f35 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,26 +4,19 @@ EXPOSE 8080
VOLUME /etc/searx
VOLUME /var/log/uwsgi
-ARG GIT_URL=unknown
-ARG VERSION_GITCOMMIT=unknown
-ARG SEARX_GIT_VERSION=unknown
-
ARG SEARX_GID=977
ARG SEARX_UID=977
RUN addgroup -g ${SEARX_GID} searx && \
adduser -u ${SEARX_UID} -D -h /usr/local/searx -s /bin/sh -G searx searx
-ARG TIMESTAMP_SETTINGS=0
-ARG TIMESTAMP_UWSGI=0
-ARG LABEL_VCS_REF=
-ARG LABEL_VCS_URL=
-
ENV INSTANCE_NAME=searx \
AUTOCOMPLETE= \
BASE_URL= \
MORTY_KEY= \
- MORTY_URL=
+ MORTY_URL= \
+ SEARX_SETTINGS_PATH=/etc/searx/settings.yml \
+ UWSGI_SETTINGS_PATH=/etc/searx/uwsgi.ini
WORKDIR /usr/local/searx
@@ -60,6 +53,10 @@ RUN apk upgrade --no-cache \
COPY --chown=searx:searx . .
+ARG TIMESTAMP_SETTINGS=0
+ARG TIMESTAMP_UWSGI=0
+ARG VERSION_GITCOMMIT=unknown
+
RUN su searx -c "/usr/bin/python3 -m compileall -q searx"; \
touch -c --date=@${TIMESTAMP_SETTINGS} searx/settings.yml; \
touch -c --date=@${TIMESTAMP_UWSGI} dockerfiles/uwsgi.ini; \
@@ -70,8 +67,12 @@ RUN su searx -c "/usr/bin/python3 -m compileall -q searx"; \
-o -name '*.svg' -o -name '*.ttf' -o -name '*.eot' \) \
-type f -exec gzip -9 -k {} \+ -exec brotli --best {} \+
-# Keep this argument at the end since it change each time
+# Keep these arguments at the end to prevent redundant layer rebuilds
ARG LABEL_DATE=
+ARG GIT_URL=unknown
+ARG SEARX_GIT_VERSION=unknown
+ARG LABEL_VCS_REF=
+ARG LABEL_VCS_URL=
LABEL maintainer="searx <${GIT_URL}>" \
description="A privacy-respecting, hackable metasearch engine." \
version="${SEARX_GIT_VERSION}" \
diff --git a/Makefile b/Makefile
index 745ff5b91..f8d6359c4 100644
--- a/Makefile
+++ b/Makefile
@@ -191,6 +191,7 @@ PYLINT_FILES=\
searx/engines/google_videos.py \
searx/engines/google_images.py \
searx/engines/mediathekviewweb.py \
+ searx/engines/meilisearch.py \
searx/engines/solidtorrents.py \
searx/engines/solr.py \
searx/engines/google_scholar.py \
diff --git a/dockerfiles/docker-entrypoint.sh b/dockerfiles/docker-entrypoint.sh
index accc015f7..6592b1c70 100755
--- a/dockerfiles/docker-entrypoint.sh
+++ b/dockerfiles/docker-entrypoint.sh
@@ -24,9 +24,6 @@ if [ -z "${BIND_ADDRESS}" ]; then
export BIND_ADDRESS="${DEFAULT_BIND_ADDRESS}"
fi
-export UWSGI_SETTINGS_PATH=/etc/searx/uwsgi.ini
-export SEARX_SETTINGS_PATH=/etc/searx/settings.yml
-
# Parse special command line
# see docs/admin/installation-docker.rst
# display the help message without the version
diff --git a/docs/admin/installation-searx.rst b/docs/admin/installation-searx.rst
index 512a185a7..adea0166f 100644
--- a/docs/admin/installation-searx.rst
+++ b/docs/admin/installation-searx.rst
@@ -61,7 +61,7 @@ from the login (*~/.profile*):
.. tip::
- Open a second terminal for the configuration tasks and left the ``(searx)$``
+ Open a second terminal for the configuration tasks and leave the ``(searx)$``
terminal open for the tasks below.
diff --git a/docs/admin/settings.rst b/docs/admin/settings.rst
index 7cf055dbf..622218279 100644
--- a/docs/admin/settings.rst
+++ b/docs/admin/settings.rst
@@ -130,14 +130,12 @@ Global Settings
request_timeout : 2.0 # default timeout in seconds, can be override by engine
# max_request_timeout: 10.0 # the maximum timeout in seconds
useragent_suffix : "" # informations like an email address to the administrator
- pool_connections : 100 # Number of different hosts
- pool_maxsize : 10 # Number of simultaneous requests by host
+ pool_connections : 100 # Maximum number of allowable connections, or None for no limits. The default is 100.
+ pool_maxsize : 10 # Number of allowable keep-alive connections, or None to always allow. The default is 10.
+ enable_http2: True # See https://www.python-httpx.org/http2/
# uncomment below section if you want to use a proxy
# proxies:
- # http:
- # - http://proxy1:8080
- # - http://proxy2:8080
- # https:
+ # all://:
# - http://proxy1:8080
# - http://proxy2:8080
# uncomment below section only if you have more than one network interface
@@ -145,6 +143,7 @@ Global Settings
# source_ips:
# - 1.1.1.1
# - 1.1.1.2
+ # - fe80::/126
``request_timeout`` :
@@ -157,20 +156,46 @@ Global Settings
Suffix to the user-agent searx uses to send requests to others engines. If an
engine wish to block you, a contact info here may be useful to avoid that.
-.. _requests proxies: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
-.. _PySocks: https://pypi.org/project/PySocks/
+``keepalive_expiry``:
+ Number of seconds to keep a connection in the pool. By default 5.0 seconds.
+
+.. _httpx proxies: https://www.python-httpx.org/advanced/#http-proxying
``proxies`` :
- Define one or more proxies you wish to use, see `requests proxies`_.
+ Define one or more proxies you wish to use, see `httpx proxies`_.
If there are more than one proxy for one protocol (http, https),
requests to the engines are distributed in a round-robin fashion.
- - Proxy: `see <https://2.python-requests.org/en/latest/user/advanced/#proxies>`__.
- - SOCKS proxies are also supported: `see <https://2.python-requests.org/en/latest/user/advanced/#socks>`__
-
``source_ips`` :
If you use multiple network interfaces, define from which IP the requests must
- be made. This parameter is ignored when ``proxies`` is set.
+ be made. Example:
+
+ * ``0.0.0.0`` any local IPv4 address.
+ * ``::`` any local IPv6 address.
+ * ``192.168.0.1``
+ * ``[ 192.168.0.1, 192.168.0.2 ]`` these two specific IP addresses
+ * ``fe80::60a2:1691:e5a2:ee1f``
+ * ``fe80::60a2:1691:e5a2:ee1f/126`` all IP addresses in this network.
+ * ``[ 192.168.0.1, fe80::/126 ]``
+
+``retries`` :
+ Number of retries in case of an HTTP error.
+ On each retry, searx uses a different proxy and source IP.
+
+``retry_on_http_error`` :
+ Retry the request on certain HTTP status codes.
+
+ Example:
+
+ * ``true`` : on HTTP status code between 400 and 599.
+ * ``403`` : on HTTP status code 403.
+ * ``[403, 429]``: on HTTP status code 403 and 429.
+
+``enable_http2`` :
+ Enabled by default. Set to ``False`` to disable HTTP/2.
+
+``max_redirects`` :
+ Maximum number of redirects before an error is raised. 30 by default.
``locales:``
@@ -216,6 +241,13 @@ Engine settings
api_key : 'apikey'
disabled : True
language : en_US
+ #enable_http: False
+ #enable_http2: False
+ #retries: 1
+ #retry_on_http_error: True # or 403 or [404, 429]
+ #max_connections: 100
+ #max_keepalive_connections: 10
+ #keepalive_expiry: 5.0
#proxies:
# http:
# - http://proxy1:8080
@@ -270,6 +302,12 @@ Engine settings
``display_error_messages`` : default ``True``
When an engine returns an error, the message is displayed on the user interface.
+``network``: optional
+ Use the network configuration from another engine.
+ In addition, there are two default networks:
+ * ``ipv4`` set ``local_addresses`` to ``0.0.0.0`` (use only IPv4 local addresses)
+ * ``ipv6`` set ``local_addresses`` to ``::`` (use only IPv6 local addresses)
+
.. note::
A few more options are possible, but they are pretty specific to some
diff --git a/docs/blog/index.rst b/docs/blog/index.rst
index 689739a58..8651cef69 100644
--- a/docs/blog/index.rst
+++ b/docs/blog/index.rst
@@ -12,3 +12,4 @@ Blog
intro-offline
private-engines
command-line-engines
+ search-indexer-engines
diff --git a/docs/blog/search-indexer-engines.rst b/docs/blog/search-indexer-engines.rst
new file mode 100644
index 000000000..ca4dd3c88
--- /dev/null
+++ b/docs/blog/search-indexer-engines.rst
@@ -0,0 +1,114 @@
+===============================
+Query your local search engines
+===============================
+
+From now on, searx lets you query your locally running search engines. The following
+ones are supported now:
+
+* `Elasticsearch`_
+* `Meilisearch`_
+* `Solr`_
+
+All of the engines above are added to ``settings.yml`` just commented out, as you have to
+set ``base_url`` for all of them.
+
+Please note that if you are not using HTTPS to access these engines, you have to enable
+HTTP requests by setting ``enable_http`` to ``True``.
+
+Furthermore, if you do not want to expose these engines on a public instance, you can
+still add them and limit the access by setting ``tokens`` as described in the `blog post about
+private engines`_.
+
+Configuring searx for search engines
+====================================
+
+Each search engine is powerful, capable of full-text search.
+
+Elasticsearch
+-------------
+
+Elasticsearch supports numerous ways to query the data it is storing. At the moment
+the engine supports the most popular search methods: ``match``, ``simple_query_string``, ``term`` and ``terms``.
+
+If none of the methods fit your use case, you can select ``custom`` query type and provide the JSON payload
+searx has to submit to Elasticsearch in ``custom_query_json``.
+
+The following is an example configuration for an Elasticsearch instance with authentication
+configured to read from ``my-index`` index.
+
+.. code:: yaml
+
+ - name : elasticsearch
+ shortcut : es
+ engine : elasticsearch
+ base_url : http://localhost:9200
+ username : elastic
+ password : changeme
+ index : my-index
+ query_type : match
+ enable_http : True
+
+
+Meilisearch
+-----------
+
+This search engine is aimed at individuals and small companies. It is designed for
+small-scale (less than 10 million documents) data collections. E.g. it is great for storing
+web pages you have visited and searching in the contents later.
+
+The engine supports faceted search, so you can search in a subset of documents of the collection.
+Furthermore, you can search in Meilisearch instances that require authentication by setting ``auth_token``.
+
+Here is a simple example to query a Meilisearch instance:
+
+.. code:: yaml
+
+ - name : meilisearch
+ engine : meilisearch
+ shortcut: mes
+ base_url : http://localhost:7700
+ index : my-index
+ enable_http: True
+
+
+Solr
+----
+
+Solr is a popular search engine based on Lucene, just like Elasticsearch.
+But instead of searching in indices, you can search in collections.
+
+This is an example configuration for searching in the collection ``my-collection`` and getting
+the results in ascending order.
+
+.. code:: yaml
+
+ - name : solr
+ engine : solr
+ shortcut : slr
+ base_url : http://localhost:8983
+ collection : my-collection
+ sort : asc
+ enable_http : True
+
+
+Next steps
+==========
+
+The next step is to add support for various SQL databases.
+
+Acknowledgement
+===============
+
+This development was sponsored by `Search and Discovery Fund`_ of `NLnet Foundation`_ .
+
+.. _blog post about private engines: private-engines.html#private-engines
+.. _Elasticsearch: https://www.elastic.co/elasticsearch/
+.. _Meilisearch: https://www.meilisearch.com/
+.. _Solr: https://solr.apache.org/
+.. _Search and Discovery Fund: https://nlnet.nl/discovery
+.. _NLnet Foundation: https://nlnet.nl/
+
+
+| Happy hacking.
+| kvch // 2021.04.07 23:16
+
diff --git a/manage.sh b/manage.sh
index bb86329f9..54511dc16 100755
--- a/manage.sh
+++ b/manage.sh
@@ -147,7 +147,7 @@ docker_build() {
# define the docker image name
GITHUB_USER=$(echo "${GIT_URL}" | sed 's/.*github\.com\/\([^\/]*\).*/\1/')
- SEARX_IMAGE_NAME="${GITHUB_USER:-searx}/searx"
+ SEARX_IMAGE_NAME="${SEARX_IMAGE_NAME:-${GITHUB_USER:-searx}/searx}"
# build Docker image
echo "Building image ${SEARX_IMAGE_NAME}:${SEARX_GIT_VERSION}"
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 76cd329e0..924972a51 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,13 +1,14 @@
mock==4.0.3
nose2[coverage_plugin]==0.10.0
cov-core==1.15.0
-pycodestyle==2.6.0
+pycodestyle==2.7.0
pylint==2.7.4
splinter==0.14.0
transifex-client==0.14.2
selenium==3.141.0
twine==3.4.1
Pallets-Sphinx-Themes==1.2.3
+docutils==0.16
Sphinx==3.5.3
sphinx-issues==1.2.0
sphinx-jinja==1.1.1
@@ -15,3 +16,4 @@ sphinx-tabs==2.1.0
sphinxcontrib-programoutput==0.17
sphinx-autobuild==2021.3.14
linuxdoc==20210324
+aiounittest==1.4.0
diff --git a/requirements.txt b/requirements.txt
index bfbcecc51..cc3235d01 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,5 +8,10 @@ lxml==4.6.3
pygments==2.8.0
python-dateutil==2.8.1
pyyaml==5.4.1
-requests[socks]==2.25.1
+httpx[http2]==0.17.1
+Brotli==1.0.9
+uvloop==0.15.2; python_version >= '3.7'
+uvloop==0.14.0; python_version < '3.7'
+httpx-socks[asyncio]==0.3.1
langdetect==1.0.8
+setproctitle==1.2.2
diff --git a/searx/autocomplete.py b/searx/autocomplete.py
index 75992a1d8..cf2deb62f 100644
--- a/searx/autocomplete.py
+++ b/searx/autocomplete.py
@@ -20,10 +20,11 @@ from lxml import etree
from json import loads
from urllib.parse import urlencode
-from requests import RequestException
+from httpx import HTTPError
+
from searx import settings
-from searx.poolrequests import get as http_get
+from searx.network import get as http_get
from searx.exceptions import SearxEngineResponseException
@@ -136,5 +137,5 @@ def search_autocomplete(backend_name, query, lang):
try:
return backend(query, lang)
- except (RequestException, SearxEngineResponseException):
+ except (HTTPError, SearxEngineResponseException):
return []
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 2238ea1b9..95eda6dde 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -27,7 +27,7 @@ from searx import settings
from searx import logger
from searx.data import ENGINES_LANGUAGES
from searx.exceptions import SearxEngineResponseException
-from searx.poolrequests import get, get_proxy_cycles
+from searx.network import get, initialize as initialize_network, set_context_network_name
from searx.utils import load_module, match_language, get_engine_from_settings, gen_useragent
@@ -89,8 +89,6 @@ def load_engine(engine_data):
engine.categories = []
else:
engine.categories = list(map(str.strip, param_value.split(',')))
- elif param_name == 'proxies':
- engine.proxies = get_proxy_cycles(param_value)
else:
setattr(engine, param_name, param_value)
@@ -289,9 +287,11 @@ def load_engines(engine_list):
def initialize_engines(engine_list):
load_engines(engine_list)
+ initialize_network(engine_list, settings['outgoing'])
def engine_init(engine_name, init_fn):
try:
+ set_context_network_name(engine_name)
init_fn(get_engine_from_settings(engine_name))
except SearxEngineResponseException as exc:
logger.warn('%s engine: Fail to initialize // %s', engine_name, exc)
diff --git a/searx/engines/bandcamp.py b/searx/engines/bandcamp.py
new file mode 100644
index 000000000..dafb3ee16
--- /dev/null
+++ b/searx/engines/bandcamp.py
@@ -0,0 +1,73 @@
+"""
+Bandcamp (Music)
+
+@website https://bandcamp.com/
+@provide-api no
+@results HTML
+@parse url, title, content, publishedDate, embedded, thumbnail
+"""
+
+from urllib.parse import urlencode, urlparse, parse_qs
+from dateutil.parser import parse as dateparse
+from lxml import html
+from searx.utils import extract_text
+
+categories = ['music']
+paging = True
+
+base_url = "https://bandcamp.com/"
+search_string = search_string = 'search?{query}&page={page}'
+embedded_url = '''<iframe width="100%" height="166"
+ scrolling="no" frameborder="no"
+ data-src="https://bandcamp.com/EmbeddedPlayer/{type}={result_id}/size=large/bgcol=ffffff/linkcol=0687f5/tracklist=false/artwork=small/transparent=true/"
+></iframe>'''
+
+
+def request(query, params):
+ '''pre-request callback
+ params<dict>:
+ method : POST/GET
+ headers : {}
+ data : {} # if method == POST
+ url : ''
+ category: 'search category'
+ pageno : 1 # number of the requested page
+ '''
+
+ search_path = search_string.format(
+ query=urlencode({'q': query}),
+ page=params['pageno'])
+
+ params['url'] = base_url + search_path
+
+ return params
+
+
+def response(resp):
+ '''post-response callback
+ resp: requests response object
+ '''
+ results = []
+ tree = html.fromstring(resp.text)
+ search_results = tree.xpath('//li[contains(@class, "searchresult")]')
+ for result in search_results:
+ link = result.xpath('.//div[@class="itemurl"]/a')[0]
+ result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0]
+ title = result.xpath('.//div[@class="heading"]/a/text()')
+ date = dateparse(result.xpath('//div[@class="released"]/text()')[0].replace("released ", ""))
+ content = result.xpath('.//div[@class="subhead"]/text()')
+ new_result = {
+ "url": extract_text(link),
+ "title": extract_text(title),
+ "content": extract_text(content),
+ "publishedDate": date,
+ }
+ thumbnail = result.xpath('.//div[@class="art"]/img/@src')
+ if thumbnail:
+ new_result['thumbnail'] = thumbnail[0]
+ if "album" in result.classes:
+ new_result["embedded"] = embedded_url.format(type='album', result_id=result_id)
+ elif "track" in result.classes:
+ new_result["embedded"] = embedded_url.format(type='track', result_id=result_id)
+ results.append(new_result)
+ return results
diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
index 2483c0805..eaa8b6ab4 100644
--- a/searx/engines/dictzone.py
+++ b/searx/engines/dictzone.py
@@ -52,7 +52,7 @@ def response(resp):
to_results.append(to_result.text_content())
results.append({
- 'url': urljoin(resp.url, '?%d' % k),
+ 'url': urljoin(str(resp.url), '?%d' % k),
'title': from_result.text_content(),
'content': '; '.join(to_results)
})
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index ae1e36686..3c086f81b 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -6,7 +6,7 @@
from lxml.html import fromstring
from json import loads
from searx.utils import extract_text, match_language, eval_xpath, dict_subset
-from searx.poolrequests import get
+from searx.network import get
# about
about = {
diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py
index 305eb1ca1..0daaf41e9 100644
--- a/searx/engines/duckduckgo_images.py
+++ b/searx/engines/duckduckgo_images.py
@@ -8,7 +8,7 @@ from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
from searx.engines.duckduckgo import get_region_code
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import
-from searx.poolrequests import get
+from searx.network import get
# about
about = {
diff --git a/searx/engines/elasticsearch.py b/searx/engines/elasticsearch.py
index da7f98074..db84a5c13 100644
--- a/searx/engines/elasticsearch.py
+++ b/searx/engines/elasticsearch.py
@@ -4,7 +4,6 @@
"""
from json import loads, dumps
-from requests.auth import HTTPBasicAuth
from searx.exceptions import SearxEngineAPIException
@@ -32,7 +31,7 @@ def request(query, params):
return params
if username and password:
- params['auth'] = HTTPBasicAuth(username, password)
+ params['auth'] = (username, password)
params['url'] = search_url
params['method'] = 'GET'
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 248991df9..bbd9e20d2 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -8,7 +8,7 @@ import re
from json import loads
from urllib.parse import urlencode
# from searx import logger
-from searx.poolrequests import get
+from searx.network import get
# about
about = {
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 8c20029a3..a4aee5c20 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -10,7 +10,7 @@ Definitions`_.
# pylint: disable=invalid-name, missing-function-docstring
-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode
from lxml import html
from searx import logger
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
@@ -186,8 +186,7 @@ def get_lang_info(params, lang_list, custom_aliases):
return ret_val
def detect_google_sorry(resp):
- resp_url = urlparse(resp.url)
- if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
+ if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
raise SearxEngineCaptchaException()
@@ -243,7 +242,7 @@ def response(resp):
if answer:
results.append({'answer': ' '.join(answer)})
else:
- logger.debug("did not found 'answer'")
+ logger.debug("did not find 'answer'")
# results --> number_of_results
try:
diff --git a/searx/engines/meilisearch.py b/searx/engines/meilisearch.py
new file mode 100644
index 000000000..4e0ff15f3
--- /dev/null
+++ b/searx/engines/meilisearch.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+ Meilisearch
+"""
+
+# pylint: disable=global-statement, missing-function-docstring
+
+from json import loads, dumps
+
+
+base_url = 'http://localhost:7700'
+index = ''
+auth_key = ''
+facet_filters = list()
+_search_url = ''
+result_template = 'key-value.html'
+categories = ['general']
+paging = True
+
+
+def init(_):
+ if index == '':
+ raise ValueError('index cannot be empty')
+
+ global _search_url
+ _search_url = base_url + '/indexes/' + index + '/search'
+
+
+def request(query, params):
+ if auth_key != '':
+ params['headers']['X-Meili-API-Key'] = auth_key
+
+ params['headers']['Content-Type'] = 'application/json'
+ params['url'] = _search_url
+ params['method'] = 'POST'
+
+ data = {
+ 'q': query,
+ 'offset': 10 * (params['pageno'] - 1),
+ 'limit': 10,
+ }
+ if len(facet_filters) > 0:
+ data['facetFilters'] = facet_filters
+
+ params['data'] = dumps(data)
+
+ return params
+
+
+def response(resp):
+ results = []
+
+ resp_json = loads(resp.text)
+ for result in resp_json['hits']:
+ r = {key: str(value) for key, value in result.items()}
+ r['template'] = result_template
+ results.append(r)
+
+ return results
diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py
index da02f91ca..5d88d398e 100644
--- a/searx/engines/pubmed.py
+++ b/searx/engines/pubmed.py
@@ -7,7 +7,7 @@ from flask_babel import gettext
from lxml import etree
from datetime import datetime
from urllib.parse import urlencode
-from searx.poolrequests import get
+from searx.network import get
# about
about = {
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
index 13dcf1250..d01dc0acc 100644
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@@ -8,7 +8,7 @@ from json import loads
from urllib.parse import urlencode
from searx.utils import html_to_text, match_language
from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
-from searx.raise_for_httperror import raise_for_httperror
+from searx.network import raise_for_httperror
# about
about = {
diff --git a/searx/engines/seznam.py b/searx/engines/seznam.py
index faceb0550..042088dbe 100644
--- a/searx/engines/seznam.py
+++ b/searx/engines/seznam.py
@@ -3,9 +3,9 @@
Seznam
"""
-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode
from lxml import html
-from searx.poolrequests import get
+from searx.network import get
from searx.exceptions import SearxEngineAccessDeniedException
from searx.utils import (
extract_text,
@@ -46,8 +46,7 @@ def request(query, params):
def response(resp):
- resp_url = urlparse(resp.url)
- if resp_url.path.startswith('/verify'):
+ if resp.url.path.startswith('/verify'):
raise SearxEngineAccessDeniedException()
results = []
diff --git a/searx/engines/sjp.py b/searx/engines/sjp.py
new file mode 100644
index 000000000..eff7b7092
--- /dev/null
+++ b/searx/engines/sjp.py
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Słownik Języka Polskiego (general)
+
+"""
+
+from lxml.html import fromstring
+from searx import logger
+from searx.utils import extract_text
+from searx.raise_for_httperror import raise_for_httperror
+
+logger = logger.getChild('sjp engine')
+
+# about
+about = {
+ "website": 'https://sjp.pwn.pl',
+ "wikidata_id": 'Q55117369',
+ "official_api_documentation": None,
+ "use_official_api": False,
+ "require_api_key": False,
+ "results": 'HTML',
+}
+
+categories = ['general']
+paging = False
+
+URL = 'https://sjp.pwn.pl'
+SEARCH_URL = URL + '/szukaj/{query}.html'
+
+word_xpath = '//div[@class="query"]'
+dict_xpath = ['//div[@class="wyniki sjp-so-wyniki sjp-so-anchor"]',
+ '//div[@class="wyniki sjp-wyniki sjp-anchor"]',
+ '//div[@class="wyniki sjp-doroszewski-wyniki sjp-doroszewski-anchor"]']
+
+
+def request(query, params):
+ params['url'] = SEARCH_URL.format(query=query)
+ logger.debug(f"query_url --> {params['url']}")
+ return params
+
+
+def response(resp):
+ results = []
+
+ raise_for_httperror(resp)
+ dom = fromstring(resp.text)
+ word = extract_text(dom.xpath(word_xpath))
+
+ definitions = []
+
+ for dict_src in dict_xpath:
+ for src in dom.xpath(dict_src):
+ src_text = extract_text(src.xpath('.//span[@class="entry-head-title"]/text()')).strip()
+
+ src_defs = []
+ for def_item in src.xpath('.//div[contains(@class, "ribbon-element")]'):
+ if def_item.xpath('./div[@class="znacz"]'):
+ sub_defs = []
+ for def_sub_item in def_item.xpath('./div[@class="znacz"]'):
+ def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ')
+ sub_defs.append(def_sub_text)
+ src_defs.append((word, sub_defs))
+ else:
+ def_text = extract_text(def_item).strip()
+ def_link = def_item.xpath('./span/a/@href')
+ if 'doroszewski' in def_link[0]:
+ def_text = f"<a href='{def_link[0]}'>{def_text}</a>"
+ src_defs.append((def_text, ''))
+
+ definitions.append((src_text, src_defs))
+
+ if not definitions:
+ return results
+
+ infobox = ''
+ for src in definitions:
+ infobox += f"<div><small>{src[0]}</small>"
+ infobox += "<ul>"
+ for (def_text, sub_def) in src[1]:
+ infobox += f"<li>{def_text}</li>"
+ if sub_def:
+ infobox += "<ol>"
+ for sub_def_text in sub_def:
+ infobox += f"<li>{sub_def_text}</li>"
+ infobox += "</ol>"
+ infobox += "</ul></div>"
+
+ results.append({
+ 'infobox': word,
+ 'content': infobox,
+ })
+
+ return results
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index b3e3383bd..a6f923855 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -9,7 +9,7 @@ from lxml import html
from dateutil import parser
from urllib.parse import quote_plus, urlencode
from searx import logger
-from searx.poolrequests import get as http_get
+from searx.network import get as http_get
# about
about = {
diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py
index 0ad8bfe32..6816fe672 100644
--- a/searx/engines/spotify.py
+++ b/searx/engines/spotify.py
@@ -5,9 +5,10 @@
from json import loads
from urllib.parse import urlencode
-import requests
import base64
+from searx.network import post as http_post
+
# about
about = {
"website": 'https://www.spotify.com',
@@ -38,7 +39,7 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset)
- r = requests.post(
+ r = http_post(
'https://accounts.spotify.com/api/token',
data={'grant_type': 'client_credentials'},
headers={'Authorization': 'Basic ' + base64.b64encode(
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
index 91eaa68e9..8fc2cdb3a 100644
--- a/searx/engines/stackoverflow.py
+++ b/searx/engines/stackoverflow.py
@@ -3,7 +3,7 @@
Stackoverflow (IT)
"""
-from urllib.parse import urlencode, urljoin, urlparse
+from urllib.parse import urlencode, urljoin
from lxml import html
from searx.utils import extract_text
from searx.exceptions import SearxEngineCaptchaException
@@ -41,8 +41,7 @@ def request(query, params):
# get response from search-request
def response(resp):
- resp_url = urlparse(resp.url)
- if resp_url.path.startswith('/nocaptcha'):
+ if resp.url.path.startswith('/nocaptcha'):
raise SearxEngineCaptchaException()
results = []
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index c8e4cfae6..ddcce9085 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -12,7 +12,7 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_
from searx import logger
from searx.data import WIKIDATA_UNITS
-from searx.poolrequests import post, get
+from searx.network import post, get
from searx.utils import match_language, searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index 3ad8748fb..5e34db9a7 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -7,7 +7,7 @@ from urllib.parse import quote
from json import loads
from lxml.html import fromstring
from searx.utils import match_language, searx_useragent
-from searx.raise_for_httperror import raise_for_httperror
+from searx.network import raise_for_httperror
# about
about = {
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
index 8e427d575..1f2cfa4e6 100644
--- a/searx/engines/wolframalpha_noapi.py
+++ b/searx/engines/wolframalpha_noapi.py
@@ -7,7 +7,7 @@ from json import loads
from time import time
from urllib.parse import urlencode
-from searx.poolrequests import get as http_get
+from searx.network import get as http_get
# about
about = {
diff --git a/searx/engines/wordnik.py b/searx/engines/wordnik.py
new file mode 100644
index 000000000..4bfeb4070
--- /dev/null
+++ b/searx/engines/wordnik.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Wordnik (general)
+
+"""
+
+from lxml.html import fromstring
+from searx import logger
+from searx.utils import extract_text
+from searx.network import raise_for_httperror
+
+logger = logger.getChild('Wordnik engine')
+
+# about
+about = {
+ "website": 'https://www.wordnik.com',
+ "wikidata_id": 'Q8034401',
+ "official_api_documentation": None,
+ "use_official_api": False,
+ "require_api_key": False,
+ "results": 'HTML',
+}
+
+categories = ['general']
+paging = False
+
+URL = 'https://www.wordnik.com'
+SEARCH_URL = URL + '/words/{query}'
+
+
+def request(query, params):
+ params['url'] = SEARCH_URL.format(query=query)
+ logger.debug(f"query_url --> {params['url']}")
+ return params
+
+
+def response(resp):
+ results = []
+
+ raise_for_httperror(resp)
+ dom = fromstring(resp.text)
+ word = extract_text(dom.xpath('//*[@id="headword"]/text()'))
+
+ definitions = []
+ for src in dom.xpath('//*[@id="define"]//h3[@class="source"]'):
+ src_text = extract_text(src).strip()
+ if src_text.startswith('from '):
+ src_text = src_text[5:]
+
+ src_defs = []
+ for def_item in src.xpath('following-sibling::ul[1]/li'):
+ def_abbr = extract_text(def_item.xpath('.//abbr')).strip()
+ def_text = extract_text(def_item).strip()
+ if def_abbr:
+ def_text = def_text[len(def_abbr):].strip()
+ src_defs.append((def_abbr, def_text))
+
+ definitions.append((src_text, src_defs))
+
+ if not definitions:
+ return results
+
+ infobox = ''
+ for src_text, src_defs in definitions:
+ infobox += f"<small>{src_text}</small>"
+ infobox += "<ul>"
+ for def_abbr, def_text in src_defs:
+ if def_abbr:
+ def_abbr += ": "
+ infobox += f"<li><i>{def_abbr}</i> {def_text}</li>"
+ infobox += "</ul>"
+
+ results.append({
+ 'infobox': word,
+ 'content': infobox,
+ })
+
+ return results
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
index c194ca451..fbd99c47b 100644
--- a/searx/engines/yacy.py
+++ b/searx/engines/yacy.py
@@ -7,7 +7,7 @@ from json import loads
from dateutil import parser
from urllib.parse import urlencode
-from requests.auth import HTTPDigestAuth
+from httpx import DigestAuth
from searx.utils import html_to_text
@@ -56,7 +56,7 @@ def request(query, params):
search_type=search_type)
if http_digest_auth_user and http_digest_auth_pass:
- params['auth'] = HTTPDigestAuth(http_digest_auth_user, http_digest_auth_pass)
+ params['auth'] = DigestAuth(http_digest_auth_user, http_digest_auth_pass)
# add language tag if specified
if params['language'] != 'all':
diff --git a/searx/engines/yggtorrent.py b/searx/engines/yggtorrent.py
index 8dfc0a0f2..f5af91f46 100644
--- a/searx/engines/yggtorrent.py
+++ b/searx/engines/yggtorrent.py
@@ -8,7 +8,7 @@ from operator import itemgetter
from datetime import datetime
from urllib.parse import quote
from searx.utils import extract_text, get_torrent_size
-from searx.poolrequests import get as http_get
+from searx.network import get as http_get
# about
about = {
diff --git a/searx/metrology/error_recorder.py b/searx/metrology/error_recorder.py
index f533e4e8b..167d1c8aa 100644
--- a/searx/metrology/error_recorder.py
+++ b/searx/metrology/error_recorder.py
@@ -3,7 +3,7 @@ import inspect
import logging
from json import JSONDecodeError
from urllib.parse import urlparse
-from requests.exceptions import RequestException
+from httpx import HTTPError, HTTPStatusError
from searx.exceptions import (SearxXPathSyntaxException, SearxEngineXPathException, SearxEngineAPIException,
SearxEngineAccessDeniedException)
from searx import logger
@@ -60,28 +60,28 @@ def get_trace(traces):
return traces[-1]
-def get_hostname(exc: RequestException) -> typing.Optional[None]:
+def get_hostname(exc: HTTPError) -> typing.Optional[str]:
url = exc.request.url
if url is None and exc.response is not None:
url = exc.response.url
return urlparse(url).netloc
-def get_request_exception_messages(exc: RequestException)\
+def get_request_exception_messages(exc: HTTPError)\
-> typing.Tuple[typing.Optional[str], typing.Optional[str], typing.Optional[str]]:
url = None
status_code = None
reason = None
hostname = None
- if exc.request is not None:
+ if hasattr(exc, 'request') and exc.request is not None:
url = exc.request.url
- if url is None and exc.response is not None:
+    if url is None and hasattr(exc, 'response') and exc.response is not None:
url = exc.response.url
if url is not None:
- hostname = str(urlparse(url).netloc)
- if exc.response is not None:
+ hostname = url.host
+ if isinstance(exc, HTTPStatusError):
status_code = str(exc.response.status_code)
- reason = exc.response.reason
+ reason = exc.response.reason_phrase
return (status_code, reason, hostname)
@@ -92,7 +92,7 @@ def get_messages(exc, filename) -> typing.Tuple:
return (str(exc), )
if isinstance(exc, ValueError) and 'lxml' in filename:
return (str(exc), )
- if isinstance(exc, RequestException):
+ if isinstance(exc, HTTPError):
return get_request_exception_messages(exc)
if isinstance(exc, SearxXPathSyntaxException):
return (exc.xpath_str, exc.message)
diff --git a/searx/network/__init__.py b/searx/network/__init__.py
new file mode 100644
index 000000000..dbd31c781
--- /dev/null
+++ b/searx/network/__init__.py
@@ -0,0 +1,189 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import asyncio
+import threading
+import concurrent.futures
+from time import time
+
+import httpx
+import h2.exceptions
+
+from .network import get_network, initialize
+from .client import LOOP
+from .raise_for_httperror import raise_for_httperror
+
+# queue.SimpleQueue: Support Python 3.6
+try:
+ from queue import SimpleQueue
+except ImportError:
+ from queue import Empty
+ from collections import deque
+
+ class SimpleQueue:
+ """Minimal backport of queue.SimpleQueue"""
+
+ def __init__(self):
+ self._queue = deque()
+ self._count = threading.Semaphore(0)
+
+ def put(self, item):
+ self._queue.append(item)
+ self._count.release()
+
+ def get(self):
+ if not self._count.acquire(True):
+ raise Empty
+ return self._queue.popleft()
+
+
+THREADLOCAL = threading.local()
+
+
+def reset_time_for_thread():
+ THREADLOCAL.total_time = 0
+
+
+def get_time_for_thread():
+ return THREADLOCAL.total_time
+
+
+def set_timeout_for_thread(timeout, start_time=None):
+ THREADLOCAL.timeout = timeout
+ THREADLOCAL.start_time = start_time
+
+
+def set_context_network_name(network_name):
+ THREADLOCAL.network = get_network(network_name)
+
+
+def get_context_network():
+ try:
+ return THREADLOCAL.network
+ except AttributeError:
+ return get_network()
+
+
+def request(method, url, **kwargs):
+ """same as requests/requests/api.py request(...)"""
+ time_before_request = time()
+
+ # timeout (httpx)
+ if 'timeout' in kwargs:
+ timeout = kwargs['timeout']
+ else:
+ timeout = getattr(THREADLOCAL, 'timeout', None)
+ if timeout is not None:
+ kwargs['timeout'] = timeout
+
+ # 2 minutes timeout for the requests without timeout
+ timeout = timeout or 120
+
+    # adjust actual timeout
+ timeout += 0.2 # overhead
+ start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
+ if start_time:
+ timeout -= time() - start_time
+
+ # raise_for_error
+ check_for_httperror = True
+ if 'raise_for_httperror' in kwargs:
+ check_for_httperror = kwargs['raise_for_httperror']
+ del kwargs['raise_for_httperror']
+
+ # requests compatibility
+ if isinstance(url, bytes):
+ url = url.decode()
+
+ # network
+ network = get_context_network()
+
+ # do request
+ future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), LOOP)
+ try:
+ response = future.result(timeout)
+ except concurrent.futures.TimeoutError as e:
+ raise httpx.TimeoutException('Timeout', request=None) from e
+
+ # requests compatibility
+ # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
+ response.ok = not response.is_error
+
+ # update total_time.
+ # See get_time_for_thread() and reset_time_for_thread()
+ if hasattr(THREADLOCAL, 'total_time'):
+ time_after_request = time()
+ THREADLOCAL.total_time += time_after_request - time_before_request
+
+ # raise an exception
+ if check_for_httperror:
+ raise_for_httperror(response)
+
+ return response
+
+
+def get(url, **kwargs):
+ kwargs.setdefault('allow_redirects', True)
+ return request('get', url, **kwargs)
+
+
+def options(url, **kwargs):
+ kwargs.setdefault('allow_redirects', True)
+ return request('options', url, **kwargs)
+
+
+def head(url, **kwargs):
+ kwargs.setdefault('allow_redirects', False)
+ return request('head', url, **kwargs)
+
+
+def post(url, data=None, **kwargs):
+ return request('post', url, data=data, **kwargs)
+
+
+def put(url, data=None, **kwargs):
+ return request('put', url, data=data, **kwargs)
+
+
+def patch(url, data=None, **kwargs):
+ return request('patch', url, data=data, **kwargs)
+
+
+def delete(url, **kwargs):
+ return request('delete', url, **kwargs)
+
+
+async def stream_chunk_to_queue(network, q, method, url, **kwargs):
+ try:
+ async with network.stream(method, url, **kwargs) as response:
+ q.put(response)
+ async for chunk in response.aiter_bytes(65536):
+ if len(chunk) > 0:
+ q.put(chunk)
+ except (httpx.HTTPError, OSError, h2.exceptions.ProtocolError) as e:
+ q.put(e)
+ finally:
+ q.put(None)
+
+
+def stream(method, url, **kwargs):
+ """Replace httpx.stream.
+
+ Usage:
+    stream = network.stream(...)
+    response = next(stream)
+    for chunk in stream:
+    ...
+
+    httpx.Client.stream requires writing the httpx.HTTPTransport version of
+    the httpx.AsyncHTTPTransport declared above.
+ """
+ q = SimpleQueue()
+ future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(get_network(), q, method, url, **kwargs),
+ LOOP)
+ chunk_or_exception = q.get()
+ while chunk_or_exception is not None:
+ if isinstance(chunk_or_exception, Exception):
+ raise chunk_or_exception
+ yield chunk_or_exception
+ chunk_or_exception = q.get()
+ return future.result()
diff --git a/searx/network/client.py b/searx/network/client.py
new file mode 100644
index 000000000..631e36f8f
--- /dev/null
+++ b/searx/network/client.py
@@ -0,0 +1,214 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import asyncio
+import logging
+import threading
+
+import httpcore
+import httpx
+from httpx_socks import AsyncProxyTransport
+from python_socks import parse_proxy_url
+import python_socks._errors
+
+from searx import logger
+
+# Optional uvloop (support Python 3.6)
+try:
+ import uvloop
+except ImportError:
+ pass
+else:
+ uvloop.install()
+
+
+logger = logger.getChild('searx.http.client')
+LOOP = None
+TRANSPORT_KWARGS = {
+ 'backend': 'asyncio',
+ 'trust_env': False,
+}
+
+
+async def close_connections_for_url(connection_pool: httpcore.AsyncConnectionPool, url: httpcore._utils.URL):
+ origin = httpcore._utils.url_to_origin(url)
+ logger.debug('Drop connections for %r', origin)
+ connections_to_close = connection_pool._connections_for_origin(origin)
+ for connection in connections_to_close:
+ await connection_pool._remove_from_pool(connection)
+ try:
+ await connection.aclose()
+ except httpcore.NetworkError as e:
+ logger.warning('Error closing an existing connection', exc_info=e)
+
+
+class AsyncHTTPTransportNoHttp(httpcore.AsyncHTTPTransport):
+ """Block HTTP request"""
+
+ async def arequest(self, method, url, headers=None, stream=None, ext=None):
+ raise httpcore.UnsupportedProtocol("HTTP protocol is disabled")
+
+
+class AsyncProxyTransportFixed(AsyncProxyTransport):
+ """Fix httpx_socks.AsyncProxyTransport
+
+ Map python_socks exceptions to httpcore.ProxyError
+
+ Map socket.gaierror to httpcore.ConnectError
+
+ Note: keepalive_expiry is ignored, AsyncProxyTransport should call:
+ * self._keepalive_sweep()
+ * self._response_closed(self, connection)
+
+ Note: AsyncProxyTransport inherit from AsyncConnectionPool
+
+ Note: the API is going to change on httpx 0.18.0
+ see https://github.com/encode/httpx/pull/1522
+ """
+
+ async def arequest(self, method, url, headers=None, stream=None, ext=None):
+ retry = 2
+ while retry > 0:
+ retry -= 1
+ try:
+ return await super().arequest(method, url, headers, stream, ext)
+ except (python_socks._errors.ProxyConnectionError,
+ python_socks._errors.ProxyTimeoutError,
+ python_socks._errors.ProxyError) as e:
+ raise httpcore.ProxyError(e)
+ except OSError as e:
+ # socket.gaierror when DNS resolution fails
+ raise httpcore.NetworkError(e)
+ except httpcore.RemoteProtocolError as e:
+ # in case of httpcore.RemoteProtocolError: Server disconnected
+ await close_connections_for_url(self, url)
+ logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e)
+ # retry
+ except (httpcore.NetworkError, httpcore.ProtocolError) as e:
+ # httpcore.WriteError on HTTP/2 connection leaves a new opened stream
+ # then each new request creates a new stream and raise the same WriteError
+ await close_connections_for_url(self, url)
+ raise e
+
+
+class AsyncHTTPTransportFixed(httpx.AsyncHTTPTransport):
+ """Fix httpx.AsyncHTTPTransport"""
+
+ async def arequest(self, method, url, headers=None, stream=None, ext=None):
+ retry = 2
+ while retry > 0:
+ retry -= 1
+ try:
+ return await super().arequest(method, url, headers, stream, ext)
+ except OSError as e:
+ # socket.gaierror when DNS resolution fails
+ raise httpcore.ConnectError(e)
+ except httpcore.CloseError as e:
+ # httpcore.CloseError: [Errno 104] Connection reset by peer
+ # raised by _keepalive_sweep()
+ # from https://github.com/encode/httpcore/blob/4b662b5c42378a61e54d673b4c949420102379f5/httpcore/_backends/asyncio.py#L198 # noqa
+ await close_connections_for_url(self._pool, url)
+ logger.warning('httpcore.CloseError: retry', exc_info=e)
+ # retry
+ except httpcore.RemoteProtocolError as e:
+ # in case of httpcore.RemoteProtocolError: Server disconnected
+ await close_connections_for_url(self._pool, url)
+ logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e)
+ # retry
+ except (httpcore.ProtocolError, httpcore.NetworkError) as e:
+ await close_connections_for_url(self._pool, url)
+ raise e
+
+
+def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit, retries):
+ global LOOP, TRANSPORT_KWARGS
+ # support socks5h (requests compatibility):
+ # https://requests.readthedocs.io/en/master/user/advanced/#socks
+ # socks5:// hostname is resolved on client side
+ # socks5h:// hostname is resolved on proxy side
+ rdns = False
+ socks5h = 'socks5h://'
+ if proxy_url.startswith(socks5h):
+ proxy_url = 'socks5://' + proxy_url[len(socks5h):]
+ rdns = True
+
+ proxy_type, proxy_host, proxy_port, proxy_username, proxy_password = parse_proxy_url(proxy_url)
+
+ return AsyncProxyTransportFixed(proxy_type=proxy_type, proxy_host=proxy_host, proxy_port=proxy_port,
+ username=proxy_username, password=proxy_password,
+ rdns=rdns,
+ loop=LOOP,
+ verify=verify,
+ http2=http2,
+ local_address=local_address,
+ max_connections=limit.max_connections,
+ max_keepalive_connections=limit.max_keepalive_connections,
+ keepalive_expiry=limit.keepalive_expiry,
+ retries=retries,
+ **TRANSPORT_KWARGS)
+
+
+def get_transport(verify, http2, local_address, proxy_url, limit, retries):
+ return AsyncHTTPTransportFixed(verify=verify,
+ http2=http2,
+ local_address=local_address,
+ proxy=httpx._config.Proxy(proxy_url) if proxy_url else None,
+ limits=limit,
+ retries=retries,
+ **TRANSPORT_KWARGS)
+
+
+def iter_proxies(proxies):
+ # https://www.python-httpx.org/compatibility/#proxy-keys
+ if isinstance(proxies, str):
+ yield 'all://', proxies
+ elif isinstance(proxies, dict):
+ for pattern, proxy_url in proxies.items():
+ yield pattern, proxy_url
+
+
+def new_client(enable_http, verify, enable_http2,
+ max_connections, max_keepalive_connections, keepalive_expiry,
+ proxies, local_address, retries, max_redirects):
+ limit = httpx.Limits(max_connections=max_connections,
+ max_keepalive_connections=max_keepalive_connections,
+ keepalive_expiry=keepalive_expiry)
+ # See https://www.python-httpx.org/advanced/#routing
+ mounts = {}
+ for pattern, proxy_url in iter_proxies(proxies):
+ if not enable_http and (pattern == 'http' or pattern.startswith('http://')):
+ continue
+ if proxy_url.startswith('socks4://') \
+ or proxy_url.startswith('socks5://') \
+ or proxy_url.startswith('socks5h://'):
+ mounts[pattern] = get_transport_for_socks_proxy(verify, enable_http2, local_address, proxy_url, limit,
+ retries)
+ else:
+ mounts[pattern] = get_transport(verify, enable_http2, local_address, proxy_url, limit, retries)
+
+ if not enable_http:
+ mounts['http://'] = AsyncHTTPTransportNoHttp()
+
+ transport = get_transport(verify, enable_http2, local_address, None, limit, retries)
+ return httpx.AsyncClient(transport=transport, mounts=mounts, max_redirects=max_redirects)
+
+
+def init():
+ # log
+ for logger_name in ('hpack.hpack', 'hpack.table'):
+ logging.getLogger(logger_name).setLevel(logging.WARNING)
+
+ # loop
+ def loop_thread():
+ global LOOP
+ LOOP = asyncio.new_event_loop()
+ LOOP.run_forever()
+
+ th = threading.Thread(
+ target=loop_thread,
+ name='asyncio_loop',
+ daemon=True,
+ )
+ th.start()
+
+
+init()
diff --git a/searx/network/network.py b/searx/network/network.py
new file mode 100644
index 000000000..f50acf595
--- /dev/null
+++ b/searx/network/network.py
@@ -0,0 +1,302 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import atexit
+import asyncio
+import ipaddress
+from itertools import cycle
+
+import httpx
+
+from .client import new_client, LOOP
+
+
+DEFAULT_NAME = '__DEFAULT__'
+NETWORKS = {}
+# requests compatibility when reading proxy settings from settings.yml
+PROXY_PATTERN_MAPPING = {
+ 'http': 'http://',
+ 'https': 'https://',
+ 'socks4': 'socks4://',
+ 'socks5': 'socks5://',
+ 'socks5h': 'socks5h://',
+ 'http:': 'http://',
+ 'https:': 'https://',
+ 'socks4:': 'socks4://',
+ 'socks5:': 'socks5://',
+ 'socks5h:': 'socks5h://',
+}
+
+ADDRESS_MAPPING = {
+ 'ipv4': '0.0.0.0',
+ 'ipv6': '::'
+}
+
+
+class Network:
+
+ __slots__ = ('enable_http', 'verify', 'enable_http2',
+ 'max_connections', 'max_keepalive_connections', 'keepalive_expiry',
+ 'local_addresses', 'proxies', 'max_redirects', 'retries', 'retry_on_http_error',
+ '_local_addresses_cycle', '_proxies_cycle', '_clients')
+
+ def __init__(self,
+ enable_http=True,
+ verify=True,
+ enable_http2=False,
+ max_connections=None,
+ max_keepalive_connections=None,
+ keepalive_expiry=None,
+ proxies=None,
+ local_addresses=None,
+ retries=0,
+ retry_on_http_error=None,
+ max_redirects=30):
+ self.enable_http = enable_http
+ self.verify = verify
+ self.enable_http2 = enable_http2
+ self.max_connections = max_connections
+ self.max_keepalive_connections = max_keepalive_connections
+ self.keepalive_expiry = keepalive_expiry
+ self.proxies = proxies
+ self.local_addresses = local_addresses
+ self.retries = retries
+ self.retry_on_http_error = retry_on_http_error
+ self.max_redirects = max_redirects
+ self._local_addresses_cycle = self.get_ipaddress_cycle()
+ self._proxies_cycle = self.get_proxy_cycles()
+ self._clients = {}
+ self.check_parameters()
+
+ def check_parameters(self):
+ for address in self.iter_ipaddresses():
+ if '/' in address:
+ ipaddress.ip_network(address, False)
+ else:
+ ipaddress.ip_address(address)
+
+ if self.proxies is not None and not isinstance(self.proxies, (str, dict)):
+ raise ValueError('proxies type has to be str, dict or None')
+
+ def iter_ipaddresses(self):
+ local_addresses = self.local_addresses
+ if not local_addresses:
+ return
+ elif isinstance(local_addresses, str):
+ local_addresses = [local_addresses]
+ for address in local_addresses:
+ yield address
+
+ def get_ipaddress_cycle(self):
+ while True:
+ count = 0
+ for address in self.iter_ipaddresses():
+ if '/' in address:
+ for a in ipaddress.ip_network(address, False).hosts():
+ yield str(a)
+ count += 1
+ else:
+ a = ipaddress.ip_address(address)
+ yield str(a)
+ count += 1
+ if count == 0:
+ yield None
+
+ def iter_proxies(self):
+ if not self.proxies:
+ return
+ # https://www.python-httpx.org/compatibility/#proxy-keys
+ if isinstance(self.proxies, str):
+ yield 'all://', [self.proxies]
+ else:
+ for pattern, proxy_url in self.proxies.items():
+ pattern = PROXY_PATTERN_MAPPING.get(pattern, pattern)
+ if isinstance(proxy_url, str):
+ proxy_url = [proxy_url]
+ yield pattern, proxy_url
+
+ def get_proxy_cycles(self):
+ proxy_settings = {}
+ for pattern, proxy_urls in self.iter_proxies():
+ proxy_settings[pattern] = cycle(proxy_urls)
+ while True:
+ yield tuple((pattern, next(proxy_url_cycle)) for pattern, proxy_url_cycle in proxy_settings.items())
+
+ def get_client(self, verify=None, max_redirects=None):
+ verify = self.verify if verify is None else verify
+ max_redirects = self.max_redirects if max_redirects is None else max_redirects
+ local_address = next(self._local_addresses_cycle)
+ proxies = next(self._proxies_cycle) # is a tuple so it can be part of the key
+ key = (verify, max_redirects, local_address, proxies)
+ if key not in self._clients or self._clients[key].is_closed:
+ self._clients[key] = new_client(self.enable_http,
+ verify,
+ self.enable_http2,
+ self.max_connections,
+ self.max_keepalive_connections,
+ self.keepalive_expiry,
+ dict(proxies),
+ local_address,
+ 0,
+ max_redirects)
+ return self._clients[key]
+
+ async def aclose(self):
+ async def close_client(client):
+ try:
+ await client.aclose()
+ except httpx.HTTPError:
+ pass
+ await asyncio.gather(*[close_client(client) for client in self._clients.values()], return_exceptions=False)
+
+ @staticmethod
+ def get_kwargs_clients(kwargs):
+ kwargs_clients = {}
+ if 'verify' in kwargs:
+ kwargs_clients['verify'] = kwargs.pop('verify')
+ if 'max_redirects' in kwargs:
+ kwargs_clients['max_redirects'] = kwargs.pop('max_redirects')
+ return kwargs_clients
+
+    def is_valid_response(self, response):
+        if (self.retry_on_http_error is True and 400 <= response.status_code <= 599) \
+                or (isinstance(self.retry_on_http_error, list) and response.status_code in self.retry_on_http_error) \
+                or (isinstance(self.retry_on_http_error, int) and response.status_code == self.retry_on_http_error):
+            return False
+        return True
+
+    async def request(self, method, url, **kwargs):
+        retries = self.retries
+        while retries >= 0:  # pragma: no cover
+            kwargs_clients = Network.get_kwargs_clients(kwargs)
+            client = self.get_client(**kwargs_clients)
+            try:
+                response = await client.request(method, url, **kwargs)
+                if self.is_valid_response(response) or retries <= 0:
+                    return response
+            except (httpx.RequestError, httpx.HTTPStatusError) as e:
+                if retries <= 0:
+                    raise e
+            retries -= 1
+
+    def stream(self, method, url, **kwargs):
+        retries = self.retries
+        while retries >= 0:  # pragma: no cover
+            kwargs_clients = Network.get_kwargs_clients(kwargs)
+            client = self.get_client(**kwargs_clients)
+            try:
+                response = client.stream(method, url, **kwargs)
+                if self.is_valid_response(response) or retries <= 0:
+ return response
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
+ if retries <= 0:
+ raise e
+ retries -= 1
+
+ @classmethod
+ async def aclose_all(cls):
+ await asyncio.gather(*[network.aclose() for network in NETWORKS.values()], return_exceptions=False)
+
+
+def get_network(name=None):
+ global NETWORKS
+ return NETWORKS[name or DEFAULT_NAME]
+
+
+def initialize(settings_engines=None, settings_outgoing=None):
+ from searx.engines import engines
+ from searx import settings
+
+ global NETWORKS
+
+ settings_engines = settings_engines or settings.get('engines')
+ settings_outgoing = settings_outgoing or settings.get('outgoing')
+
+ # default parameters for AsyncHTTPTransport
+ # see https://github.com/encode/httpx/blob/e05a5372eb6172287458b37447c30f650047e1b8/httpx/_transports/default.py#L108-L121 # noqa
+ default_params = {
+ 'enable_http': False,
+ 'verify': True,
+ 'enable_http2': settings_outgoing.get('enable_http2', True),
+ # Magic number kept from previous code
+ 'max_connections': settings_outgoing.get('pool_connections', 100),
+ # Picked from constructor
+ 'max_keepalive_connections': settings_outgoing.get('pool_maxsize', 10),
+ #
+ 'keepalive_expiry': settings_outgoing.get('keepalive_expiry', 5.0),
+ 'local_addresses': settings_outgoing.get('source_ips'),
+ 'proxies': settings_outgoing.get('proxies'),
+ # default maximum redirect
+ # from https://github.com/psf/requests/blob/8c211a96cdbe9fe320d63d9e1ae15c5c07e179f8/requests/models.py#L55
+ 'max_redirects': settings_outgoing.get('max_redirects', 30),
+ #
+ 'retries': settings_outgoing.get('retries', 0),
+ 'retry_on_http_error': None,
+ }
+
+ def new_network(params):
+ nonlocal default_params
+ result = {}
+ result.update(default_params)
+ result.update(params)
+ return Network(**result)
+
+ def iter_networks():
+ nonlocal settings_engines
+ for engine_spec in settings_engines:
+ engine_name = engine_spec['name']
+ engine = engines.get(engine_name)
+ if engine is None:
+ continue
+ network = getattr(engine, 'network', None)
+ yield engine_name, engine, network
+
+ if NETWORKS:
+ done()
+ NETWORKS.clear()
+ NETWORKS[DEFAULT_NAME] = new_network({})
+ NETWORKS['ipv4'] = new_network({'local_addresses': '0.0.0.0'})
+ NETWORKS['ipv6'] = new_network({'local_addresses': '::'})
+
+ # define networks from outgoing.networks
+ for network_name, network in settings_outgoing.get('networks', {}).items():
+ NETWORKS[network_name] = new_network(network)
+
+ # define networks from engines.[i].network (except references)
+ for engine_name, engine, network in iter_networks():
+ if network is None:
+ network = {}
+ for attribute_name, attribute_value in default_params.items():
+ if hasattr(engine, attribute_name):
+ network[attribute_name] = getattr(engine, attribute_name)
+ else:
+ network[attribute_name] = attribute_value
+ NETWORKS[engine_name] = new_network(network)
+ elif isinstance(network, dict):
+ NETWORKS[engine_name] = new_network(network)
+
+ # define networks from engines.[i].network (references)
+ for engine_name, engine, network in iter_networks():
+ if isinstance(network, str):
+ NETWORKS[engine_name] = NETWORKS[network]
+
+
+@atexit.register
+def done():
+ """Close all HTTP client
+
+ Avoid a warning at exit
+ see https://github.com/encode/httpx/blob/1a6e254f72d9fd5694a1c10a28927e193ab4f76b/httpx/_client.py#L1785
+
+ Note: since Network.aclose has to be async, it is not possible to call this method on Network.__del__
+ So Network.aclose is called here using atexit.register
+ """
+ try:
+ if LOOP:
+ future = asyncio.run_coroutine_threadsafe(Network.aclose_all(), LOOP)
+ # wait 3 seconds to close the HTTP clients
+ future.result(3)
+ finally:
+ NETWORKS.clear()
+
+
+NETWORKS[DEFAULT_NAME] = Network()
diff --git a/searx/raise_for_httperror.py b/searx/network/raise_for_httperror.py
index bd12df9a9..bd12df9a9 100644
--- a/searx/raise_for_httperror.py
+++ b/searx/network/raise_for_httperror.py
diff --git a/searx/plugins/tracker_url_remover.py b/searx/plugins/tracker_url_remover.py
index 742f39013..98ddddbcd 100644
--- a/searx/plugins/tracker_url_remover.py
+++ b/searx/plugins/tracker_url_remover.py
@@ -21,6 +21,7 @@ from urllib.parse import urlunparse, parse_qsl, urlencode
regexes = {re.compile(r'utm_[^&]+'),
re.compile(r'(wkey|wemail)[^&]*'),
+ re.compile(r'(_hsenc|_hsmi|hsCtaTracking|__hssc|__hstc|__hsfp)[^&]*'),
re.compile(r'&$')}
name = gettext('Tracker URL remover')
diff --git a/searx/poolrequests.py b/searx/poolrequests.py
deleted file mode 100644
index ab327251b..000000000
--- a/searx/poolrequests.py
+++ /dev/null
@@ -1,235 +0,0 @@
-import sys
-from time import time
-from itertools import cycle
-from threading import local
-
-import requests
-
-from searx import settings
-from searx import logger
-from searx.raise_for_httperror import raise_for_httperror
-
-
-logger = logger.getChild('poolrequests')
-
-
-try:
- import ssl
- if ssl.OPENSSL_VERSION_INFO[0:3] < (1, 0, 2):
- # https://github.com/certifi/python-certifi#1024-bit-root-certificates
- logger.critical('You are using an old openssl version({0}), please upgrade above 1.0.2!'
- .format(ssl.OPENSSL_VERSION))
- sys.exit(1)
-except ImportError:
- ssl = None
-if not getattr(ssl, "HAS_SNI", False):
- try:
- import OpenSSL # pylint: disable=unused-import
- except ImportError:
- logger.critical("ssl doesn't support SNI and the pyopenssl module is not installed.\n"
- "Some HTTPS connections will fail")
- sys.exit(1)
-
-
-class HTTPAdapterWithConnParams(requests.adapters.HTTPAdapter):
-
- def __init__(self, pool_connections=requests.adapters.DEFAULT_POOLSIZE,
- pool_maxsize=requests.adapters.DEFAULT_POOLSIZE,
- max_retries=requests.adapters.DEFAULT_RETRIES,
- pool_block=requests.adapters.DEFAULT_POOLBLOCK,
- **conn_params):
- if max_retries == requests.adapters.DEFAULT_RETRIES:
- self.max_retries = requests.adapters.Retry(0, read=False)
- else:
- self.max_retries = requests.adapters.Retry.from_int(max_retries)
- self.config = {}
- self.proxy_manager = {}
-
- super().__init__()
-
- self._pool_connections = pool_connections
- self._pool_maxsize = pool_maxsize
- self._pool_block = pool_block
- self._conn_params = conn_params
-
- self.init_poolmanager(pool_connections, pool_maxsize, block=pool_block, **conn_params)
-
- def __setstate__(self, state):
- # Can't handle by adding 'proxy_manager' to self.__attrs__ because
- # because self.poolmanager uses a lambda function, which isn't pickleable.
- self.proxy_manager = {}
- self.config = {}
-
- for attr, value in state.items():
- setattr(self, attr, value)
-
- self.init_poolmanager(self._pool_connections, self._pool_maxsize,
- block=self._pool_block, **self._conn_params)
-
-
-threadLocal = local()
-connect = settings['outgoing'].get('pool_connections', 100) # Magic number kept from previous code
-maxsize = settings['outgoing'].get('pool_maxsize', requests.adapters.DEFAULT_POOLSIZE) # Picked from constructor
-if settings['outgoing'].get('source_ips'):
- http_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize,
- source_address=(source_ip, 0))
- for source_ip in settings['outgoing']['source_ips'])
- https_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize,
- source_address=(source_ip, 0))
- for source_ip in settings['outgoing']['source_ips'])
-else:
- http_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize), ))
- https_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize), ))
-
-
-class SessionSinglePool(requests.Session):
-
- def __init__(self):
- super().__init__()
-
- # reuse the same adapters
- self.adapters.clear()
-
- https_adapter = threadLocal.__dict__.setdefault('https_adapter', next(https_adapters))
- self.mount('https://', https_adapter)
- if get_enable_http_protocol():
- http_adapter = threadLocal.__dict__.setdefault('http_adapter', next(http_adapters))
- self.mount('http://', http_adapter)
-
- def close(self):
- """Call super, but clear adapters since there are managed globaly"""
- self.adapters.clear()
- super().close()
-
-
-def set_timeout_for_thread(timeout, start_time=None):
- threadLocal.timeout = timeout
- threadLocal.start_time = start_time
-
-
-def set_enable_http_protocol(enable_http):
- threadLocal.enable_http = enable_http
-
-
-def get_enable_http_protocol():
- try:
- return threadLocal.enable_http
- except AttributeError:
- return False
-
-
-def reset_time_for_thread():
- threadLocal.total_time = 0
-
-
-def get_time_for_thread():
- return threadLocal.total_time
-
-
-def get_proxy_cycles(proxy_settings):
- if not proxy_settings:
- return None
- # Backwards compatibility for single proxy in settings.yml
- for protocol, proxy in proxy_settings.items():
- if isinstance(proxy, str):
- proxy_settings[protocol] = [proxy]
-
- for protocol in proxy_settings:
- proxy_settings[protocol] = cycle(proxy_settings[protocol])
- return proxy_settings
-
-
-GLOBAL_PROXY_CYCLES = get_proxy_cycles(settings['outgoing'].get('proxies'))
-
-
-def get_proxies(proxy_cycles):
- if proxy_cycles:
- return {protocol: next(proxy_cycle) for protocol, proxy_cycle in proxy_cycles.items()}
- return None
-
-
-def get_global_proxies():
- return get_proxies(GLOBAL_PROXY_CYCLES)
-
-
-def request(method, url, **kwargs):
- """same as requests/requests/api.py request(...)"""
- time_before_request = time()
-
- # session start
- session = SessionSinglePool()
-
- # proxies
- if not kwargs.get('proxies'):
- kwargs['proxies'] = get_global_proxies()
-
- # timeout
- if 'timeout' in kwargs:
- timeout = kwargs['timeout']
- else:
- timeout = getattr(threadLocal, 'timeout', None)
- if timeout is not None:
- kwargs['timeout'] = timeout
-
- # raise_for_error
- check_for_httperror = True
- if 'raise_for_httperror' in kwargs:
- check_for_httperror = kwargs['raise_for_httperror']
- del kwargs['raise_for_httperror']
-
- # do request
- response = session.request(method=method, url=url, **kwargs)
-
- time_after_request = time()
-
- # is there a timeout for this engine ?
- if timeout is not None:
- timeout_overhead = 0.2 # seconds
- # start_time = when the user request started
- start_time = getattr(threadLocal, 'start_time', time_before_request)
- search_duration = time_after_request - start_time
- if search_duration > timeout + timeout_overhead:
- raise requests.exceptions.Timeout(response=response)
-
- # session end
- session.close()
-
- if hasattr(threadLocal, 'total_time'):
- threadLocal.total_time += time_after_request - time_before_request
-
- # raise an exception
- if check_for_httperror:
- raise_for_httperror(response)
-
- return response
-
-
-def get(url, **kwargs):
- kwargs.setdefault('allow_redirects', True)
- return request('get', url, **kwargs)
-
-
-def options(url, **kwargs):
- kwargs.setdefault('allow_redirects', True)
- return request('options', url, **kwargs)
-
-
-def head(url, **kwargs):
- kwargs.setdefault('allow_redirects', False)
- return request('head', url, **kwargs)
-
-
-def post(url, data=None, **kwargs):
- return request('post', url, data=data, **kwargs)
-
-
-def put(url, data=None, **kwargs):
- return request('put', url, data=data, **kwargs)
-
-
-def patch(url, data=None, **kwargs):
- return request('patch', url, data=data, **kwargs)
-
-
-def delete(url, **kwargs):
- return request('delete', url, **kwargs)
diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py
index ad45440ea..e54b3f68d 100644
--- a/searx/search/checker/impl.py
+++ b/searx/search/checker/impl.py
@@ -11,9 +11,9 @@ from urllib.parse import urlparse
import re
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
-import requests.exceptions
+import httpx
-from searx import poolrequests, logger
+from searx import network, logger
from searx.results import ResultContainer
from searx.search.models import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor
@@ -75,8 +75,8 @@ def _is_url_image(image_url):
while retry > 0:
a = time()
try:
- poolrequests.set_timeout_for_thread(10.0, time())
- r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={
+ network.set_timeout_for_thread(10.0, time())
+ r = network.get(image_url, timeout=10.0, allow_redirects=True, headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US;q=0.5,en;q=0.3',
@@ -90,10 +90,10 @@ def _is_url_image(image_url):
if r.headers["content-type"].startswith('image/'):
return True
return False
- except requests.exceptions.Timeout:
+ except httpx.TimeoutException:
logger.error('Timeout for %s: %i', image_url, int(time() - a))
retry -= 1
- except requests.exceptions.RequestException:
+ except httpx.HTTPError:
logger.exception('Exception for %s', image_url)
return False
diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py
index 1fc6444ad..66719ea9b 100644
--- a/searx/search/processors/online.py
+++ b/searx/search/processors/online.py
@@ -1,12 +1,12 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
-from urllib.parse import urlparse
from time import time
import threading
+import asyncio
-import requests.exceptions
+import httpx
-import searx.poolrequests as poolrequests
+import searx.network
from searx.engines import settings
from searx import logger
from searx.utils import gen_useragent
@@ -64,10 +64,6 @@ class OnlineProcessor(EngineProcessor):
auth=params['auth']
)
- # setting engine based proxies
- if hasattr(self.engine, 'proxies'):
- request_args['proxies'] = poolrequests.get_proxies(self.engine.proxies)
-
# max_redirects
max_redirects = params.get('max_redirects')
if max_redirects:
@@ -85,9 +81,9 @@ class OnlineProcessor(EngineProcessor):
# specific type of request (GET or POST)
if params['method'] == 'GET':
- req = poolrequests.get
+ req = searx.network.get
else:
- req = poolrequests.post
+ req = searx.network.post
request_args['data'] = params['data']
@@ -99,8 +95,8 @@ class OnlineProcessor(EngineProcessor):
# unexpected redirect : record an error
# but the engine might still return valid results.
status_code = str(response.status_code or '')
- reason = response.reason or ''
- hostname = str(urlparse(response.url or '').netloc)
+ reason = response.reason_phrase or ''
+ hostname = response.url.host
record_error(self.engine_name,
'{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
(status_code, reason, hostname))
@@ -128,14 +124,14 @@ class OnlineProcessor(EngineProcessor):
def search(self, query, params, result_container, start_time, timeout_limit):
# set timeout for all HTTP requests
- poolrequests.set_timeout_for_thread(timeout_limit, start_time=start_time)
+ searx.network.set_timeout_for_thread(timeout_limit, start_time=start_time)
# reset the HTTP total time
- poolrequests.reset_time_for_thread()
- # enable HTTP only if explicitly enabled
- poolrequests.set_enable_http_protocol(self.engine.enable_http)
+ searx.network.reset_time_for_thread()
+ # set the network
+ searx.network.set_context_network_name(self.engine_name)
# suppose everything will be alright
- requests_exception = False
+ http_exception = False
suspended_time = None
try:
@@ -149,7 +145,7 @@ class OnlineProcessor(EngineProcessor):
# update engine time when there is no exception
engine_time = time() - start_time
- page_load_time = poolrequests.get_time_for_thread()
+ page_load_time = searx.network.get_time_for_thread()
result_container.add_timing(self.engine_name, engine_time, page_load_time)
with threading.RLock():
self.engine.stats['engine_time'] += engine_time
@@ -162,27 +158,27 @@ class OnlineProcessor(EngineProcessor):
# Timing
engine_time = time() - start_time
- page_load_time = poolrequests.get_time_for_thread()
+ page_load_time = searx.network.get_time_for_thread()
result_container.add_timing(self.engine_name, engine_time, page_load_time)
# Record the errors
with threading.RLock():
self.engine.stats['errors'] += 1
- if (issubclass(e.__class__, requests.exceptions.Timeout)):
+ if (issubclass(e.__class__, (httpx.TimeoutException, asyncio.TimeoutError))):
result_container.add_unresponsive_engine(self.engine_name, 'HTTP timeout')
# requests timeout (connect or read)
logger.error("engine {0} : HTTP requests timeout"
"(search duration : {1} s, timeout: {2} s) : {3}"
.format(self.engine_name, engine_time, timeout_limit, e.__class__.__name__))
- requests_exception = True
- elif (issubclass(e.__class__, requests.exceptions.RequestException)):
+ http_exception = True
+ elif (issubclass(e.__class__, (httpx.HTTPError, httpx.StreamError))):
result_container.add_unresponsive_engine(self.engine_name, 'HTTP error')
# other requests exception
logger.exception("engine {0} : requests exception"
"(search duration : {1} s, timeout: {2} s) : {3}"
.format(self.engine_name, engine_time, timeout_limit, e))
- requests_exception = True
+ http_exception = True
elif (issubclass(e.__class__, SearxEngineCaptchaException)):
result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required')
logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))
@@ -206,7 +202,7 @@ class OnlineProcessor(EngineProcessor):
# suspend the engine if there is an HTTP error
# or suspended_time is defined
with threading.RLock():
- if requests_exception or suspended_time:
+ if http_exception or suspended_time:
# update continuous_errors / suspend_end_time
self.engine.continuous_errors += 1
if suspended_time is None:
diff --git a/searx/settings.yml b/searx/settings.yml
index 50bccb1ef..1940739bc 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -67,19 +67,17 @@ ui:
# key : !!binary "your_morty_proxy_key"
outgoing: # communication with search engines
- request_timeout : 2.0 # default timeout in seconds, can be override by engine
+ request_timeout : 3.0 # default timeout in seconds, can be override by engine
# max_request_timeout: 10.0 # the maximum timeout in seconds
useragent_suffix : "" # suffix of searx_useragent, could contain informations like an email address to the administrator
- pool_connections : 100 # Number of different hosts
- pool_maxsize : 10 # Number of simultaneous requests by host
+ pool_connections : 100 # The maximum number of concurrent connections that may be established.
+ pool_maxsize : 20 # Allow the connection pool to maintain keep-alive connections below this point.
+ enable_http2: True # See https://www.python-httpx.org/http2/
# uncomment below section if you want to use a proxy
# see https://2.python-requests.org/en/latest/user/advanced/#proxies
# SOCKS proxies are also supported: see https://2.python-requests.org/en/latest/user/advanced/#socks
# proxies:
-# http:
-# - http://proxy1:8080
-# - http://proxy2:8080
-# https:
+# all://:
# - http://proxy1:8080
# - http://proxy2:8080
# using_tor_proxy : True
@@ -89,6 +87,7 @@ outgoing: # communication with search engines
# source_ips:
# - 1.1.1.1
# - 1.1.1.2
+# - fe80::/126
# External plugin configuration
# See http://searx.github.io/searx/dev/plugins.html for more details
@@ -197,6 +196,11 @@ engines:
# engine : base
# shortcut : bs
+ - name: bandcamp
+ engine: bandcamp
+ shortcut: bc
+ categories: music
+
- name : wikipedia
engine : wikipedia
shortcut : wp
@@ -700,6 +704,13 @@ engines:
require_api_key: false
results: HTML
+# - name : meilisearch
+# engine : meilisearch
+# shortcut: mes
+# enable_http: True
+# base_url : http://localhost:7700
+# index : my-index
+
- name : microsoft academic
engine : microsoft_academic
categories : science
@@ -846,11 +857,13 @@ engines:
engine : qwant
shortcut : qwi
categories : images
+ network: qwant
- name : qwant news
engine : qwant
shortcut : qwn
categories : news
+ network: qwant
# - name: library
# engine: recoll
@@ -1264,6 +1277,22 @@ engines:
categories: videos
disabled : True
+ - name: wordnik
+ engine: wordnik
+ shortcut: def
+ base_url: https://www.wordnik.com/
+ categories: general
+ timeout: 5.0
+ disabled: True
+
+ - name: słownik języka polskiego
+ engine: sjp
+ shortcut: sjp
+ base_url: https://sjp.pwn.pl/
+ categories: general
+ timeout: 5.0
+ disabled: True
+
# Doku engine lets you access to any Doku wiki instance:
# A public one or a privete/corporate one.
# - name : ubuntuwiki
diff --git a/searx/settings_loader.py b/searx/settings_loader.py
index e7126aa89..cfdeb4d91 100644
--- a/searx/settings_loader.py
+++ b/searx/settings_loader.py
@@ -57,7 +57,7 @@ def update_settings(default_settings, user_settings):
# merge everything except the engines
for k, v in user_settings.items():
if k not in ('use_default_settings', 'engines'):
- if k in default_settings:
+ if k in default_settings and isinstance(v, Mapping):
update_dict(default_settings[k], v)
else:
default_settings[k] = v
diff --git a/searx/static/themes/oscar/img/icons/bandcamp.png b/searx/static/themes/oscar/img/icons/bandcamp.png
new file mode 100644
index 000000000..2de405afe
--- /dev/null
+++ b/searx/static/themes/oscar/img/icons/bandcamp.png
Binary files differ
diff --git a/searx/templates/__common__/opensearch.xml b/searx/templates/__common__/opensearch.xml
index 2476258c0..230f327a5 100644
--- a/searx/templates/__common__/opensearch.xml
+++ b/searx/templates/__common__/opensearch.xml
@@ -3,7 +3,7 @@
<ShortName>{{ instance_name }}</ShortName>
<Description>a privacy-respecting, hackable metasearch engine</Description>
<InputEncoding>UTF-8</InputEncoding>
- <Image>{{ urljoin(host, url_for('static', filename='img/favicon.png')) }}</Image>
+ <Image>{{ url_for('static', filename='img/favicon.png', _external=True) }}</Image>
<LongName>searx metasearch</LongName>
{% if opensearch_method == 'get' %}
<Url rel="results" type="text/html" method="get" template="{{ url_for('search', _external=True) }}?q={searchTerms}"/>
@@ -13,7 +13,7 @@
</Url>
{% endif %}
{% if autocomplete %}
- <Url rel="suggestions" type="application/x-suggestions+json" template="{{ host }}autocompleter?q={searchTerms}"/>
+ <Url rel="suggestions" type="application/x-suggestions+json" template="{{ url_for('autocompleter', _external=True) }}?q={searchTerms}"/>
{% endif %}
<Url type="application/opensearchdescription+xml"
diff --git a/searx/templates/oscar/result_templates/default.html b/searx/templates/oscar/result_templates/default.html
index d743f928e..53cfee5cb 100644
--- a/searx/templates/oscar/result_templates/default.html
+++ b/searx/templates/oscar/result_templates/default.html
@@ -13,10 +13,10 @@
</div>
{%- endif -%}
-{%- if result.img_src -%}
+{%- if result.img_src or result.thumbnail -%}
<div class="container-fluid">{{- "" -}}
<div class="row">{{- "" -}}
- <img src="{{ image_proxify(result.img_src) }}" title="{{ result.title|striptags }}" style="width: auto; max-height: 60px; min-height: 60px;" class="col-xs-2 col-sm-4 col-md-4 result-content">
+ <img src="{{ image_proxify(result.img_src or result.thumbnail) }}" title="{{ result.title|striptags }}" style="width: auto; max-height: 60px; min-height: 60px;" class="col-xs-2 col-sm-4 col-md-4 result-content">
{%- if result.content %}<p class="result-content col-xs-8 col-sm-8 col-md-8">{{ result.content|safe }}</p>{% endif -%}
</div>{{- "" -}}
</div>
diff --git a/searx/testing.py b/searx/testing.py
index ec253cb3d..51ca92bd0 100644
--- a/searx/testing.py
+++ b/searx/testing.py
@@ -10,8 +10,8 @@ import traceback
from os.path import dirname, join, abspath, realpath
-from unittest import TestCase
from splinter import Browser
+import aiounittest
class SearxTestLayer:
@@ -82,7 +82,7 @@ def run_robot_tests(tests):
test(browser)
-class SearxTestCase(TestCase):
+class SearxTestCase(aiounittest.AsyncTestCase):
"""Base test case for non-robot tests."""
layer = SearxTestLayer
diff --git a/searx/utils.py b/searx/utils.py
index 3172ad8f3..55a386bd5 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -45,7 +45,7 @@ def searx_useragent():
"""Return the searx User Agent"""
return 'searx/{searx_version} {suffix}'.format(
searx_version=VERSION_STRING,
- suffix=settings['outgoing'].get('useragent_suffix', ''))
+ suffix=settings['outgoing'].get('useragent_suffix', '')).strip()
def gen_useragent(os=None):
diff --git a/searx/webapp.py b/searx/webapp.py
index 7f874d3aa..8c59b8f3a 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -26,12 +26,26 @@ if __name__ == '__main__':
from os.path import realpath, dirname
sys.path.append(realpath(dirname(realpath(__file__)) + '/../'))
+# set Unix thread name
+try:
+ import setproctitle
+except ImportError:
+ pass
+else:
+ import threading
+ old_thread_init = threading.Thread.__init__
+
+ def new_thread_init(self, *args, **kwargs):
+ old_thread_init(self, *args, **kwargs)
+ setproctitle.setthreadtitle(self._name)
+ threading.Thread.__init__ = new_thread_init
+
import hashlib
import hmac
import json
import os
-import requests
+import httpx
from searx import logger
logger = logger.getChild('webapp')
@@ -40,7 +54,7 @@ from datetime import datetime, timedelta
from time import time
from html import escape
from io import StringIO
-from urllib.parse import urlencode, urljoin, urlparse
+from urllib.parse import urlencode, urlparse
from pygments import highlight
from pygments.lexers import get_lexer_by_name
@@ -79,7 +93,7 @@ from searx.plugins import plugins
from searx.plugins.oa_doi_rewrite import get_doi_resolver
from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
from searx.answerers import answerers
-from searx.poolrequests import get_global_proxies
+from searx.network import stream as http_stream
from searx.answerers import ask
from searx.metrology.error_recorder import errors_per_engines
@@ -270,14 +284,7 @@ def extract_domain(url):
def get_base_url():
- if settings['server']['base_url']:
- hostname = settings['server']['base_url']
- else:
- scheme = 'http'
- if request.is_secure:
- scheme = 'https'
- hostname = url_for('index', _external=True, _scheme=scheme)
- return hostname
+ return url_for('index', _external=True)
def get_current_theme_name(override=None):
@@ -310,10 +317,6 @@ def url_for_theme(endpoint, override_theme=None, **values):
if filename_with_theme in static_files:
values['filename'] = filename_with_theme
url = url_for(endpoint, **values)
- if settings['server']['base_url']:
- if url.startswith('/'):
- url = url[1:]
- url = urljoin(settings['server']['base_url'], url)
return url
@@ -824,7 +827,7 @@ def preferences():
# save preferences
if request.method == 'POST':
- resp = make_response(redirect(urljoin(settings['server']['base_url'], url_for('index'))))
+ resp = make_response(url_for('index', _external=True))
try:
request.preferences.parse_form(request.form)
except ValidationException:
@@ -901,50 +904,62 @@ def _is_selected_language_supported(engine, preferences):
@app.route('/image_proxy', methods=['GET'])
def image_proxy():
- url = request.args.get('url').encode()
+ url = request.args.get('url')
if not url:
return '', 400
- h = new_hmac(settings['server']['secret_key'], url)
+ h = new_hmac(settings['server']['secret_key'], url.encode())
if h != request.args.get('h'):
return '', 400
- headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
- headers['User-Agent'] = gen_useragent()
-
- resp = requests.get(url,
- stream=True,
- timeout=settings['outgoing']['request_timeout'],
- headers=headers,
- proxies=get_global_proxies())
+ maximum_size = 5 * 1024 * 1024
- if resp.status_code == 304:
- return '', resp.status_code
-
- if resp.status_code != 200:
- logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
- if resp.status_code >= 400:
+ try:
+ headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
+ headers['User-Agent'] = gen_useragent()
+ stream = http_stream(
+ method='GET',
+ url=url,
+ headers=headers,
+ timeout=settings['outgoing']['request_timeout'],
+ allow_redirects=True,
+ max_redirects=20)
+
+ resp = next(stream)
+ content_length = resp.headers.get('Content-Length')
+ if content_length and content_length.isdigit() and int(content_length) > maximum_size:
+ return 'Max size', 400
+
+ if resp.status_code == 304:
return '', resp.status_code
- return '', 400
- if not resp.headers.get('content-type', '').startswith('image/'):
- logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
- return '', 400
+ if resp.status_code != 200:
+ logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
+ if resp.status_code >= 400:
+ return '', resp.status_code
+ return '', 400
- img = b''
- chunk_counter = 0
+ if not resp.headers.get('content-type', '').startswith('image/'):
+ logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
+ return '', 400
- for chunk in resp.iter_content(1024 * 1024):
- chunk_counter += 1
- if chunk_counter > 5:
- return '', 502 # Bad gateway - file is too big (>5M)
- img += chunk
+ headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})
- headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})
+ total_length = 0
- return Response(img, mimetype=resp.headers['content-type'], headers=headers)
+ def forward_chunk():
+ nonlocal total_length
+ for chunk in stream:
+ total_length += len(chunk)
+ if total_length > maximum_size:
+ break
+ yield chunk
+
+ return Response(forward_chunk(), mimetype=resp.headers['Content-Type'], headers=headers)
+ except httpx.HTTPError:
+ return '', 400
@app.route('/stats', methods=['GET'])
@@ -1013,11 +1028,11 @@ def opensearch():
if request.headers.get('User-Agent', '').lower().find('webkit') >= 0:
method = 'get'
- ret = render('opensearch.xml',
- opensearch_method=method,
- host=get_base_url(),
- urljoin=urljoin,
- override_theme='__common__')
+ ret = render(
+ 'opensearch.xml',
+ opensearch_method=method,
+ override_theme='__common__'
+ )
resp = Response(response=ret,
status=200,
@@ -1038,7 +1053,7 @@ def favicon():
@app.route('/clear_cookies')
def clear_cookies():
- resp = make_response(redirect(urljoin(settings['server']['base_url'], url_for('index'))))
+ resp = make_response(redirect(url_for('index', _external=True)))
for cookie_name in request.cookies:
resp.delete_cookie(cookie_name)
return resp
@@ -1131,19 +1146,41 @@ class ReverseProxyPathFix:
'''
def __init__(self, app):
+
self.app = app
+ self.script_name = None
+ self.scheme = None
+ self.server = None
+
+ if settings['server']['base_url']:
+
+ # If base_url is specified, then these values from are given
+ # preference over any Flask's generics.
+
+ base_url = urlparse(settings['server']['base_url'])
+ self.script_name = base_url.path
+ if self.script_name.endswith('/'):
+ # remove trailing slash to avoid infinite redirect on the index
+ # see https://github.com/searx/searx/issues/2729
+ self.script_name = self.script_name[:-1]
+ self.scheme = base_url.scheme
+ self.server = base_url.netloc
def __call__(self, environ, start_response):
- script_name = environ.get('HTTP_X_SCRIPT_NAME', '')
+ script_name = self.script_name or environ.get('HTTP_X_SCRIPT_NAME', '')
if script_name:
environ['SCRIPT_NAME'] = script_name
path_info = environ['PATH_INFO']
if path_info.startswith(script_name):
environ['PATH_INFO'] = path_info[len(script_name):]
- scheme = environ.get('HTTP_X_SCHEME', '')
+ scheme = self.scheme or environ.get('HTTP_X_SCHEME', '')
if scheme:
environ['wsgi.url_scheme'] = scheme
+
+ server = self.server or environ.get('HTTP_X_FORWARDED_HOST', '')
+ if server:
+ environ['HTTP_HOST'] = server
return self.app(environ, start_response)
diff --git a/searx_extra/standalone_searx.py b/searx_extra/standalone_searx.py
index f52b7e80c..b30762d3f 100755
--- a/searx_extra/standalone_searx.py
+++ b/searx_extra/standalone_searx.py
@@ -31,7 +31,7 @@ Example to run it from python:
... engine_cs = list(searx.engines.categories.keys())
... # load module
... spec = importlib.util.spec_from_file_location(
-... 'utils.standalone_searx', 'utils/standalone_searx.py')
+... 'utils.standalone_searx', 'searx_extra/standalone_searx.py')
... sas = importlib.util.module_from_spec(spec)
... spec.loader.exec_module(sas)
... # use function from module
diff --git a/searx_extra/update/update_engine_descriptions.py b/searx_extra/update/update_engine_descriptions.py
index 109fdbfa0..cf9007da3 100755
--- a/searx_extra/update/update_engine_descriptions.py
+++ b/searx_extra/update/update_engine_descriptions.py
@@ -10,7 +10,7 @@ from searx.engines.wikidata import send_wikidata_query
from searx.utils import extract_text
import searx
import searx.search
-import searx.poolrequests
+import searx.network
SPARQL_WIKIPEDIA_ARTICLE = """
SELECT DISTINCT ?item ?name
@@ -59,7 +59,7 @@ def get_wikipedia_summary(language, pageid):
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
url = search_url.format(title=quote(pageid), language=language)
try:
- response = searx.poolrequests.get(url)
+ response = searx.network.get(url)
response.raise_for_status()
api_result = json.loads(response.text)
return api_result.get('extract')
@@ -89,7 +89,7 @@ def get_website_description(url, lang1, lang2=None):
lang_list.append(lang2)
headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8'
try:
- response = searx.poolrequests.get(url, headers=headers, timeout=10)
+ response = searx.network.get(url, headers=headers, timeout=10)
response.raise_for_status()
except Exception:
return (None, None)
diff --git a/searx_extra/update/update_external_bangs.py b/searx_extra/update/update_external_bangs.py
index e9dc0ff1d..e401e460a 100755
--- a/searx_extra/update/update_external_bangs.py
+++ b/searx_extra/update/update_external_bangs.py
@@ -17,7 +17,7 @@ import json
import re
from os.path import join
-import requests
+import httpx
from searx import searx_dir # pylint: disable=E0401 C0413
@@ -30,7 +30,7 @@ HTTP_COLON = 'http:'
def get_bang_url():
- response = requests.get(URL_BV1)
+ response = httpx.get(URL_BV1)
response.raise_for_status()
r = RE_BANG_VERSION.findall(response.text)
@@ -38,7 +38,7 @@ def get_bang_url():
def fetch_ddg_bangs(url):
- response = requests.get(url)
+ response = httpx.get(url)
response.raise_for_status()
return json.loads(response.content.decode())
diff --git a/tests/unit/network/__init__.py b/tests/unit/network/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tests/unit/network/__init__.py
diff --git a/tests/unit/network/test_network.py b/tests/unit/network/test_network.py
new file mode 100644
index 000000000..246dfd85e
--- /dev/null
+++ b/tests/unit/network/test_network.py
@@ -0,0 +1,236 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+from mock import patch
+
+import httpx
+
+from searx.network.network import Network, NETWORKS
+from searx.testing import SearxTestCase
+
+
+class TestNetwork(SearxTestCase):
+
+ def test_simple(self):
+ network = Network()
+
+ self.assertEqual(next(network._local_addresses_cycle), None)
+ self.assertEqual(next(network._proxies_cycle), ())
+
+ def test_ipaddress_cycle(self):
+ network = NETWORKS['ipv6']
+ self.assertEqual(next(network._local_addresses_cycle), '::')
+ self.assertEqual(next(network._local_addresses_cycle), '::')
+
+ network = NETWORKS['ipv4']
+ self.assertEqual(next(network._local_addresses_cycle), '0.0.0.0')
+ self.assertEqual(next(network._local_addresses_cycle), '0.0.0.0')
+
+ network = Network(local_addresses=['192.168.0.1', '192.168.0.2'])
+ self.assertEqual(next(network._local_addresses_cycle), '192.168.0.1')
+ self.assertEqual(next(network._local_addresses_cycle), '192.168.0.2')
+ self.assertEqual(next(network._local_addresses_cycle), '192.168.0.1')
+
+ network = Network(local_addresses=['192.168.0.0/30'])
+ self.assertEqual(next(network._local_addresses_cycle), '192.168.0.1')
+ self.assertEqual(next(network._local_addresses_cycle), '192.168.0.2')
+ self.assertEqual(next(network._local_addresses_cycle), '192.168.0.1')
+ self.assertEqual(next(network._local_addresses_cycle), '192.168.0.2')
+
+ network = Network(local_addresses=['fe80::/10'])
+ self.assertEqual(next(network._local_addresses_cycle), 'fe80::1')
+ self.assertEqual(next(network._local_addresses_cycle), 'fe80::2')
+ self.assertEqual(next(network._local_addresses_cycle), 'fe80::3')
+
+ with self.assertRaises(ValueError):
+ Network(local_addresses=['not_an_ip_address'])
+
+ def test_proxy_cycles(self):
+ network = Network(proxies='http://localhost:1337')
+ self.assertEqual(next(network._proxies_cycle), (('all://', 'http://localhost:1337'),))
+
+ network = Network(proxies={
+ 'https': 'http://localhost:1337',
+ 'http': 'http://localhost:1338'
+ })
+ self.assertEqual(next(network._proxies_cycle),
+ (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338')))
+ self.assertEqual(next(network._proxies_cycle),
+ (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338')))
+
+ network = Network(proxies={
+ 'https': ['http://localhost:1337', 'http://localhost:1339'],
+ 'http': 'http://localhost:1338'
+ })
+ self.assertEqual(next(network._proxies_cycle),
+ (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338')))
+ self.assertEqual(next(network._proxies_cycle),
+ (('https://', 'http://localhost:1339'), ('http://', 'http://localhost:1338')))
+
+ with self.assertRaises(ValueError):
+ Network(proxies=1)
+
+ def test_get_kwargs_clients(self):
+ kwargs = {
+ 'verify': True,
+ 'max_redirects': 5,
+ 'timeout': 2,
+ }
+ kwargs_client = Network.get_kwargs_clients(kwargs)
+
+ self.assertEqual(len(kwargs_client), 2)
+ self.assertEqual(len(kwargs), 1)
+
+ self.assertEqual(kwargs['timeout'], 2)
+
+ self.assertTrue(kwargs_client['verify'])
+ self.assertEqual(kwargs_client['max_redirects'], 5)
+
+ async def test_get_client(self):
+ network = Network(verify=True)
+ client1 = network.get_client()
+ client2 = network.get_client(verify=True)
+ client3 = network.get_client(max_redirects=10)
+ client4 = network.get_client(verify=True)
+ client5 = network.get_client(verify=False)
+ client6 = network.get_client(max_redirects=10)
+
+ self.assertEqual(client1, client2)
+ self.assertEqual(client1, client4)
+ self.assertNotEqual(client1, client3)
+ self.assertNotEqual(client1, client5)
+ self.assertEqual(client3, client6)
+
+ await network.aclose()
+
+ async def test_aclose(self):
+ network = Network(verify=True)
+ network.get_client()
+ await network.aclose()
+
+ async def test_request(self):
+ a_text = 'Lorem Ipsum'
+ response = httpx.Response(status_code=200, text=a_text)
+ with patch.object(httpx.AsyncClient, 'request', return_value=response):
+ network = Network(enable_http=True)
+ response = await network.request('GET', 'https://example.com/')
+ self.assertEqual(response.text, a_text)
+ await network.aclose()
+
+
+class TestNetworkRequestRetries(SearxTestCase):
+
+ TEXT = 'Lorem Ipsum'
+
+ @classmethod
+ def get_response_404_then_200(cls):
+ first = True
+
+ async def get_response(*args, **kwargs):
+ nonlocal first
+ if first:
+ first = False
+ return httpx.Response(status_code=403, text=TestNetworkRequestRetries.TEXT)
+ return httpx.Response(status_code=200, text=TestNetworkRequestRetries.TEXT)
+ return get_response
+
+ async def test_retries_ok(self):
+ with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
+ network = Network(enable_http=True, retries=1, retry_on_http_error=403)
+ response = await network.request('GET', 'https://example.com/')
+ self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
+ await network.aclose()
+
+ async def test_retries_fail_int(self):
+ with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
+ network = Network(enable_http=True, retries=0, retry_on_http_error=403)
+ response = await network.request('GET', 'https://example.com/')
+ self.assertEqual(response.status_code, 403)
+ await network.aclose()
+
+ async def test_retries_fail_list(self):
+ with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
+ network = Network(enable_http=True, retries=0, retry_on_http_error=[403, 429])
+ response = await network.request('GET', 'https://example.com/')
+ self.assertEqual(response.status_code, 403)
+ await network.aclose()
+
+ async def test_retries_fail_bool(self):
+ with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
+ network = Network(enable_http=True, retries=0, retry_on_http_error=True)
+ response = await network.request('GET', 'https://example.com/')
+ self.assertEqual(response.status_code, 403)
+ await network.aclose()
+
+ async def test_retries_exception_then_200(self):
+ request_count = 0
+
+ async def get_response(*args, **kwargs):
+ nonlocal request_count
+ request_count += 1
+ if request_count < 3:
+ raise httpx.RequestError('fake exception', request=None)
+ return httpx.Response(status_code=200, text=TestNetworkRequestRetries.TEXT)
+
+ with patch.object(httpx.AsyncClient, 'request', new=get_response):
+ network = Network(enable_http=True, retries=2)
+ response = await network.request('GET', 'https://example.com/')
+ self.assertEqual(response.status_code, 200)
+ self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
+ await network.aclose()
+
+ async def test_retries_exception(self):
+ async def get_response(*args, **kwargs):
+ raise httpx.RequestError('fake exception', request=None)
+
+ with patch.object(httpx.AsyncClient, 'request', new=get_response):
+ network = Network(enable_http=True, retries=0)
+ with self.assertRaises(httpx.RequestError):
+ await network.request('GET', 'https://example.com/')
+ await network.aclose()
+
+
+class TestNetworkStreamRetries(SearxTestCase):
+
+ TEXT = 'Lorem Ipsum'
+
+ @classmethod
+ def get_response_exception_then_200(cls):
+ first = True
+
+ def stream(*args, **kwargs):
+ nonlocal first
+ if first:
+ first = False
+ raise httpx.RequestError('fake exception', request=None)
+ return httpx.Response(status_code=200, text=TestNetworkStreamRetries.TEXT)
+ return stream
+
+ async def test_retries_ok(self):
+ with patch.object(httpx.AsyncClient, 'stream', new=TestNetworkStreamRetries.get_response_exception_then_200()):
+ network = Network(enable_http=True, retries=1, retry_on_http_error=403)
+ response = network.stream('GET', 'https://example.com/')
+ self.assertEqual(response.text, TestNetworkStreamRetries.TEXT)
+ await network.aclose()
+
+ async def test_retries_fail(self):
+ with patch.object(httpx.AsyncClient, 'stream', new=TestNetworkStreamRetries.get_response_exception_then_200()):
+ network = Network(enable_http=True, retries=0, retry_on_http_error=403)
+ with self.assertRaises(httpx.RequestError):
+ network.stream('GET', 'https://example.com/')
+ await network.aclose()
+
+ async def test_retries_exception(self):
+ first = True
+
+ def stream(*args, **kwargs):
+ nonlocal first
+ if first:
+ first = False
+ return httpx.Response(status_code=403, text=TestNetworkRequestRetries.TEXT)
+ return httpx.Response(status_code=200, text=TestNetworkRequestRetries.TEXT)
+
+ with patch.object(httpx.AsyncClient, 'stream', new=stream):
+ network = Network(enable_http=True, retries=0, retry_on_http_error=403)
+ response = network.stream('GET', 'https://example.com/')
+ self.assertEqual(response.status_code, 403)
+ await network.aclose()
diff --git a/tests/unit/test_engines_init.py b/tests/unit/test_engines_init.py
index cf4d50309..c75637f20 100644
--- a/tests/unit/test_engines_init.py
+++ b/tests/unit/test_engines_init.py
@@ -13,7 +13,7 @@ class TestEnginesInit(SearxTestCase):
engine_list = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1'},
{'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2'}]
- engines.initialize_engines(engine_list)
+ engines.load_engines(engine_list)
self.assertEqual(len(engines.engines), 2)
self.assertIn('engine1', engines.engines)
self.assertIn('engine2', engines.engines)
diff --git a/tests/unit/test_poolrequests.py b/tests/unit/test_poolrequests.py
deleted file mode 100644
index b22685fd0..000000000
--- a/tests/unit/test_poolrequests.py
+++ /dev/null
@@ -1,89 +0,0 @@
-from unittest.mock import patch
-from requests.models import Response
-
-from searx.testing import SearxTestCase
-
-import searx.poolrequests
-from searx.poolrequests import get_proxy_cycles, get_proxies
-
-
-CONFIG = {'http': ['http://localhost:9090', 'http://localhost:9092'],
- 'https': ['http://localhost:9091', 'http://localhost:9093']}
-
-
-class TestProxy(SearxTestCase):
-
- def test_noconfig(self):
- cycles = get_proxy_cycles(None)
- self.assertIsNone(cycles)
-
- cycles = get_proxy_cycles(False)
- self.assertIsNone(cycles)
-
- def test_oldconfig(self):
- config = {
- 'http': 'http://localhost:9090',
- 'https': 'http://localhost:9091',
- }
- cycles = get_proxy_cycles(config)
- self.assertEqual(next(cycles['http']), 'http://localhost:9090')
- self.assertEqual(next(cycles['http']), 'http://localhost:9090')
- self.assertEqual(next(cycles['https']), 'http://localhost:9091')
- self.assertEqual(next(cycles['https']), 'http://localhost:9091')
-
- def test_one_proxy(self):
- config = {
- 'http': ['http://localhost:9090'],
- 'https': ['http://localhost:9091'],
- }
- cycles = get_proxy_cycles(config)
- self.assertEqual(next(cycles['http']), 'http://localhost:9090')
- self.assertEqual(next(cycles['http']), 'http://localhost:9090')
- self.assertEqual(next(cycles['https']), 'http://localhost:9091')
- self.assertEqual(next(cycles['https']), 'http://localhost:9091')
-
- def test_multiple_proxies(self):
- cycles = get_proxy_cycles(CONFIG)
- self.assertEqual(next(cycles['http']), 'http://localhost:9090')
- self.assertEqual(next(cycles['http']), 'http://localhost:9092')
- self.assertEqual(next(cycles['http']), 'http://localhost:9090')
- self.assertEqual(next(cycles['https']), 'http://localhost:9091')
- self.assertEqual(next(cycles['https']), 'http://localhost:9093')
- self.assertEqual(next(cycles['https']), 'http://localhost:9091')
-
- def test_getproxies_none(self):
- self.assertIsNone(get_proxies(None))
-
- def test_getproxies_config(self):
- cycles = get_proxy_cycles(CONFIG)
- self.assertEqual(get_proxies(cycles), {
- 'http': 'http://localhost:9090',
- 'https': 'http://localhost:9091'
- })
- self.assertEqual(get_proxies(cycles), {
- 'http': 'http://localhost:9092',
- 'https': 'http://localhost:9093'
- })
-
- @patch('searx.poolrequests.get_global_proxies')
- def test_request(self, mock_get_global_proxies):
- method = 'GET'
- url = 'http://localhost'
- custom_proxies = {
- 'https': 'http://localhost:1080'
- }
- global_proxies = {
- 'http': 'http://localhost:9092',
- 'https': 'http://localhost:9093'
- }
- mock_get_global_proxies.return_value = global_proxies
-
- # check the global proxies usage
- with patch.object(searx.poolrequests.SessionSinglePool, 'request', return_value=Response()) as mock_method:
- searx.poolrequests.request(method, url)
- mock_method.assert_called_once_with(method=method, url=url, proxies=global_proxies)
-
- # check if the proxies parameter overrides the global proxies
- with patch.object(searx.poolrequests.SessionSinglePool, 'request', return_value=Response()) as mock_method:
- searx.poolrequests.request(method, url, proxies=custom_proxies)
- mock_method.assert_called_once_with(method=method, url=url, proxies=custom_proxies)
diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py
index 6bbfdb1e2..7a79ce242 100644
--- a/tests/unit/test_query.py
+++ b/tests/unit/test_query.py
@@ -1,7 +1,11 @@
+from mock import patch
+
from searx.search import initialize
from searx.query import RawTextQuery
from searx.testing import SearxTestCase
+import searx.engines
+
TEST_ENGINES = [
{
@@ -277,9 +281,10 @@ class TestBang(SearxTestCase):
self.assertEqual(query.getQuery(), '!dum the query')
def test_bang_autocomplete_empty(self):
- initialize()
- query = RawTextQuery('the query !', [])
- self.assertEqual(query.autocomplete_list, ['!images', '!wikipedia', '!osm'])
+ with patch.object(searx.engines, 'initialize_engines', searx.engines.load_engines):
+ initialize()
+ query = RawTextQuery('the query !', [])
+ self.assertEqual(query.autocomplete_list, ['!images', '!wikipedia', '!osm'])
- query = RawTextQuery('the query ?', ['osm'])
- self.assertEqual(query.autocomplete_list, ['?images', '?wikipedia'])
+ query = RawTextQuery('the query ?', ['osm'])
+ self.assertEqual(query.autocomplete_list, ['?images', '?wikipedia'])
diff --git a/tests/unit/test_webapp.py b/tests/unit/test_webapp.py
index 7d7a04fdc..d3498f51a 100644
--- a/tests/unit/test_webapp.py
+++ b/tests/unit/test_webapp.py
@@ -3,14 +3,19 @@
import json
from urllib.parse import ParseResult
from mock import Mock
-from searx import webapp
from searx.testing import SearxTestCase
from searx.search import Search
+import searx.engines
class ViewsTestCase(SearxTestCase):
def setUp(self):
+ # skip init function (no external HTTP request)
+ self.setattr4test(searx.engines, 'initialize_engines', searx.engines.load_engines)
+
+        from searx import webapp  # pylint: disable=import-outside-toplevel
+
webapp.app.config['TESTING'] = True # to get better error messages
self.app = webapp.app.test_client()
diff --git a/utils/searx.sh b/utils/searx.sh
index ed015bdd2..d2dbe79b6 100755
--- a/utils/searx.sh
+++ b/utils/searx.sh
@@ -35,7 +35,7 @@ SERVICE_GROUP="${SERVICE_USER}"
GIT_BRANCH="${GIT_BRANCH:-master}"
SEARX_PYENV="${SERVICE_HOME}/searx-pyenv"
SEARX_SRC="${SERVICE_HOME}/searx-src"
-SEARX_SETTINGS_PATH="/etc/searx/settings.yml"
+SEARX_SETTINGS_PATH="${SEARX_SETTINGS_PATH:-/etc/searx/settings.yml}"
SEARX_SETTINGS_TEMPLATE="${SEARX_SETTINGS_TEMPLATE:-${REPO_ROOT}/utils/templates/etc/searx/use_default_settings.yml}"
SEARX_UWSGI_APP="searx.ini"
# shellcheck disable=SC2034