From eaa694fb7d0e47b943bc6d6edb6cb6a40ab2d85e Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Thu, 18 Mar 2021 19:59:01 +0100 Subject: [enh] replace requests by httpx --- searx/poolrequests.py | 513 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 414 insertions(+), 99 deletions(-) (limited to 'searx/poolrequests.py') diff --git a/searx/poolrequests.py b/searx/poolrequests.py index ab327251b..8b365f913 100644 --- a/searx/poolrequests.py +++ b/searx/poolrequests.py @@ -1,14 +1,54 @@ +import atexit import sys +import threading +import asyncio +import logging +import concurrent.futures from time import time from itertools import cycle -from threading import local -import requests +import httpcore +import httpx +import h2.exceptions +from httpx_socks import AsyncProxyTransport +from python_socks import parse_proxy_url +import python_socks._errors from searx import settings from searx import logger from searx.raise_for_httperror import raise_for_httperror +# Optional uvloop (support Python 3.6) +try: + import uvloop +except ImportError: + pass +else: + uvloop.install() + +# queue.SimpleQueue: Support Python 3.6 +try: + from queue import SimpleQueue +except ImportError: + from queue import Empty + from collections import deque + + class SimpleQueue: + """Minimal backport of queue.SimpleQueue""" + + def __init__(self): + self._queue = deque() + self._count = threading.Semaphore(0) + + def put(self, item): + self._queue.append(item) + self._count.release() + + def get(self, timeout=None): + if not self._count.acquire(True, timeout): + raise Empty + return self._queue.popleft() + logger = logger.getChild('poolrequests') @@ -31,99 +71,63 @@ if not getattr(ssl, "HAS_SNI", False): sys.exit(1) -class HTTPAdapterWithConnParams(requests.adapters.HTTPAdapter): - - def __init__(self, pool_connections=requests.adapters.DEFAULT_POOLSIZE, - pool_maxsize=requests.adapters.DEFAULT_POOLSIZE, - max_retries=requests.adapters.DEFAULT_RETRIES, - 
pool_block=requests.adapters.DEFAULT_POOLBLOCK, - **conn_params): - if max_retries == requests.adapters.DEFAULT_RETRIES: - self.max_retries = requests.adapters.Retry(0, read=False) - else: - self.max_retries = requests.adapters.Retry.from_int(max_retries) - self.config = {} - self.proxy_manager = {} - - super().__init__() - - self._pool_connections = pool_connections - self._pool_maxsize = pool_maxsize - self._pool_block = pool_block - self._conn_params = conn_params - - self.init_poolmanager(pool_connections, pool_maxsize, block=pool_block, **conn_params) - - def __setstate__(self, state): - # Can't handle by adding 'proxy_manager' to self.__attrs__ because - # because self.poolmanager uses a lambda function, which isn't pickleable. - self.proxy_manager = {} - self.config = {} - - for attr, value in state.items(): - setattr(self, attr, value) +LOOP = None +CLIENTS = dict() +THREADLOCAL = threading.local() +LIMITS = httpx.Limits( + # Magic number kept from previous code + max_connections=settings['outgoing'].get('pool_connections', 100), + # Picked from constructor + max_keepalive_connections=settings['outgoing'].get('pool_maxsize', 10), + # + keepalive_expiry=settings['outgoing'].get('keepalive_expiry', 5.0) +) +# default parameters for AsyncHTTPTransport +# see https://github.com/encode/httpx/blob/e05a5372eb6172287458b37447c30f650047e1b8/httpx/_transports/default.py#L108-L121 # noqa +TRANSPORT_KWARGS = { + 'http2': settings['outgoing'].get('http2', False), + 'retries': 0, + 'trust_env': False, + 'backend': 'asyncio' +} +# requests compatibility when reading proxy settings from settings.yml +PROXY_PATTERN_MAPPING = { + 'http': 'https://', + 'https:': 'https://' +} +# default maximum redirect +# from https://github.com/psf/requests/blob/8c211a96cdbe9fe320d63d9e1ae15c5c07e179f8/requests/models.py#L55 +DEFAULT_REDIRECT_LIMIT = 30 - self.init_poolmanager(self._pool_connections, self._pool_maxsize, - block=self._pool_block, **self._conn_params) - -threadLocal = local() 
-connect = settings['outgoing'].get('pool_connections', 100) # Magic number kept from previous code -maxsize = settings['outgoing'].get('pool_maxsize', requests.adapters.DEFAULT_POOLSIZE) # Picked from constructor if settings['outgoing'].get('source_ips'): - http_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize, - source_address=(source_ip, 0)) - for source_ip in settings['outgoing']['source_ips']) - https_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize, - source_address=(source_ip, 0)) - for source_ip in settings['outgoing']['source_ips']) + LOCAL_ADDRESS_CYCLE = cycle(settings['outgoing'].get('source_ips')) else: - http_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize), )) - https_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize), )) - - -class SessionSinglePool(requests.Session): - - def __init__(self): - super().__init__() - - # reuse the same adapters - self.adapters.clear() - - https_adapter = threadLocal.__dict__.setdefault('https_adapter', next(https_adapters)) - self.mount('https://', https_adapter) - if get_enable_http_protocol(): - http_adapter = threadLocal.__dict__.setdefault('http_adapter', next(http_adapters)) - self.mount('http://', http_adapter) - - def close(self): - """Call super, but clear adapters since there are managed globaly""" - self.adapters.clear() - super().close() + LOCAL_ADDRESS_CYCLE = cycle((None, )) def set_timeout_for_thread(timeout, start_time=None): - threadLocal.timeout = timeout - threadLocal.start_time = start_time + THREADLOCAL.timeout = timeout + THREADLOCAL.start_time = start_time def set_enable_http_protocol(enable_http): - threadLocal.enable_http = enable_http + THREADLOCAL.enable_http = enable_http def get_enable_http_protocol(): try: - return threadLocal.enable_http + return THREADLOCAL.enable_http except AttributeError: return False def 
reset_time_for_thread(): - threadLocal.total_time = 0 + THREADLOCAL.total_time = 0 def get_time_for_thread(): - return threadLocal.total_time + return THREADLOCAL.total_time def get_proxy_cycles(proxy_settings): @@ -152,22 +156,197 @@ def get_global_proxies(): return get_proxies(GLOBAL_PROXY_CYCLES) +async def close_connections_for_url(connection_pool: httpcore.AsyncConnectionPool, url: httpcore._utils.URL): + origin = httpcore._utils.url_to_origin(url) + logger.debug('Drop connections for %r', origin) + connections_to_close = connection_pool._connections_for_origin(origin) + for connection in connections_to_close: + await connection_pool._remove_from_pool(connection) + try: + await connection.aclose() + except httpcore.NetworkError as e: + logger.warning('Error closing an existing connection', exc_info=e) + + +class AsyncHTTPTransportNoHttp(httpcore.AsyncHTTPTransport): + """Block HTTP request""" + + async def arequest(self, method, url, headers=None, stream=None, ext=None): + raise httpcore.UnsupportedProtocol("HTTP protocol is disabled") + + +class AsyncProxyTransportFixed(AsyncProxyTransport): + """Fix httpx_socks.AsyncProxyTransport + + Map python_socks exceptions to httpcore.ProxyError + + Map socket.gaierror to httpcore.ConnectError + + Note: keepalive_expiry is ignored, AsyncProxyTransport should call: + * self._keepalive_sweep() + * self._response_closed(self, connection) + + Note: AsyncProxyTransport inherit from AsyncConnectionPool + + Note: the API is going to change on httpx 0.18.0 + see https://github.com/encode/httpx/pull/1522 + """ + + async def arequest(self, method, url, headers=None, stream=None, ext=None): + retry = 2 + while retry > 0: + retry -= 1 + try: + return await super().arequest(method, url, headers, stream, ext) + except (python_socks._errors.ProxyConnectionError, + python_socks._errors.ProxyTimeoutError, + python_socks._errors.ProxyError) as e: + raise httpcore.ProxyError(e) + except OSError as e: + # socket.gaierror when DNS 
resolution fails + raise httpcore.NetworkError(e) + except httpcore.RemoteProtocolError as e: + # in case of httpcore.RemoteProtocolError: Server disconnected + await close_connections_for_url(self, url) + logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e) + # retry + except (httpcore.NetworkError, httpcore.ProtocolError) as e: + # httpcore.WriteError on HTTP/2 connection leaves a new opened stream + # then each new request creates a new stream and raise the same WriteError + await close_connections_for_url(self, url) + raise e + + +class AsyncHTTPTransportFixed(httpx.AsyncHTTPTransport): + """Fix httpx.AsyncHTTPTransport""" + + async def arequest(self, method, url, headers=None, stream=None, ext=None): + retry = 2 + while retry > 0: + retry -= 1 + try: + return await super().arequest(method, url, headers, stream, ext) + except OSError as e: + # socket.gaierror when DNS resolution fails + raise httpcore.ConnectError(e) + except httpcore.CloseError as e: + # httpcore.CloseError: [Errno 104] Connection reset by peer + # raised by _keepalive_sweep() + # from https://github.com/encode/httpcore/blob/4b662b5c42378a61e54d673b4c949420102379f5/httpcore/_backends/asyncio.py#L198 # noqa + await close_connections_for_url(self._pool, url) + logger.warning('httpcore.CloseError: retry', exc_info=e) + # retry + except httpcore.RemoteProtocolError as e: + # in case of httpcore.RemoteProtocolError: Server disconnected + await close_connections_for_url(self._pool, url) + logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e) + # retry + except (httpcore.ProtocolError, httpcore.NetworkError) as e: + await close_connections_for_url(self._pool, url) + raise e + + +def get_transport_for_socks_proxy(verify, local_address, proxy_url): + global LOOP, LIMITS, TRANSPORT_KWARGS + # support socks5h (requests compatibility): + # https://requests.readthedocs.io/en/master/user/advanced/#socks + # socks5:// hostname is resolved on client side + # socks5h:// hostname is 
resolved on proxy side + rdns = False + socks5h = 'socks5h://' + if proxy_url.startswith(socks5h): + proxy_url = 'socks5://' + proxy_url[len(socks5h):] + rdns = True + + proxy_type, proxy_host, proxy_port, proxy_username, proxy_password = parse_proxy_url(proxy_url) + + return AsyncProxyTransportFixed(proxy_type=proxy_type, proxy_host=proxy_host, proxy_port=proxy_port, + username=proxy_username, password=proxy_password, + rdns=rdns, + loop=LOOP, + verify=verify, + local_address=local_address, + max_connections=LIMITS.max_connections, + max_keepalive_connections=LIMITS.max_keepalive_connections, + keepalive_expiry=LIMITS.keepalive_expiry, + **TRANSPORT_KWARGS) + + +def get_transport(verify, local_address, proxy_url): + global LIMITS + return AsyncHTTPTransportFixed(verify=verify, + local_address=local_address, + limits=LIMITS, + proxy=httpx._config.Proxy(proxy_url) if proxy_url else None, + **TRANSPORT_KWARGS) + + +def iter_proxies(proxies): + # https://www.python-httpx.org/compatibility/#proxy-keys + if isinstance(proxies, str): + yield 'all://', proxies + elif isinstance(proxies, dict): + for pattern, proxy_url in proxies.items(): + pattern = PROXY_PATTERN_MAPPING.get(pattern, pattern) + yield pattern, proxy_url + + +def new_client(verify, local_address, proxies, max_redirects, enable_http): + # See https://www.python-httpx.org/advanced/#routing + mounts = {} + for pattern, proxy_url in iter_proxies(proxies): + if not enable_http and (pattern == 'http' or pattern.startswith('http://')): + continue + if proxy_url.startswith('socks4://') \ + or proxy_url.startswith('socks5://') \ + or proxy_url.startswith('socks5h://'): + mounts[pattern] = get_transport_for_socks_proxy(verify, local_address, proxy_url) + else: + mounts[pattern] = get_transport(verify, local_address, proxy_url) + + if not enable_http: + mounts['http://'] = AsyncHTTPTransportNoHttp() + + transport = get_transport(verify, local_address, None) + return httpx.AsyncClient(transport=transport, 
mounts=mounts, max_redirects=max_redirects) + + +def get_client(verify, local_address, proxies, max_redirects, allow_http): + global CLIENTS + key = (verify, local_address, repr(proxies), max_redirects, allow_http) + if key not in CLIENTS: + CLIENTS[key] = new_client(verify, local_address, proxies, max_redirects, allow_http) + return CLIENTS[key] + + +async def send_request(method, url, enable_http, kwargs): + if isinstance(url, bytes): + url = url.decode() + + verify = kwargs.pop('verify', True) + local_address = next(LOCAL_ADDRESS_CYCLE) + proxies = kwargs.pop('proxies', None) or get_global_proxies() + max_redirects = kwargs.pop('max_redirects', DEFAULT_REDIRECT_LIMIT) + + client = get_client(verify, local_address, proxies, max_redirects, enable_http) + response = await client.request(method.upper(), url, **kwargs) + + # requests compatibility + # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses + response.ok = not response.is_error + + return response + + def request(method, url, **kwargs): """same as requests/requests/api.py request(...)""" time_before_request = time() - # session start - session = SessionSinglePool() - - # proxies - if not kwargs.get('proxies'): - kwargs['proxies'] = get_global_proxies() - # timeout if 'timeout' in kwargs: timeout = kwargs['timeout'] else: - timeout = getattr(threadLocal, 'timeout', None) + timeout = getattr(THREADLOCAL, 'timeout', None) if timeout is not None: kwargs['timeout'] = timeout @@ -178,24 +357,23 @@ def request(method, url, **kwargs): del kwargs['raise_for_httperror'] # do request - response = session.request(method=method, url=url, **kwargs) - - time_after_request = time() - - # is there a timeout for this engine ? 
- if timeout is not None: - timeout_overhead = 0.2 # seconds - # start_time = when the user request started - start_time = getattr(threadLocal, 'start_time', time_before_request) - search_duration = time_after_request - start_time - if search_duration > timeout + timeout_overhead: - raise requests.exceptions.Timeout(response=response) - - # session end - session.close() - - if hasattr(threadLocal, 'total_time'): - threadLocal.total_time += time_after_request - time_before_request + future = asyncio.run_coroutine_threadsafe(send_request(method, url, get_enable_http_protocol(), kwargs), LOOP) + try: + if timeout: + timeout += 0.2 # overhead + start_time = getattr(THREADLOCAL, 'start_time', time_before_request) + if start_time: + timeout -= time() - start_time + + response = future.result(timeout or 120) + except concurrent.futures.TimeoutError as e: + raise httpx.TimeoutException('Timeout', request=None) from e + + # update total_time. + # See get_time_for_thread() and reset_time_for_thread() + if hasattr(THREADLOCAL, 'total_time'): + time_after_request = time() + THREADLOCAL.total_time += time_after_request - time_before_request # raise an exception if check_for_httperror: @@ -204,6 +382,49 @@ def request(method, url, **kwargs): return response +async def stream_chunk_to_queue(method, url, q, **kwargs): + verify = kwargs.pop('verify', True) + local_address = next(LOCAL_ADDRESS_CYCLE) + proxies = kwargs.pop('proxies', None) or get_global_proxies() + # "30" from requests: + # https://github.com/psf/requests/blob/8c211a96cdbe9fe320d63d9e1ae15c5c07e179f8/requests/models.py#L55 + max_redirects = kwargs.pop('max_redirects', 30) + client = get_client(verify, local_address, proxies, max_redirects, True) + try: + async with client.stream(method, url, **kwargs) as response: + q.put(response) + async for chunk in response.aiter_bytes(65536): + if len(chunk) > 0: + q.put(chunk) + except (httpx.HTTPError, OSError, h2.exceptions.ProtocolError) as e: + q.put(e) + finally: + 
q.put(None) + + +def stream(method, url, **kwargs): + """Replace httpx.stream. + + Usage: + stream = poolrequests.stream(...) + response = next(stream) + for chunk in stream: + ... + + httpx.Client.stream requires to write the httpx.HTTPTransport version of the + the httpx.AsyncHTTPTransport declared above. + """ + q = SimpleQueue() + future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(method, url, q, **kwargs), LOOP) + chunk_or_exception = q.get(timeout=60) + while chunk_or_exception is not None: + if isinstance(chunk_or_exception, Exception): + raise chunk_or_exception + yield chunk_or_exception + chunk_or_exception = q.get(timeout=60) + return future.result() + + def get(url, **kwargs): kwargs.setdefault('allow_redirects', True) return request('get', url, **kwargs) @@ -233,3 +454,97 @@ def patch(url, data=None, **kwargs): def delete(url, **kwargs): return request('delete', url, **kwargs) + + +def init(): + # log + for logger_name in ('hpack.hpack', 'hpack.table'): + logging.getLogger(logger_name).setLevel(logging.WARNING) + + # loop + def loop_thread(): + global LOOP + LOOP = asyncio.new_event_loop() + LOOP.run_forever() + + th = threading.Thread( + target=loop_thread, + name='asyncio_loop', + daemon=True, + ) + th.start() + + +@atexit.register +def done(): + """Close all HTTP client + + Avoid a warning at exit + see https://github.com/encode/httpx/blob/1a6e254f72d9fd5694a1c10a28927e193ab4f76b/httpx/_client.py#L1785 + """ + global LOOP + + async def close_client(client): + try: + await client.aclose() + except httpx.HTTPError: + pass + + async def close_clients(): + await asyncio.gather(*[close_client(client) for client in CLIENTS.values()], return_exceptions=False) + future = asyncio.run_coroutine_threadsafe(close_clients(), LOOP) + # wait 3 seconds to close the HTTP clients + future.result(3) + + +init() + + +# ## TEMPORARY DEBUG ## + + +def debug_connection(connection): + now = LOOP.time() + expired = (connection.state == 
httpcore._async.base.ConnectionState.IDLE + and connection.expires_at is not None + and now >= connection.expires_at) + return connection.info()\ + + (', connect_failed' if connection.connect_failed else '')\ + + (', expired' if expired else '') + + +def debug_origin(origin): + return origin[0].decode() + '://' + origin[1].decode() + ':' + str(origin[2]) + + +def debug_transport(transport): + result = { + '__class__': str(transport.__class__.__name__) + } + if isinstance(transport, (httpx.AsyncHTTPTransport, AsyncHTTPTransportFixed)): + pool = transport._pool + result['__pool_class__'] = str(pool.__class__.__name__) + if isinstance(pool, httpcore.AsyncConnectionPool): + for origin, connections in pool._connections.items(): + result[debug_origin(origin)] = [debug_connection(connection) for connection in connections] + return result + elif isinstance(transport, AsyncProxyTransportFixed): + for origin, connections in transport._connections.items(): + result[debug_origin(origin)] = [debug_connection(connection) for connection in connections] + return result + return result + + +def debug_asyncclient(client, key=None): + result = {} + if key: + result['__key__'] = [k if isinstance(k, (str, int, float, bool, type(None))) else repr(k) for k in key] + result['__default__'] = debug_transport(client._transport) + for urlpattern, transport in client._mounts.items(): + result[urlpattern.pattern] = debug_transport(transport) + return result + + +def debug_asyncclients(): + global CLIENTS + return [debug_asyncclient(client, key) for key, client in CLIENTS.items()] -- cgit v1.2.3 From d14994dc73ba5c95382812581dac146d9eceaafa Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Mon, 5 Apr 2021 10:43:33 +0200 Subject: [httpx] replace searx.poolrequests by searx.network settings.yml: * outgoing.networks: * can contains network definition * propertiers: enable_http, verify, http2, max_connections, max_keepalive_connections, keepalive_expiry, local_addresses, support_ipv4, 
support_ipv6, proxies, max_redirects, retries * retries: 0 by default, number of times searx retries to send the HTTP request (using different IP & proxy each time) * local_addresses can be "192.168.0.1/24" (it supports IPv6) * support_ipv4 & support_ipv6: both True by default see https://github.com/searx/searx/pull/1034 * each engine can define a "network" section: * either a full network description * either reference an existing network * all HTTP requests of engine use the same HTTP configuration (it was not the case before, see proxy configuration in master) --- searx/poolrequests.py | 550 -------------------------------------------------- 1 file changed, 550 deletions(-) delete mode 100644 searx/poolrequests.py (limited to 'searx/poolrequests.py') diff --git a/searx/poolrequests.py b/searx/poolrequests.py deleted file mode 100644 index 8b365f913..000000000 --- a/searx/poolrequests.py +++ /dev/null @@ -1,550 +0,0 @@ -import atexit -import sys -import threading -import asyncio -import logging -import concurrent.futures -from time import time -from itertools import cycle - -import httpcore -import httpx -import h2.exceptions -from httpx_socks import AsyncProxyTransport -from python_socks import parse_proxy_url -import python_socks._errors - -from searx import settings -from searx import logger -from searx.raise_for_httperror import raise_for_httperror - -# Optional uvloop (support Python 3.6) -try: - import uvloop -except ImportError: - pass -else: - uvloop.install() - -# queue.SimpleQueue: Support Python 3.6 -try: - from queue import SimpleQueue -except ImportError: - from queue import Empty - from collections import deque - - class SimpleQueue: - """Minimal backport of queue.SimpleQueue""" - - def __init__(self): - self._queue = deque() - self._count = threading.Semaphore(0) - - def put(self, item): - self._queue.append(item) - self._count.release() - - def get(self, timeout=None): - if not self._count.acquire(True, timeout): - raise Empty - return 
self._queue.popleft() - - -logger = logger.getChild('poolrequests') - - -try: - import ssl - if ssl.OPENSSL_VERSION_INFO[0:3] < (1, 0, 2): - # https://github.com/certifi/python-certifi#1024-bit-root-certificates - logger.critical('You are using an old openssl version({0}), please upgrade above 1.0.2!' - .format(ssl.OPENSSL_VERSION)) - sys.exit(1) -except ImportError: - ssl = None -if not getattr(ssl, "HAS_SNI", False): - try: - import OpenSSL # pylint: disable=unused-import - except ImportError: - logger.critical("ssl doesn't support SNI and the pyopenssl module is not installed.\n" - "Some HTTPS connections will fail") - sys.exit(1) - - -LOOP = None -CLIENTS = dict() -THREADLOCAL = threading.local() -LIMITS = httpx.Limits( - # Magic number kept from previous code - max_connections=settings['outgoing'].get('pool_connections', 100), - # Picked from constructor - max_keepalive_connections=settings['outgoing'].get('pool_maxsize', 10), - # - keepalive_expiry=settings['outgoing'].get('keepalive_expiry', 5.0) -) -# default parameters for AsyncHTTPTransport -# see https://github.com/encode/httpx/blob/e05a5372eb6172287458b37447c30f650047e1b8/httpx/_transports/default.py#L108-L121 # noqa -TRANSPORT_KWARGS = { - 'http2': settings['outgoing'].get('http2', False), - 'retries': 0, - 'trust_env': False, - 'backend': 'asyncio' -} -# requests compatibility when reading proxy settings from settings.yml -PROXY_PATTERN_MAPPING = { - 'http': 'https://', - 'https:': 'https://' -} -# default maximum redirect -# from https://github.com/psf/requests/blob/8c211a96cdbe9fe320d63d9e1ae15c5c07e179f8/requests/models.py#L55 -DEFAULT_REDIRECT_LIMIT = 30 - - -if settings['outgoing'].get('source_ips'): - LOCAL_ADDRESS_CYCLE = cycle(settings['outgoing'].get('source_ips')) -else: - LOCAL_ADDRESS_CYCLE = cycle((None, )) - - -def set_timeout_for_thread(timeout, start_time=None): - THREADLOCAL.timeout = timeout - THREADLOCAL.start_time = start_time - - -def set_enable_http_protocol(enable_http): - 
THREADLOCAL.enable_http = enable_http - - -def get_enable_http_protocol(): - try: - return THREADLOCAL.enable_http - except AttributeError: - return False - - -def reset_time_for_thread(): - THREADLOCAL.total_time = 0 - - -def get_time_for_thread(): - return THREADLOCAL.total_time - - -def get_proxy_cycles(proxy_settings): - if not proxy_settings: - return None - # Backwards compatibility for single proxy in settings.yml - for protocol, proxy in proxy_settings.items(): - if isinstance(proxy, str): - proxy_settings[protocol] = [proxy] - - for protocol in proxy_settings: - proxy_settings[protocol] = cycle(proxy_settings[protocol]) - return proxy_settings - - -GLOBAL_PROXY_CYCLES = get_proxy_cycles(settings['outgoing'].get('proxies')) - - -def get_proxies(proxy_cycles): - if proxy_cycles: - return {protocol: next(proxy_cycle) for protocol, proxy_cycle in proxy_cycles.items()} - return None - - -def get_global_proxies(): - return get_proxies(GLOBAL_PROXY_CYCLES) - - -async def close_connections_for_url(connection_pool: httpcore.AsyncConnectionPool, url: httpcore._utils.URL): - origin = httpcore._utils.url_to_origin(url) - logger.debug('Drop connections for %r', origin) - connections_to_close = connection_pool._connections_for_origin(origin) - for connection in connections_to_close: - await connection_pool._remove_from_pool(connection) - try: - await connection.aclose() - except httpcore.NetworkError as e: - logger.warning('Error closing an existing connection', exc_info=e) - - -class AsyncHTTPTransportNoHttp(httpcore.AsyncHTTPTransport): - """Block HTTP request""" - - async def arequest(self, method, url, headers=None, stream=None, ext=None): - raise httpcore.UnsupportedProtocol("HTTP protocol is disabled") - - -class AsyncProxyTransportFixed(AsyncProxyTransport): - """Fix httpx_socks.AsyncProxyTransport - - Map python_socks exceptions to httpcore.ProxyError - - Map socket.gaierror to httpcore.ConnectError - - Note: keepalive_expiry is ignored, AsyncProxyTransport 
should call: - * self._keepalive_sweep() - * self._response_closed(self, connection) - - Note: AsyncProxyTransport inherit from AsyncConnectionPool - - Note: the API is going to change on httpx 0.18.0 - see https://github.com/encode/httpx/pull/1522 - """ - - async def arequest(self, method, url, headers=None, stream=None, ext=None): - retry = 2 - while retry > 0: - retry -= 1 - try: - return await super().arequest(method, url, headers, stream, ext) - except (python_socks._errors.ProxyConnectionError, - python_socks._errors.ProxyTimeoutError, - python_socks._errors.ProxyError) as e: - raise httpcore.ProxyError(e) - except OSError as e: - # socket.gaierror when DNS resolution fails - raise httpcore.NetworkError(e) - except httpcore.RemoteProtocolError as e: - # in case of httpcore.RemoteProtocolError: Server disconnected - await close_connections_for_url(self, url) - logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e) - # retry - except (httpcore.NetworkError, httpcore.ProtocolError) as e: - # httpcore.WriteError on HTTP/2 connection leaves a new opened stream - # then each new request creates a new stream and raise the same WriteError - await close_connections_for_url(self, url) - raise e - - -class AsyncHTTPTransportFixed(httpx.AsyncHTTPTransport): - """Fix httpx.AsyncHTTPTransport""" - - async def arequest(self, method, url, headers=None, stream=None, ext=None): - retry = 2 - while retry > 0: - retry -= 1 - try: - return await super().arequest(method, url, headers, stream, ext) - except OSError as e: - # socket.gaierror when DNS resolution fails - raise httpcore.ConnectError(e) - except httpcore.CloseError as e: - # httpcore.CloseError: [Errno 104] Connection reset by peer - # raised by _keepalive_sweep() - # from https://github.com/encode/httpcore/blob/4b662b5c42378a61e54d673b4c949420102379f5/httpcore/_backends/asyncio.py#L198 # noqa - await close_connections_for_url(self._pool, url) - logger.warning('httpcore.CloseError: retry', exc_info=e) - # 
retry - except httpcore.RemoteProtocolError as e: - # in case of httpcore.RemoteProtocolError: Server disconnected - await close_connections_for_url(self._pool, url) - logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e) - # retry - except (httpcore.ProtocolError, httpcore.NetworkError) as e: - await close_connections_for_url(self._pool, url) - raise e - - -def get_transport_for_socks_proxy(verify, local_address, proxy_url): - global LOOP, LIMITS, TRANSPORT_KWARGS - # support socks5h (requests compatibility): - # https://requests.readthedocs.io/en/master/user/advanced/#socks - # socks5:// hostname is resolved on client side - # socks5h:// hostname is resolved on proxy side - rdns = False - socks5h = 'socks5h://' - if proxy_url.startswith(socks5h): - proxy_url = 'socks5://' + proxy_url[len(socks5h):] - rdns = True - - proxy_type, proxy_host, proxy_port, proxy_username, proxy_password = parse_proxy_url(proxy_url) - - return AsyncProxyTransportFixed(proxy_type=proxy_type, proxy_host=proxy_host, proxy_port=proxy_port, - username=proxy_username, password=proxy_password, - rdns=rdns, - loop=LOOP, - verify=verify, - local_address=local_address, - max_connections=LIMITS.max_connections, - max_keepalive_connections=LIMITS.max_keepalive_connections, - keepalive_expiry=LIMITS.keepalive_expiry, - **TRANSPORT_KWARGS) - - -def get_transport(verify, local_address, proxy_url): - global LIMITS - return AsyncHTTPTransportFixed(verify=verify, - local_address=local_address, - limits=LIMITS, - proxy=httpx._config.Proxy(proxy_url) if proxy_url else None, - **TRANSPORT_KWARGS) - - -def iter_proxies(proxies): - # https://www.python-httpx.org/compatibility/#proxy-keys - if isinstance(proxies, str): - yield 'all://', proxies - elif isinstance(proxies, dict): - for pattern, proxy_url in proxies.items(): - pattern = PROXY_PATTERN_MAPPING.get(pattern, pattern) - yield pattern, proxy_url - - -def new_client(verify, local_address, proxies, max_redirects, enable_http): - # See 
# NOTE(review): this chunk was recovered from a mangled diff view; the first
# lines of new_client() are cut off, so its signature is reconstructed from the
# get_client() call site below — confirm against the full file.
def new_client(verify, local_address, proxies, max_redirects, enable_http):
    """Build a new ``httpx.AsyncClient`` with per-URL-pattern transports.

    Proxy routing is expressed through "mounts", see
    https://www.python-httpx.org/advanced/#routing
    """
    mounts = {}
    for pattern, proxy_url in iter_proxies(proxies):
        if not enable_http and (pattern == 'http' or pattern.startswith('http://')):
            continue
        if proxy_url.startswith('socks4://') \
                or proxy_url.startswith('socks5://') \
                or proxy_url.startswith('socks5h://'):
            mounts[pattern] = get_transport_for_socks_proxy(verify, local_address, proxy_url)
        else:
            mounts[pattern] = get_transport(verify, local_address, proxy_url)

    if not enable_http:
        # plain http:// is disabled: mount a transport that refuses such requests
        mounts['http://'] = AsyncHTTPTransportNoHttp()

    transport = get_transport(verify, local_address, None)
    return httpx.AsyncClient(transport=transport, mounts=mounts, max_redirects=max_redirects)


def get_client(verify, local_address, proxies, max_redirects, allow_http):
    """Return a cached AsyncClient for this parameter combination.

    Clients are cached in the module-level CLIENTS dict; ``repr(proxies)`` is
    used in the key because a dict is not hashable.
    """
    global CLIENTS
    key = (verify, local_address, repr(proxies), max_redirects, allow_http)
    if key not in CLIENTS:
        CLIENTS[key] = new_client(verify, local_address, proxies, max_redirects, allow_http)
    return CLIENTS[key]


async def send_request(method, url, enable_http, kwargs):
    """Coroutine actually performing the HTTP request on the shared event loop.

    Pops the requests-style keyword arguments (verify / proxies /
    max_redirects) out of *kwargs* before delegating to httpx.
    """
    if isinstance(url, bytes):
        url = url.decode()

    verify = kwargs.pop('verify', True)
    local_address = next(LOCAL_ADDRESS_CYCLE)
    proxies = kwargs.pop('proxies', None) or get_global_proxies()
    max_redirects = kwargs.pop('max_redirects', DEFAULT_REDIRECT_LIMIT)

    client = get_client(verify, local_address, proxies, max_redirects, enable_http)
    response = await client.request(method.upper(), url, **kwargs)

    # requests compatibility
    # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
    response.ok = not response.is_error

    return response


def request(method, url, **kwargs):
    """same as requests/requests/api.py request(...)

    Synchronous facade: schedules :func:`send_request` on the background
    asyncio loop (LOOP) and waits for the result, keeping per-thread timeout
    and total-time accounting in THREADLOCAL.
    """
    time_before_request = time()

    # timeout: explicit argument wins, else fall back to the per-thread default
    if 'timeout' in kwargs:
        timeout = kwargs['timeout']
    else:
        timeout = getattr(THREADLOCAL, 'timeout', None)
        if timeout is not None:
            kwargs['timeout'] = timeout

    # raise_for_error (same pop-with-default idiom as send_request)
    check_for_httperror = kwargs.pop('raise_for_httperror', True)

    # do request
    future = asyncio.run_coroutine_threadsafe(send_request(method, url, get_enable_http_protocol(), kwargs), LOOP)
    try:
        if timeout:
            timeout += 0.2  # overhead
            start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
            if start_time:
                # budget is counted from the start of the engine request,
                # not from this call
                timeout -= time() - start_time

        # 120 s is the fallback cap when no timeout was configured
        response = future.result(timeout or 120)
    except concurrent.futures.TimeoutError as e:
        raise httpx.TimeoutException('Timeout', request=None) from e

    # update total_time.
    # See get_time_for_thread() and reset_time_for_thread()
    if hasattr(THREADLOCAL, 'total_time'):
        time_after_request = time()
        THREADLOCAL.total_time += time_after_request - time_before_request

    # raise an exception
    if check_for_httperror:
        raise_for_httperror(response)

    return response


async def stream_chunk_to_queue(method, url, q, **kwargs):
    """Stream the response for *url* into the thread-safe queue *q*.

    Puts the response object first, then the body chunks, then either an
    exception or None as end-of-stream marker (consumed by :func:`stream`).
    """
    verify = kwargs.pop('verify', True)
    local_address = next(LOCAL_ADDRESS_CYCLE)
    proxies = kwargs.pop('proxies', None) or get_global_proxies()
    # same requests-derived default as the request() path
    # https://github.com/psf/requests/blob/8c211a96cdbe9fe320d63d9e1ae15c5c07e179f8/requests/models.py#L55
    max_redirects = kwargs.pop('max_redirects', DEFAULT_REDIRECT_LIMIT)
    client = get_client(verify, local_address, proxies, max_redirects, True)
    try:
        async with client.stream(method, url, **kwargs) as response:
            q.put(response)
            async for chunk in response.aiter_bytes(65536):
                if len(chunk) > 0:
                    q.put(chunk)
    except (httpx.HTTPError, OSError, h2.exceptions.ProtocolError) as e:
        # forward the error to the consuming thread instead of losing it
        q.put(e)
    finally:
        q.put(None)


def stream(method, url, **kwargs):
    """Replace httpx.stream.

    Usage:
    stream = poolrequests.stream(...)
    response = next(stream)
    for chunk in stream:
        ...

    httpx.Client.stream requires to write the httpx.HTTPTransport version of
    the httpx.AsyncHTTPTransport declared above.
    """
    q = SimpleQueue()
    future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(method, url, q, **kwargs), LOOP)
    # first item is the response object, following items are body chunks;
    # an Exception is re-raised here, None terminates the stream
    chunk_or_exception = q.get(timeout=60)
    while chunk_or_exception is not None:
        if isinstance(chunk_or_exception, Exception):
            raise chunk_or_exception
        yield chunk_or_exception
        chunk_or_exception = q.get(timeout=60)
    return future.result()


def get(url, **kwargs):
    kwargs.setdefault('allow_redirects', True)
    return request('get', url, **kwargs)


def options(url, **kwargs):
    kwargs.setdefault('allow_redirects', True)
    return request('options', url, **kwargs)


def head(url, **kwargs):
    # requests compatibility: HEAD does not follow redirects by default
    kwargs.setdefault('allow_redirects', False)
    return request('head', url, **kwargs)


def post(url, data=None, **kwargs):
    return request('post', url, data=data, **kwargs)


def put(url, data=None, **kwargs):
    return request('put', url, data=data, **kwargs)


def patch(url, data=None, **kwargs):
    return request('patch', url, data=data, **kwargs)


def delete(url, **kwargs):
    return request('delete', url, **kwargs)


def init():
    """Start the background asyncio loop used by request() / stream()."""
    # log: silence the very verbose HTTP/2 header-table loggers
    for logger_name in ('hpack.hpack', 'hpack.table'):
        logging.getLogger(logger_name).setLevel(logging.WARNING)

    # loop
    def loop_thread():
        global LOOP
        LOOP = asyncio.new_event_loop()
        LOOP.run_forever()

    th = threading.Thread(
        target=loop_thread,
        name='asyncio_loop',
        daemon=True,
    )
    th.start()


@atexit.register
def done():
    """Close all HTTP client

    Avoid a warning at exit
    see https://github.com/encode/httpx/blob/1a6e254f72d9fd5694a1c10a28927e193ab4f76b/httpx/_client.py#L1785
    """
    global LOOP

    async def close_client(client):
        try:
            await client.aclose()
        except httpx.HTTPError:
            pass

    async def close_clients():
        await asyncio.gather(*[close_client(client) for client in CLIENTS.values()], return_exceptions=False)

    future = asyncio.run_coroutine_threadsafe(close_clients(), LOOP)
    # wait 3 seconds to close the HTTP clients
    future.result(3)


init()


# ## TEMPORARY DEBUG ##


def debug_connection(connection):
    """Describe one httpcore connection (info string + failure/expiry flags)."""
    now = LOOP.time()
    expired = (connection.state == httpcore._async.base.ConnectionState.IDLE
               and connection.expires_at is not None
               and now >= connection.expires_at)
    return connection.info()\
        + (', connect_failed' if connection.connect_failed else '')\
        + (', expired' if expired else '')


def debug_origin(origin):
    """Format an httpcore origin tuple (scheme, host, port) as a URL prefix."""
    return origin[0].decode() + '://' + origin[1].decode() + ':' + str(origin[2])


def debug_transport(transport):
    """Dump the connection pool of one transport as a plain dict."""
    result = {
        '__class__': str(transport.__class__.__name__)
    }
    if isinstance(transport, (httpx.AsyncHTTPTransport, AsyncHTTPTransportFixed)):
        pool = transport._pool
        result['__pool_class__'] = str(pool.__class__.__name__)
        if isinstance(pool, httpcore.AsyncConnectionPool):
            for origin, connections in pool._connections.items():
                result[debug_origin(origin)] = [debug_connection(connection) for connection in connections]
            return result
    elif isinstance(transport, AsyncProxyTransportFixed):
        for origin, connections in transport._connections.items():
            result[debug_origin(origin)] = [debug_connection(connection) for connection in connections]
        return result
    return result


def debug_asyncclient(client, key=None):
    """Dump one cached AsyncClient: cache key, default transport and mounts."""
    result = {}
    if key:
        result['__key__'] = [k if isinstance(k, (str, int, float, bool, type(None))) else repr(k) for k in key]
    result['__default__'] = debug_transport(client._transport)
    for urlpattern, transport in client._mounts.items():
        result[urlpattern.pattern] = debug_transport(transport)
    return result


def debug_asyncclients():
    """Dump every cached AsyncClient for debugging."""
    global CLIENTS
    return [debug_asyncclient(client, key) for key, client in CLIENTS.items()]