diff options

 -rw-r--r--  .github/workflows/data-update.yml |  4
 -rw-r--r--  Makefile                          |  3
 -rw-r--r--  docs/dev/engine_overview.rst      |  1
 -rw-r--r--  searx/engines/duckduckgo.py       | 20
 -rw-r--r--  searx/engines/json_engine.py      | 20
 -rw-r--r--  searx/engines/mediathekviewweb.py | 68
 -rw-r--r--  searx/engines/seznam.py           | 64
 -rw-r--r--  searx/engines/wikipedia.py        | 11
 -rw-r--r--  searx/poolrequests.py             | 12
 -rw-r--r--  searx/search/processors/online.py |  6
 -rw-r--r--  searx/settings.yml                | 26

 11 files changed, 198 insertions(+), 37 deletions(-)
diff --git a/.github/workflows/data-update.yml b/.github/workflows/data-update.yml index 70e491153..b8218a47d 100644 --- a/.github/workflows/data-update.yml +++ b/.github/workflows/data-update.yml @@ -1,13 +1,13 @@ name: "Update searx.data" on: schedule: - - cron: "37 13 * * 0" + - cron: "05 15 * * 5" jobs: updateData: name: Update data runs-on: ubuntu-20.04 - if: secrets.DATA_PR_TOKEN != null + if: ${{ github.repository_owner == 'searx'}} steps: - name: Checkout uses: actions/checkout@v2 @@ -193,7 +193,8 @@ PYLINT_FILES=\ searx/engines/google.py \ searx/engines/google_news.py \ searx/engines/google_videos.py \ - searx/engines/google_images.py + searx/engines/google_images.py \ + searx/engines/mediathekviewweb.py test.pylint: pyenvinstall $(call cmd,pylint,$(PYLINT_FILES)) diff --git a/docs/dev/engine_overview.rst b/docs/dev/engine_overview.rst index 7007e7b99..b51181974 100644 --- a/docs/dev/engine_overview.rst +++ b/docs/dev/engine_overview.rst @@ -169,6 +169,7 @@ headers set HTTP header information data set HTTP data information cookies set HTTP cookies verify bool Performing SSL-Validity check +allow_redirects bool Follow redirects max_redirects int maximum redirects, hard limit soft_max_redirects int maximum redirects, soft limit. 
Record an error but don't stop the engine raise_for_httperror bool True by default: raise an exception if the HTTP code of response is >= 300 diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 7f1378264..92431b137 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -5,7 +5,8 @@ from lxml.html import fromstring from json import loads -from searx.utils import extract_text, match_language, eval_xpath +from searx.utils import extract_text, match_language, eval_xpath, dict_subset +from searx.poolrequests import get # about about = { @@ -35,6 +36,7 @@ language_aliases = { # search-url url = 'https://html.duckduckgo.com/html' +url_ping = 'https://duckduckgo.com/t/sl_h' time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm'} @@ -65,27 +67,33 @@ def request(query, params): params['url'] = url params['method'] = 'POST' - params['data']['b'] = '' params['data']['q'] = query - params['data']['df'] = '' + params['data']['b'] = '' region_code = get_region_code(params['language'], supported_languages) if region_code: params['data']['kl'] = region_code params['cookies']['kl'] = region_code + if params['time_range'] in time_range_dict: params['data']['df'] = time_range_dict[params['time_range']] + params['allow_redirects'] = False return params # get response from search-request def response(resp): - results = [] + if resp.status_code == 303: + return [] - doc = fromstring(resp.text) + # ping + headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie']) + get(url_ping, headers=headers_ping) - # parse results + # parse the response + results = [] + doc = fromstring(resp.text) for i, r in enumerate(eval_xpath(doc, result_xpath)): if i >= 30: break diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py index f4a5ff6d2..8a04d34b2 100644 --- a/searx/engines/json_engine.py +++ b/searx/engines/json_engine.py @@ -3,13 +3,15 @@ from collections.abc import Iterable from json 
import loads from urllib.parse import urlencode -from searx.utils import to_string +from searx.utils import to_string, html_to_text search_url = None url_query = None content_query = None title_query = None +content_html_to_text = False +title_html_to_text = False paging = False suggestion_query = '' results_query = '' @@ -92,9 +94,17 @@ def request(query, params): return params +def identity(arg): + return arg + + def response(resp): results = [] json = loads(resp.text) + + title_filter = html_to_text if title_html_to_text else identity + content_filter = html_to_text if content_html_to_text else identity + if results_query: rs = query(json, results_query) if not len(rs): @@ -111,8 +121,8 @@ def response(resp): content = "" results.append({ 'url': to_string(url), - 'title': to_string(title), - 'content': to_string(content), + 'title': title_filter(to_string(title)), + 'content': content_filter(to_string(content)), }) else: for url, title, content in zip( @@ -122,8 +132,8 @@ def response(resp): ): results.append({ 'url': to_string(url), - 'title': to_string(title), - 'content': to_string(content), + 'title': title_filter(to_string(title)), + 'content': content_filter(to_string(content)), }) if not suggestion_query: diff --git a/searx/engines/mediathekviewweb.py b/searx/engines/mediathekviewweb.py new file mode 100644 index 000000000..fa442c937 --- /dev/null +++ b/searx/engines/mediathekviewweb.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""MediathekViewWeb (API) + +""" + +# pylint: disable=missing-function-docstring + +import datetime +from json import loads, dumps + +about = { + "website": 'https://mediathekviewweb.de/', + "wikidata_id": 'Q27877380', + "official_api_documentation": 'https://gist.github.com/bagbag/a2888478d27de0e989cf777f81fb33de', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['videos'] +paging = True +time_range_support = False +safesearch = False + +def request(query, 
params): + + params['url'] = 'https://mediathekviewweb.de/api/query' + params['method'] = 'POST' + params['headers']['Content-type'] = 'text/plain' + params['data'] = dumps({ + 'queries' : [ + { + 'fields' : [ + 'title', + 'topic', + ], + 'query' : query + }, + ], + 'sortBy' : 'timestamp', + 'sortOrder' : 'desc', + 'future' : True, + 'offset' : (params['pageno'] - 1 )* 10, + 'size' : 10 + }) + return params + +def response(resp): + + resp = loads(resp.text) + + mwv_result = resp['result'] + mwv_result_list = mwv_result['results'] + + results = [] + + for item in mwv_result_list: + + item['hms'] = str(datetime.timedelta(seconds=item['duration'])) + + results.append({ + 'url' : item['url_video_hd'], + 'title' : "%(channel)s: %(title)s (%(hms)s)" % item, + 'length' : item['hms'], + 'content' : "%(description)s" % item, + }) + + return results diff --git a/searx/engines/seznam.py b/searx/engines/seznam.py new file mode 100644 index 000000000..1df92a845 --- /dev/null +++ b/searx/engines/seznam.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Seznam +""" + +from urllib.parse import urlencode, urlparse +from lxml import html +from searx.poolrequests import get +from searx.exceptions import SearxEngineAccessDeniedException +from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex + +# about +about = { + "website": "https://www.seznam.cz/", + "wikidata_id": "Q3490485", + "official_api_documentation": "https://api.sklik.cz/", + "use_official_api": False, + "require_api_key": False, + "results": "HTML", +} + +base_url = 'https://search.seznam.cz/' + + +def request(query, params): + response_index = get(base_url, headers=params['headers'], raise_for_httperror=True) + dom = html.fromstring(response_index.text) + + url_params = {'q': query} + for e in eval_xpath_list(dom, '//input[@type="hidden"]'): + name = e.get('name') + value = e.get('value') + url_params[name] = value + + params['url'] = base_url + '?' 
+ urlencode(url_params) + params['cookies'] = response_index.cookies + return params + + +def response(resp): + resp_url = urlparse(resp.url) + if resp_url.path.startswith('/verify'): + raise SearxEngineAccessDeniedException() + + results = [] + + dom = html.fromstring(resp.content.decode()) + for result_element in eval_xpath_list(dom, '//div[@id="searchpage-root"]//div[@data-dot="results"]/div'): + dot_data = eval_xpath_getindex(result_element, './div/div[@data-dot-data]/@data-dot-data', 0, default=None) + if dot_data is None: + title_element = eval_xpath_getindex(result_element, './/h3/a', 0) + results.append({ + 'url': title_element.get('href'), + 'title': extract_text(title_element), + 'content': extract_text(eval_xpath_getindex(title_element, '../../div[2]', 0)), + }) + elif dot_data == '{"reporter_name":"hint/related/relates"}': + suggestions_element = eval_xpath_getindex(result_element, + './div/div[@data-dot="main-box"]', 0, default=None) + if suggestions_element is not None: + for suggestion in eval_xpath_list(suggestions_element, './/ul/li'): + results.append({'suggestion': extract_text(suggestion)}) + + return results diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index c8e589e64..2adfefa69 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -56,6 +56,17 @@ def request(query, params): def response(resp): if resp.status_code == 404: return [] + + if resp.status_code == 400: + try: + api_result = loads(resp.text) + except: + pass + else: + if api_result['type'] == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request' \ + and api_result['detail'] == 'title-invalid-characters': + return [] + raise_for_httperror(resp) results = [] diff --git a/searx/poolrequests.py b/searx/poolrequests.py index 25a6baed9..8b8681437 100644 --- a/searx/poolrequests.py +++ b/searx/poolrequests.py @@ -1,7 +1,7 @@ import sys from time import time from itertools import cycle -from threading import RLock, local +from threading 
import local import requests @@ -88,10 +88,12 @@ class SessionSinglePool(requests.Session): super().__init__() # reuse the same adapters - with RLock(): - self.adapters.clear() - self.mount('https://', next(https_adapters)) - self.mount('http://', next(http_adapters)) + self.adapters.clear() + + https_adapter = threadLocal.__dict__.setdefault('https_adapter', next(https_adapters)) + http_adapter = threadLocal.__dict__.setdefault('http_adapter', next(http_adapters)) + self.mount('https://', https_adapter) + self.mount('http://', http_adapter) def close(self): """Call super, but clear adapters since there are managed globaly""" diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index d79edd542..df0ab8c21 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -73,11 +73,15 @@ class OnlineProcessor(EngineProcessor): if max_redirects: request_args['max_redirects'] = max_redirects + # allow_redirects + if 'allow_redirects' in params: + request_args['allow_redirects'] = params['allow_redirects'] + # soft_max_redirects soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0) # raise_for_status - request_args['raise_for_httperror'] = params.get('raise_for_httperror', False) + request_args['raise_for_httperror'] = params.get('raise_for_httperror', True) # specific type of request (GET or POST) if params['method'] == 'GET': diff --git a/searx/settings.yml b/searx/settings.yml index 9c0b8a1d1..2f96cce22 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -267,7 +267,9 @@ engines: search_url : https://search.crossref.org/dois?q={query}&page={pageno} url_query : doi title_query : title + title_html_to_text: True content_query : fullCitation + content_html_to_text: True categories : science shortcut : cr about: @@ -757,6 +759,7 @@ engines: url_query : metadata/oaf:entity/oaf:result/children/instance/webresource/url/$ title_query : metadata/oaf:entity/oaf:result/title/$ content_query : 
metadata/oaf:entity/oaf:result/description/$ + content_html_to_text: True categories : science shortcut : oad timeout: 5.0 @@ -776,6 +779,7 @@ engines: url_query : metadata/oaf:entity/oaf:result/children/instance/webresource/url/$ title_query : metadata/oaf:entity/oaf:result/title/$ content_query : metadata/oaf:entity/oaf:result/description/$ + content_html_to_text: True categories : science shortcut : oap timeout: 5.0 @@ -1165,24 +1169,8 @@ engines: - name : seznam shortcut: szn - engine: xpath - paging : True - search_url : https://search.seznam.cz/?q={query}&count=10&from={pageno} - results_xpath: //div[@class="Page-content"]//div[contains(@class, "Result ")] - url_xpath : ./h3/a/@href - title_xpath : ./h3 - content_xpath : .//p[@class="Result-description"] - suggestion_xpath: //div[@class="Related-container"]//div[@class="RelatedItem"]/div/span/a - first_page_num : 0 - page_size : 10 + engine: seznam disabled : True - about: - website: https://www.seznam.cz/ - wikidata_id: Q3490485 - official_api_documentation: https://api.sklik.cz/ - use_official_api: false - require_api_key: false - results: HTML - name : mojeek shortcut: mjk @@ -1253,6 +1241,10 @@ engines: categories: videos disabled : True + - name : mediathekviewweb + engine : mediathekviewweb + shortcut : mvw + # - name : yacy # engine : yacy # shortcut : ya |