diff options
| -rw-r--r-- | Dockerfile | 2 | ||||
| -rw-r--r-- | docs/src/searx.engines.google.rst | 55 | ||||
| -rwxr-xr-x | manage | 2 | ||||
| -rw-r--r-- | searx/engines/google.py | 76 | ||||
| -rw-r--r-- | searx/engines/google_images.py | 13 | ||||
| -rw-r--r-- | searx/engines/google_news.py | 9 | ||||
| -rw-r--r-- | searx/engines/google_videos.py | 11 | ||||
| -rw-r--r-- | searx/settings.yml | 4 |
8 files changed, 122 insertions, 50 deletions
diff --git a/Dockerfile b/Dockerfile index 7ad892bb2..13885611b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.13 +FROM alpine:3.14 ENTRYPOINT ["/sbin/tini","--","/usr/local/searx/dockerfiles/docker-entrypoint.sh"] EXPOSE 8080 VOLUME /etc/searx diff --git a/docs/src/searx.engines.google.rst b/docs/src/searx.engines.google.rst new file mode 100644 index 000000000..2d10b5eea --- /dev/null +++ b/docs/src/searx.engines.google.rst @@ -0,0 +1,55 @@ +.. _google engines: + +============== +Google Engines +============== + +.. contents:: Contents + :depth: 2 + :local: + :backlinks: entry + + +.. _google API: + +google API +========== + +.. _Query Parameter Definitions: + https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions + +For detailed description of the *REST-full* API see: `Query Parameter +Definitions`_. Not all parameters can be appied and some engines are *special* +(e.g. :ref:`google news engine`). + +.. _google web engine: + +Google WEB +========== + +.. automodule:: searx.engines.google + :members: + +.. _google images engine: + +Google Images +============= + +.. automodule:: searx.engines.google_images + :members: + +.. _google videos engine: + +Google Videos +============= + +.. automodule:: searx.engines.google_videos + :members: + +.. _google news engine: + +Google News +=========== + +.. automodule:: searx.engines.google_news + :members: @@ -337,7 +337,7 @@ pygments.less() { } py.build() { - build_msg BUILD "[pylint] python package ${PYDIST}" + build_msg BUILD "python package ${PYDIST}" pyenv.cmd python setup.py \ sdist -d "${PYDIST}" \ bdist_wheel --bdist-dir "${PYBUILD}" -d "${PYDIST}" diff --git a/searx/engines/google.py b/searx/engines/google.py index 841212e09..2ad9b4ca8 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -1,12 +1,28 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Google (Web) +"""This is the implementation of the google WEB engine. Some of this +implementations are shared by other engines: -For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. +- :ref:`google images engine` +- :ref:`google news engine` +- :ref:`google videos engine` + +The google WEB engine itself has a special setup option: + +.. code:: yaml + + - name: google + ... + use_mobile_ui: true + +``use_mobile_ui``: (default: ``true``) + Enables to use *mobile endpoint* to bypass the google blocking (see + :issue:`159`). On the mobile UI of Google Search, the button :guilabel:`More + results` is not affected by Google rate limiting and we can still do requests + while actively blocked by the original Google search. By activate + ``use_mobile_ui`` this behavior is simulated by adding the parameter + ``async=use_ac:true,_fmt:pc`` to the :py:func:`request`. -.. _Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions """ # pylint: disable=invalid-name, missing-function-docstring @@ -34,6 +50,7 @@ categories = ['general'] paging = True time_range_support = True safesearch = True +use_mobile_ui = False supported_languages_url = 'https://www.google.com/preferences?#languages' # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests @@ -136,8 +153,9 @@ spelling_suggestion_xpath = '//div[@class="med"]/p/a' def get_lang_info(params, lang_list, custom_aliases, supported_any_language): """Composing various language properties for the google engines. - This function is called by the various google engines (google itself, - google-images, -news, -scholar, -videos). + This function is called by the various google engines (:ref:`google web + engine`, :ref:`google images engine`, :ref:`google news engine` and + :ref:`google videos engine`). :param dict param: request parameters of the engine @@ -145,7 +163,7 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>` :param dict lang_list: custom aliases for non standard language codes - (used when calling :py:func:`searx.utils.match_language) + (used when calling :py:func:`searx.utils.match_language`) :param bool supported_any_language: When a language is not specified, the language interpretation is left up to Google to decide how the search @@ -158,7 +176,7 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): Py-Dictionary with the key/value pairs: language: - Return value from :py:func:`searx.utils.match_language + Return value from :py:func:`searx.utils.match_language` country: The country code (e.g. US, AT, CA, FR, DE ..) @@ -266,6 +284,12 @@ def request(query, params): params, supported_languages, language_aliases, True ) + additional_parameters = {} + if use_mobile_ui: + additional_parameters = { + 'async': 'use_ac:true,_fmt:pc', + } + # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ 'q': query, @@ -273,6 +297,7 @@ def request(query, params): 'ie': "utf8", 'oe': "utf8", 'start': offset, + **additional_parameters, }) if params['time_range'] in time_range_dict: @@ -282,9 +307,12 @@ def request(query, params): params['url'] = query_url params['headers'].update(lang_info['headers']) - params['headers']['Accept'] = ( - 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' - ) + if use_mobile_ui: + params['headers']['Accept'] = '*/*' + else: + params['headers']['Accept'] = ( + 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + ) return params @@ -300,21 +328,23 @@ def response(resp): dom = html.fromstring(resp.text) # results --> answer - answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()') - if answer: - results.append({'answer': ' '.join(answer)}) + answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') + if answer_list: + answer_list = [_.xpath("normalize-space()") for _ in answer_list] + results.append({'answer': ' '.join(answer_list)}) else: logger.debug("did not find 'answer'") # results --> number_of_results - try: - _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0) - _digit = ''.join([n for n in _txt if n.isdigit()]) - number_of_results = int(_digit) - results.append({'number_of_results': number_of_results}) - except Exception as e: # pylint: disable=broad-except - logger.debug("did not 'number_of_results'") - logger.error(e, exc_info=True) + if not use_mobile_ui: + try: + _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0) + _digit = ''.join([n for n in _txt if n.isdigit()]) + number_of_results = int(_digit) + results.append({'number_of_results': number_of_results}) + except Exception as e: # pylint: disable=broad-except + logger.debug("did not 'number_of_results'") + logger.error(e, exc_info=True) # parse results for result in eval_xpath_list(dom, results_xpath): diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index e7382a6fe..c17227a64 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -1,19 +1,14 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Google (Images) +"""This is the implementation of the google images engine. -For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. - -.. _admonition:: Content-Security-Policy (CSP) +.. admonition:: Content-Security-Policy (CSP) This engine needs to allow images from the `data URLs`_ (prefixed with the - ``data:` scheme).:: + ``data:`` scheme):: - Header set Content-Security-Policy "img-src 'self' data: ;" + Header set Content-Security-Policy "img-src 'self' data: ;" -.. _Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions .. _data URLs: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs """ diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index c1c97b700..e6d50855e 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -1,16 +1,11 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Google (News) - -For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. Not all parameters can be appied: +"""This is the implementation of the google news engine. The google news API +ignores some parameters from the common :ref:`google API`: - num_ : the number of search results is ignored - save_ : is ignored / Google-News results are always *SafeSearch* -.. _Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions - .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp .. _save: https://developers.google.com/custom-search/docs/xml_results#safesp diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index c57db4e63..3747e85a5 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -1,19 +1,14 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Google (Video) +"""This is the implementation of the google videos engine. -For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. Not all parameters can be appied. - -.. _admonition:: Content-Security-Policy (CSP) +.. admonition:: Content-Security-Policy (CSP) This engine needs to allow images from the `data URLs`_ (prefixed with the - ``data:` scheme).:: + ``data:`` scheme):: Header set Content-Security-Policy "img-src 'self' data: ;" -.. _Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions .. _data URLs: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs diff --git a/searx/settings.yml b/searx/settings.yml index 1b19cc29e..3925081d6 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -583,6 +583,8 @@ engines: - name: google engine: google shortcut: go + # see https://searxng.github.io/searxng/src/searx.engines.google.html#module-searx.engines.google + use_mobile_ui: true # additional_tests: # android: *test_android @@ -1454,7 +1456,7 @@ engines: timeout: 5.0 disabled: true - - name: słownik języka polskiego + - name: slownik jezyka polskiego engine: sjp shortcut: sjp base_url: https://sjp.pwn.pl/ |