diff options
Diffstat (limited to 'searx/engines/google.py')
| -rw-r--r-- | searx/engines/google.py | 76 |
1 files changed, 53 insertions, 23 deletions
diff --git a/searx/engines/google.py b/searx/engines/google.py index 841212e09..2ad9b4ca8 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -1,12 +1,28 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Google (Web) +"""This is the implementation of the google WEB engine. Some of this +implementations are shared by other engines: -For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. +- :ref:`google images engine` +- :ref:`google news engine` +- :ref:`google videos engine` + +The google WEB engine itself has a special setup option: + +.. code:: yaml + + - name: google + ... + use_mobile_ui: true + +``use_mobile_ui``: (default: ``true``) + Enables to use *mobile endpoint* to bypass the google blocking (see + :issue:`159`). On the mobile UI of Google Search, the button :guilabel:`More + results` is not affected by Google rate limiting and we can still do requests + while actively blocked by the original Google search. By activate + ``use_mobile_ui`` this behavior is simulated by adding the parameter + ``async=use_ac:true,_fmt:pc`` to the :py:func:`request`. -.. _Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions """ # pylint: disable=invalid-name, missing-function-docstring @@ -34,6 +50,7 @@ categories = ['general'] paging = True time_range_support = True safesearch = True +use_mobile_ui = False supported_languages_url = 'https://www.google.com/preferences?#languages' # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests @@ -136,8 +153,9 @@ spelling_suggestion_xpath = '//div[@class="med"]/p/a' def get_lang_info(params, lang_list, custom_aliases, supported_any_language): """Composing various language properties for the google engines. - This function is called by the various google engines (google itself, - google-images, -news, -scholar, -videos). + This function is called by the various google engines (:ref:`google web + engine`, :ref:`google images engine`, :ref:`google news engine` and + :ref:`google videos engine`). :param dict param: request parameters of the engine @@ -145,7 +163,7 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>` :param dict lang_list: custom aliases for non standard language codes - (used when calling :py:func:`searx.utils.match_language) + (used when calling :py:func:`searx.utils.match_language`) :param bool supported_any_language: When a language is not specified, the language interpretation is left up to Google to decide how the search @@ -158,7 +176,7 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): Py-Dictionary with the key/value pairs: language: - Return value from :py:func:`searx.utils.match_language + Return value from :py:func:`searx.utils.match_language` country: The country code (e.g. US, AT, CA, FR, DE ..) @@ -266,6 +284,12 @@ def request(query, params): params, supported_languages, language_aliases, True ) + additional_parameters = {} + if use_mobile_ui: + additional_parameters = { + 'async': 'use_ac:true,_fmt:pc', + } + # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ 'q': query, @@ -273,6 +297,7 @@ def request(query, params): 'ie': "utf8", 'oe': "utf8", 'start': offset, + **additional_parameters, }) if params['time_range'] in time_range_dict: @@ -282,9 +307,12 @@ def request(query, params): params['url'] = query_url params['headers'].update(lang_info['headers']) - params['headers']['Accept'] = ( - 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' - ) + if use_mobile_ui: + params['headers']['Accept'] = '*/*' + else: + params['headers']['Accept'] = ( + 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + ) return params @@ -300,21 +328,23 @@ def response(resp): dom = html.fromstring(resp.text) # results --> answer - answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()') - if answer: - results.append({'answer': ' '.join(answer)}) + answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') + if answer_list: + answer_list = [_.xpath("normalize-space()") for _ in answer_list] + results.append({'answer': ' '.join(answer_list)}) else: logger.debug("did not find 'answer'") # results --> number_of_results - try: - _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0) - _digit = ''.join([n for n in _txt if n.isdigit()]) - number_of_results = int(_digit) - results.append({'number_of_results': number_of_results}) - except Exception as e: # pylint: disable=broad-except - logger.debug("did not 'number_of_results'") - logger.error(e, exc_info=True) + if not use_mobile_ui: + try: + _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0) + _digit = ''.join([n for n in _txt if n.isdigit()]) + number_of_results = int(_digit) + results.append({'number_of_results': number_of_results}) + except Exception as e: # pylint: disable=broad-except + logger.debug("did not 'number_of_results'") + logger.error(e, exc_info=True) # parse results for result in eval_xpath_list(dom, results_xpath): |