diff options
Diffstat (limited to 'searx/engines/google_videos.py')
| -rw-r--r-- | searx/engines/google_videos.py | 115 |
1 files changed, 32 insertions, 83 deletions
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index 5ab29f9ff..985189df5 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""This is the implementation of the google videos engine. +"""This is the implementation of the Google Videos engine. .. admonition:: Content-Security-Policy (CSP) @@ -14,9 +14,8 @@ """ -# pylint: disable=invalid-name +from typing import TYPE_CHECKING -import re from urllib.parse import urlencode from lxml import html @@ -27,20 +26,22 @@ from searx.utils import ( extract_text, ) +from searx.engines.google import fetch_traits # pylint: disable=unused-import from searx.engines.google import ( - get_lang_info, + get_google_info, time_range_dict, filter_mapping, - g_section_with_header, - title_xpath, suggestion_xpath, detect_google_sorry, ) +from searx.enginelib.traits import EngineTraits -# pylint: disable=unused-import -from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits +if TYPE_CHECKING: + import logging -# pylint: enable=unused-import + logger: logging.Logger + +traits: EngineTraits # about about = { @@ -55,70 +56,32 @@ about = { # engine dependent config categories = ['videos', 'web'] -paging = False +paging = True language_support = True -use_locale_domain = True time_range_support = True safesearch = True -send_accept_language_header = True - -RE_CACHE = {} - - -def _re(regexpr): - """returns compiled regular expression""" - RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr)) - return RE_CACHE[regexpr] - - -def scrap_out_thumbs_src(dom): - ret_val = {} - thumb_name = 'dimg_' - for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'): - _script = script.text - # "dimg_35":"https://i.ytimg.c....", - _dimurl = _re("s='([^']*)").findall(_script) - for k, v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script): - v = v.replace(r'\u003d', '=') - v = v.replace(r'\u0026', '&') - ret_val[k] = v - logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) - return ret_val - - -def scrap_out_thumbs(dom): - """Scrap out thumbnail data from <script> tags.""" - ret_val = {} - thumb_name = 'dimg_' - - for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'): - _script = script.text - - # var s='data:image/jpeg;base64, ...' - _imgdata = _re("s='([^']*)").findall(_script) - if not _imgdata: - continue - - # var ii=['dimg_17'] - for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script): - # At least the equal sign in the URL needs to be decoded - ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=") - - logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) - return ret_val def request(query, params): """Google-Video search request""" - lang_info = get_lang_info(params, supported_languages, language_aliases, False) + google_info = get_google_info(params, traits) query_url = ( 'https://' - + lang_info['subdomain'] + + google_info['subdomain'] + '/search' + "?" - + urlencode({'q': query, 'tbm': "vid", **lang_info['params'], 'ie': "utf8", 'oe': "utf8"}) + + urlencode( + { + 'q': query, + 'tbm': "vid", + 'start': 10 * params['pageno'], + **google_info['params'], + 'asearch': 'arc', + 'async': 'use_ac:true,_fmt:html', + } + ) ) if params['time_range'] in time_range_dict: @@ -127,9 +90,8 @@ def request(query, params): query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) params['url'] = query_url - params['cookies']['CONSENT'] = "YES+" - params['headers'].update(lang_info['headers']) - params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) return params @@ -141,43 +103,30 @@ def response(resp): # convert the text to dom dom = html.fromstring(resp.text) - vidthumb_imgdata = scrap_out_thumbs(dom) - thumbs_src = scrap_out_thumbs_src(dom) - logger.debug(str(thumbs_src)) # parse results for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): - # ignore google *sections* - if extract_text(eval_xpath(result, g_section_with_header)): - logger.debug("ignoring <g-section-with-header>") - continue - - # ingnore articles without an image id / e.g. news articles - img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None) - if img_id is None: - logger.error("no img_id found in item %s (news article?)", len(results) + 1) + img_src = eval_xpath_getindex(result, './/img/@src', 0, None) + if img_src is None: continue - img_src = vidthumb_imgdata.get(img_id, None) - if not img_src: - img_src = thumbs_src.get(img_id, "") + title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0)) + url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0) - title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) - url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0) - length = extract_text(eval_xpath(result, './/div[contains(@class, "P7xzyf")]/span/span')) c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0) content = extract_text(c_node) - pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]')) + pub_info = extract_text(eval_xpath(result, './/div[@class="P7xzyf"]')) + length = extract_text(eval_xpath(result, './/div[@class="J1mWY"]')) results.append( { 'url': url, 'title': title, 'content': content, - 'length': length, 'author': pub_info, 'thumbnail': img_src, + 'length': length, 'template': 'videos.html', } ) |