 searx/engines/google.py        | 11 +----------
 searx/engines/google_videos.py | 57 ++++++++++++++++++++++++++++-----------------------------
 2 files changed, 32 insertions(+), 36 deletions(-)
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 4e6fa6190..578dec60c 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -138,12 +138,7 @@ content_xpath = './/div[@class="IsZvec"]'
 
 # Suggestions are links placed in a *card-section*, we extract only the text
 # from the links not the links itself.
-suggestion_xpath = '//div[contains(@class, "card-section")]//a'
-
-# Since google does *auto-correction* on the first query these are not really
-# *spelling suggestions*, we use them anyway.
-spelling_suggestion_xpath = '//div[@class="med"]/p/a'
-
+suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
 
 def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
     """Composing various language properties for the google engines.
@@ -322,7 +317,6 @@ def response(resp):
     # convert the text to dom
     dom = html.fromstring(resp.text)
 
-    # results --> answer
     answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
     if answer_list:
@@ -379,9 +373,6 @@ def response(resp):
         # append suggestion
         results.append({'suggestion': extract_text(suggestion)})
 
-    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
-        results.append({'correction': extract_text(correction)})
-
     # return results
     return results
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 9403ef4f7..abf046f4c 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -31,13 +31,9 @@ from searx.engines.google import (
     get_lang_info,
     time_range_dict,
     filter_mapping,
-    results_xpath,
     g_section_with_header,
     title_xpath,
-    href_xpath,
-    content_xpath,
     suggestion_xpath,
-    spelling_suggestion_xpath,
     detect_google_sorry,
 )
@@ -74,11 +70,27 @@ def _re(regexpr):
     RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
     return RE_CACHE[regexpr]
 
+
+def scrap_out_thumbs_src(dom):
+    ret_val = {}
+    thumb_name = 'dimg_'
+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
+        _script = script.text
+        # "dimg_35":"https://i.ytimg.c....",
+        _dimurl = _re("s='([^']*)").findall( _script)
+        for k,v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)' ).findall(_script):
+            v = v.replace(r'\u003d','=')
+            v = v.replace(r'\u0026','&')
+            ret_val[k] = v
+    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
+    return ret_val
+
+
 def scrap_out_thumbs(dom):
     """Scrap out thumbnail data from <script> tags.
     """
     ret_val = {}
-    thumb_name = 'vidthumb'
+    thumb_name = 'dimg_'
 
     for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
         _script = script.text
@@ -88,20 +100,11 @@ def scrap_out_thumbs(dom):
         if not _imgdata:
             continue
 
-        # var ii=['vidthumb4','vidthumb7']
+        # var ii=['dimg_17']
         for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
             # At least the equal sign in the URL needs to be decoded
             ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
 
-    # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
-    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
-        _script = script.text
-        for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
-            match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
-            if match:
-                # At least the equal sign in the URL needs to be decoded
-                ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
-
     logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
     return ret_val
@@ -145,9 +148,11 @@ def response(resp):
     # convert the text to dom
     dom = html.fromstring(resp.text)
     vidthumb_imgdata = scrap_out_thumbs(dom)
+    thumbs_src = scrap_out_thumbs_src(dom)
+    logger.debug(str(thumbs_src))
 
     # parse results
-    for result in eval_xpath_list(dom, results_xpath):
+    for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
 
         # google *sections*
         if extract_text(eval_xpath(result, g_section_with_header)):
@@ -155,21 +160,24 @@ def response(resp):
             continue
 
         title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
-        url = eval_xpath_getindex(result, href_xpath, 0)
-        c_node = eval_xpath_getindex(result, content_xpath, 0)
+        url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
 
         # <img id="vidthumb1" ...>
-        img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
+        img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
         if img_id is None:
+            logger.error("no img_id for: %s" % result)
             continue
+
         img_src = vidthumb_imgdata.get(img_id, None)
         if not img_src:
             logger.error("no vidthumb imgdata for: %s" % img_id)
-            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
+            img_src = thumbs_src.get(img_id, "")
 
-        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
-        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
-        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))
+        length = extract_text(eval_xpath(
+            result, './/div[contains(@class, "P7xzyf")]/span/span'))
+        c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
+        content = extract_text(c_node)
+        pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]'))
 
         results.append({
             'url': url,
@@ -186,7 +194,4 @@ def response(resp):
         # append suggestion
         results.append({'suggestion': extract_text(suggestion)})
 
-    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
-        results.append({'correction': extract_text(correction)})
-
     return results