summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Alexandre Flament <alex@al-f.net> 2021-11-26 08:25:38 +0100
committer GitHub <noreply@github.com> 2021-11-26 08:25:38 +0100
commit 328473befd61c3df6e91772fc9dcce3bfb23830d (patch)
tree c4590a6783dda203c368c4bd986cf8084ec8db12
parent bd285de48a7db9f67c9e5dd50aba8742f552a245 (diff)
parent 1ce09df9aa4d08d2125dca8f83906c5954048d0a (diff)
Merge pull request #547 from return42/fix-442
[fix] google & google video engines
-rw-r--r-- searx/engines/google.py 11
-rw-r--r-- searx/engines/google_videos.py 57
2 files changed, 32 insertions, 36 deletions
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 4e6fa6190..578dec60c 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -138,12 +138,7 @@ content_xpath = './/div[@class="IsZvec"]'
# Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself.
-suggestion_xpath = '//div[contains(@class, "card-section")]//a'
-
-# Since google does *auto-correction* on the first query these are not really
-# *spelling suggestions*, we use them anyway.
-spelling_suggestion_xpath = '//div[@class="med"]/p/a'
-
+suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
"""Composing various language properties for the google engines.
@@ -322,7 +317,6 @@ def response(resp):
# convert the text to dom
dom = html.fromstring(resp.text)
-
# results --> answer
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
if answer_list:
@@ -379,9 +373,6 @@ def response(resp):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
- for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
- results.append({'correction': extract_text(correction)})
-
# return results
return results
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 9403ef4f7..abf046f4c 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -31,13 +31,9 @@ from searx.engines.google import (
get_lang_info,
time_range_dict,
filter_mapping,
- results_xpath,
g_section_with_header,
title_xpath,
- href_xpath,
- content_xpath,
suggestion_xpath,
- spelling_suggestion_xpath,
detect_google_sorry,
)
@@ -74,11 +70,27 @@ def _re(regexpr):
RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
return RE_CACHE[regexpr]
+
+def scrap_out_thumbs_src(dom):
+    """Scrap out thumbnail URLs from ``google.ldi={...}`` <script> tags.
+
+    Returns a dict that maps every ``dimg_<n>`` key found in those scripts to
+    its URL (only values starting with ``http`` are matched).
+    """
+    ret_val = {}
+    thumb_name = 'dimg_'
+    # "dimg_35":"https://i.ytimg.c....",
+    key_url_re = _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)')
+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
+        for k, v in key_url_re.findall(script.text):
+            # the URL is embedded in JS: decode the escaped '=' and '&'
+            v = v.replace(r'\u003d', '=')
+            v = v.replace(r'\u0026', '&')
+            ret_val[k] = v
+    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
+    return ret_val
+
+
def scrap_out_thumbs(dom):
"""Scrap out thumbnail data from <script> tags.
"""
ret_val = {}
- thumb_name = 'vidthumb'
+ thumb_name = 'dimg_'
for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
_script = script.text
@@ -88,20 +100,11 @@ def scrap_out_thumbs(dom):
if not _imgdata:
continue
- # var ii=['vidthumb4','vidthumb7']
+ # var ii=['dimg_17']
for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
# At least the equal sign in the URL needs to be decoded
ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
- # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
- for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
- _script = script.text
- for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
- match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
- if match:
- # At least the equal sign in the URL needs to be decoded
- ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
-
logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
return ret_val
@@ -145,9 +148,11 @@ def response(resp):
# convert the text to dom
dom = html.fromstring(resp.text)
vidthumb_imgdata = scrap_out_thumbs(dom)
+ thumbs_src = scrap_out_thumbs_src(dom)
+ logger.debug(str(thumbs_src))
# parse results
- for result in eval_xpath_list(dom, results_xpath):
+ for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
# google *sections*
if extract_text(eval_xpath(result, g_section_with_header)):
@@ -155,21 +160,24 @@ def response(resp):
continue
title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
- url = eval_xpath_getindex(result, href_xpath, 0)
- c_node = eval_xpath_getindex(result, content_xpath, 0)
+ url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
# <img id="vidthumb1" ...>
- img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
+ img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
if img_id is None:
+ logger.error("no img_id for: %s" % result)
continue
+
img_src = vidthumb_imgdata.get(img_id, None)
if not img_src:
logger.error("no vidthumb imgdata for: %s" % img_id)
- img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
+ img_src = thumbs_src.get(img_id, "")
- length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
- content = extract_text(eval_xpath(c_node, './/div[2]/span'))
- pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))
+ length = extract_text(eval_xpath(
+ result, './/div[contains(@class, "P7xzyf")]/span/span'))
+ c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
+ content = extract_text(c_node)
+ pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]'))
results.append({
'url': url,
@@ -186,7 +194,4 @@ def response(resp):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
- for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
- results.append({'correction': extract_text(correction)})
-
return results