summary | refs | log | tree | commit | diff
path: root/searx/engines/startpage.py
diff options
context:
space:
mode:
author: Bnyro <bnyro@tutanota.com> 2025-02-13 11:43:45 +0100
committer: Markus Heiser <markus.heiser@darmarIT.de> 2025-02-20 13:44:28 +0100
commit: 0f2fc5879dfd789467609bc0611a6e4d7b6a7ab5 (patch)
tree: ff40351c2498e2465be8c1cef67d7e5331883c6b /searx/engines/startpage.py
parent: feb15e3878920ee7bf6e3d726fac0fcd1f89a896 (diff)
[feat] startpage: support for news and images
Diffstat (limited to 'searx/engines/startpage.py')
-rw-r--r-- searx/engines/startpage.py 167
1 files changed, 102 insertions, 65 deletions
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index f90cf2ac9..54e05604b 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -74,24 +74,25 @@ Startpage's category (for Web-search, News, Videos, ..) is set by
.. hint::
- The default category is ``web`` .. and other categories than ``web`` are not
- yet implemented.
+ Supported categories are ``web``, ``news`` and ``images``.
"""
# pylint: disable=too-many-statements
+from __future__ import annotations
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
from collections import OrderedDict
import re
from unicodedata import normalize, combining
from time import time
from datetime import datetime, timedelta
+from json import loads
import dateutil.parser
import lxml.html
import babel.localedata
-from searx.utils import extract_text, eval_xpath, gen_useragent
+from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.locales import region_tag
@@ -250,22 +251,13 @@ def request(query, params):
Additionally the arguments from Startpage's search form need to be set in
HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
"""
- if startpage_categ == 'web':
- return _request_cat_web(query, params)
-
- logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
- return params
-
-
-def _request_cat_web(query, params):
-
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
# build arguments
args = {
'query': query,
- 'cat': 'web',
+ 'cat': startpage_categ,
't': 'device',
'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers,
'with_date': time_range_dict.get(params['time_range'], ''),
@@ -317,73 +309,118 @@ def _request_cat_web(query, params):
return params
-# get response from search-request
-def response(resp):
- dom = lxml.html.fromstring(resp.text)
+def _parse_published_date(content: str) -> tuple[str, datetime | None]:
+ published_date = None
- if startpage_categ == 'web':
- return _response_cat_web(dom)
+ # check if search result starts with something like: "2 Sep 2014 ... "
+ if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+ date_pos = content.find('...') + 4
+ date_string = content[0 : date_pos - 5]
+ # fix content string
+ content = content[date_pos:]
- logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
- return []
+ try:
+ published_date = dateutil.parser.parse(date_string, dayfirst=True)
+ except ValueError:
+ pass
+ # check if search result starts with something like: "5 days ago ... "
+ elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
+ date_pos = content.find('...') + 4
+ date_string = content[0 : date_pos - 5]
-def _response_cat_web(dom):
- results = []
+ # calculate datetime
+ published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) # type: ignore
- # parse results
- for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
- links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
- if not links:
- continue
- link = links[0]
- url = link.attrib.get('href')
+ # fix content string
+ content = content[date_pos:]
- # block google-ad url's
- if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
- continue
+ return content, published_date
- # block startpage search url's
- if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
- continue
- title = extract_text(eval_xpath(link, 'h2'))
- content = eval_xpath(result, './/p[contains(@class, "description")]')
- content = extract_text(content, allow_none=True) or ''
+def _get_web_result(result):
+ content = html_to_text(result.get('description'))
+ content, publishedDate = _parse_published_date(content)
+
+ return {
+ 'url': result['clickUrl'],
+ 'title': html_to_text(result['title']),
+ 'content': content,
+ 'publishedDate': publishedDate,
+ }
- published_date = None
- # check if search result starts with something like: "2 Sep 2014 ... "
- if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
- date_pos = content.find('...') + 4
- date_string = content[0 : date_pos - 5]
- # fix content string
- content = content[date_pos:]
+def _get_news_result(result):
- try:
- published_date = dateutil.parser.parse(date_string, dayfirst=True)
- except ValueError:
- pass
+ title = remove_pua_from_str(html_to_text(result['title']))
+ content = remove_pua_from_str(html_to_text(result.get('description')))
+
+ publishedDate = None
+ if result.get('date'):
+ publishedDate = datetime.fromtimestamp(result['date'] / 1000)
- # check if search result starts with something like: "5 days ago ... "
- elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
- date_pos = content.find('...') + 4
- date_string = content[0 : date_pos - 5]
+ thumbnailUrl = None
+ if result.get('thumbnailUrl'):
+ thumbnailUrl = base_url + result['thumbnailUrl']
- # calculate datetime
- published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) # type: ignore
+ return {
+ 'url': result['clickUrl'],
+ 'title': title,
+ 'content': content,
+ 'publishedDate': publishedDate,
+ 'thumbnail': thumbnailUrl,
+ }
- # fix content string
- content = content[date_pos:]
- if published_date:
- # append result
- results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date})
- else:
- # append result
- results.append({'url': url, 'title': title, 'content': content})
+def _get_image_result(result) -> dict[str, Any] | None:
+ url = result.get('altClickUrl')
+ if not url:
+ return None
+
+ thumbnailUrl = None
+ if result.get('thumbnailUrl'):
+ thumbnailUrl = base_url + result['thumbnailUrl']
+
+ resolution = None
+ if result.get('width') and result.get('height'):
+ resolution = f"{result['width']}x{result['height']}"
+
+ filesize = None
+ if result.get('filesize'):
+ size_str = ''.join(filter(str.isdigit, result['filesize']))
+ filesize = humanize_bytes(int(size_str))
+
+ return {
+ 'template': 'images.html',
+ 'url': url,
+ 'title': html_to_text(result['title']),
+ 'content': '',
+ 'img_src': result.get('rawImageUrl'),
+ 'thumbnail_src': thumbnailUrl,
+ 'resolution': resolution,
+ 'img_format': result.get('format'),
+ 'filesize': filesize,
+ }
+
+
+def response(resp):
+ categ = startpage_categ.capitalize()
+ results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
+ results_json = loads(results_raw)
+ results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {})
+
+ results = []
+ for results_categ in results_obj.get('mainline', []):
+ for item in results_categ.get('results', []):
+ if results_categ['display_type'] == 'web-google':
+ results.append(_get_web_result(item))
+ elif results_categ['display_type'] == 'news-bing':
+ results.append(_get_news_result(item))
+ elif 'images' in results_categ['display_type']:
+ item = _get_image_result(item)
+ if item:
+ results.append(item)
- # return results
return results