summaryrefslogtreecommitdiff
path: root/searx/engines/presearch.py
diff options
context:
space:
mode:
Diffstat (limited to 'searx/engines/presearch.py')
-rw-r--r--searx/engines/presearch.py148
1 files changed, 113 insertions, 35 deletions
diff --git a/searx/engines/presearch.py b/searx/engines/presearch.py
index c41cf3b37..1e20465ed 100644
--- a/searx/engines/presearch.py
+++ b/searx/engines/presearch.py
@@ -1,6 +1,20 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Presearch (general, images, videos, news)
+
+.. hint::
+
+ The results in the video category are most often links to pages that contain
+ a video, for instance many links from presearch's video category link
+ content from facebook (aka Meta) or Twitter (aka X). Since these are not
+ real links to video streams, SearXNG can't use the video template for them,
+ and if SearXNG can't use this template, then the user doesn't want to see
+ hits in the videos category.
+
+ TL;DR: by default presearch's video category is placed into categories::
+
+ categories: [general, web]
+
"""
from urllib.parse import urlencode
@@ -19,12 +33,18 @@ paging = True
time_range_support = True
categories = ["general", "web"] # general, images, videos, news
-search_type = "search" # must be any of "search", "images", "videos", "news"
+search_type = "search"
+"""must be any of ``search``, ``images``, ``videos``, ``news``"""
base_url = "https://presearch.com"
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
+def init(_):
+ if search_type not in ['search', 'images', 'videos', 'news']:
+ raise ValueError(f'presearch search_type: {search_type}')
+
+
def _get_request_id(query, page, time_range, safesearch):
args = {
"q": query,
@@ -38,7 +58,7 @@ def _get_request_id(query, page, time_range, safesearch):
'User-Agent': gen_useragent(),
'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
}
- resp_text = get(url, headers=headers).text
+ resp_text = get(url, headers=headers).text # type: ignore
for line in resp_text.split("\n"):
if "window.searchId = " in line:
@@ -47,11 +67,6 @@ def _get_request_id(query, page, time_range, safesearch):
return None
-def _is_valid_img_src(url):
- # in some cases, the image url is a base64 encoded string, which has to be skipped
- return "https://" in url
-
-
def request(query, params):
request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
@@ -61,42 +76,105 @@ def request(query, params):
return params
-def response(resp):
- results = []
+def _strip_leading_strings(text):
+ for x in ['wikipedia', 'google']:
+ if text.lower().endswith(x):
+ text = text[: -len(x)]
+ return text.strip()
- json = resp.json()
- json_results = []
- if search_type == "search":
- json_results = json['results'].get('standardResults', [])
- else:
- json_results = json.get(search_type, [])
+def parse_search_query(json_results):
+ results = []
- for json_result in json_results:
+ for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
result = {
- 'url': json_result['link'],
- 'title': json_result['title'],
- 'content': html_to_text(json_result.get('description', '')),
+ 'url': item['link'],
+ 'title': item['title'],
+ 'img_src': item['image'],
+ 'content': '',
+ 'metadata': item.get('source'),
}
- if search_type == "images":
- result['template'] = 'images.html'
-
- if not _is_valid_img_src(json_result['image']):
- continue
-
- result['img_src'] = json_result['image']
- if _is_valid_img_src(json_result['thumbnail']):
- result['thumbnail'] = json_result['thumbnail']
+ results.append(result)
- elif search_type == "videos":
- result['template'] = 'videos.html'
+ for item in json_results.get('standardResults', []):
+ result = {
+ 'url': item['link'],
+ 'title': item['title'],
+ 'content': html_to_text(item['description']),
+ }
+ results.append(result)
- if _is_valid_img_src(json_result['image']):
- result['thumbnail'] = json_result['image']
+ info = json_results.get('infoSection', {}).get('data')
+ if info:
+ attributes = []
+ for item in info.get('about', []):
+ label, value = html_to_text(item).split(':', 1)
+ value = _strip_leading_strings(value)
+ attributes.append({'label': label, 'value': value})
+ content = []
+ for item in [info['subtitle'], info['description']]:
+ item = _strip_leading_strings(html_to_text(item))
+ if item:
+ content.append(item)
+
+ results.append(
+ {
+ 'infobox': info['title'],
+ 'id': info['title'],
+ 'img_src': info.get('image'),
+ 'content': ' | '.join(content),
+ 'attributes': attributes,
+ }
+ )
+ return results
- result['duration'] = json_result['duration']
- result['length'] = json_result['duration']
- results.append(result)
+def response(resp):
+ results = []
+ json_resp = resp.json()
+
+ if search_type == 'search':
+ results = parse_search_query(json_resp['results'])
+
+ elif search_type == 'images':
+ for item in json_resp['images']:
+ results.append(
+ {
+ 'template': 'images.html',
+ 'title': item['title'],
+ 'url': item['link'],
+ 'img_src': item['image'],
+ 'thumbnail_src': item['thumbnail'],
+ }
+ )
+
+ elif search_type == 'videos':
+ # The results in the video category are most often links to pages that contain
+ # a video and not to a video stream --> SearXNG can't use the video template.
+
+ for item in json_resp['videos']:
+ metadata = [x for x in [item.get('description'), item.get('duration')] if x]
+ results.append(
+ {
+ 'title': item['title'],
+ 'url': item['link'],
+ 'content': '',
+ 'metadata': ' / '.join(metadata),
+ 'img_src': item.get('image'),
+ }
+ )
+
+ elif search_type == 'news':
+ for item in json_resp['news']:
+ metadata = [x for x in [item.get('source'), item.get('time')] if x]
+ results.append(
+ {
+ 'title': item['title'],
+ 'url': item['link'],
+ 'content': item['description'],
+ 'metadata': ' / '.join(metadata),
+ 'img_src': item.get('image'),
+ }
+ )
return results