summaryrefslogtreecommitdiff
path: root/searx
diff options
context:
space:
mode:
author Aadniz <8147434+Aadniz@users.noreply.github.com> 2025-03-07 23:28:28 +0100
committer Bnyro <bnyro@tutanota.com> 2025-03-08 12:39:16 +0100
commit a88b4d7036002639ee09d01efb0279c87b8e23c3 (patch)
tree 1958999d5ebe5c8af37c084264ccae80495266d1 /searx
parent 73d50f57481cfe2951c3231b2f5dfd70c0e6b49f (diff)
[fix] presearch engine: domain sometimes included in beginning of titles
Diffstat (limited to 'searx')
-rw-r--r-- searx/engines/presearch.py 27
1 files changed, 24 insertions, 3 deletions
diff --git a/searx/engines/presearch.py b/searx/engines/presearch.py
index 8a2614bb5..870f2383b 100644
--- a/searx/engines/presearch.py
+++ b/searx/engines/presearch.py
@@ -64,7 +64,7 @@ Implementations
"""
-from urllib.parse import urlencode
+from urllib.parse import urlencode, urlparse
from searx import locales
from searx.network import get
from searx.utils import gen_useragent, html_to_text
@@ -155,13 +155,34 @@ def _strip_leading_strings(text):
return text.strip()
+def _fix_title(title, url):
+ """
+ Return ``title`` passed through ``html_to_text`` with a fused leading domain removed.
+ Presearch may glue the result's domain (the netloc of ``url``) onto the title start.
+ Example: "translate.google.co.in<em>Google</em> Translate" -> "Google Translate"
+ """
+ parsed_url = urlparse(url)
+ domain = parsed_url.netloc
+ title = html_to_text(title)
+ # Strip the domain only when more text is fused directly after it; a title that is
+ # just the domain, or has "/" or " " right after it, is kept (presumably intended).
+ if (
+ title.startswith(domain)
+ and len(title) > len(domain)
+ and not title.startswith(domain + "/")
+ and not title.startswith(domain + " ")
+ ):
+ title = title.removeprefix(domain)  # str.removeprefix needs Python 3.9+
+ return title
+
+
def parse_search_query(json_results):
results = []
for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
result = {
'url': item['link'],
- 'title': html_to_text(item['title']),
+ 'title': _fix_title(item['title'], item['link']),
'thumbnail': item['image'],
'content': '',
'metadata': item.get('source'),
@@ -171,7 +192,7 @@ def parse_search_query(json_results):
for item in json_results.get('standardResults', []):
result = {
'url': item['link'],
- 'title': html_to_text(item['title']),
+ 'title': _fix_title(item['title'], item['link']),
'content': html_to_text(item['description']),
}
results.append(result)