diff options
| author | Aadniz <8147434+Aadniz@users.noreply.github.com> | 2025-03-07 23:28:28 +0100 |
|---|---|---|
| committer | Bnyro <bnyro@tutanota.com> | 2025-03-08 12:39:16 +0100 |
| commit | a88b4d7036002639ee09d01efb0279c87b8e23c3 (patch) | |
| tree | 1958999d5ebe5c8af37c084264ccae80495266d1 /searx | |
| parent | 73d50f57481cfe2951c3231b2f5dfd70c0e6b49f (diff) | |
[fix] presearch engine: domain sometimes included in beginning of titles
Diffstat (limited to 'searx')
| -rw-r--r-- | searx/engines/presearch.py | 27 |
1 files changed, 24 insertions, 3 deletions
diff --git a/searx/engines/presearch.py b/searx/engines/presearch.py
index 8a2614bb5..870f2383b 100644
--- a/searx/engines/presearch.py
+++ b/searx/engines/presearch.py
@@ -64,7 +64,7 @@ Implementations
 
 """
 
-from urllib.parse import urlencode
+from urllib.parse import urlencode, urlparse
 from searx import locales
 from searx.network import get
 from searx.utils import gen_useragent, html_to_text
@@ -155,13 +155,34 @@ def _strip_leading_strings(text):
     return text.strip()
 
 
+def _fix_title(title, url):
+    """
+    Titles from Presearch shows domain + title without spacing, and HTML
+    This function removes these 2 issues.
+    Transforming "translate.google.co.in<em>Google</em> Translate" into "Google Translate"
+    """
+    parsed_url = urlparse(url)
+    domain = parsed_url.netloc
+    title = html_to_text(title)
+    # Fixes issue where domain would show up in the title
+    # translate.google.co.inGoogle Translate -> Google Translate
+    if (
+        title.startswith(domain)
+        and len(title) > len(domain)
+        and not title.startswith(domain + "/")
+        and not title.startswith(domain + " ")
+    ):
+        title = title.removeprefix(domain)
+    return title
+
+
 def parse_search_query(json_results):
     results = []
 
     for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
         result = {
             'url': item['link'],
-            'title': html_to_text(item['title']),
+            'title': _fix_title(item['title'], item['link']),
             'thumbnail': item['image'],
             'content': '',
             'metadata': item.get('source'),
@@ -171,7 +192,7 @@ def parse_search_query(json_results):
     for item in json_results.get('standardResults', []):
         result = {
             'url': item['link'],
-            'title': html_to_text(item['title']),
+            'title': _fix_title(item['title'], item['link']),
             'content': html_to_text(item['description']),
         }
         results.append(result)