summaryrefslogtreecommitdiff
path: root/searx
diff options
context:
space:
mode:
author Bnyro <bnyro@tutanota.com> 2025-11-04 18:00:02 +0100
committer Bnyro <bnyro@tutanota.com> 2025-11-05 17:56:13 +0100
commit 1be19f8b5820d1c7b369f80cc48c6014a6d41085 (patch)
tree 8ccab7cf0f8710cd320309f38681f6c55bd6ce8b /searx
parent 3763b4bff4b6ad6d4d84ba74539755292d33bdec (diff)
[feat] sourcehut engine: implement as custom module, fix user agent
SourceHut uses a FOSS bot protection tool called `go-away` (which I can recommend BTW). It blocks common crawler user agents, such as the standard Firefox user agent. Hence, we're now using our custom SearXNG user agent to clarify we're not a crawler.

Closes: https://github.com/searxng/searxng/issues/5270
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
-rw-r--r-- searx/engines/sourcehut.py 90
-rw-r--r-- searx/settings.yml 19
2 files changed, 93 insertions, 16 deletions
diff --git a/searx/engines/sourcehut.py b/searx/engines/sourcehut.py
new file mode 100644
index 000000000..fa3df8b23
--- /dev/null
+++ b/searx/engines/sourcehut.py
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Engine to search in the collaborative software platform SourceHut_.
+
+.. _SourceHut: https://sourcehut.org/
+
+Configuration
+=============
+
+You can configure the following setting:
+
+- :py:obj:`sourcehut_sort_order`
+
+.. code:: yaml
+
+   - name: sourcehut
+     shortcut: srht
+     engine: sourcehut
+     # sourcehut_sort_order: longest-active
+
+Implementations
+===============
+
+"""
+
+import typing as t
+
+from urllib.parse import urlencode, urljoin
+from lxml import html
+
+from searx.utils import eval_xpath, eval_xpath_list, extract_text, searxng_useragent
+from searx.result_types import EngineResults
+
+if t.TYPE_CHECKING:
+ from searx.extended_types import SXNG_Response
+ from searx.search.processors import OnlineParams
+
+
+# Engine metadata: project website, Wikidata entry and documentation link.
+# The engine does not use an official API and needs no API key; results are
+# taken from HTML.
+about = {
+ "website": "https://sourcehut.org",
+ "wikidata_id": "Q78514485",
+ "official_api_documentation": "https://man.sr.ht/",
+ "use_official_api": False,
+ "require_api_key": False,
+ "results": "HTML",
+}
+
+# Categories this engine is listed under.
+categories = ["it", "repos"]
+# The requested page number is sent as the ``page`` URL argument.
+paging = True
+
+base_url: str = "https://sr.ht/projects"
+"""Browse public projects."""
+
+
+sourcehut_sort_order: str = "recently-updated"
+"""The sort order of the results. Possible values:
+
+- ``recently-updated``
+- ``longest-active``
+"""
+
+
+def request(query: str, params: "OnlineParams") -> None:
+    """Fill ``params`` with the URL and headers of a SourceHut project search.
+
+    The URL is :py:obj:`base_url` followed by the URL-encoded ``search``,
+    ``page`` and ``sort`` arguments; ``sort`` is taken from
+    :py:obj:`sourcehut_sort_order`.
+    """
+    query_string = urlencode(
+        {
+            "search": query,
+            "page": params["pageno"],
+            "sort": sourcehut_sort_order,
+        }
+    )
+    params["url"] = base_url + "?" + query_string
+
+    # 'go-away', a foss bot detection tool, blocks standard user agents
+    headers = params["headers"]
+    headers["User-Agent"] = searxng_useragent()
+
+
+def response(resp: "SXNG_Response") -> EngineResults:
+    """Parse SourceHut's HTML project listing into results.
+
+    Every ``event`` entry of the first ``event-list`` container in the
+    response is added to the returned :py:obj:`EngineResults` as one result
+    using the ``packages.html`` template.
+    """
+
+    res = EngineResults()
+    doc = html.fromstring(resp.text)
+
+    for item in eval_xpath_list(doc, "(//div[@class='event-list'])[1]/div[contains(@class, 'event')]"):
+        # Resolve the project link against base_url instead of appending it:
+        # base_url is the listing page, and plain concatenation would keep its
+        # "/projects" path in every result URL.
+        # NOTE(review): assumes the href is a site-rooted path such as
+        # "/~owner/project" -- confirm against the live markup.
+        href = extract_text(eval_xpath(item, "./h4/a[2]/@href")) or ""
+        res.add(
+            res.types.LegacyResult(
+                template="packages.html",
+                url=urljoin(base_url, href),
+                title=extract_text(eval_xpath(item, "./h4")),
+                package_name=extract_text(eval_xpath(item, "./h4/a[2]")),
+                content=extract_text(eval_xpath(item, "./p")),
+                # first heading link, with a leading "~" removed if present
+                maintainer=(extract_text(eval_xpath(item, "./h4/a[1]")) or "").removeprefix("~"),
+                # tag link texts, with a leading "#" removed if present
+                tags=[
+                    tag.removeprefix("#") for tag in eval_xpath_list(item, "./div[contains(@class, 'tags')]/a/text()")
+                ],
+            )
+        )
+    return res
diff --git a/searx/settings.yml b/searx/settings.yml
index 2e418177a..a3cccb5d5 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -2677,23 +2677,10 @@ engines:
- name: sourcehut
shortcut: srht
- engine: xpath
- paging: true
- search_url: https://sr.ht/projects?page={pageno}&search={query}
- results_xpath: (//div[@class="event-list"])[1]/div[@class="event"]
- url_xpath: ./h4/a[2]/@href
- title_xpath: ./h4/a[2]
- content_xpath: ./p
- first_page_num: 1
- categories: [it, repos]
+ engine: sourcehut
+ # https://docs.searxng.org/dev/engines/online/sourcehut.html
+ # sourcehut_sort_order: longest-active
disabled: true
- about:
- website: https://sr.ht
- wikidata_id: Q78514485
- official_api_documentation: https://man.sr.ht/
- use_official_api: false
- require_api_key: false
- results: HTML
- name: goo
shortcut: goo