author    Bnyro <bnyro@tutanota.com>  2025-11-04 18:00:02 +0100
committer Bnyro <bnyro@tutanota.com>  2025-11-05 17:56:13 +0100
commit    1be19f8b5820d1c7b369f80cc48c6014a6d41085 (patch)
tree      8ccab7cf0f8710cd320309f38681f6c55bd6ce8b /searx/engines
parent    3763b4bff4b6ad6d4d84ba74539755292d33bdec (diff)
[feat] sourcehut engine: implement as custom module, fix user agent
SourceHut uses a FOSS bot protection tool called `go-away` (which I can
recommend, by the way). It blocks common crawler user agents, such as the
standard Firefox user agent. Hence, we now send our custom SearXNG user agent
to clarify that we are not a crawler.

Closes: https://github.com/searxng/searxng/issues/5270
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>
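The gist of the fix: override the outgoing User-Agent header with the
instance's own identifier instead of a browser-like string. A minimal
standalone sketch of the idea (using httpx, which SearXNG's network stack is
built on; the hard-coded agent string is a stand-in for
searx.utils.searxng_useragent(), which derives it from the running instance's
version and settings):

from urllib.parse import urlencode

import httpx

# Stand-in for searx.utils.searxng_useragent(); the real helper builds the
# string from the instance's version tag and configured suffix.
USER_AGENT = "SearXNG/1.0 (+https://github.com/searxng/searxng)"

args = {"search": "hare", "page": 1, "sort": "recently-updated"}
resp = httpx.get(
    f"https://sr.ht/projects?{urlencode(args)}",
    # 'go-away' blocks common browser/crawler agents; an honest, distinctive
    # user agent avoids the block.
    headers={"User-Agent": USER_AGENT},
)
print(resp.status_code)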
Diffstat (limited to 'searx/engines')
-rw-r--r--  searx/engines/sourcehut.py  90
1 file changed, 90 insertions(+), 0 deletions(-)
diff --git a/searx/engines/sourcehut.py b/searx/engines/sourcehut.py
new file mode 100644
index 000000000..fa3df8b23
--- /dev/null
+++ b/searx/engines/sourcehut.py
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Engine to search in the collaborative software platform SourceHut_.
+
+.. _SourceHut: https://sourcehut.org/
+
+Configuration
+=============
+
+You can configure the following setting:
+
+- :py:obj:`sourcehut_sort_order`
+
+.. code:: yaml
+
+ - name: sourcehut
+ shortcut: srht
+ engine: sourcehut
+ # sourcehut_sort_order: longest-active
+
+Implementations
+===============
+
+"""
+
+import typing as t
+
+from urllib.parse import urlencode
+from lxml import html
+
+from searx.utils import eval_xpath, eval_xpath_list, extract_text, searxng_useragent
+from searx.result_types import EngineResults
+
+if t.TYPE_CHECKING:
+ from searx.extended_types import SXNG_Response
+ from searx.search.processors import OnlineParams
+
+
+about = {
+ "website": "https://sourcehut.org",
+ "wikidata_id": "Q78514485",
+ "official_api_documentation": "https://man.sr.ht/",
+ "use_official_api": False,
+ "require_api_key": False,
+ "results": "HTML",
+}
+
+categories = ["it", "repos"]
+paging = True
+
+base_url: str = "https://sr.ht/projects"
+"""Browse public projects."""
+
+
+sourcehut_sort_order: str = "recently-updated"
+"""The sort order of the results. Possible values:
+
+- ``recently-updated``
+- ``longest-active``
+"""
+
+
+def request(query: str, params: "OnlineParams") -> None:
+
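+    # Map the query and page number onto sr.ht's 'search', 'page' and 'sort' parameters.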
+ args = {"search": query, "page": params["pageno"], "sort": sourcehut_sort_order}
+ params["url"] = f"{base_url}?{urlencode(args)}"
+
+    # Standard user agents are blocked by 'go-away', a FOSS bot detection tool.
+ params["headers"]["User-Agent"] = searxng_useragent()
+
+
+def response(resp: "SXNG_Response") -> EngineResults:
+
+ res = EngineResults()
+ doc = html.fromstring(resp.text)
+
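+    # The first event list on the page holds the search results; each child
+    # 'event' div is one project.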
+ for item in eval_xpath_list(doc, "(//div[@class='event-list'])[1]/div[contains(@class, 'event')]"):
+ res.add(
+ res.types.LegacyResult(
+ template="packages.html",
+ url=base_url + (extract_text(eval_xpath(item, "./h4/a[2]/@href")) or ""),
+ title=extract_text(eval_xpath(item, "./h4")),
+ package_name=extract_text(eval_xpath(item, "./h4/a[2]")),
+ content=extract_text(eval_xpath(item, "./p")),
+ maintainer=(extract_text(eval_xpath(item, "./h4/a[1]")) or "").removeprefix("~"),
+ tags=[
+ tag.removeprefix("#") for tag in eval_xpath_list(item, "./div[contains(@class, 'tags')]/a/text()")
+ ],
+ )
+ )
+ return res
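To sanity-check the parsing logic outside SearXNG, the same XPath expressions
can be run against a hand-written fragment. A self-contained sketch (the
markup shape is an assumption modeled on the XPaths above, not captured sr.ht
output, and plain .xpath() stands in for searx.utils' eval_xpath helpers):

from lxml import html

# Hand-written fragment mirroring the structure the engine's XPaths expect.
SAMPLE = """
<div class="event-list">
  <div class="event">
    <h4><a href="/~alice">~alice</a>/<a href="/~alice/demo">demo</a></h4>
    <p>A demo project</p>
    <div class="tags"><a>#python</a> <a>#cli</a></div>
  </div>
</div>
"""

doc = html.fromstring(SAMPLE)
for item in doc.xpath("(//div[@class='event-list'])[1]/div[contains(@class, 'event')]"):
    print("title:     ", item.xpath("./h4")[0].text_content().strip())
    print("href:      ", item.xpath("./h4/a[2]/@href")[0])
    print("maintainer:", item.xpath("./h4/a[1]")[0].text_content().removeprefix("~"))
    print("tags:      ", [a.removeprefix("#") for a in item.xpath("./div[contains(@class, 'tags')]/a/text()")])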