# SPDX-License-Identifier: AGPL-3.0-or-later
#
# Origin: commit 1be19f8b5820d1c7b369f80cc48c6014a6d41085
#   From: Bnyro
#   Date: Tue, 4 Nov 2025 18:00:02 +0100
#   Subject: [feat] sourcehut engine: implement as custom module, fix user agent
#
#   SourceHut uses a foss bot protection tool called `go-away` (which I can
#   recommend BTW).  It blocks common crawler user agents, such as the standard
#   Firefox user agent.  Hence, we're now using our custom SearXNG user agent
#   to clarify we're not a crawler.
#
#   Closes: https://github.com/searxng/searxng/issues/5270
#   Co-authored-by: Markus Heiser
"""Engine to search in the collaborative software platform SourceHut_.

.. _SourceHut: https://sourcehut.org/

Configuration
=============

You can configure the following setting:

- :py:obj:`sourcehut_sort_order`

.. code:: yaml

  - name: sourcehut
    shortcut: srht
    engine: sourcehut
    # sourcehut_sort_order: longest-active

Implementations
===============

"""

import typing as t

from urllib.parse import urlencode, urljoin
from lxml import html

from searx.utils import eval_xpath, eval_xpath_list, extract_text, searxng_useragent
from searx.result_types import EngineResults

if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams


about = {
    "website": "https://sourcehut.org",
    "wikidata_id": "Q78514485",
    "official_api_documentation": "https://man.sr.ht/",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

categories = ["it", "repos"]
paging = True

base_url: str = "https://sr.ht/projects"
"""Browse public projects."""


sourcehut_sort_order: str = "recently-updated"
"""The sort order of the results. Possible values:

- ``recently-updated``
- ``longest-active``
"""


def request(query: str, params: "OnlineParams") -> None:
    """Fill ``params`` with the URL and headers of the project search request.

    The query, the requested page number (``params["pageno"]``) and the
    configured :py:obj:`sourcehut_sort_order` are sent as URL arguments to
    :py:obj:`base_url`.
    """

    args = {"search": query, "page": params["pageno"], "sort": sourcehut_sort_order}
    params["url"] = f"{base_url}?{urlencode(args)}"

    # standard user agents are blocked by 'go-away', a foss bot detection tool
    params["headers"]["User-Agent"] = searxng_useragent()


def response(resp: "SXNG_Response") -> EngineResults:
    """Parse the HTML project listing in ``resp`` into package results.

    Every ``event`` entry of the first ``event-list`` container becomes one
    result rendered with the ``packages.html`` template.
    """

    res = EngineResults()
    doc = html.fromstring(resp.text)

    for item in eval_xpath_list(doc, "(//div[@class='event-list'])[1]/div[contains(@class, 'event')]"):
        # First link in the heading is the owner, second one is the project.
        href = extract_text(eval_xpath(item, "./h4/a[2]/@href")) or ""
        owner = extract_text(eval_xpath(item, "./h4/a[1]")) or ""

        # Tag texts may carry surrounding whitespace; strip it first so that
        # the leading '#' is reliably removed.
        tags = [
            str(tag).strip().removeprefix("#")
            for tag in eval_xpath_list(item, "./div[contains(@class, 'tags')]/a/text()")
        ]

        res.add(
            res.types.LegacyResult(
                template="packages.html",
                # Resolve the link against the listing URL the way a browser
                # does.  Plain concatenation would nest a site-absolute href
                # under "/projects" (presumably hrefs look like
                # "/~owner/project" -- TODO confirm against live markup).
                url=urljoin(base_url, href),
                title=extract_text(eval_xpath(item, "./h4")),
                package_name=extract_text(eval_xpath(item, "./h4/a[2]")),
                content=extract_text(eval_xpath(item, "./p")),
                maintainer=owner.removeprefix("~"),
                tags=tags,
            )
        )
    return res