[fix] annas archive: engine broken due to site HTML changes

Apparently the layout of https://annas-archive.org has changed, making changes necessary. The issue has been reported in #5146, see there for more details. - closes #5146
author: Bnyro <bnyro@tutanota.com> 2025-08-28 11:06:38 +0200
committer: Markus Heiser <markus.heiser@darmarIT.de> 2025-08-28 19:24:37 +0200
commit: f971774773b49b336c932812a8565f0f0b4e6703 (patch)
tree: 365f2385c917dc4ebb734ed2383d0ea1ff1d2f2e /searx
parent: 5ca08c18134123fd5f9a457829090410221a93ce (diff)
1 files changed, 16 insertions, 16 deletions
diff --git a/searx/engines/annas_archive.py b/searx/engines/annas_archive.py
index f9b466ed4..f5312f7e5 100644
--- a/searx/engines/annas_archive.py
+++ b/searx/engines/annas_archive.py
@@ -40,6 +40,7 @@ from lxml import html
 from searx.utils import extract_text, eval_xpath, eval_xpath_getindex, eval_xpath_list
 from searx.enginelib.traits import EngineTraits
 from searx.data import ENGINE_TRAITS
+from searx.exceptions import SearxEngineXPathException
 
 # about
 about: Dict[str, Any] = {
@@ -118,30 +119,29 @@ def response(resp) -> List[Dict[str, Optional[str]]]:
     results: List[Dict[str, Optional[str]]] = []
     dom = html.fromstring(resp.text)
 
-    for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'):
-        results.append(_get_result(item))
-
-    # The rendering of the WEB page is very strange; except the first position
-    # all other positions of Anna's result page are enclosed in SGML comments.
-    # These comments are *uncommented* by some JS code, see query of class
-    # '.js-scroll-hidden' in Anna's HTML template:
+    # The rendering of the WEB page is strange; positions of Anna's result page
+    # are enclosed in SGML comments.  These comments are *uncommented* by some
+    # JS code, see query of class '.js-scroll-hidden' in Anna's HTML template:
     #   https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html
 
-    for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'):
-        item = html.fromstring(item.xpath('./comment()')[0].text)
-        results.append(_get_result(item))
-
+    for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-aarecord-list-outer")]/div'):
+        try:
+            results.append(_get_result(item))
+        except SearxEngineXPathException:
+            pass
     return results
 
 
 def _get_result(item):
     return {
         'template': 'paper.html',
-        'url': base_url + extract_text(eval_xpath_getindex(item, './@href', 0)),
-        'title': extract_text(eval_xpath(item, './/h3/text()[1]')),
-        'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')),
-        'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))],
-        'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')),
+        'url': base_url + extract_text(eval_xpath_getindex(item, './a/@href', 0)),
+        'title': extract_text(eval_xpath(item, './div//a[starts-with(@href, "/md5")]')),
+        'authors': [extract_text(eval_xpath_getindex(item, './/a[starts-with(@href, "/search")]', 0))],
+        'publisher': extract_text(
+            eval_xpath_getindex(item, './/a[starts-with(@href, "/search")]', 1, default=None), allow_none=True
+        ),
+        'content': extract_text(eval_xpath(item, './/div[contains(@class, "relative")]')),
         'thumbnail': extract_text(eval_xpath_getindex(item, './/img/@src', 0, default=None), allow_none=True),
     }
author	Bnyro <bnyro@tutanota.com>	2025-08-28 11:06:38 +0200
committer	Markus Heiser <markus.heiser@darmarIT.de>	2025-08-28 19:24:37 +0200
commit	f971774773b49b336c932812a8565f0f0b4e6703 (patch)
tree	365f2385c917dc4ebb734ed2383d0ea1ff1d2f2e /searx
parent	5ca08c18134123fd5f9a457829090410221a93ce (diff)