diff options
| author | Zhijie He <hezhijie0327@hotmail.com> | 2025-03-05 18:48:58 +0800 |
|---|---|---|
| committer | Markus Heiser <markus.heiser@darmarIT.de> | 2025-03-06 17:52:16 +0100 |
| commit | 066aabc112f7869f03966553aa048e9508f89545 (patch) | |
| tree | ccc5b14a5f7de4e0edcfc0b858c6cadf120008ab /searx/engines/acfun.py | |
| parent | 8fe49046197eb6c3b3b45c0689835ddf54303c22 (diff) | |
[feat] engines: add www.acfun.cn
Diffstat (limited to 'searx/engines/acfun.py')
| -rw-r--r-- | searx/engines/acfun.py | 108 |
1 file changed, 108 insertions, 0 deletions
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Acfun search engine for searxng"""

from urllib.parse import urlencode
import re
import json
from datetime import datetime, timedelta
from lxml import html

from searx.utils import extract_text

# Metadata
about = {
    "website": "https://www.acfun.cn/",
    "wikidata_id": "Q3077675",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# Engine Configuration
categories = ["videos"]
paging = True

# Base URL
base_url = "https://www.acfun.cn"


def request(query, params):
    """Build the search URL; acfun paginates via the ``pCursor`` parameter."""
    query_params = {"keyword": query, "pCursor": params["pageno"]}
    params["url"] = f"{base_url}/search?{urlencode(query_params)}"
    return params


def response(resp):
    """Parse search results from the response page.

    The result page streams HTML fragments as JSON arguments to
    ``bigPipe.onPageletArrive(...)`` JavaScript calls; each payload's
    ``html`` field contains rendered ``search-video`` blocks.
    """
    results = []

    matches = re.findall(r'bigPipe\.onPageletArrive\((\{.*?\})\);', resp.text, re.DOTALL)
    if not matches:
        return results

    for match in matches:
        try:
            json_data = json.loads(match)
            raw_html = json_data.get("html", "")
            if not raw_html:
                continue

            tree = html.fromstring(raw_html)

            video_blocks = tree.xpath('//div[contains(@class, "search-video")]')
            if not video_blocks:
                continue

            for video_block in video_blocks:
                video_info = extract_video_data(video_block)
                # Skip entries that could not be parsed or lack a title/URL.
                if video_info and video_info["title"] and video_info["url"]:
                    results.append(video_info)

        except json.JSONDecodeError:
            # Not every onPageletArrive payload is valid standalone JSON;
            # ignore the ones that are not.
            continue

    return results


def extract_video_data(video_block):
    """Extract one result dict from a ``search-video`` element.

    Returns ``None`` when the block cannot be parsed.
    """
    try:
        # Each result carries its metadata as a JSON blob in the
        # data-exposure-log attribute; json.loads(None) raises TypeError,
        # which the except clause below turns into a skipped result.
        data_exposure_log = video_block.get('data-exposure-log')
        video_data = json.loads(data_exposure_log)

        content_id = video_data.get("content_id", "")
        title = video_data.get("title", "")

        url = f"{base_url}/v/ac{content_id}"
        iframe_src = f"{base_url}/player/ac{content_id}"

        create_time = extract_text(video_block.xpath('.//span[contains(@class, "info__create-time")]'))
        video_duration = extract_text(video_block.xpath('.//span[contains(@class, "video__duration")]'))
        video_intro = extract_text(video_block.xpath('.//div[contains(@class, "video__main__intro")]'))

        # Guard against results without a cover image: indexing [0]
        # unconditionally raised an uncaught IndexError that aborted the
        # whole response.
        cover_srcs = video_block.xpath('.//div[contains(@class, "video__cover")]/a/img/@src')
        video_cover = cover_srcs[0] if cover_srcs else None

        published_date = None
        if create_time:
            try:
                published_date = datetime.strptime(create_time.strip(), "%Y-%m-%d")
            except (ValueError, TypeError):
                pass

        length = None
        if video_duration:
            try:
                # Accept both "MM:SS" and "H:MM:SS"; the previous
                # strptime("%M:%S") parse silently dropped durations of an
                # hour or more.
                units = [int(u) for u in video_duration.strip().split(":")]
                total_seconds = sum(u * 60 ** i for i, u in enumerate(reversed(units)))
                length = timedelta(seconds=total_seconds)
            except (ValueError, TypeError):
                pass

        return {
            "title": title,
            "url": url,
            "content": video_intro,
            "thumbnail": video_cover,
            "length": length,
            "publishedDate": published_date,
            "iframe_src": iframe_src,
        }

    except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
        return None