diff options
| author | Zhijie He <hezhijie0327@hotmail.com> | 2025-03-05 18:48:58 +0800 |
|---|---|---|
| committer | Markus Heiser <markus.heiser@darmarIT.de> | 2025-03-06 17:52:16 +0100 |
| commit | 066aabc112f7869f03966553aa048e9508f89545 (patch) | |
| tree | ccc5b14a5f7de4e0edcfc0b858c6cadf120008ab /searx/engines/acfun.py | |
| parent | 8fe49046197eb6c3b3b45c0689835ddf54303c22 (diff) | |
[feat] engines: add www.acfun.cn
Diffstat (limited to 'searx/engines/acfun.py')
| -rw-r--r-- | searx/engines/acfun.py | 108 |
1 file changed, 108 insertions, 0 deletions
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Acfun search engine for searxng"""

from urllib.parse import urlencode
import re
import json
from datetime import datetime, timedelta
from lxml import html

from searx.utils import extract_text

# Metadata
about = {
    "website": "https://www.acfun.cn/",
    "wikidata_id": "Q3077675",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# Engine Configuration
categories = ["videos"]
paging = True

# Base URL
base_url = "https://www.acfun.cn"


def request(query, params):
    """Build the search URL; acfun paginates via the ``pCursor`` parameter."""
    query_params = {"keyword": query, "pCursor": params["pageno"]}
    params["url"] = f"{base_url}/search?{urlencode(query_params)}"
    return params


def response(resp):
    """Parse search results from the response page.

    The result page streams HTML fragments as JSON arguments to
    ``bigPipe.onPageletArrive(...)`` JavaScript calls; each payload's
    ``html`` field contains rendered ``search-video`` blocks.
    """
    results = []

    matches = re.findall(r'bigPipe\.onPageletArrive\((\{.*?\})\);', resp.text, re.DOTALL)
    if not matches:
        return results

    for match in matches:
        try:
            json_data = json.loads(match)
            raw_html = json_data.get("html", "")
            if not raw_html:
                continue

            tree = html.fromstring(raw_html)

            video_blocks = tree.xpath('//div[contains(@class, "search-video")]')
            if not video_blocks:
                continue

            for video_block in video_blocks:
                video_info = extract_video_data(video_block)
                # Skip entries that could not be parsed or lack a title/URL.
                if video_info and video_info["title"] and video_info["url"]:
                    results.append(video_info)

        except json.JSONDecodeError:
            # Not every onPageletArrive payload is valid standalone JSON;
            # ignore the ones that are not.
            continue

    return results


def extract_video_data(video_block):
    """Extract one result dict from a ``search-video`` element.

    Returns ``None`` when the block cannot be parsed.
    """
    try:
        # Each result carries its metadata as a JSON blob in the
        # data-exposure-log attribute; json.loads(None) raises TypeError,
        # which the except clause below turns into a skipped result.
        data_exposure_log = video_block.get('data-exposure-log')
        video_data = json.loads(data_exposure_log)

        content_id = video_data.get("content_id", "")
        title = video_data.get("title", "")

        url = f"{base_url}/v/ac{content_id}"
        iframe_src = f"{base_url}/player/ac{content_id}"

        create_time = extract_text(video_block.xpath('.//span[contains(@class, "info__create-time")]'))
        video_duration = extract_text(video_block.xpath('.//span[contains(@class, "video__duration")]'))
        video_intro = extract_text(video_block.xpath('.//div[contains(@class, "video__main__intro")]'))

        # Guard against results without a cover image: indexing [0]
        # unconditionally raised an uncaught IndexError that aborted the
        # whole response.
        cover_srcs = video_block.xpath('.//div[contains(@class, "video__cover")]/a/img/@src')
        video_cover = cover_srcs[0] if cover_srcs else None

        published_date = None
        if create_time:
            try:
                published_date = datetime.strptime(create_time.strip(), "%Y-%m-%d")
            except (ValueError, TypeError):
                pass

        length = None
        if video_duration:
            try:
                # Accept both "MM:SS" and "H:MM:SS"; the previous
                # strptime("%M:%S") parse silently dropped durations of an
                # hour or more.
                units = [int(u) for u in video_duration.strip().split(":")]
                total_seconds = sum(u * 60 ** i for i, u in enumerate(reversed(units)))
                length = timedelta(seconds=total_seconds)
            except (ValueError, TypeError):
                pass

        return {
            "title": title,
            "url": url,
            "content": video_intro,
            "thumbnail": video_cover,
            "length": length,
            "publishedDate": published_date,
            "iframe_src": iframe_src,
        }

    except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
        return None