summary | refs | log | tree | commit | diff
path: root/searx/engines/soundcloud.py
diff options
context:
space:
mode:
author: Markus Heiser <markus.heiser@darmarit.de> 2025-03-01 15:34:38 +0100
committer: Markus Heiser <markus.heiser@darmarIT.de> 2025-03-01 17:51:14 +0100
commit: d0022d86d298150d40c59e18da4701a81c8610c7 (patch)
tree: 72cb85581b5873d57af00b98fc4e6c3e03c80c6b /searx/engines/soundcloud.py
parent: 1d16b94279b252bb2e298a2afffb9561d8b5bd85 (diff)
[refactor] soundcloud engine
Closes: https://github.com/searxng/searxng/issues/4226 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines/soundcloud.py')
-rw-r--r--searx/engines/soundcloud.py180
1 files changed, 112 insertions, 68 deletions
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index 3281ea398..3fee87b27 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -1,102 +1,146 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Soundcloud (Music)
-"""
+"""SoundCloud is a German audio streaming service."""
import re
-from json import loads
from urllib.parse import quote_plus, urlencode
-from lxml import html
+import datetime
+
from dateutil import parser
+from lxml import html
+
from searx.network import get as http_get
-# about
about = {
- "website": 'https://soundcloud.com',
- "wikidata_id": 'Q568769',
- "official_api_documentation": 'https://developers.soundcloud.com/',
- "use_official_api": True,
+ "website": "https://soundcloud.com",
+ "wikidata_id": "Q568769",
+ "official_api_documentation": "https://developers.soundcloud.com/docs/api/guide",
+ "use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
-# engine dependent config
-categories = ['music']
+categories = ["music"]
paging = True
-# search-url
-# missing attribute: user_id, app_version, app_locale
-url = 'https://api-v2.soundcloud.com/'
-search_url = (
- url + 'search?{query}'
- '&variant_ids='
- '&facet=model'
- '&limit=20'
- '&offset={offset}'
- '&linked_partitioning=1'
- '&client_id={client_id}'
-) # noqa
+search_url = "https://api-v2.soundcloud.com/search"
+"""This is not the official (developer) URL; it is the API which is used by the
+HTML frontend of the regular website.
+"""
cid_re = re.compile(r'client_id:"([^"]*)"', re.I | re.U)
-guest_client_id = ''
+guest_client_id = ""
+results_per_page = 10
+
+soundcloud_facet = "model"
+
+app_locale_map = {
+ "de": "de",
+ "en": "en",
+ "es": "es",
+ "fr": "fr",
+ "oc": "fr",
+ "it": "it",
+ "nl": "nl",
+ "pl": "pl",
+ "szl": "pl",
+ "pt": "pt_BR",
+ "pap": "pt_BR",
+ "sv": "sv",
+}
-def get_client_id():
- resp = http_get("https://soundcloud.com")
+def request(query, params):
+
+ # missing attributes: user_id, app_version
+ # - user_id=451561-497874-703312-310156
+ # - app_version=1740727428
- if resp.ok:
- tree = html.fromstring(resp.content)
- # script_tags has been moved from /assets/app/ to /assets/ path. I
- # found client_id in https://a-v2.sndcdn.com/assets/49-a0c01933-3.js
- script_tags = tree.xpath("//script[contains(@src, '/assets/')]")
- app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None]
+ args = {
+ "q": query,
+ "offset": (params['pageno'] - 1) * results_per_page,
+ "limit": results_per_page,
+ "facet": soundcloud_facet,
+ "client_id": guest_client_id,
+ "app_locale": app_locale_map.get(params["language"].split("-")[0], "en"),
+ }
+
+ params['url'] = f"{search_url}?{urlencode(args)}"
+ return params
+
+
+def response(resp):
+ results = []
+ data = resp.json()
+
+ for result in data.get("collection", []):
+
+ if result["kind"] in ("track", "playlist"):
+ url = result.get("permalink_url")
+ if not url:
+ continue
+ uri = quote_plus(result.get("uri"))
+ content = [
+ result.get("description"),
+ result.get("label_name"),
+ ]
+ res = {
+ "url": url,
+ "title": result["title"],
+ "content": " / ".join([c for c in content if c]),
+ "publishedDate": parser.parse(result["last_modified"]),
+ "iframe_src": "https://w.soundcloud.com/player/?url=" + uri,
+ "views": result.get("likes_count"),
+ }
+ thumbnail = result["artwork_url"] or result["user"]["avatar_url"]
+ res["thumbnail"] = thumbnail or None
+ length = int(result.get("duration", 0) / 1000)
+ if length:
+ length = datetime.timedelta(seconds=length)
+ res["length"] = length
+ res["views"] = result.get("playback_count", 0) or None
+ res["author"] = result.get("user", {}).get("full_name") or None
+ results.append(res)
- # extracts valid app_js urls from soundcloud.com content
- for app_js_url in app_js_urls[::-1]:
- # gets app_js and searches for the clientid
- resp = http_get(app_js_url)
- if resp.ok:
- cids = cid_re.search(resp.content.decode())
- if cids is not None and len(cids.groups()):
- return cids.groups()[0]
- logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
- return ""
+ return results
def init(engine_settings=None): # pylint: disable=unused-argument
global guest_client_id # pylint: disable=global-statement
- # api-key
guest_client_id = get_client_id()
-# do search-request
-def request(query, params):
- offset = (params['pageno'] - 1) * 20
+def get_client_id() -> str:
- params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset, client_id=guest_client_id)
+ client_id = ""
+ url = "https://soundcloud.com"
+ resp = http_get(url, timeout=10)
- return params
+ if not resp.ok:
+ logger.error("init: GET %s failed", url)
+ return client_id
+ tree = html.fromstring(resp.content)
+ script_tags = tree.xpath("//script[contains(@src, '/assets/')]")
+ app_js_urls = [tag.get("src") for tag in script_tags if tag is not None]
-def response(resp):
- results = []
- search_res = loads(resp.text)
+ # extracts valid app_js urls from soundcloud.com content
- # parse results
- for result in search_res.get('collection', []):
+ for url in app_js_urls[::-1]:
- if result['kind'] in ('track', 'playlist'):
- uri = quote_plus(result['uri'])
- res = {
- 'url': result['permalink_url'],
- 'title': result['title'],
- 'content': result['description'] or '',
- 'publishedDate': parser.parse(result['last_modified']),
- 'iframe_src': "https://w.soundcloud.com/player/?url=" + uri,
- }
- thumbnail = result['artwork_url'] or result['user']['avatar_url']
- if thumbnail:
- res['thumbnail'] = thumbnail
- results.append(res)
+ # gets app_js and search for the client_id
+ resp = http_get(url)
- return results
+ if not resp.ok:
+ logger.error("init: app_js GET %s failed", url)
+ continue
+
+ cids = cid_re.search(resp.content.decode())
+ if cids and len(cids.groups()):
+ client_id = cids.groups()[0]
+ break
+
+ if client_id:
+ logger.info("using client_id '%s' for soundcloud queries", client_id)
+ else:
+ logger.warning("missing valid client_id for soundcloud queries")
+ return client_id