summaryrefslogtreecommitdiff
path: root/searx/engines/soundcloud.py
blob: b1bb329e1b322c52efe921592881978a4d5cdae7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# SPDX-License-Identifier: AGPL-3.0-or-later
"""SoundCloud is a German audio streaming service."""

import re
import datetime

from urllib.parse import quote_plus, urlencode

from dateutil import parser
from lxml import html

from searx.network import get as http_get
from searx.enginelib import EngineCache

# Descriptive metadata about the SoundCloud service and how this module
# accesses it (no official API, no API key, JSON results).
about = {
    "website": "https://soundcloud.com",
    "wikidata_id": "Q568769",
    "official_api_documentation": "https://developers.soundcloud.com/docs/api/guide",
    "use_official_api": False,
    "require_api_key": False,
    "results": 'JSON',
}

# Engine category and paging flag; request() derives the result offset from
# params['pageno'].
categories = ["music"]
paging = True

# Base URL to which request() appends the url-encoded search arguments.
search_url = "https://api-v2.soundcloud.com/search"
"""This is not the official (developer) url, it is the API which is used by the
HTML frontend of the common WEB site.
"""

# Matches ``client_id:"<value>"`` (case-insensitive) and captures the value;
# get_client_id() applies it to the downloaded script sources.
cid_re = re.compile(r'client_id:"([^"]*)"', re.I | re.U)
# Page size: sent as ``limit`` and used to compute ``offset`` in request().
results_per_page = 10

# Value sent as the ``facet`` argument of every search request.
soundcloud_facet = "model"

# Maps the primary language subtag of params["language"] to the value sent as
# ``app_locale``; request() falls back to "en" for languages not listed here.
app_locale_map = {
    "de": "de",
    "en": "en",
    "es": "es",
    "fr": "fr",
    "oc": "fr",
    "it": "it",
    "nl": "nl",
    "pl": "pl",
    "szl": "pl",
    "pt": "pt_BR",
    "pap": "pt_BR",
    "sv": "sv",
}

# Only declared here (no value assigned); init() binds the EngineCache
# instance before request() reads from it.
CACHE: EngineCache
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
seconds."""


def request(query, params):
    """Build the SoundCloud search URL for ``query`` and store it in ``params``.

    The guest ``client_id`` is read from ``CACHE``.  On a cache miss it is
    fetched with ``get_client_id()`` and written back to the cache only when a
    value was actually obtained, so a failed lookup is retried by the next
    request.

    :param query: search terms, sent as the ``q`` argument.
    :param params: request parameters; ``pageno`` and ``language`` are read,
        ``url`` is written.
    :return: the same ``params`` dict with ``url`` set.
    """

    # missing attributes: user_id, app_version
    # - user_id=451561-497874-703312-310156
    # - app_version=1740727428

    guest_client_id = CACHE.get("guest_client_id")
    if guest_client_id is None:
        guest_client_id = get_client_id()
        if guest_client_id:
            CACHE.set(key="guest_client_id", value=guest_client_id)

    args = {
        "q": query,
        "offset": (params['pageno'] - 1) * results_per_page,
        "limit": results_per_page,
        "facet": soundcloud_facet,
        "client_id": guest_client_id,
        "app_locale": app_locale_map.get(params["language"].split("-")[0], "en"),
    }

    if not guest_client_id:
        # urlencode() would serialize a missing id as the literal text "None";
        # leave the argument out instead of sending a bogus credential.
        del args["client_id"]

    params['url'] = f"{search_url}?{urlencode(args)}"
    return params


def response(resp):
    """Convert a SoundCloud search response into a list of result dicts.

    Only items of kind ``track`` or ``playlist`` that carry a
    ``permalink_url`` are returned.  Every other field is treated as optional,
    so a single incomplete item no longer aborts parsing of the whole page.

    :param resp: response object whose ``json()`` method returns the decoded
        payload; the items are read from its ``collection`` entry.
    :return: list of dicts with the keys ``url``, ``title``, ``content``,
        ``views``, ``thumbnail`` and ``author``, plus ``publishedDate``,
        ``iframe_src`` and ``length`` when the item provides the data for them.
    """
    results = []
    data = resp.json()

    # ``or []`` also covers an explicit JSON null for "collection"
    for result in data.get("collection") or []:

        if result.get("kind") not in ("track", "playlist"):
            continue

        url = result.get("permalink_url")
        if not url:
            continue

        # "user" may be absent or null
        user = result.get("user") or {}
        content = [
            result.get("description"),
            result.get("label_name"),
        ]
        res = {
            "url": url,
            "title": result.get("title") or "",
            "content": " / ".join(c for c in content if c),
            "views": result.get("playback_count") or None,
            "thumbnail": result.get("artwork_url") or user.get("avatar_url") or None,
            "author": user.get("full_name") or None,
        }

        last_modified = result.get("last_modified")
        if last_modified:
            try:
                res["publishedDate"] = parser.parse(last_modified)
            except (ValueError, OverflowError):
                # the date is optional: keep the result, just without a date
                pass

        uri = result.get("uri")
        if uri:
            # the embedded player needs the resource URI; without it there is
            # nothing to embed
            res["iframe_src"] = "https://w.soundcloud.com/player/?url=" + quote_plus(uri)

        # duration is divided by 1000 to get whole seconds
        length = int((result.get("duration") or 0) / 1000)
        if length:
            res["length"] = datetime.timedelta(seconds=length)

        results.append(res)

    return results


def init(engine_settings):
    """Create the engine's cache, named after the ``name`` entry of
    ``engine_settings``, and bind it to the module-level ``CACHE``."""
    global CACHE  # pylint: disable=global-statement
    engine_name = engine_settings["name"]
    CACHE = EngineCache(engine_name)  # type:ignore


def get_client_id() -> str | None:
    """Scrape a guest ``client_id`` from the SoundCloud web site.

    The start page is fetched and every ``<script>`` whose ``src`` contains
    ``/assets/`` is collected.  These scripts are downloaded last-to-first and
    searched with ``cid_re``; the first non-empty capture is returned.

    :return: the client id, or ``None`` when the start page could not be
        fetched or none of the scripts contained an id.
    """

    start_url = "https://soundcloud.com"
    resp = http_get(start_url, timeout=10)

    if not resp.ok:
        logger.error("init: GET %s failed", start_url)
        # None rather than "": matches the annotation and the value returned
        # when no script contains an id
        return None

    tree = html.fromstring(resp.content)
    script_tags = tree.xpath("//script[contains(@src, '/assets/')]")
    app_js_urls = [tag.get("src") for tag in script_tags if tag is not None]

    client_id = None

    # walk the script urls in reverse document order
    for app_js_url in reversed(app_js_urls):

        # gets app_js and search for the client_id
        resp = http_get(app_js_url)

        if not resp.ok:
            logger.error("init: app_js GET %s failed", app_js_url)
            continue

        match = cid_re.search(resp.content.decode())
        # an empty capture is useless as a credential: keep looking
        if match and match.group(1):
            client_id = match.group(1)
            break

    if client_id:
        logger.info("using client_id '%s' for soundclud queries", client_id)
    else:
        logger.warning("missing valid client_id for soundclud queries")
    return client_id or None