searx/engines/dictzone.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
 Dictzone
"""

import urllib.parse
from lxml import html

from searx.utils import eval_xpath, extract_text
from searx.result_types import EngineResults
from searx.network import get as http_get  # https://github.com/searxng/searxng/issues/762

# Engine metadata: dictzone is scraped as HTML, no official API or key is used.
about = {
    "website": "https://dictzone.com/",
    "wikidata_id": None,
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# Engine settings.
# NOTE(review): `engine_type`, `categories` and `weight` are presumably read by
# the searx engine loader -- their exact semantics are not visible in this file.
engine_type = "online_dictionary"
categories = ["general", "translate"]
# Root of every URL built by request() and response().
base_url = "https://dictzone.com"
weight = 100


def request(query, params):  # pylint: disable=unused-argument
    """Store the dictzone dictionary URL for this search in ``params["url"]``.

    The ``query`` argument is not used; the search term is read from
    ``params["query"]``.  The language names are taken from index 2 of
    ``params["from_lang"]`` and ``params["to_lang"]``.

    Returns the (mutated) ``params`` mapping.
    """
    source_name = params["from_lang"][2]  # e.g. "english"
    target_name = params["to_lang"][2]  # e.g. "german"
    encoded_term = urllib.parse.quote_plus(params["query"])

    dictionary = f"{source_name}-{target_name}-dictionary"
    params["url"] = f"{base_url}/{dictionary}/{encoded_term}"
    return params


def _clean_up_node(node):
    for x in ["./i", "./span", "./button"]:
        for n in node.xpath(x):
            n.getparent().remove(n)


def response(resp) -> EngineResults:
    """Turn a dictzone result page into a single ``Translations`` result.

    Every two-cell row of the table with id ``r`` becomes one
    ``Translations.Item``: the first cell plus the first ``<p>`` of the second
    cell form the item text, the remaining ``<p>`` elements are its synonyms.
    An additional request to dictzone's ``/trans/`` URL is made; when it
    returns a non-empty body, that text is inserted as the first item.

    If ``resp.ok`` is false, an empty result list is returned and the
    additional request is not made.  A ``Translations`` result is only added
    when at least one item was collected.
    """
    results = EngineResults()

    if not resp.ok:
        return results

    item_list = []
    dom = html.fromstring(resp.text)

    for row in eval_xpath(dom, ".//table[@id='r']//tr"):

        # each row is a Translations.Item

        td_list = row.xpath("./td")
        if len(td_list) != 2:
            # ignore header rows "tr/th"
            continue

        col_from, col_to = td_list
        _clean_up_node(col_from)

        text = f"{extract_text(col_from)}"

        synonyms = []

        for i, p_item in enumerate(col_to.xpath(".//p")):

            # the sample has to be read first: the clean-up below removes the
            # direct <i> children, including the one holding the sample
            smpl: str = extract_text(p_item.xpath("./i[@class='smpl']"))  # type: ignore
            _clean_up_node(p_item)
            p_text: str = extract_text(p_item)  # type: ignore

            if smpl:
                p_text += " // " + smpl

            if i == 0:
                # the first paragraph belongs to the item text itself
                text += f" : {p_text}"
                continue

            synonyms.append(p_text)

        item = results.types.Translations.Item(text=text, synonyms=synonyms)
        item_list.append(item)

    # the "autotranslate" of dictzone is loaded by the JS from URL:
    #  https://dictzone.com/trans/hello%20world/en_de

    from_lang = resp.search_params["from_lang"][1]  # "en"
    to_lang = resp.search_params["to_lang"][1]  # "de"
    # The query is a single path segment: percent-encode everything, including
    # "/", so that characters such as "/", "?" or "#" cannot alter the URL.
    query = urllib.parse.quote(resp.search_params["query"], safe="")

    # works only sometimes?
    # NOTE(review): an exception raised by http_get (e.g. on timeout) propagates
    # from here and discards the rows parsed above -- confirm this is acceptable.
    autotranslate = http_get(f"{base_url}/trans/{query}/{from_lang}_{to_lang}", timeout=1.0)
    if autotranslate.ok and autotranslate.text:
        item_list.insert(0, results.types.Translations.Item(text=autotranslate.text))

    if item_list:
        results.add(results.types.Translations(translations=item_list, url=resp.search_params["url"]))
    return results