searx/engines/gigablast.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
 Gigablast (Web)
"""
# pylint: disable=invalid-name

import re
from time import time
from json import loads
from urllib.parse import urlencode
from searx.network import get

# about
about = {
    "website": 'https://www.gigablast.com',
    "wikidata_id": 'Q3105449',
    "official_api_documentation": 'https://gigablast.com/api.html',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# engine dependent config
categories = ['general']
# gigablast's pagination is totally damaged, don't use it
paging = False
safesearch = True

# search-url
base_url = 'https://gigablast.com'
search_path = '/search?'

# ugly hack: gigablast requires a random extra parameter which can be extracted
# from the source code of the gigablast HTTP client
extra_param = ''
# timestamp of the last fetch of extra_param
extra_param_ts = 0
# after how many seconds extra_param expire
extra_param_expiration_delay = 3000


def fetch_extra_param(query_args, headers):

    # example:
    #
    # var uxrl='/search?c=main&qlangcountry=en-us&q=south&s=10&rand=1590740241635&n';
    # uxrl=uxrl+'sab=730863287';
    #
    # extra_param --> "rand=1590740241635&nsab=730863287"

    global extra_param, extra_param_ts  # pylint: disable=global-statement

    extra_param_ts = time()
    extra_param_path = search_path + urlencode(query_args)
    text = get(base_url + extra_param_path, headers=headers).text

    re_var= None
    for line in text.splitlines():
        if re_var is None and extra_param_path in line:
            var = line.split("=")[0].split()[1]  # e.g. var --> 'uxrl'
            re_var = re.compile(var + "\\s*=\\s*" + var + "\\s*\\+\\s*'" + "(.*)" + "'(.*)")
            extra_param = line.split("'")[1][len(extra_param_path):]
            continue
        if re_var is not None and re_var.search(line):
            extra_param += re_var.search(line).group(1)
            break


# do search-request
def request(query, params):  # pylint: disable=unused-argument
    query_args = dict(
        c = 'main'
        , q = query
        , dr = 1
        , showgoodimages = 0
    )

    if params['language'] and params['language'] != 'all':
        query_args['qlangcountry'] = params['language']
        query_args['qlang'] = params['language'].split('-')[0]

    if params['safesearch'] >= 1:
        query_args['ff'] = 1

    # see API http://www.gigablast.com/api.html#/search
    # Take into account, that the API has some quirks ..
    if time() > (extra_param_ts + extra_param_expiration_delay):
        fetch_extra_param(query_args, params['headers'])

    query_args['format'] = 'json'
    params['url'] = base_url + search_path + urlencode(query_args) + extra_param

    return params

# get response from search-request
def response(resp):
    results = []

    response_json = loads(resp.text)

    # logger.debug('gigablast returns %s results', len(response_json['results']))

    for result in response_json['results']:
        # see "Example JSON Output (&format=json)"
        # at http://www.gigablast.com/api.html#/search

        # sort out meaningless result

        title = result.get('title')
        if len(title) < 2:
            continue

        url = result.get('url')
        if len(url) < 9:
            continue

        content = result.get('sum')
        if len(content) < 5:
            continue

        # extend fields

        subtitle = result.get('title')
        if len(subtitle) > 3 and subtitle != title:
            title += " - " + subtitle

        results.append(dict(
            url = url
            , title = title
            , content = content
        ))

    return results