summaryrefslogtreecommitdiff
path: root/searx/engines/github_code.py
blob: 4bafe9c0dc95bbcba4a36c121f6a1b04eefd2e98 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# SPDX-License-Identifier: AGPL-3.0-or-later
"""GitHub code search with `search syntax`_ as described in `Constructing a
search query`_ in the documentation of GitHub's REST API.

.. _search syntax:
    https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax
.. _Constructing a search query:
    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#constructing-a-search-query
.. _Github REST API for code search:
    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code
.. _Github REST API auth for code search:
    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens

Configuration
=============

The engine has the following mandatory setting:

- :py:obj:`ghc_auth`
  Change the authentication method used when using the API, defaults to none.

Optional settings are:

- :py:obj:`ghc_highlight_matching_lines`
   Control the highlighting of the matched text (turns off/on).
- :py:obj:`ghc_strip_new_lines`
   Strip new lines at the start or end of each code fragment.
- :py:obj:`ghc_strip_whitespace`
   Strip any whitespace at the start or end of each code fragment.
- :py:obj:`ghc_insert_block_separator`
   Add a `...` between each code fragment before merging them.

.. code:: yaml

  - name: github code
    engine: github_code
    shortcut: ghc
    ghc_auth:
      type: "none"

  - name: github code
    engine: github_code
    shortcut: ghc
    ghc_auth:
      type: "personal_access_token"
      token: "<token>"
    ghc_highlight_matching_lines: true
    ghc_strip_whitespace: true
    ghc_strip_new_lines: true


  - name: github code
    engine: github_code
    shortcut: ghc
    ghc_auth:
      type: "bearer"
      token: "<token>"

Implementation
===============

GitHub does not return the code line indices alongside the code fragment in the
search API. Since these are not super important for the user experience all the
code lines are just relabeled (starting from 1) and appended (a disjoint set of
code blocks in a single file might be returned from the API).
"""

from __future__ import annotations

import typing as t
from urllib.parse import urlencode, urlparse

from pygments.lexers import guess_lexer_for_filename
from pygments.util import ClassNotFound
from searx.result_types import EngineResults
from searx.extended_types import SXNG_Response
from searx.network import raise_for_httperror

# engine metadata (displayed in the UI / used by the documentation)
about = {
    "website": 'https://github.com/',
    "wikidata_id": 'Q364',
    "official_api_documentation": 'https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# engine dependent config
categories = ['code']


# {query} and {page} are filled with url-encoded values in request()
search_url = 'https://api.github.com/search/code?sort=indexed&{query}&{page}'
# ask the API to include text-match metadata (character ranges of the hits):
# https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#text-match-metadata
accept_header = 'application/vnd.github.text-match+json'
paging = True

# authentication method; accepted "type" values are documented below
ghc_auth = {
    "type": "none",
    "token": "",
}
"""Change the method of authenticating to the github API.

``type`` needs to be one of ``none``, ``personal_access_token``, or ``bearer``.
When type is not `none` a token is expected to be passed as well in
``auth.token``.

If there are any privacy concerns about generating a token, one can use the API
without authentication.  The calls will be heavily rate limited; this is what the
API returns on such calls::

    API rate limit exceeded for <redacted ip>.
    (But here's the good news: Authenticated requests get a higher rate limit)

The personal access token or a bearer for an org or a group can be generated in
the `GitHub settings`_.

.. _GitHub settings:
   https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens
"""

# -- user visible tuning options ---------------------------------------------

ghc_highlight_matching_lines = True
"""Highlight the matching code lines."""

ghc_strip_new_lines = True
"""Strip leading and trailing newlines for each returned fragment.
Single file might return multiple code fragments.
"""

ghc_strip_whitespace = False
"""Strip all leading and trailing whitespace for each returned fragment.
Single file might return multiple code fragments. Enabling this might break
code indentation.
"""

# sent to the API in the X-GitHub-Api-Version header by request()
ghc_api_version = "2022-11-28"
"""The version of the GitHub REST API.
"""

ghc_insert_block_separator = False
"""Each file possibly consists of more than one code block that matches the
search, if this is set to true, the blocks will be separated with ``...`` line.
This might break the lexer and thus result in the lack of code highlighting.
"""


def request(query: str, params: dict[str, t.Any]) -> None:
    """Assemble the GitHub code-search request: fill in the search URL and
    the API / authorization headers."""
    headers = params['headers']

    params['url'] = search_url.format(query=urlencode({'q': query}), page=urlencode({'page': params['pageno']}))
    headers['Accept'] = accept_header
    headers['X-GitHub-Api-Version'] = ghc_api_version

    auth_type = ghc_auth['type']
    if auth_type == "none":
        # Without the auth header the query fails, so add a dummy instead.
        # Queries without auth are heavily rate limited.
        headers['Authorization'] = "placeholder"
    elif auth_type == "personal_access_token":
        headers['Authorization'] = f"token {ghc_auth['token']}"
    elif auth_type == "bearer":
        headers['Authorization'] = f"Bearer {ghc_auth['token']}"

    # HTTP errors are handled in response() (422 is an expected status)
    params['raise_for_httperror'] = False


def get_code_language_name(filename: str, code_snippet: str) -> str | None:
    """Best-effort language detection for syntax highlighting.

    Ask pygments for a lexer matching *filename* (falling back to scanning
    *code_snippet*) and return its primary alias.  On any lookup failure
    ``None`` is returned, which disables syntax highlighting.
    """
    try:
        lexer = guess_lexer_for_filename(filename, _text=code_snippet)
        aliases = lexer.aliases if lexer is not None else []
        return aliases[0] if aliases else None
    except ClassNotFound:
        return None


def extract_code(code_matches: list[dict[str, t.Any]]) -> tuple[list[str], set[int]]:
    """Build the list of code lines and the set of line numbers to highlight.

    Iterate over multiple possible matches; for each, extract a code fragment.
    GitHub additionally sends context for *word* highlights; pygments supports
    highlighting whole lines, so we compute which lines to highlight while
    traversing the text.

    :param code_matches: ``text_matches`` items of the API response, each
        carrying a ``fragment`` (the code excerpt) and ``matches`` (character
        index ranges of the search hits inside that fragment).
    :return: tuple ``(lines, highlighted_lines_index)`` where ``lines`` are the
        code lines of all fragments (relabeled, starting from 1) and
        ``highlighted_lines_index`` holds the 1-based indices of lines that
        contain a search hit.
    """
    lines: list[str] = []
    highlighted_lines_index: set[int] = set()

    # NOTE: the loop indices were renamed (block_no / pos) -- the original
    # code reused ``i`` for both loops, shadowing the outer index.
    for block_no, match in enumerate(code_matches):
        if block_no > 0 and ghc_insert_block_separator:
            lines.append("...")
        buffer: list[str] = []
        highlight_groups = [highlight_group['indices'] for highlight_group in match['matches']]

        code: str = match['fragment']
        original_code_length = len(code)

        if ghc_strip_whitespace:
            code = code.lstrip()
        if ghc_strip_new_lines:
            code = code.lstrip("\n")

        # the highlight indices refer to the unstripped fragment, remember how
        # many leading characters were removed to translate them below
        offset = original_code_length - len(code)

        if ghc_strip_whitespace:
            code = code.rstrip()
        if ghc_strip_new_lines:
            code = code.rstrip("\n")

        for pos, letter in enumerate(code):
            if len(highlight_groups) > 0:
                # the API ensures these are sorted already, and we have a
                # guaranteed match in the code (all indices are in the range 0
                # and len(fragment)), so only check the first highlight group
                [after, before] = highlight_groups[0]
                if after <= (pos + offset) < before:
                    # pygments enumerates lines from 1, highlight the next line
                    highlighted_lines_index.add(len(lines) + 1)
                    highlight_groups.pop(0)

            if letter == "\n":
                lines.append("".join(buffer))
                buffer = []
                continue

            buffer.append(letter)
        lines.append("".join(buffer))
    return lines, highlighted_lines_index


def response(resp: SXNG_Response) -> EngineResults:
    """Parse the code-search API answer into engine results (``code.html``
    template), one result per matched file."""
    results = EngineResults()

    # on a invalid search term the status code 422 "Unprocessable Content"
    # is returned / e.g. search term is "user: foo" instead "user:foo"
    if resp.status_code == 422:
        return results
    # raise for other errors
    raise_for_httperror(resp)

    for item in resp.json().get('items', []):
        repo = item['repository']
        # ensure picking only the code contents in the blob
        code_matches = [
            text_match
            for text_match in item['text_matches']
            if text_match["object_type"] == "FileContent" and text_match["property"] == "content"
        ]
        lines, highlighted_lines_index = extract_code(code_matches)
        if not ghc_highlight_matching_lines:
            highlighted_lines_index = set()

        code_snippet = "\n".join(lines)

        results.add(
            results.types.LegacyResult(
                template='code.html',
                url=item['html_url'],
                title=f"{repo['full_name']} · {item['path']}",
                content=repo['description'],
                repository=repo['html_url'],
                codelines=[(line_no, line) for line_no, line in enumerate(lines, start=1)],
                hl_lines=highlighted_lines_index,
                code_language=get_code_language_name(filename=item['name'], code_snippet=code_snippet),
                # important to set for highlighting
                strip_whitespace=ghc_strip_whitespace,
                strip_new_lines=ghc_strip_new_lines,
                parsed_url=urlparse(item['html_url']),
            )
        )

    return results