diff options
Diffstat (limited to 'searx/engines/github_code.py')
| -rw-r--r-- | searx/engines/github_code.py | 272 |
1 files changed, 272 insertions, 0 deletions
# SPDX-License-Identifier: AGPL-3.0-or-later
"""GitHub code search with `search syntax`_ as described in `Constructing a
search query`_ in the documentation of GitHub's REST API.

.. _search syntax:
    https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax
.. _Constructing a search query:
    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#constructing-a-search-query
.. _Github REST API for code search:
    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code
.. _Github REST API auth for code search:
    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens

Configuration
=============

The engine has the following mandatory setting:

- :py:obj:`ghc_auth`
  Change the authentication method used when using the API, defaults to none.

Optional settings are:

- :py:obj:`ghc_highlight_matching_lines`
  Control the highlighting of the matched text (turns off/on).
- :py:obj:`ghc_strip_new_lines`
  Strip new lines at the start or end of each code fragment.
- :py:obj:`ghc_strip_whitespace`
  Strip any whitespace at the start or end of each code fragment.
- :py:obj:`ghc_insert_block_separator`
  Add a `...` between each code fragment before merging them.

.. code:: yaml

   - name: github code
     engine: github_code
     shortcut: ghc
     ghc_auth:
       type: "none"

   - name: github code
     engine: github_code
     shortcut: ghc
     ghc_auth:
       type: "personal_access_token"
       token: "<token>"
     ghc_highlight_matching_lines: true
     ghc_strip_whitespace: true
     ghc_strip_new_lines: true


   - name: github code
     engine: github_code
     shortcut: ghc
     ghc_auth:
       type: "bearer"
       token: "<token>"

Implementation
===============

GitHub does not return the code line indices alongside the code fragment in the
search API. Since these are not super important for the user experience all the
code lines are just relabeled (starting from 1) and appended (a disjoint set of
code blocks in a single file might be returned from the API).
"""

from __future__ import annotations

import typing as t
from urllib.parse import urlencode, urlparse

from pygments.lexers import guess_lexer_for_filename
from pygments.util import ClassNotFound
from searx.result_types import EngineResults
from searx.extended_types import SXNG_Response
from searx.network import raise_for_httperror

# about
about = {
    "website": 'https://github.com/',
    "wikidata_id": 'Q364',
    "official_api_documentation": 'https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# engine dependent config
categories = ['code']


search_url = 'https://api.github.com/search/code?sort=indexed&{query}&{page}'
# https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#text-match-metadata
accept_header = 'application/vnd.github.text-match+json'
paging = True

ghc_auth = {
    "type": "none",
    "token": "",
}
"""Change the method of authenticating to the github API.

``type`` needs to be one of ``none``, ``personal_access_token``, or ``bearer``.
When type is not ``none`` a token is expected to be passed as well in
``ghc_auth.token``.

If there are any privacy concerns about generating a token, one can use the API
without authentication. The calls will be heavily rate limited, this is what the
API returns on such calls::

    API rate limit exceeded for <redacted ip>.
    (But here's the good news: Authenticated requests get a higher rate limit)

The personal access token or a bearer for an org or a group can be generated in
the `GitHub settings`_.

.. _GitHub settings:
    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens
"""

ghc_highlight_matching_lines = True
"""Highlight the matching code lines."""

ghc_strip_new_lines = True
"""Strip leading and trailing newlines for each returned fragment.
Single file might return multiple code fragments.
"""

ghc_strip_whitespace = False
"""Strip all leading and trailing whitespace for each returned fragment.
Single file might return multiple code fragments. Enabling this might break
code indentation.
"""

ghc_api_version = "2022-11-28"
"""The version of the GitHub REST API.
"""

ghc_insert_block_separator = False
"""Each file possibly consists of more than one code block that matches the
search, if this is set to true, the blocks will be separated with ``...`` line.
This might break the lexer and thus result in the lack of code highlighting.
"""


def request(query: str, params: dict[str, t.Any]) -> None:
    """Fill ``params`` with the URL and headers of a GitHub code search.

    ``params`` is modified in place: ``url`` is built from :py:obj:`search_url`
    with the URL-encoded ``query`` and ``params['pageno']``, and the ``Accept``,
    ``X-GitHub-Api-Version`` and (depending on :py:obj:`ghc_auth`)
    ``Authorization`` headers are set.
    """

    params['url'] = search_url.format(query=urlencode({'q': query}), page=urlencode({'page': params['pageno']}))

    headers = params['headers']
    headers['Accept'] = accept_header
    headers['X-GitHub-Api-Version'] = ghc_api_version

    # The auth types are mutually exclusive; an unknown type sets no
    # ``Authorization`` header at all.
    auth_type = ghc_auth['type']
    if auth_type == "none":
        # Without the auth header the query fails, so add a dummy instead.
        # Queries without auth are heavily rate limited.
        headers['Authorization'] = "placeholder"
    elif auth_type == "personal_access_token":
        headers['Authorization'] = f"token {ghc_auth['token']}"
    elif auth_type == "bearer":
        headers['Authorization'] = f"Bearer {ghc_auth['token']}"

    # HTTP errors are handled in response(): status 422 is an expected answer
    # for an invalid search term and must not raise.
    params['raise_for_httperror'] = False


def get_code_language_name(filename: str, code_snippet: str) -> str | None:
    """Returns a code language name by pulling information from the filename if
    possible otherwise by scanning the passed code snippet. In case there is any
    parsing error just default to no syntax highlighting.

    The returned name is the first alias of the pygments lexer; ``None`` is
    returned when no lexer is found or the lexer has no alias."""
    try:
        lexer = guess_lexer_for_filename(filename, code_snippet)
    except ClassNotFound:
        return None

    if lexer is None or not lexer.aliases:
        return None
    return lexer.aliases[0]


def extract_code(code_matches: list[dict[str, t.Any]]) -> tuple[list[str], set[int]]:
    """
    Iterate over multiple possible matches, for each extract a code fragment.
    GitHub additionally sends context for _word_ highlights; pygments supports
    highlighting lines, as such we calculate which lines to highlight while
    traversing the text.

    Returns a tuple ``(lines, highlighted_lines_index)``: ``lines`` are the code
    lines of all fragments in order (with a ``...`` line between fragments when
    :py:obj:`ghc_insert_block_separator` is set) and
    ``highlighted_lines_index`` holds the 1-based numbers of the lines (in
    ``lines``) on which a highlight starts.
    """
    lines: list[str] = []
    highlighted_lines_index: set[int] = set()

    for match_no, match in enumerate(code_matches):
        if match_no > 0 and ghc_insert_block_separator:
            lines.append("...")

        fragment: str = match['fragment']

        code = fragment
        if ghc_strip_whitespace:
            code = code.lstrip()
        if ghc_strip_new_lines:
            code = code.lstrip("\n")

        # The highlight indices refer to the unstripped fragment; ``offset`` is
        # the number of leading characters that have been removed.
        offset = len(fragment) - len(code)

        if ghc_strip_whitespace:
            code = code.rstrip()
        if ghc_strip_new_lines:
            code = code.rstrip("\n")

        # pygments enumerates lines from 1
        first_line_no = len(lines) + 1

        for highlight_group in match.get('matches') or []:
            start, end = highlight_group['indices']
            # Clamp the highlight to the part of the fragment that is left
            # after stripping. A highlight that is empty or lies completely in
            # a stripped part is skipped, it must not block later highlights.
            visible_start = max(start - offset, 0)
            visible_end = min(end - offset, len(code))
            if visible_start >= visible_end:
                continue
            # The highlight starts on the line that follows all the newlines
            # in front of its first visible character.
            highlighted_lines_index.add(first_line_no + code.count("\n", 0, visible_start))

        lines.extend(code.split("\n"))

    return lines, highlighted_lines_index


def response(resp: SXNG_Response) -> EngineResults:
    """Build the results (template ``code.html``) from GitHub's JSON response.

    An empty result list is returned for status code 422, any other HTTP error
    is raised by ``raise_for_httperror``.
    """
    results = EngineResults()

    if resp.status_code == 422:
        # on a invalid search term the status code 422 "Unprocessable Content"
        # is returned / e.g. search term is "user: foo" instead "user:foo"
        return results
    # raise for other errors
    raise_for_httperror(resp)

    for item in resp.json().get('items', []):
        repo = item['repository']
        # ensure picking only the code contents in the blob
        code_matches = [
            match
            for match in item.get('text_matches', [])
            if match["object_type"] == "FileContent" and match["property"] == "content"
        ]
        lines, highlighted_lines_index = extract_code(code_matches)
        if not ghc_highlight_matching_lines:
            highlighted_lines_index = set()

        code_snippet = "\n".join(lines)

        kwargs: dict[str, t.Any] = {
            'template': 'code.html',
            'url': item['html_url'],
            'title': f"{repo['full_name']} · {item['path']}",
            # the description is null for repositories without one
            'content': repo.get('description') or '',
            'repository': repo['html_url'],
            'codelines': [(line_no, line) for line_no, line in enumerate(lines, start=1)],
            'hl_lines': highlighted_lines_index,
            'code_language': get_code_language_name(filename=item['name'], code_snippet=code_snippet),
            # important to set for highlighting
            'strip_whitespace': ghc_strip_whitespace,
            'strip_new_lines': ghc_strip_new_lines,
            'parsed_url': urlparse(item['html_url']),
        }
        results.add(results.types.LegacyResult(**kwargs))

    return results