1 files changed, 272 insertions, 0 deletions
diff --git a/searx/engines/github_code.py b/searx/engines/github_code.py
new file mode 100644
index 000000000..4bafe9c0d
--- /dev/null
+++ b/searx/engines/github_code.py
@@ -0,0 +1,272 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""GitHub code search with `search syntax`_ as described in `Constructing a
+search query`_ in the documentation of GitHub's REST API.
+
+.. _search syntax:
+    https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax
+.. _Constructing a search query:
+    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#constructing-a-search-query
+.. _Github REST API for code search:
+    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code
+.. _Github REST API auth for code search:
+    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens
+
+Configuration
+=============
+
+The engine has the following mandatory setting:
+
+- :py:obj:`ghc_auth`
+  Change the authentication method used when using the API, defaults to none.
+
+Optional settings are:
+
+- :py:obj:`ghc_highlight_matching_lines`
+   Control the highlighting of the matched text (turns off/on).
+- :py:obj:`ghc_strip_new_lines`
+   Strip new lines at the start or end of each code fragment.
+- :py:obj:`ghc_strip_whitespace`
+   Strip any whitespace at the start or end of each code fragment.
+- :py:obj:`ghc_insert_block_separator`
+   Add a ``...`` line between the code fragments before merging them.
+
+.. code:: yaml
+
+  - name: github code
+    engine: github_code
+    shortcut: ghc
+    ghc_auth:
+      type: "none"
+
+  - name: github code
+    engine: github_code
+    shortcut: ghc
+    ghc_auth:
+      type: "personal_access_token"
+      token: "<token>"
+    ghc_highlight_matching_lines: true
+    ghc_strip_whitespace: true
+    ghc_strip_new_lines: true
+
+
+  - name: github code
+    engine: github_code
+    shortcut: ghc
+    ghc_auth:
+      type: "bearer"
+      token: "<token>"
+
+Implementation
+===============
+
+GitHub does not return the code line indices alongside the code fragment in the
+search API. Since these are not super important for the user experience all the
+code lines are just relabeled (starting from 1) and appended (a disjoint set of
+code blocks in a single file might be returned from the API).
+"""
+
+from __future__ import annotations
+
+import typing as t
+from urllib.parse import urlencode, urlparse
+
+from pygments.lexers import guess_lexer_for_filename
+from pygments.util import ClassNotFound
+from searx.result_types import EngineResults
+from searx.extended_types import SXNG_Response
+from searx.network import raise_for_httperror
+
+# about: metadata describing the upstream service and the API that is used
+about = {
+    "website": 'https://github.com/',
+    "wikidata_id": 'Q364',
+    "official_api_documentation": 'https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code',
+    "use_official_api": True,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+
+# engine dependent config
+categories = ['code']
+
+# ``{query}`` and ``{page}`` are replaced by URL-encoded arguments in request()
+search_url = 'https://api.github.com/search/code?sort=indexed&{query}&{page}'
+# https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#text-match-metadata
+accept_header = 'application/vnd.github.text-match+json'  # sent as ``Accept`` header by request()
+paging = True
+
+ghc_auth = {
+    "type": "none",  # request() handles "none", "personal_access_token" and "bearer"
+    "token": "",  # only read by request() when ``type`` is not "none"
+}
+"""Change the method of authenticating to the github API.
+
+``type`` needs to be one of ``none``, ``personal_access_token``, or ``bearer``.
+When type is not ``none`` a token is expected to be passed as well in
+``ghc_auth.token``.
+
+If there are any privacy concerns about generating a token, one can use the API
+without authentication.  The calls will be heavily rate limited; this is what
+the API returns on such calls::
+
+    API rate limit exceeded for <redacted ip>.
+    (But here's the good news: Authenticated requests get a higher rate limit)
+
+The personal access token or a bearer for an org or a group can be generated in
+the `GitHub settings`_.
+
+.. _GitHub settings:
+   https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens
+"""
+
+ghc_highlight_matching_lines = True
+"""Highlight the matching code lines."""
+
+ghc_strip_new_lines = True
+"""Strip leading and trailing newlines for each returned fragment.
+A single file might return multiple code fragments.
+"""
+
+ghc_strip_whitespace = False
+"""Strip all leading and trailing whitespace for each returned fragment.
+A single file might return multiple code fragments. Enabling this might break
+code indentation.
+"""
+
+ghc_api_version = "2022-11-28"
+"""The version of the GitHub REST API.
+"""
+
+ghc_insert_block_separator = False
+"""Each file possibly consists of more than one code block that matches the
+search.  If this is set to true, the blocks are separated by a ``...`` line.
+This might break the lexer and thus result in the lack of code highlighting.
+"""
+
+
+def request(query: str, params: dict[str, t.Any]) -> None:
+    """Set the URL and headers in ``params`` for a GitHub code search of ``query``."""
+    url_args = {'query': urlencode({'q': query}), 'page': urlencode({'page': params['pageno']})}
+    params['url'] = search_url.format(**url_args)
+    headers = params['headers']
+    headers['Accept'] = accept_header
+    headers['X-GitHub-Api-Version'] = ghc_api_version
+    # Without an auth header the query fails, so "none" sends a dummy (heavily rate limited).
+    auth_type = ghc_auth['type']
+    if auth_type == "none":
+        headers['Authorization'] = "placeholder"
+    elif auth_type == "personal_access_token":
+        headers['Authorization'] = f"token {ghc_auth['token']}"
+    elif auth_type == "bearer":
+        headers['Authorization'] = f"Bearer {ghc_auth['token']}"
+    params['raise_for_httperror'] = False  # response() inspects the status code itself
+
+
+def get_code_language_name(filename: str, code_snippet: str) -> str | None:
+    """Return the first alias of the Pygments lexer guessed for the file.
+
+    The lexer is guessed from ``filename`` and ``code_snippet``.  ``None`` (no
+    syntax highlighting) is returned when no lexer is found or when the lexer
+    has no aliases.
+    """
+    # Pygments signals "no matching lexer" by raising ClassNotFound.
+    try:
+        lexer = guess_lexer_for_filename(filename, _text=code_snippet)
+    except ClassNotFound:
+        return None
+    aliases = lexer.aliases if lexer is not None else ()
+    return aliases[0] if aliases else None
+
+
+def extract_code(code_matches: list[dict[str, t.Any]]) -> tuple[list[str], set[int]]:
+    """Merge the fragments of ``code_matches`` into one list of code lines.
+
+    GitHub additionally sends context for _word_ highlights; pygments supports
+    highlighting lines, as such the lines to highlight are calculated while
+    traversing the text.
+
+    :param code_matches: text-match objects, each with a ``fragment`` string
+        and ``matches`` whose ``indices`` are treated as ``[start, end)``.
+    :return: the code lines of all fragments (optionally separated by a
+        ``...`` line) and the 1-based numbers of the lines in which a match
+        is first seen.
+    """
+    lines: list[str] = []
+    highlighted_lines_index: set[int] = set()
+
+    for block_no, match in enumerate(code_matches):
+        if block_no > 0 and ghc_insert_block_separator:
+            lines.append("...")
+        highlight_groups = [highlight_group['indices'] for highlight_group in match['matches']]
+
+        fragment: str = match['fragment']
+        code = fragment.lstrip() if ghc_strip_whitespace else fragment
+        if ghc_strip_new_lines:
+            code = code.lstrip("\n")
+        # leading characters removed so far; the indices refer to the raw fragment
+        offset = len(fragment) - len(code)
+        if ghc_strip_whitespace:
+            code = code.rstrip()
+        if ghc_strip_new_lines:
+            code = code.rstrip("\n")
+
+        buffer: list[str] = []
+        for pos, letter in enumerate(code, start=offset):
+            # Drop groups that end at or before ``pos``: they can no longer
+            # match and would otherwise block every later group.
+            while highlight_groups and highlight_groups[0][1] <= pos:
+                highlight_groups.pop(0)
+            if highlight_groups and highlight_groups[0][0] <= pos:
+                # pygments enumerates lines from 1: mark the line being built
+                highlighted_lines_index.add(len(lines) + 1)
+                highlight_groups.pop(0)
+
+            if letter == "\n":
+                lines.append("".join(buffer))
+                buffer = []
+            else:
+                buffer.append(letter)
+        lines.append("".join(buffer))
+    return lines, highlighted_lines_index
+
+
+def response(resp: SXNG_Response) -> EngineResults:
+    """Turn the JSON answer of the GitHub code search into ``code.html`` results.
+
+    A 422 status yields no results; other HTTP errors raise via ``raise_for_httperror``.
+    """
+    results = EngineResults()
+
+    # On an invalid search term the status code 422 "Unprocessable Content" is
+    # returned, e.g. for the search term "user: foo" instead of "user:foo".
+    if resp.status_code == 422:
+        return results
+    raise_for_httperror(resp)
+
+    for item in resp.json().get('items', []):
+        repo = item['repository']
+        # keep only the text matches located in the content of the file blob
+        code_matches = [
+            tm for tm in item['text_matches'] if tm["object_type"] == "FileContent" and tm["property"] == "content"
+        ]
+        lines, found_hl_lines = extract_code(code_matches)
+        hl_lines: set[int] = found_hl_lines if ghc_highlight_matching_lines else set()
+        html_url = item['html_url']
+        results.add(
+            results.types.LegacyResult(
+                template='code.html',
+                url=html_url,
+                title=f"{repo['full_name']} · {item['path']}",
+                content=repo['description'],
+                repository=repo['html_url'],
+                codelines=list(enumerate(lines, start=1)),
+                hl_lines=hl_lines,
+                code_language=get_code_language_name(filename=item['name'], code_snippet="\n".join(lines)),
+                # important to set for highlighting
+                strip_whitespace=ghc_strip_whitespace,
+                strip_new_lines=ghc_strip_new_lines,
+                parsed_url=urlparse(html_url),
+            )
+        )
+
+    return results