9 files changed, 509 insertions, 7 deletions
diff --git a/AUTHORS.rst b/AUTHORS.rst
index ef0800bb0..23d8d6db6 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -178,3 +178,4 @@ features or generally made SearXNG better:
 - `Bearz314 <https://github.com/bearz314>`_
 - Tommaso Colella `<https://github.com/gioleppe>`
 - @AgentScrubbles
+- `Filip Mikina <https://github.com/fiffek>`_
diff --git a/docs/dev/engines/online/github_code.rst b/docs/dev/engines/online/github_code.rst
new file mode 100644
index 000000000..12082f29f
--- /dev/null
+++ b/docs/dev/engines/online/github_code.rst
@@ -0,0 +1,8 @@
+.. _github code engine:
+
+===========
+Github Code
+===========
+
+.. automodule:: searx.engines.github_code
+   :members:
diff --git a/searx/engines/github_code.py b/searx/engines/github_code.py
new file mode 100644
index 000000000..4bafe9c0d
--- /dev/null
+++ b/searx/engines/github_code.py
@@ -0,0 +1,272 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""GitHub code search with `search syntax`_ as described in `Constructing a
+search query`_ in the documentation of GitHub's REST API.
+
+.. _search syntax:
+    https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax
+.. _Constructing a search query:
+    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#constructing-a-search-query
+.. _Github REST API for code search:
+    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code
+.. _Github REST API auth for code search:
+    https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens
+
+Configuration
+=============
+
+The engine has the following mandatory setting:
+
+- :py:obj:`ghc_auth`
+  Change the authentication method used when using the API, defaults to none.
+
+Optional settings are:
+
+- :py:obj:`ghc_highlight_matching_lines`
+   Turn the highlighting of the matching lines on or off.
+- :py:obj:`ghc_strip_new_lines`
+   Strip new lines at the start or end of each code fragment.
+- :py:obj:`ghc_strip_whitespace`
+   Strip any whitespace at the start or end of each code fragment.
+- :py:obj:`ghc_insert_block_separator`
+   Add a ``...`` line between the code fragments before merging them.
+
+.. code:: yaml
+
+  - name: github code
+    engine: github_code
+    shortcut: ghc
+    ghc_auth:
+      type: "none"
+
+  - name: github code
+    engine: github_code
+    shortcut: ghc
+    ghc_auth:
+      type: "personal_access_token"
+      token: "<token>"
+    ghc_highlight_matching_lines: true
+    ghc_strip_whitespace: true
+    ghc_strip_new_lines: true
+
+
+  - name: github code
+    engine: github_code
+    shortcut: ghc
+    ghc_auth:
+      type: "bearer"
+      token: "<token>"
+
+Implementation
+===============
+
+GitHub does not return the code line indices alongside the code fragment in the
+search API. Since these are not super important for the user experience all the
+code lines are just relabeled (starting from 1) and appended (a disjoint set of
+code blocks in a single file might be returned from the API).
+"""
+
+from __future__ import annotations
+
+import typing as t
+from urllib.parse import urlencode, urlparse
+
+from pygments.lexers import guess_lexer_for_filename
+from pygments.util import ClassNotFound
+from searx.result_types import EngineResults
+from searx.extended_types import SXNG_Response
+from searx.network import raise_for_httperror
+
+# about
+about = {
+    "website": 'https://github.com/',
+    "wikidata_id": 'Q364',
+    "official_api_documentation": 'https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code',
+    "use_official_api": True,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+
+# engine dependent config
+categories = ['code']
+
+
+search_url = 'https://api.github.com/search/code?sort=indexed&{query}&{page}'
+# https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#text-match-metadata
+accept_header = 'application/vnd.github.text-match+json'
+paging = True
+
+ghc_auth = {
+    "type": "none",
+    "token": "",
+}
+"""Change the method of authenticating to the github API.
+
+``type`` needs to be one of ``none``, ``personal_access_token``, or ``bearer``.
+When type is not ``none``, a token is expected to be passed as well in
+``ghc_auth.token``.
+
+If there are any privacy concerns about generating a token, one can use the API
+without authentication.  The calls will be heavily rate limited; this is what the
+API returns on such calls::
+
+    API rate limit exceeded for <redacted ip>.
+    (But here's the good news: Authenticated requests get a higher rate limit)
+
+The personal access token or a bearer for an org or a group can be generated in
+the `GitHub settings`_.
+
+.. _GitHub settings:
+   https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens
+"""
+
+ghc_highlight_matching_lines = True
+"""Highlight the matching code lines."""
+
+ghc_strip_new_lines = True
+"""Strip leading and trailing newlines for each returned fragment.
+A single file might return multiple code fragments.
+"""
+
+ghc_strip_whitespace = False
+"""Strip all leading and trailing whitespace for each returned fragment.
+A single file might return multiple code fragments. Enabling this might break
+code indentation.
+"""
+
+ghc_api_version = "2022-11-28"
+"""The version of the GitHub REST API.
+"""
+
+ghc_insert_block_separator = False
+"""Each file possibly consists of more than one code block that matches the
+search. If this is set to true, the blocks will be separated by a ``...`` line.
+This might break the lexer and thus result in a lack of code highlighting.
+"""
+
+
+def request(query: str, params: dict[str, t.Any]) -> None:
+
+    params['url'] = search_url.format(query=urlencode({'q': query}), page=urlencode({'page': params['pageno']}))
+    params['headers']['Accept'] = accept_header
+    params['headers']['X-GitHub-Api-Version'] = ghc_api_version
+
+    if ghc_auth['type'] == "none":
+        # Without the auth header the query fails, so add a dummy instead.
+        # Queries without auth are heavily rate limited.
+        params['headers']['Authorization'] = "placeholder"
+    if ghc_auth['type'] == "personal_access_token":
+        params['headers']['Authorization'] = f"token {ghc_auth['token']}"
+    if ghc_auth['type'] == "bearer":
+        params['headers']['Authorization'] = f"Bearer {ghc_auth['token']}"
+
+    params['raise_for_httperror'] = False
+
+
+def get_code_language_name(filename: str, code_snippet: str) -> str | None:
+    """Returns a code language name by pulling information from the filename if
+    possible otherwise by scanning the passed code snippet. In case there is any
+    parsing error just default to no syntax highlighting."""
+    try:
+        lexer = guess_lexer_for_filename(filename, _text=code_snippet)
+        if lexer is None:
+            return None
+        code_name_aliases = lexer.aliases
+        if len(code_name_aliases) == 0:
+            return None
+        return code_name_aliases[0]
+    except ClassNotFound:
+        return None
+
+
+def extract_code(code_matches: list[dict[str, t.Any]]) -> tuple[list[str], set[int]]:
+    """
+    Iterate over multiple possible matches, for each extract a code fragment.
+    GitHub additionally sends context for _word_ highlights; pygments supports
+    highlighting lines, as such we calculate which lines to highlight while
+    traversing the text.
+    """
+    lines: list[str] = []
+    highlighted_lines_index: set[int] = set()
+
+    for i, match in enumerate(code_matches):
+        if i > 0 and ghc_insert_block_separator:
+            lines.append("...")
+        buffer: list[str] = []
+        highlight_groups = [highlight_group['indices'] for highlight_group in match['matches']]
+
+        code: str = match['fragment']
+        original_code_lenght = len(code)
+
+        if ghc_strip_whitespace:
+            code = code.lstrip()
+        if ghc_strip_new_lines:
+            code = code.lstrip("\n")
+
+        offset = original_code_lenght - len(code)
+
+        if ghc_strip_whitespace:
+            code = code.rstrip()
+        if ghc_strip_new_lines:
+            code = code.rstrip("\n")
+
+        for i, letter in enumerate(code):
+            if len(highlight_groups) > 0:
+                # the API ensures these are sorted already, and we have a
+                # guaranteed match in the code (all indices are in the range 0
+                # to len(fragment)), so only check the first highlight group
+                [after, before] = highlight_groups[0]
+                if after <= (i + offset) < before:
+                    # pygments enumerates lines from 1, highlight the next line
+                    highlighted_lines_index.add(len(lines) + 1)
+                    highlight_groups.pop(0)
+
+            if letter == "\n":
+                lines.append("".join(buffer))
+                buffer = []
+                continue
+
+            buffer.append(letter)
+        lines.append("".join(buffer))
+    return lines, highlighted_lines_index
+
+
+def response(resp: SXNG_Response) -> EngineResults:
+    results = EngineResults()
+
+    if resp.status_code == 422:
+        # on an invalid search term the status code 422 "Unprocessable Content"
+        # is returned / e.g. search term is "user: foo" instead of "user:foo"
+        return results
+    # raise for other errors
+    raise_for_httperror(resp)
+
+    for item in resp.json().get('items', []):
+        repo = item['repository']
+        text_matches = item['text_matches']
+        # ensure picking only the code contents in the blob
+        code_matches = [
+            match for match in text_matches if match["object_type"] == "FileContent" and match["property"] == "content"
+        ]
+        lines, highlighted_lines_index = extract_code(code_matches)
+        if not ghc_highlight_matching_lines:
+            highlighted_lines_index: set[int] = set()
+
+        code_snippet = "\n".join(lines)
+
+        kwargs: dict[str, t.Any] = {
+            'template': 'code.html',
+            'url': item['html_url'],
+            'title': f"{repo['full_name']} · {item['path']}",
+            'content': repo['description'],
+            'repository': repo['html_url'],
+            'codelines': [(i + 1, line) for (i, line) in enumerate(lines)],
+            'hl_lines': highlighted_lines_index,
+            'code_language': get_code_language_name(filename=item['name'], code_snippet=code_snippet),
+            # important to set for highlighting
+            'strip_whitespace': ghc_strip_whitespace,
+            'strip_new_lines': ghc_strip_new_lines,
+            'parsed_url': urlparse(item['html_url']),
+        }
+        results.add(results.types.LegacyResult(**kwargs))
+
+    return results
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
index 7cfe2ce71..2196b0ad2 100644
--- a/searx/engines/searchcode_code.py
+++ b/searx/engines/searchcode_code.py
@@ -70,6 +70,8 @@ def response(resp):
                 'codelines': sorted(lines.items()),
                 'code_language': code_language,
                 'template': 'code.html',
+                'strip_whitespace': True,
+                'strip_new_lines': True,
             }
         )
 
diff --git a/searx/settings.yml b/searx/settings.yml
index d21192651..0cd293d7e 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -983,6 +983,24 @@ engines:
     engine: github
     shortcut: gh
 
+  - name: github code
+    engine: github_code
+    shortcut: ghc
+    disabled: true
+    ghc_auth:
+      # type is one of:
+      # * none
+      # * personal_access_token
+      # * bearer
+      # When none is passed, the token is not required.
+      type: "none"
+      token: "token"
+    # specify whether to highlight the matching lines to the query
+    ghc_highlight_matching_lines: true
+    ghc_strip_new_lines: true
+    ghc_strip_whitespace: false
+    timeout: 10.0
+
   - name: codeberg
     # https://docs.searxng.org/dev/engines/online/gitea.html
     engine: gitea
diff --git a/searx/templates/simple/result_templates/code.html b/searx/templates/simple/result_templates/code.html
index 49326aed5..bcde94358 100644
--- a/searx/templates/simple/result_templates/code.html
+++ b/searx/templates/simple/result_templates/code.html
@@ -25,7 +25,7 @@
 {%- endif -%}
 
 <div dir="ltr" class="codelines">
-    {{- result.codelines|code_highlighter(result.code_language)|safe -}}
+    {{- result.codelines|code_highlighter(result.code_language, result.hl_lines, result.strip_whitespace, result.strip_new_lines)|safe -}}
 </div>
 
 {{- result_sub_footer(result) -}}
diff --git a/searx/webapp.py b/searx/webapp.py
index 2dd7ddb08..9b590eeab 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -181,24 +181,32 @@ def _get_locale_rfc5646(locale):
 
 # code-highlighter
 @app.template_filter('code_highlighter')
-def code_highlighter(codelines, language=None):
+def code_highlighter(codelines, language=None, hl_lines=None, strip_whitespace=True, strip_new_lines=True):
     if not language:
         language = 'text'
 
     try:
-        # find lexer by programming language
-        lexer = get_lexer_by_name(language, stripall=True)
+        lexer = get_lexer_by_name(language, stripall=strip_whitespace, stripnl=strip_new_lines)
 
     except Exception as e:  # pylint: disable=broad-except
         logger.warning("pygments lexer: %s " % e)
         # if lexer is not found, using default one
-        lexer = get_lexer_by_name('text', stripall=True)
+        lexer = get_lexer_by_name('text', stripall=strip_whitespace, stripnl=strip_new_lines)
 
     html_code = ''
     tmp_code = ''
     last_line = None
     line_code_start = None
 
+    def offset_hl_lines(hl_lines, start):
+        """
+        hl_lines in pygments are expected to be relative to the input
+        """
+        if hl_lines is None:
+            return None
+
+        return [line - start + 1 for line in hl_lines]
+
     # parse lines
     for line, code in codelines:
         if not last_line:
@@ -208,7 +216,12 @@ def code_highlighter(codelines, language=None):
         if last_line is not None and last_line + 1 != line:
 
             # highlight last codepart
-            formatter = HtmlFormatter(linenos='inline', linenostart=line_code_start, cssclass="code-highlight")
+            formatter = HtmlFormatter(
+                linenos='inline',
+                linenostart=line_code_start,
+                cssclass="code-highlight",
+                hl_lines=offset_hl_lines(hl_lines, line_code_start),
+            )
             html_code = html_code + highlight(tmp_code, lexer, formatter)
 
             # reset conditions for next codepart
@@ -222,7 +235,12 @@ def code_highlighter(codelines, language=None):
         last_line = line
 
     # highlight last codepart
-    formatter = HtmlFormatter(linenos='inline', linenostart=line_code_start, cssclass="code-highlight")
+    formatter = HtmlFormatter(
+        linenos='inline',
+        linenostart=line_code_start,
+        cssclass="code-highlight",
+        hl_lines=offset_hl_lines(hl_lines, line_code_start),
+    )
     html_code = html_code + highlight(tmp_code, lexer, formatter)
 
     return html_code
diff --git a/tests/unit/settings/test_github_code.yml b/tests/unit/settings/test_github_code.yml
new file mode 100644
index 000000000..2cf039138
--- /dev/null
+++ b/tests/unit/settings/test_github_code.yml
@@ -0,0 +1,13 @@
+# This SearXNG setup is used in unit tests
+
+use_default_settings:
+
+  engines:
+    keep_only: []
+
+engines:
+
+  - name: github code
+    engine: github_code
+    shortcut: "ghc"
+    disabled: true
diff --git a/tests/unit/test_engine_github_code.py b/tests/unit/test_engine_github_code.py
new file mode 100644
index 000000000..d10081f28
--- /dev/null
+++ b/tests/unit/test_engine_github_code.py
@@ -0,0 +1,170 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# pylint: disable=missing-module-docstring,disable=missing-class-docstring
+
+import logging
+from unittest.mock import Mock
+from urllib.parse import urlparse
+from parameterized import parameterized
+
+import searx.engines
+from tests import SearxTestCase
+from searx.result_types import EngineResults
+
+
+class GithubCodeTests(SearxTestCase):
+
+    TEST_SETTINGS = "test_github_code.yml"
+
+    def setUp(self):
+        super().setUp()
+        self.ghc = searx.engines.engines['github code']
+        self.ghc.logger.setLevel(logging.INFO)
+
+    def tearDown(self):
+        searx.search.load_engines([])
+
+    @parameterized.expand(
+        [
+            [
+                [
+                    {
+                        "fragment": "    - [Tab management](#tab-management)\n    - [Buffer/window management]"
+                        "(#bufferwindow-management)\n- [🎨 Highlights](#-highlights)",
+                        "matches": [{"indices": [47, 53], "text": "Buffer"}, {"indices": [74, 80], "text": "buffer"}],
+                    },
+                    {
+                        "fragment": "To conditionally activate plugins, the best solution is to use the\n"
+                        "[LazyVim VSCode extra](https://www.lazyvim.org/extras/vscode). However, "
+                        "`packer.nvim` and `lazy.nvim` have built-in\nsupport for "
+                        "`cond = vim.g.vscode` and `vim-plug` has a",
+                        "matches": [
+                            {"indices": [68, 75], "text": "LazyVim"},
+                            {"indices": [102, 109], "text": "lazyvim"},
+                        ],
+                    },
+                ],
+                [
+                    "    - [Tab management](#tab-management)",
+                    "    - [Buffer/window management](#bufferwindow-management)",
+                    "- [🎨 Highlights](#-highlights)",
+                    "To conditionally activate plugins, the best solution is to use the",
+                    "[LazyVim VSCode extra](https://www.lazyvim.org/extras/vscode)."
+                    " However, `packer.nvim` and `lazy.nvim` have built-in",
+                    "support for `cond = vim.g.vscode` and `vim-plug` has a",
+                ],
+                {2, 5},
+            ],
+            [
+                [
+                    {
+                        "fragment": "\n| `<leader>uf` | Toggle format (global) |\n"
+                        "| `<leader>uF` | Toggle format (buffer) |\n"
+                        "| `<leader>us` | Toggle spelling |\n",
+                        "matches": [{"indices": [74, 80], "text": "buffer"}],
+                    },
+                ],
+                [
+                    "| `<leader>uf` | Toggle format (global) |",
+                    "| `<leader>uF` | Toggle format (buffer) |",
+                    "| `<leader>us` | Toggle spelling |",
+                ],
+                {2},
+            ],
+            [
+                [
+                    {
+                        "fragment": "\n\n\n1\n2\n3\n4",
+                        "matches": [{"indices": [3, 4], "text": "1"}],
+                    },
+                ],
+                [
+                    "1",
+                    "2",
+                    "3",
+                    "4",
+                ],
+                {1},
+            ],
+            [
+                [
+                    {
+                        "fragment": "placeholder",
+                        "matches": [],
+                    },
+                ],
+                [
+                    "placeholder",
+                ],
+                set(),
+            ],
+        ]
+    )
+    def test_code_extraction(self, code_matches, expected_code, expected_highlighted_lines):
+        code, highlights = self.ghc.extract_code(code_matches=code_matches)
+        self.assertEqual(code, expected_code)
+        self.assertEqual(highlights, expected_highlighted_lines)
+
+    def test_transforms_response(self):
+        response = Mock()
+        response.json.return_value = {
+            "items": [
+                {
+                    "name": "TODO.md",
+                    "path": "TODO.md",
+                    "html_url": "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md",
+                    "repository": {
+                        "full_name": "folke/dot",
+                        "html_url": "https://github.com/folke/dot",
+                        "description": "☕️   My Dot Files",
+                    },
+                    "text_matches": [
+                        {
+                            "object_type": "FileContent",
+                            "property": "content",
+                            "fragment": "- [x] windows picker\n"
+                            "- [x] toggle cwd / root (LazyVim)\n"
+                            "- [x] dynamic workspace symbol",
+                            "matches": [{"indices": [46, 53], "text": "LazyVim"}],
+                        },
+                        {
+                            "object_type": "FileContent",
+                            "property": "content",
+                            "fragment": "- [x] smart stops working after custom\n"
+                            "- [x] edit in empty buffer\n"
+                            "- [x] support toggling line nr for preview",
+                            "matches": [{"indices": [59, 65], "text": "buffer"}, {"indices": [89, 93], "text": "line"}],
+                        },
+                    ],
+                }
+            ]
+        }
+        response.status_code = 200
+        results = self.ghc.response(response)
+        expected_results = EngineResults()
+        expected_results.add(
+            expected_results.types.LegacyResult(
+                **{
+                    'url': "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md",
+                    'title': "folke/dot · TODO.md",
+                    'content': "☕️   My Dot Files",
+                    'repository': "https://github.com/folke/dot",
+                    'codelines': [
+                        (1, "- [x] windows picker"),
+                        (2, "- [x] toggle cwd / root (LazyVim)"),
+                        (3, "- [x] dynamic workspace symbol"),
+                        (4, "- [x] smart stops working after custom"),
+                        (5, "- [x] edit in empty buffer"),
+                        (6, "- [x] support toggling line nr for preview"),
+                    ],
+                    'hl_lines': {2, 5, 6},
+                    'code_language': "markdown",
+                    'template': 'code.html',
+                    'strip_whitespace': False,
+                    'strip_new_lines': True,
+                    'parsed_url': urlparse(
+                        "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md"
+                    ),
+                }
+            )
+        )
+        self.assertEqual(results, expected_results)