summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--AUTHORS.rst1
-rw-r--r--docs/dev/engines/online/github_code.rst8
-rw-r--r--searx/engines/github_code.py272
-rw-r--r--searx/engines/searchcode_code.py2
-rw-r--r--searx/settings.yml18
-rw-r--r--searx/templates/simple/result_templates/code.html2
-rwxr-xr-xsearx/webapp.py30
-rw-r--r--tests/unit/settings/test_github_code.yml13
-rw-r--r--tests/unit/test_engine_github_code.py170
9 files changed, 509 insertions, 7 deletions
diff --git a/AUTHORS.rst b/AUTHORS.rst
index ef0800bb0..23d8d6db6 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -178,3 +178,4 @@ features or generally made SearXNG better:
- `Bearz314 <https://github.com/bearz314>`_
- Tommaso Colella `<https://github.com/gioleppe>`
- @AgentScrubbles
+- Filip Mikina `<https://github.com/fiffek>`
diff --git a/docs/dev/engines/online/github_code.rst b/docs/dev/engines/online/github_code.rst
new file mode 100644
index 000000000..12082f29f
--- /dev/null
+++ b/docs/dev/engines/online/github_code.rst
@@ -0,0 +1,8 @@
+.. _github code engine:
+
+===========
+Github Code
+===========
+
+.. automodule:: searx.engines.github_code
+ :members:
diff --git a/searx/engines/github_code.py b/searx/engines/github_code.py
new file mode 100644
index 000000000..4bafe9c0d
--- /dev/null
+++ b/searx/engines/github_code.py
@@ -0,0 +1,272 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""GitHub code search with `search syntax`_ as described in `Constructing a
+search query`_ in the documentation of GitHub's REST API.
+
+.. _search syntax:
+ https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax
+.. _Constructing a search query:
+ https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#constructing-a-search-query
+.. _Github REST API for code search:
+ https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code
+.. _Github REST API auth for code search:
+ https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens
+
+Configuration
+=============
+
+The engine has the following mandatory setting:
+
+- :py:obj:`ghc_auth`
+ Change the authentication method used when using the API, defaults to none.
+
+Optional settings are:
+
+- :py:obj:`ghc_highlight_matching_lines`
+ Control the highlighting of the matched text (turns off/on).
+- :py:obj:`ghc_strip_new_lines`
+ Strip new lines at the start or end of each code fragment.
+- :py:obj:`ghc_strip_whitespace`
+ Strip any whitespace at the start or end of each code fragment.
+- :py:obj:`ghc_insert_block_separator`
+ Add a `...` between each code fragment before merging them.
+
+.. code:: yaml
+
+ - name: github code
+ engine: github_code
+ shortcut: ghc
+ ghc_auth:
+ type: "none"
+
+ - name: github code
+ engine: github_code
+ shortcut: ghc
+ ghc_auth:
+ type: "personal_access_token"
+ token: "<token>"
+ ghc_highlight_matching_lines: true
+ ghc_strip_whitespace: true
+ ghc_strip_new_lines: true
+
+
+ - name: github code
+ engine: github_code
+ shortcut: ghc
+ ghc_auth:
+ type: "bearer"
+ token: "<token>"
+
+Implementation
+===============
+
+GitHub does not return the code line indices alongside the code fragment in the
+search API. Since these are not super important for the user experience all the
+code lines are just relabeled (starting from 1) and appended (a disjoint set of
+code blocks in a single file might be returned from the API).
+"""
+
+from __future__ import annotations
+
+import typing as t
+from urllib.parse import urlencode, urlparse
+
+from pygments.lexers import guess_lexer_for_filename
+from pygments.util import ClassNotFound
+from searx.result_types import EngineResults
+from searx.extended_types import SXNG_Response
+from searx.network import raise_for_httperror
+
+# about
+about = {
+ "website": 'https://github.com/',
+ "wikidata_id": 'Q364',
+ "official_api_documentation": 'https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code',
+ "use_official_api": True,
+ "require_api_key": False,
+ "results": 'JSON',
+}
+
+# engine dependent config
+categories = ['code']
+
+
+search_url = 'https://api.github.com/search/code?sort=indexed&{query}&{page}'
+# https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#text-match-metadata
+accept_header = 'application/vnd.github.text-match+json'
+paging = True
+
+ghc_auth = {
+ "type": "none",
+ "token": "",
+}
+"""Change the method of authenticating to the github API.
+
+``type`` needs to be one of ``none``, ``personal_access_token``, or ``bearer``.
+When type is not ``none`` a token is expected to be passed as well in
+``ghc_auth.token``.
+
+If there are any privacy concerns about generating a token, one can use the API
+without authentication. The calls will be heavily rate limited; this is what the
+API returns on such calls::
+
+ API rate limit exceeded for <redacted ip>.
+ (But here's the good news: Authenticated requests get a higher rate limit)
+
+The personal access token or a bearer for an org or a group can be generated in
+the `GitHub settings`_.
+
+.. _GitHub settings:
+ https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens
+"""
+
+ghc_highlight_matching_lines = True
+"""Highlight the matching code lines."""
+
+ghc_strip_new_lines = True
+"""Strip leading and trailing newlines for each returned fragment.
+Single file might return multiple code fragments.
+"""
+
+ghc_strip_whitespace = False
+"""Strip all leading and trailing whitespace for each returned fragment.
+Single file might return multiple code fragments. Enabling this might break
+code indentation.
+"""
+
+ghc_api_version = "2022-11-28"
+"""The version of the GitHub REST API.
+"""
+
+ghc_insert_block_separator = False
+"""Each file possibly consists of more than one code block that matches the
+search. If this is set to true, the blocks will be separated with a ``...`` line.
+This might break the lexer and thus result in the lack of code highlighting.
+"""
+
+
+def request(query: str, params: dict[str, t.Any]) -> None:
+
+ params['url'] = search_url.format(query=urlencode({'q': query}), page=urlencode({'page': params['pageno']}))
+ params['headers']['Accept'] = accept_header
+ params['headers']['X-GitHub-Api-Version'] = ghc_api_version
+
+ if ghc_auth['type'] == "none":
+ # Without the auth header the query fails, so add a dummy instead.
+ # Queries without auth are heavily rate limited.
+ params['headers']['Authorization'] = "placeholder"
+ if ghc_auth['type'] == "personal_access_token":
+ params['headers']['Authorization'] = f"token {ghc_auth['token']}"
+ if ghc_auth['type'] == "bearer":
+ params['headers']['Authorization'] = f"Bearer {ghc_auth['token']}"
+
+ params['raise_for_httperror'] = False
+
+
+def get_code_language_name(filename: str, code_snippet: str) -> str | None:
+ """Returns a code language name by pulling information from the filename if
+ possible otherwise by scanning the passed code snippet. In case there is any
+ parsing error just default to no syntax highlighting."""
+ try:
+ lexer = guess_lexer_for_filename(filename, _text=code_snippet)
+ if lexer is None:
+ return None
+ code_name_aliases = lexer.aliases
+ if len(code_name_aliases) == 0:
+ return None
+ return code_name_aliases[0]
+ except ClassNotFound:
+ return None
+
+
+def extract_code(code_matches: list[dict[str, t.Any]]) -> tuple[list[str], set[int]]:
+ """
+ Iterate over multiple possible matches, for each extract a code fragment.
+ GitHub additionally sends context for _word_ highlights; pygments supports
+ highlighting lines, as such we calculate which lines to highlight while
+ traversing the text.
+ """
+ lines: list[str] = []
+ highlighted_lines_index: set[int] = set()
+
+ for i, match in enumerate(code_matches):
+ if i > 0 and ghc_insert_block_separator:
+ lines.append("...")
+ buffer: list[str] = []
+ highlight_groups = [highlight_group['indices'] for highlight_group in match['matches']]
+
+ code: str = match['fragment']
+ original_code_lenght = len(code)
+
+ if ghc_strip_whitespace:
+ code = code.lstrip()
+ if ghc_strip_new_lines:
+ code = code.lstrip("\n")
+
+ offset = original_code_lenght - len(code)
+
+ if ghc_strip_whitespace:
+ code = code.rstrip()
+ if ghc_strip_new_lines:
+ code = code.rstrip("\n")
+
+ for i, letter in enumerate(code):
+ if len(highlight_groups) > 0:
+ # the API ensures these are sorted already, and we have a
+ # guaranteed match in the code (all indices are in the range 0
+ # to len(fragment)), so only check the first highlight group
+ [after, before] = highlight_groups[0]
+ if after <= (i + offset) < before:
+ # pygments enumerates lines from 1, highlight the next line
+ highlighted_lines_index.add(len(lines) + 1)
+ highlight_groups.pop(0)
+
+ if letter == "\n":
+ lines.append("".join(buffer))
+ buffer = []
+ continue
+
+ buffer.append(letter)
+ lines.append("".join(buffer))
+ return lines, highlighted_lines_index
+
+
+def response(resp: SXNG_Response) -> EngineResults:
+ results = EngineResults()
+
+ if resp.status_code == 422:
+ # on an invalid search term the status code 422 "Unprocessable Content"
+ # is returned / e.g. search term is "user: foo" instead of "user:foo"
+ return results
+ # raise for other errors
+ raise_for_httperror(resp)
+
+ for item in resp.json().get('items', []):
+ repo = item['repository']
+ text_matches = item['text_matches']
+ # ensure picking only the code contents in the blob
+ code_matches = [
+ match for match in text_matches if match["object_type"] == "FileContent" and match["property"] == "content"
+ ]
+ lines, highlighted_lines_index = extract_code(code_matches)
+ if not ghc_highlight_matching_lines:
+ highlighted_lines_index: set[int] = set()
+
+ code_snippet = "\n".join(lines)
+
+ kwargs: dict[str, t.Any] = {
+ 'template': 'code.html',
+ 'url': item['html_url'],
+ 'title': f"{repo['full_name']} · {item['path']}",
+ 'content': repo['description'],
+ 'repository': repo['html_url'],
+ 'codelines': [(i + 1, line) for (i, line) in enumerate(lines)],
+ 'hl_lines': highlighted_lines_index,
+ 'code_language': get_code_language_name(filename=item['name'], code_snippet=code_snippet),
+ # important to set for highlighting
+ 'strip_whitespace': ghc_strip_whitespace,
+ 'strip_new_lines': ghc_strip_new_lines,
+ 'parsed_url': urlparse(item['html_url']),
+ }
+ results.add(results.types.LegacyResult(**kwargs))
+
+ return results
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
index 7cfe2ce71..2196b0ad2 100644
--- a/searx/engines/searchcode_code.py
+++ b/searx/engines/searchcode_code.py
@@ -70,6 +70,8 @@ def response(resp):
'codelines': sorted(lines.items()),
'code_language': code_language,
'template': 'code.html',
+ 'strip_whitespace': True,
+ 'strip_new_lines': True,
}
)
diff --git a/searx/settings.yml b/searx/settings.yml
index d21192651..0cd293d7e 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -983,6 +983,24 @@ engines:
engine: github
shortcut: gh
+ - name: github code
+ engine: github_code
+ shortcut: ghc
+ disabled: true
+ ghc_auth:
+ # type is one of:
+ # * none
+ # * personal_access_token
+ # * bearer
+ # When none is passed, the token is not required.
+ type: "none"
+ token: "token"
+ # specify whether to highlight the matching lines to the query
+ ghc_highlight_matching_lines: true
+ ghc_strip_new_lines: true
+ ghc_strip_whitespace: false
+ timeout: 10.0
+
- name: codeberg
# https://docs.searxng.org/dev/engines/online/gitea.html
engine: gitea
diff --git a/searx/templates/simple/result_templates/code.html b/searx/templates/simple/result_templates/code.html
index 49326aed5..bcde94358 100644
--- a/searx/templates/simple/result_templates/code.html
+++ b/searx/templates/simple/result_templates/code.html
@@ -25,7 +25,7 @@
{%- endif -%}
<div dir="ltr" class="codelines">
- {{- result.codelines|code_highlighter(result.code_language)|safe -}}
+ {{- result.codelines|code_highlighter(result.code_language, result.hl_lines, result.strip_whitespace, result.strip_new_lines)|safe -}}
</div>
{{- result_sub_footer(result) -}}
diff --git a/searx/webapp.py b/searx/webapp.py
index 2dd7ddb08..9b590eeab 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -181,24 +181,32 @@ def _get_locale_rfc5646(locale):
# code-highlighter
@app.template_filter('code_highlighter')
-def code_highlighter(codelines, language=None):
+def code_highlighter(codelines, language=None, hl_lines=None, strip_whitespace=True, strip_new_lines=True):
if not language:
language = 'text'
try:
- # find lexer by programming language
- lexer = get_lexer_by_name(language, stripall=True)
+ lexer = get_lexer_by_name(language, stripall=strip_whitespace, stripnl=strip_new_lines)
except Exception as e: # pylint: disable=broad-except
logger.warning("pygments lexer: %s " % e)
# if lexer is not found, using default one
- lexer = get_lexer_by_name('text', stripall=True)
+ lexer = get_lexer_by_name('text', stripall=strip_whitespace, stripnl=strip_new_lines)
html_code = ''
tmp_code = ''
last_line = None
line_code_start = None
+ def offset_hl_lines(hl_lines, start):
+ """
+ hl_lines in pygments are expected to be relative to the input
+ """
+ if hl_lines is None:
+ return None
+
+ return [line - start + 1 for line in hl_lines]
+
# parse lines
for line, code in codelines:
if not last_line:
@@ -208,7 +216,12 @@ def code_highlighter(codelines, language=None):
if last_line is not None and last_line + 1 != line:
# highlight last codepart
- formatter = HtmlFormatter(linenos='inline', linenostart=line_code_start, cssclass="code-highlight")
+ formatter = HtmlFormatter(
+ linenos='inline',
+ linenostart=line_code_start,
+ cssclass="code-highlight",
+ hl_lines=offset_hl_lines(hl_lines, line_code_start),
+ )
html_code = html_code + highlight(tmp_code, lexer, formatter)
# reset conditions for next codepart
@@ -222,7 +235,12 @@ def code_highlighter(codelines, language=None):
last_line = line
# highlight last codepart
- formatter = HtmlFormatter(linenos='inline', linenostart=line_code_start, cssclass="code-highlight")
+ formatter = HtmlFormatter(
+ linenos='inline',
+ linenostart=line_code_start,
+ cssclass="code-highlight",
+ hl_lines=offset_hl_lines(hl_lines, line_code_start),
+ )
html_code = html_code + highlight(tmp_code, lexer, formatter)
return html_code
diff --git a/tests/unit/settings/test_github_code.yml b/tests/unit/settings/test_github_code.yml
new file mode 100644
index 000000000..2cf039138
--- /dev/null
+++ b/tests/unit/settings/test_github_code.yml
@@ -0,0 +1,13 @@
+# This SearXNG setup is used in unit tests
+
+use_default_settings:
+
+ engines:
+ keep_only: []
+
+engines:
+
+ - name: github code
+ engine: github_code
+ shortcut: "ghc"
+ disabled: true
diff --git a/tests/unit/test_engine_github_code.py b/tests/unit/test_engine_github_code.py
new file mode 100644
index 000000000..d10081f28
--- /dev/null
+++ b/tests/unit/test_engine_github_code.py
@@ -0,0 +1,170 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# pylint: disable=missing-module-docstring,disable=missing-class-docstring
+
+import logging
+from unittest.mock import Mock
+from urllib.parse import urlparse
+from parameterized import parameterized
+
+import searx.engines
+from tests import SearxTestCase
+from searx.result_types import EngineResults
+
+
+class GithubCodeTests(SearxTestCase):
+
+ TEST_SETTINGS = "test_github_code.yml"
+
+ def setUp(self):
+ super().setUp()
+ self.ghc = searx.engines.engines['github code']
+ self.ghc.logger.setLevel(logging.INFO)
+
+ def tearDown(self):
+ searx.search.load_engines([])
+
+ @parameterized.expand(
+ [
+ [
+ [
+ {
+ "fragment": " - [Tab management](#tab-management)\n - [Buffer/window management]"
+ "(#bufferwindow-management)\n- [🎨 Highlights](#-highlights)",
+ "matches": [{"indices": [47, 53], "text": "Buffer"}, {"indices": [74, 80], "text": "buffer"}],
+ },
+ {
+ "fragment": "To conditionally activate plugins, the best solution is to use the\n"
+ "[LazyVim VSCode extra](https://www.lazyvim.org/extras/vscode). However, "
+ "`packer.nvim` and `lazy.nvim` have built-in\nsupport for "
+ "`cond = vim.g.vscode` and `vim-plug` has a",
+ "matches": [
+ {"indices": [68, 75], "text": "LazyVim"},
+ {"indices": [102, 109], "text": "lazyvim"},
+ ],
+ },
+ ],
+ [
+ " - [Tab management](#tab-management)",
+ " - [Buffer/window management](#bufferwindow-management)",
+ "- [🎨 Highlights](#-highlights)",
+ "To conditionally activate plugins, the best solution is to use the",
+ "[LazyVim VSCode extra](https://www.lazyvim.org/extras/vscode)."
+ " However, `packer.nvim` and `lazy.nvim` have built-in",
+ "support for `cond = vim.g.vscode` and `vim-plug` has a",
+ ],
+ {2, 5},
+ ],
+ [
+ [
+ {
+ "fragment": "\n| `<leader>uf` | Toggle format (global) |\n"
+ "| `<leader>uF` | Toggle format (buffer) |\n"
+ "| `<leader>us` | Toggle spelling |\n",
+ "matches": [{"indices": [74, 80], "text": "buffer"}],
+ },
+ ],
+ [
+ "| `<leader>uf` | Toggle format (global) |",
+ "| `<leader>uF` | Toggle format (buffer) |",
+ "| `<leader>us` | Toggle spelling |",
+ ],
+ {2},
+ ],
+ [
+ [
+ {
+ "fragment": "\n\n\n1\n2\n3\n4",
+ "matches": [{"indices": [3, 4], "text": "1"}],
+ },
+ ],
+ [
+ "1",
+ "2",
+ "3",
+ "4",
+ ],
+ {1},
+ ],
+ [
+ [
+ {
+ "fragment": "placeholder",
+ "matches": [],
+ },
+ ],
+ [
+ "placeholder",
+ ],
+ set(),
+ ],
+ ]
+ )
+ def test_code_extraction(self, code_matches, expected_code, expected_highlighted_lines):
+ code, highlights = self.ghc.extract_code(code_matches=code_matches)
+ self.assertEqual(code, expected_code)
+ self.assertEqual(highlights, expected_highlighted_lines)
+
+ def test_transforms_response(self):
+ response = Mock()
+ response.json.return_value = {
+ "items": [
+ {
+ "name": "TODO.md",
+ "path": "TODO.md",
+ "html_url": "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md",
+ "repository": {
+ "full_name": "folke/dot",
+ "html_url": "https://github.com/folke/dot",
+ "description": "☕️ My Dot Files",
+ },
+ "text_matches": [
+ {
+ "object_type": "FileContent",
+ "property": "content",
+ "fragment": "- [x] windows picker\n"
+ "- [x] toggle cwd / root (LazyVim)\n"
+ "- [x] dynamic workspace symbol",
+ "matches": [{"indices": [46, 53], "text": "LazyVim"}],
+ },
+ {
+ "object_type": "FileContent",
+ "property": "content",
+ "fragment": "- [x] smart stops working after custom\n"
+ "- [x] edit in empty buffer\n"
+ "- [x] support toggling line nr for preview",
+ "matches": [{"indices": [59, 65], "text": "buffer"}, {"indices": [89, 93], "text": "line"}],
+ },
+ ],
+ }
+ ]
+ }
+ response.status_code = 200
+ results = self.ghc.response(response)
+ expected_results = EngineResults()
+ expected_results.add(
+ expected_results.types.LegacyResult(
+ **{
+ 'url': "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md",
+ 'title': "folke/dot · TODO.md",
+ 'content': "☕️ My Dot Files",
+ 'repository': "https://github.com/folke/dot",
+ 'codelines': [
+ (1, "- [x] windows picker"),
+ (2, "- [x] toggle cwd / root (LazyVim)"),
+ (3, "- [x] dynamic workspace symbol"),
+ (4, "- [x] smart stops working after custom"),
+ (5, "- [x] edit in empty buffer"),
+ (6, "- [x] support toggling line nr for preview"),
+ ],
+ 'hl_lines': {2, 5, 6},
+ 'code_language': "markdown",
+ 'template': 'code.html',
+ 'strip_whitespace': False,
+ 'strip_new_lines': True,
+ 'parsed_url': urlparse(
+ "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md"
+ ),
+ }
+ )
+ )
+ self.assertEqual(results, expected_results)