From a3d7e9c285d8c5947c9cb67b1587e7732ac4bc90 Mon Sep 17 00:00:00 2001
From: Bnyro <bnyro@tutanota.com>
Date: Fri, 8 Sep 2023 08:40:22 +0200
Subject: [mod] utils.py: add markdown_to_text helper function

---
 searx/utils.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'searx/utils.py')

diff --git a/searx/utils.py b/searx/utils.py
index 161983011..7ddd2305a 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -15,6 +15,7 @@ from os.path import splitext, join
 from random import choice
 from html.parser import HTMLParser
 from urllib.parse import urljoin, urlparse
+from markdown_it import MarkdownIt
 
 from lxml import html
 from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -158,6 +159,29 @@ def html_to_text(html_str: str) -> str:
     return s.get_text()
 
 
+def markdown_to_text(markdown_str: str) -> str:
+    """Render a Markdown string to HTML and return the text extracted from it.
+
+    Rendering uses markdown-it's "commonmark" preset with the typographer option on and
+    the "replacements" and "smartquotes" rules enabled; the HTML goes to html_to_text.
+
+    Args:
+        * markdown_str (str): Markdown source string
+
+    Returns:
+        * str: extracted text
+
+    Examples:
+        >>> markdown_to_text('[example](https://example.com)')
+        'example'
+        >>> markdown_to_text('## Headline')
+        'Headline'
+    """
+    md_renderer = MarkdownIt("commonmark", {"typographer": True})
+    md_renderer.enable(["replacements", "smartquotes"])
+    return html_to_text(md_renderer.render(markdown_str))
+
+
 def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
     """Extract text from a lxml result
 
-- 
cgit v1.2.3