summaryrefslogtreecommitdiff
path: root/searx/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'searx/utils.py')
-rw-r--r--searx/utils.py26
1 files changed, 26 insertions, 0 deletions
diff --git a/searx/utils.py b/searx/utils.py
index 670499805..53300181f 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -3,6 +3,32 @@ from HTMLParser import HTMLParser
import csv
import codecs
import cStringIO
+import re
+
def highlight_content(content, query):
    """Wrap occurrences of the search query found in content in ``<b>`` tags.

    If the whole query occurs in the content (case-insensitively), every
    occurrence of the whole query is highlighted. Otherwise the query is
    split on whitespace and each chunk is highlighted on its own; a
    single-character chunk is only highlighted when it stands alone, i.e.
    when it is not directly adjacent to a word character.

    :param content: text to highlight; returned unchanged when it contains
        a ``<`` character (treated as already being HTML).
    :param query: search query, either a UTF-8 encoded byte string or
        already-decoded text.
    :returns: ``None`` when content is empty, otherwise the content with the
        matches wrapped in ``<b>...</b>``.
    """
    if not content:
        return None
    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    # Only byte strings need decoding; calling decode() on text that is
    # already decoded fails.
    if isinstance(query, bytes):
        query = query.decode('utf-8')

    # A blank query would produce the empty pattern "()", which matches at
    # every position and would insert "<b></b>" between all characters.
    if not query.strip():
        return content

    if content.lower().find(query.lower()) > -1:
        query_regex = u'({0})'.format(re.escape(query))
    else:
        regex_parts = []
        for chunk in query.split():
            escaped = re.escape(chunk)
            if len(chunk) == 1:
                # Lookarounds keep a lone character from matching inside a
                # word, still match at the very start/end of the content,
                # and leave the surrounding separators out of the bold span.
                regex_parts.append(u'(?<!\\w){0}(?!\\w)'.format(escaped))
            else:
                regex_parts.append(escaped)
        query_regex = u'({0})'.format('|'.join(regex_parts))

    return re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
class HTMLTextExtractor(HTMLParser):
def __init__(self):