From 5d977056f7aa216eae09a22c3baaff73546f6ff1 Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Mon, 29 Dec 2014 21:31:04 +0100 Subject: Flake8 and Twitter corrections Lots of Flake8 corrections Maybe we should change the rule to allow lines of 120 chars. It seems more usable. Big twitter correction : now it outputs the words in right order... --- searx/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'searx/utils.py') diff --git a/searx/utils.py b/searx/utils.py index dc831ef5f..b725a8b95 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -30,8 +30,9 @@ def gen_useragent(): def searx_useragent(): - return 'searx/{searx_version} {suffix}'.format(searx_version=VERSION_STRING, - suffix=settings['server'].get('useragent_suffix', '')) + return 'searx/{searx_version} {suffix}'.format( + searx_version=VERSION_STRING, + suffix=settings['server'].get('useragent_suffix', '')) def highlight_content(content, query): -- cgit v1.2.3 From 1408859b4b0ca9efc590ca0e112c6bc0cb984e2c Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 1 Jan 2015 14:13:56 +0100 Subject: [fix] ignore scripts/styles in html_to_text --- searx/utils.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'searx/utils.py') diff --git a/searx/utils.py b/searx/utils.py index b725a8b95..8a3f35531 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -23,6 +23,9 @@ ua_os = ('Windows NT 6.3; WOW64', ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}" +blocked_tags = ('script', + 'style') + def gen_useragent(): # TODO @@ -67,11 +70,29 @@ class HTMLTextExtractor(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.result = [] + self.tags = [] + + def handle_starttag(self, tag, attrs): + print tag + self.tags.append(tag) + + def handle_endtag(self, tag): + print tag,tag + if tag != self.tags[-1]: + raise Exception("invalid html") + self.tags.pop() + + def is_valid_tag(self): + return not self.tags or self.tags[-1] not in blocked_tags def handle_data(self, d): + if not self.is_valid_tag(): + return self.result.append(d) def handle_charref(self, number): + if not self.is_valid_tag(): + return if number[0] in (u'x', u'X'): codepoint = int(number[1:], 16) else: @@ -79,6 +100,8 @@ class HTMLTextExtractor(HTMLParser): self.result.append(unichr(codepoint)) def handle_entityref(self, name): + if not self.is_valid_tag(): + return # codepoint = htmlentitydefs.name2codepoint[name] # self.result.append(unichr(codepoint)) self.result.append(name) -- cgit v1.2.3 From 9f12605f7ebc9ca5575fc4ee9900e0e821366c4d Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 1 Jan 2015 17:48:12 +0100 Subject: [enh] themes static content refactor --- searx/utils.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'searx/utils.py') diff --git a/searx/utils.py b/searx/utils.py index 8a3f35531..0594339d5 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -73,11 +73,9 @@ class HTMLTextExtractor(HTMLParser): self.tags = [] def handle_starttag(self, tag, attrs): - print tag self.tags.append(tag) def handle_endtag(self, tag): - print tag,tag if tag != self.tags[-1]: raise Exception("invalid html") self.tags.pop() @@ -156,11 +154,17 @@ def get_themes(root): """Returns available themes list.""" static_path = os.path.join(root, 'static') - static_names = set(os.listdir(static_path)) templates_path = os.path.join(root, 'templates') - templates_names = set(os.listdir(templates_path)) - themes = [] - for name in static_names.intersection(templates_names): - themes += [name] + themes = os.listdir(os.path.join(static_path, 'themes')) return static_path, templates_path, themes + + +def get_static_files(base_path): + static_files = set() + base_path_length = len(base_path+'/static') + 1 + for directory, _, files in os.walk(os.path.join(base_path, 'static')): + for filename in files: + f = os.path.join(directory[base_path_length:], filename) + static_files.add(f) + return static_files -- cgit v1.2.3 From 2f9a386c0db884ffbea27f43bdcff5bfd1876ad1 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 1 Jan 2015 18:59:53 +0100 Subject: [enh] better result template handling --- searx/utils.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'searx/utils.py') diff --git a/searx/utils.py b/searx/utils.py index 0594339d5..5bd1ced4d 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -161,10 +161,23 @@ def get_themes(root): def get_static_files(base_path): + base_path = os.path.join(base_path, 'static') static_files = set() - base_path_length = len(base_path+'/static') + 1 - for directory, _, files in os.walk(os.path.join(base_path, 'static')): + base_path_length = len(base_path) + 1 + for directory, _, files in os.walk(base_path): for filename in files: f = os.path.join(directory[base_path_length:], filename) static_files.add(f) return static_files + + +def get_result_templates(base_path): + base_path = os.path.join(base_path, 'templates') + result_templates = set() + base_path_length = len(base_path) + 1 + for directory, _, files in os.walk(base_path): + if directory.endswith('result_templates'): + for filename in files: + f = os.path.join(directory[base_path_length:], filename) + result_templates.add(f) + return result_templates -- cgit v1.2.3