From 9b9f097adbf39d7908931203e9d8966748900bde Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Sun, 14 Sep 2014 11:09:44 +0200 Subject: Implementing https rewrite support #71 * parsing XML-Files which contain target, exclusions and rules * convert regex if required (is a little hack, probably does not work for all rules) * check if target rule applies for http url, and use the rules to rewrite it * add piece of code to check that the domain name has not changed during rewrite (should be rewritten, using publicsuffix instead of little hack) --- searx/webapp.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 6 deletions(-) (limited to 'searx/webapp.py') diff --git a/searx/webapp.py b/searx/webapp.py index 2bf3afaf4..7952415af 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -49,6 +49,9 @@ from searx.languages import language_codes from searx.search import Search from searx.autocomplete import backends as autocomplete_backends +from urlparse import urlparse +import re + static_path, templates_path, themes =\ get_themes(settings['themes_path'] @@ -197,16 +200,53 @@ def index(): if not search.paging and engines[result['engine']].paging: search.paging = True + # check if HTTPS rewrite is required if settings['server']['https_rewrite']\ and result['parsed_url'].scheme == 'http': - for http_regex, https_url in https_rules: - if http_regex.match(result['url']): - result['url'] = http_regex.sub(https_url, result['url']) - # TODO result['parsed_url'].scheme - break + skip_https_rewrite = False + + # check if HTTPS rewrite is possible + for target, rules, exclusions in https_rules: + + # check if target regex match with url + if target.match(result['url']): + # process exclusions + for exclusion in exclusions: + # check if exclusion match with url + if exclusion.match(result['url']): + skip_https_rewrite = True + break + + # skip https rewrite if required + if skip_https_rewrite: + break + + # process rules + for rule in rules: + 
# TODO, precompile rule + p = re.compile(rule[0]) + # rewrite url if possible + new_result_url = p.sub(rule[1], result['url']) + + # parse new url + new_parsed_url = urlparse(new_result_url) + + # continue if nothing was rewritten + if result['url'] == new_result_url: + continue + + # get domainname from result + # TODO, does only work correctly with TLD's like asdf.com, not for asdf.com.de + # TODO, using publicsuffix instead of this rewrite rule + old_result_domainname = '.'.join(result['parsed_url'].hostname.split('.')[-2:]) + new_result_domainname = '.'.join(new_parsed_url.hostname.split('.')[-2:]) + + # check if rewritten hostname is the same, to protect against wrong or malicious rewrite rules + if old_result_domainname == new_result_domainname: + # set new url + result['url'] = new_result_url - # HTTPS rewrite if search.request_data.get('format', 'html') == 'html': if 'content' in result: result['content'] = highlight_content(result['content'], -- cgit v1.2.3 From 0616d26feb0f96b3d4fd6b2744ae0288c9fed96b Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Sun, 14 Sep 2014 14:17:12 +0200 Subject: improve https rewriting --- searx/webapp.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'searx/webapp.py') diff --git a/searx/webapp.py b/searx/webapp.py index 7952415af..d9dc5f710 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -247,6 +247,9 @@ def index(): # set new url result['url'] = new_result_url + # target has matched, do not search over the other rules + break + if search.request_data.get('format', 'html') == 'html': if 'content' in result: result['content'] = highlight_content(result['content'], -- cgit v1.2.3 From bb126da0fb49d1c9640eeb3371d0bbcf005bcd2b Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Wed, 15 Oct 2014 14:47:03 +0200 Subject: improve https rewrite code --- searx/webapp.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'searx/webapp.py') diff --git a/searx/webapp.py b/searx/webapp.py index 
d9dc5f710..25c99d94c 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -224,10 +224,14 @@ def index(): # process rules for rule in rules: - # TODO, precompile rule - p = re.compile(rule[0]) - # rewrite url if possible - new_result_url = p.sub(rule[1], result['url']) + try: + # TODO, precompile rule + p = re.compile(rule[0]) + + # rewrite url if possible + new_result_url = p.sub(rule[1], result['url']) + except: + break # parse new url new_parsed_url = urlparse(new_result_url) -- cgit v1.2.3