Implementing https rewrite support #71

* parse XML files which contain targets, exclusions and rules * convert regexes if required (this is a small hack and probably does not work for all rules) * check whether a target rule applies to an http URL, and use the rules to rewrite it * add a piece of code to check that the domain name has not changed during the rewrite (should be rewritten to use publicsuffix instead of this small hack)
author: Thomas Pointhuber <thomas.pointhuber@gmx.at> 2014-09-14 11:09:44 +0200
committer: Thomas Pointhuber <thomas.pointhuber@gmx.at> 2014-10-15 14:05:41 +0200
commit: 9b9f097adbf39d7908931203e9d8966748900bde (patch)
tree: 2b938c027fc9b486f3de551c8e0e4245749d379d /searx/https_rewrite.py
parent: d1d55f2ca41fbaf10a66bfc66d69e0fccf673413 (diff)
1 files changed, 133 insertions, 8 deletions
diff --git a/searx/https_rewrite.py b/searx/https_rewrite.py
index 44ada9450..814eda2de 100644
--- a/searx/https_rewrite.py
+++ b/searx/https_rewrite.py
@@ -1,14 +1,139 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
+'''
+
 import re
+from lxml import etree
+from os import listdir
+from os.path import isfile, join
+
 
 # https://gitweb.torproject.org/\
 # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
 
 # HTTPS rewrite rules
-https_rules = (
-    # from
-    (re.compile(r'^http://(www\.|m\.|)?xkcd\.(?:com|org)/', re.I | re.U),
-     # to
-     r'https://\1xkcd.com/'),
-    (re.compile(r'^https?://(?:ssl)?imgs\.xkcd\.com/', re.I | re.U),
-     r'https://sslimgs.xkcd.com/'),
-)
+https_rules = []
+
+
+def load_single_https_ruleset(filepath):
+    """Load a single HTTPS rewrite ruleset from an XML file.
+
+    Returns a tuple ``(target_hosts, rules, exclusions)``:
+
+    * ``target_hosts`` -- compiled regex built from all ``<target host>``
+      entries (``*`` acts as a wildcard),
+    * ``rules`` -- list of ``(from_pattern, to_replacement)`` string pairs
+      taken from the ``<rule>`` entries,
+    * ``exclusions`` -- list of compiled regexes taken from the
+      ``<exclusion pattern>`` entries.
+
+    An empty tuple is returned when the file cannot be read or parsed, the
+    root element is not ``ruleset``, the ruleset is marked ``default_off``
+    or is platform specific, no usable target is declared, or a target or
+    exclusion pattern is not a valid Python regex.
+    """
+    # load and parse xml-file
+    try:
+        tree = etree.parse(filepath, etree.XMLParser())
+    except (etree.LxmlError, IOError):
+        # TODO, error message
+        return ()
+
+    root = tree.getroot()
+
+    # TODO improve parsing
+    if root.tag != 'ruleset':
+        return ()
+
+    # skip rulesets which are deactivated by default or which only work
+    # for specific platforms
+    if root.attrib.get('default_off') or root.attrib.get('platform'):
+        return ()
+
+    hosts = []
+    rules = []
+    exclusions = []
+
+    # parse children of the ruleset
+    for child in root:
+        if child.tag == 'target':
+            host = child.attrib.get('host')
+            if not host:
+                continue
+
+            # escape every regex metacharacter of the host name, then turn
+            # the (now escaped) '*' wildcard into '.*'
+            hosts.append(re.escape(host).replace('\\*', '.*'))
+
+        elif child.tag == 'rule':
+            rule_from = child.attrib.get('from')
+            rule_to = child.attrib.get('to')
+            if not rule_from or not rule_to:
+                continue
+
+            # In the pattern '$' is the end-of-string anchor and is valid
+            # Python regex syntax as it is, so 'from' is kept unchanged.
+            # Only the javascript group references ('$1') of the
+            # replacement have to become python ones ('\1').
+            rule_to = re.sub(r'\$(\d)', r'\\\1', rule_to)
+
+            # rules are kept as strings, compiling is done in webapp.py
+            rules.append((rule_from, rule_to))
+
+        elif child.tag == 'exclusion':
+            pattern = child.attrib.get('pattern')
+            if not pattern:
+                continue
+
+            # A ruleset whose exclusion cannot be honoured must not be
+            # applied at all, so the whole ruleset is discarded.
+            try:
+                exclusions.append(re.compile(pattern))
+            except re.error:
+                return ()
+
+    # without any target the regex below would be '^()', which matches
+    # every input
+    if not hosts:
+        return ()
+
+    # convert list of possible hosts to a simple regex
+    # TODO compress regex to improve performance
+    try:
+        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
+    except re.error:
+        return ()
+
+    return (target_hosts, rules, exclusions)
+
+
+def load_https_rules(rules_path):
+    """Load all ``*.xml`` rulesets found in ``rules_path``.
+
+    Every file which yields a ruleset is appended to the module level
+    ``https_rules`` list; files for which no ruleset is returned are
+    skipped. Nothing is loaded when ``rules_path`` is empty.
+    """
+    # an empty path must not end up scanning some unrelated directory
+    if not rules_path:
+        return
+
+    # search all xml files which are stored in the https rule directory;
+    # sorted, so the order of https_rules does not depend on listdir()
+    xml_files = [join(rules_path, name)
+                 for name in sorted(listdir(rules_path))
+                 if name.endswith('.xml')
+                 and isfile(join(rules_path, name))]
+
+    for ruleset_file in xml_files:
+        ruleset = load_single_https_ruleset(ruleset_file)
+
+        # an empty tuple means that the file did not provide a ruleset
+        if ruleset:
+            https_rules.append(ruleset)
author	Thomas Pointhuber <thomas.pointhuber@gmx.at>	2014-09-14 11:09:44 +0200
committer	Thomas Pointhuber <thomas.pointhuber@gmx.at>	2014-10-15 14:05:41 +0200
commit	9b9f097adbf39d7908931203e9d8966748900bde (patch)
tree	2b938c027fc9b486f3de551c8e0e4245749d379d /searx/https_rewrite.py
parent	d1d55f2ca41fbaf10a66bfc66d69e0fccf673413 (diff)