[mod] https rewrite pluginification

author: Adam Tauber <asciimoo@gmail.com> 2015-04-13 00:30:12 +0200
committer: Adam Tauber <asciimoo@gmail.com> 2015-04-13 00:30:12 +0200
commit: d2a636f75d24953f5094ea97ab54a8a4353a65ff (patch)
tree: 22da091679dd5b9460391a50810b96b80020e15f /searx/https_rewrite.py
parent: 146928a74980b90de614b71512334ac0a6373048 (diff)
1 files changed, 0 insertions, 209 deletions
diff --git a/searx/https_rewrite.py b/searx/https_rewrite.py
deleted file mode 100644
index 71aec1c9b..000000000
--- a/searx/https_rewrite.py
+++ /dev/null
@@ -1,209 +0,0 @@
-'''
-searx is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-searx is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU Affero General Public License for more details.
-
-You should have received a copy of the GNU Affero General Public License
-along with searx. If not, see < http://www.gnu.org/licenses/ >.
-
-(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
-'''
-
-import re
-from urlparse import urlparse
-from lxml import etree
-from os import listdir
-from os.path import isfile, isdir, join
-from searx import logger
-
-
-logger = logger.getChild("https_rewrite")
-
-# https://gitweb.torproject.org/\
-# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
-
-# HTTPS rewrite rules
-https_rules = []
-
-
-# load single ruleset from a xml file
-def load_single_https_ruleset(filepath):
-    ruleset = ()
-
-    # init parser
-    parser = etree.XMLParser()
-
-    # load and parse xml-file
-    try:
-        tree = etree.parse(filepath, parser)
-    except:
-        # TODO, error message
-        return ()
-
-    # get root node
-    root = tree.getroot()
-
-    # check if root is a node with the name ruleset
-    # TODO improve parsing
-    if root.tag != 'ruleset':
-        return ()
-
-    # check if rule is deactivated by default
-    if root.attrib.get('default_off'):
-        return ()
-
-    # check if rule does only work for specific platforms
-    if root.attrib.get('platform'):
-        return ()
-
-    hosts = []
-    rules = []
-    exclusions = []
-
-    # parse childs from ruleset
-    for ruleset in root:
-        # this child define a target
-        if ruleset.tag == 'target':
-            # check if required tags available
-            if not ruleset.attrib.get('host'):
-                continue
-
-            # convert host-rule to valid regex
-            host = ruleset.attrib.get('host')\
-                .replace('.', '\.').replace('*', '.*')
-
-            # append to host list
-            hosts.append(host)
-
-        # this child define a rule
-        elif ruleset.tag == 'rule':
-            # check if required tags available
-            if not ruleset.attrib.get('from')\
-               or not ruleset.attrib.get('to'):
-                continue
-
-            # TODO hack, which convert a javascript regex group
-            # into a valid python regex group
-            rule_from = ruleset.attrib['from'].replace('$', '\\')
-            if rule_from.endswith('\\'):
-                rule_from = rule_from[:-1]+'$'
-            rule_to = ruleset.attrib['to'].replace('$', '\\')
-            if rule_to.endswith('\\'):
-                rule_to = rule_to[:-1]+'$'
-
-            # TODO, not working yet because of the hack above,
-            # currently doing that in webapp.py
-            # rule_from_rgx = re.compile(rule_from, re.I)
-
-            # append rule
-            try:
-                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
-            except:
-                # TODO log regex error
-                continue
-
-        # this child define an exclusion
-        elif ruleset.tag == 'exclusion':
-            # check if required tags available
-            if not ruleset.attrib.get('pattern'):
-                continue
-
-            exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
-
-            # append exclusion
-            exclusions.append(exclusion_rgx)
-
-    # convert list of possible hosts to a simple regex
-    # TODO compress regex to improve performance
-    try:
-        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
-    except:
-        return ()
-
-    # return ruleset
-    return (target_hosts, rules, exclusions)
-
-
-# load all https rewrite rules
-def load_https_rules(rules_path):
-    # check if directory exists
-    if not isdir(rules_path):
-        logger.error("directory not found: '" + rules_path + "'")
-        return
-
-    # search all xml files which are stored in the https rule directory
-    xml_files = [join(rules_path, f)
-                 for f in listdir(rules_path)
-                 if isfile(join(rules_path, f)) and f[-4:] == '.xml']
-
-    # load xml-files
-    for ruleset_file in xml_files:
-        # calculate rewrite-rules
-        ruleset = load_single_https_ruleset(ruleset_file)
-
-        # skip if no ruleset returned
-        if not ruleset:
-            continue
-
-        # append ruleset
-        https_rules.append(ruleset)
-
-    logger.info('{n} rules loaded'.format(n=len(https_rules)))
-
-
-def https_url_rewrite(result):
-    skip_https_rewrite = False
-    # check if HTTPS rewrite is possible
-    for target, rules, exclusions in https_rules:
-
-        # check if target regex match with url
-        if target.match(result['parsed_url'].netloc):
-            # process exclusions
-            for exclusion in exclusions:
-                # check if exclusion match with url
-                if exclusion.match(result['url']):
-                    skip_https_rewrite = True
-                    break
-
-            # skip https rewrite if required
-            if skip_https_rewrite:
-                break
-
-            # process rules
-            for rule in rules:
-                try:
-                    new_result_url = rule[0].sub(rule[1], result['url'])
-                except:
-                    break
-
-                # parse new url
-                new_parsed_url = urlparse(new_result_url)
-
-                # continiue if nothing was rewritten
-                if result['url'] == new_result_url:
-                    continue
-
-                # get domainname from result
-                # TODO, does only work correct with TLD's like
-                #  asdf.com, not for asdf.com.de
-                # TODO, using publicsuffix instead of this rewrite rule
-                old_result_domainname = '.'.join(
-                    result['parsed_url'].hostname.split('.')[-2:])
-                new_result_domainname = '.'.join(
-                    new_parsed_url.hostname.split('.')[-2:])
-
-                # check if rewritten hostname is the same,
-                # to protect against wrong or malicious rewrite rules
-                if old_result_domainname == new_result_domainname:
-                    # set new url
-                    result['url'] = new_result_url
-
-            # target has matched, do not search over the other rules
-            break
-    return result
author	Adam Tauber <asciimoo@gmail.com>	2015-04-13 00:30:12 +0200
committer	Adam Tauber <asciimoo@gmail.com>	2015-04-13 00:30:12 +0200
commit	d2a636f75d24953f5094ea97ab54a8a4353a65ff (patch)
tree	22da091679dd5b9460391a50810b96b80020e15f /searx/https_rewrite.py
parent	146928a74980b90de614b71512334ac0a6373048 (diff)