diff options
| author | Thomas Pointhuber <thomas.pointhuber@gmx.at> | 2014-09-14 11:09:44 +0200 |
|---|---|---|
| committer | Thomas Pointhuber <thomas.pointhuber@gmx.at> | 2014-10-15 14:05:41 +0200 |
| commit | 9b9f097adbf39d7908931203e9d8966748900bde (patch) | |
| tree | 2b938c027fc9b486f3de551c8e0e4245749d379d /searx/https_rewrite.py | |
| parent | d1d55f2ca41fbaf10a66bfc66d69e0fccf673413 (diff) | |
Implementing https rewrite support #71
* parsing XML-Files which contain target, exclusions and rules
* convert regex if required (this is a small hack and probably does not
work for all rules)
* check whether a target rule applies to an http URL, and use the rules to
rewrite it
* add a piece of code to check that the domain name has not changed during
rewrite (should be rewritten using publicsuffix instead of this small hack)
Diffstat (limited to 'searx/https_rewrite.py')
| -rw-r--r-- | searx/https_rewrite.py | 141 |
1 files changed, 133 insertions, 8 deletions
diff --git a/searx/https_rewrite.py b/searx/https_rewrite.py index 44ada9450..814eda2de 100644 --- a/searx/https_rewrite.py +++ b/searx/https_rewrite.py @@ -1,14 +1,139 @@ +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. + +(C) 2013- by Adam Tauber, <asciimoo@gmail.com> +''' + import re +from lxml import etree +from os import listdir +from os.path import isfile, join + # https://gitweb.torproject.org/\ # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules # HTTPS rewrite rules -https_rules = ( - # from - (re.compile(r'^http://(www\.|m\.|)?xkcd\.(?:com|org)/', re.I | re.U), - # to - r'https://\1xkcd.com/'), - (re.compile(r'^https?://(?:ssl)?imgs\.xkcd\.com/', re.I | re.U), - r'https://sslimgs.xkcd.com/'), -) +https_rules = [] + + +# load single ruleset from a xml file +def load_single_https_ruleset(filepath): + ruleset = () + + # init parser + parser = etree.XMLParser() + + # load and parse xml-file + try: + tree = etree.parse(filepath, parser) + except: + # TODO, error message + return () + + # get root node + root = tree.getroot() + + #print(etree.tostring(tree)) + + # check if root is a node with the name ruleset + # TODO improve parsing + if root.tag != 'ruleset': + return () + + # check if rule is deactivated by default + if root.attrib.get('default_off'): + return () + + # check if rule does only work for specific platforms + if root.attrib.get('platform'): + return () + + hosts = [] 
+ rules = [] + exclusions = [] + + # parse childs from ruleset + for ruleset in root: + # this child define a target + if ruleset.tag == 'target': + # check if required tags available + if not ruleset.attrib.get('host'): + continue + + # convert host-rule to valid regex + host = ruleset.attrib.get('host').replace('.', '\.').replace('*', '.*') + + # append to host list + hosts.append(host) + + # this child define a rule + elif ruleset.tag == 'rule': + # check if required tags available + if not ruleset.attrib.get('from')\ + or not ruleset.attrib.get('to'): + continue + + # TODO hack, which convert a javascript regex group into a valid python regex group + rule_from = ruleset.attrib.get('from').replace('$', '\\') + rule_to = ruleset.attrib.get('to').replace('$', '\\') + + # TODO, not working yet because of the hack above, currently doing that in webapp.py + #rule_from_rgx = re.compile(rule_from, re.I) + + # append rule + rules.append((rule_from, rule_to)) + + # this child define an exclusion + elif ruleset.tag == 'exclusion': + # check if required tags available + if not ruleset.attrib.get('pattern'): + continue + + exclusion_rgx = re.compile(ruleset.attrib.get('pattern')) + + # append exclusion + exclusions.append(exclusion_rgx) + + # convert list of possible hosts to a simple regex + # TODO compress regex to improve performance + try: + target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U) + except: + return () + + # return ruleset + return (target_hosts, rules, exclusions) + + +# load all https rewrite rules +def load_https_rules(rules_path): + # add / to path if not set yet + if rules_path[-1:] != '/': + rules_path += '/' + + # search all xml files which are stored in the https rule directory + xml_files = [ join(rules_path,f) for f in listdir(rules_path) if isfile(join(rules_path,f)) and f[-4:] == '.xml' ] + + # load xml-files + for ruleset_file in xml_files: + # calculate rewrite-rules + ruleset = load_single_https_ruleset(ruleset_file) + + # 
skip if no ruleset returned + if not ruleset: + continue + + # append ruleset + https_rules.append(ruleset) |