# searxng/searx/https_rewrite.py

'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

import re
from lxml import etree
from os import listdir
from os.path import isfile, isdir, join


# https://gitweb.torproject.org/\
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules

# HTTPS rewrite rules: one (target_hosts, rules, exclusions) tuple per
# successfully loaded ruleset file, appended by load_https_rules()
https_rules = []


# load single ruleset from a xml file
def load_single_https_ruleset(filepath):
    r"""Parse one HTTPS-Everywhere style ruleset file.

    The root element must be ``<ruleset>``.  Rulesets carrying a non-empty
    ``default_off`` or ``platform`` attribute are ignored.  The children
    are interpreted as follows:

    * ``<target host="...">`` - host pattern; ``.`` is escaped and ``*``
      becomes ``.*`` so the value can be used inside a regex.
    * ``<rule from="..." to="...">`` - rewrite rule.  ``from`` is kept
      untouched (``$`` is a regex anchor there), while JavaScript group
      references in ``to`` (``$1``) are converted to Python ones (``\1``).
    * ``<exclusion pattern="...">`` - regex, compiled as given.

    :param filepath: path of the xml file to load.
    :returns: ``(target_hosts, rules, exclusions)`` where ``target_hosts``
        is a compiled regex matching any target host, ``rules`` is a list
        of ``(from, to)`` string pairs and ``exclusions`` is a list of
        compiled regexes.  An empty tuple is returned if the file cannot
        be parsed, is not an active ruleset, declares no usable target,
        or contains a pattern which does not compile.
    """
    # init parser
    parser = etree.XMLParser()

    # load and parse xml-file
    try:
        tree = etree.parse(filepath, parser)
    except Exception as err:
        # best effort: a broken file only disables this single ruleset.
        # ``Exception`` instead of a bare except keeps KeyboardInterrupt
        # and SystemExit propagating.
        print("[E] unable to parse https ruleset '{0}': {1}"
              .format(filepath, err))
        return ()

    # get root node
    root = tree.getroot()

    # check if root is a node with the name ruleset
    # TODO improve parsing
    if root.tag != 'ruleset':
        return ()

    # check if rule is deactivated by default
    if root.attrib.get('default_off'):
        return ()

    # check if rule does only work for specific platforms
    if root.attrib.get('platform'):
        return ()

    hosts = []
    rules = []
    exclusions = []

    # parse childs from ruleset
    for child in root:
        # this child define a target
        if child.tag == 'target':
            host = child.attrib.get('host')

            # check if required tags available
            if not host:
                continue

            # convert host-rule to valid regex
            hosts.append(host.replace('.', r'\.').replace('*', '.*'))

        # this child define a rule
        elif child.tag == 'rule':
            rule_from = child.attrib.get('from')
            rule_to = child.attrib.get('to')

            # check if required tags available
            if not rule_from or not rule_to:
                continue

            # convert javascript group references ($1) of the replacement
            # into python ones (\1).  ``from`` must stay as it is: a ``$``
            # in there is the end-of-string anchor, not a group reference.
            rule_to = re.sub(r'\$(\d+)', r'\\\1', rule_to)

            # append rule
            rules.append((rule_from, rule_to))

        # this child define an exclusion
        elif child.tag == 'exclusion':
            pattern = child.attrib.get('pattern')

            # check if required tags available
            if not pattern:
                continue

            try:
                exclusion_rgx = re.compile(pattern)
            except (re.error, OverflowError):
                # dropping only the exclusion would rewrite urls the
                # ruleset author wanted to keep, so skip the whole ruleset
                return ()

            # append exclusion
            exclusions.append(exclusion_rgx)

    # without any target the joined regex would be '^()', which matches
    # every host
    if not hosts:
        return ()

    # convert list of possible hosts to a simple regex
    # TODO compress regex to improve performance
    try:
        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
    except (re.error, OverflowError):
        return ()

    # return ruleset
    return (target_hosts, rules, exclusions)


# load all https rewrite rules
def load_https_rules(rules_path):
    """Load every ``*.xml`` ruleset found directly inside *rules_path*.

    Each file is handed to ``load_single_https_ruleset``; non-empty results
    are appended to the module level ``https_rules`` list.  A message is
    printed when the directory does not exist, and the total number of
    collected rulesets is printed once all files have been processed.

    :param rules_path: directory containing the ruleset xml files.
    :returns: ``None``
    """
    # nothing to do without a rule directory
    if not isdir(rules_path):
        print("[E] directory not found: '" + rules_path + "'")
        return

    # walk over the directory content and pick up the xml files
    for entry in listdir(rules_path):
        candidate = join(rules_path, entry)

        # only regular files with a .xml suffix are rulesets
        if not isfile(candidate) or entry[-4:] != '.xml':
            continue

        # calculate rewrite-rules, keep them only if something was returned
        loaded = load_single_https_ruleset(candidate)
        if loaded:
            https_rules.append(loaded)

    print(' * {n} https-rules loaded'.format(n=len(https_rules)))