[enh] Add onions category with Ahmia, Not Evil and Torch

Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines might have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy to True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for the extra time taken by Tor (or whatever proxy is used).
2026-07-30 03:41:25 +00:00 · 2016-05-19 00:38:43 -05:00
parent 0a44fa8bb7
commit c3daa08537
11 changed files with 399 additions and 14 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ setup.cfg
 *.pyc
 */*.pyc
 *~
 *.swp
 /node_modules
--- a/searx/engines/init.py
+++ b/searx/engines/init.py
@@ -142,6 +142,17 @@ def load_engine(engine_data):
        engine.stats['page_load_time'] = 0
        engine.stats['page_load_count'] = 0
    # tor related settings
    if settings['outgoing'].get('using_tor_proxy'):
        # use onion url if using tor.
        if hasattr(engine, 'onion_url'):
            engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')
    elif 'onions' in engine.categories:
        # exclude onion engines if not using tor.
        return None
    engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)
    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)
@@ -252,8 +263,9 @@ def get_engines_stats(preferences):
 def load_engines(engine_list):
-    global engines
+    global engines, engine_shortcuts
    engines.clear()
    engine_shortcuts.clear()
    for engine_data in engine_list:
        engine = load_engine(engine_data)
        if engine is not None:
--- a/searx/engines/ahmia.py
+++ b/searx/engines/ahmia.py
@@ -0,0 +1,82 @@
 """
 Ahmia (Onions)
 @website      http://msydqstlz2kzerdg.onion
 @provides-api no
 @using-api    no
 @results      HTML
 @stable       no
 @parse        url, title, content
 """
 from urllib.parse import urlencode, urlparse, parse_qs
 from lxml.html import fromstring
 from searx.engines.xpath import extract_url, extract_text
 # engine config
 categories = ['onions']
 paging = True
 page_size = 10
 # search url
 search_url = 'http://msydqstlz2kzerdg.onion/search/?{query}'
 time_range_support = True
 time_range_dict = {'day': 1,
                   'week': 7,
                   'month': 30}
 # xpaths
 results_xpath = '//li[@class="result"]'
 url_xpath = './h4/a/@href'
 title_xpath = './h4/a[1]'
 content_xpath = './/p[1]'
 correction_xpath = '//*[@id="didYouMean"]//a'
 number_of_results_xpath = '//*[@id="totalResults"]'
 def request(query, params):
    """Store the Ahmia search URL for ``query`` in ``params['url']``.

    When ``params['time_range']`` is a key of ``time_range_dict``, the
    matching number of days is appended as the ``d`` query argument.
    Returns the updated ``params``.
    """
    params['url'] = search_url.format(query=urlencode({'q': query}))

    # unsupported or missing time ranges simply leave the URL untouched
    days = time_range_dict.get(params['time_range'])
    if days is not None:
        params['url'] = '{}&{}'.format(params['url'], urlencode({'d': days}))

    return params
 def response(resp):
    """Parse an Ahmia result page into a list of searx result dicts.

    The list holds the results of the requested page (``page_size`` entries,
    selected with ``resp.search_params['pageno']``), followed by any spelling
    corrections and, when it parses as an integer, the total number of
    results.
    """
    results = []
    dom = fromstring(resp.text)

    # trim results so there's not way too many at once: request() sends no
    # page argument, so the requested page is sliced out of the full list
    first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
    all_results = dom.xpath(results_xpath)
    trimmed_results = all_results[first_result_index:first_result_index + page_size]

    # get results
    for result in trimmed_results:
        # remove ahmia url and extract the actual url for the result
        raw_url = extract_url(result.xpath(url_xpath), search_url)
        cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]

        title = extract_text(result.xpath(title_xpath))
        content = extract_text(result.xpath(content_xpath))

        results.append({'url': cleaned_url,
                        'title': title,
                        'content': content,
                        'is_onion': True})

    # get spelling corrections
    for correction in dom.xpath(correction_xpath):
        results.append({'correction': extract_text(correction)})

    # get number of results
    number_of_results = dom.xpath(number_of_results_xpath)
    if number_of_results:
        try:
            results.append({'number_of_results': int(extract_text(number_of_results))})
        except (TypeError, ValueError):
            # the counter is optional: ignore text that is not a plain
            # integer, without also swallowing KeyboardInterrupt/SystemExit
            # the way a bare ``except:`` would
            pass

    return results
--- a/searx/engines/not_evil.py
+++ b/searx/engines/not_evil.py
@@ -0,0 +1,64 @@
 """
 not Evil (Onions)
 @website     http://hss3uro2hsxfogfq.onion
 @provide-api yes (http://hss3uro2hsxfogfq.onion/api.htm)
 @using-api   no
 @results     HTML
 @stable      no
 @parse       url, title, content
 """
 from urllib.parse import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
 # engine dependent config
 categories = ['onions']
 paging = True
 page_size = 20
 # search-url
 base_url = 'http://hss3uro2hsxfogfq.onion/'
 search_url = 'index.php?{query}&hostLimit=20&start={pageno}&numRows={page_size}'
 # specific xpath variables
 results_xpath = '//*[@id="content"]/div/p'
 url_xpath = './span[1]'
 title_xpath = './a[1]'
 content_xpath = './text()'
 # do search-request
 def request(query, params):
    """Store the not Evil search URL for ``query`` in ``params['url']``.

    The ``start`` argument is a result offset computed from the 1-based
    ``params['pageno']`` and ``page_size``. Returns the updated ``params``.
    """
    start = page_size * (params['pageno'] - 1)
    relative_url = search_url.format(query=urlencode({'q': query}),
                                     pageno=start,
                                     page_size=page_size)
    params['url'] = base_url + relative_url
    return params
 # get response from search-request
 def response(resp):
    """Parse a not Evil result page into a list of searx result dicts.

    Every node matched by ``results_xpath`` yields one dict with ``url``,
    ``title``, ``content`` and ``is_onion`` set to True. A matched node
    without a URL or title element raises IndexError.
    """
    # needed because otherwise requests guesses wrong encoding
    resp.encoding = 'utf8'
    dom = html.fromstring(resp.text)

    def build_result(node):
        # url and title come from the first matching element of the node
        return {'url': extract_text(node.xpath(url_xpath)[0]),
                'title': extract_text(node.xpath(title_xpath)[0]),
                'content': extract_text(node.xpath(content_xpath)),
                'is_onion': True}

    return [build_result(node) for node in dom.xpath(results_xpath)]
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -10,6 +10,8 @@ thumbnail_xpath = False
 paging = False
 suggestion_xpath = ''
 results_xpath = ''
 cached_xpath = ''
 cached_url = ''
 # parameters for engines with paging support
 #
@@ -36,6 +38,8 @@ def request(query, params):
 def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    is_onion = True if 'onions' in categories else False
    if results_xpath:
        for result in eval_xpath(dom, results_xpath):
            url = extract_url(eval_xpath(result, url_xpath), search_url)
@@ -49,15 +53,33 @@ def response(resp):
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)
            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = cached_url + extract_text(result.xpath(cached_xpath))
            if is_onion:
                tmp_result['is_onion'] = True
            results.append(tmp_result)
    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                (extract_url(x, search_url) for
                 x in dom.xpath(url_xpath)),
                map(extract_text, dom.xpath(title_xpath)),
                map(extract_text, dom.xpath(content_xpath)),
                map(extract_text, dom.xpath(cached_xpath))
            ):
                results.append({'url': url, 'title': title, 'content': content,
                                'cached_url': cached_url + cached, 'is_onion': is_onion})
        else:
            for url, title, content in zip(
                (extract_url(x, search_url) for
-             x in eval_xpath(dom, url_xpath)),
+                 x in dom.xpath(url_xpath)),
-            map(extract_text, eval_xpath(dom, title_xpath)),
+                map(extract_text, dom.xpath(title_xpath)),
-            map(extract_text, eval_xpath(dom, content_xpath))
+                map(extract_text, dom.xpath(content_xpath))
            ):
-            results.append({'url': url, 'title': title, 'content': content})
+                results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})
    if not suggestion_xpath:
        return results
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -60,8 +60,10 @@ outgoing: # communication with search engines
 # see http://docs.python-requests.org/en/latest/user/advanced/#proxies
 # SOCKS proxies are also supported: see http://requests.readthedocs.io/en/master/user/advanced/#socks
 #    proxies :
-#        http : http://127.0.0.1:8080
+#        http : socks5h://127.0.0.1:9050
-#        https: http://127.0.0.1:8080
+#        https: socks5h://127.0.0.1:9050
 #    using_tor_proxy : True
 #    extra_proxy_timeout : 10.0 # Extra seconds to add in order to account for the time taken by the proxy
 # uncomment below section only if you have more than one network interface
 # which can be the source of outgoing search requests
 #    source_ips:
@@ -89,6 +91,12 @@ engines:
    shortcut: apkm
    disabled: True
 # Requires Tor
  - name : ahmia
    engine : ahmia
    categories : onions
    shortcut : ah
  - name : arch linux wiki
    engine : archlinux
    shortcut : al
@@ -185,7 +193,7 @@ engines:
  - name : deviantart
    engine : deviantart
    shortcut : da
-    timeout: 3.0
+    timeout : 3.0
  - name : ddg definitions
    engine : duckduckgo_definitions
@@ -514,6 +522,11 @@ engines:
    timeout: 5.0
    shortcut : npm
 # Requires Tor
  - name : not evil
    engine : not_evil
    shortcut : ne
  - name : nyaa
    engine : nyaa
    shortcut : nt
@@ -698,6 +711,18 @@ engines:
    url: https://torrentz2.eu/
    timeout : 3.0
 # Requires Tor
  - name : torch
    engine : xpath
    paging : True
    search_url : http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
    results_xpath : //table//tr
    url_xpath : ./td[2]/a
    title_xpath : ./td[2]/b
    content_xpath : ./td[2]/small
    categories : onions
    shortcut : tch
  - name : twitter
    engine : twitter
    shortcut : tw
--- a/searx/templates/legacy/result_templates/default.html
+++ b/searx/templates/legacy/result_templates/default.html
@@ -1,6 +1,11 @@
 <div class="result {{ result.class }}{% for e in result.engines %} {{ e }}{% endfor %}">
    <h3 class="result_title">{% if "icon_"~result.engine~".ico" in favicons %}<img width="14" height="14" class="favicon" src="{{ url_for('static', filename='img/icons/icon_'+result.engine+'.ico') }}" alt="{{result.engine}}" />{% endif %}<a href="{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ result.title|safe }}</a></h3>
-    <p class="url">{{ result.pretty_url }}&lrm; <a class="cache_link" href="https://web.archive.org/web/{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
+    <p class="url">{{ result.pretty_url }}&lrm;
    {% if result.cached_url %}
        <a class="cache_link" href="{{ result.cached_url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
    {% elif not result.is_onion %}
        <a class="cache_link" href="https://web.archive.org/web/{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
    {% endif %}
    {% if result.publishedDate %}<span class="published_date">{{ result.publishedDate }}</span>{% endif %}</p>
    <p class="content">{% if result.img_src %}<img src="{{ image_proxify(result.img_src) }}" class="image" />{% endif %}{% if result.content %}{{ result.content|safe }}<br class="last"/>{% endif %}</p>
 </div>
--- a/searx/templates/oscar/macros.html
+++ b/searx/templates/oscar/macros.html
@@ -32,7 +32,11 @@
            <span class="label label-default">{{ engine }}</span>
        {%- endfor -%}
        {%- if result.url -%}
            {% if result.cached_url %}
            <small>{{ result_link(result.cached_url, icon('link') + _('cached'), "text-info", id) }}</small>
            {% elif not result.is_onion %}
            <small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
            {% endif %}
        {%- endif -%}
        {%- if proxify -%}
        <small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}</small>
@@ -50,7 +54,11 @@
        <span class="label label-default">{{ engine }}</span>
    {%- endfor %}
    {%- if result.url -%}
        {% if result.cached_url %}
        <small>{{ result_link(result.cached_url, icon('link') + _('cached'), "text-info", id) }}</small>
        {% elif not result.is_onion %}
        <small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
        {% endif %}
    {%- endif -%}
    {% if proxify -%}
    <small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}</small>
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -146,6 +146,7 @@ _category_names = (gettext('files'),
                   gettext('it'),
                   gettext('news'),
                   gettext('map'),
                   gettext('onions'),
                   gettext('science'))
 outgoing_proxies = settings['outgoing'].get('proxies') or None
--- a/tests/unit/engines/test_xpath.py
+++ b/tests/unit/engines/test_xpath.py
@@ -0,0 +1,121 @@
 # -*- coding: utf-8 -*-
 from collections import defaultdict
 import mock
 from searx.engines import xpath
 from searx.testing import SearxTestCase
 class TestXpathEngine(SearxTestCase):
    """Unit tests for the generic xpath engine (request building and parsing).

    The engine is configured through module-level attributes, so every test
    sets the attributes it depends on before calling into the module.
    """

    def test_request(self):
        """request() fills the query (and page number when paging) into the url."""
        xpath.search_url = 'https://url.com/{query}'
        xpath.categories = []
        xpath.paging = False
        query = 'test_query'
        dicto = defaultdict(dict)
        params = xpath.request(query, dicto)
        self.assertIn('url', params)
        # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12
        self.assertEqual('https://url.com/test_query', params['url'])

        xpath.search_url = 'https://url.com/q={query}&p={pageno}'
        xpath.paging = True
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 1
        params = xpath.request(query, dicto)
        self.assertIn('url', params)
        self.assertEqual('https://url.com/q=test_query&p=1', params['url'])

    def test_response(self):
        """response() parses results with and without results_xpath, cache links and onions."""
        # reset the module-level switches this test relies on, so that the
        # outcome does not depend on which test configured the engine before
        xpath.results_xpath = ''
        xpath.cached_xpath = ''
        xpath.categories = []

        # without results_xpath
        xpath.url_xpath = '//div[@class="search_result"]//a[@class="result"]/@href'
        xpath.title_xpath = '//div[@class="search_result"]//a[@class="result"]'
        xpath.content_xpath = '//div[@class="search_result"]//p[@class="content"]'

        # anything without a ``text`` attribute is rejected
        self.assertRaises(AttributeError, xpath.response, None)
        self.assertRaises(AttributeError, xpath.response, [])
        self.assertRaises(AttributeError, xpath.response, '')
        self.assertRaises(AttributeError, xpath.response, '[]')

        response = mock.Mock(text='<html></html>')
        self.assertEqual(xpath.response(response), [])

        html = u"""
        <div>
            <div class="search_result">
                <a class="result" href="https://result1.com">Result 1</a>
                <p class="content">Content 1</p>
                <a class="cached" href="https://cachedresult1.com">Cache</a>
            </div>
            <div class="search_result">
                <a class="result" href="https://result2.com">Result 2</a>
                <p class="content">Content 2</p>
                <a class="cached" href="https://cachedresult2.com">Cache</a>
            </div>
        </div>
        """
        response = mock.Mock(text=html)
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['title'], 'Result 1')
        self.assertEqual(results[0]['url'], 'https://result1.com/')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[1]['title'], 'Result 2')
        self.assertEqual(results[1]['url'], 'https://result2.com/')
        self.assertEqual(results[1]['content'], 'Content 2')

        # with cached urls, without results_xpath
        xpath.cached_xpath = '//div[@class="search_result"]//a[@class="cached"]/@href'
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com')
        self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com')
        self.assertFalse(results[0].get('is_onion', False))

        # results are onion urls (no results_xpath)
        xpath.categories = ['onions']
        results = xpath.response(response)
        self.assertTrue(results[0]['is_onion'])

        # with results_xpath
        xpath.results_xpath = '//div[@class="search_result"]'
        xpath.url_xpath = './/a[@class="result"]/@href'
        xpath.title_xpath = './/a[@class="result"]'
        xpath.content_xpath = './/p[@class="content"]'
        xpath.cached_xpath = None
        xpath.categories = []

        self.assertRaises(AttributeError, xpath.response, None)
        self.assertRaises(AttributeError, xpath.response, [])
        self.assertRaises(AttributeError, xpath.response, '')
        self.assertRaises(AttributeError, xpath.response, '[]')

        response = mock.Mock(text='<html></html>')
        self.assertEqual(xpath.response(response), [])

        response = mock.Mock(text=html)
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['title'], 'Result 1')
        self.assertEqual(results[0]['url'], 'https://result1.com/')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[1]['title'], 'Result 2')
        self.assertEqual(results[1]['url'], 'https://result2.com/')
        self.assertEqual(results[1]['content'], 'Content 2')

        # with cached urls, with results_xpath
        xpath.cached_xpath = './/a[@class="cached"]/@href'
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com')
        self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com')
        self.assertFalse(results[0].get('is_onion', False))

        # results are onion urls (with results_xpath)
        xpath.categories = ['onions']
        results = xpath.response(response)
        self.assertTrue(results[0]['is_onion'])
--- a/tests/unit/test_engines_init.py
+++ b/tests/unit/test_engines_init.py
@@ -0,0 +1,44 @@
 from searx.testing import SearxTestCase
 from searx import settings, engines
 class TestEnginesInit(SearxTestCase):
    """Tests for engine initialisation with and without the Tor settings."""

    @classmethod
    def tearDownClass(cls):
        # put the Tor related settings back to "off" once the class is done
        outgoing = settings['outgoing']
        outgoing['using_tor_proxy'] = False
        outgoing['extra_proxy_timeout'] = 0

    def test_initialize_engines_default(self):
        """Two plain engines are both registered."""
        specs = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1'},
                 {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2'}]

        engines.initialize_engines(specs)

        self.assertEqual(len(engines.engines), 2)
        for name in ('engine1', 'engine2'):
            self.assertIn(name, engines.engines)

    def test_initialize_engines_exclude_onions(self):
        """Without the Tor proxy, engines of the onions category are left out."""
        settings['outgoing']['using_tor_proxy'] = False
        specs = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1', 'categories': 'general'},
                 {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2', 'categories': 'onions'}]

        engines.initialize_engines(specs)

        self.assertEqual(len(engines.engines), 1)
        self.assertIn('engine1', engines.engines)
        self.assertNotIn('onions', engines.categories)

    def test_initialize_engines_include_onions(self):
        """With the Tor proxy, onion engines load, onion_url is used and the timeout grows."""
        settings['outgoing']['using_tor_proxy'] = True
        settings['outgoing']['extra_proxy_timeout'] = 100.0
        specs = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1', 'categories': 'general',
                  'timeout': 20.0, 'onion_url': 'http://engine1.onion'},
                 {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2', 'categories': 'onions'}]

        engines.initialize_engines(specs)

        self.assertEqual(len(engines.engines), 2)
        for name in ('engine1', 'engine2'):
            self.assertIn(name, engines.engines)
        self.assertIn('onions', engines.categories)

        first_engine = engines.engines['engine1']
        self.assertIn('http://engine1.onion', first_engine.search_url)
        # 20.0 configured on the engine + 100.0 extra_proxy_timeout
        self.assertEqual(first_engine.timeout, 120.0)