diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 82b37f9d1..485edf4a7 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -6,7 +6,7 @@ DuckDuckGo Lite from typing import TYPE_CHECKING import re -from urllib.parse import urlencode +from urllib.parse import urlencode, quote_plus import json import babel import lxml.html @@ -18,12 +18,12 @@ from searx import ( ) from searx.utils import ( eval_xpath, + extr, extract_text, ) from searx.network import get # see https://github.com/searxng/searxng/issues/762 from searx import redisdb from searx.enginelib.traits import EngineTraits -from searx.utils import extr from searx.exceptions import SearxEngineCaptchaException if TYPE_CHECKING: @@ -60,25 +60,25 @@ form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'} __CACHE = [] -def _cache_key(data: dict): - return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{data['q']}//{data['kl']}") +def _cache_key(query, region): + return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{query}//{region}") -def cache_vqd(data: dict, value): +def cache_vqd(query, region, value): """Caches a ``vqd`` value from a query.""" c = redisdb.client() if c: logger.debug("cache vqd value: %s", value) - c.set(_cache_key(data), value, ex=600) + c.set(_cache_key(query, region), value, ex=600) else: logger.debug("MEM cache vqd value: %s", value) if len(__CACHE) > 100: # cache vqd from last 100 queries __CACHE.pop(0) - __CACHE.append((_cache_key(data), value)) + __CACHE.append((_cache_key(query, region), value)) -def get_vqd(data): +def get_vqd(query, region): """Returns the ``vqd`` that fits to the *query* (``data`` from HTTP POST). DDG's bot detection is sensitive to the ``vqd`` value. 
For some search terms @@ -108,7 +108,7 @@ def get_vqd(data): """ - key = _cache_key(data) + key = _cache_key(query, region) value = None c = redisdb.client() if c: @@ -126,6 +126,23 @@ def get_vqd(data): return None +def extract_vqd(query): + """ + Scrape the vqd value matching the query from DuckDuckGo Web. + This function is currently not required for general results, + only for extra results such as images, videos, ... + + Also see ``get_vqd(query, region)`` above. + """ + resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}') + if resp.status_code != 200: + return None + + vqd = extr(resp.text, 'vqd="', '"') + + return vqd + + def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'): """Get DuckDuckGo's language identifier from SearXNG's locale. @@ -313,7 +330,7 @@ def request(query, params): # from here on no more params['data'] shuld be set, since this dict is # needed to get a vqd value from the cache .. - vqd = get_vqd(params['data']) + vqd = get_vqd(query, eng_region) # Certain conditions must be met in order to call up one of the # following pages ... 
@@ -362,7 +379,7 @@ def response(resp): form = form[0] form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0] - cache_vqd(resp.search_params["data"], form_vqd) + cache_vqd(resp.search_params['data']['q'], resp.search_params['data']['kl'], form_vqd) # just select "web-result" and ignore results of class "result--ad result--ad--small" for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'): diff --git a/searx/engines/duckduckgo_extra.py b/searx/engines/duckduckgo_extra.py index b30574d6c..ae85d3bee 100644 --- a/searx/engines/duckduckgo_extra.py +++ b/searx/engines/duckduckgo_extra.py @@ -11,6 +11,8 @@ from searx.utils import get_embeded_stream_url from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import from searx.engines.duckduckgo import ( + cache_vqd, + extract_vqd, get_ddg_lang, get_vqd, ) @@ -48,15 +50,20 @@ search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'} def request(query, params): + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) # request needs a vqd argument - vqd = get_vqd(query) + vqd = get_vqd(query, eng_region) + if not vqd: + vqd = extract_vqd(query) + if vqd: + cache_vqd(query, eng_region, vqd) + if not vqd: # some search terms do not have results and therefore no vqd value params['url'] = None return params - eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) eng_lang = get_ddg_lang(traits, params['searxng_locale']) args = { @@ -86,6 +93,12 @@ def request(query, params): params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}' + # sending these two headers prevents rate limiting for the query + params['headers'] = { + 'Referer': 'https://duckduckgo.com/', + 'X-Requested-With': 'XMLHttpRequest', + } + return params