[fix] duckduckgo extra: crashes and returns no results

This commit is contained in:
Bnyro 2024-11-23 17:56:03 +01:00
parent c4b874e9b0
commit ca81860c0b
2 changed files with 43 additions and 13 deletions

View File

@@ -6,7 +6,7 @@ DuckDuckGo Lite
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
import re import re
from urllib.parse import urlencode from urllib.parse import urlencode, quote_plus
import json import json
import babel import babel
import lxml.html import lxml.html
@@ -18,12 +18,12 @@ from searx import (
) )
from searx.utils import ( from searx.utils import (
eval_xpath, eval_xpath,
extr,
extract_text, extract_text,
) )
from searx.network import get # see https://github.com/searxng/searxng/issues/762 from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx import redisdb from searx import redisdb
from searx.enginelib.traits import EngineTraits from searx.enginelib.traits import EngineTraits
from searx.utils import extr
from searx.exceptions import SearxEngineCaptchaException from searx.exceptions import SearxEngineCaptchaException
if TYPE_CHECKING: if TYPE_CHECKING:
@@ -60,25 +60,25 @@ form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
__CACHE = [] __CACHE = []
def _cache_key(data: dict): def _cache_key(query, region):
return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{data['q']}//{data['kl']}") return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{query}//{region}")
def cache_vqd(data: dict, value): def cache_vqd(query, region, value):
"""Caches a ``vqd`` value from a query.""" """Caches a ``vqd`` value from a query."""
c = redisdb.client() c = redisdb.client()
if c: if c:
logger.debug("cache vqd value: %s", value) logger.debug("cache vqd value: %s", value)
c.set(_cache_key(data), value, ex=600) c.set(_cache_key(query, region), value, ex=600)
else: else:
logger.debug("MEM cache vqd value: %s", value) logger.debug("MEM cache vqd value: %s", value)
if len(__CACHE) > 100: # cache vqd from last 100 queries if len(__CACHE) > 100: # cache vqd from last 100 queries
__CACHE.pop(0) __CACHE.pop(0)
__CACHE.append((_cache_key(data), value)) __CACHE.append((_cache_key(query, region), value))
def get_vqd(data): def get_vqd(query, region):
"""Returns the ``vqd`` that fits to the *query* (``data`` from HTTP POST). """Returns the ``vqd`` that fits to the *query* (``data`` from HTTP POST).
DDG's bot detection is sensitive to the ``vqd`` value. For some search terms DDG's bot detection is sensitive to the ``vqd`` value. For some search terms
@@ -108,7 +108,7 @@ def get_vqd(data):
""" """
key = _cache_key(data) key = _cache_key(query, region)
value = None value = None
c = redisdb.client() c = redisdb.client()
if c: if c:
@@ -126,6 +126,23 @@ def get_vqd(data):
return None return None
def extract_vqd(query):
    """Scrape the ``vqd`` value for *query* from the DuckDuckGo web page.

    General web results currently do not need this function; it is only
    used for the extra result types (images, videos, ...).  See also
    ``get_vqd(query, region)``.

    :param query: the search terms; URL-quoted before being sent.
    :return: ``None`` if the HTTP status is not 200, otherwise the text that
        ``extr`` finds between ``vqd="`` and the following ``"``.
    """
    url = f'https://duckduckgo.com/?q={quote_plus(query)}'
    page = get(url)
    if page.status_code == 200:
        return extr(page.text, 'vqd="', '"')
    return None
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'): def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
"""Get DuckDuckGo's language identifier from SearXNG's locale. """Get DuckDuckGo's language identifier from SearXNG's locale.
@@ -313,7 +330,7 @@ def request(query, params):
# from here on no more params['data'] should be set, since this dict is # from here on no more params['data'] should be set, since this dict is
# needed to get a vqd value from the cache .. # needed to get a vqd value from the cache ..
vqd = get_vqd(params['data']) vqd = get_vqd(query, eng_region)
# Certain conditions must be met in order to call up one of the # Certain conditions must be met in order to call up one of the
# following pages ... # following pages ...
@@ -362,7 +379,7 @@ def response(resp):
form = form[0] form = form[0]
form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0] form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
cache_vqd(resp.search_params["data"], form_vqd) cache_vqd(resp.search_params['data']['q'], resp.search_params['data']['kl'], form_vqd)
# just select "web-result" and ignore results of class "result--ad result--ad--small" # just select "web-result" and ignore results of class "result--ad result--ad--small"
for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'): for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):

View File

@@ -11,6 +11,8 @@ from searx.utils import get_embeded_stream_url
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import ( from searx.engines.duckduckgo import (
cache_vqd,
extract_vqd,
get_ddg_lang, get_ddg_lang,
get_vqd, get_vqd,
) )
@@ -48,15 +50,20 @@ search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'}
def request(query, params): def request(query, params):
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
# request needs a vqd argument # request needs a vqd argument
vqd = get_vqd(query) vqd = get_vqd(query, eng_region)
if not vqd:
vqd = extract_vqd(query)
if vqd:
cache_vqd(query, eng_region, vqd)
if not vqd: if not vqd:
# some search terms do not have results and therefore no vqd value # some search terms do not have results and therefore no vqd value
params['url'] = None params['url'] = None
return params return params
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale']) eng_lang = get_ddg_lang(traits, params['searxng_locale'])
args = { args = {
@@ -86,6 +93,12 @@ def request(query, params):
params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}' params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
# sending these two headers prevents rate limiting for the query
params['headers'] = {
'Referer': 'https://duckduckgo.com/',
'X-Requested-With': 'XMLHttpRequest',
}
return params return params