[refactor] engine: duckduckgo - https://html.duckduckgo.com/html

The entire source code of the duckduckgo engine has been reengineered and cleaned up. 1. DDG uses the URL https://html.duckduckgo.com/html for no-JS requests; its response is also easier to parse than that of the previous https://lite.duckduckgo.com/lite/ URL. 2. The bot detection of DDG has so far caused problems and often led to a CAPTCHA; this can be circumvented by setting `params['headers']['Sec-Fetch-Mode'] = "navigate"`. Closes: https://github.com/searxng/searxng/issues/3927 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-10-22 08:49:34 +02:00 · 2024-10-22 08:49:34 +02:00 · b183e620d8
parent f63f97c56c
commit b183e620d8
1 changed files with 116 additions and 106 deletions
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@ -18,7 +18,6 @@ from searx import (
 )
 from searx.utils import (
    eval_xpath,
    eval_xpath_getindex,
    extract_text,
 )
 from searx.network import get  # see https://github.com/searxng/searxng/issues/762
@ -54,31 +53,33 @@ paging = True
 time_range_support = True
 safesearch = True  # user can't select but the results are filtered
-url = 'https://lite.duckduckgo.com/lite/'
+url = "https://html.duckduckgo.com/html"
 # url_ping = 'https://duckduckgo.com/t/sl_l'
 time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
 form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
 __CACHE = []
-def cache_vqd(query, value):
+def _cache_key(data: dict):
    return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{data['q']}//{data['kl']}")
 def cache_vqd(data: dict, value):
    """Caches a ``vqd`` value from a query."""
    c = redisdb.client()
    if c:
        logger.debug("cache vqd value: %s", value)
-        key = 'SearXNG_ddg_web_vqd' + redislib.secret_hash(query)
+        c.set(_cache_key(data), value, ex=600)
-        c.set(key, value, ex=600)
+
    else:
        logger.debug("MEM cache vqd value: %s", value)
        if len(__CACHE) > 100:  # cache vqd from last 100 queries
            __CACHE.pop(0)
        __CACHE.append((_cache_key(data), value))
-def get_vqd(query):
+def get_vqd(data):
-    """Returns the ``vqd`` that fits to the *query*.  If there is no ``vqd`` cached
+    """Returns the ``vqd`` that fits to the *query* (``data`` from HTTP POST).
    (:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the
    response.
    .. hint::
       If an empty string is returned there are no results for the ``query`` and
       therefore no ``vqd`` value.
    DDG's bot detection is sensitive to the ``vqd`` value.  For some search terms
    (such as extremely long search terms that are often sent by bots), no ``vqd``
@ -106,28 +107,23 @@ def get_vqd(query):
    - DuckDuckGo News: ``https://duckduckgo.com/news.js??q=...&vqd=...``
    """
    key = _cache_key(data)
    value = None
    c = redisdb.client()
    if c:
        key = 'SearXNG_ddg_web_vqd' + redislib.secret_hash(query)
        value = c.get(key)
        if value or value == b'':
            value = value.decode('utf-8')
-            logger.debug("re-use cached vqd value: %s", value)
+            logger.debug("re-use CACHED vqd value: %s", value)
            return value
-    query_url = 'https://duckduckgo.com/?' + urlencode({'q': query})
+    else:
-    res = get(query_url)
+        for k, value in __CACHE:
-    doc = lxml.html.fromstring(res.text)
+            if k == key:
-    for script in doc.xpath("//script[@type='text/javascript']"):
+                logger.debug("MEM re-use CACHED vqd value: %s", value)
        script = script.text
        if 'vqd="' in script:
            value = extr(script, 'vqd="', '"')
            break
    logger.debug("new vqd value: '%s'", value)
    if value is not None:
        cache_vqd(query, value)
                return value
    return None
 def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
@ -155,9 +151,10 @@ def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
    .. hint::
-       `DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language
+       `DDG-lite <https://lite.duckduckgo.com/lite>`__ and the *no Javascript*
-       selection to the user, only a region can be selected by the user
+       page https://html.duckduckgo.com/html do not offer a language selection
-       (``eng_region`` from the example above).  DDG-lite stores the selected
+       to the user, only a region can be selected by the user (``eng_region``
       from the example above).  DDG-lite and *no Javascript* store the selected
       region in a cookie::
         params['cookies']['kl'] = eng_region  # 'ar-es'
@ -241,10 +238,25 @@ def request(query, params):
    query = quote_ddg_bangs(query)
-    # request needs a vqd argument
+    if len(query) >= 500:
-    vqd = get_vqd(query)
+        # DDG does not accept queries with more than 499 chars
        params["url"] = None
        return
    # Advanced search syntax ends in CAPTCHA
    # https://duckduckgo.com/duckduckgo-help-pages/results/syntax/
    query = [
        x.removeprefix("site:").removeprefix("intitle:").removeprefix("inurl:").removeprefix("filetype:")
        for x in query.split()
    ]
    eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
    if eng_region == "wt-wt":
        # https://html.duckduckgo.com/html sets an empty value for "all".
        eng_region = ""
    params['data']['kl'] = eng_region
    params['cookies']['kl'] = eng_region
    # eng_lang = get_ddg_lang(traits, params['searxng_locale'])
    params['url'] = url
@ -252,54 +264,82 @@ def request(query, params):
    params['data']['q'] = query
    # The API is not documented, so we do some reverse engineering and emulate
-    # what https://lite.duckduckgo.com/lite/ does when you press "next Page"
+    # what https://html.duckduckgo.com/html does when you press "next Page" link
-    # link again and again ..
+    # again and again ..
    params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
    params['data']['vqd'] = vqd
-    # initial page does not have an offset
+    params['headers']['Sec-Fetch-Dest'] = "document"
    params['headers']['Sec-Fetch-Mode'] = "navigate"  # at least this one is used by ddg's bot detection
    params['headers']['Sec-Fetch-Site'] = "same-origin"
    params['headers']['Sec-Fetch-User'] = "?1"
    # Form of the initial search page does have empty values in the form
    if params['pageno'] == 1:
        params['data']['b'] = ""
    params['data']['df'] = ''
    if params['time_range'] in time_range_dict:
        params['data']['df'] = time_range_dict[params['time_range']]
        params['cookies']['df'] = time_range_dict[params['time_range']]
    if params['pageno'] == 2:
        # second page does have an offset of 20
        offset = (params['pageno'] - 1) * 20
        params['data']['s'] = offset
        params['data']['dc'] = offset + 1
    elif params['pageno'] > 2:
        # third and following pages do have an offset of 20 + n*50
        offset = 20 + (params['pageno'] - 2) * 50
        params['data']['s'] = offset
        params['data']['dc'] = offset + 1
    # initial page does not have additional data in the input form
    if params['pageno'] > 1:
        # initial page does not have these additional data in the input form
        params['data']['o'] = form_data.get('o', 'json')
        params['data']['api'] = form_data.get('api', 'd.js')
        params['data']['nextParams'] = form_data.get('nextParams', '')
        params['data']['v'] = form_data.get('v', 'l')
-        params['headers']['Referer'] = 'https://lite.duckduckgo.com/'
+        params['headers']['Referer'] = url
-    params['data']['kl'] = eng_region
+        # from here on no more params['data'] should be set, since this dict is
-    params['cookies']['kl'] = eng_region
+        # needed to get a vqd value from the cache ..
-    params['data']['df'] = ''
+        vqd = get_vqd(params['data'])
-    if params['time_range'] in time_range_dict:
+
-        params['data']['df'] = time_range_dict[params['time_range']]
+        # Certain conditions must be met in order to call up one of the
-        params['cookies']['df'] = time_range_dict[params['time_range']]
+        # following pages ...
        if vqd:
            params['data']['vqd'] = vqd  # follow up pages / requests needs a vqd argument
        else:
            # Don't try to call follow up pages without a vqd value.  DDG
            # recognizes this as a request from a bot.  This lowers the
            # reputation of the SearXNG IP and DDG starts to activate CAPTCHAs.
            params["url"] = None
            return
        if params['searxng_locale'].startswith("zh"):
            # Some locales (at least China) do not have a "next page" button and ddg
            # will return a HTTP/2 403 Forbidden for a request of such a page.
            params["url"] = None
            return
    logger.debug("param data: %s", params['data'])
    logger.debug("param cookies: %s", params['cookies'])
    return params
-def detect_ddg_captcha(dom):
+def is_ddg_captcha(dom):
-    """In case of CAPTCHA ddg open its own *not a Robot* dialog and is
+    """In case of CAPTCHA ddg responds with its own *not a Robot* dialog and is not
-    not redirected to CAPTCHA page.
+    redirected to a CAPTCHA page."""
-    """
+
-    if eval_xpath(dom, "//form[@id='challenge-form']"):
+    return bool(eval_xpath(dom, "//form[@id='challenge-form']"))
        # set suspend time to zero is OK --> ddg does not block the IP
        raise SearxEngineCaptchaException(suspended_time=0)
 def response(resp):
@ -309,37 +349,34 @@ def response(resp):
    results = []
    doc = lxml.html.fromstring(resp.text)
    detect_ddg_captcha(doc)
-    result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
+    if is_ddg_captcha(doc):
        # set suspend time to zero is OK --> ddg does not block the IP
        raise SearxEngineCaptchaException(suspended_time=0, message=f"CAPTCHA ({resp.search_params['data'].get('kl')})")
-    if len(result_table) == 2:
+    form = eval_xpath(doc, '//input[@name="vqd"]/..')
        # some locales (at least China) does not have a "next page" button and
        # the layout of the HTML tables is different.
        result_table = result_table[1]
    elif not len(result_table) >= 3:
        # no more results
        return []
    else:
        result_table = result_table[2]
        # update form data from response
        form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..')
    if len(form):
-
+        # some locales (at least China) do not have a "next page" button
        form = form[0]
-            form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0]
+        form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
            form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0]
            form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0]
            logger.debug('form_data: %s', form_data)
-    tr_rows = eval_xpath(result_table, './/tr')
+        cache_vqd(resp.search_params["data"], form_vqd)
    # In the last <tr> is the form of the 'previous/next page' links
    tr_rows = tr_rows[:-1]
-    len_tr_rows = len(tr_rows)
+    # just select "web-result" and ignore results of class "result--ad result--ad--small"
-    offset = 0
+    for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
-    zero_click_info_xpath = '//html/body/form/div/table[2]/tr[2]/td/text()'
+        item = {}
        title = eval_xpath(div_result, './/h2/a')
        if not title:
            # this is the "No results." item in the result list
            continue
        item["title"] = extract_text(title)
        item["url"] = eval_xpath(div_result, './/h2/a/@href')[0]
        item["content"] = extract_text(eval_xpath(div_result, './/a[contains(@class, "result__snippet")]')[0])
        results.append(item)
    zero_click_info_xpath = '//div[@id="zero_click_abstract"]'
    zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip()
    if zero_click and "Your IP address is" not in zero_click and "Your user agent:" not in zero_click:
@ -352,33 +389,6 @@ def response(resp):
            }
        )
    while len_tr_rows >= offset + 4:
        # assemble table rows we need to scrap
        tr_title = tr_rows[offset]
        tr_content = tr_rows[offset + 1]
        offset += 4
        # ignore sponsored Adds <tr class="result-sponsored">
        if tr_content.get('class') == 'result-sponsored':
            continue
        a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None)
        if a_tag is None:
            continue
        td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None)
        if td_content is None:
            continue
        results.append(
            {
                'title': a_tag.text_content(),
                'content': extract_text(td_content),
                'url': a_tag.get('href'),
            }
        )
    return results