Compare commits

...

4 Commits

Author SHA1 Message Date
Grant Lanham Jr dc728ed705
Merge 23bf9f9d95 into 10d3af84b8 2024-11-20 08:28:28 +08:00
Markus Heiser 10d3af84b8 [fix] engine: duckduckgo - don't quote query string
The query string sent to DDG must not be quoted.

The query string was URL-quoted in #4011, but the URL-quoted query string results
in unexpected *URL decoded* and other garbage results, as reported in #4019
and #4020.  To test, compare the results of queries like::

    !ddg Häuser und Straßen :de
    !ddg Häuser und Straßen :all
    !ddg 房屋和街道 :all
    !ddg 房屋和街道 :zh

Closed:

- [#4019] https://github.com/searxng/searxng/issues/4019
- [#4020] https://github.com/searxng/searxng/issues/4020

Related:

- [#4011] https://github.com/searxng/searxng/pull/4011

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-17 18:14:22 +01:00
Grant Lanham 23bf9f9d95 add base http params 2024-04-25 22:58:19 -04:00
Grant Lanham 2b48584bb4 Fix bing page numbering, add sc to parameter, minor refactor
Bing page numbering doesn't increase by 10 each time. The first page returns 10 results, and all pages thereafter return 14 results. This means we need to update the page numbering, so the offset sent to Bing matches the actual number of results per page.

Next, the 'sc' parameter, whatever it means, needs to be present in order to not return the same results.

Finally, the code to check the page had some duplicate checks, so I refactored the code in this section, which is low-risk.
2024-04-23 21:03:51 -04:00
2 changed files with 19 additions and 8 deletions

View File

@ -95,17 +95,24 @@ def request(query, params):
# don't ask why it is only sometimes / its M$ and they have never been # don't ask why it is only sometimes / its M$ and they have never been
# deterministic ;) # deterministic ;)
'pq': query, 'pq': query,
# TODO: Figure out how below parameters are populated
'sc': '0-0',
"sp": "-1",
"lq": "0",
"qs": "n",
"ghsh": "0",
"ghacc": "0",
"ghpl": "",
} }
# To get correct page, arg first and this arg FORM is needed, the value PERE # To get correct page, arg first and this arg FORM is needed, the value PERE
# is on page 2, on page 3 its PERE1 and on page 4 its PERE2 .. and so forth. # is on page 2, on page 3 its PERE1 and on page 4 its PERE2 .. and so forth.
# The 'first' arg should never send on page 1. # The 'first' arg should never send on page 1.
if page > 1: if page > 1:
query_params['first'] = _page_offset(page) # see also arg FORM query_params['first'] = _page_offset(page) # see also arg FORM
if page == 2: if page == 2:
query_params['FORM'] = 'PERE' query_params['FORM'] = 'PERE'
elif page > 2: else: # page > 2:
query_params['FORM'] = 'PERE%s' % (page - 2) query_params['FORM'] = 'PERE%s' % (page - 2)
params['url'] = f'{base_url}?{urlencode(query_params)}' params['url'] = f'{base_url}?{urlencode(query_params)}'

View File

@ -6,7 +6,7 @@ DuckDuckGo Lite
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
import re import re
from urllib.parse import urlencode, quote_plus from urllib.parse import urlencode
import json import json
import babel import babel
import lxml.html import lxml.html
@ -263,7 +263,7 @@ def request(query, params):
params['url'] = url params['url'] = url
params['method'] = 'POST' params['method'] = 'POST'
params['data']['q'] = quote_plus(query) params['data']['q'] = query
# The API is not documented, so we do some reverse engineering and emulate # The API is not documented, so we do some reverse engineering and emulate
# what https://html.duckduckgo.com/html does when you press "next Page" link # what https://html.duckduckgo.com/html does when you press "next Page" link
@ -381,7 +381,11 @@ def response(resp):
zero_click_info_xpath = '//div[@id="zero_click_abstract"]' zero_click_info_xpath = '//div[@id="zero_click_abstract"]'
zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip() zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip()
if zero_click and "Your IP address is" not in zero_click and "Your user agent:" not in zero_click: if zero_click and (
"Your IP address is" not in zero_click
and "Your user agent:" not in zero_click
and "URL Decoded:" not in zero_click
):
current_query = resp.search_params["data"].get("q") current_query = resp.search_params["data"].get("q")
results.append( results.append(