From f1f5e69c425389a5cb7e7a437b3a39c0d7513022 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 5 Jan 2022 13:00:52 +0100 Subject: [PATCH 1/6] [fix] startpage engine - avoid captcha Startpage has introduced new anti-scraping measures that make SearXNG instances run into captchas: 1. some arguments have been removed and a new `sc` argument has been added. 2. search path changed from `do/search` to `sp/search` 3. POST request is no longer needed Closes: https://github.com/searxng/searxng/issues/692 Signed-off-by: Markus Heiser --- searx/engines/startpage.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 97891921c..1fd259dad 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -3,6 +3,8 @@ Startpage (Web) """ +from urllib.parse import urlencode + from lxml import html from dateutil import parser from datetime import datetime, timedelta @@ -33,7 +35,7 @@ supported_languages_url = 'https://www.startpage.com/do/settings' # search-url base_url = 'https://startpage.com/' -search_url = base_url + 'do/search' +search_url = base_url + 'sp/search?' 
# specific xpath variables # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] @@ -46,14 +48,12 @@ content_xpath = './/p[@class="w-gl__description"]' # do search-request def request(query, params): - params['url'] = search_url - params['method'] = 'POST' - params['data'] = { + args = { 'query': query, 'page': params['pageno'], 'cat': 'web', - 'cmd': 'process_search', - 'engine0': 'v1all', + # 'abp': "-1", + 'sc': 'Mj4jZy61QETj20', } # set language if specified @@ -61,9 +61,10 @@ def request(query, params): lang_code = match_language(params['language'], supported_languages, fallback=None) if lang_code: language_name = supported_languages[lang_code]['alias'] - params['data']['language'] = language_name - params['data']['lui'] = language_name + args['language'] = language_name + args['lui'] = language_name + params['url'] = search_url + urlencode(args) return params From 1cbcddb3f703b0ee076ac5ce0b514246a21472ec Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 5 Jan 2022 13:08:56 +0100 Subject: [PATCH 2/6] [pylint] Startpage engine Fix remarks from pylint Signed-off-by: Markus Heiser --- searx/engines/startpage.py | 40 +++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 1fd259dad..eaa157705 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -1,17 +1,20 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Startpage (Web) + """ - Startpage (Web) -""" + +import re from urllib.parse import urlencode - -from lxml import html -from dateutil import parser -from datetime import datetime, timedelta -import re from unicodedata import normalize, combining +from datetime import datetime, timedelta + +from dateutil import parser +from lxml import html from babel import Locale from babel.localedata import locale_identifiers + from searx.utils import extract_text, eval_xpath, match_language # about @@ 
-135,10 +138,11 @@ def response(resp): # get supported languages from their site def _fetch_supported_languages(resp): - # startpage's language selector is a mess - # each option has a displayed name and a value, either of which may represent the language name - # in the native script, the language name in English, an English transliteration of the native name, - # the English name of the writing script used by the language, or occasionally something else entirely. + # startpage's language selector is a mess: each option has a displayed name + # and a value, either of which may represent the language name in the native + # script, the language name in English, an English transliteration of the + # native name, the English name of the writing script used by the language, + # or occasionally something else entirely. # this cases are so special they need to be hardcoded, a couple of them are mispellings language_names = { @@ -152,7 +156,15 @@ def _fetch_supported_languages(resp): } # get the English name of every language known by babel - language_names.update({name.lower(): lang_code for lang_code, name in Locale('en')._data['languages'].items()}) + language_names.update( + { + # fmt: off + name.lower(): lang_code + # pylint: disable=protected-access + for lang_code, name in Locale('en')._data['languages'].items() + # fmt: on + } + ) # get the native name of every language known by babel for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()): @@ -177,8 +189,8 @@ def _fetch_supported_languages(resp): if isinstance(lang_code, str): supported_languages[lang_code] = {'alias': sp_option_value} elif isinstance(lang_code, list): - for lc in lang_code: - supported_languages[lc] = {'alias': sp_option_value} + for _lc in lang_code: + supported_languages[_lc] = {'alias': sp_option_value} else: print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text)) From 2f4e567e904278f19c4c392fb9a222fcf0afec1c Mon Sep 17 
00:00:00 2001 From: Markus Heiser Date: Thu, 6 Jan 2022 18:29:04 +0100 Subject: [PATCH 3/6] [fix] Get an actual `sc` argument from startpage's home page. Signed-off-by: Markus Heiser --- searx/engines/startpage.py | 39 +++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index eaa157705..f5448dd47 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -5,6 +5,7 @@ """ import re +from time import time from urllib.parse import urlencode from unicodedata import normalize, combining @@ -15,6 +16,7 @@ from lxml import html from babel import Locale from babel.localedata import locale_identifiers +from searx import network from searx.utils import extract_text, eval_xpath, match_language # about @@ -47,6 +49,41 @@ results_xpath = '//div[@class="w-gl__result__main"]' link_xpath = './/a[@class="w-gl__result-title result-link"]' content_xpath = './/p[@class="w-gl__description"]' +# timestamp of the last fetch of 'sc' code +sc_code_ts = 0 +sc_code = '' + + +def get_sc_code(headers): + """Get an actual `sc` argument from startpage's home page. + + Startpage puts a `sc` argument on every link. Without this argument + startpage considers the request is from a bot. We do not know what is + encoded in the value of the `sc` argument, but it seems to be a kind of a + *time-stamp*. This *time-stamp* is valid for a few hours. + + This function scrapes a new *time-stamp* from startpage's home page every 50 + minutes (3000 sec). 
+ + """ + + global sc_code_ts, sc_code # pylint: disable=global-statement + + if time() > (sc_code_ts + 3000): + logger.debug("query new sc time-stamp ...") + + resp = network.get(base_url, headers=headers) + dom = html.fromstring(resp.text) + + # href --> '/?sc=adrKJMgF8xwp20' + href = eval_xpath(dom, '//a[@class="footer-home__logo"]')[0].get('href') + + sc_code = href[5:] + sc_code_ts = time() + logger.debug("new value is: %s", sc_code) + + return sc_code + # do search-request def request(query, params): @@ -56,7 +93,7 @@ def request(query, params): 'page': params['pageno'], 'cat': 'web', # 'abp': "-1", - 'sc': 'Mj4jZy61QETj20', + 'sc': get_sc_code(params['headers']), } # set language if specified From 21e884f36903e67a2d786498c25ea428bf8349b5 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 9 Jan 2022 16:05:25 +0100 Subject: [PATCH 4/6] [fix] startpage engine: fetch CAPTCHA & issues related to PR-695 In case of CAPTCHA raise a SearxEngineCaptchaException and suspend for 7 days. When get_sc_code() fails raise a SearxEngineResponseException and suspend for 7 days. 
[1] https://github.com/searxng/searxng/pull/695 Signed-off-by: Markus Heiser --- searx/engines/startpage.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index f5448dd47..ae7916fc3 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -18,6 +18,11 @@ from babel.localedata import locale_identifiers from searx import network from searx.utils import extract_text, eval_xpath, match_language +from searx.exceptions import ( + SearxEngineResponseException, + SearxEngineCaptchaException, +) + # about about = { @@ -54,6 +59,13 @@ sc_code_ts = 0 sc_code = '' +def raise_captcha(resp): + + if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): + # suspend CAPTCHA for 7 days + raise SearxEngineCaptchaException(suspended_time=7 * 24 * 3600) + + def get_sc_code(headers): """Get an actual `sc` argument from startpage's home page. @@ -73,10 +85,17 @@ def get_sc_code(headers): logger.debug("query new sc time-stamp ...") resp = network.get(base_url, headers=headers) + raise_captcha(resp) dom = html.fromstring(resp.text) - # href --> '/?sc=adrKJMgF8xwp20' - href = eval_xpath(dom, '//a[@class="footer-home__logo"]')[0].get('href') + try: + # href --> '/?sc=adrKJMgF8xwp20' + href = eval_xpath(dom, '//a[@class="footer-home__logo"]')[0].get('href') + except IndexError as exc: + # suspend startpage API --> https://github.com/searxng/searxng/pull/695 + raise SearxEngineResponseException( + suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!" 
+ ) sc_code = href[5:] sc_code_ts = time() From df238e944c8902ac7e075123ca22902c367fd0de Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 9 Jan 2022 16:11:22 +0100 Subject: [PATCH 5/6] [mod] startpage engine: add comment about Startpage's FFox add-on Signed-off-by: Markus Heiser --- searx/engines/startpage.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index ae7916fc3..5e4490afb 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -95,7 +95,7 @@ def get_sc_code(headers): # suspend startpage API --> https://github.com/searxng/searxng/pull/695 raise SearxEngineResponseException( suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!" - ) + ) from exc sc_code = href[5:] sc_code_ts = time() @@ -107,10 +107,19 @@ def get_sc_code(headers): # do search-request def request(query, params): + # pylint: disable=line-too-long + # The format string from Startpage's FFox add-on [1]:: + # + # https://www.startpage.com/do/dsearch?query={searchTerms}&cat=web&pl=ext-ff&language=__MSG_extensionUrlLanguage__&extVersion=1.3.0 + # + # [1] https://addons.mozilla.org/en-US/firefox/addon/startpage-private-search/ + args = { 'query': query, 'page': params['pageno'], 'cat': 'web', + # 'pl': 'ext-ff', + # 'extVersion': '1.3.0', # 'abp': "-1", 'sc': get_sc_code(params['headers']), } From f9271d595fb08ff29408eecb5a0eb5fd9f7cc314 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Sat, 15 Jan 2022 22:56:34 +0100 Subject: [PATCH 6/6] [fix] startpage: workaround to use the startpage network workaround for the issue #762 --- searx/engines/startpage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 5e4490afb..cf6872717 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -16,7 +16,7 @@ from lxml import html from babel import Locale from 
babel.localedata import locale_identifiers -from searx import network +from searx.network import get from searx.utils import extract_text, eval_xpath, match_language from searx.exceptions import ( SearxEngineResponseException, @@ -84,7 +84,7 @@ def get_sc_code(headers): if time() > (sc_code_ts + 3000): logger.debug("query new sc time-stamp ...") - resp = network.get(base_url, headers=headers) + resp = get(base_url, headers=headers) raise_captcha(resp) dom = html.fromstring(resp.text)