Merge branch 'master' of github.com:y0nei/searxng

y0nei 2023-10-03 10:37:13 +02:00
commit 3732b7fef5
No known key found for this signature in database
GPG Key ID: 89A8EAB009DF28DC
16 changed files with 1865 additions and 639 deletions

.nvmrc

@@ -1 +1 @@
-v16.15.1
+v16.20.2

manage

@@ -41,7 +41,7 @@ PATH="${REPO_ROOT}/node_modules/.bin:${PATH}"
 PYOBJECTS="searx"
 PY_SETUP_EXTRAS='[test]'
-GECKODRIVER_VERSION="v0.30.0"
+GECKODRIVER_VERSION="v0.33.0"
 # SPHINXOPTS=
 BLACK_OPTIONS=("--target-version" "py311" "--line-length" "120" "--skip-string-normalization")
 BLACK_TARGETS=("--exclude" "searx/static,searx/languages.py" "--include" 'searxng.msg|\.pyi?$' "searx" "searxng_extra" "tests")

@@ -1,7 +1,7 @@
 {
   "dependencies": {
-    "eslint": "^8.18.0",
-    "pyright": "^1.1.255"
+    "eslint": "^8.50.0",
+    "pyright": "^1.1.329"
   },
   "scripts": {
     "clean": "rm -Rf node_modules package-lock.json"

@@ -51,7 +51,6 @@ from . import (
     http_accept,
     http_accept_encoding,
     http_accept_language,
-    http_connection,
     http_user_agent,
     ip_limit,
     ip_lists,
@@ -136,7 +135,6 @@ def filter_request(request: flask.Request) -> werkzeug.Response | None:
         http_accept,
         http_accept_encoding,
         http_accept_language,
-        http_connection,
         http_user_agent,
         ip_limit,
     ]:

File diff suppressed because it is too large.

@@ -8,19 +8,18 @@ implementations are shared by other engines:

 - :ref:`bing videos engine`

 On the `preference page`_ Bing offers a lot of languages an regions (see section
-'Search results languages' and 'Country/region'). However, the abundant choice
-does not correspond to reality, where Bing has a full-text indexer only for a
-limited number of languages. By example: you can select a language like Māori
-but you never get a result in this language.
+LANGUAGE and COUNTRY/REGION). The Language is the language of the UI, we need
+in SearXNG to get the translations of data such as *"published last week"*.

-What comes a bit closer to the truth are the `search-APIs`_ but they don`t seem
-to be completely correct either (if you take a closer look you will find some
-inaccuracies there too):
+There is a description of the offical search-APIs_, unfortunately this is not
+the API we can use or that bing itself would use. You can look up some things
+in the API to get a better picture of bing, but the value specifications like
+the market codes are usually outdated or at least no longer used by bing itself.

-- :py:obj:`searx.engines.bing.bing_traits_url`
-- :py:obj:`searx.engines.bing_videos.bing_traits_url`
-- :py:obj:`searx.engines.bing_images.bing_traits_url`
-- :py:obj:`searx.engines.bing_news.bing_traits_url`
+The market codes have been harmonized and are identical for web, video and
+images. The news area has also been harmonized with the other categories. Only
+political adjustments still seem to be made -- for example, there is no news
+category for the Chinese market.

 .. _preference page: https://www.bing.com/account/general
 .. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
@@ -30,9 +29,8 @@ inaccuracies there too):

 from typing import TYPE_CHECKING
 import base64
-import datetime
 import re
-import uuid
+import time
 from urllib.parse import parse_qs, urlencode, urlparse

 from lxml import html
 import babel
@@ -45,7 +43,7 @@ from searx.enginelib.traits import EngineTraits
 if TYPE_CHECKING:
     import logging

-    logger: logging.Logger
+    logger = logging.getLogger()

 traits: EngineTraits
@@ -58,124 +56,63 @@ about = {
     "results": 'HTML',
 }

-send_accept_language_header = True
-"""Bing tries to guess user's language and territory from the HTTP
-Accept-Language. Optional the user can select a search-language (can be
-different to the UI language) and a region (market code)."""
-
 # engine dependent config
 categories = ['general', 'web']
 paging = True
 time_range_support = True
 safesearch = True
-safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}  # cookie: ADLT=STRICT
+"""Bing results are always SFW. To get NSFW links from bing some age
+verification by a cookie is needed / thats not possible in SearXNG.
+"""

 base_url = 'https://www.bing.com/search'
 """Bing (Web) search URL"""

-bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
-"""Bing (Web) search API description"""
+def _page_offset(pageno):
+    return (int(pageno) - 1) * 10 + 1

-def _get_offset_from_pageno(pageno):
-    return (pageno - 1) * 10 + 1
+def set_bing_cookies(params, engine_language, engine_region):
+    params['cookies']['_EDGE_CD'] = f'm={engine_region}&u={engine_language}'
+    params['cookies']['_EDGE_S'] = f'mkt={engine_region}&ui={engine_language}'
+    logger.debug("bing cookies: %s", params['cookies'])

-def set_bing_cookies(params, engine_language, engine_region, SID):
-
-    # set cookies
-    # -----------
-
-    params['cookies']['_EDGE_V'] = '1'
-
-    # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
-    _EDGE_S = [
-        'F=1',
-        'SID=%s' % SID,
-        'mkt=%s' % engine_region.lower(),
-        'ui=%s' % engine_language.lower(),
-    ]
-    params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
-    logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])
-
-    # "_EDGE_CD": "m=zh-tw",
-    _EDGE_CD = [  # pylint: disable=invalid-name
-        'm=%s' % engine_region.lower(),  # search region: zh-cn
-        'u=%s' % engine_language.lower(),  # UI: en-us
-    ]
-    params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
-    logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])
-
-    SRCHHPGUSR = [  # pylint: disable=invalid-name
-        'SRCHLANG=%s' % engine_language,
-        # Trying to set ADLT cookie here seems not to have any effect, I assume
-        # there is some age verification by a cookie (and/or session ID) needed,
-        # to disable the SafeSearch.
-        'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
-    ]
-    params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
-    logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])

 def request(query, params):
     """Assemble a Bing-Web request."""

-    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
-    engine_language = traits.get_language(params['searxng_locale'], 'en')
-
-    SID = uuid.uuid1().hex.upper()
-    CVID = uuid.uuid1().hex.upper()
-
-    set_bing_cookies(params, engine_language, engine_region, SID)
-
-    # build URL query
-    # ---------------
-
-    # query term
-    page = int(params.get('pageno', 1))
+    engine_region = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
+    engine_language = traits.get_language(params['searxng_locale'], 'en')  # type: ignore
+    set_bing_cookies(params, engine_language, engine_region)
+
+    page = params.get('pageno', 1)
     query_params = {
-        # fmt: off
         'q': query,
-        # if arg 'pq' is missed, somtimes on page 4 we get results from page 1,
-        # don't ask why it is only sometimes / its M$ and they have never been
-        # deterministic ;)
         'pq': query,
-        'cvid': CVID,
-        'qs': 'n',
-        'sp': '-1'
-        # fmt: on
     }

-    # page
+    # To get correct page, arg first and this arg FORM is needed, the value PERE
+    # is on page 2, on page 3 its PERE1 and on page 4 its PERE2 .. and so forth.
+    # The 'first' arg should never send on page 1.
     if page > 1:
-        referer = base_url + '?' + urlencode(query_params)
-        params['headers']['Referer'] = referer
-        logger.debug("headers.Referer --> %s", referer)
-
-        query_params['first'] = _get_offset_from_pageno(page)
+        query_params['first'] = _page_offset(page)  # see also arg FORM

     if page == 2:
         query_params['FORM'] = 'PERE'
     elif page > 2:
         query_params['FORM'] = 'PERE%s' % (page - 2)

-    filters = ''
-    if params['time_range']:
-        query_params['filt'] = 'custom'
-
-        if params['time_range'] == 'day':
-            filters = 'ex1:"ez1"'
-        elif params['time_range'] == 'week':
-            filters = 'ex1:"ez2"'
-        elif params['time_range'] == 'month':
-            filters = 'ex1:"ez3"'
-        elif params['time_range'] == 'year':
-            epoch_1970 = datetime.date(1970, 1, 1)
-            today_no = (datetime.date.today() - epoch_1970).days
-            filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
+    params['url'] = f'{base_url}?{urlencode(query_params)}'

-    params['url'] = base_url + '?' + urlencode(query_params)
-    if filters:
-        params['url'] = params['url'] + '&filters=' + filters
+    if params.get('time_range'):
+        unix_day = int(time.time() / 86400)
+        time_ranges = {'day': '1', 'week': '2', 'month': '3', 'year': f'5_{unix_day-365}_{unix_day}'}
+        params['url'] += f'&filters=ex1:"ez{time_ranges[params["time_range"]]}"'

     return params
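For illustration only (not part of the diff): a minimal sketch of how the rewritten request() composes the paging and time-range arguments; it simply mirrors the logic in the hunk above, and the example output is illustrative.

    import time
    from urllib.parse import urlencode

    base_url = 'https://www.bing.com/search'

    def sketch_bing_web_url(query, pageno, time_range=None):
        # page 1 sends only q/pq; later pages add 'first' (an offset) and a FORM value
        query_params = {'q': query, 'pq': query}
        if pageno > 1:
            query_params['first'] = (int(pageno) - 1) * 10 + 1
        if pageno == 2:
            query_params['FORM'] = 'PERE'
        elif pageno > 2:
            query_params['FORM'] = 'PERE%s' % (pageno - 2)
        url = f'{base_url}?{urlencode(query_params)}'
        if time_range:
            # ez1/ez2/ez3 select day/week/month, ez5_<from>_<to> is a day range
            unix_day = int(time.time() / 86400)
            time_ranges = {'day': '1', 'week': '2', 'month': '3', 'year': f'5_{unix_day - 365}_{unix_day}'}
            url += f'&filters=ex1:"ez{time_ranges[time_range]}"'
        return url

    # sketch_bing_web_url('searxng', 3)
    # -> https://www.bing.com/search?q=searxng&pq=searxng&first=21&FORM=PERE1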
@ -197,10 +134,11 @@ def response(resp):
url = link.attrib.get('href') url = link.attrib.get('href')
title = extract_text(link) title = extract_text(link)
content = eval_xpath(result, '(.//p)[1]') content = eval_xpath(result, './/p')
for p in content: for p in content:
# Make sure that the element is free of <a href> links # Make sure that the element is free of:
for e in p.xpath('.//a'): # <span class="algoSlug_icon" # data-priority="2">Web</span>
for e in p.xpath('.//span[@class="algoSlug_icon"]'):
e.getparent().remove(e) e.getparent().remove(e)
content = extract_text(content) content = extract_text(content)
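A standalone illustration (not from the diff) of what the new snippet clean-up does: the algoSlug_icon badge is dropped before the text is extracted. The sample markup below is made up.

    from lxml import html

    p = html.fromstring('<p>Some result text. <span class="algoSlug_icon" data-priority="2">Web</span></p>')
    for e in p.xpath('.//span[@class="algoSlug_icon"]'):
        e.getparent().remove(e)          # drop the badge, keep the surrounding text
    print(p.text_content().strip())      # -> 'Some result text.'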
@@ -236,7 +174,7 @@ def response(resp):
         except Exception as e:  # pylint: disable=broad-except
             logger.debug('result error :\n%s', e)

-    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
+    if result_len and _page_offset(resp.search_params.get("pageno", 0)) > result_len:
         # Avoid reading more results than avalaible.
         # For example, if there is 100 results from some search and we try to get results from 120 to 130,
         # Bing will send back the results from 0 to 10 and no error.
@@ -249,72 +187,76 @@ def response(resp):

 def fetch_traits(engine_traits: EngineTraits):
     """Fetch languages and regions from Bing-Web."""
-
-    xpath_market_codes = '//table[1]/tbody/tr/td[3]'
-    # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
-    xpath_language_codes = '//table[3]/tbody/tr/td[2]'
-
-    _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
-
-
-def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
-    # pylint: disable=too-many-locals,import-outside-toplevel
+    # pylint: disable=import-outside-toplevel

     from searx.network import get  # see https://github.com/searxng/searxng/issues/762

-    # insert alias to map from a language (zh) to a language + script (zh_Hans)
-    engine_traits.languages['zh'] = 'zh-hans'
-
-    resp = get(url)
+    resp = get("https://www.bing.com/account/general")

     if not resp.ok:  # type: ignore
-        print("ERROR: response from peertube is not OK.")
+        print("ERROR: response from bing is not OK.")

     dom = html.fromstring(resp.text)  # type: ignore

-    map_lang = {'jp': 'ja'}
-    for td in eval_xpath(dom, xpath_language_codes):
-        eng_lang = td.text
-
-        if eng_lang in ('en-gb', 'pt-br'):
-            # language 'en' is already in the list and a language 'en-gb' can't
-            # be handled in SearXNG, same with pt-br which is covered by pt-pt.
-            continue
-
-        babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
-        try:
-            sxng_tag = language_tag(babel.Locale.parse(babel_lang))
-        except babel.UnknownLocaleError:
-            print("ERROR: language (%s) is unknown by babel" % (eng_lang))
-            continue
-        conflict = engine_traits.languages.get(sxng_tag)
-        if conflict:
-            if conflict != eng_lang:
-                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
-            continue
-        engine_traits.languages[sxng_tag] = eng_lang
-
-    map_region = {
-        'en-ID': 'id_ID',
-        'no-NO': 'nb_NO',
-    }
-
-    for td in eval_xpath(dom, xpath_market_codes):
-        eng_region = td.text
-        babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
-
-        if eng_region == 'en-WW':
-            engine_traits.all_locale = eng_region
-            continue
-
-        try:
-            sxng_tag = region_tag(babel.Locale.parse(babel_region))
-        except babel.UnknownLocaleError:
-            print("ERROR: region (%s) is unknown by babel" % (eng_region))
-            continue
-        conflict = engine_traits.regions.get(sxng_tag)
-        if conflict:
-            if conflict != eng_region:
-                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
-            continue
-        engine_traits.regions[sxng_tag] = eng_region
+    # languages
+
+    engine_traits.languages['zh'] = 'zh-hans'
+
+    map_lang = {'prs': 'fa-AF', 'en': 'en-us'}
+    bing_ui_lang_map = {
+        # HINT: this list probably needs to be supplemented
+        'en': 'us',  # en --> en-us
+        'da': 'dk',  # da --> da-dk
+    }
+
+    for href in eval_xpath(dom, '//div[@id="language-section"]//li/a/@href'):
+        eng_lang = parse_qs(urlparse(href).query)['setlang'][0]
+        babel_lang = map_lang.get(eng_lang, eng_lang)
+        try:
+            sxng_tag = language_tag(babel.Locale.parse(babel_lang.replace('-', '_')))
+        except babel.UnknownLocaleError:
+            print("ERROR: language (%s) is unknown by babel" % (babel_lang))
+            continue
+
+        # Language (e.g. 'en' or 'de') from https://www.bing.com/account/general
+        # is converted by bing to 'en-us' or 'de-de'. But only if there is not
+        # already a '-' delemitter in the language. For instance 'pt-PT' -->
+        # 'pt-pt' and 'pt-br' --> 'pt-br'
+        bing_ui_lang = eng_lang.lower()
+        if '-' not in bing_ui_lang:
+            bing_ui_lang = bing_ui_lang + '-' + bing_ui_lang_map.get(bing_ui_lang, bing_ui_lang)
+
+        conflict = engine_traits.languages.get(sxng_tag)
+        if conflict:
+            if conflict != bing_ui_lang:
+                print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {bing_ui_lang}")
+            continue
+        engine_traits.languages[sxng_tag] = bing_ui_lang
+
+    # regions (aka "market codes")
+
+    engine_traits.regions['zh-CN'] = 'zh-cn'
+
+    map_market_codes = {
+        'zh-hk': 'en-hk',  # not sure why, but at M$ this is the market code for Hongkong
+    }
+    for href in eval_xpath(dom, '//div[@id="region-section"]//li/a/@href'):
+        cc_tag = parse_qs(urlparse(href).query)['cc'][0]
+        if cc_tag == 'clear':
+            engine_traits.all_locale = cc_tag
+            continue
+
+        # add market codes from official languages of the country ..
+        for lang_tag in babel.languages.get_official_languages(cc_tag, de_facto=True):
+            if lang_tag not in engine_traits.languages.keys():
+                # print("ignore lang: %s <-- %s" % (cc_tag, lang_tag))
+                continue
+            lang_tag = lang_tag.split('_')[0]  # zh_Hant --> zh
+            market_code = f"{lang_tag}-{cc_tag}"  # zh-tw
+            market_code = map_market_codes.get(market_code, market_code)
+            sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, cc_tag.upper())))
+            conflict = engine_traits.regions.get(sxng_tag)
+            if conflict:
+                if conflict != market_code:
+                    print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, market_code))
+                continue
+            engine_traits.regions[sxng_tag] = market_code
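Not part of the diff: the rewritten fetch_traits() reads UI languages and regions from the links on the Bing preference page; the two query arguments it relies on can be pulled out with the same parse_qs/urlparse combination. The href values below are invented stand-ins.

    from urllib.parse import parse_qs, urlparse

    language_href = '/account/general?setlang=de-de&sh=1'   # a link from the "language-section" (made up)
    region_href = '/account/general?cc=tw&sh=1'             # a link from the "region-section" (made up)

    print(parse_qs(urlparse(language_href).query)['setlang'][0])   # -> 'de-de'
    print(parse_qs(urlparse(region_href).query)['cc'][0])          # -> 'tw'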

@@ -6,23 +6,20 @@

 from typing import TYPE_CHECKING
-import uuid
 import json
 from urllib.parse import urlencode

 from lxml import html

 from searx.enginelib.traits import EngineTraits
-from searx.engines.bing import (
-    set_bing_cookies,
-    _fetch_traits,
-)
-from searx.engines.bing import send_accept_language_header  # pylint: disable=unused-import
+from searx.engines.bing import set_bing_cookies
+from searx.engines.bing import fetch_traits  # pylint: disable=unused-import

 if TYPE_CHECKING:
     import logging

-    logger: logging.Logger
+    logger = logging.getLogger()

 traits: EngineTraits
@@ -45,39 +42,29 @@ time_range_support = True
 base_url = 'https://www.bing.com/images/async'
 """Bing (Images) search URL"""

-bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-image-search/reference/market-codes'
-"""Bing (Images) search API description"""
-
 time_map = {
-    # fmt: off
     'day': 60 * 24,
     'week': 60 * 24 * 7,
     'month': 60 * 24 * 31,
     'year': 60 * 24 * 365,
-    # fmt: on
 }


 def request(query, params):
     """Assemble a Bing-Image request."""

-    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
-    engine_language = traits.get_language(params['searxng_locale'], 'en')
-
-    SID = uuid.uuid1().hex.upper()
-    set_bing_cookies(params, engine_language, engine_region, SID)
+    engine_region = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
+    engine_language = traits.get_language(params['searxng_locale'], 'en')  # type: ignore
+    set_bing_cookies(params, engine_language, engine_region)

     # build URL query
-    # - example: https://www.bing.com/images/async?q=foo&first=155&count=35
+    # - example: https://www.bing.com/images/async?q=foo&async=content&first=1&count=35

     query_params = {
-        # fmt: off
         'q': query,
-        'async' : 'content',
+        'async': '1',
         # to simplify the page count lets use the default of 35 images per page
         'first': (int(params.get('pageno', 1)) - 1) * 35 + 1,
         'count': 35,
-        # fmt: on
     }

     # time range
@@ -120,13 +107,3 @@ def response(resp):
             }
         )

     return results
-
-
-def fetch_traits(engine_traits: EngineTraits):
-    """Fetch languages and regions from Bing-News."""
-
-    xpath_market_codes = '//table[1]/tbody/tr/td[3]'
-    # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
-    xpath_language_codes = '//table[3]/tbody/tr/td[2]'
-
-    _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
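A short sketch (not in the diff) of the offset arithmetic the async image endpoint uses: with the default of 35 images per page, page N starts at item (N - 1) * 35 + 1. Only the arguments shown in the hunk above are included.

    from urllib.parse import urlencode

    def images_query(query, pageno):
        return urlencode({
            'q': query,
            'async': '1',
            'first': (int(pageno) - 1) * 35 + 1,   # 1, 36, 71, ...
            'count': 35,
        })

    print(images_query('sailing', 2))   # -> q=sailing&async=1&first=36&count=35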

@@ -1,22 +1,23 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
 """Bing-News: description see :py:obj:`searx.engines.bing`.
+
+.. hint::
+
+   Bing News is *different* in some ways!
+
 """
 # pylint: disable=invalid-name

 from typing import TYPE_CHECKING
-import uuid
 from urllib.parse import urlencode

 from lxml import html

+from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
 from searx.enginelib.traits import EngineTraits
-from searx.engines.bing import (
-    set_bing_cookies,
-    _fetch_traits,
-)
-from searx.engines.bing import send_accept_language_header  # pylint: disable=unused-import
+from searx.engines.bing import set_bing_cookies

 if TYPE_CHECKING:
     import logging
@@ -39,58 +40,48 @@ about = {
 # engine dependent config
 categories = ['news']
 paging = True
+"""If go through the pages and there are actually no new results for another
+page, then bing returns the results from the last page again."""
 time_range_support = True
 time_map = {
-    'day': '4',
-    'week': '8',
-    'month': '9',
+    'day': 'interval="4"',
+    'week': 'interval="7"',
+    'month': 'interval="9"',
 }
 """A string '4' means *last hour*. We use *last hour* for ``day`` here since the
 difference of *last day* and *last week* in the result list is just marginally.
-"""
+Bing does not have news range ``year`` / we use ``month`` instead."""

 base_url = 'https://www.bing.com/news/infinitescrollajax'
 """Bing (News) search URL"""

-bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes'
-"""Bing (News) search API description"""
-
-mkt_alias = {
-    'zh': 'en-WW',
-    'zh-CN': 'en-WW',
-}
-"""Bing News has an official market code 'zh-CN' but we won't get a result with
-this market code. For 'zh' and 'zh-CN' we better use the *Worldwide aggregate*
-market code (en-WW).
-"""
-

 def request(query, params):
     """Assemble a Bing-News request."""

-    sxng_locale = params['searxng_locale']
-    engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale)
-    engine_language = traits.get_language(sxng_locale, 'en')
-
-    SID = uuid.uuid1().hex.upper()
-    set_bing_cookies(params, engine_language, engine_region, SID)
+    engine_region = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
+    engine_language = traits.get_language(params['searxng_locale'], 'en')  # type: ignore
+    set_bing_cookies(params, engine_language, engine_region)

     # build URL query
     #
     # example: https://www.bing.com/news/infinitescrollajax?q=london&first=1

+    page = int(params.get('pageno', 1)) - 1
     query_params = {
-        # fmt: off
         'q': query,
         'InfiniteScroll': 1,
         # to simplify the page count lets use the default of 10 images per page
-        'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1,
-        # fmt: on
+        'first': page * 10 + 1,
+        'SFX': page,
+        'form': 'PTFTNR',
+        'setlang': engine_region.split('-')[0],
+        'cc': engine_region.split('-')[-1],
     }

     if params['time_range']:
-        # qft=interval:"7"
-        query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9')
+        query_params['qft'] = time_map.get(params['time_range'], 'interval="9"')

     params['url'] = base_url + '?' + urlencode(query_params)
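For illustration (not part of the diff): the query string the rewritten Bing-News request() produces for page 2 of a German region with a one-week time filter; values follow the hunk above.

    from urllib.parse import urlencode

    time_map = {'day': 'interval="4"', 'week': 'interval="7"', 'month': 'interval="9"'}

    def news_query(query, pageno, engine_region='de-DE', time_range=None):
        page = int(pageno) - 1
        query_params = {
            'q': query,
            'InfiniteScroll': 1,
            'first': page * 10 + 1,
            'SFX': page,
            'form': 'PTFTNR',
            'setlang': engine_region.split('-')[0],
            'cc': engine_region.split('-')[-1],
        }
        if time_range:
            query_params['qft'] = time_map.get(time_range, 'interval="9"')
        return urlencode(query_params)

    print(news_query('berlin', 2, time_range='week'))
    # -> q=berlin&InfiniteScroll=1&first=11&SFX=1&form=PTFTNR&setlang=de&cc=DE&qft=interval%3D%227%22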
@@ -106,18 +97,34 @@ def response(resp):
     dom = html.fromstring(resp.text)

-    for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'):
+    for newsitem in eval_xpath_list(dom, '//div[contains(@class, "newsitem")]'):
+
+        link = eval_xpath_getindex(newsitem, './/a[@class="title"]', 0, None)
+        if link is None:
+            continue
+        url = link.attrib.get('href')
+        title = extract_text(link)
+        content = extract_text(eval_xpath(newsitem, './/div[@class="snippet"]'))
+
+        metadata = []
+        source = eval_xpath_getindex(newsitem, './/div[contains(@class, "source")]', 0, None)
+        if source is not None:
+            for item in (
+                eval_xpath_getindex(source, './/span[@aria-label]/@aria-label', 0, None),
+                # eval_xpath_getindex(source, './/a', 0, None),
+                # eval_xpath_getindex(source, './div/span', 3, None),
+                link.attrib.get('data-author'),
+            ):
+                if item is not None:
+                    t = extract_text(item)
+                    if t and t.strip():
+                        metadata.append(t.strip())
+        metadata = ' | '.join(metadata)

-        url = newsitem.xpath('./@url')[0]
-        title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip()
-        content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip()
         thumbnail = None
-        author = newsitem.xpath('./@data-author')[0]
-        metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip()
-
-        img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src')
-        if img_src:
-            thumbnail = 'https://www.bing.com/' + img_src[0]
+        imagelink = eval_xpath_getindex(newsitem, './/a[@class="imagelink"]//img', 0, None)
+        if imagelink is not None:
+            thumbnail = 'https://www.bing.com/' + imagelink.attrib.get('src')

         results.append(
             {
@@ -125,7 +132,6 @@ def response(resp):
                 'title': title,
                 'content': content,
                 'img_src': thumbnail,
-                'author': author,
                 'metadata': metadata,
             }
         )
@@ -134,17 +140,20 @@ def response(resp):

 def fetch_traits(engine_traits: EngineTraits):
-    """Fetch languages and regions from Bing-News.
-
-    The :py:obj:`description <searx.engines.bing_news.bing_traits_url>` of the
-    first table says *"query parameter when calling the Video Search API."*
-    .. that's why I use the 4. table "News Category API markets" for the
-    ``xpath_market_codes``.
-
-    """
-
-    xpath_market_codes = '//table[4]/tbody/tr/td[3]'
-    # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
-    xpath_language_codes = '//table[3]/tbody/tr/td[2]'
-
-    _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
+    """Fetch languages and regions from Bing-News."""
+    # pylint: disable=import-outside-toplevel
+
+    from searx.engines.bing import fetch_traits as _f
+
+    _f(engine_traits)
+
+    # fix market codes not known by bing news:
+
+    # In bing the market code 'zh-cn' exists, but there is no 'news' category in
+    # bing for this market. Alternatively we use the the market code from Honk
+    # Kong. Even if this is not correct, it is better than having no hits at
+    # all, or sending false queries to bing that could raise the suspicion of a
+    # bot.
+
+    # HINT: 'en-hk' is the region code it does not indicate the language en!!
+    engine_traits.regions['zh-CN'] = 'en-hk'

@@ -5,18 +5,15 @@
 # pylint: disable=invalid-name

 from typing import TYPE_CHECKING
-import uuid
 import json
 from urllib.parse import urlencode

 from lxml import html

 from searx.enginelib.traits import EngineTraits
-from searx.engines.bing import (
-    set_bing_cookies,
-    _fetch_traits,
-)
-from searx.engines.bing import send_accept_language_header  # pylint: disable=unused-import
+from searx.engines.bing import set_bing_cookies
+from searx.engines.bing import fetch_traits  # pylint: disable=unused-import
+from searx.engines.bing_images import time_map

 if TYPE_CHECKING:
     import logging
@@ -44,40 +41,24 @@ time_range_support = True
 base_url = 'https://www.bing.com/videos/asyncv2'
 """Bing (Videos) async search URL."""

-bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-video-search/reference/market-codes'
-"""Bing (Video) search API description"""
-
-time_map = {
-    # fmt: off
-    'day': 60 * 24,
-    'week': 60 * 24 * 7,
-    'month': 60 * 24 * 31,
-    'year': 60 * 24 * 365,
-    # fmt: on
-}
-

 def request(query, params):
     """Assemble a Bing-Video request."""

-    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
-    engine_language = traits.get_language(params['searxng_locale'], 'en')
-
-    SID = uuid.uuid1().hex.upper()
-    set_bing_cookies(params, engine_language, engine_region, SID)
+    engine_region = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
+    engine_language = traits.get_language(params['searxng_locale'], 'en')  # type: ignore
+    set_bing_cookies(params, engine_language, engine_region)

     # build URL query
     #
     # example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35

     query_params = {
-        # fmt: off
         'q': query,
         'async': 'content',
         # to simplify the page count lets use the default of 35 images per page
         'first': (int(params.get('pageno', 1)) - 1) * 35 + 1,
         'count': 35,
-        # fmt: on
     }

     # time range
@@ -116,13 +97,3 @@ def response(resp):
         )

     return results
-
-
-def fetch_traits(engine_traits: EngineTraits):
-    """Fetch languages and regions from Bing-Videos."""
-
-    xpath_market_codes = '//table[1]/tbody/tr/td[3]'
-    # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
-    xpath_language_codes = '//table[3]/tbody/tr/td[2]'
-
-    _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)

@@ -14,7 +14,6 @@ from lxml import html

 from searx.utils import (
     eval_xpath_list,
-    eval_xpath_getindex,
     extract_text,
 )
@@ -28,11 +27,9 @@
 }

 categories = []
-paging = False
-time_range_support = False

 base_url = 'https://emojipedia.org'
-search_url = base_url + '/search/?{query}'
+search_url = base_url + '/search?{query}'


 def request(query, params):
@@ -47,20 +44,10 @@
     dom = html.fromstring(resp.text)

-    for result in eval_xpath_list(dom, "//ol[@class='search-results']/li"):
-        extracted_desc = extract_text(eval_xpath_getindex(result, './/p', 0))
-
-        if 'No results found.' in extracted_desc:
-            break
-
-        link = eval_xpath_getindex(result, './/h2/a', 0)
-
-        url = base_url + link.attrib.get('href')
-        title = extract_text(link)
-        content = extracted_desc
-
-        res = {'url': url, 'title': title, 'content': content}
+    for result in eval_xpath_list(dom, '//div[starts-with(@class, "EmojisList")]/a'):
+        url = base_url + result.attrib.get('href')
+        res = {'url': url, 'title': extract_text(result), 'content': ''}

         results.append(res)
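A self-contained illustration (not part of the diff) of the new result extraction; the HTML snippet is invented to match the XPath used above.

    from lxml import html

    base_url = 'https://emojipedia.org'
    dom = html.fromstring('<div class="EmojisList_abc"><a href="/red-heart">Red Heart</a></div>')

    for result in dom.xpath('//div[starts-with(@class, "EmojisList")]/a'):
        print(base_url + result.attrib.get('href'), '-', result.text_content())
        # -> https://emojipedia.org/red-heart - Red Heart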

@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Matrixrooms.info (social media)
+"""
+
+from urllib.parse import quote_plus
+
+about = {
+    "website": 'https://matrixrooms.info',
+    "wikidata_id": 'Q107565255',
+    "official_api_documentation": None,
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+paging = True
+categories = ['social media']
+
+base_url = "https://apicdn.matrixrooms.info"
+matrix_url = "https://matrix.to"
+page_size = 20
+
+
+def request(query, params):
+    params['url'] = f"{base_url}/search/{quote_plus(query)}/{page_size}/{(params['pageno']-1)*page_size}"
+    return params
+
+
+def response(resp):
+    results = []
+
+    for result in resp.json():
+        results.append(
+            {
+                'url': matrix_url + '/#/' + result['alias'],
+                'title': result['name'],
+                'content': result['topic']
+                + f" // {result['members']} members"
+                + f" // {result['alias']}"
+                + f" // {result['server']}",
+                'thumbnail': result['avatar_url'],
+            }
+        )
+
+    return results
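Not part of the diff: the endpoint the new engine calls is simply /search/<query>/<limit>/<offset>, so paging is just a multiple of page_size.

    from urllib.parse import quote_plus

    base_url = "https://apicdn.matrixrooms.info"
    page_size = 20

    def search_url(query, pageno):
        return f"{base_url}/search/{quote_plus(query)}/{page_size}/{(pageno - 1) * page_size}"

    print(search_url('python', 2))   # -> https://apicdn.matrixrooms.info/search/python/20/20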

@@ -105,11 +105,11 @@ def response(resp):
             item['metadata'] = html_to_text(result.get('meta_short', ''))

             if result.get('image'):
-                item['thumbnail'] = image_url.format(image_id=result['image'], filename=result['image_filename'])
+                item['img_src'] = image_url.format(image_id=result['image'], filename=result['image_filename'])

         else:
             item['url'] = result['url']
             item['content'] = ', '.join([result['class'], result['info'], result['more']])
-            item['thumbnail'] = result['image']
+            item['img_src'] = result['image']

         results.append(item)

@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Pinterest (images)
+"""
+
+from json import dumps
+
+about = {
+    "website": 'https://www.pinterest.com/',
+    "wikidata_id": 'Q255381',
+    "official_api_documentation": 'https://developers.pinterest.com/docs/api/v5/',
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+
+categories = ['images']
+paging = True
+
+base_url = 'https://www.pinterest.com'
+
+
+def request(query, params):
+    args = {
+        'options': {
+            'query': query,
+            'bookmarks': [params['engine_data'].get('bookmark', '')],
+        },
+        'context': {},
+    }
+    params['url'] = f"{base_url}/resource/BaseSearchResource/get/?data={dumps(args)}"
+
+    return params
+
+
+def response(resp):
+    results = []
+
+    json_resp = resp.json()
+
+    results.append(
+        {
+            'engine_data': json_resp['resource_response']['bookmark'],
+            # it's called bookmark by pinterest, but it's rather a nextpage
+            # parameter to get the next results
+            'key': 'bookmark',
+        }
+    )
+
+    for result in json_resp['resource_response']['data']['results']:
+        results.append(
+            {
+                'template': 'images.html',
+                'url': result['link'] or f"{base_url}/pin/{result['id']}/",
+                'title': result.get('title') or result.get('grid_title'),
+                'content': (result.get('rich_summary') or {}).get('display_description') or "",
+                'img_src': result['images']['orig']['url'],
+                'thumbnail_src': result['images']['236x']['url'],
+                'source': (result.get('rich_summary') or {}).get('site_name'),
+            }
+        )
+
+    return results
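For illustration (not part of the diff): how the engine_data/bookmark round trip works. The first request sends an empty bookmark; each response returns a new one that the next request passes back to fetch the following page.

    from json import dumps

    base_url = 'https://www.pinterest.com'

    def search_url(query, bookmark=''):
        args = {
            'options': {'query': query, 'bookmarks': [bookmark]},
            'context': {},
        }
        return f"{base_url}/resource/BaseSearchResource/get/?data={dumps(args)}"

    first_page = search_url('interior design')
    # a later request reuses resp.json()['resource_response']['bookmark'] as the bookmark value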

@@ -42,6 +42,17 @@ paging = True
 results_per_page = 10
 base_url = "https://www.tagesschau.de"

+use_source_url = True
+"""When set to false, display URLs from Tagesschau, and not the actual source
+(e.g. NDR, WDR, SWR, HR, ...)
+
+.. note::
+
+   The actual source may contain additional content, such as commentary, that is
+   not displayed in the Tagesschau.
+"""
+

 def request(query, params):
     args = {
@@ -78,7 +89,7 @@ def _story(item):
         'thumbnail': item.get('teaserImage', {}).get('imageVariants', {}).get('16x9-256'),
         'publishedDate': datetime.strptime(item['date'][:19], '%Y-%m-%dT%H:%M:%S'),
         'content': item['firstSentence'],
-        'url': item['shareURL'],
+        'url': item['shareURL'] if use_source_url else item['detailsweb'],
     }
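A hedged sketch (not part of the diff, with made-up URLs) of what the new use_source_url toggle selects in _story(): shareURL when true, detailsweb otherwise, as in the hunk above.

    use_source_url = True

    item = {
        'shareURL': 'https://example.org/source-article.html',        # per the module docstring, the source URL (hypothetical value)
        'detailsweb': 'https://example.org/tagesschau-article.html',  # the Tagesschau URL (hypothetical value)
    }

    url = item['shareURL'] if use_source_url else item['detailsweb']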

@@ -133,12 +133,7 @@ def response(resp):
             continue
         url = parse_url(url)

-        title = eval_xpath_getindex(result, './/h3/a', 0, default=None)
-        if title is None:
-            continue
-        offset = len(extract_text(title.xpath('span')))
-        title = extract_text(title)[offset:]
+        title = extract_text(result.xpath('.//h3//a/@aria-label'))

         content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='')
         content = extract_text(content, allow_none=True)
@@ -164,7 +159,7 @@ def fetch_traits(engine_traits: EngineTraits):

     resp = network.get('https://search.yahoo.com/preferences/languages')
     if not resp.ok:
-        print("ERROR: response from peertube is not OK.")
+        print("ERROR: response from yahoo is not OK.")

     dom = html.fromstring(resp.text)
     offset = len('lang_')

@@ -1018,6 +1018,11 @@ engines:
     require_api_key: false
     results: HTML

+  - name: matrixrooms
+    engine: matrixrooms
+    shortcut: mtrx
+    disabled: true
+
   - name: metacpan
     engine: metacpan
     shortcut: cpan
@@ -1201,6 +1206,10 @@ engines:
     engine: photon
     shortcut: ph

+  - name: pinterest
+    engine: pinterest
+    shortcut: pin
+
   - name: piped
     engine: piped
     shortcut: ppd
@@ -1483,6 +1492,9 @@ engines:
   - name: tagesschau
     engine: tagesschau
+    # when set to false, display URLs from Tagesschau, and not the actual source
+    # (e.g. NDR, WDR, SWR, HR, ...)
+    use_source_url: true
     shortcut: ts
     disabled: true