Merge branch 'searxng:master' into master

This commit is contained in:
y0nei 2023-10-02 21:56:01 +00:00 committed by GitHub
commit 7208feaf7e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 1865 additions and 639 deletions

2
.nvmrc
View File

@ -1 +1 @@
v16.15.1
v16.20.2

2
manage
View File

@ -41,7 +41,7 @@ PATH="${REPO_ROOT}/node_modules/.bin:${PATH}"
PYOBJECTS="searx"
PY_SETUP_EXTRAS='[test]'
GECKODRIVER_VERSION="v0.30.0"
GECKODRIVER_VERSION="v0.33.0"
# SPHINXOPTS=
BLACK_OPTIONS=("--target-version" "py311" "--line-length" "120" "--skip-string-normalization")
BLACK_TARGETS=("--exclude" "searx/static,searx/languages.py" "--include" 'searxng.msg|\.pyi?$' "searx" "searxng_extra" "tests")

View File

@ -1,7 +1,7 @@
{
"dependencies": {
"eslint": "^8.18.0",
"pyright": "^1.1.255"
"eslint": "^8.50.0",
"pyright": "^1.1.329"
},
"scripts": {
"clean": "rm -Rf node_modules package-lock.json"

View File

@ -51,7 +51,6 @@ from . import (
http_accept,
http_accept_encoding,
http_accept_language,
http_connection,
http_user_agent,
ip_limit,
ip_lists,
@ -136,7 +135,6 @@ def filter_request(request: flask.Request) -> werkzeug.Response | None:
http_accept,
http_accept_encoding,
http_accept_language,
http_connection,
http_user_agent,
ip_limit,
]:

File diff suppressed because it is too large Load Diff

View File

@ -8,19 +8,18 @@ implementations are shared by other engines:
- :ref:`bing videos engine`
On the `preference page`_ Bing offers a lot of languages an regions (see section
'Search results languages' and 'Country/region'). However, the abundant choice
does not correspond to reality, where Bing has a full-text indexer only for a
limited number of languages. By example: you can select a language like Māori
but you never get a result in this language.
LANGUAGE and COUNTRY/REGION). The Language is the language of the UI, we need
in SearXNG to get the translations of data such as *"published last week"*.
What comes a bit closer to the truth are the `search-APIs`_ but they don`t seem
to be completely correct either (if you take a closer look you will find some
inaccuracies there too):
There is a description of the offical search-APIs_, unfortunately this is not
the API we can use or that bing itself would use. You can look up some things
in the API to get a better picture of bing, but the value specifications like
the market codes are usually outdated or at least no longer used by bing itself.
- :py:obj:`searx.engines.bing.bing_traits_url`
- :py:obj:`searx.engines.bing_videos.bing_traits_url`
- :py:obj:`searx.engines.bing_images.bing_traits_url`
- :py:obj:`searx.engines.bing_news.bing_traits_url`
The market codes have been harmonized and are identical for web, video and
images. The news area has also been harmonized with the other categories. Only
political adjustments still seem to be made -- for example, there is no news
category for the Chinese market.
.. _preference page: https://www.bing.com/account/general
.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
@ -30,9 +29,8 @@ inaccuracies there too):
from typing import TYPE_CHECKING
import base64
import datetime
import re
import uuid
import time
from urllib.parse import parse_qs, urlencode, urlparse
from lxml import html
import babel
@ -45,7 +43,7 @@ from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
logger = logging.getLogger()
traits: EngineTraits
@ -58,124 +56,63 @@ about = {
"results": 'HTML',
}
send_accept_language_header = True
"""Bing tries to guess user's language and territory from the HTTP
Accept-Language. Optional the user can select a search-language (can be
different to the UI language) and a region (market code)."""
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} # cookie: ADLT=STRICT
"""Bing results are always SFW. To get NSFW links from bing some age
verification by a cookie is needed / thats not possible in SearXNG.
"""
base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
"""Bing (Web) search API description"""
def _page_offset(pageno):
return (int(pageno) - 1) * 10 + 1
def _get_offset_from_pageno(pageno):
return (pageno - 1) * 10 + 1
def set_bing_cookies(params, engine_language, engine_region, SID):
# set cookies
# -----------
params['cookies']['_EDGE_V'] = '1'
# _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
_EDGE_S = [
'F=1',
'SID=%s' % SID,
'mkt=%s' % engine_region.lower(),
'ui=%s' % engine_language.lower(),
]
params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])
# "_EDGE_CD": "m=zh-tw",
_EDGE_CD = [ # pylint: disable=invalid-name
'm=%s' % engine_region.lower(), # search region: zh-cn
'u=%s' % engine_language.lower(), # UI: en-us
]
params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])
SRCHHPGUSR = [ # pylint: disable=invalid-name
'SRCHLANG=%s' % engine_language,
# Trying to set ADLT cookie here seems not to have any effect, I assume
# there is some age verification by a cookie (and/or session ID) needed,
# to disable the SafeSearch.
'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
]
params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])
def set_bing_cookies(params, engine_language, engine_region):
params['cookies']['_EDGE_CD'] = f'm={engine_region}&u={engine_language}'
params['cookies']['_EDGE_S'] = f'mkt={engine_region}&ui={engine_language}'
logger.debug("bing cookies: %s", params['cookies'])
def request(query, params):
"""Assemble a Bing-Web request."""
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
SID = uuid.uuid1().hex.upper()
CVID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
# build URL query
# ---------------
# query term
page = int(params.get('pageno', 1))
page = params.get('pageno', 1)
query_params = {
# fmt: off
'q': query,
# if arg 'pq' is missed, somtimes on page 4 we get results from page 1,
# don't ask why it is only sometimes / its M$ and they have never been
# deterministic ;)
'pq': query,
'cvid': CVID,
'qs': 'n',
'sp': '-1'
# fmt: on
}
# page
# To get correct page, arg first and this arg FORM is needed, the value PERE
# is on page 2, on page 3 its PERE1 and on page 4 its PERE2 .. and so forth.
# The 'first' arg should never send on page 1.
if page > 1:
referer = base_url + '?' + urlencode(query_params)
params['headers']['Referer'] = referer
logger.debug("headers.Referer --> %s", referer)
query_params['first'] = _get_offset_from_pageno(page)
query_params['first'] = _page_offset(page) # see also arg FORM
if page == 2:
query_params['FORM'] = 'PERE'
elif page > 2:
query_params['FORM'] = 'PERE%s' % (page - 2)
filters = ''
if params['time_range']:
query_params['filt'] = 'custom'
params['url'] = f'{base_url}?{urlencode(query_params)}'
if params['time_range'] == 'day':
filters = 'ex1:"ez1"'
elif params['time_range'] == 'week':
filters = 'ex1:"ez2"'
elif params['time_range'] == 'month':
filters = 'ex1:"ez3"'
elif params['time_range'] == 'year':
epoch_1970 = datetime.date(1970, 1, 1)
today_no = (datetime.date.today() - epoch_1970).days
filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
if params.get('time_range'):
unix_day = int(time.time() / 86400)
time_ranges = {'day': '1', 'week': '2', 'month': '3', 'year': f'5_{unix_day-365}_{unix_day}'}
params['url'] += f'&filters=ex1:"ez{time_ranges[params["time_range"]]}"'
params['url'] = base_url + '?' + urlencode(query_params)
if filters:
params['url'] = params['url'] + '&filters=' + filters
return params
@ -197,10 +134,11 @@ def response(resp):
url = link.attrib.get('href')
title = extract_text(link)
content = eval_xpath(result, '(.//p)[1]')
content = eval_xpath(result, './/p')
for p in content:
# Make sure that the element is free of <a href> links
for e in p.xpath('.//a'):
# Make sure that the element is free of:
# <span class="algoSlug_icon" # data-priority="2">Web</span>
for e in p.xpath('.//span[@class="algoSlug_icon"]'):
e.getparent().remove(e)
content = extract_text(content)
@ -236,7 +174,7 @@ def response(resp):
except Exception as e: # pylint: disable=broad-except
logger.debug('result error :\n%s', e)
if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
if result_len and _page_offset(resp.search_params.get("pageno", 0)) > result_len:
# Avoid reading more results than avalaible.
# For example, if there is 100 results from some search and we try to get results from 120 to 130,
# Bing will send back the results from 0 to 10 and no error.
@ -249,72 +187,76 @@ def response(resp):
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-Web."""
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
# pylint: disable=too-many-locals,import-outside-toplevel
# pylint: disable=import-outside-toplevel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
# insert alias to map from a language (zh) to a language + script (zh_Hans)
engine_traits.languages['zh'] = 'zh-hans'
resp = get(url)
resp = get("https://www.bing.com/account/general")
if not resp.ok: # type: ignore
print("ERROR: response from peertube is not OK.")
print("ERROR: response from bing is not OK.")
dom = html.fromstring(resp.text) # type: ignore
map_lang = {'jp': 'ja'}
for td in eval_xpath(dom, xpath_language_codes):
eng_lang = td.text
# languages
if eng_lang in ('en-gb', 'pt-br'):
# language 'en' is already in the list and a language 'en-gb' can't
# be handled in SearXNG, same with pt-br which is covered by pt-pt.
continue
engine_traits.languages['zh'] = 'zh-hans'
babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
try:
sxng_tag = language_tag(babel.Locale.parse(babel_lang))
except babel.UnknownLocaleError:
print("ERROR: language (%s) is unknown by babel" % (eng_lang))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
continue
engine_traits.languages[sxng_tag] = eng_lang
map_region = {
'en-ID': 'id_ID',
'no-NO': 'nb_NO',
map_lang = {'prs': 'fa-AF', 'en': 'en-us'}
bing_ui_lang_map = {
# HINT: this list probably needs to be supplemented
'en': 'us', # en --> en-us
'da': 'dk', # da --> da-dk
}
for td in eval_xpath(dom, xpath_market_codes):
eng_region = td.text
babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
if eng_region == 'en-WW':
engine_traits.all_locale = eng_region
continue
for href in eval_xpath(dom, '//div[@id="language-section"]//li/a/@href'):
eng_lang = parse_qs(urlparse(href).query)['setlang'][0]
babel_lang = map_lang.get(eng_lang, eng_lang)
try:
sxng_tag = region_tag(babel.Locale.parse(babel_region))
sxng_tag = language_tag(babel.Locale.parse(babel_lang.replace('-', '_')))
except babel.UnknownLocaleError:
print("ERROR: region (%s) is unknown by babel" % (eng_region))
print("ERROR: language (%s) is unknown by babel" % (babel_lang))
continue
# Language (e.g. 'en' or 'de') from https://www.bing.com/account/general
# is converted by bing to 'en-us' or 'de-de'. But only if there is not
# already a '-' delemitter in the language. For instance 'pt-PT' -->
# 'pt-pt' and 'pt-br' --> 'pt-br'
bing_ui_lang = eng_lang.lower()
if '-' not in bing_ui_lang:
bing_ui_lang = bing_ui_lang + '-' + bing_ui_lang_map.get(bing_ui_lang, bing_ui_lang)
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != bing_ui_lang:
print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {bing_ui_lang}")
continue
engine_traits.languages[sxng_tag] = bing_ui_lang
# regions (aka "market codes")
engine_traits.regions['zh-CN'] = 'zh-cn'
map_market_codes = {
'zh-hk': 'en-hk', # not sure why, but at M$ this is the market code for Hongkong
}
for href in eval_xpath(dom, '//div[@id="region-section"]//li/a/@href'):
cc_tag = parse_qs(urlparse(href).query)['cc'][0]
if cc_tag == 'clear':
engine_traits.all_locale = cc_tag
continue
# add market codes from official languages of the country ..
for lang_tag in babel.languages.get_official_languages(cc_tag, de_facto=True):
if lang_tag not in engine_traits.languages.keys():
# print("ignore lang: %s <-- %s" % (cc_tag, lang_tag))
continue
lang_tag = lang_tag.split('_')[0] # zh_Hant --> zh
market_code = f"{lang_tag}-{cc_tag}" # zh-tw
market_code = map_market_codes.get(market_code, market_code)
sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, cc_tag.upper())))
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_region:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
if conflict != market_code:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, market_code))
continue
engine_traits.regions[sxng_tag] = eng_region
engine_traits.regions[sxng_tag] = market_code

View File

@ -6,23 +6,20 @@
from typing import TYPE_CHECKING
import uuid
import json
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
from searx.engines.bing import set_bing_cookies
from searx.engines.bing import fetch_traits # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
logger = logging.getLogger()
traits: EngineTraits
@ -45,39 +42,29 @@ time_range_support = True
base_url = 'https://www.bing.com/images/async'
"""Bing (Images) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-image-search/reference/market-codes'
"""Bing (Images) search API description"""
time_map = {
# fmt: off
'day': 60 * 24,
'week': 60 * 24 * 7,
'month': 60 * 24 * 31,
'year': 60 * 24 * 365,
# fmt: on
}
def request(query, params):
"""Assemble a Bing-Image request."""
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
# build URL query
# - example: https://www.bing.com/images/async?q=foo&first=155&count=35
# - example: https://www.bing.com/images/async?q=foo&async=content&first=1&count=35
query_params = {
# fmt: off
'q': query,
'async' : 'content',
'async': '1',
# to simplify the page count lets use the default of 35 images per page
'first': (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count': 35,
# fmt: on
}
# time range
@ -120,13 +107,3 @@ def response(resp):
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-News."""
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)

View File

@ -1,22 +1,23 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing-News: description see :py:obj:`searx.engines.bing`.
.. hint::
Bing News is *different* in some ways!
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import uuid
from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
from searx.engines.bing import set_bing_cookies
if TYPE_CHECKING:
import logging
@ -39,58 +40,48 @@ about = {
# engine dependent config
categories = ['news']
paging = True
"""If go through the pages and there are actually no new results for another
page, then bing returns the results from the last page again."""
time_range_support = True
time_map = {
'day': '4',
'week': '8',
'month': '9',
'day': 'interval="4"',
'week': 'interval="7"',
'month': 'interval="9"',
}
"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the
difference of *last day* and *last week* in the result list is just marginally.
"""
Bing does not have news range ``year`` / we use ``month`` instead."""
base_url = 'https://www.bing.com/news/infinitescrollajax'
"""Bing (News) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes'
"""Bing (News) search API description"""
mkt_alias = {
'zh': 'en-WW',
'zh-CN': 'en-WW',
}
"""Bing News has an official market code 'zh-CN' but we won't get a result with
this market code. For 'zh' and 'zh-CN' we better use the *Worldwide aggregate*
market code (en-WW).
"""
def request(query, params):
"""Assemble a Bing-News request."""
sxng_locale = params['searxng_locale']
engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale)
engine_language = traits.get_language(sxng_locale, 'en')
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
# build URL query
#
# example: https://www.bing.com/news/infinitescrollajax?q=london&first=1
page = int(params.get('pageno', 1)) - 1
query_params = {
# fmt: off
'q': query,
'InfiniteScroll': 1,
# to simplify the page count lets use the default of 10 images per page
'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1,
# fmt: on
'first': page * 10 + 1,
'SFX': page,
'form': 'PTFTNR',
'setlang': engine_region.split('-')[0],
'cc': engine_region.split('-')[-1],
}
if params['time_range']:
# qft=interval:"7"
query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9')
query_params['qft'] = time_map.get(params['time_range'], 'interval="9"')
params['url'] = base_url + '?' + urlencode(query_params)
@ -106,18 +97,34 @@ def response(resp):
dom = html.fromstring(resp.text)
for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'):
for newsitem in eval_xpath_list(dom, '//div[contains(@class, "newsitem")]'):
link = eval_xpath_getindex(newsitem, './/a[@class="title"]', 0, None)
if link is None:
continue
url = link.attrib.get('href')
title = extract_text(link)
content = extract_text(eval_xpath(newsitem, './/div[@class="snippet"]'))
metadata = []
source = eval_xpath_getindex(newsitem, './/div[contains(@class, "source")]', 0, None)
if source is not None:
for item in (
eval_xpath_getindex(source, './/span[@aria-label]/@aria-label', 0, None),
# eval_xpath_getindex(source, './/a', 0, None),
# eval_xpath_getindex(source, './div/span', 3, None),
link.attrib.get('data-author'),
):
if item is not None:
t = extract_text(item)
if t and t.strip():
metadata.append(t.strip())
metadata = ' | '.join(metadata)
url = newsitem.xpath('./@url')[0]
title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip()
content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip()
thumbnail = None
author = newsitem.xpath('./@data-author')[0]
metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip()
img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src')
if img_src:
thumbnail = 'https://www.bing.com/' + img_src[0]
imagelink = eval_xpath_getindex(newsitem, './/a[@class="imagelink"]//img', 0, None)
if imagelink is not None:
thumbnail = 'https://www.bing.com/' + imagelink.attrib.get('src')
results.append(
{
@ -125,7 +132,6 @@ def response(resp):
'title': title,
'content': content,
'img_src': thumbnail,
'author': author,
'metadata': metadata,
}
)
@ -134,17 +140,20 @@ def response(resp):
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-News.
"""Fetch languages and regions from Bing-News."""
# pylint: disable=import-outside-toplevel
The :py:obj:`description <searx.engines.bing_news.bing_traits_url>` of the
first table says *"query parameter when calling the Video Search API."*
.. that's why I use the 4. table "News Category API markets" for the
``xpath_market_codes``.
from searx.engines.bing import fetch_traits as _f
"""
_f(engine_traits)
xpath_market_codes = '//table[4]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
# fix market codes not known by bing news:
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
# In bing the market code 'zh-cn' exists, but there is no 'news' category in
# bing for this market. Alternatively we use the the market code from Honk
# Kong. Even if this is not correct, it is better than having no hits at
# all, or sending false queries to bing that could raise the suspicion of a
# bot.
# HINT: 'en-hk' is the region code it does not indicate the language en!!
engine_traits.regions['zh-CN'] = 'en-hk'

View File

@ -5,18 +5,15 @@
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import uuid
import json
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
from searx.engines.bing import set_bing_cookies
from searx.engines.bing import fetch_traits # pylint: disable=unused-import
from searx.engines.bing_images import time_map
if TYPE_CHECKING:
import logging
@ -44,40 +41,24 @@ time_range_support = True
base_url = 'https://www.bing.com/videos/asyncv2'
"""Bing (Videos) async search URL."""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-video-search/reference/market-codes'
"""Bing (Video) search API description"""
time_map = {
# fmt: off
'day': 60 * 24,
'week': 60 * 24 * 7,
'month': 60 * 24 * 31,
'year': 60 * 24 * 365,
# fmt: on
}
def request(query, params):
"""Assemble a Bing-Video request."""
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
# build URL query
#
# example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35
query_params = {
# fmt: off
'q': query,
'async': 'content',
# to simplify the page count lets use the default of 35 images per page
'first': (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count': 35,
# fmt: on
}
# time range
@ -116,13 +97,3 @@ def response(resp):
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-Videos."""
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)

View File

@ -14,7 +14,6 @@ from lxml import html
from searx.utils import (
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
@ -28,11 +27,9 @@ about = {
}
categories = []
paging = False
time_range_support = False
base_url = 'https://emojipedia.org'
search_url = base_url + '/search/?{query}'
search_url = base_url + '/search?{query}'
def request(query, params):
@ -47,20 +44,10 @@ def response(resp):
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, "//ol[@class='search-results']/li"):
for result in eval_xpath_list(dom, '//div[starts-with(@class, "EmojisList")]/a'):
extracted_desc = extract_text(eval_xpath_getindex(result, './/p', 0))
if 'No results found.' in extracted_desc:
break
link = eval_xpath_getindex(result, './/h2/a', 0)
url = base_url + link.attrib.get('href')
title = extract_text(link)
content = extracted_desc
res = {'url': url, 'title': title, 'content': content}
url = base_url + result.attrib.get('href')
res = {'url': url, 'title': extract_text(result), 'content': ''}
results.append(res)

View File

@ -0,0 +1,45 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Matrixrooms.info (social media)
"""
from urllib.parse import quote_plus
about = {
"website": 'https://matrixrooms.info',
"wikidata_id": 'Q107565255',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
paging = True
categories = ['social media']
base_url = "https://apicdn.matrixrooms.info"
matrix_url = "https://matrix.to"
page_size = 20
def request(query, params):
params['url'] = f"{base_url}/search/{quote_plus(query)}/{page_size}/{(params['pageno']-1)*page_size}"
return params
def response(resp):
results = []
for result in resp.json():
results.append(
{
'url': matrix_url + '/#/' + result['alias'],
'title': result['name'],
'content': result['topic']
+ f" // {result['members']} members"
+ f" // {result['alias']}"
+ f" // {result['server']}",
'thumbnail': result['avatar_url'],
}
)
return results

View File

@ -105,11 +105,11 @@ def response(resp):
item['metadata'] = html_to_text(result.get('meta_short', ''))
if result.get('image'):
item['thumbnail'] = image_url.format(image_id=result['image'], filename=result['image_filename'])
item['img_src'] = image_url.format(image_id=result['image'], filename=result['image_filename'])
else:
item['url'] = result['url']
item['content'] = ', '.join([result['class'], result['info'], result['more']])
item['thumbnail'] = result['image']
item['img_src'] = result['image']
results.append(item)

View File

@ -0,0 +1,63 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Pinterest (images)
"""
from json import dumps
about = {
"website": 'https://www.pinterest.com/',
"wikidata_id": 'Q255381',
"official_api_documentation": 'https://developers.pinterest.com/docs/api/v5/',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
paging = True
base_url = 'https://www.pinterest.com'
def request(query, params):
args = {
'options': {
'query': query,
'bookmarks': [params['engine_data'].get('bookmark', '')],
},
'context': {},
}
params['url'] = f"{base_url}/resource/BaseSearchResource/get/?data={dumps(args)}"
return params
def response(resp):
results = []
json_resp = resp.json()
results.append(
{
'engine_data': json_resp['resource_response']['bookmark'],
# it's called bookmark by pinterest, but it's rather a nextpage
# parameter to get the next results
'key': 'bookmark',
}
)
for result in json_resp['resource_response']['data']['results']:
results.append(
{
'template': 'images.html',
'url': result['link'] or f"{base_url}/pin/{result['id']}/",
'title': result.get('title') or result.get('grid_title'),
'content': (result.get('rich_summary') or {}).get('display_description') or "",
'img_src': result['images']['orig']['url'],
'thumbnail_src': result['images']['236x']['url'],
'source': (result.get('rich_summary') or {}).get('site_name'),
}
)
return results

View File

@ -42,6 +42,17 @@ paging = True
results_per_page = 10
base_url = "https://www.tagesschau.de"
use_source_url = True
"""When set to false, display URLs from Tagesschau, and not the actual source
(e.g. NDR, WDR, SWR, HR, ...)
.. note::
The actual source may contain additional content, such as commentary, that is
not displayed in the Tagesschau.
"""
def request(query, params):
args = {
@ -78,7 +89,7 @@ def _story(item):
'thumbnail': item.get('teaserImage', {}).get('imageVariants', {}).get('16x9-256'),
'publishedDate': datetime.strptime(item['date'][:19], '%Y-%m-%dT%H:%M:%S'),
'content': item['firstSentence'],
'url': item['shareURL'],
'url': item['shareURL'] if use_source_url else item['detailsweb'],
}

View File

@ -133,12 +133,7 @@ def response(resp):
continue
url = parse_url(url)
title = eval_xpath_getindex(result, './/h3/a', 0, default=None)
if title is None:
continue
offset = len(extract_text(title.xpath('span')))
title = extract_text(title)[offset:]
title = extract_text(result.xpath('.//h3//a/@aria-label'))
content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='')
content = extract_text(content, allow_none=True)
@ -164,7 +159,7 @@ def fetch_traits(engine_traits: EngineTraits):
resp = network.get('https://search.yahoo.com/preferences/languages')
if not resp.ok:
print("ERROR: response from peertube is not OK.")
print("ERROR: response from yahoo is not OK.")
dom = html.fromstring(resp.text)
offset = len('lang_')

View File

@ -1018,6 +1018,11 @@ engines:
require_api_key: false
results: HTML
- name: matrixrooms
engine: matrixrooms
shortcut: mtrx
disabled: true
- name: metacpan
engine: metacpan
shortcut: cpan
@ -1201,6 +1206,10 @@ engines:
engine: photon
shortcut: ph
- name: pinterest
engine: pinterest
shortcut: pin
- name: piped
engine: piped
shortcut: ppd
@ -1483,6 +1492,9 @@ engines:
- name: tagesschau
engine: tagesschau
# when set to false, display URLs from Tagesschau, and not the actual source
# (e.g. NDR, WDR, SWR, HR, ...)
use_source_url: true
shortcut: ts
disabled: true