Compare commits

..

2 Commits

Author SHA1 Message Date
Bnyro b7648fb6be
Merge d5b0fb3d03 into b8f1a329d3 2024-11-22 10:03:04 +01:00
Bnyro d5b0fb3d03 [feat] engine: add adobe stock photos 2024-11-09 17:46:49 +01:00
9 changed files with 139 additions and 266 deletions

View File

@ -1,13 +0,0 @@
.. _adobe stock engine:
===========
Adobe Stock
===========
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. automodule:: searx.engines.adobe_stock
:members:

View File

@ -4,27 +4,22 @@ Welcome to SearXNG
*Search without being tracked.*
.. jinja:: searx
SearXNG is a free internet metasearch engine which aggregates results from up
to {{engines | length}} :ref:`search services <configured engines>`. Users
are neither tracked nor profiled. Additionally, SearXNG can be used over Tor
for online anonymity.
SearXNG is a free internet metasearch engine which aggregates results from more
than 70 search services. Users are neither tracked nor profiled. Additionally,
SearXNG can be used over Tor for online anonymity.
Get started with SearXNG by using one of the instances listed at searx.space_.
If you don't trust anyone, you can set up your own, see :ref:`installation`.
.. jinja:: searx
.. sidebar:: features
.. sidebar:: features
- :ref:`self hosted <installation>`
- :ref:`no user tracking / no profiling <SearXNG protect privacy>`
- script & cookies are optional
- secure, encrypted connections
- :ref:`{{engines | length}} search engines <configured engines>`
- `58 translations <https://translate.codeberg.org/projects/searxng/searxng/>`_
- about 70 `well maintained <https://uptime.searxng.org/>`__ instances on searx.space_
- :ref:`about 200 search engines <configured engines>`
- `about 60 translations <https://translate.codeberg.org/projects/searxng/searxng/>`_
- about 100 `well maintained <https://uptime.searxng.org/>`__ instances on searx.space_
- :ref:`easy integration of search engines <demo online engine>`
- professional development: `CI <https://github.com/searxng/searxng/actions>`_,
`quality assurance <https://dev.searxng.org/>`_ &

View File

@ -19,4 +19,3 @@ tomli==2.0.2; python_version < '3.11'
msgspec==0.18.6
eval_type_backport; python_version < '3.9'
typer-slim==0.13.1
isodate==0.7.2

View File

@ -1,229 +1,67 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Adobe Stock`_ is a service that gives access to millions of royalty-free
assets. Assets types include photos, vectors, illustrations, templates, 3D
assets, videos, motion graphics templates and audio tracks.
.. Adobe Stock: https://stock.adobe.com/
Configuration
=============
The engine has the following mandatory setting:
- SearXNG's :ref:`engine categories`
- Adobe-Stock's :py:obj:`adobe_order`
- Adobe-Stock's :py:obj:`adobe_content_types`
.. code:: yaml
- name: adobe stock
engine: adobe_stock
shortcut: asi
categories: [images]
adobe_order: relevance
adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
- name: adobe stock video
engine: adobe_stock
network: adobe stock
shortcut: asi
categories: [videos]
adobe_order: relevance
adobe_content_types: ["video"]
Implementation
==============
"""Adobe Stock (images)
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from datetime import datetime, timedelta
from urllib.parse import urlencode
import isodate
if TYPE_CHECKING:
import logging
logger: logging.Logger
from searx.utils import gen_useragent
about = {
"website": "https://stock.adobe.com/",
"wikidata_id": "Q5977430",
"website": 'https://stock.adobe.com/',
"wikidata_id": 'Q5977430',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
"results": 'JSON',
}
categories = []
categories = ['images']
paging = True
send_accept_language_header = True
base_url = 'https://stock.adobe.com'
results_per_page = 10
base_url = "https://stock.adobe.com"
adobe_order: str = ""
"""Sort order, can be one of:
- ``relevance`` or
- ``featured`` or
- ``creation`` (most recent) or
- ``nb_downloads`` (number of downloads)
"""
ADOBE_VALID_TYPES = ["photo", "illustration", "zip_vector", "video", "template", "3d", "audio", "image"]
adobe_content_types: list = []
"""A list of of content types. The following content types are offered:
- Images: ``image``
- Videos: ``video``
- Templates: ``template``
- 3D: ``3d``
- Audio ``audio``
Additional subcategories:
- Photos: ``photo``
- Illustrations: ``illustration``
- Vectors: ``zip_vector`` (Vectors),
"""
# Do we need support for "free_collection" and "include_stock_enterprise"?
def init(_):
if not categories:
raise ValueError("adobe_stock engine: categories is unset")
# adobe_order
if not adobe_order:
raise ValueError("adobe_stock engine: adobe_order is unset")
if adobe_order not in ["relevance", "featured", "creation", "nb_downloads"]:
raise ValueError(f"unsupported adobe_order: {adobe_order}")
# adobe_content_types
if not adobe_content_types:
raise ValueError("adobe_stock engine: adobe_content_types is unset")
if isinstance(adobe_content_types, list):
for t in adobe_content_types:
if t not in ADOBE_VALID_TYPES:
raise ValueError("adobe_stock engine: adobe_content_types: '%s' is invalid" % t)
else:
raise ValueError(
"adobe_stock engine: adobe_content_types must be a list of strings not %s" % type(adobe_content_types)
)
adobe_order = "relevance" # one of 'relevant', 'featured', 'creation' or 'nb_downloads'
def request(query, params):
args = {
"k": query,
"limit": results_per_page,
"order": adobe_order,
"search_page": params["pageno"],
"search_type": "pagination",
'k': query,
'limit': results_per_page,
'order': adobe_order,
'search_page': params['pageno'],
'search_type': 'pagination',
'filters[content_type:video]': 0,
'filters[content_type:audio]': 0,
}
for content_type in ADOBE_VALID_TYPES:
args[f"filters[content_type:{content_type}]"] = 1 if content_type in adobe_content_types else 0
params["url"] = f"{base_url}/de/Ajax/Search?{urlencode(args)}"
params['url'] = f"{base_url}/de/Ajax/Search?{urlencode(args)}"
# headers required to bypass bot-detection
if params["searxng_locale"] == "all":
params["headers"]["Accept-Language"] = "en-US,en;q=0.5"
params['headers'] = {
"User-Agent": gen_useragent(),
"Accept-Language": "en-US,en;q=0.5",
}
return params
def parse_image_item(item):
return {
"template": "images.html",
"url": item["content_url"],
"title": item["title"],
"content": item["asset_type"],
"img_src": item["content_thumb_extra_large_url"],
"thumbnail_src": item["thumbnail_url"],
"resolution": f"{item['content_original_width']}x{item['content_original_height']}",
"img_format": item["format"],
"author": item["author"],
}
def parse_video_item(item):
# in video items, the title is more or less a "content description", we try
# to reduce the lenght of the title ..
title = item["title"]
content = ""
if "." in title.strip()[:-1]:
content = title
title = title.split(".", 1)[0]
elif "," in title:
content = title
title = title.split(",", 1)[0]
elif len(title) > 50:
content = title
title = ""
for w in content.split(" "):
title += f" {w}"
if len(title) > 50:
title = title.strip() + "\u2026"
break
return {
"template": "videos.html",
"url": item["content_url"],
"title": title,
"content": content,
# https://en.wikipedia.org/wiki/ISO_8601#Durations
"length": isodate.parse_duration(item["time_duration"]),
"publishedDate": datetime.strptime(item["creation_date"], "%Y-%m-%d"),
"thumbnail": item["thumbnail_url"],
"iframe_src": item["video_small_preview_url"],
"metadata": item["asset_type"],
}
def parse_audio_item(item):
audio_data = item["audio_data"]
content = audio_data.get("description") or ""
if audio_data.get("album"):
content = audio_data["album"] + " - " + content
return {
"url": item["content_url"],
"title": item["title"],
"content": content,
# "thumbnail": base_url + item["thumbnail_url"],
"iframe_src": audio_data["preview"]["url"],
"publishedDate": datetime.fromisoformat(audio_data["release_date"]) if audio_data["release_date"] else None,
"length": timedelta(seconds=round(audio_data["duration"] / 1000)) if audio_data["duration"] else None,
"author": item.get("artist_name"),
}
def response(resp):
results = []
json_resp = resp.json()
if isinstance(json_resp["items"], list):
return None
for item in json_resp["items"].values():
if item["asset_type"].lower() in ["image", "premium-image", "illustration", "vector"]:
result = parse_image_item(item)
elif item["asset_type"].lower() == "video":
result = parse_video_item(item)
elif item["asset_type"].lower() == "audio":
result = parse_audio_item(item)
else:
logger.error("no handle for %s --> %s", item["asset_type"], item)
continue
results.append(result)
for item in json_resp['items'].values():
results.append(
{
'template': 'images.html',
'url': item['content_url'],
'title': item['title'],
'content': '',
'img_src': item['content_thumb_extra_large_url'],
'thumbnail_src': item['thumbnail_url'],
'resolution': f"{item['content_original_width']}x{item['content_original_height']}",
'img_format': item['format'],
'author': item['author'],
}
)
return results

View File

@ -0,0 +1,71 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Internet Archive scholar(science)
"""
from datetime import datetime
from urllib.parse import urlencode
from searx.utils import html_to_text
about = {
"website": "https://scholar.archive.org/",
"wikidata_id": "Q115667709",
"official_api_documentation": "https://scholar.archive.org/api/redoc",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
categories = ['science', 'scientific publications']
paging = True
base_url = "https://scholar.archive.org"
results_per_page = 15
def request(query, params):
args = {
"q": query,
"limit": results_per_page,
"offset": (params["pageno"] - 1) * results_per_page,
}
params["url"] = f"{base_url}/search?{urlencode(args)}"
params["headers"]["Accept"] = "application/json"
return params
def response(resp):
results = []
json = resp.json()
for result in json["results"]:
publishedDate, content, doi = None, '', None
if result['biblio'].get('release_date'):
publishedDate = datetime.strptime(result['biblio']['release_date'], "%Y-%m-%d")
if len(result['abstracts']) > 0:
content = result['abstracts'][0].get('body')
elif len(result['_highlights']) > 0:
content = result['_highlights'][0]
if len(result['releases']) > 0:
doi = result['releases'][0].get('doi')
results.append(
{
'template': 'paper.html',
'url': result['fulltext']['access_url'],
'title': result['biblio'].get('title') or result['biblio'].get('container_name'),
'content': html_to_text(content),
'publisher': result['biblio'].get('publisher'),
'doi': doi,
'journal': result['biblio'].get('container_name'),
'authors': result['biblio'].get('contrib_names'),
'tags': result['tags'],
'publishedDate': publishedDate,
'issns': result['biblio'].get('issns'),
'pdf_url': result['fulltext'].get('access_url'),
}
)
return results

View File

@ -27,7 +27,7 @@ categories = ['images']
paging = True
endpoint = 'photos'
base_url = 'https://www.loc.gov'
base_url = 'https://loc.gov'
search_string = "/{endpoint}/?sp={page}&{query}&fo=json"

View File

@ -233,7 +233,8 @@ class Network:
del kwargs['raise_for_httperror']
return do_raise_for_httperror
def patch_response(self, response, do_raise_for_httperror):
@staticmethod
def patch_response(response, do_raise_for_httperror):
if isinstance(response, httpx.Response):
# requests compatibility (response is not streamed)
# see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
@ -241,11 +242,8 @@ class Network:
# raise an exception
if do_raise_for_httperror:
try:
raise_for_httperror(response)
except:
self._logger.warning(f"HTTP Request failed: {response.request.method} {response.request.url}")
raise
return response
def is_valid_response(self, response):
@ -271,7 +269,7 @@ class Network:
else:
response = await client.request(method, url, **kwargs)
if self.is_valid_response(response) or retries <= 0:
return self.patch_response(response, do_raise_for_httperror)
return Network.patch_response(response, do_raise_for_httperror)
except httpx.RemoteProtocolError as e:
if not was_disconnected:
# the server has closed the connection:

View File

@ -137,6 +137,9 @@ class OnlineProcessor(EngineProcessor):
self.engine.request(query, params)
# ignoring empty urls
if params['url'] is None:
return None
if not params['url']:
return None

View File

@ -327,32 +327,9 @@ engines:
- name: adobe stock
engine: adobe_stock
shortcut: asi
categories: ["images"]
# https://docs.searxng.org/dev/engines/online/adobe_stock.html
adobe_order: relevance
adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
timeout: 6
disabled: true
- name: adobe stock video
engine: adobe_stock
shortcut: asv
network: adobe stock
categories: ["videos"]
adobe_order: relevance
adobe_content_types: ["video"]
timeout: 6
disabled: true
- name: adobe stock audio
engine: adobe_stock
shortcut: asa
network: adobe stock
categories: ["music"]
adobe_order: relevance
adobe_content_types: ["audio"]
timeout: 6
# available search orders: 'relevant', 'featured', 'creation', 'nb_downloads'
# adobe_order: relevance
shortcut: as
disabled: true
- name: alpine linux packages
@ -1652,6 +1629,11 @@ engines:
api_site: 'askubuntu'
categories: [it, q&a]
- name: internetarchivescholar
engine: internet_archive_scholar
shortcut: ias
timeout: 15.0
- name: superuser
engine: stackexchange
shortcut: su