searxng/searx/engines/loc.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Library of Congress: query Photo, Print and Drawing from API endpoint_
``photos``.

.. _endpoint: https://www.loc.gov/apis/json-and-yaml/requests/endpoints/

.. note::

   Besides the ``photos`` endpoint_ there are more endpoints available; we are
   looking forward to contributions implementing more endpoints.

"""

from urllib.parse import urlencode
from searx.network import raise_for_httperror

# Engine metadata: where the service lives and how its API is consumed.
about = {
    'website': 'https://www.loc.gov/pictures/',
    'wikidata_id': 'Q131454',
    'official_api_documentation': 'https://www.loc.gov/api',
    'use_official_api': True,
    'require_api_key': False,
    'results': 'JSON',
}

# Engine configuration: results are images and are fetched page by page
# (the page number is sent in the ``sp`` URL argument, see ``search_string``).
categories = ['images']
paging = True

# API endpoint that is queried, relative to ``base_url``.
endpoint = 'photos'
base_url = 'https://www.loc.gov'

# URL path template; ``{query}`` is replaced by the already URL-encoded
# ``q=...`` argument and ``fo=json`` selects the JSON output format.
search_string = '/{endpoint}/?sp={page}&{query}&fo=json'


def request(query, params):
    """Fill ``params`` with the search URL for ``query`` and return it.

    The URL is built from ``base_url`` and ``search_string``: the configured
    ``endpoint``, the requested page number (``params['pageno']``) and the
    URL-encoded search term.

    ``params['raise_for_httperror']`` is set to ``False``; the HTTP status is
    checked in :py:func:`response` instead, which has to tell a "no results"
    HTTP 404 apart from a real error.
    """
    encoded_query = urlencode({'q': query})
    url_path = search_string.format(
        page=params['pageno'],
        query=encoded_query,
        endpoint=endpoint,
    )

    params['url'] = f"{base_url}{url_path}"
    params['raise_for_httperror'] = False
    return params


def _first_entry(value):
    """Return the first element of a list-like ``value``, or ``None`` if empty.

    A bare string is returned unchanged (instead of being indexed down to its
    first character); a missing value (``None``) yields ``None``.
    """
    if isinstance(value, (list, tuple)):
        return value[0] if value else None
    return value


def response(resp):
    """Convert the JSON answer of the Library of Congress API into results.

    :param resp: HTTP response object; its ``json()`` payload is parsed and
        the object itself is passed on to ``raise_for_httperror``.
    :returns: a list of result dicts for the ``images.html`` template.  Items
        without a link or without a usable image are skipped.
    :raises: whatever ``raise_for_httperror`` raises for ``resp``.
    """
    results = []
    json_data = resp.json()

    json_results = json_data.get('results')
    # When a search term has no results, loc sends a JSON document in an HTTP
    # 404 response and the HTTP status code is set in the 'status' element.
    if not json_results and json_data.get('status') == 404:
        return results

    # request() sets params['raise_for_httperror'] = False, so the status of
    # every other response is checked explicitly here.
    raise_for_httperror(resp)

    # 'results' may be missing or null in a response that is not an HTTP
    # error; treat that like an empty result list instead of iterating None.
    for result in json_results or []:

        item = result.get('item') or {}

        url = item.get('link')
        if not url:
            continue

        img_src = item.get('service_medium')
        # grp.gif is skipped as in the original code -- presumably a generic
        # placeholder image rather than a picture of the item itself.
        if not img_src or img_src == 'https://memory.loc.gov/pp/grp.gif':
            continue

        title = result.get('title') or ''
        if title.startswith('['):
            title = title.strip('[]')

        # The list-valued fields can be absent or empty; _first_entry() copes
        # with both, so one sparse record does not abort the whole page.
        content_items = [
            item.get('created_published_date'),
            _first_entry(item.get('summary')),
            _first_entry(item.get('notes')),
            _first_entry(item.get('part_of')),
        ]

        creators = item.get('creators')
        author = creators[0].get('title') if creators else None

        results.append(
            {
                'template': 'images.html',
                'url': url,
                'title': title,
                'content': ' / '.join([i for i in content_items if i]),
                'img_src': img_src,
                'thumbnail_src': item.get('thumb_gallery'),
                'author': author,
            }
        )

    return results
Add Library of Congress engine 2021-02-07 21:10:20 +00:00			`# SPDX-License-Identifier: AGPL-3.0-or-later`
[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00			`"""Library of Congress: query Photo, Print and Drawing from API endpoint_`
			``photos``.

			`.. _endpoint: https://www.loc.gov/apis/json-and-yaml/requests/endpoints/`

			`.. note::`
Add Library of Congress engine 2021-02-07 21:10:20 +00:00
[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00			Beside the ``photos`` endpoint_ there are more endpoints available / we are
			`looking forward for contributions implementing more endpoints.`
Add Library of Congress engine 2021-02-07 21:10:20 +00:00
			`"""`

			`from urllib.parse import urlencode`
[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00			`from searx.network import raise_for_httperror`
Add Library of Congress engine 2021-02-07 21:10:20 +00:00
			`about = {`
			`"website": 'https://www.loc.gov/pictures/',`
			`"wikidata_id": 'Q131454',`
[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00			`"official_api_documentation": 'https://www.loc.gov/api',`
Add Library of Congress engine 2021-02-07 21:10:20 +00:00			`"use_official_api": True,`
			`"require_api_key": False,`
			`"results": 'JSON',`
			`}`

			`categories = ['images']`
			`paging = True`

[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00			`endpoint = 'photos'`
[fix] engine Library of Congress: fix API URL loc.gov -> www.loc.gov Avoid HTTP 404 and redirects. Requests to the JSON/YAML API use the base url [1] https://www.loc.gov/{endpoint}/?fo=json [1] https://www.loc.gov/apis/json-and-yaml/requests/ Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-11-23 09:32:15 +00:00			`base_url = 'https://www.loc.gov'`
[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00			`search_string = "/{endpoint}/?sp={page}&{query}&fo=json"`
Add Library of Congress engine 2021-02-07 21:10:20 +00:00

			`def request(query, params):`

[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00			`search_path = search_string.format(`
			`endpoint=endpoint,`
			`query=urlencode({'q': query}),`
			`page=params['pageno'],`
			`)`
Add Library of Congress engine 2021-02-07 21:10:20 +00:00			`params['url'] = base_url + search_path`
[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00			`params['raise_for_httperror'] = False`
Add Library of Congress engine 2021-02-07 21:10:20 +00:00			`return params`


			`def response(resp):`
[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00
Add Library of Congress engine 2021-02-07 21:10:20 +00:00			`results = []`
[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00			`json_data = resp.json()`
Add Library of Congress engine 2021-02-07 21:10:20 +00:00
[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00			`json_results = json_data.get('results')`
			`if not json_results:`
			`# when a search term has none results, loc sends a JSON in a HTTP 404`
			`# response and the HTTP status code is set in the 'status' element.`
			`if json_data.get('status') == 404:`
			`return results`

			`raise_for_httperror(resp)`

			`for result in json_results:`

			`url = result["item"].get("link")`
			`if not url:`
			`continue`

			`img_src = result['item'].get('service_medium')`
			`if not img_src or img_src == 'https://memory.loc.gov/pp/grp.gif':`
			`continue`

			`title = result['title']`
			`if title.startswith('['):`
			`title = title.strip('[]')`

			`content_items = [`
			`result['item'].get('created_published_date'),`
			`result['item'].get('summary', [None])[0],`
			`result['item'].get('notes', [None])[0],`
			`result['item'].get('part_of', [None])[0],`
			`]`

			`author = None`
			`if result['item'].get('creators'):`
			`author = result['item']['creators'][0]['title']`
Add Library of Congress engine 2021-02-07 21:10:20 +00:00
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 08:26:22 +00:00			`results.append(`
			`{`
			`'template': 'images.html',`
[mod] library of congress: fix engine 2023-09-11 06:22:32 +00:00			`'url': url,`
			`'title': title,`
			`'content': ' / '.join([i for i in content_items if i]),`
			`'img_src': img_src,`
			`'thumbnail_src': result['item'].get('thumb_gallery'),`
			`'author': author,`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 08:26:22 +00:00			`}`
			`)`
Add Library of Congress engine 2021-02-07 21:10:20 +00:00
			`return results`