Fix tineye engine url, datetime parsing, and minor refactor

Changes made to tineye engine:
1. Importing logging if TYPE_CHECKING is enabled
2. Remove unnecessary try/except around JSON parsing of the response, as this
masked the original error and had no immediate benefit
3. Improve error handling explicitly for status codes 422 and 400
upfront, deferring JSON parsing to only these status codes and
successful status codes
4. Unit test all new applicable changes to ensure compatibility
This commit is contained in:
Grant Lanham 2024-08-19 23:02:06 -04:00 committed by Markus Heiser
parent 5be55e3309
commit 5276219b9d
2 changed files with 130 additions and 29 deletions

View File

@ -14,10 +14,16 @@ billion images `[tineye.com] <https://tineye.com/how>`_.
""" """
from typing import TYPE_CHECKING
from urllib.parse import urlencode from urllib.parse import urlencode
from datetime import datetime from datetime import datetime
from flask_babel import gettext from flask_babel import gettext
if TYPE_CHECKING:
import logging
logger = logging.getLogger()
about = { about = {
"website": 'https://tineye.com', "website": 'https://tineye.com',
"wikidata_id": 'Q2382535', "wikidata_id": 'Q2382535',
@ -34,7 +40,7 @@ categories = ['general']
paging = True paging = True
safesearch = False safesearch = False
base_url = 'https://tineye.com' base_url = 'https://tineye.com'
search_string = '/result_json/?page={page}&{query}' search_string = '/api/v1/result_json/?page={page}&{query}'
FORMAT_NOT_SUPPORTED = gettext( FORMAT_NOT_SUPPORTED = gettext(
"Could not read that image url. This may be due to an unsupported file" "Could not read that image url. This may be due to an unsupported file"
@ -120,7 +126,7 @@ def parse_tineye_match(match_json):
crawl_date = backlink_json.get("crawl_date") crawl_date = backlink_json.get("crawl_date")
if crawl_date: if crawl_date:
crawl_date = datetime.fromisoformat(crawl_date[:-3]) crawl_date = datetime.strptime(crawl_date, '%Y-%m-%d')
else: else:
crawl_date = datetime.min crawl_date = datetime.min
@ -150,29 +156,15 @@ def parse_tineye_match(match_json):
def response(resp): def response(resp):
"""Parse HTTP response from TinEye.""" """Parse HTTP response from TinEye."""
results = []
try: # handle the 422 client side errors, and the possible 400 status code error
json_data = resp.json()
except Exception as exc: # pylint: disable=broad-except
msg = "can't parse JSON response // %s" % exc
logger.error(msg)
json_data = {'error': msg}
# handle error codes from Tineye
if resp.is_error:
if resp.status_code in (400, 422): if resp.status_code in (400, 422):
json_data = resp.json()
suggestions = json_data.get('suggestions', {})
message = f'HTTP Status Code: {resp.status_code}'
message = 'HTTP status: %s' % resp.status_code if resp.status_code == 422:
error = json_data.get('error') s_key = suggestions.get('key', '')
s_key = json_data.get('suggestions', {}).get('key', '')
if error and s_key:
message = "%s (%s)" % (error, s_key)
elif error:
message = error
if s_key == "Invalid image URL": if s_key == "Invalid image URL":
# test https://docs.searxng.org/_static/searxng-wordmark.svg # test https://docs.searxng.org/_static/searxng-wordmark.svg
message = FORMAT_NOT_SUPPORTED message = FORMAT_NOT_SUPPORTED
@ -182,16 +174,23 @@ def response(resp):
elif s_key == 'Download Error': elif s_key == 'Download Error':
# test https://notexists # test https://notexists
message = DOWNLOAD_ERROR message = DOWNLOAD_ERROR
else:
logger.warning("Unknown suggestion key encountered: %s", s_key)
else: # 400
description = suggestions.get('description')
if isinstance(description, list):
message = ','.join(description)
# see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023 # see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
# results.append({'answer': message}) # results.append({'answer': message})
logger.error(message) logger.error(message)
return []
return results # Raise for all other responses
resp.raise_for_status() resp.raise_for_status()
# append results from matches results = []
json_data = resp.json()
for match_json in json_data['matches']: for match_json in json_data['matches']:

102
tests/unit/test_tineye.py Normal file
View File

@ -0,0 +1,102 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
from datetime import datetime
from unittest.mock import Mock
from requests import HTTPError
from searx.engines import load_engines, tineye
from tests import SearxTestCase
class TinEyeTests(SearxTestCase):
    """Unit tests for ``tineye.response``: error statuses, logging and date parsing."""

    def setUp(self):
        """Load the tineye engine (disabled) through ``load_engines`` before each test."""
        engine_settings = {
            'name': 'tineye',
            'engine': 'tineye',
            'shortcut': 'tin',
            'timeout': 9.0,
            'disabled': True,
        }
        load_engines([engine_settings])

    def tearDown(self):
        """Reset the loaded engines to an empty list after each test."""
        load_engines([])

    @staticmethod
    def _error_response(status_code, payload):
        """Build a mocked response whose ``raise_for_status()`` raises ``HTTPError``.

        :param status_code: value exposed as ``status_code`` on the mock.
        :param payload: object returned by the mock's ``json()`` method.
        """
        resp = Mock()
        resp.json.return_value = payload
        resp.status_code = status_code
        resp.raise_for_status.side_effect = HTTPError()
        return resp

    def _joined_log_output(self, resp):
        """Run ``tineye.response`` on *resp* and return the captured log lines joined by ','.

        Fails the test (via ``assertLogs``) if nothing is logged on ``tineye.logger``.
        """
        with self.assertLogs(tineye.logger) as captured:
            tineye.response(resp)
        return ','.join(captured.output)

    def test_status_code_raises(self):
        """A status other than 400/422 is propagated through ``raise_for_status()``."""
        resp = Mock()
        resp.status_code = 401
        resp.raise_for_status.side_effect = HTTPError()
        with self.assertRaises(HTTPError):
            tineye.response(resp)

    def test_returns_empty_list_for_422(self):
        """A 422 response with an empty JSON body logs something and yields no results."""
        resp = self._error_response(422, {})
        with self.assertLogs(tineye.logger):
            results = tineye.response(resp)
        self.assertEqual(0, len(results))

    def test_logs_format_for_422(self):
        """The 'Invalid image URL' suggestion key logs ``FORMAT_NOT_SUPPORTED``."""
        resp = self._error_response(422, {"suggestions": {"key": "Invalid image URL"}})
        self.assertIn(tineye.FORMAT_NOT_SUPPORTED, self._joined_log_output(resp))

    def test_logs_signature_for_422(self):
        """The 'NO_SIGNATURE_ERROR' suggestion key logs ``NO_SIGNATURE_ERROR``."""
        resp = self._error_response(422, {"suggestions": {"key": "NO_SIGNATURE_ERROR"}})
        self.assertIn(tineye.NO_SIGNATURE_ERROR, self._joined_log_output(resp))

    def test_logs_download_for_422(self):
        """The 'Download Error' suggestion key logs ``DOWNLOAD_ERROR``."""
        resp = self._error_response(422, {"suggestions": {"key": "Download Error"}})
        self.assertIn(tineye.DOWNLOAD_ERROR, self._joined_log_output(resp))

    def test_empty_list_for_400(self):
        """A 400 response with an empty JSON body logs something and yields no results."""
        resp = self._error_response(400, {})
        with self.assertLogs(tineye.logger):
            results = tineye.response(resp)
        self.assertEqual(0, len(results))

    def test_logs_description_for_400(self):
        """For a 400 response the suggestion description text ends up in the log output."""
        description = 'There was a problem with that request. Error ID: ad5fc955-a934-43c1-8187-f9a61d301645'
        payload = {"suggestions": {"description": [description], "title": "Oops! We're sorry!"}}
        resp = self._error_response(400, payload)
        self.assertIn(description, self._joined_log_output(resp))

    def test_crawl_date_parses(self):
        """A ``YYYY-MM-DD`` crawl date is exposed as ``publishedDate`` on the first result."""
        date_str = '2020-05-25'
        expected = datetime.strptime(date_str, '%Y-%m-%d')

        resp = Mock()
        resp.json.return_value = {'matches': [{'backlinks': [{'crawl_date': date_str}]}]}
        resp.status_code = 200

        results = tineye.response(resp)
        self.assertEqual(expected, results[0]['publishedDate'])