mirror of
https://github.com/searxng/searxng.git
synced 2025-02-20 12:20:04 +00:00
Create initial Kagi engine: kagi.py
This commit is contained in:
parent
2a421825be
commit
6697cb6950
148
searx/engines/kagi.py
Normal file
148
searx/engines/kagi.py
Normal file
@ -0,0 +1,148 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Kagi Search
|
||||
Scrapes Kagi's HTML search results.
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import extract_text, eval_xpath, eval_xpath_list
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
from searx import logger
|
||||
|
||||
logger = logger.getChild('kagi')
|
||||
|
||||
about = {
|
||||
"website": 'https://kagi.com',
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": True,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
categories = ['general', 'web']
|
||||
paging = True
|
||||
time_range_support = False
|
||||
safesearch = False
|
||||
|
||||
base_url = 'https://kagi.com/html/search'
|
||||
|
||||
api_key = None # Set in settings.yml
|
||||
|
||||
# Global cookie storage for Kagi authentication
|
||||
kagi_cookies = {'kagi_session': None, '_kagi_search_': None}
|
||||
|
||||
|
||||
def request(query, params):
|
||||
if not api_key:
|
||||
raise SearxEngineAPIException('missing Kagi API key')
|
||||
|
||||
page = params['pageno']
|
||||
|
||||
if 'cookies' not in params:
|
||||
params['cookies'] = {}
|
||||
params['cookies'].update(kagi_cookies)
|
||||
|
||||
if kagi_cookies['kagi_session'] and kagi_cookies['_kagi_search_']:
|
||||
logger.debug(
|
||||
"Using Kagi cookies for authentication - session: %s, search: %s",
|
||||
kagi_cookies['kagi_session'],
|
||||
kagi_cookies['_kagi_search_'],
|
||||
)
|
||||
search_url = base_url + '?' + urlencode({'q': query, 'batch': page})
|
||||
else:
|
||||
missing = []
|
||||
if not kagi_cookies['kagi_session']:
|
||||
missing.append('kagi_session')
|
||||
if not kagi_cookies['_kagi_search_']:
|
||||
missing.append('_kagi_search_')
|
||||
logger.debug("Missing cookies %s, using API key for initial authentication", missing)
|
||||
search_url = base_url + '?' + urlencode({'q': query, 'token': api_key, 'batch': page})
|
||||
|
||||
params['url'] = search_url
|
||||
params['headers'].update(
|
||||
{
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
|
||||
'AppleWebKit/537.36 (KHTML, like Gecko) '
|
||||
'Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'DNT': '1',
|
||||
}
|
||||
)
|
||||
params['allow_redirects'] = True
|
||||
params['verify'] = True
|
||||
params['max_redirects'] = 1
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
if 'set-cookie' in resp.headers:
|
||||
cookies = resp.headers.get_list('set-cookie')
|
||||
for cookie in cookies:
|
||||
try:
|
||||
cookie_parts = cookie.split('=', 1)
|
||||
if len(cookie_parts) != 2:
|
||||
continue
|
||||
|
||||
name = cookie_parts[0].strip()
|
||||
value = cookie_parts[1].split(';')[0].strip()
|
||||
|
||||
if name == 'kagi_session':
|
||||
if value != kagi_cookies['kagi_session']:
|
||||
kagi_cookies['kagi_session'] = value
|
||||
resp.search_params['cookies']['kagi_session'] = value
|
||||
logger.debug("Updated kagi_session cookie: %s", value)
|
||||
elif name == '_kagi_search_': # Exact match for search cookie
|
||||
if value != kagi_cookies['_kagi_search_']:
|
||||
kagi_cookies['_kagi_search_'] = value
|
||||
resp.search_params['cookies']['_kagi_search_'] = value
|
||||
logger.debug("Updated _kagi_search_ cookie: %s", value)
|
||||
except ValueError as e:
|
||||
logger.warning("Failed to parse Kagi cookie: %s", str(e))
|
||||
|
||||
logger.debug(
|
||||
"Global Kagi cookies - session: %s, search: %s", kagi_cookies['kagi_session'], kagi_cookies['_kagi_search_']
|
||||
)
|
||||
logger.debug(
|
||||
"Request Kagi cookies - session: %s, search: %s",
|
||||
resp.search_params['cookies'].get('kagi_session'),
|
||||
resp.search_params['cookies'].get('_kagi_search_'),
|
||||
)
|
||||
|
||||
if resp.status_code == 401:
|
||||
kagi_cookies['kagi_session'] = None
|
||||
kagi_cookies['_kagi_search_'] = None
|
||||
resp.search_params['cookies'].clear()
|
||||
logger.debug("Cleared invalid Kagi cookies")
|
||||
|
||||
raise SearxEngineAPIException('Invalid Kagi authentication')
|
||||
if resp.status_code == 429:
|
||||
raise SearxEngineAPIException('Kagi rate limit exceeded')
|
||||
if resp.status_code != 200:
|
||||
raise SearxEngineAPIException(f'Unexpected HTTP status code: {resp.status_code}')
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in eval_xpath_list(dom, '//div[contains(@class, "_0_SRI")]'):
|
||||
try:
|
||||
title_tag = eval_xpath(result, './/a[contains(@class, "__sri_title_link")]')[0]
|
||||
title = extract_text(title_tag)
|
||||
url = title_tag.get('href')
|
||||
content_tag = eval_xpath(result, './/div[contains(@class, "__sri-desc")]')
|
||||
content = extract_text(content_tag[0]) if content_tag else ''
|
||||
domain = eval_xpath(result, './/span[contains(@class, "host")]/text()')
|
||||
if domain:
|
||||
domain = domain[0]
|
||||
|
||||
search_result = {'url': url, 'title': title, 'content': content, 'domain': domain}
|
||||
results.append(search_result)
|
||||
|
||||
except (IndexError, KeyError):
|
||||
continue
|
||||
|
||||
return results
|
Loading…
Reference in New Issue
Block a user