[mod] data: implement a simple tracker URL (SQL) database

The tracker data is now loaded on demand directly into the cache, so this data
no longer has to be maintained via PRs.
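
A minimal sketch of the on-demand idea, assuming a plain SQLite file as the
cache and the same ClearURLs rule list as the upstream source; the names used
here (fetch_rules, get_rules, CACHE_DB, the tracker_rules table) are
illustrative assumptions, not the actual searx.data API:

# Sketch: fetch ClearURLs tracker rules on first use and keep them in a
# local SQLite cache; later calls read from the cache instead of the network.
# All names below are illustrative, not the real SearXNG implementation.
import json
import sqlite3

import httpx

CLEAR_LIST_URL = "https://raw.githubusercontent.com/ClearURLs/Rules/refs/heads/master/data.min.json"
CACHE_DB = "tracker_patterns.sqlite"  # assumed cache location


def fetch_rules() -> list[dict]:
    # Download the ClearURLs provider rules (same upstream the removed script used).
    resp = httpx.get(CLEAR_LIST_URL, timeout=10)
    resp.raise_for_status()
    providers = resp.json()["providers"]
    return [
        {
            "urlPattern": p["urlPattern"].replace("\\\\", "\\"),
            "exceptions": [e.replace("\\\\", "\\") for e in p["exceptions"]],
            "trackerParams": p["rules"],
        }
        for p in providers.values()
    ]


def get_rules() -> list[dict]:
    # Return cached rules; fetch and store them only when the cache is empty.
    con = sqlite3.connect(CACHE_DB)
    con.execute("CREATE TABLE IF NOT EXISTS tracker_rules (rule TEXT)")
    rows = con.execute("SELECT rule FROM tracker_rules").fetchall()
    if rows:
        con.close()
        return [json.loads(r[0]) for r in rows]
    rules = fetch_rules()
    con.executemany(
        "INSERT INTO tracker_rules (rule) VALUES (?)",
        [(json.dumps(r),) for r in rules],
    )
    con.commit()
    con.close()
    return rules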

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Author:    Markus Heiser
Date:      2025-05-24 17:53:57 +02:00
Committer: Bnyro
Parent:    58c10f758b
Commit:    2dd4f7b972

7 changed files with 168 additions and 2067 deletions

@@ -1,36 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fetch trackers"""

import json
import httpx

from searx.data import data_dir

DATA_FILE = data_dir / "tracker_patterns.json"
CLEAR_LIST_URL = "https://raw.githubusercontent.com/ClearURLs/Rules/refs/heads/master/data.min.json"


def fetch_clear_url_filters():
    resp = httpx.get(CLEAR_LIST_URL)
    if resp.status_code != 200:
        # pylint: disable=broad-exception-raised
        raise Exception(f"Error fetching ClearURL filter lists, HTTP code {resp.status_code}")

    providers = resp.json()["providers"]
    rules = []
    for rule in providers.values():
        rules.append(
            {
                "urlPattern": rule["urlPattern"].replace("\\\\", "\\"),  # fix javascript regex syntax
                "exceptions": [exc.replace("\\\\", "\\") for exc in rule["exceptions"]],
                "trackerParams": rule["rules"],
            }
        )
    return rules


if __name__ == '__main__':
    filter_list = fetch_clear_url_filters()
    with DATA_FILE.open("w", encoding='utf-8') as f:
        json.dump(filter_list, f, indent=4, sort_keys=True, ensure_ascii=False)
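
For reference, each entry the removed script wrote to tracker_patterns.json had
roughly the following shape (keys sorted because of sort_keys=True); the pattern
and parameter names below are made up for illustration, not taken from the real
ClearURLs data:

{
    "exceptions": [],
    "trackerParams": ["ref", "utm_medium", "utm_source"],
    "urlPattern": "^https?://(?:[a-z0-9-]+\\.)?example\\.com"
}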