Merge remote-tracking branch 'origin/master'

Kang-min Liu
2015-11-14 00:05:44 +01:00
73 changed files with 2780 additions and 1334 deletions

View File

@@ -75,7 +75,7 @@ def load_engine(engine_data):
engine.safesearch = False
if not hasattr(engine, 'timeout'):
- engine.timeout = settings['server']['request_timeout']
+ engine.timeout = settings['outgoing']['request_timeout']
if not hasattr(engine, 'shortcut'):
engine.shortcut = ''

View File

@@ -52,7 +52,7 @@ def request(query, params):
def response(resp):
results = []
- dom = html.fromstring(resp.content)
+ dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath('//div[@class="sa_cc"]'):

View File

@@ -63,7 +63,7 @@ def request(query, params):
def response(resp):
results = []
- dom = html.fromstring(resp.content)
+ dom = html.fromstring(resp.text)
# init regex for yaml-parsing
p = re.compile('({|,)([a-z]+):(")')

View File

@@ -13,6 +13,8 @@
from urllib import urlencode
from cgi import escape
from lxml import etree
+ from random import randint
+ from time import time
# engine dependent config
categories = ['general']
@@ -21,7 +23,7 @@ number_of_results = 5
# search-url, invalid HTTPS certificate
base_url = 'http://gigablast.com/'
- search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0'
+ search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0&uxid={uxid}&rand={rand}'
# specific xpath variables
results_xpath = '//response//result'
@@ -37,7 +39,9 @@ def request(query, params):
search_path = search_string.format(
query=urlencode({'q': query}),
offset=offset,
- number_of_results=number_of_results)
+ number_of_results=number_of_results,
+ uxid=randint(10000, 10000000),
+ rand=int(time()))
params['url'] = base_url + search_path

View File

@@ -9,11 +9,15 @@
# @parse url, title, content, suggestion
import re
+ from cgi import escape
from urllib import urlencode
from urlparse import urlparse, parse_qsl
- from lxml import html
+ from lxml import html, etree
from searx.poolrequests import get
from searx.engines.xpath import extract_text, extract_url
+ from searx.search import logger
+ logger = logger.getChild('google engine')
# engine dependent config
@@ -167,7 +171,7 @@ def parse_url(url_string, google_hostname):
def extract_text_from_dom(result, xpath):
r = result.xpath(xpath)
if len(r) > 0:
- return extract_text(r[0])
+ return escape(extract_text(r[0]))
return None
@@ -224,8 +228,8 @@ def response(resp):
# parse results
for result in dom.xpath(results_xpath):
- title = extract_text(result.xpath(title_xpath)[0])
try:
+ title = extract_text(result.xpath(title_xpath)[0])
url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
parsed_url = urlparse(url, google_hostname)
@@ -268,12 +272,13 @@ def response(resp):
'content': content
})
except:
+ logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
continue
# parse suggestion
for suggestion in dom.xpath(suggestion_xpath):
# append suggestion
- results.append({'suggestion': extract_text(suggestion)})
+ results.append({'suggestion': escape(extract_text(suggestion))})
# return results
return results

View File

@@ -20,7 +20,7 @@ categories = ['videos', 'music', 'files']
paging = True
# search-url
- url = 'https://thepiratebay.am/'
+ url = 'https://thepiratebay.se/'
search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'
# piratebay specific type-definitions

View File

@@ -34,6 +34,11 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno']-1)
+ # Disable SSL verification
+ # error: (60) SSL certificate problem: unable to get local issuer
+ # certificate
+ params['verify'] = False
return params

View File

@@ -27,6 +27,11 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno']-1)
+ # Disable SSL verification
+ # error: (60) SSL certificate problem: unable to get local issuer
+ # certificate
+ params['verify'] = False
return params

View File

@@ -12,6 +12,8 @@
from lxml import html
from cgi import escape
+ from dateutil import parser
+ from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text
@@ -66,20 +68,57 @@ def response(resp):
url = link.attrib.get('href')
# block google-ad url's
if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
if re.match("^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
continue
+ # block startpage search url's
+ if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
+ continue
+ # block ixquick search url's
+ if re.match("^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
+ continue
title = escape(extract_text(link))
if result.xpath('./p[@class="desc"]'):
content = escape(extract_text(result.xpath('./p[@class="desc"]')))
if result.xpath('./p[@class="desc clk"]'):
content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else:
content = ''
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content})
+ published_date = None
+ # check if search result starts with something like: "2 Sep 2014 ... "
+ if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+ published_date = parser.parse(date_string, dayfirst=True)
+ # fix content string
+ content = content[date_pos:]
+ # check if search result starts with something like: "5 days ago ... "
+ elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+ # calculate datetime
+ published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+ # fix content string
+ content = content[date_pos:]
+ if published_date:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'publishedDate': published_date})
+ else:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content})
# return results
return results

View File

@@ -55,10 +55,14 @@ def response(resp):
# parse results
for tweet in dom.xpath(results_xpath):
- link = tweet.xpath(link_xpath)[0]
+ try:
+ link = tweet.xpath(link_xpath)[0]
+ content = extract_text(tweet.xpath(content_xpath)[0])
+ except Exception:
+ continue
url = urljoin(base_url, link.attrib.get('href'))
title = extract_text(tweet.xpath(title_xpath))
- content = extract_text(tweet.xpath(content_xpath)[0])
pubdate = tweet.xpath(timestamp_xpath)
if len(pubdate) > 0:

View File

@@ -1,8 +1,15 @@
import json
- from urllib import urlencode
+ from searx import logger
from searx.poolrequests import get
from searx.utils import format_date_by_locale
+ from datetime import datetime
+ from dateutil.parser import parse as dateutil_parse
+ from urllib import urlencode
+ logger = logger.getChild('wikidata')
result_count = 1
wikidata_host = 'https://www.wikidata.org'
wikidata_api = wikidata_host + '/w/api.php'
@@ -164,14 +171,12 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
if postal_code is not None:
attributes.append({'label': 'Postal code(s)', 'value': postal_code})
- date_of_birth = get_time(claims, 'P569', None)
+ date_of_birth = get_time(claims, 'P569', locale, None)
if date_of_birth is not None:
- date_of_birth = format_date_by_locale(date_of_birth[8:], locale)
attributes.append({'label': 'Date of birth', 'value': date_of_birth})
- date_of_death = get_time(claims, 'P570', None)
+ date_of_death = get_time(claims, 'P570', locale, None)
if date_of_death is not None:
- date_of_death = format_date_by_locale(date_of_death[8:], locale)
attributes.append({'label': 'Date of death', 'value': date_of_death})
if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
@@ -229,7 +234,7 @@ def get_string(claims, propertyName, defaultValue=None):
return result[0]
- def get_time(claims, propertyName, defaultValue=None):
+ def get_time(claims, propertyName, locale, defaultValue=None):
propValue = claims.get(propertyName, {})
if len(propValue) == 0:
return defaultValue
@@ -244,9 +249,22 @@ def get_time(claims, propertyName, defaultValue=None):
result.append(value.get('time', ''))
if len(result) == 0:
- return defaultValue
+ date_string = defaultValue
else:
- return ', '.join(result)
+ date_string = ', '.join(result)
+ try:
+ parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ")
+ except:
+ if date_string.startswith('-'):
+ return date_string.split('T')[0]
+ try:
+ parsed_date = dateutil_parse(date_string, fuzzy=False, default=False)
+ except:
+ logger.debug('could not parse date %s', date_string)
+ return date_string.split('T')[0]
+ return format_date_by_locale(parsed_date, locale)
def get_geolink(claims, propertyName, defaultValue=''):

searx/engines/yandex.py Normal file (+62)
View File

@@ -0,0 +1,62 @@
"""
Yandex (Web)
@website https://yandex.ru/
@provide-api ?
@using-api no
@results HTML (using search portal)
@stable no (HTML can change)
@parse url, title, content
"""
from urllib import urlencode
from lxml import html
from searx.search import logger
logger = logger.getChild('yandex engine')
# engine dependent config
categories = ['general']
paging = True
language_support = True # TODO
default_tld = 'com'
language_map = {'ru': 'ru',
'ua': 'uk',
'tr': 'com.tr'}
# search-url
base_url = 'https://yandex.{tld}/'
search_url = 'search/?{query}&p={page}'
results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]'
url_xpath = './/h2/a/@href'
title_xpath = './/h2/a//text()'
content_xpath = './/div[@class="serp-item__text"]//text()'
def request(query, params):
lang = params['language'].split('_')[0]
host = base_url.format(tld=language_map.get(lang) or default_tld)
params['url'] = host + search_url.format(page=params['pageno']-1,
query=urlencode({'text': query}))
return params
# get response from search-request
def response(resp):
dom = html.fromstring(resp.text)
results = []
for result in dom.xpath(results_xpath):
try:
res = {'url': result.xpath(url_xpath)[0],
'title': ''.join(result.xpath(title_xpath)),
'content': ''.join(result.xpath(content_xpath))}
except:
logger.exception('yandex parse crash')
continue
results.append(res)
return results

View File

@@ -1,93 +0,0 @@
# Youtube (Videos)
#
# @website https://www.youtube.com/
# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, content, publishedDate, thumbnail, embedded
from json import loads
from urllib import urlencode
from dateutil import parser
# engine dependent config
categories = ['videos', 'music']
paging = True
language_support = True
# search-url
base_url = 'https://gdata.youtube.com/feeds/api/videos'
search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5'
embedded_url = '<iframe width="540" height="304" ' +\
'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\
'frameborder="0" allowfullscreen></iframe>'
# do search-request
def request(query, params):
index = (params['pageno'] - 1) * 5 + 1
params['url'] = search_url.format(query=urlencode({'q': query}),
index=index)
# add language tag if specified
if params['language'] != 'all':
params['url'] += '&lr=' + params['language'].split('_')[0]
return params
# get response from search-request
def response(resp):
results = []
search_results = loads(resp.text)
# return empty array if there are no results
if 'feed' not in search_results:
return []
feed = search_results['feed']
# parse results
for result in feed['entry']:
url = [x['href'] for x in result['link'] if x['type'] == 'text/html']
if not url:
continue
# remove tracking
url = url[0].replace('feature=youtube_gdata', '')
if url.endswith('&'):
url = url[:-1]
videoid = url[32:]
title = result['title']['$t']
content = ''
thumbnail = ''
pubdate = result['published']['$t']
publishedDate = parser.parse(pubdate)
if 'media$thumbnail' in result['media$group']:
thumbnail = result['media$group']['media$thumbnail'][0]['url']
content = result['content']['$t']
embedded = embedded_url.format(videoid=videoid)
# append result
results.append({'url': url,
'title': title,
'content': content,
'template': 'videos.html',
'publishedDate': publishedDate,
'embedded': embedded,
'thumbnail': thumbnail})
# return results
return results