Merge remote-tracking branch 'origin/master'

Kang-min Liu
2015-11-14 00:05:44 +01:00
73 changed files with 2780 additions and 1334 deletions

View File

@@ -75,7 +75,7 @@ def load_engine(engine_data):
engine.safesearch = False
if not hasattr(engine, 'timeout'):
- engine.timeout = settings['server']['request_timeout']
+ engine.timeout = settings['outgoing']['request_timeout']
if not hasattr(engine, 'shortcut'):
engine.shortcut = ''

View File

@@ -52,7 +52,7 @@ def request(query, params):
def response(resp):
results = []
- dom = html.fromstring(resp.content)
+ dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath('//div[@class="sa_cc"]'):

View File

@@ -63,7 +63,7 @@ def request(query, params):
def response(resp):
results = []
- dom = html.fromstring(resp.content)
+ dom = html.fromstring(resp.text)
# init regex for yaml-parsing
p = re.compile('({|,)([a-z]+):(")')

View File

@@ -13,6 +13,8 @@
from urllib import urlencode
from cgi import escape
from lxml import etree
+ from random import randint
+ from time import time
# engine dependent config
categories = ['general']
@@ -21,7 +23,7 @@ number_of_results = 5
# search-url, invalid HTTPS certificate
base_url = 'http://gigablast.com/'
- search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0'
+ search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0&uxid={uxid}&rand={rand}'
# specific xpath variables
results_xpath = '//response//result'
@@ -37,7 +39,9 @@ def request(query, params):
search_path = search_string.format(
query=urlencode({'q': query}),
offset=offset,
- number_of_results=number_of_results)
+ number_of_results=number_of_results,
+ uxid=randint(10000, 10000000),
+ rand=int(time()))
params['url'] = base_url + search_path

View File

@@ -9,11 +9,15 @@
# @parse url, title, content, suggestion
import re
+ from cgi import escape
from urllib import urlencode
from urlparse import urlparse, parse_qsl
- from lxml import html
+ from lxml import html, etree
from searx.poolrequests import get
from searx.engines.xpath import extract_text, extract_url
+ from searx.search import logger
+ logger = logger.getChild('google engine')
# engine dependent config
@@ -167,7 +171,7 @@ def parse_url(url_string, google_hostname):
def extract_text_from_dom(result, xpath):
r = result.xpath(xpath)
if len(r) > 0:
- return extract_text(r[0])
+ return escape(extract_text(r[0]))
return None
@@ -224,8 +228,8 @@ def response(resp):
# parse results
for result in dom.xpath(results_xpath):
- title = extract_text(result.xpath(title_xpath)[0])
try:
+ title = extract_text(result.xpath(title_xpath)[0])
url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
parsed_url = urlparse(url, google_hostname)
@@ -268,12 +272,13 @@ def response(resp):
'content': content
})
except:
+ logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
continue
# parse suggestion
for suggestion in dom.xpath(suggestion_xpath):
# append suggestion
- results.append({'suggestion': extract_text(suggestion)})
+ results.append({'suggestion': escape(extract_text(suggestion))})
# return results
return results

View File

@@ -20,7 +20,7 @@ categories = ['videos', 'music', 'files']
paging = True
# search-url
- url = 'https://thepiratebay.am/'
+ url = 'https://thepiratebay.se/'
search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'
# piratebay specific type-definitions

View File

@@ -34,6 +34,11 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno']-1)
+ # Disable SSL verification
+ # error: (60) SSL certificate problem: unable to get local issuer
+ # certificate
+ params['verify'] = False
return params

View File

@@ -27,6 +27,11 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno']-1)
+ # Disable SSL verification
+ # error: (60) SSL certificate problem: unable to get local issuer
+ # certificate
+ params['verify'] = False
return params

View File

@@ -12,6 +12,8 @@
from lxml import html
from cgi import escape
+ from dateutil import parser
+ from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text
@@ -66,20 +68,57 @@ def response(resp):
url = link.attrib.get('href')
# block google-ad url's
if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
if re.match("^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
continue
+ # block startpage search url's
+ if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
+ continue
+ # block ixquick search url's
+ if re.match("^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
+ continue
title = escape(extract_text(link))
if result.xpath('./p[@class="desc"]'):
content = escape(extract_text(result.xpath('./p[@class="desc"]')))
if result.xpath('./p[@class="desc clk"]'):
content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else:
content = ''
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content})
+ published_date = None
+ # check if search result starts with something like: "2 Sep 2014 ... "
+ if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+ published_date = parser.parse(date_string, dayfirst=True)
+ # fix content string
+ content = content[date_pos:]
+ # check if search result starts with something like: "5 days ago ... "
+ elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+ # calculate datetime
+ published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+ # fix content string
+ content = content[date_pos:]
+ if published_date:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'publishedDate': published_date})
+ else:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content})
# return results
return results

View File

@@ -55,10 +55,14 @@ def response(resp):
# parse results
for tweet in dom.xpath(results_xpath):
- link = tweet.xpath(link_xpath)[0]
+ try:
+ link = tweet.xpath(link_xpath)[0]
+ content = extract_text(tweet.xpath(content_xpath)[0])
+ except Exception:
+ continue
url = urljoin(base_url, link.attrib.get('href'))
title = extract_text(tweet.xpath(title_xpath))
- content = extract_text(tweet.xpath(content_xpath)[0])
pubdate = tweet.xpath(timestamp_xpath)
if len(pubdate) > 0:

View File

@@ -1,8 +1,15 @@
import json
- from urllib import urlencode
+ from searx import logger
from searx.poolrequests import get
from searx.utils import format_date_by_locale
+ from datetime import datetime
+ from dateutil.parser import parse as dateutil_parse
+ from urllib import urlencode
+ logger = logger.getChild('wikidata')
result_count = 1
wikidata_host = 'https://www.wikidata.org'
wikidata_api = wikidata_host + '/w/api.php'
@@ -164,14 +171,12 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
if postal_code is not None:
attributes.append({'label': 'Postal code(s)', 'value': postal_code})
- date_of_birth = get_time(claims, 'P569', None)
+ date_of_birth = get_time(claims, 'P569', locale, None)
if date_of_birth is not None:
- date_of_birth = format_date_by_locale(date_of_birth[8:], locale)
attributes.append({'label': 'Date of birth', 'value': date_of_birth})
- date_of_death = get_time(claims, 'P570', None)
+ date_of_death = get_time(claims, 'P570', locale, None)
if date_of_death is not None:
- date_of_death = format_date_by_locale(date_of_death[8:], locale)
attributes.append({'label': 'Date of death', 'value': date_of_death})
if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
@@ -229,7 +234,7 @@ def get_string(claims, propertyName, defaultValue=None):
return result[0]
- def get_time(claims, propertyName, defaultValue=None):
+ def get_time(claims, propertyName, locale, defaultValue=None):
propValue = claims.get(propertyName, {})
if len(propValue) == 0:
return defaultValue
@@ -244,9 +249,22 @@ def get_time(claims, propertyName, defaultValue=None):
result.append(value.get('time', ''))
if len(result) == 0:
- return defaultValue
+ date_string = defaultValue
else:
- return ', '.join(result)
+ date_string = ', '.join(result)
+ try:
+ parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ")
+ except:
+ if date_string.startswith('-'):
+ return date_string.split('T')[0]
+ try:
+ parsed_date = dateutil_parse(date_string, fuzzy=False, default=False)
+ except:
+ logger.debug('could not parse date %s', date_string)
+ return date_string.split('T')[0]
+ return format_date_by_locale(parsed_date, locale)
def get_geolink(claims, propertyName, defaultValue=''):

searx/engines/yandex.py Normal file (+62)
View File

@@ -0,0 +1,62 @@
"""
Yandex (Web)
@website https://yandex.ru/
@provide-api ?
@using-api no
@results HTML (using search portal)
@stable no (HTML can change)
@parse url, title, content
"""
from urllib import urlencode
from lxml import html
from searx.search import logger
logger = logger.getChild('yandex engine')
# engine dependent config
categories = ['general']
paging = True
language_support = True # TODO
default_tld = 'com'
language_map = {'ru': 'ru',
'ua': 'uk',
'tr': 'com.tr'}
# search-url
base_url = 'https://yandex.{tld}/'
search_url = 'search/?{query}&p={page}'
results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]'
url_xpath = './/h2/a/@href'
title_xpath = './/h2/a//text()'
content_xpath = './/div[@class="serp-item__text"]//text()'
def request(query, params):
lang = params['language'].split('_')[0]
host = base_url.format(tld=language_map.get(lang) or default_tld)
params['url'] = host + search_url.format(page=params['pageno']-1,
query=urlencode({'text': query}))
return params
# get response from search-request
def response(resp):
dom = html.fromstring(resp.text)
results = []
for result in dom.xpath(results_xpath):
try:
res = {'url': result.xpath(url_xpath)[0],
'title': ''.join(result.xpath(title_xpath)),
'content': ''.join(result.xpath(content_xpath))}
except:
logger.exception('yandex parse crash')
continue
results.append(res)
return results

View File

@@ -1,93 +0,0 @@
# Youtube (Videos)
#
# @website https://www.youtube.com/
# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, content, publishedDate, thumbnail, embedded
from json import loads
from urllib import urlencode
from dateutil import parser
# engine dependent config
categories = ['videos', 'music']
paging = True
language_support = True
# search-url
base_url = 'https://gdata.youtube.com/feeds/api/videos'
search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5'
embedded_url = '<iframe width="540" height="304" ' +\
'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\
'frameborder="0" allowfullscreen></iframe>'
# do search-request
def request(query, params):
index = (params['pageno'] - 1) * 5 + 1
params['url'] = search_url.format(query=urlencode({'q': query}),
index=index)
# add language tag if specified
if params['language'] != 'all':
params['url'] += '&lr=' + params['language'].split('_')[0]
return params
# get response from search-request
def response(resp):
results = []
search_results = loads(resp.text)
# return empty array if there are no results
if 'feed' not in search_results:
return []
feed = search_results['feed']
# parse results
for result in feed['entry']:
url = [x['href'] for x in result['link'] if x['type'] == 'text/html']
if not url:
continue
# remove tracking
url = url[0].replace('feature=youtube_gdata', '')
if url.endswith('&'):
url = url[:-1]
videoid = url[32:]
title = result['title']['$t']
content = ''
thumbnail = ''
pubdate = result['published']['$t']
publishedDate = parser.parse(pubdate)
if 'media$thumbnail' in result['media$group']:
thumbnail = result['media$group']['media$thumbnail'][0]['url']
content = result['content']['$t']
embedded = embedded_url.format(videoid=videoid)
# append result
results.append({'url': url,
'title': title,
'content': content,
'template': 'videos.html',
'publishedDate': publishedDate,
'embedded': embedded,
'thumbnail': thumbnail})
# return results
return results