mirror of https://github.com/searxng/searxng.git
[enh] removing result html tags
This commit is contained in:
parent
14a53e3430
commit
17bf00ee42
|
@ -1,5 +1,6 @@
|
||||||
from json import loads
|
from json import loads
|
||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
|
from searx.utils import html_to_text
|
||||||
|
|
||||||
url = 'https://duckduckgo.com/'
|
url = 'https://duckduckgo.com/'
|
||||||
search_url = url + 'd.js?{query}&l=us-en&p=1&s=0'
|
search_url = url + 'd.js?{query}&l=us-en&p=1&s=0'
|
||||||
|
@ -16,7 +17,7 @@ def response(resp):
|
||||||
if not r.get('t'):
|
if not r.get('t'):
|
||||||
continue
|
continue
|
||||||
results.append({'title': r['t']
|
results.append({'title': r['t']
|
||||||
,'content': r['a']
|
,'content': html_to_text(r['a'])
|
||||||
,'url': r['u']
|
,'url': r['u']
|
||||||
})
|
})
|
||||||
return results
|
return results
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from urllib import quote
|
from urllib import urlencode
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
from cgi import escape
|
from cgi import escape
|
||||||
|
@ -8,7 +8,7 @@ search_url = base_url+'do/search'
|
||||||
|
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
global search_url
|
global search_url
|
||||||
query = quote(query.replace(' ', '+'), safe='+')
|
query = urlencode({'q': query})[2:]
|
||||||
params['url'] = search_url
|
params['url'] = search_url
|
||||||
params['method'] = 'POST'
|
params['method'] = 'POST'
|
||||||
params['data'] = {'query': query}
|
params['data'] = {'query': query}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from urlparse import urljoin
|
from urlparse import urljoin
|
||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
from cgi import escape
|
||||||
|
|
||||||
categories = ['social media']
|
categories = ['social media']
|
||||||
|
|
||||||
|
@ -21,6 +22,6 @@ def response(resp):
|
||||||
link = tweet.xpath('.//small[@class="time"]//a')[0]
|
link = tweet.xpath('.//small[@class="time"]//a')[0]
|
||||||
url = urljoin(base_url, link.attrib.get('href'))
|
url = urljoin(base_url, link.attrib.get('href'))
|
||||||
title = ''.join(tweet.xpath('.//span[@class="username js-action-profile-name"]//text()'))
|
title = ''.join(tweet.xpath('.//span[@class="username js-action-profile-name"]//text()'))
|
||||||
content = ''.join(map(html.tostring, tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//*')))
|
content = escape(''.join(tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//text()')))
|
||||||
results.append({'url': url, 'title': title, 'content': content})
|
results.append({'url': url, 'title': title, 'content': content})
|
||||||
return results
|
return results
|
||||||
|
|
|
@ -46,12 +46,11 @@ def request(query, params):
|
||||||
def response(resp):
|
def response(resp):
|
||||||
results = []
|
results = []
|
||||||
dom = html.fromstring(resp.text)
|
dom = html.fromstring(resp.text)
|
||||||
query = resp.search_params['query']
|
|
||||||
if results_xpath:
|
if results_xpath:
|
||||||
for result in dom.xpath(results_xpath):
|
for result in dom.xpath(results_xpath):
|
||||||
url = extract_url(result.xpath(url_xpath))
|
url = extract_url(result.xpath(url_xpath))
|
||||||
title = ' '.join(result.xpath(title_xpath))
|
title = ' '.join(result.xpath(title_xpath))
|
||||||
content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
|
content = escape(' '.join(result.xpath(content_xpath)))
|
||||||
results.append({'url': url, 'title': title, 'content': content})
|
results.append({'url': url, 'title': title, 'content': content})
|
||||||
else:
|
else:
|
||||||
for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):
|
for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):
|
||||||
|
|
Loading…
Reference in New Issue