mirror of https://github.com/searxng/searxng.git
[enh] fix content fetching, parse published date from description
This commit is contained in:
parent
a959977ab4
commit
4508c96667
|
@ -12,6 +12,8 @@
|
|||
|
||||
from lxml import html
|
||||
from cgi import escape
|
||||
from dateutil import parser
|
||||
from datetime import datetime, timedelta
|
||||
import re
|
||||
from searx.engines.xpath import extract_text
|
||||
|
||||
|
@ -79,11 +81,40 @@ def response(resp):
|
|||
|
||||
title = escape(extract_text(link))
|
||||
|
||||
if result.xpath('./p[@class="desc"]'):
|
||||
content = escape(extract_text(result.xpath('./p[@class="desc"]')))
|
||||
if result.xpath('./p[@class="desc clk"]'):
|
||||
content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
|
||||
else:
|
||||
content = ''
|
||||
|
||||
published_date = None
|
||||
|
||||
# check if search result starts with something like: "2 Sep 2014 ... "
|
||||
if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
|
||||
date_pos = content.find('...')+4
|
||||
date_string = content[0:date_pos-5]
|
||||
published_date = parser.parse(date_string, dayfirst=True)
|
||||
|
||||
# fix content string
|
||||
content = content[date_pos:]
|
||||
|
||||
# check if search result starts with something like: "5 days ago ... "
|
||||
elif re.match("^[0-9]+ days? ago \.\.\. ", content):
|
||||
date_pos = content.find('...')+4
|
||||
date_string = content[0:date_pos-5]
|
||||
|
||||
# calculate datetime
|
||||
published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
|
||||
|
||||
# fix content string
|
||||
content = content[date_pos:]
|
||||
|
||||
if published_date:
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'publishedDate': published_date})
|
||||
else:
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
'title': title,
|
||||
|
|
|
@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
|
|||
</a>
|
||||
<span id='title_stars_2' name='title_stars_2'> </span>
|
||||
</h3>
|
||||
<p class='desc'>
|
||||
<p class='desc clk'>
|
||||
This should be the content.
|
||||
</p>
|
||||
<p>
|
||||
|
@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
|
|||
</a>
|
||||
<span id='title_stars_2' name='title_stars_2'> </span>
|
||||
</h3>
|
||||
<p class='desc'>
|
||||
<p class='desc clk'>
|
||||
This should be the content.
|
||||
</p>
|
||||
<p>
|
||||
|
@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
|
|||
<h3>
|
||||
<span id='title_stars_2' name='title_stars_2'> </span>
|
||||
</h3>
|
||||
<p class='desc'>
|
||||
<p class='desc clk'>
|
||||
This should be the content.
|
||||
</p>
|
||||
<p>
|
||||
|
|
Loading…
Reference in New Issue