mirror of https://github.com/searxng/searxng.git
fix Wikipedia's paragraph extraction
This commit is contained in:
parent
6d18769ccf
commit
77b9faa8df
|
@ -49,29 +49,6 @@ def request(query, params):
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
# get first meaningful paragraph
|
|
||||||
# this should filter out disambiguation pages and notes above first paragraph
|
|
||||||
# "magic numbers" were obtained by fine tuning
|
|
||||||
def extract_first_paragraph(content, title, image):
    """Return the first "meaningful" paragraph of ``content``, or ``None``.

    ``content`` is split on newlines and only the first four resulting lines
    are considered.  A line is accepted when either

    * it is at least 200 characters long, or
    * ``title`` occurs (case-insensitively) within its first
      ``len(title) + 35`` characters and, in addition, ``image`` is truthy
      or the line is at least 150 characters long.

    The intent is to skip disambiguation pages and notes placed above the
    first real paragraph; the numeric thresholds were obtained by fine
    tuning and are deliberately left unchanged.

    :param content: text to search, paragraphs separated by ``'\\n'``;
        ``None`` is tolerated and yields ``None``.
    :param title: title expected near the start of the paragraph; ``None``
        disables the title-based rule.
    :param image: any value; only its truthiness is used.
    :returns: the first accepted line, or ``None`` when none of the first
        four lines qualifies.
    """
    # Nothing to pick from (e.g. the caller's dict lookup returned None).
    if content is None:
        return None

    # Loop invariants: lower-case the title and compute the search window once.
    if title is None:
        title_lower = None
        search_end = 0
    else:
        title_lower = title.lower()
        search_end = len(title) + 35

    # Give up after four rejected lines (blank lines count as rejections too).
    for paragraph in content.split('\n')[:4]:
        length = len(paragraph)

        # Long enough to be accepted regardless of the title.
        if length >= 200:
            return paragraph

        starts_with_title = (
            title_lower is not None
            and paragraph.lower().find(title_lower, 0, search_end) >= 0
        )
        if starts_with_title and (image or length >= 150):
            return paragraph

    return None
|
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
# get response from search-request
|
||||||
def response(resp):
|
def response(resp):
|
||||||
results = []
|
results = []
|
||||||
|
@ -97,10 +74,7 @@ def response(resp):
|
||||||
if image:
|
if image:
|
||||||
image = image.get('source')
|
image = image.get('source')
|
||||||
|
|
||||||
extract = page.get('extract')
|
summary = page.get('extract', '').split('\n')[0].replace('()', '')
|
||||||
|
|
||||||
summary = extract_first_paragraph(extract, title, image)
|
|
||||||
summary = summary.replace('() ', '')
|
|
||||||
|
|
||||||
# link to wikipedia article
|
# link to wikipedia article
|
||||||
wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
|
wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
|
||||||
|
|
Loading…
Reference in New Issue