diff --git a/AUTHORS.rst b/AUTHORS.rst
index 974fbeb15..3605332ea 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -43,3 +43,4 @@ generally made searx better:
 - Kang-min Liu
 - Kirill Isakov
 - Guilhem Bonnefille
+- Marc Abonce Seguin
diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py
index 793e97d22..208ccca28 100644
--- a/searx/engines/duckduckgo_definitions.py
+++ b/searx/engines/duckduckgo_definitions.py
@@ -1,5 +1,6 @@
 import json
 from urllib import urlencode
+from re import compile, sub
 from lxml import html
 from searx.utils import html_to_text
 from searx.engines.xpath import extract_text
@@ -7,6 +8,8 @@ from searx.engines.xpath import extract_text
 url = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
 
+http_regex = compile(r'^http:')
+
 
 def result_to_text(url, text, htmlResult):
     # TODO : remove result ending with "Meaning" or "Category"
@@ -19,8 +22,8 @@ def result_to_text(url, text, htmlResult):
 
 
 def request(query, params):
-    # TODO add kl={locale}
     params['url'] = url.format(query=urlencode({'q': query}))
+    params['headers']['Accept-Language'] = params['language']
     return params
 
 
@@ -103,6 +106,10 @@ def response(resp):
            urls.append({'title': search_res.get('DefinitionSource'),
                         'url': definitionURL})
 
+    # to merge with wikidata's infobox
+    if infobox_id:
+        infobox_id = http_regex.sub('https:', infobox_id)
+
     # entity
     entity = search_res.get('Entity', None)
     # TODO continent / country / department / location / waterfall /
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index 9f3496b72..8aa2fcd5c 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -86,15 +86,15 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
         results.append({'title': title, 'url': official_website})
 
     wikipedia_link_count = 0
-    if language != 'en':
-        wikipedia_link_count += add_url(urls,
-                                        'Wikipedia (' + language + ')',
-                                        get_wikilink(result, language +
-                                                     'wiki'))
-    wikipedia_en_link = get_wikilink(result, 'enwiki')
+    wikipedia_link = get_wikilink(result, language + 'wiki')
     wikipedia_link_count += add_url(urls,
-                                    'Wikipedia (en)',
-                                    wikipedia_en_link)
+                                    'Wikipedia (' + language + ')',
+                                    wikipedia_link)
+    if language != 'en':
+        wikipedia_en_link = get_wikilink(result, 'enwiki')
+        wikipedia_link_count += add_url(urls,
+                                        'Wikipedia (en)',
+                                        wikipedia_en_link)
     if wikipedia_link_count == 0:
         misc_language = get_wiki_firstlanguage(result, 'wiki')
         if misc_language is not None:
@@ -188,7 +188,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     else:
         results.append({
             'infobox': title,
-            'id': wikipedia_en_link,
+            'id': wikipedia_link,
             'content': description,
             'attributes': attributes,
             'urls': urls
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
new file mode 100644
index 000000000..fed7b263f
--- /dev/null
+++ b/searx/engines/wikipedia.py
@@ -0,0 +1,114 @@
+"""
+ Wikipedia (Web)
+
+ @website     https://{language}.wikipedia.org
+ @provide-api yes
+
+ @using-api   yes
+ @results     JSON
+ @stable      yes
+ @parse       url, infobox
+"""
+
+from json import loads
+from urllib import urlencode, quote
+
+# search-url
+base_url = 'https://{language}.wikipedia.org/'
+search_postfix = 'w/api.php?'\
+    'action=query'\
+    '&format=json'\
+    '&{query}'\
+    '&prop=extracts|pageimages'\
+    '&exintro'\
+    '&explaintext'\
+    '&pithumbsize=300'\
+    '&redirects'
+
+
+# set language in base_url
+def url_lang(lang):
+    if lang == 'all':
+        language = 'en'
+    else:
+        language = lang.split('_')[0]
+
+    return base_url.format(language=language)
+
+
+# do search-request
+def request(query, params):
+    if query.islower():
+        query += '|' + query.title()
+
+    params['url'] = url_lang(params['language']) \
+        + search_postfix.format(query=urlencode({'titles': query}))
+
+    return params
+
+
+# get first meaningful paragraph
+# this should filter out disambiguation pages and notes above first paragraph
+# "magic numbers" were obtained by fine tuning
+def extract_first_paragraph(content, title, image):
+    first_paragraph = None
+
+    failed_attempts = 0
+    for paragraph in content.split('\n'):
+
+        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
+        length = len(paragraph)
+
+        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
+            first_paragraph = paragraph
+            break
+
+        failed_attempts += 1
+        if failed_attempts > 3:
+            return None
+
+    return first_paragraph
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    search_result = loads(resp.content)
+
+    # wikipedia article's unique id
+    # first valid id is assumed to be the requested article
+    for article_id in search_result['query']['pages']:
+        page = search_result['query']['pages'][article_id]
+        if int(article_id) > 0:
+            break
+
+    if int(article_id) < 0:
+        return []
+
+    title = page.get('title')
+
+    image = page.get('thumbnail')
+    if image:
+        image = image.get('source')
+
+    extract = page.get('extract')
+
+    summary = extract_first_paragraph(extract, title, image)
+    if not summary:
+        return []
+
+    # link to wikipedia article
+    # parenthesis are not quoted to make infobox mergeable with wikidata's
+    wikipedia_link = url_lang(resp.search_params['language']) \
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')')
+
+    results.append({'url': wikipedia_link, 'title': title})
+
+    results.append({'infobox': title,
+                    'id': wikipedia_link,
+                    'content': summary,
+                    'img_src': image,
+                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
+
+    return results
diff --git a/searx/results.py b/searx/results.py
index 5d51eb5b5..c3040b305 100644
--- a/searx/results.py
+++ b/searx/results.py
@@ -37,7 +37,7 @@ def merge_two_infoboxes(infobox1, infobox2):
         urls1 = infobox1.get('urls', None)
         if urls1 is None:
             urls1 = []
-            infobox1.set('urls', urls1)
+            infobox1['urls'] = urls1
 
         urlSet = set()
         for url in infobox1.get('urls', []):
@@ -47,11 +47,17 @@ def merge_two_infoboxes(infobox1, infobox2):
             if url.get('url', None) not in urlSet:
                 urls1.append(url)
 
+    if 'img_src' in infobox2:
+        img1 = infobox1.get('img_src', None)
+        img2 = infobox2.get('img_src')
+        if img1 is None:
+            infobox1['img_src'] = img2
+
     if 'attributes' in infobox2:
         attributes1 = infobox1.get('attributes', None)
         if attributes1 is None:
             attributes1 = []
-            infobox1.set('attributes', attributes1)
+            infobox1['attributes'] = attributes1
 
         attributeSet = set()
         for attribute in infobox1.get('attributes', []):
@@ -68,7 +74,7 @@ def merge_two_infoboxes(infobox1, infobox2):
             if result_content_len(content2) > result_content_len(content1):
                 infobox1['content'] = content2
         else:
-            infobox1.set('content', content2)
+            infobox1['content'] = content2
 
 
 def result_score(result):
diff --git a/searx/settings.yml b/searx/settings.yml
index 96ac4e716..ff85684ac 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -43,10 +43,9 @@ engines:
     shortcut : bs
 
   - name : wikipedia
-    engine : mediawiki
+    engine : wikipedia
     shortcut : wp
     base_url : 'https://{language}.wikipedia.org/'
-    number_of_results : 1
 
   - name : bing
     engine : bing
@@ -93,6 +92,7 @@ engines:
   - name : ddg definitions
     engine : duckduckgo_definitions
     shortcut : ddd
+    disabled : True
 
   - name : digg
     engine : digg
diff --git a/searx/templates/oscar/infobox.html b/searx/templates/oscar/infobox.html
index d87d98453..c72cfb638 100644
--- a/searx/templates/oscar/infobox.html
+++ b/searx/templates/oscar/infobox.html
@@ -1,8 +1,9 @@
 {{ infobox.content }}
 {% endif %}
@@ -28,5 +29,6 @@
 {% endfor %}
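
For reference, below is a minimal standalone sketch (not part of the patch) of the first-paragraph heuristic that the new searx/engines/wikipedia.py introduces: a paragraph is accepted once it is at least 200 characters long, or once the article title appears near its start and the page either has a thumbnail or the paragraph is at least 150 characters long; after a few rejected short lines the extract is discarded. The function body is copied from the patch, while the sample extract and the image flag are invented for illustration only.

# Standalone restatement of extract_first_paragraph() from the patch,
# runnable on its own; the sample extract below is made up.
def extract_first_paragraph(content, title, image):
    first_paragraph = None
    failed_attempts = 0
    for paragraph in content.split('\n'):
        # the title must appear within the first len(title) + 35 characters
        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
        length = len(paragraph)
        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            first_paragraph = paragraph
            break
        failed_attempts += 1
        if failed_attempts > 3:
            # too many short notes before the article body: give up
            return None
    return first_paragraph


extract = ('This article is about the language. For the snake, see Python (genus).\n'
           'Python is a widely used high-level, general-purpose programming language.')
print(extract_first_paragraph(extract, 'Python', image=True))
# prints the second line: the leading hatnote is skipped as too short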