diff options
Diffstat (limited to 'searx/engines')
81 files changed, 1164 insertions, 773 deletions
diff --git a/searx/engines/1337x.py b/searx/engines/1337x.py index 0de04bd95..76a7a1634 100644 --- a/searx/engines/1337x.py +++ b/searx/engines/1337x.py @@ -1,7 +1,8 @@ +from urllib.parse import quote, urljoin from lxml import html from searx.engines.xpath import extract_text from searx.utils import get_torrent_size -from searx.url_utils import quote, urljoin + url = 'https://1337x.to/' search_url = url + 'search/{search_term}/{pageno}/' diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 9ccef8b54..9fcf812b0 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -55,6 +55,7 @@ engine_default_args = {'paging': False, 'continuous_errors': 0, 'time_range_support': False, 'offline': False, + 'display_error_messages': True, 'tokens': []} @@ -73,6 +74,9 @@ def load_engine(engine_data): try: engine = load_module(engine_module + '.py', engine_dir) + except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError) as e: + logger.exception('Fatal exception in engine "{}"'.format(engine_module)) + sys.exit(1) except: logger.exception('Cannot load engine "{}"'.format(engine_module)) return None diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py index cca28f0db..d5d3e3178 100644 --- a/searx/engines/acgsou.py +++ b/searx/engines/acgsou.py @@ -9,9 +9,9 @@ @parse url, title, content, seed, leech, torrentfile """ +from urllib.parse import urlencode from lxml import html from searx.engines.xpath import extract_text -from searx.url_utils import urlencode from searx.utils import get_torrent_size, int_or_zero # engine dependent config @@ -63,7 +63,7 @@ def response(resp): except: pass # I didn't add download/seed/leech count since as I figured out they are generated randomly everytime - content = u'Category: "{category}".' + content = 'Category: "{category}".' content = content.format(category=category) results.append({'url': href, diff --git a/searx/engines/apkmirror.py b/searx/engines/apkmirror.py index f2ee12b29..4e6dcd486 100644 --- a/searx/engines/apkmirror.py +++ b/searx/engines/apkmirror.py @@ -9,9 +9,10 @@ @parse url, title, thumbnail_src """ +from urllib.parse import urlencode from lxml import html from searx.engines.xpath import extract_text -from searx.url_utils import urlencode + # engine dependent config categories = ['it'] diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py index dce862f55..e2f44b0f5 100644 --- a/searx/engines/archlinux.py +++ b/searx/engines/archlinux.py @@ -11,9 +11,9 @@ @parse url, title """ +from urllib.parse import urlencode, urljoin from lxml import html from searx.engines.xpath import extract_text -from searx.url_utils import urlencode, urljoin # engine dependent config categories = ['it'] @@ -105,7 +105,7 @@ def request(query, params): # if our language is hosted on the main site, we need to add its name # to the query in order to narrow the results to that language if language in main_langs: - query += b' (' + main_langs[language] + b')' + query += ' (' + main_langs[language] + ')' # prepare the request parameters query = urlencode({'search': query}) diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py index e3c871d17..77ddc572e 100644 --- a/searx/engines/arxiv.py +++ b/searx/engines/arxiv.py @@ -11,9 +11,9 @@ More info on api: https://arxiv.org/help/api/user-manual """ +from urllib.parse import urlencode from lxml import html from datetime import datetime -from searx.url_utils import urlencode categories = ['science'] @@ -30,7 +30,7 @@ def request(query, params): # basic search offset = (params['pageno'] - 1) * number_of_results - string_args = dict(query=query.decode('utf-8'), + string_args = dict(query=query, offset=offset, number_of_results=number_of_results) diff --git a/searx/engines/base.py b/searx/engines/base.py index f1b1cf671..0114f9798 100755 --- a/searx/engines/base.py +++ b/searx/engines/base.py @@ -13,10 +13,10 @@ More info on api: http://base-search.net/about/download/base_interface.pdf """ +from urllib.parse import urlencode from lxml import etree from datetime import datetime import re -from searx.url_utils import urlencode from searx.utils import searx_useragent @@ -55,7 +55,7 @@ shorcut_dict = { def request(query, params): # replace shortcuts with API advanced search keywords for key in shorcut_dict.keys(): - query = re.sub(key, shorcut_dict[key], str(query)) + query = re.sub(key, shorcut_dict[key], query) # basic search offset = (params['pageno'] - 1) * number_of_results diff --git a/searx/engines/bing.py b/searx/engines/bing.py index afb776acd..c7b619369 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -14,10 +14,10 @@ """ import re +from urllib.parse import urlencode from lxml import html from searx import logger, utils from searx.engines.xpath import extract_text -from searx.url_utils import urlencode from searx.utils import match_language, gen_useragent, eval_xpath logger = logger.getChild('bing engine') @@ -47,7 +47,7 @@ def request(query, params): else: lang = match_language(params['language'], supported_languages, language_aliases) - query = u'language:{} {}'.format(lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8') + query = 'language:{} {}'.format(lang.split('-')[0].upper(), query) search_path = search_string.format( query=urlencode({'q': query}), diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 138ed11c6..10da42b5c 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -12,10 +12,10 @@ """ +from urllib.parse import urlencode from lxml import html from json import loads import re -from searx.url_utils import urlencode from searx.utils import match_language from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases @@ -80,19 +80,18 @@ def response(resp): # parse results for result in dom.xpath('//div[@class="imgpt"]'): - - img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0] - # Microsoft seems to experiment with this code so don't make the path too specific, - # just catch the text section for the first anchor in img_info assuming this to be - # the originating site. - source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0] - try: + img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0] + # Microsoft seems to experiment with this code so don't make the path too specific, + # just catch the text section for the first anchor in img_info assuming this to be + # the originating site. + source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0] + m = loads(result.xpath('./a/@m')[0]) # strip 'Unicode private use area' highlighting, they render to Tux # the Linux penguin and a standing diamond on my machine... - title = m.get('t', '').replace(u'\ue000', '').replace(u'\ue001', '') + title = m.get('t', '').replace('\ue000', '').replace('\ue001', '') results.append({'template': 'images.html', 'url': m['purl'], 'thumbnail_src': m['turl'], diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index d13be777c..fbe51faed 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -13,10 +13,9 @@ from datetime import datetime from dateutil import parser +from urllib.parse import urlencode, urlparse, parse_qsl from lxml import etree from searx.utils import list_get, match_language -from searx.url_utils import urlencode, urlparse, parse_qsl - from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases # engine dependent config diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py index f048f0d8e..63264de6f 100644 --- a/searx/engines/bing_videos.py +++ b/searx/engines/bing_videos.py @@ -12,7 +12,7 @@ from json import loads from lxml import html -from searx.url_utils import urlencode +from urllib.parse import urlencode from searx.utils import match_language from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py index 82eedc24b..2faade3e2 100644 --- a/searx/engines/btdigg.py +++ b/searx/engines/btdigg.py @@ -12,8 +12,8 @@ from lxml import html from operator import itemgetter +from urllib.parse import quote, urljoin from searx.engines.xpath import extract_text -from searx.url_utils import quote, urljoin from searx.utils import get_torrent_size # engine dependent config diff --git a/searx/engines/command.py b/searx/engines/command.py new file mode 100644 index 000000000..b9e672ffa --- /dev/null +++ b/searx/engines/command.py @@ -0,0 +1,184 @@ +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. +''' + + +from os.path import expanduser, isabs, realpath, commonprefix +from re import MULTILINE, search as re_search +from shlex import split as shlex_split +from subprocess import Popen, PIPE +from time import time +from threading import Thread + +from searx import logger + + +offline = True +paging = True +command = [] +delimiter = {} +parse_regex = {} +query_type = '' +query_enum = [] +environment_variables = {} +working_dir = realpath('.') +result_separator = '\n' +result_template = 'key-value.html' +timeout = 4.0 + +_command_logger = logger.getChild('command') +_compiled_parse_regex = {} + + +def init(engine_settings): + check_parsing_options(engine_settings) + + if 'command' not in engine_settings: + raise ValueError('engine command : missing configuration key: command') + + global command, working_dir, result_template, delimiter, parse_regex, timeout, environment_variables + + command = engine_settings['command'] + + if 'working_dir' in engine_settings: + working_dir = engine_settings['working_dir'] + if not isabs(engine_settings['working_dir']): + working_dir = realpath(working_dir) + + if 'parse_regex' in engine_settings: + parse_regex = engine_settings['parse_regex'] + for result_key, regex in parse_regex.items(): + _compiled_parse_regex[result_key] = re.compile(regex, flags=MULTILINE) + if 'delimiter' in engine_settings: + delimiter = engine_settings['delimiter'] + + if 'environment_variables' in engine_settings: + environment_variables = engine_settings['environment_variables'] + + +def search(query, params): + cmd = _get_command_to_run(query) + if not cmd: + return [] + + results = [] + reader_thread = Thread(target=_get_results_from_process, args=(results, cmd, params['pageno'])) + reader_thread.start() + reader_thread.join(timeout=timeout) + + return results + + +def _get_command_to_run(query): + params = shlex_split(query.decode('utf-8')) + __check_query_params(params) + + cmd = [] + for c in command: + if c == '{{QUERY}}': + cmd.extend(params) + else: + cmd.append(c) + + return cmd + + +def _get_results_from_process(results, cmd, pageno): + leftover = '' + count = 0 + start, end = __get_results_limits(pageno) + with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process: + line = process.stdout.readline() + while line: + buf = leftover + line.decode('utf-8') + raw_results = buf.split(result_separator) + if raw_results[-1]: + leftover = raw_results[-1] + raw_results = raw_results[:-1] + + for raw_result in raw_results: + result = __parse_single_result(raw_result) + if result is None: + _command_logger.debug('skipped result:', raw_result) + continue + + if start <= count and count <= end: + result['template'] = result_template + results.append(result) + + count += 1 + if end < count: + return results + + line = process.stdout.readline() + + return_code = process.wait(timeout=timeout) + if return_code != 0: + raise RuntimeError('non-zero return code when running command', cmd, return_code) + + +def __get_results_limits(pageno): + start = (pageno - 1) * 10 + end = start + 9 + return start, end + + +def __check_query_params(params): + if not query_type: + return + + if query_type == 'path': + query_path = params[-1] + query_path = expanduser(query_path) + if commonprefix([realpath(query_path), working_dir]) != working_dir: + raise ValueError('requested path is outside of configured working directory') + elif query_type == 'enum' and len(query_enum) > 0: + for param in params: + if param not in query_enum: + raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum) + + +def check_parsing_options(engine_settings): + """ Checks if delimiter based parsing or regex parsing is configured correctly """ + + if 'delimiter' not in engine_settings and 'parse_regex' not in engine_settings: + raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex') + if 'delimiter' in engine_settings and 'parse_regex' in engine_settings: + raise ValueError('failed to init settings for parsing lines: too many settings') + + if 'delimiter' in engine_settings: + if 'chars' not in engine_settings['delimiter'] or 'keys' not in engine_settings['delimiter']: + raise ValueError + + +def __parse_single_result(raw_result): + """ Parses command line output based on configuration """ + + result = {} + + if delimiter: + elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1) + if len(elements) != len(delimiter['keys']): + return {} + for i in range(len(elements)): + result[delimiter['keys'][i]] = elements[i] + + if parse_regex: + for result_key, regex in _compiled_parse_regex.items(): + found = regex.search(raw_result) + if not found: + return {} + result[result_key] = raw_result[found.start():found.end()] + + return result diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py index 8eab8f673..c6067c4a8 100644 --- a/searx/engines/currency_convert.py +++ b/searx/engines/currency_convert.py @@ -1,26 +1,23 @@ import json import re import os -import sys import unicodedata from io import open from datetime import datetime -if sys.version_info[0] == 3: - unicode = str categories = [] url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}' weight = 100 -parser_re = re.compile(b'.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) +parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) db = 1 def normalize_name(name): - name = name.decode('utf-8').lower().replace('-', ' ').rstrip('s') + name = name.lower().replace('-', ' ').rstrip('s') name = re.sub(' +', ' ', name) return unicodedata.normalize('NFKD', name).lower() diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index 1038e64bf..1e24e41da 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -14,7 +14,7 @@ from json import loads from datetime import datetime -from searx.url_utils import urlencode +from urllib.parse import urlencode from searx.utils import match_language, html_to_text # engine dependent config diff --git a/searx/engines/deezer.py b/searx/engines/deezer.py index af63478fb..48c0429a7 100644 --- a/searx/engines/deezer.py +++ b/searx/engines/deezer.py @@ -11,7 +11,7 @@ """ from json import loads -from searx.url_utils import urlencode +from urllib.parse import urlencode # engine dependent config categories = ['music'] @@ -50,7 +50,7 @@ def response(resp): if url.startswith('http://'): url = 'https' + url[4:] - content = u'{} - {} - {}'.format( + content = '{} - {} - {}'.format( result['artist']['name'], result['album']['title'], result['title']) diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py index a0e27e622..2bd21fa5d 100644 --- a/searx/engines/deviantart.py +++ b/searx/engines/deviantart.py @@ -14,8 +14,9 @@ from lxml import html import re +from urllib.parse import urlencode from searx.engines.xpath import extract_text -from searx.url_utils import urlencode + # engine dependent config categories = ['images'] diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index 423af0971..5a1fea3cf 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -10,15 +10,15 @@ """ import re +from urllib.parse import urljoin from lxml import html from searx.utils import is_valid_lang, eval_xpath -from searx.url_utils import urljoin categories = ['general'] -url = u'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' +url = 'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' weight = 100 -parser_re = re.compile(b'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) +parser_re = re.compile('.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) results_xpath = './/table[@id="r"]/tr' @@ -37,7 +37,7 @@ def request(query, params): params['url'] = url.format(from_lang=from_lang[2], to_lang=to_lang[2], - query=query.decode('utf-8')) + query=query) return params diff --git a/searx/engines/digbt.py b/searx/engines/digbt.py index ff2f94593..e2c0389c6 100644 --- a/searx/engines/digbt.py +++ b/searx/engines/digbt.py @@ -10,14 +10,11 @@ @parse url, title, content, magnetlink """ -from sys import version_info +from urllib.parse import urljoin from lxml import html from searx.engines.xpath import extract_text from searx.utils import get_torrent_size -from searx.url_utils import urljoin -if version_info[0] == 3: - unicode = str categories = ['videos', 'music', 'files'] paging = True diff --git a/searx/engines/digg.py b/searx/engines/digg.py index 073410eb0..24a932d53 100644 --- a/searx/engines/digg.py +++ b/searx/engines/digg.py @@ -14,8 +14,8 @@ import random import string from dateutil import parser from json import loads +from urllib.parse import urlencode from lxml import html -from searx.url_utils import urlencode from datetime import datetime # engine dependent config diff --git a/searx/engines/doku.py b/searx/engines/doku.py index d20e66026..513ffda89 100644 --- a/searx/engines/doku.py +++ b/searx/engines/doku.py @@ -9,10 +9,10 @@ # @stable yes # @parse (general) url, title, content +from urllib.parse import urlencode from lxml.html import fromstring from searx.engines.xpath import extract_text from searx.utils import eval_xpath -from searx.url_utils import urlencode # engine dependent config categories = ['general'] # TODO , 'images', 'music', 'videos', 'files' diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 0d2c0af2d..fb1ea2b2d 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -15,9 +15,9 @@ from lxml.html import fromstring from json import loads +from urllib.parse import urlencode from searx.engines.xpath import extract_text from searx.poolrequests import get -from searx.url_utils import urlencode from searx.utils import match_language, eval_xpath # engine dependent config @@ -50,6 +50,7 @@ result_xpath = '//div[@class="result results_links results_links_deep web-result url_xpath = './/a[@class="result__a"]/@href' title_xpath = './/a[@class="result__a"]' content_xpath = './/a[@class="result__snippet"]' +correction_xpath = '//div[@id="did_you_mean"]//a' # match query's language to a region code that duckduckgo will accept @@ -125,6 +126,11 @@ def response(resp): 'content': content, 'url': res_url}) + # parse correction + for correction in eval_xpath(doc, correction_xpath): + # append correction + results.append({'correction': extract_text(correction)}) + # return results return results diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 79d10c303..73154a525 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -10,11 +10,11 @@ DuckDuckGo (definitions) """ import json +from urllib.parse import urlencode from lxml import html from re import compile from searx.engines.xpath import extract_text from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases -from searx.url_utils import urlencode from searx.utils import html_to_text, match_language url = 'https://api.duckduckgo.com/'\ diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py index 89924b71c..38e141f8b 100644 --- a/searx/engines/duckduckgo_images.py +++ b/searx/engines/duckduckgo_images.py @@ -14,13 +14,13 @@ """ from json import loads +from urllib.parse import urlencode from searx.engines.xpath import extract_text from searx.engines.duckduckgo import ( _fetch_supported_languages, supported_languages_url, get_region_code, language_aliases ) from searx.poolrequests import get -from searx.url_utils import urlencode # engine dependent config categories = ['images'] diff --git a/searx/engines/duden.py b/searx/engines/duden.py index cf2f1a278..a711f422e 100644 --- a/searx/engines/duden.py +++ b/searx/engines/duden.py @@ -10,9 +10,9 @@ from lxml import html, etree import re +from urllib.parse import quote, urljoin from searx.engines.xpath import extract_text from searx.utils import eval_xpath -from searx.url_utils import quote, urljoin from searx import logger categories = ['general'] diff --git a/searx/engines/etools.py b/searx/engines/etools.py index a9eb0980d..efc102ef6 100644 --- a/searx/engines/etools.py +++ b/searx/engines/etools.py @@ -10,8 +10,8 @@ """ from lxml import html +from urllib.parse import quote from searx.engines.xpath import extract_text -from searx.url_utils import quote from searx.utils import eval_xpath categories = ['general'] diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py index 4066dc716..a2a5114df 100644 --- a/searx/engines/fdroid.py +++ b/searx/engines/fdroid.py @@ -9,9 +9,9 @@ @parse url, title, content """ +from urllib.parse import urlencode from lxml import html from searx.engines.xpath import extract_text -from searx.url_utils import urlencode # engine dependent config categories = ['files'] diff --git a/searx/engines/filecrop.py b/searx/engines/filecrop.py index ed57a6bf3..eef5be6e8 100644 --- a/searx/engines/filecrop.py +++ b/searx/engines/filecrop.py @@ -1,9 +1,6 @@ -from searx.url_utils import urlencode +from html.parser import HTMLParser +from urllib.parse import urlencode -try: - from HTMLParser import HTMLParser -except: - from html.parser import HTMLParser url = 'http://www.filecrop.com/' search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}' # noqa diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py index de1769370..b23c447b8 100644 --- a/searx/engines/flickr.py +++ b/searx/engines/flickr.py @@ -14,7 +14,7 @@ """ from json import loads -from searx.url_utils import urlencode +from urllib.parse import urlencode categories = ['images'] diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index c8ee34f7a..4bcf837cb 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -15,8 +15,8 @@ from json import loads from time import time import re +from urllib.parse import urlencode from searx.engines import logger -from searx.url_utils import urlencode from searx.utils import ecma_unescape, html_to_text logger = logger.getChild('flickr-noapi') @@ -117,14 +117,10 @@ def response(resp): 'img_format': img_format, 'template': 'images.html' } - try: - result['author'] = author - result['title'] = title - result['content'] = content - except: - result['author'] = '' - result['title'] = '' - result['content'] = '' + result['author'] = author.encode(errors='ignore').decode() + result['source'] = source.encode(errors='ignore').decode() + result['title'] = title.encode(errors='ignore').decode() + result['content'] = content.encode(errors='ignore').decode() results.append(result) return results diff --git a/searx/engines/framalibre.py b/searx/engines/framalibre.py index f3441fa5f..14b659b5f 100644 --- a/searx/engines/framalibre.py +++ b/searx/engines/framalibre.py @@ -10,13 +10,10 @@ @parse url, title, content, thumbnail, img_src """ -try: - from cgi import escape -except: - from html import escape +from html import escape +from urllib.parse import urljoin, urlencode from lxml import html from searx.engines.xpath import extract_text -from searx.url_utils import urljoin, urlencode # engine dependent config categories = ['it'] diff --git a/searx/engines/frinkiac.py b/searx/engines/frinkiac.py index a67b42dbe..5b174a687 100644 --- a/searx/engines/frinkiac.py +++ b/searx/engines/frinkiac.py @@ -10,7 +10,7 @@ Frinkiac (Images) """ from json import loads -from searx.url_utils import urlencode +from urllib.parse import urlencode categories = ['images'] diff --git a/searx/engines/genius.py b/searx/engines/genius.py index aa5afad9b..feb7d79d1 100644 --- a/searx/engines/genius.py +++ b/searx/engines/genius.py @@ -11,7 +11,7 @@ Genius """ from json import loads -from searx.url_utils import urlencode +from urllib.parse import urlencode from datetime import datetime # engine dependent config diff --git a/searx/engines/gentoo.py b/searx/engines/gentoo.py index a7a966cc9..b6bc99fab 100644 --- a/searx/engines/gentoo.py +++ b/searx/engines/gentoo.py @@ -11,9 +11,9 @@ @parse url, title """ +from urllib.parse import urlencode, urljoin from lxml import html from searx.engines.xpath import extract_text -from searx.url_utils import urlencode, urljoin # engine dependent config categories = ['it'] @@ -90,7 +90,7 @@ def request(query, params): # if our language is hosted on the main site, we need to add its name # to the query in order to narrow the results to that language if language in main_langs: - query += b' (' + (main_langs[language]).encode('utf-8') + b')' + query += ' (' + main_langs[language] + ')' # prepare the request parameters query = urlencode({'search': query}) diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 2bb29a9fe..1d71b18e9 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Gigablast (Web) @@ -9,121 +10,117 @@ @stable yes @parse url, title, content """ +# pylint: disable=missing-function-docstring, invalid-name -import random +import re from json import loads -from time import time -from lxml.html import fromstring +from urllib.parse import urlencode +# from searx import logger from searx.poolrequests import get -from searx.url_utils import urlencode -from searx.utils import eval_xpath # engine dependent config categories = ['general'] -paging = True -number_of_results = 10 +# gigablast's pagination is totally damaged, don't use it +paging = False language_support = True safesearch = True # search-url -base_url = 'https://gigablast.com/' -search_string = 'search?{query}'\ - '&n={number_of_results}'\ - '&c=main'\ - '&s={offset}'\ - '&format=json'\ - '&langcountry={lang}'\ - '&ff={safesearch}'\ - '&rand={rxikd}' -# specific xpath variables -results_xpath = '//response//result' -url_xpath = './/url' -title_xpath = './/title' -content_xpath = './/sum' - -supported_languages_url = 'https://gigablast.com/search?&rxikd=1' - -extra_param = '' # gigablast requires a random extra parameter -# which can be extracted from the source code of the search page +base_url = 'https://gigablast.com' +# ugly hack: gigablast requires a random extra parameter which can be extracted +# from the source code of the gigablast HTTP client +extra_param = '' +extra_param_path='/search?c=main&qlangcountry=en-us&q=south&s=10' def parse_extra_param(text): - global extra_param - param_lines = [x for x in text.splitlines() if x.startswith('var url=') or x.startswith('url=url+')] - extra_param = '' - for l in param_lines: - extra_param += l.split("'")[1] - extra_param = extra_param.split('&')[-1] - -def init(engine_settings=None): - parse_extra_param(get('http://gigablast.com/search?c=main&qlangcountry=en-us&q=south&s=10').text) + # example: + # + # var uxrl='/search?c=main&qlangcountry=en-us&q=south&s=10&rand=1590740241635&n'; + # uxrl=uxrl+'sab=730863287'; + # + # extra_param --> "rand=1590740241635&nsab=730863287" + + global extra_param # pylint: disable=global-statement + re_var= None + for line in text.splitlines(): + if re_var is None and extra_param_path in line: + var = line.split("=")[0].split()[1] # e.g. var --> 'uxrl' + re_var = re.compile(var + "\\s*=\\s*" + var + "\\s*\\+\\s*'" + "(.*)" + "'(.*)") + extra_param = line.split("'")[1][len(extra_param_path):] + continue + if re_var is not None and re_var.search(line): + extra_param += re_var.search(line).group(1) + break + # logger.debug('gigablast extra_param="%s"', extra_param) + +def init(engine_settings=None): # pylint: disable=unused-argument + parse_extra_param(get(base_url + extra_param_path).text) # do search-request -def request(query, params): - print("EXTRAPARAM:", extra_param) - offset = (params['pageno'] - 1) * number_of_results +def request(query, params): # pylint: disable=unused-argument - if params['language'] == 'all': - language = 'xx' - else: - language = params['language'].replace('-', '_').lower() - if language.split('-')[0] != 'zh': - language = language.split('-')[0] + # see API http://www.gigablast.com/api.html#/search + # Take into account, that the API has some quirks .. - if params['safesearch'] >= 1: - safesearch = 1 - else: - safesearch = 0 + query_args = dict( + c = 'main' + , format = 'json' + , q = query + , dr = 1 + , showgoodimages = 0 + ) - # rxieu is some kind of hash from the search query, but accepts random atm - search_path = search_string.format(query=urlencode({'q': query}), - offset=offset, - number_of_results=number_of_results, - lang=language, - rxikd=int(time() * 1000), - safesearch=safesearch) + if params['language'] and params['language'] != 'all': + query_args['qlangcountry'] = params['language'] + query_args['qlang'] = params['language'].split('-')[0] - params['url'] = base_url + search_path + '&' + extra_param + if params['safesearch'] >= 1: + query_args['ff'] = 1 - return params + search_url = '/search?' + urlencode(query_args) + params['url'] = base_url + search_url + extra_param + return params # get response from search-request def response(resp): results = [] - # parse results - try: - response_json = loads(resp.text) - except: - parse_extra_param(resp.text) - raise Exception('extra param expired, please reload') + response_json = loads(resp.text) + + # logger.debug('gigablast returns %s results', len(response_json['results'])) for result in response_json['results']: - # append result - results.append({'url': result['url'], - 'title': result['title'], - 'content': result['sum']}) + # see "Example JSON Output (&format=json)" + # at http://www.gigablast.com/api.html#/search - # return results - return results + # sort out meaningless result + title = result.get('title') + if len(title) < 2: + continue -# get supported languages from their site -def _fetch_supported_languages(resp): - supported_languages = [] - dom = fromstring(resp.text) - links = eval_xpath(dom, '//span[@id="menu2"]/a') - for link in links: - href = eval_xpath(link, './@href')[0].split('lang%3A') - if len(href) == 2: - code = href[1].split('_') - if len(code) == 2: - code = code[0] + '-' + code[1].upper() - else: - code = code[0] - supported_languages.append(code) - - return supported_languages + url = result.get('url') + if len(url) < 9: + continue + + content = result.get('sum') + if len(content) < 5: + continue + + # extend fields + + subtitle = result.get('title') + if len(subtitle) > 3 and subtitle != title: + title += " - " + subtitle + + results.append(dict( + url = url + , title = title + , content = content + )) + + return results diff --git a/searx/engines/github.py b/searx/engines/github.py index eaa00da4f..80b50ceda 100644 --- a/searx/engines/github.py +++ b/searx/engines/github.py @@ -11,7 +11,7 @@ """ from json import loads -from searx.url_utils import urlencode +from urllib.parse import urlencode # engine dependent config categories = ['it'] diff --git a/searx/engines/google.py b/searx/engines/google.py index eed3a044e..dfc8a0ab8 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -1,210 +1,211 @@ -# Google (Web) -# -# @website https://www.google.com -# @provide-api yes (https://developers.google.com/custom-search/) -# -# @using-api no -# @results HTML -# @stable no (HTML can change) -# @parse url, title, content, suggestion - -import re +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Google (Web) + +:website: https://www.google.com +:provide-api: yes (https://developers.google.com/custom-search/) +:using-api: not the offical, since it needs registration to another service +:results: HTML +:stable: no +:parse: url, title, content, number_of_results, answer, suggestion, correction + +For detailed description of the *REST-full* API see: `Query Parameter +Definitions`_. + +.. _Query Parameter Definitions: + https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions + +""" + +# pylint: disable=invalid-name, missing-function-docstring + +from urllib.parse import urlencode, urlparse +from lxml import html from flask_babel import gettext -from lxml import html, etree -from searx.engines.xpath import extract_text, extract_url +from searx.engines.xpath import extract_text from searx import logger -from searx.url_utils import urlencode, urlparse, parse_qsl from searx.utils import match_language, eval_xpath logger = logger.getChild('google engine') - # engine dependent config categories = ['general'] paging = True language_support = True -use_locale_domain = True time_range_support = True +safesearch = True +supported_languages_url = 'https://www.google.com/preferences?#languages' # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests -default_hostname = 'www.google.com' - -country_to_hostname = { - 'BG': 'www.google.bg', # Bulgaria - 'CZ': 'www.google.cz', # Czech Republic - 'DE': 'www.google.de', # Germany - 'DK': 'www.google.dk', # Denmark - 'AT': 'www.google.at', # Austria - 'CH': 'www.google.ch', # Switzerland - 'GR': 'www.google.gr', # Greece - 'AU': 'www.google.com.au', # Australia - 'CA': 'www.google.ca', # Canada - 'GB': 'www.google.co.uk', # United Kingdom - 'ID': 'www.google.co.id', # Indonesia - 'IE': 'www.google.ie', # Ireland - 'IN': 'www.google.co.in', # India - 'MY': 'www.google.com.my', # Malaysia - 'NZ': 'www.google.co.nz', # New Zealand - 'PH': 'www.google.com.ph', # Philippines - 'SG': 'www.google.com.sg', # Singapore - # 'US': 'www.google.us', # United States, redirect to .com - 'ZA': 'www.google.co.za', # South Africa - 'AR': 'www.google.com.ar', # Argentina - 'CL': 'www.google.cl', # Chile - 'ES': 'www.google.es', # Spain - 'MX': 'www.google.com.mx', # Mexico - 'EE': 'www.google.ee', # Estonia - 'FI': 'www.google.fi', # Finland - 'BE': 'www.google.be', # Belgium - 'FR': 'www.google.fr', # France - 'IL': 'www.google.co.il', # Israel - 'HR': 'www.google.hr', # Croatia - 'HU': 'www.google.hu', # Hungary - 'IT': 'www.google.it', # Italy - 'JP': 'www.google.co.jp', # Japan - 'KR': 'www.google.co.kr', # South Korea - 'LT': 'www.google.lt', # Lithuania - 'LV': 'www.google.lv', # Latvia - 'NO': 'www.google.no', # Norway - 'NL': 'www.google.nl', # Netherlands - 'PL': 'www.google.pl', # Poland - 'BR': 'www.google.com.br', # Brazil - 'PT': 'www.google.pt', # Portugal - 'RO': 'www.google.ro', # Romania - 'RU': 'www.google.ru', # Russia - 'SK': 'www.google.sk', # Slovakia - 'SI': 'www.google.si', # Slovenia - 'SE': 'www.google.se', # Sweden - 'TH': 'www.google.co.th', # Thailand - 'TR': 'www.google.com.tr', # Turkey - 'UA': 'www.google.com.ua', # Ukraine - # 'CN': 'www.google.cn', # China, only from China ? - 'HK': 'www.google.com.hk', # Hong Kong - 'TW': 'www.google.com.tw' # Taiwan +google_domains = { + 'BG': 'google.bg', # Bulgaria + 'CZ': 'google.cz', # Czech Republic + 'DE': 'google.de', # Germany + 'DK': 'google.dk', # Denmark + 'AT': 'google.at', # Austria + 'CH': 'google.ch', # Switzerland + 'GR': 'google.gr', # Greece + 'AU': 'google.com.au', # Australia + 'CA': 'google.ca', # Canada + 'GB': 'google.co.uk', # United Kingdom + 'ID': 'google.co.id', # Indonesia + 'IE': 'google.ie', # Ireland + 'IN': 'google.co.in', # India + 'MY': 'google.com.my', # Malaysia + 'NZ': 'google.co.nz', # New Zealand + 'PH': 'google.com.ph', # Philippines + 'SG': 'google.com.sg', # Singapore + # 'US': 'google.us', # United States, redirect to .com + 'ZA': 'google.co.za', # South Africa + 'AR': 'google.com.ar', # Argentina + 'CL': 'google.cl', # Chile + 'ES': 'google.es', # Spain + 'MX': 'google.com.mx', # Mexico + 'EE': 'google.ee', # Estonia + 'FI': 'google.fi', # Finland + 'BE': 'google.be', # Belgium + 'FR': 'google.fr', # France + 'IL': 'google.co.il', # Israel + 'HR': 'google.hr', # Croatia + 'HU': 'google.hu', # Hungary + 'IT': 'google.it', # Italy + 'JP': 'google.co.jp', # Japan + 'KR': 'google.co.kr', # South Korea + 'LT': 'google.lt', # Lithuania + 'LV': 'google.lv', # Latvia + 'NO': 'google.no', # Norway + 'NL': 'google.nl', # Netherlands + 'PL': 'google.pl', # Poland + 'BR': 'google.com.br', # Brazil + 'PT': 'google.pt', # Portugal + 'RO': 'google.ro', # Romania + 'RU': 'google.ru', # Russia + 'SK': 'google.sk', # Slovakia + 'SI': 'google.si', # Slovenia + 'SE': 'google.se', # Sweden + 'TH': 'google.co.th', # Thailand + 'TR': 'google.com.tr', # Turkey + 'UA': 'google.com.ua', # Ukraine + # 'CN': 'google.cn', # China, only from China ? + 'HK': 'google.com.hk', # Hong Kong + 'TW': 'google.com.tw' # Taiwan } -# osm -url_map = 'https://www.openstreetmap.org/'\ - + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' - -# search-url -search_path = '/search' -search_url = ('https://{hostname}' + - search_path + - '?{query}&start={offset}&gws_rd=cr&gbv=1&lr={lang}&hl={lang_short}&ei=x') - -time_range_search = "&tbs=qdr:{range}" -time_range_dict = {'day': 'd', - 'week': 'w', - 'month': 'm', - 'year': 'y'} - -# other URLs -map_hostname_start = 'maps.google.' -maps_path = '/maps' -redirect_path = '/url' -images_path = '/images' -supported_languages_url = 'https://www.google.com/preferences?#languages' +time_range_dict = { + 'day': 'd', + 'week': 'w', + 'month': 'm', + 'year': 'y' +} + +# Filter results. 0: None, 1: Moderate, 2: Strict +filter_mapping = { + 0: 'off', + 1: 'medium', + 2: 'high' +} # specific xpath variables -results_xpath = '//div[contains(@class, "ZINbbc")]' -url_xpath = './/div[@class="kCrYT"][1]/a/@href' -title_xpath = './/div[@class="kCrYT"][1]/a/div[1]' -content_xpath = './/div[@class="kCrYT"][2]//div[contains(@class, "BNeawe")]//div[contains(@class, "BNeawe")]' -suggestion_xpath = '//div[contains(@class, "ZINbbc")][last()]//div[@class="rVLSBd"]/a//div[contains(@class, "BNeawe")]' -spelling_suggestion_xpath = '//div[@id="scc"]//a' - -# map : detail location -map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()' -map_phone_xpath = './/div[@class="s"]//table//td[2]/span/span' -map_website_url_xpath = 'h3[2]/a/@href' -map_website_title_xpath = 'h3[2]' - -# map : near the location -map_near = 'table[@class="ts"]//tr' -map_near_title = './/h4' -map_near_url = './/h4/a/@href' -map_near_phone = './/span[@class="nobr"]' - -# images -images_xpath = './/div/a' -image_url_xpath = './@href' -image_img_src_xpath = './img/@src' - -# property names -# FIXME : no translation -property_address = "Address" -property_phone = "Phone number" - - -# remove google-specific tracking-url -def parse_url(url_string, google_hostname): - # sanity check - if url_string is None: - return url_string - - # normal case - parsed_url = urlparse(url_string) - if (parsed_url.netloc in [google_hostname, ''] - and parsed_url.path == redirect_path): - query = dict(parse_qsl(parsed_url.query)) - return query['q'] - else: - return url_string +# ------------------------ + +# google results are grouped into <div class="g" ../> +results_xpath = '//div[@class="g"]' + +# google *sections* are no usual *results*, we ignore them +g_section_with_header = './g-section-with-header' + +# the title is a h3 tag relative to the result group +title_xpath = './/h3[1]' + +# in the result group there is <div class="r" ../> it's first child is a <a +# href=...> (on some results, the <a> is the first "descendant", not ""child") +href_xpath = './/div[@class="r"]//a/@href' + +# in the result group there is <div class="s" ../> containing he *content* +content_xpath = './/div[@class="s"]' + +# Suggestions are links placed in a *card-section*, we extract only the text +# from the links not the links itself. +suggestion_xpath = '//div[contains(@class, "card-section")]//a' + +# Since google does *auto-correction* on the first query these are not really +# *spelling suggestions*, we use them anyway. +spelling_suggestion_xpath = '//div[@class="med"]/p/a' -# returns extract_text on the first result selected by the xpath or None def extract_text_from_dom(result, xpath): + """returns extract_text on the first result selected by the xpath or None""" r = eval_xpath(result, xpath) if len(r) > 0: return extract_text(r[0]) return None -# do search-request -def request(query, params): - offset = (params['pageno'] - 1) * 10 - - if params['language'] == 'all' or params['language'] == 'en-US': - language = 'en-GB' - else: - language = match_language(params['language'], supported_languages, language_aliases) +def get_lang_country(params, lang_list, custom_aliases): + """Returns a tuple with *langauage* on its first and *country* on its second + position.""" + language = params['language'] + if language == 'all': + language = 'en-US' language_array = language.split('-') - if params['language'].find('-') > 0: - country = params['language'].split('-')[1] - elif len(language_array) == 2: + + if len(language_array) == 2: country = language_array[1] else: - country = 'US' + country = language_array[0].upper() - url_lang = 'lang_' + language + language = match_language(language, lang_list, custom_aliases) + lang_country = '%s-%s' % (language, country) + if lang_country == 'en-EN': + lang_country = 'en' - if use_locale_domain: - google_hostname = country_to_hostname.get(country.upper(), default_hostname) - else: - google_hostname = default_hostname - - # original format: ID=3e2b6616cee08557:TM=5556667580:C=r:IP=4.1.12.5-:S=23ASdf0soFgF2d34dfgf-_22JJOmHdfgg - params['cookies']['GOOGLE_ABUSE_EXEMPTION'] = 'x' - params['url'] = search_url.format(offset=offset, - query=urlencode({'q': query}), - hostname=google_hostname, - lang=url_lang, - lang_short=language) - if params['time_range'] in time_range_dict: - params['url'] += time_range_search.format(range=time_range_dict[params['time_range']]) + return language, country, lang_country - params['headers']['Accept-Language'] = language + ',' + language + '-' + country - params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' - params['google_hostname'] = google_hostname +def request(query, params): + """Google search request""" + + offset = (params['pageno'] - 1) * 10 + language, country, lang_country = get_lang_country( + # pylint: disable=undefined-variable + params, supported_languages, language_aliases + ) + subdomain = 'www.' + google_domains.get(country.upper(), 'google.com') + + # https://www.google.de/search?q=corona&hl=de-DE&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium + query_url = 'https://' + subdomain + '/search' + "?" + urlencode({ + 'q': query, + 'hl': lang_country, + 'lr': "lang_" + language, + 'ie': "utf8", + 'oe': "utf8", + 'start': offset, + }) + + if params['time_range'] in time_range_dict: + query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) + if params['safesearch']: + query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) + + params['url'] = query_url + logger.debug("query_url --> %s", query_url) + + # en-US,en;q=0.8,en;q=0.5 + params['headers']['Accept-Language'] = ( + lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5' + ) + logger.debug("HTTP header Accept-Language --> %s", + params['headers']['Accept-Language']) + params['headers']['Accept'] = ( + 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + ) + # params['google_subdomain'] = subdomain return params -# get response from search-request def response(resp): + """Get response from google's search request""" results = [] # detect google sorry @@ -215,68 +216,53 @@ def response(resp): if resp_url.path.startswith('/sorry'): raise RuntimeWarning(gettext('CAPTCHA required')) - # which hostname ? - google_hostname = resp.search_params.get('google_hostname') - google_url = "https://" + google_hostname + # which subdomain ? + # subdomain = resp.search_params.get('google_subdomain') # convert the text to dom dom = html.fromstring(resp.text) - instant_answer = eval_xpath(dom, '//div[@id="_vBb"]//text()') - if instant_answer: - results.append({'answer': u' '.join(instant_answer)}) + # results --> answer + answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()') + if answer: + results.append({'answer': ' '.join(answer)}) + else: + logger.debug("did not found 'answer'") + + # results --> number_of_results try: - results_num = int(eval_xpath(dom, '//div[@id="resultStats"]//text()')[0] - .split()[1].replace(',', '')) - results.append({'number_of_results': results_num}) - except: - pass + _txt = eval_xpath(dom, '//div[@id="result-stats"]//text()')[0] + _digit = ''.join([n for n in _txt if n.isdigit()]) + number_of_results = int(_digit) + results.append({'number_of_results': number_of_results}) + + except Exception as e: # pylint: disable=broad-except + logger.debug("did not 'number_of_results'") + logger.error(e, exc_info=True) # parse results for result in eval_xpath(dom, results_xpath): + + # google *sections* + if extract_text(eval_xpath(result, g_section_with_header)): + logger.debug("ingoring <g-section-with-header>") + continue + try: title = extract_text(eval_xpath(result, title_xpath)[0]) - url = parse_url(extract_url(eval_xpath(result, url_xpath), google_url), google_hostname) - parsed_url = urlparse(url, google_hostname) - - # map result - if parsed_url.netloc == google_hostname: - # TODO fix inside links - continue - # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start): - # print "yooooo"*30 - # x = eval_xpath(result, map_near) - # if len(x) > 0: - # # map : near the location - # results = results + parse_map_near(parsed_url, x, google_hostname) - # else: - # # map : detail about a location - # results = results + parse_map_detail(parsed_url, result, google_hostname) - # # google news - # elif parsed_url.path == search_path: - # # skipping news results - # pass - - # # images result - # elif parsed_url.path == images_path: - # # only thumbnail image provided, - # # so skipping image results - # # results = results + parse_images(result, google_hostname) - # pass - - else: - # normal result - content = extract_text_from_dom(result, content_xpath) - if content is None: - continue - - # append result - results.append({'url': url, - 'title': title, - 'content': content - }) - except: - logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True)) + url = eval_xpath(result, href_xpath)[0] + content = extract_text_from_dom(result, content_xpath) + results.append({ + 'url': url, + 'title': title, + 'content': content + }) + except Exception as e: # pylint: disable=broad-except + logger.error(e, exc_info=True) + # from lxml import etree + # logger.debug(etree.tostring(result, pretty_print=True)) + # import pdb + # pdb.set_trace() continue # parse suggestion @@ -291,101 +277,16 @@ def response(resp): return results -def parse_images(result, google_hostname): - results = [] - for image in eval_xpath(result, images_xpath): - url = parse_url(extract_text(eval_xpath(image, image_url_xpath)[0]), google_hostname) - img_src = extract_text(eval_xpath(image, image_img_src_xpath)[0]) - - # append result - results.append({'url': url, - 'title': '', - 'content': '', - 'img_src': img_src, - 'template': 'images.html' - }) - - return results - - -def parse_map_near(parsed_url, x, google_hostname): - results = [] - - for result in x: - title = extract_text_from_dom(result, map_near_title) - url = parse_url(extract_text_from_dom(result, map_near_url), google_hostname) - attributes = [] - phone = extract_text_from_dom(result, map_near_phone) - add_attributes(attributes, property_phone, phone, 'tel:' + phone) - results.append({'title': title, - 'url': url, - 'content': attributes_to_html(attributes) - }) - - return results - - -def parse_map_detail(parsed_url, result, google_hostname): - results = [] - - # try to parse the geoloc - m = re.search(r'@([0-9\.]+),([0-9\.]+),([0-9]+)', parsed_url.path) - if m is None: - m = re.search(r'll\=([0-9\.]+),([0-9\.]+)\&z\=([0-9]+)', parsed_url.query) - - if m is not None: - # geoloc found (ignored) - lon = float(m.group(2)) # noqa - lat = float(m.group(1)) # noqa - zoom = int(m.group(3)) # noqa - - # attributes - attributes = [] - address = extract_text_from_dom(result, map_address_xpath) - phone = extract_text_from_dom(result, map_phone_xpath) - add_attributes(attributes, property_address, address, 'geo:' + str(lat) + ',' + str(lon)) - add_attributes(attributes, property_phone, phone, 'tel:' + phone) - - # title / content / url - website_title = extract_text_from_dom(result, map_website_title_xpath) - content = extract_text_from_dom(result, content_xpath) - website_url = parse_url(extract_text_from_dom(result, map_website_url_xpath), google_hostname) - - # add a result if there is a website - if website_url is not None: - results.append({'title': website_title, - 'content': (content + '<br />' if content is not None else '') - + attributes_to_html(attributes), - 'url': website_url - }) - - return results - - -def add_attributes(attributes, name, value, url): - if value is not None and len(value) > 0: - attributes.append({'label': name, 'value': value, 'url': url}) - - -def attributes_to_html(attributes): - retval = '<table class="table table-striped">' - for a in attributes: - value = a.get('value') - if 'url' in a: - value = '<a href="' + a.get('url') + '">' + value + '</a>' - retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>' - retval = retval + '</table>' - return retval - - # get supported languages from their site def _fetch_supported_languages(resp): - supported_languages = {} + ret_val = {} dom = html.fromstring(resp.text) - options = eval_xpath(dom, '//*[@id="langSec"]//input[@name="lr"]') - for option in options: - code = eval_xpath(option, './@value')[0].split('_')[-1] - name = eval_xpath(option, './@data-name')[0].title() - supported_languages[code] = {"name": name} - return supported_languages + radio_buttons = eval_xpath(dom, '//*[@id="langSec"]//input[@name="lang"]') + + for x in radio_buttons: + name = x.get("data-name") + code = x.get("value") + ret_val[code] = {"name": name} + + return ret_val diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index 636913114..9dd5fad2c 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -1,97 +1,225 @@ -""" - Google (Images) +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Google (Images) + +:website: https://images.google.com (redirected to subdomain www.) +:provide-api: yes (https://developers.google.com/custom-search/) +:using-api: not the offical, since it needs registration to another service +:results: HTML +:stable: no +:template: images.html +:parse: url, title, content, source, thumbnail_src, img_src + +For detailed description of the *REST-full* API see: `Query Parameter +Definitions`_. + +.. _admonition:: Content-Security-Policy (CSP) - @website https://www.google.com - @provide-api yes (https://developers.google.com/custom-search/) + This engine needs to allow images from the `data URLs`_ (prefixed with the + ``data:` scheme).:: + + Header set Content-Security-Policy "img-src 'self' data: ;" + +.. _Query Parameter Definitions: + https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions - @using-api no - @results HTML chunks with JSON inside - @stable no - @parse url, title, img_src """ -from datetime import date, timedelta -from json import loads +from urllib.parse import urlencode, urlparse, unquote from lxml import html -from searx.url_utils import urlencode +from flask_babel import gettext +from searx import logger +from searx.utils import eval_xpath +from searx.engines.xpath import extract_text + +# pylint: disable=unused-import +from searx.engines.google import ( + supported_languages_url, + _fetch_supported_languages, +) +# pylint: enable=unused-import + +from searx.engines.google import ( + get_lang_country, + google_domains, + time_range_dict, +) + +logger = logger.getChild('google images') # engine dependent config + categories = ['images'] -paging = True -safesearch = True +paging = False +language_support = True +use_locale_domain = True time_range_support = True -number_of_results = 100 +safesearch = True -search_url = 'https://www.google.com/search'\ - '?{query}'\ - '&tbm=isch'\ - '&yv=2'\ - '&{search_options}' -time_range_attr = "qdr:{range}" -time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}" -time_range_dict = {'day': 'd', - 'week': 'w', - 'month': 'm'} +filter_mapping = { + 0: 'images', + 1: 'active', + 2: 'active' +} + + +def scrap_out_thumbs(dom): + """Scrap out thumbnail data from <script> tags. + """ + ret_val = dict() + for script in eval_xpath(dom, '//script[contains(., "_setImgSrc(")]'): + _script = script.text + # _setImgSrc('0','data:image\/jpeg;base64,\/9j\/4AAQSkZJR ....'); + _thumb_no, _img_data = _script[len("_setImgSrc("):-2].split(",", 1) + _thumb_no = _thumb_no.replace("'", "") + _img_data = _img_data.replace("'", "") + _img_data = _img_data.replace(r"\/", r"/") + ret_val[_thumb_no] = _img_data.replace(r"\x3d", "=") + return ret_val + + +def scrap_img_by_id(script, data_id): + """Get full image URL by data-id in parent element + """ + img_url = '' + _script = script.split('\n') + for i, line in enumerate(_script): + if 'gstatic.com/images' in line and data_id in line: + url_line = _script[i + 1] + img_url = url_line.split('"')[1] + img_url = unquote(img_url.replace(r'\u00', r'%')) + return img_url -# do search-request def request(query, params): - search_options = { - 'ijn': params['pageno'] - 1, - 'start': (params['pageno'] - 1) * number_of_results - } + """Google-Video search request""" + + language, country, lang_country = get_lang_country( + # pylint: disable=undefined-variable + params, supported_languages, language_aliases + ) + subdomain = 'www.' + google_domains.get(country.upper(), 'google.com') + + query_url = 'https://' + subdomain + '/search' + "?" + urlencode({ + 'q': query, + 'tbm': "isch", + 'hl': lang_country, + 'lr': "lang_" + language, + 'ie': "utf8", + 'oe': "utf8", + 'num': 30, + }) if params['time_range'] in time_range_dict: - search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']]) - elif params['time_range'] == 'year': - now = date.today() - then = now - timedelta(days=365) - start = then.strftime('%m/%d/%Y') - end = now.strftime('%m/%d/%Y') - search_options['tbs'] = time_range_custom_attr.format(start=start, end=end) - - if safesearch and params['safesearch']: - search_options['safe'] = 'on' - - params['url'] = search_url.format(query=urlencode({'q': query}), - search_options=urlencode(search_options)) - + query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) + if params['safesearch']: + query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) + + params['url'] = query_url + logger.debug("query_url --> %s", query_url) + + params['headers']['Accept-Language'] = ( + "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language)) + logger.debug( + "HTTP Accept-Language --> %s", params['headers']['Accept-Language']) + params['headers']['Accept'] = ( + 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + ) + # params['google_subdomain'] = subdomain return params -# get response from search-request def response(resp): + """Get response from google's search request""" results = [] + # detect google sorry + resp_url = urlparse(resp.url) + if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect': + raise RuntimeWarning('sorry.google.com') + + if resp_url.path.startswith('/sorry'): + raise RuntimeWarning(gettext('CAPTCHA required')) + + # which subdomain ? + # subdomain = resp.search_params.get('google_subdomain') + + # convert the text to dom dom = html.fromstring(resp.text) + img_bas64_map = scrap_out_thumbs(dom) + img_src_script = eval_xpath(dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text # parse results - for result in dom.xpath('//div[contains(@class, "rg_meta")]/text()'): + # + # root element:: + # <div id="islmp" ..> + # result div per image:: + # <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..." + # The data-id matches to a item in a json-data structure in:: + # <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ... + # In this structure the link to the origin PNG, JPG or whatever is given + # first link per image-div contains a <img> with the data-iid for bas64 encoded image data:: + # <img class="rg_i Q4LuWd" data-iid="0" + # second link per image-div is the target link:: + # <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper"> + # the second link also contains two div tags with the *description* and *publisher*:: + # <div class="WGvvNb">The Sacrament of the Last Supper ...</div> + # <div class="fxgdke">en.wikipedia.org</div> + + root = eval_xpath(dom, '//div[@id="islmp"]') + if not root: + logger.error("did not find root element id='islmp'") + return results + + root = root[0] + for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'): try: - metadata = loads(result) - - img_format = metadata.get('ity', '') - img_width = metadata.get('ow', '') - img_height = metadata.get('oh', '') - if img_width and img_height: - img_format += " {0}x{1}".format(img_width, img_height) - - source = metadata.get('st', '') - source_url = metadata.get('isu', '') - if source_url: - source += " ({0})".format(source_url) - - results.append({'url': metadata['ru'], - 'title': metadata['pt'], - 'content': metadata.get('s', ''), - 'source': source, - 'img_format': img_format, - 'thumbnail_src': metadata['tu'], - 'img_src': metadata['ou'], - 'template': 'images.html'}) - - except: + img_alt = eval_xpath(img_node, '@alt')[0] + + img_base64_id = eval_xpath(img_node, '@data-iid') + if img_base64_id: + img_base64_id = img_base64_id[0] + thumbnail_src = img_bas64_map[img_base64_id] + else: + thumbnail_src = eval_xpath(img_node, '@src') + if not thumbnail_src: + thumbnail_src = eval_xpath(img_node, '@data-src') + if thumbnail_src: + thumbnail_src = thumbnail_src[0] + else: + thumbnail_src = '' + + link_node = eval_xpath(img_node, '../../../a[2]')[0] + url = eval_xpath(link_node, '@href')[0] + + pub_nodes = eval_xpath(link_node, './div/div') + pub_descr = img_alt + pub_source = '' + if pub_nodes: + pub_descr = extract_text(pub_nodes[0]) + pub_source = extract_text(pub_nodes[1]) + + img_src_id = eval_xpath(img_node, '../../../@data-id')[0] + src_url = scrap_img_by_id(img_src_script, img_src_id) + if not src_url: + src_url = thumbnail_src + + results.append({ + 'url': url, + 'title': img_alt, + 'content': pub_descr, + 'source': pub_source, + 'img_src': src_url, + # 'img_format': img_format, + 'thumbnail_src': thumbnail_src, + 'template': 'images.html' + }) + except Exception as e: # pylint: disable=broad-except + logger.error(e, exc_info=True) + # from lxml import etree + # logger.debug(etree.tostring(img_node, pretty_print=True)) + # import pdb + # pdb.set_trace() continue return results diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index c9cc75435..08875328c 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -10,9 +10,9 @@ @parse url, title, content, publishedDate """ +from urllib.parse import urlencode from lxml import html from searx.engines.google import _fetch_supported_languages, supported_languages_url -from searx.url_utils import urlencode from searx.utils import match_language # search-url diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index fd6b2e3be..08af55902 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -12,9 +12,9 @@ from datetime import date, timedelta from json import loads +from urllib.parse import urlencode from lxml import html from searx.engines.xpath import extract_text -from searx.url_utils import urlencode import re # engine dependent config diff --git a/searx/engines/ina.py b/searx/engines/ina.py index ea509649f..cce580273 100644 --- a/searx/engines/ina.py +++ b/searx/engines/ina.py @@ -12,15 +12,12 @@ # @todo embedded (needs some md5 from video page) from json import loads +from urllib.parse import urlencode from lxml import html from dateutil import parser +from html.parser import HTMLParser from searx.engines.xpath import extract_text -from searx.url_utils import urlencode -try: - from HTMLParser import HTMLParser -except: - from html.parser import HTMLParser # engine dependent config categories = ['videos'] diff --git a/searx/engines/invidious.py b/searx/engines/invidious.py index 8d81691fc..6ea942699 100644 --- a/searx/engines/invidious.py +++ b/searx/engines/invidious.py @@ -6,9 +6,9 @@ # @using-api yes # @results JSON # @stable yes -# @parse url, title, content, publishedDate, thumbnail, embedded +# @parse url, title, content, publishedDate, thumbnail, embedded, author, length -from searx.url_utils import quote_plus +from urllib.parse import quote_plus from dateutil import parser import time @@ -84,13 +84,20 @@ def response(resp): publishedDate = parser.parse( time.ctime(result.get("published", 0)) ) + length = time.gmtime(result.get("lengthSeconds")) + if length.tm_hour: + length = time.strftime("%H:%M:%S", length) + else: + length = time.strftime("%M:%S", length) results.append( { "url": url, "title": result.get("title", ""), "content": result.get("description", ""), + 'length': length, "template": "videos.html", + "author": result.get("author"), "publishedDate": publishedDate, "embedded": embedded, "thumbnail": thumbnail, diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py index 785b0c490..1e5c39ac4 100644 --- a/searx/engines/json_engine.py +++ b/searx/engines/json_engine.py @@ -1,11 +1,8 @@ from collections import Iterable from json import loads -from sys import version_info -from searx.url_utils import urlencode +from urllib.parse import urlencode from searx.utils import to_string -if version_info[0] == 3: - unicode = str search_url = None url_query = None @@ -37,8 +34,6 @@ def iterate(iterable): def is_iterable(obj): if type(obj) == str: return False - if type(obj) == unicode: - return False return isinstance(obj, Iterable) diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py index 5e897c96f..af48d990b 100644 --- a/searx/engines/kickass.py +++ b/searx/engines/kickass.py @@ -12,9 +12,9 @@ from lxml import html from operator import itemgetter +from urllib.parse import quote, urljoin from searx.engines.xpath import extract_text from searx.utils import get_torrent_size, convert_str_to_int -from searx.url_utils import quote, urljoin # engine dependent config categories = ['videos', 'music', 'files'] diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py index 0607ac93b..50ba74efc 100644 --- a/searx/engines/mediawiki.py +++ b/searx/engines/mediawiki.py @@ -14,7 +14,7 @@ from json import loads from string import Formatter -from searx.url_utils import urlencode, quote +from urllib.parse import urlencode, quote # engine dependent config categories = ['general'] @@ -79,7 +79,7 @@ def response(resp): if result.get('snippet', '').startswith('#REDIRECT'): continue url = base_url.format(language=resp.search_params['language']) +\ - 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8')) + 'wiki/' + quote(result['title'].replace(' ', '_').encode()) # append result results.append({'url': url, diff --git a/searx/engines/microsoft_academic.py b/searx/engines/microsoft_academic.py index 9bac0069c..7426eef7e 100644 --- a/searx/engines/microsoft_academic.py +++ b/searx/engines/microsoft_academic.py @@ -12,8 +12,7 @@ Microsoft Academic (Science) from datetime import datetime from json import loads from uuid import uuid4 - -from searx.url_utils import urlencode +from urllib.parse import urlencode from searx.utils import html_to_text categories = ['images'] diff --git a/searx/engines/mixcloud.py b/searx/engines/mixcloud.py index 470c007ea..0606350a9 100644 --- a/searx/engines/mixcloud.py +++ b/searx/engines/mixcloud.py @@ -12,7 +12,7 @@ from json import loads from dateutil import parser -from searx.url_utils import urlencode +from urllib.parse import urlencode # engine dependent config categories = ['music'] diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py index c57979a5f..ed8897ddc 100644 --- a/searx/engines/nyaa.py +++ b/searx/engines/nyaa.py @@ -10,8 +10,8 @@ """ from lxml import html +from urllib.parse import urlencode from searx.engines.xpath import extract_text -from searx.url_utils import urlencode from searx.utils import get_torrent_size, int_or_zero # engine dependent config diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py index cec10a3c7..5475c7a6d 100644 --- a/searx/engines/openstreetmap.py +++ b/searx/engines/openstreetmap.py @@ -10,7 +10,9 @@ @parse url, title """ +import re from json import loads +from flask_babel import gettext # engine dependent config categories = ['map'] @@ -21,10 +23,15 @@ base_url = 'https://nominatim.openstreetmap.org/' search_string = 'search/{query}?format=json&polygon_geojson=1&addressdetails=1' result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}' +route_url = 'https://graphhopper.com/maps/?point={}&point={}&locale=en-US&vehicle=car&weighting=fastest&turn_costs=true&use_miles=false&layer=Omniscale' # noqa +route_re = re.compile('(?:from )?(.+) to (.+)') + # do search-request def request(query, params): - params['url'] = base_url + search_string.format(query=query.decode('utf-8')) + + params['url'] = base_url + search_string.format(query=query) + params['route'] = route_re.match(query) return params @@ -34,12 +41,18 @@ def response(resp): results = [] json = loads(resp.text) + if resp.search_params['route']: + results.append({ + 'answer': gettext('Get directions'), + 'url': route_url.format(*resp.search_params['route'].groups()), + }) + # parse results for r in json: if 'display_name' not in r: continue - title = r['display_name'] or u'' + title = r['display_name'] or '' osm_type = r.get('osm_type', r.get('type')) url = result_base_url.format(osm_type=osm_type, osm_id=r['osm_id']) @@ -51,7 +64,7 @@ def response(resp): # if no geojson is found and osm_type is a node, add geojson Point if not geojson and osm_type == 'node': - geojson = {u'type': u'Point', u'coordinates': [r['lon'], r['lat']]} + geojson = {'type': 'Point', 'coordinates': [r['lon'], r['lat']]} address_raw = r.get('address') address = {} diff --git a/searx/engines/peertube.py b/searx/engines/peertube.py new file mode 100644 index 000000000..58ff38c02 --- /dev/null +++ b/searx/engines/peertube.py @@ -0,0 +1,95 @@ +""" + peertube (Videos) + + @website https://www.peertube.live + @provide-api yes (https://docs.joinpeertube.org/api-rest-reference.html) + + @using-api yes + @results JSON + @stable yes + @parse url, title, thumbnail, publishedDate, embedded + + @todo implement time range support +""" + +from json import loads +from datetime import datetime +from urllib.parse import urlencode +from searx.utils import html_to_text + +# engine dependent config +categories = ["videos"] +paging = True +language_support = True +base_url = "https://peer.tube/" +supported_languages_url = base_url + "api/v1/videos/languages" + + +# do search-request +def request(query, params): + pageno = (params["pageno"] - 1) * 15 + search_url = base_url + "api/v1/search/videos/?pageno={pageno}&{query}" + query_dict = {"search": query} + language = params["language"].split("-")[0] + # pylint: disable=undefined-variable + if "all" != language and language in supported_languages: + query_dict["languageOneOf"] = language + params["url"] = search_url.format( + query=urlencode(query_dict), pageno=pageno + ) + return params + + +def _get_offset_from_pageno(pageno): + return (pageno - 1) * 15 + 1 + + +# get response from search-request +def response(resp): + results = [] + + search_res = loads(resp.text) + + embedded_url = ( + '<iframe width="560" height="315" sandbox="allow-same-origin allow-scripts allow-popups" ' + + 'src="' + + base_url + + '{embed_path}" frameborder="0" allowfullscreen></iframe>' + ) + # return empty array if there are no results + if "data" not in search_res: + return [] + + # parse results + for res in search_res["data"]: + title = res["name"] + url = base_url + "/videos/watch/" + res["uuid"] + description = res["description"] + if description: + content = html_to_text(res["description"]) + else: + content = None + thumbnail = base_url + res["thumbnailPath"] + publishedDate = datetime.strptime(res["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + embedded = embedded_url.format(embed_path=res["embedPath"][1:]) + + results.append( + { + "template": "videos.html", + "url": url, + "title": title, + "content": content, + "publishedDate": publishedDate, + "embedded": embedded, + "thumbnail": thumbnail, + } + ) + + # return results + return results + + +def _fetch_supported_languages(resp): + ret_val = {} + peertube_languages = list(loads(resp.text).keys()) + return peertube_languages diff --git a/searx/engines/photon.py b/searx/engines/photon.py index 15236f680..9201fc168 100644 --- a/searx/engines/photon.py +++ b/searx/engines/photon.py @@ -11,8 +11,8 @@ """ from json import loads +from urllib.parse import urlencode from searx.utils import searx_useragent -from searx.url_utils import urlencode # engine dependent config categories = ['map'] diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py index 2f3f22a97..42866d058 100644 --- a/searx/engines/piratebay.py +++ b/searx/engines/piratebay.py @@ -1,44 +1,52 @@ # Piratebay (Videos, Music, Files) # -# @website https://thepiratebay.se -# @provide-api no (nothing found) +# @website https://thepiratebay.org +# @provide-api yes (https://apibay.org/) # -# @using-api no -# @results HTML (using search portal) -# @stable yes (HTML can change) -# @parse url, title, content, seed, leech, magnetlink +# @using-api yes +# @results JSON +# @stable no (the API is not documented nor versioned) +# @parse url, title, seed, leech, magnetlink, filesize, publishedDate -from lxml import html +from json import loads +from datetime import datetime from operator import itemgetter + +from urllib.parse import quote, urljoin from searx.engines.xpath import extract_text -from searx.url_utils import quote, urljoin +from searx.utils import get_torrent_size # engine dependent config -categories = ['videos', 'music', 'files'] -paging = True +categories = ["videos", "music", "files"] # search-url -url = 'https://thepiratebay.org/' -search_url = url + 'search/{search_term}/{pageno}/99/{search_type}' +url = "https://thepiratebay.org/" +search_url = "https://apibay.org/q.php?q={search_term}&cat={search_type}" + +# default trackers provided by thepiratebay +trackers = [ + "udp://tracker.coppersurfer.tk:6969/announce", + "udp://9.rarbg.to:2920/announce", + "udp://tracker.opentrackr.org:1337", + "udp://tracker.internetwarriors.net:1337/announce", + "udp://tracker.leechers-paradise.org:6969/announce", + "udp://tracker.coppersurfer.tk:6969/announce", + "udp://tracker.pirateparty.gr:6969/announce", + "udp://tracker.cyberia.is:6969/announce", +] # piratebay specific type-definitions -search_types = {'files': '0', - 'music': '100', - 'videos': '200'} - -# specific xpath variables -magnet_xpath = './/a[@title="Download this torrent using magnet"]' -torrent_xpath = './/a[@title="Download this torrent"]' -content_xpath = './/font[@class="detDesc"]' +search_types = {"files": "0", + "music": "100", + "videos": "200"} # do search-request def request(query, params): - search_type = search_types.get(params['category'], '0') + search_type = search_types.get(params["category"], "0") - params['url'] = search_url.format(search_term=quote(query), - search_type=search_type, - pageno=params['pageno'] - 1) + params["url"] = search_url.format(search_term=quote(query), + search_type=search_type) return params @@ -47,50 +55,43 @@ def request(query, params): def response(resp): results = [] - dom = html.fromstring(resp.text) - - search_res = dom.xpath('//table[@id="searchResult"]//tr') + search_res = loads(resp.text) # return empty array if nothing is found - if not search_res: + if search_res[0]["name"] == "No results returned": return [] # parse results - for result in search_res[1:]: - link = result.xpath('.//div[@class="detName"]//a')[0] - href = urljoin(url, link.attrib.get('href')) - title = extract_text(link) - content = extract_text(result.xpath(content_xpath)) - seed, leech = result.xpath('.//td[@align="right"]/text()')[:2] - - # convert seed to int if possible - if seed.isdigit(): - seed = int(seed) - else: - seed = 0 - - # convert leech to int if possible - if leech.isdigit(): - leech = int(leech) - else: - leech = 0 - - magnetlink = result.xpath(magnet_xpath)[0] - torrentfile_links = result.xpath(torrent_xpath) - if torrentfile_links: - torrentfile_link = torrentfile_links[0].attrib.get('href') - else: - torrentfile_link = None + for result in search_res: + link = url + "description.php?id=" + result["id"] + magnetlink = "magnet:?xt=urn:btih:" + result["info_hash"] + "&dn=" + result["name"]\ + + "&tr=" + "&tr=".join(trackers) + + params = { + "url": link, + "title": result["name"], + "seed": result["seeders"], + "leech": result["leechers"], + "magnetlink": magnetlink, + "template": "torrent.html" + } + + # extract and convert creation date + try: + date = datetime.fromtimestamp(float(result["added"])) + params['publishedDate'] = date + except: + pass + + # let's try to calculate the torrent size + try: + filesize = get_torrent_size(result["size"], "B") + params['filesize'] = filesize + except: + pass # append result - results.append({'url': href, - 'title': title, - 'content': content, - 'seed': seed, - 'leech': leech, - 'magnetlink': magnetlink.attrib.get('href'), - 'torrentfile': torrentfile_link, - 'template': 'torrent.html'}) + results.append(params) # return results sorted by seeder - return sorted(results, key=itemgetter('seed'), reverse=True) + return sorted(results, key=itemgetter("seed"), reverse=True) diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py index 055f09226..7eb2e92f9 100644 --- a/searx/engines/pubmed.py +++ b/searx/engines/pubmed.py @@ -14,7 +14,7 @@ from flask_babel import gettext from lxml import etree from datetime import datetime -from searx.url_utils import urlencode +from urllib.parse import urlencode from searx.poolrequests import get diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 54e9dafad..ac918b905 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -12,9 +12,9 @@ from datetime import datetime from json import loads -from searx.utils import html_to_text -from searx.url_utils import urlencode -from searx.utils import match_language +from urllib.parse import urlencode +from searx.utils import html_to_text, match_language + # engine dependent config categories = None diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py index d19724906..e732875cb 100644 --- a/searx/engines/reddit.py +++ b/searx/engines/reddit.py @@ -12,7 +12,7 @@ import json from datetime import datetime -from searx.url_utils import urlencode, urljoin, urlparse +from urllib.parse import urlencode, urljoin, urlparse # engine dependent config categories = ['general', 'images', 'news', 'social media'] diff --git a/searx/engines/scanr_structures.py b/searx/engines/scanr_structures.py index 7208dcb70..6dbbf4fd9 100644 --- a/searx/engines/scanr_structures.py +++ b/searx/engines/scanr_structures.py @@ -11,7 +11,7 @@ """ from json import loads, dumps -from searx.utils import html_to_text +from urllib.parse import html_to_text # engine dependent config categories = ['science'] @@ -29,7 +29,7 @@ def request(query, params): params['url'] = search_url params['method'] = 'POST' params['headers']['Content-type'] = "application/json" - params['data'] = dumps({"query": query.decode('utf-8'), + params['data'] = dumps({"query": query, "searchField": "ALL", "sortDirection": "ASC", "sortOrder": "RELEVANCY", diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py index 789e8e7a9..706285814 100644 --- a/searx/engines/searchcode_code.py +++ b/searx/engines/searchcode_code.py @@ -11,7 +11,7 @@ """ from json import loads -from searx.url_utils import urlencode +from urllib.parse import urlencode # engine dependent config diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py index 4b8e9a84a..878d2e792 100644 --- a/searx/engines/searchcode_doc.py +++ b/searx/engines/searchcode_doc.py @@ -11,7 +11,7 @@ """ from json import loads -from searx.url_utils import urlencode +from urllib.parse import urlencode # engine dependent config categories = ['it'] diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py index f9b1f99c8..3778abe7b 100644 --- a/searx/engines/seedpeer.py +++ b/searx/engines/seedpeer.py @@ -11,7 +11,7 @@ from lxml import html from json import loads from operator import itemgetter -from searx.url_utils import quote, urljoin +from urllib.parse import quote, urljoin from searx.engines.xpath import extract_text diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py index 284689bf6..5165ea3ea 100644 --- a/searx/engines/soundcloud.py +++ b/searx/engines/soundcloud.py @@ -14,14 +14,11 @@ import re from json import loads from lxml import html from dateutil import parser +from io import StringIO +from urllib.parse import quote_plus, urlencode from searx import logger from searx.poolrequests import get as http_get -from searx.url_utils import quote_plus, urlencode -try: - from cStringIO import StringIO -except: - from io import StringIO # engine dependent config categories = ['music'] @@ -61,7 +58,7 @@ def get_client_id(): # gets app_js and searches for the clientid response = http_get(app_js_url) if response.ok: - cids = cid_re.search(response.content.decode("utf-8")) + cids = cid_re.search(response.content.decode()) if cids is not None and len(cids.groups()): return cids.groups()[0] logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!") diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py index 00c395706..74942326e 100644 --- a/searx/engines/spotify.py +++ b/searx/engines/spotify.py @@ -11,7 +11,7 @@ """ from json import loads -from searx.url_utils import urlencode +from urllib.parse import urlencode import requests import base64 @@ -39,8 +39,8 @@ def request(query, params): 'https://accounts.spotify.com/api/token', data={'grant_type': 'client_credentials'}, headers={'Authorization': 'Basic ' + base64.b64encode( - "{}:{}".format(api_client_id, api_client_secret).encode('utf-8') - ).decode('utf-8')} + "{}:{}".format(api_client_id, api_client_secret).encode() + ).decode()} ) j = loads(r.text) params['headers'] = {'Authorization': 'Bearer {}'.format(j.get('access_token'))} @@ -59,7 +59,7 @@ def response(resp): if result['type'] == 'track': title = result['name'] url = result['external_urls']['spotify'] - content = u'{} - {} - {}'.format( + content = '{} - {} - {}'.format( result['artists'][0]['name'], result['album']['name'], result['name']) diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py index 25875aa15..90e4543d7 100644 --- a/searx/engines/stackoverflow.py +++ b/searx/engines/stackoverflow.py @@ -10,9 +10,9 @@ @parse url, title, content """ +from urllib.parse import urlencode, urljoin from lxml import html from searx.engines.xpath import extract_text -from searx.url_utils import urlencode, urljoin # engine dependent config categories = ['it'] diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py index 773212043..9c8774d7c 100644 --- a/searx/engines/tokyotoshokan.py +++ b/searx/engines/tokyotoshokan.py @@ -11,10 +11,10 @@ """ import re +from urllib.parse import urlencode from lxml import html from searx.engines.xpath import extract_text from datetime import datetime -from searx.url_utils import urlencode from searx.utils import get_torrent_size, int_or_zero # engine dependent config diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py index fd4164a66..fcc8c042c 100644 --- a/searx/engines/torrentz.py +++ b/searx/engines/torrentz.py @@ -1,21 +1,21 @@ """ - Torrentz2.eu (BitTorrent meta-search engine) + Torrentz2.is (BitTorrent meta-search engine) - @website https://torrentz2.eu/ + @website https://torrentz2.is/ @provide-api no @using-api no @results HTML @stable no (HTML can change, although unlikely, - see https://torrentz.eu/torrentz.btsearch) + see https://torrentz.is/torrentz.btsearch) @parse url, title, publishedDate, seed, leech, filesize, magnetlink """ import re +from urllib.parse import urlencode from lxml import html from datetime import datetime from searx.engines.xpath import extract_text -from searx.url_utils import urlencode from searx.utils import get_torrent_size # engine dependent config @@ -23,8 +23,8 @@ categories = ['files', 'videos', 'music'] paging = True # search-url -# https://torrentz2.eu/search?f=EXAMPLE&p=6 -base_url = 'https://torrentz2.eu/' +# https://torrentz2.is/search?f=EXAMPLE&p=6 +base_url = 'https://torrentz2.is/' search_url = base_url + 'search?{query}' diff --git a/searx/engines/translated.py b/searx/engines/translated.py index 5c7b17033..a50e7c830 100644 --- a/searx/engines/translated.py +++ b/searx/engines/translated.py @@ -9,23 +9,19 @@ @parse url, title, content """ import re -from sys import version_info from searx.utils import is_valid_lang -if version_info[0] == 3: - unicode = str - categories = ['general'] -url = u'http://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}' -web_url = u'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' +url = 'https://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}' +web_url = 'https://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' weight = 100 -parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I) +parser_re = re.compile('.*?([a-z]+)-([a-z]+) (.{2,})$', re.I) api_key = '' def request(query, params): - m = parser_re.match(unicode(query, 'utf8')) + m = parser_re.match(query) if not m: return params diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py index d2a8d2088..549b14e96 100644 --- a/searx/engines/twitter.py +++ b/searx/engines/twitter.py @@ -12,10 +12,10 @@ @todo publishedDate """ +from urllib.parse import urlencode, urljoin from lxml import html from datetime import datetime from searx.engines.xpath import extract_text -from searx.url_utils import urlencode, urljoin # engine dependent config categories = ['social media'] diff --git a/searx/engines/unsplash.py b/searx/engines/unsplash.py index 2e8d6fdfc..45c6b30da 100644 --- a/searx/engines/unsplash.py +++ b/searx/engines/unsplash.py @@ -10,7 +10,7 @@ @parse url, title, img_src, thumbnail_src """ -from searx.url_utils import urlencode, urlparse, urlunparse, parse_qsl +from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl from json import loads url = 'https://unsplash.com/' diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py index a92271019..fd3abc858 100644 --- a/searx/engines/vimeo.py +++ b/searx/engines/vimeo.py @@ -12,9 +12,9 @@ # @todo rewrite to api # @todo set content-parameter with correct data +from urllib.parse import urlencode from json import loads from dateutil import parser -from searx.url_utils import urlencode # engine dependent config categories = ['videos'] diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index e913b3915..ffa3724fd 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -15,9 +15,9 @@ from searx import logger from searx.poolrequests import get from searx.engines.xpath import extract_text from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url -from searx.url_utils import urlencode from searx.utils import match_language, eval_xpath +from urllib.parse import urlencode from json import loads from lxml.html import fromstring from lxml import etree @@ -76,7 +76,7 @@ def request(query, params): def response(resp): results = [] htmlparser = etree.HTMLParser() - html = fromstring(resp.content.decode("utf-8"), parser=htmlparser) + html = fromstring(resp.content.decode(), parser=htmlparser) search_results = eval_xpath(html, wikidata_ids_xpath) if resp.search_params['language'].split('-')[0] == 'all': @@ -89,7 +89,7 @@ def response(resp): wikidata_id = search_result.split('/')[-1] url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language})) htmlresponse = get(url) - jsonresponse = loads(htmlresponse.content.decode("utf-8")) + jsonresponse = loads(htmlresponse.content.decode()) results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'], htmlparser) return results @@ -382,7 +382,7 @@ def add_attribute(attributes, id_cache, property_id, default_label=None, date=Fa # requires property_id unless it's a wiki link (defined in link_type) def add_url(urls, result, id_cache, property_id=None, default_label=None, url_prefix=None, results=None, - link_type=None): + link_type=None, only_first=True): links = [] # wiki links don't have property in wikidata page @@ -414,11 +414,15 @@ def add_url(urls, result, id_cache, property_id=None, default_label=None, url_pr # append urls for url in links: if url is not None: - urls.append({'title': default_label or label, - 'url': url}) + u = {'title': default_label or label, 'url': url} + if property_id == 'P856': + u['official'] = True + u['domain'] = url.split('/')[2] + urls.append(u) if results is not None: - results.append({'title': default_label or label, - 'url': url}) + results.append(u) + if only_first: + break def get_imdblink(result, url_prefix): @@ -449,16 +453,16 @@ def get_geolink(result): latitude, longitude = coordinates.split(',') # convert to decimal - lat = int(latitude[:latitude.find(u'°')]) + lat = int(latitude[:latitude.find('°')]) if latitude.find('\'') >= 0: - lat += int(latitude[latitude.find(u'°') + 1:latitude.find('\'')] or 0) / 60.0 + lat += int(latitude[latitude.find('°') + 1:latitude.find('\'')] or 0) / 60.0 if latitude.find('"') >= 0: lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0 if latitude.find('S') >= 0: lat *= -1 - lon = int(longitude[:longitude.find(u'°')]) + lon = int(longitude[:longitude.find('°')]) if longitude.find('\'') >= 0: - lon += int(longitude[longitude.find(u'°') + 1:longitude.find('\'')] or 0) / 60.0 + lon += int(longitude[longitude.find('°') + 1:longitude.find('\'')] or 0) / 60.0 if longitude.find('"') >= 0: lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0 if longitude.find('W') >= 0: diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index a216ba886..620ec3c14 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -1,7 +1,7 @@ """ Wikipedia (Web) - @website https://{language}.wikipedia.org + @website https://en.wikipedia.org/api/rest_v1/ @provide-api yes @using-api yes @@ -10,23 +10,13 @@ @parse url, infobox """ +from urllib.parse import quote from json import loads from lxml.html import fromstring -from searx.url_utils import quote, urlencode -from searx.utils import match_language +from searx.utils import match_language, searx_useragent # search-url -base_url = u'https://{language}.wikipedia.org/' -search_url = base_url + u'w/api.php?'\ - 'action=query'\ - '&format=json'\ - '&{query}'\ - '&prop=extracts|pageimages|pageprops'\ - '&ppprop=disambiguation'\ - '&exintro'\ - '&explaintext'\ - '&pithumbsize=300'\ - '&redirects' +search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' @@ -41,77 +31,37 @@ def url_lang(lang): # do search-request def request(query, params): if query.islower(): - query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8') + query = query.title() - params['url'] = search_url.format(query=urlencode({'titles': query}), + params['url'] = search_url.format(title=quote(query), language=url_lang(params['language'])) - return params - - -# get first meaningful paragraph -# this should filter out disambiguation pages and notes above first paragraph -# "magic numbers" were obtained by fine tuning -def extract_first_paragraph(content, title, image): - first_paragraph = None - - failed_attempts = 0 - for paragraph in content.split('\n'): - - starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35) - length = len(paragraph) + params['headers']['User-Agent'] = searx_useragent() - if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)): - first_paragraph = paragraph - break - - failed_attempts += 1 - if failed_attempts > 3: - return None - - return first_paragraph + return params # get response from search-request def response(resp): - results = [] - - search_result = loads(resp.text) - - # wikipedia article's unique id - # first valid id is assumed to be the requested article - if 'pages' not in search_result['query']: - return results - - for article_id in search_result['query']['pages']: - page = search_result['query']['pages'][article_id] - if int(article_id) > 0: - break - - if int(article_id) < 0 or 'disambiguation' in page.get('pageprops', {}): + if not resp.ok: return [] - title = page.get('title') - - image = page.get('thumbnail') - if image: - image = image.get('source') - - extract = page.get('extract') + results = [] + api_result = loads(resp.text) - summary = extract_first_paragraph(extract, title, image) - summary = summary.replace('() ', '') + # skip disambiguation pages + if api_result['type'] != 'standard': + return [] - # link to wikipedia article - wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \ - + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')) + title = api_result['title'] + wikipedia_link = api_result['content_urls']['desktop']['page'] results.append({'url': wikipedia_link, 'title': title}) results.append({'infobox': title, 'id': wikipedia_link, - 'content': summary, - 'img_src': image, + 'content': api_result.get('extract', ''), + 'img_src': api_result.get('thumbnail', {}).get('source'), 'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]}) return results diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py index 1c58c4a9b..520eaa209 100644 --- a/searx/engines/wolframalpha_api.py +++ b/searx/engines/wolframalpha_api.py @@ -9,7 +9,7 @@ # @parse url, infobox from lxml import etree -from searx.url_utils import urlencode +from urllib.parse import urlencode # search-url search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}' @@ -45,15 +45,15 @@ def request(query, params): # replace private user area characters to make text legible def replace_pua_chars(text): - pua_chars = {u'\uf522': u'\u2192', # rigth arrow - u'\uf7b1': u'\u2115', # set of natural numbers - u'\uf7b4': u'\u211a', # set of rational numbers - u'\uf7b5': u'\u211d', # set of real numbers - u'\uf7bd': u'\u2124', # set of integer numbers - u'\uf74c': 'd', # differential - u'\uf74d': u'\u212f', # euler's number - u'\uf74e': 'i', # imaginary number - u'\uf7d9': '='} # equals sign + pua_chars = {'\uf522': '\u2192', # rigth arrow + '\uf7b1': '\u2115', # set of natural numbers + '\uf7b4': '\u211a', # set of rational numbers + '\uf7b5': '\u211d', # set of real numbers + '\uf7bd': '\u2124', # set of integer numbers + '\uf74c': 'd', # differential + '\uf74d': '\u212f', # euler's number + '\uf74e': 'i', # imaginary number + '\uf7d9': '='} # equals sign for k, v in pua_chars.items(): text = text.replace(k, v) diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 387c9fa17..943d4f3fb 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -10,9 +10,9 @@ from json import loads from time import time +from urllib.parse import urlencode from searx.poolrequests import get as http_get -from searx.url_utils import urlencode # search-url url = 'https://www.wolframalpha.com/' diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py index f1154b16d..1cb74dbad 100644 --- a/searx/engines/www1x.py +++ b/searx/engines/www1x.py @@ -11,7 +11,7 @@ """ from lxml import html -from searx.url_utils import urlencode, urljoin +from urllib.parse import urlencode, urljoin from searx.engines.xpath import extract_text # engine dependent config diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index b75896cc7..bd97a93a5 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -1,7 +1,7 @@ +from urllib.parse import unquote, urlencode, urljoin, urlparse from lxml import html from lxml.etree import _ElementStringResult, _ElementUnicodeResult from searx.utils import html_to_text, eval_xpath -from searx.url_utils import unquote, urlencode, urljoin, urlparse search_url = None url_xpath = None @@ -56,11 +56,15 @@ def extract_url(xpath_results, search_url): if url.startswith('//'): # add http or https to this kind of url //example.com/ parsed_search_url = urlparse(search_url) - url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url) + url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url) elif url.startswith('/'): # fix relative url to the search engine url = urljoin(search_url, url) + # fix relative urls that fall through the crack + if '://' not in url: + url = urljoin(search_url, url) + # normalize url url = normalize_url(url) @@ -82,7 +86,7 @@ def normalize_url(url): p = parsed_url.path mark = p.find('/**') if mark != -1: - return unquote(p[mark + 3:]).decode('utf-8') + return unquote(p[mark + 3:]).decode() return url diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py index 25bc83687..daa151082 100644 --- a/searx/engines/yacy.py +++ b/searx/engines/yacy.py @@ -14,7 +14,7 @@ from json import loads from dateutil import parser -from searx.url_utils import urlencode +from urllib.parse import urlencode from searx.utils import html_to_text @@ -75,7 +75,7 @@ def response(resp): for result in search_results[0].get('items', []): # parse image results - if result.get('image'): + if result.get('image') and result.get('width') and result.get('height'): result_url = '' if 'url' in result: diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index 36c1a11f8..0133b57b5 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -11,9 +11,9 @@ @parse url, title, content, suggestion """ +from urllib.parse import unquote, urlencode from lxml import html from searx.engines.xpath import extract_text, extract_url -from searx.url_utils import unquote, urlencode from searx.utils import match_language, eval_xpath # engine dependent config @@ -33,7 +33,7 @@ supported_languages_url = 'https://search.yahoo.com/web/advanced' results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]" url_xpath = './/h3/a/@href' title_xpath = './/h3/a' -content_xpath = './/div[@class="compText aAbs"]' +content_xpath = './/div[contains(@class, "compText")]' suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a" time_range_dict = {'day': ['1d', 'd'], diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index 9f6a4159b..345e4d91f 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -11,13 +11,13 @@ import re from datetime import datetime, timedelta +from urllib.parse import urlencode from lxml import html from searx.engines.xpath import extract_text, extract_url from searx.engines.yahoo import ( parse_url, _fetch_supported_languages, supported_languages_url, language_aliases ) from dateutil import parser -from searx.url_utils import urlencode from searx.utils import match_language # engine dependent config @@ -58,7 +58,7 @@ def request(query, params): def sanitize_url(url): if ".yahoo.com/" in url: - return re.sub(u"\\;\\_ylt\\=.+$", "", url) + return re.sub("\\;\\_ylt\\=.+$", "", url) else: return url diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py index 1c789f6cb..ff1ef5a26 100644 --- a/searx/engines/yandex.py +++ b/searx/engines/yandex.py @@ -9,9 +9,9 @@ @parse url, title, content """ +from urllib.parse import urlencode from lxml import html from searx import logger -from searx.url_utils import urlencode logger = logger.getChild('yandex engine') diff --git a/searx/engines/yggtorrent.py b/searx/engines/yggtorrent.py new file mode 100644 index 000000000..37bf3b1d9 --- /dev/null +++ b/searx/engines/yggtorrent.py @@ -0,0 +1,124 @@ +# Yggtorrent (Videos, Music, Files) +# +# @website https://www2.yggtorrent.si +# @provide-api no (nothing found) +# +# @using-api no +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, seed, leech, publishedDate, filesize + +from lxml import html +from operator import itemgetter +from datetime import datetime +from urllib.parse import quote +from searx.engines.xpath import extract_text +from searx.utils import get_torrent_size +from searx.poolrequests import get as http_get + +# engine dependent config +categories = ['videos', 'music', 'files'] +paging = True + +# search-url +url = 'https://www2.yggtorrent.si/' +search_url = url + 'engine/search?name={search_term}&do=search&page={pageno}&category={search_type}' + +# yggtorrent specific type-definitions +search_types = {'files': 'all', + 'music': '2139', + 'videos': '2145'} + +cookies = dict() + + +def init(engine_settings=None): + global cookies + # initial cookies + resp = http_get(url, allow_redirects=False) + if resp.ok: + for r in resp.history: + cookies.update(r.cookies) + cookies.update(resp.cookies) + + +# do search-request +def request(query, params): + search_type = search_types.get(params['category'], 'all') + pageno = (params['pageno'] - 1) * 50 + + params['url'] = search_url.format(search_term=quote(query), + search_type=search_type, + pageno=pageno) + + params['cookies'] = cookies + + return params + + +# get response from search-request +def response(resp): + results = [] + dom = html.fromstring(resp.text) + + search_res = dom.xpath('//section[@id="#torrents"]/div/table/tbody/tr') + + # return empty array if nothing is found + if not search_res: + return [] + + # parse results + for result in search_res: + link = result.xpath('.//a[@id="torrent_name"]')[0] + href = link.attrib.get('href') + title = extract_text(link) + seed = result.xpath('.//td[8]/text()')[0] + leech = result.xpath('.//td[9]/text()')[0] + + # convert seed to int if possible + if seed.isdigit(): + seed = int(seed) + else: + seed = 0 + + # convert leech to int if possible + if leech.isdigit(): + leech = int(leech) + else: + leech = 0 + + params = {'url': href, + 'title': title, + 'seed': seed, + 'leech': leech, + 'template': 'torrent.html'} + + # let's try to calculate the torrent size + try: + filesize_info = result.xpath('.//td[6]/text()')[0] + filesize = filesize_info[:-2] + filesize_multiplier = filesize_info[-2:].lower() + multiplier_french_to_english = { + 'to': 'TiB', + 'go': 'GiB', + 'mo': 'MiB', + 'ko': 'KiB' + } + filesize = get_torrent_size(filesize, multiplier_french_to_english[filesize_multiplier]) + params['filesize'] = filesize + except: + pass + + # extract and convert creation date + try: + date_ts = result.xpath('.//td[5]/div/text()')[0] + date = datetime.fromtimestamp(float(date_ts)) + params['publishedDate'] = date + except: + pass + + # append result + results.append(params) + + # return results sorted by seeder + return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py index bc4c0d58e..2542169a6 100644 --- a/searx/engines/youtube_api.py +++ b/searx/engines/youtube_api.py @@ -10,7 +10,7 @@ from json import loads from dateutil import parser -from searx.url_utils import urlencode +from urllib.parse import urlencode # engine dependent config categories = ['videos', 'music'] diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py index 49d0ae604..fef501458 100644 --- a/searx/engines/youtube_noapi.py +++ b/searx/engines/youtube_noapi.py @@ -10,9 +10,9 @@ from functools import reduce from json import loads +from urllib.parse import quote_plus from searx.engines.xpath import extract_text from searx.utils import list_get -from searx.url_utils import quote_plus # engine dependent config categories = ['videos', 'music'] @@ -70,11 +70,15 @@ def response(resp): title = get_text_from_json(video.get('title', {})) content = get_text_from_json(video.get('descriptionSnippet', {})) embedded = embedded_url.format(videoid=videoid) + author = get_text_from_json(video.get('ownerText', {})) + length = get_text_from_json(video.get('lengthText', {})) # append result results.append({'url': url, 'title': title, 'content': content, + 'author': author, + 'length': length, 'template': 'videos.html', 'embedded': embedded, 'thumbnail': thumbnail}) |