summaryrefslogtreecommitdiff
path: root/searx/engines
diff options
context:
space:
mode:
Diffstat (limited to 'searx/engines')
-rw-r--r--searx/engines/1337x.py3
-rw-r--r--searx/engines/__init__.py4
-rw-r--r--searx/engines/acgsou.py4
-rw-r--r--searx/engines/apkmirror.py3
-rw-r--r--searx/engines/archlinux.py4
-rw-r--r--searx/engines/arxiv.py4
-rwxr-xr-xsearx/engines/base.py4
-rw-r--r--searx/engines/bing.py4
-rw-r--r--searx/engines/bing_images.py17
-rw-r--r--searx/engines/bing_news.py3
-rw-r--r--searx/engines/bing_videos.py2
-rw-r--r--searx/engines/btdigg.py2
-rw-r--r--searx/engines/command.py184
-rw-r--r--searx/engines/currency_convert.py7
-rw-r--r--searx/engines/dailymotion.py2
-rw-r--r--searx/engines/deezer.py4
-rw-r--r--searx/engines/deviantart.py3
-rw-r--r--searx/engines/dictzone.py8
-rw-r--r--searx/engines/digbt.py5
-rw-r--r--searx/engines/digg.py2
-rw-r--r--searx/engines/doku.py2
-rw-r--r--searx/engines/duckduckgo.py8
-rw-r--r--searx/engines/duckduckgo_definitions.py2
-rw-r--r--searx/engines/duckduckgo_images.py2
-rw-r--r--searx/engines/duden.py2
-rw-r--r--searx/engines/etools.py2
-rw-r--r--searx/engines/fdroid.py2
-rw-r--r--searx/engines/filecrop.py7
-rw-r--r--searx/engines/flickr.py2
-rw-r--r--searx/engines/flickr_noapi.py14
-rw-r--r--searx/engines/framalibre.py7
-rw-r--r--searx/engines/frinkiac.py2
-rw-r--r--searx/engines/genius.py2
-rw-r--r--searx/engines/gentoo.py4
-rw-r--r--searx/engines/gigablast.py167
-rw-r--r--searx/engines/github.py2
-rw-r--r--searx/engines/google.py529
-rw-r--r--searx/engines/google_images.py264
-rw-r--r--searx/engines/google_news.py2
-rw-r--r--searx/engines/google_videos.py2
-rw-r--r--searx/engines/ina.py7
-rw-r--r--searx/engines/invidious.py11
-rw-r--r--searx/engines/json_engine.py7
-rw-r--r--searx/engines/kickass.py2
-rw-r--r--searx/engines/mediawiki.py4
-rw-r--r--searx/engines/microsoft_academic.py3
-rw-r--r--searx/engines/mixcloud.py2
-rw-r--r--searx/engines/nyaa.py2
-rw-r--r--searx/engines/openstreetmap.py19
-rw-r--r--searx/engines/peertube.py95
-rw-r--r--searx/engines/photon.py2
-rw-r--r--searx/engines/piratebay.py125
-rw-r--r--searx/engines/pubmed.py2
-rw-r--r--searx/engines/qwant.py6
-rw-r--r--searx/engines/reddit.py2
-rw-r--r--searx/engines/scanr_structures.py4
-rw-r--r--searx/engines/searchcode_code.py2
-rw-r--r--searx/engines/searchcode_doc.py2
-rw-r--r--searx/engines/seedpeer.py2
-rw-r--r--searx/engines/soundcloud.py9
-rw-r--r--searx/engines/spotify.py8
-rw-r--r--searx/engines/stackoverflow.py2
-rw-r--r--searx/engines/tokyotoshokan.py2
-rw-r--r--searx/engines/torrentz.py12
-rw-r--r--searx/engines/translated.py12
-rw-r--r--searx/engines/twitter.py2
-rw-r--r--searx/engines/unsplash.py2
-rw-r--r--searx/engines/vimeo.py2
-rw-r--r--searx/engines/wikidata.py28
-rw-r--r--searx/engines/wikipedia.py86
-rw-r--r--searx/engines/wolframalpha_api.py20
-rw-r--r--searx/engines/wolframalpha_noapi.py2
-rw-r--r--searx/engines/www1x.py2
-rw-r--r--searx/engines/xpath.py10
-rw-r--r--searx/engines/yacy.py4
-rw-r--r--searx/engines/yahoo.py4
-rw-r--r--searx/engines/yahoo_news.py4
-rw-r--r--searx/engines/yandex.py2
-rw-r--r--searx/engines/yggtorrent.py124
-rw-r--r--searx/engines/youtube_api.py2
-rw-r--r--searx/engines/youtube_noapi.py6
81 files changed, 1164 insertions, 773 deletions
diff --git a/searx/engines/1337x.py b/searx/engines/1337x.py
index 0de04bd95..76a7a1634 100644
--- a/searx/engines/1337x.py
+++ b/searx/engines/1337x.py
@@ -1,7 +1,8 @@
+from urllib.parse import quote, urljoin
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size
-from searx.url_utils import quote, urljoin
+
url = 'https://1337x.to/'
search_url = url + 'search/{search_term}/{pageno}/'
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 9ccef8b54..9fcf812b0 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -55,6 +55,7 @@ engine_default_args = {'paging': False,
'continuous_errors': 0,
'time_range_support': False,
'offline': False,
+ 'display_error_messages': True,
'tokens': []}
@@ -73,6 +74,9 @@ def load_engine(engine_data):
try:
engine = load_module(engine_module + '.py', engine_dir)
+ except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError) as e:
+ logger.exception('Fatal exception in engine "{}"'.format(engine_module))
+ sys.exit(1)
except:
logger.exception('Cannot load engine "{}"'.format(engine_module))
return None
diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py
index cca28f0db..d5d3e3178 100644
--- a/searx/engines/acgsou.py
+++ b/searx/engines/acgsou.py
@@ -9,9 +9,9 @@
@parse url, title, content, seed, leech, torrentfile
"""
+from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode
from searx.utils import get_torrent_size, int_or_zero
# engine dependent config
@@ -63,7 +63,7 @@ def response(resp):
except:
pass
# I didn't add download/seed/leech count since as I figured out they are generated randomly everytime
- content = u'Category: "{category}".'
+ content = 'Category: "{category}".'
content = content.format(category=category)
results.append({'url': href,
diff --git a/searx/engines/apkmirror.py b/searx/engines/apkmirror.py
index f2ee12b29..4e6dcd486 100644
--- a/searx/engines/apkmirror.py
+++ b/searx/engines/apkmirror.py
@@ -9,9 +9,10 @@
@parse url, title, thumbnail_src
"""
+from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode
+
# engine dependent config
categories = ['it']
diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py
index dce862f55..e2f44b0f5 100644
--- a/searx/engines/archlinux.py
+++ b/searx/engines/archlinux.py
@@ -11,9 +11,9 @@
@parse url, title
"""
+from urllib.parse import urlencode, urljoin
from lxml import html
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode, urljoin
# engine dependent config
categories = ['it']
@@ -105,7 +105,7 @@ def request(query, params):
# if our language is hosted on the main site, we need to add its name
# to the query in order to narrow the results to that language
if language in main_langs:
- query += b' (' + main_langs[language] + b')'
+ query += ' (' + main_langs[language] + ')'
# prepare the request parameters
query = urlencode({'search': query})
diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py
index e3c871d17..77ddc572e 100644
--- a/searx/engines/arxiv.py
+++ b/searx/engines/arxiv.py
@@ -11,9 +11,9 @@
More info on api: https://arxiv.org/help/api/user-manual
"""
+from urllib.parse import urlencode
from lxml import html
from datetime import datetime
-from searx.url_utils import urlencode
categories = ['science']
@@ -30,7 +30,7 @@ def request(query, params):
# basic search
offset = (params['pageno'] - 1) * number_of_results
- string_args = dict(query=query.decode('utf-8'),
+ string_args = dict(query=query,
offset=offset,
number_of_results=number_of_results)
diff --git a/searx/engines/base.py b/searx/engines/base.py
index f1b1cf671..0114f9798 100755
--- a/searx/engines/base.py
+++ b/searx/engines/base.py
@@ -13,10 +13,10 @@
More info on api: http://base-search.net/about/download/base_interface.pdf
"""
+from urllib.parse import urlencode
from lxml import etree
from datetime import datetime
import re
-from searx.url_utils import urlencode
from searx.utils import searx_useragent
@@ -55,7 +55,7 @@ shorcut_dict = {
def request(query, params):
# replace shortcuts with API advanced search keywords
for key in shorcut_dict.keys():
- query = re.sub(key, shorcut_dict[key], str(query))
+ query = re.sub(key, shorcut_dict[key], query)
# basic search
offset = (params['pageno'] - 1) * number_of_results
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index afb776acd..c7b619369 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -14,10 +14,10 @@
"""
import re
+from urllib.parse import urlencode
from lxml import html
from searx import logger, utils
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode
from searx.utils import match_language, gen_useragent, eval_xpath
logger = logger.getChild('bing engine')
@@ -47,7 +47,7 @@ def request(query, params):
else:
lang = match_language(params['language'], supported_languages, language_aliases)
- query = u'language:{} {}'.format(lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8')
+ query = 'language:{} {}'.format(lang.split('-')[0].upper(), query)
search_path = search_string.format(
query=urlencode({'q': query}),
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 138ed11c6..10da42b5c 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -12,10 +12,10 @@
"""
+from urllib.parse import urlencode
from lxml import html
from json import loads
import re
-from searx.url_utils import urlencode
from searx.utils import match_language
from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
@@ -80,19 +80,18 @@ def response(resp):
# parse results
for result in dom.xpath('//div[@class="imgpt"]'):
-
- img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0]
- # Microsoft seems to experiment with this code so don't make the path too specific,
- # just catch the text section for the first anchor in img_info assuming this to be
- # the originating site.
- source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0]
-
try:
+ img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0]
+ # Microsoft seems to experiment with this code so don't make the path too specific,
+ # just catch the text section for the first anchor in img_info assuming this to be
+ # the originating site.
+ source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0]
+
m = loads(result.xpath('./a/@m')[0])
# strip 'Unicode private use area' highlighting, they render to Tux
# the Linux penguin and a standing diamond on my machine...
- title = m.get('t', '').replace(u'\ue000', '').replace(u'\ue001', '')
+ title = m.get('t', '').replace('\ue000', '').replace('\ue001', '')
results.append({'template': 'images.html',
'url': m['purl'],
'thumbnail_src': m['turl'],
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py
index d13be777c..fbe51faed 100644
--- a/searx/engines/bing_news.py
+++ b/searx/engines/bing_news.py
@@ -13,10 +13,9 @@
from datetime import datetime
from dateutil import parser
+from urllib.parse import urlencode, urlparse, parse_qsl
from lxml import etree
from searx.utils import list_get, match_language
-from searx.url_utils import urlencode, urlparse, parse_qsl
-
from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
# engine dependent config
diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py
index f048f0d8e..63264de6f 100644
--- a/searx/engines/bing_videos.py
+++ b/searx/engines/bing_videos.py
@@ -12,7 +12,7 @@
from json import loads
from lxml import html
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
from searx.utils import match_language
from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py
index 82eedc24b..2faade3e2 100644
--- a/searx/engines/btdigg.py
+++ b/searx/engines/btdigg.py
@@ -12,8 +12,8 @@
from lxml import html
from operator import itemgetter
+from urllib.parse import quote, urljoin
from searx.engines.xpath import extract_text
-from searx.url_utils import quote, urljoin
from searx.utils import get_torrent_size
# engine dependent config
diff --git a/searx/engines/command.py b/searx/engines/command.py
new file mode 100644
index 000000000..b9e672ffa
--- /dev/null
+++ b/searx/engines/command.py
@@ -0,0 +1,184 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+'''
+
+
+from os.path import expanduser, isabs, realpath, commonprefix
+from re import MULTILINE, search as re_search
+from shlex import split as shlex_split
+from subprocess import Popen, PIPE
+from time import time
+from threading import Thread
+
+from searx import logger
+
+
+offline = True
+paging = True
+command = []
+delimiter = {}
+parse_regex = {}
+query_type = ''
+query_enum = []
+environment_variables = {}
+working_dir = realpath('.')
+result_separator = '\n'
+result_template = 'key-value.html'
+timeout = 4.0
+
+_command_logger = logger.getChild('command')
+_compiled_parse_regex = {}
+
+
+def init(engine_settings):
+ check_parsing_options(engine_settings)
+
+ if 'command' not in engine_settings:
+ raise ValueError('engine command : missing configuration key: command')
+
+ global command, working_dir, result_template, delimiter, parse_regex, timeout, environment_variables
+
+ command = engine_settings['command']
+
+ if 'working_dir' in engine_settings:
+ working_dir = engine_settings['working_dir']
+ if not isabs(engine_settings['working_dir']):
+ working_dir = realpath(working_dir)
+
+ if 'parse_regex' in engine_settings:
+ parse_regex = engine_settings['parse_regex']
+ for result_key, regex in parse_regex.items():
+ _compiled_parse_regex[result_key] = re.compile(regex, flags=MULTILINE)
+ if 'delimiter' in engine_settings:
+ delimiter = engine_settings['delimiter']
+
+ if 'environment_variables' in engine_settings:
+ environment_variables = engine_settings['environment_variables']
+
+
+def search(query, params):
+ cmd = _get_command_to_run(query)
+ if not cmd:
+ return []
+
+ results = []
+ reader_thread = Thread(target=_get_results_from_process, args=(results, cmd, params['pageno']))
+ reader_thread.start()
+ reader_thread.join(timeout=timeout)
+
+ return results
+
+
+def _get_command_to_run(query):
+ params = shlex_split(query.decode('utf-8'))
+ __check_query_params(params)
+
+ cmd = []
+ for c in command:
+ if c == '{{QUERY}}':
+ cmd.extend(params)
+ else:
+ cmd.append(c)
+
+ return cmd
+
+
+def _get_results_from_process(results, cmd, pageno):
+ leftover = ''
+ count = 0
+ start, end = __get_results_limits(pageno)
+ with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process:
+ line = process.stdout.readline()
+ while line:
+ buf = leftover + line.decode('utf-8')
+ raw_results = buf.split(result_separator)
+ if raw_results[-1]:
+ leftover = raw_results[-1]
+ raw_results = raw_results[:-1]
+
+ for raw_result in raw_results:
+ result = __parse_single_result(raw_result)
+ if result is None:
+ _command_logger.debug('skipped result:', raw_result)
+ continue
+
+ if start <= count and count <= end:
+ result['template'] = result_template
+ results.append(result)
+
+ count += 1
+ if end < count:
+ return results
+
+ line = process.stdout.readline()
+
+ return_code = process.wait(timeout=timeout)
+ if return_code != 0:
+ raise RuntimeError('non-zero return code when running command', cmd, return_code)
+
+
+def __get_results_limits(pageno):
+ start = (pageno - 1) * 10
+ end = start + 9
+ return start, end
+
+
+def __check_query_params(params):
+ if not query_type:
+ return
+
+ if query_type == 'path':
+ query_path = params[-1]
+ query_path = expanduser(query_path)
+ if commonprefix([realpath(query_path), working_dir]) != working_dir:
+ raise ValueError('requested path is outside of configured working directory')
+ elif query_type == 'enum' and len(query_enum) > 0:
+ for param in params:
+ if param not in query_enum:
+ raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum)
+
+
+def check_parsing_options(engine_settings):
+ """ Checks if delimiter based parsing or regex parsing is configured correctly """
+
+ if 'delimiter' not in engine_settings and 'parse_regex' not in engine_settings:
+ raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex')
+ if 'delimiter' in engine_settings and 'parse_regex' in engine_settings:
+ raise ValueError('failed to init settings for parsing lines: too many settings')
+
+ if 'delimiter' in engine_settings:
+ if 'chars' not in engine_settings['delimiter'] or 'keys' not in engine_settings['delimiter']:
+ raise ValueError
+
+
+def __parse_single_result(raw_result):
+ """ Parses command line output based on configuration """
+
+ result = {}
+
+ if delimiter:
+ elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1)
+ if len(elements) != len(delimiter['keys']):
+ return {}
+ for i in range(len(elements)):
+ result[delimiter['keys'][i]] = elements[i]
+
+ if parse_regex:
+ for result_key, regex in _compiled_parse_regex.items():
+ found = regex.search(raw_result)
+ if not found:
+ return {}
+ result[result_key] = raw_result[found.start():found.end()]
+
+ return result
diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py
index 8eab8f673..c6067c4a8 100644
--- a/searx/engines/currency_convert.py
+++ b/searx/engines/currency_convert.py
@@ -1,26 +1,23 @@
import json
import re
import os
-import sys
import unicodedata
from io import open
from datetime import datetime
-if sys.version_info[0] == 3:
- unicode = str
categories = []
url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'
weight = 100
-parser_re = re.compile(b'.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
+parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
db = 1
def normalize_name(name):
- name = name.decode('utf-8').lower().replace('-', ' ').rstrip('s')
+ name = name.lower().replace('-', ' ').rstrip('s')
name = re.sub(' +', ' ', name)
return unicodedata.normalize('NFKD', name).lower()
diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py
index 1038e64bf..1e24e41da 100644
--- a/searx/engines/dailymotion.py
+++ b/searx/engines/dailymotion.py
@@ -14,7 +14,7 @@
from json import loads
from datetime import datetime
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
from searx.utils import match_language, html_to_text
# engine dependent config
diff --git a/searx/engines/deezer.py b/searx/engines/deezer.py
index af63478fb..48c0429a7 100644
--- a/searx/engines/deezer.py
+++ b/searx/engines/deezer.py
@@ -11,7 +11,7 @@
"""
from json import loads
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
# engine dependent config
categories = ['music']
@@ -50,7 +50,7 @@ def response(resp):
if url.startswith('http://'):
url = 'https' + url[4:]
- content = u'{} - {} - {}'.format(
+ content = '{} - {} - {}'.format(
result['artist']['name'],
result['album']['title'],
result['title'])
diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py
index a0e27e622..2bd21fa5d 100644
--- a/searx/engines/deviantart.py
+++ b/searx/engines/deviantart.py
@@ -14,8 +14,9 @@
from lxml import html
import re
+from urllib.parse import urlencode
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode
+
# engine dependent config
categories = ['images']
diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
index 423af0971..5a1fea3cf 100644
--- a/searx/engines/dictzone.py
+++ b/searx/engines/dictzone.py
@@ -10,15 +10,15 @@
"""
import re
+from urllib.parse import urljoin
from lxml import html
from searx.utils import is_valid_lang, eval_xpath
-from searx.url_utils import urljoin
categories = ['general']
-url = u'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}'
+url = 'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}'
weight = 100
-parser_re = re.compile(b'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I)
+parser_re = re.compile('.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I)
results_xpath = './/table[@id="r"]/tr'
@@ -37,7 +37,7 @@ def request(query, params):
params['url'] = url.format(from_lang=from_lang[2],
to_lang=to_lang[2],
- query=query.decode('utf-8'))
+ query=query)
return params
diff --git a/searx/engines/digbt.py b/searx/engines/digbt.py
index ff2f94593..e2c0389c6 100644
--- a/searx/engines/digbt.py
+++ b/searx/engines/digbt.py
@@ -10,14 +10,11 @@
@parse url, title, content, magnetlink
"""
-from sys import version_info
+from urllib.parse import urljoin
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size
-from searx.url_utils import urljoin
-if version_info[0] == 3:
- unicode = str
categories = ['videos', 'music', 'files']
paging = True
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index 073410eb0..24a932d53 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -14,8 +14,8 @@ import random
import string
from dateutil import parser
from json import loads
+from urllib.parse import urlencode
from lxml import html
-from searx.url_utils import urlencode
from datetime import datetime
# engine dependent config
diff --git a/searx/engines/doku.py b/searx/engines/doku.py
index d20e66026..513ffda89 100644
--- a/searx/engines/doku.py
+++ b/searx/engines/doku.py
@@ -9,10 +9,10 @@
# @stable yes
# @parse (general) url, title, content
+from urllib.parse import urlencode
from lxml.html import fromstring
from searx.engines.xpath import extract_text
from searx.utils import eval_xpath
-from searx.url_utils import urlencode
# engine dependent config
categories = ['general'] # TODO , 'images', 'music', 'videos', 'files'
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 0d2c0af2d..fb1ea2b2d 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -15,9 +15,9 @@
from lxml.html import fromstring
from json import loads
+from urllib.parse import urlencode
from searx.engines.xpath import extract_text
from searx.poolrequests import get
-from searx.url_utils import urlencode
from searx.utils import match_language, eval_xpath
# engine dependent config
@@ -50,6 +50,7 @@ result_xpath = '//div[@class="result results_links results_links_deep web-result
url_xpath = './/a[@class="result__a"]/@href'
title_xpath = './/a[@class="result__a"]'
content_xpath = './/a[@class="result__snippet"]'
+correction_xpath = '//div[@id="did_you_mean"]//a'
# match query's language to a region code that duckduckgo will accept
@@ -125,6 +126,11 @@ def response(resp):
'content': content,
'url': res_url})
+ # parse correction
+ for correction in eval_xpath(doc, correction_xpath):
+ # append correction
+ results.append({'correction': extract_text(correction)})
+
# return results
return results
diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py
index 79d10c303..73154a525 100644
--- a/searx/engines/duckduckgo_definitions.py
+++ b/searx/engines/duckduckgo_definitions.py
@@ -10,11 +10,11 @@ DuckDuckGo (definitions)
"""
import json
+from urllib.parse import urlencode
from lxml import html
from re import compile
from searx.engines.xpath import extract_text
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
-from searx.url_utils import urlencode
from searx.utils import html_to_text, match_language
url = 'https://api.duckduckgo.com/'\
diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py
index 89924b71c..38e141f8b 100644
--- a/searx/engines/duckduckgo_images.py
+++ b/searx/engines/duckduckgo_images.py
@@ -14,13 +14,13 @@
"""
from json import loads
+from urllib.parse import urlencode
from searx.engines.xpath import extract_text
from searx.engines.duckduckgo import (
_fetch_supported_languages, supported_languages_url,
get_region_code, language_aliases
)
from searx.poolrequests import get
-from searx.url_utils import urlencode
# engine dependent config
categories = ['images']
diff --git a/searx/engines/duden.py b/searx/engines/duden.py
index cf2f1a278..a711f422e 100644
--- a/searx/engines/duden.py
+++ b/searx/engines/duden.py
@@ -10,9 +10,9 @@
from lxml import html, etree
import re
+from urllib.parse import quote, urljoin
from searx.engines.xpath import extract_text
from searx.utils import eval_xpath
-from searx.url_utils import quote, urljoin
from searx import logger
categories = ['general']
diff --git a/searx/engines/etools.py b/searx/engines/etools.py
index a9eb0980d..efc102ef6 100644
--- a/searx/engines/etools.py
+++ b/searx/engines/etools.py
@@ -10,8 +10,8 @@
"""
from lxml import html
+from urllib.parse import quote
from searx.engines.xpath import extract_text
-from searx.url_utils import quote
from searx.utils import eval_xpath
categories = ['general']
diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py
index 4066dc716..a2a5114df 100644
--- a/searx/engines/fdroid.py
+++ b/searx/engines/fdroid.py
@@ -9,9 +9,9 @@
@parse url, title, content
"""
+from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode
# engine dependent config
categories = ['files']
diff --git a/searx/engines/filecrop.py b/searx/engines/filecrop.py
index ed57a6bf3..eef5be6e8 100644
--- a/searx/engines/filecrop.py
+++ b/searx/engines/filecrop.py
@@ -1,9 +1,6 @@
-from searx.url_utils import urlencode
+from html.parser import HTMLParser
+from urllib.parse import urlencode
-try:
- from HTMLParser import HTMLParser
-except:
- from html.parser import HTMLParser
url = 'http://www.filecrop.com/'
search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}' # noqa
diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py
index de1769370..b23c447b8 100644
--- a/searx/engines/flickr.py
+++ b/searx/engines/flickr.py
@@ -14,7 +14,7 @@
"""
from json import loads
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
categories = ['images']
diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py
index c8ee34f7a..4bcf837cb 100644
--- a/searx/engines/flickr_noapi.py
+++ b/searx/engines/flickr_noapi.py
@@ -15,8 +15,8 @@
from json import loads
from time import time
import re
+from urllib.parse import urlencode
from searx.engines import logger
-from searx.url_utils import urlencode
from searx.utils import ecma_unescape, html_to_text
logger = logger.getChild('flickr-noapi')
@@ -117,14 +117,10 @@ def response(resp):
'img_format': img_format,
'template': 'images.html'
}
- try:
- result['author'] = author
- result['title'] = title
- result['content'] = content
- except:
- result['author'] = ''
- result['title'] = ''
- result['content'] = ''
+ result['author'] = author.encode(errors='ignore').decode()
+ result['source'] = source.encode(errors='ignore').decode()
+ result['title'] = title.encode(errors='ignore').decode()
+ result['content'] = content.encode(errors='ignore').decode()
results.append(result)
return results
diff --git a/searx/engines/framalibre.py b/searx/engines/framalibre.py
index f3441fa5f..14b659b5f 100644
--- a/searx/engines/framalibre.py
+++ b/searx/engines/framalibre.py
@@ -10,13 +10,10 @@
@parse url, title, content, thumbnail, img_src
"""
-try:
- from cgi import escape
-except:
- from html import escape
+from html import escape
+from urllib.parse import urljoin, urlencode
from lxml import html
from searx.engines.xpath import extract_text
-from searx.url_utils import urljoin, urlencode
# engine dependent config
categories = ['it']
diff --git a/searx/engines/frinkiac.py b/searx/engines/frinkiac.py
index a67b42dbe..5b174a687 100644
--- a/searx/engines/frinkiac.py
+++ b/searx/engines/frinkiac.py
@@ -10,7 +10,7 @@ Frinkiac (Images)
"""
from json import loads
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
categories = ['images']
diff --git a/searx/engines/genius.py b/searx/engines/genius.py
index aa5afad9b..feb7d79d1 100644
--- a/searx/engines/genius.py
+++ b/searx/engines/genius.py
@@ -11,7 +11,7 @@ Genius
"""
from json import loads
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
from datetime import datetime
# engine dependent config
diff --git a/searx/engines/gentoo.py b/searx/engines/gentoo.py
index a7a966cc9..b6bc99fab 100644
--- a/searx/engines/gentoo.py
+++ b/searx/engines/gentoo.py
@@ -11,9 +11,9 @@
@parse url, title
"""
+from urllib.parse import urlencode, urljoin
from lxml import html
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode, urljoin
# engine dependent config
categories = ['it']
@@ -90,7 +90,7 @@ def request(query, params):
# if our language is hosted on the main site, we need to add its name
# to the query in order to narrow the results to that language
if language in main_langs:
- query += b' (' + (main_langs[language]).encode('utf-8') + b')'
+ query += ' (' + main_langs[language] + ')'
# prepare the request parameters
query = urlencode({'search': query})
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 2bb29a9fe..1d71b18e9 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Gigablast (Web)
@@ -9,121 +10,117 @@
@stable yes
@parse url, title, content
"""
+# pylint: disable=missing-function-docstring, invalid-name
-import random
+import re
from json import loads
-from time import time
-from lxml.html import fromstring
+from urllib.parse import urlencode
+# from searx import logger
from searx.poolrequests import get
-from searx.url_utils import urlencode
-from searx.utils import eval_xpath
# engine dependent config
categories = ['general']
-paging = True
-number_of_results = 10
+# gigablast's pagination is totally damaged, don't use it
+paging = False
language_support = True
safesearch = True
# search-url
-base_url = 'https://gigablast.com/'
-search_string = 'search?{query}'\
- '&n={number_of_results}'\
- '&c=main'\
- '&s={offset}'\
- '&format=json'\
- '&langcountry={lang}'\
- '&ff={safesearch}'\
- '&rand={rxikd}'
-# specific xpath variables
-results_xpath = '//response//result'
-url_xpath = './/url'
-title_xpath = './/title'
-content_xpath = './/sum'
-
-supported_languages_url = 'https://gigablast.com/search?&rxikd=1'
-
-extra_param = '' # gigablast requires a random extra parameter
-# which can be extracted from the source code of the search page
+base_url = 'https://gigablast.com'
+# ugly hack: gigablast requires a random extra parameter which can be extracted
+# from the source code of the gigablast HTTP client
+extra_param = ''
+extra_param_path='/search?c=main&qlangcountry=en-us&q=south&s=10'
def parse_extra_param(text):
- global extra_param
- param_lines = [x for x in text.splitlines() if x.startswith('var url=') or x.startswith('url=url+')]
- extra_param = ''
- for l in param_lines:
- extra_param += l.split("'")[1]
- extra_param = extra_param.split('&')[-1]
-
-def init(engine_settings=None):
- parse_extra_param(get('http://gigablast.com/search?c=main&qlangcountry=en-us&q=south&s=10').text)
+ # example:
+ #
+ # var uxrl='/search?c=main&qlangcountry=en-us&q=south&s=10&rand=1590740241635&n';
+ # uxrl=uxrl+'sab=730863287';
+ #
+ # extra_param --> "rand=1590740241635&nsab=730863287"
+
+ global extra_param # pylint: disable=global-statement
+ re_var= None
+ for line in text.splitlines():
+ if re_var is None and extra_param_path in line:
+ var = line.split("=")[0].split()[1] # e.g. var --> 'uxrl'
+ re_var = re.compile(var + "\\s*=\\s*" + var + "\\s*\\+\\s*'" + "(.*)" + "'(.*)")
+ extra_param = line.split("'")[1][len(extra_param_path):]
+ continue
+ if re_var is not None and re_var.search(line):
+ extra_param += re_var.search(line).group(1)
+ break
+ # logger.debug('gigablast extra_param="%s"', extra_param)
+
+def init(engine_settings=None): # pylint: disable=unused-argument
+ parse_extra_param(get(base_url + extra_param_path).text)
# do search-request
-def request(query, params):
- print("EXTRAPARAM:", extra_param)
- offset = (params['pageno'] - 1) * number_of_results
+def request(query, params): # pylint: disable=unused-argument
- if params['language'] == 'all':
- language = 'xx'
- else:
- language = params['language'].replace('-', '_').lower()
- if language.split('-')[0] != 'zh':
- language = language.split('-')[0]
+ # see API http://www.gigablast.com/api.html#/search
+ # Take into account, that the API has some quirks ..
- if params['safesearch'] >= 1:
- safesearch = 1
- else:
- safesearch = 0
+ query_args = dict(
+ c = 'main'
+ , format = 'json'
+ , q = query
+ , dr = 1
+ , showgoodimages = 0
+ )
- # rxieu is some kind of hash from the search query, but accepts random atm
- search_path = search_string.format(query=urlencode({'q': query}),
- offset=offset,
- number_of_results=number_of_results,
- lang=language,
- rxikd=int(time() * 1000),
- safesearch=safesearch)
+ if params['language'] and params['language'] != 'all':
+ query_args['qlangcountry'] = params['language']
+ query_args['qlang'] = params['language'].split('-')[0]
- params['url'] = base_url + search_path + '&' + extra_param
+ if params['safesearch'] >= 1:
+ query_args['ff'] = 1
- return params
+ search_url = '/search?' + urlencode(query_args)
+ params['url'] = base_url + search_url + extra_param
+ return params
# get response from search-request
def response(resp):
results = []
- # parse results
- try:
- response_json = loads(resp.text)
- except:
- parse_extra_param(resp.text)
- raise Exception('extra param expired, please reload')
+ response_json = loads(resp.text)
+
+ # logger.debug('gigablast returns %s results', len(response_json['results']))
for result in response_json['results']:
- # append result
- results.append({'url': result['url'],
- 'title': result['title'],
- 'content': result['sum']})
+ # see "Example JSON Output (&format=json)"
+ # at http://www.gigablast.com/api.html#/search
- # return results
- return results
+ # sort out meaningless result
+ title = result.get('title')
+ if len(title) < 2:
+ continue
-# get supported languages from their site
-def _fetch_supported_languages(resp):
- supported_languages = []
- dom = fromstring(resp.text)
- links = eval_xpath(dom, '//span[@id="menu2"]/a')
- for link in links:
- href = eval_xpath(link, './@href')[0].split('lang%3A')
- if len(href) == 2:
- code = href[1].split('_')
- if len(code) == 2:
- code = code[0] + '-' + code[1].upper()
- else:
- code = code[0]
- supported_languages.append(code)
-
- return supported_languages
+ url = result.get('url')
+ if len(url) < 9:
+ continue
+
+ content = result.get('sum')
+ if len(content) < 5:
+ continue
+
+ # extend fields
+
+ subtitle = result.get('title')
+ if len(subtitle) > 3 and subtitle != title:
+ title += " - " + subtitle
+
+ results.append(dict(
+ url = url
+ , title = title
+ , content = content
+ ))
+
+ return results
diff --git a/searx/engines/github.py b/searx/engines/github.py
index eaa00da4f..80b50ceda 100644
--- a/searx/engines/github.py
+++ b/searx/engines/github.py
@@ -11,7 +11,7 @@
"""
from json import loads
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
# engine dependent config
categories = ['it']
diff --git a/searx/engines/google.py b/searx/engines/google.py
index eed3a044e..dfc8a0ab8 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -1,210 +1,211 @@
-# Google (Web)
-#
-# @website https://www.google.com
-# @provide-api yes (https://developers.google.com/custom-search/)
-#
-# @using-api no
-# @results HTML
-# @stable no (HTML can change)
-# @parse url, title, content, suggestion
-
-import re
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Google (Web)
+
+:website: https://www.google.com
+:provide-api: yes (https://developers.google.com/custom-search/)
+:using-api: not the official, since it needs registration to another service
+:results: HTML
+:stable: no
+:parse: url, title, content, number_of_results, answer, suggestion, correction
+
+For detailed description of the *REST-full* API see: `Query Parameter
+Definitions`_.
+
+.. _Query Parameter Definitions:
+ https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+
+"""
+
+# pylint: disable=invalid-name, missing-function-docstring
+
+from urllib.parse import urlencode, urlparse
+from lxml import html
from flask_babel import gettext
-from lxml import html, etree
-from searx.engines.xpath import extract_text, extract_url
+from searx.engines.xpath import extract_text
from searx import logger
-from searx.url_utils import urlencode, urlparse, parse_qsl
from searx.utils import match_language, eval_xpath
logger = logger.getChild('google engine')
-
# engine dependent config
categories = ['general']
paging = True
language_support = True
-use_locale_domain = True
time_range_support = True
+safesearch = True
+supported_languages_url = 'https://www.google.com/preferences?#languages'
# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
-default_hostname = 'www.google.com'
-
-country_to_hostname = {
- 'BG': 'www.google.bg', # Bulgaria
- 'CZ': 'www.google.cz', # Czech Republic
- 'DE': 'www.google.de', # Germany
- 'DK': 'www.google.dk', # Denmark
- 'AT': 'www.google.at', # Austria
- 'CH': 'www.google.ch', # Switzerland
- 'GR': 'www.google.gr', # Greece
- 'AU': 'www.google.com.au', # Australia
- 'CA': 'www.google.ca', # Canada
- 'GB': 'www.google.co.uk', # United Kingdom
- 'ID': 'www.google.co.id', # Indonesia
- 'IE': 'www.google.ie', # Ireland
- 'IN': 'www.google.co.in', # India
- 'MY': 'www.google.com.my', # Malaysia
- 'NZ': 'www.google.co.nz', # New Zealand
- 'PH': 'www.google.com.ph', # Philippines
- 'SG': 'www.google.com.sg', # Singapore
- # 'US': 'www.google.us', # United States, redirect to .com
- 'ZA': 'www.google.co.za', # South Africa
- 'AR': 'www.google.com.ar', # Argentina
- 'CL': 'www.google.cl', # Chile
- 'ES': 'www.google.es', # Spain
- 'MX': 'www.google.com.mx', # Mexico
- 'EE': 'www.google.ee', # Estonia
- 'FI': 'www.google.fi', # Finland
- 'BE': 'www.google.be', # Belgium
- 'FR': 'www.google.fr', # France
- 'IL': 'www.google.co.il', # Israel
- 'HR': 'www.google.hr', # Croatia
- 'HU': 'www.google.hu', # Hungary
- 'IT': 'www.google.it', # Italy
- 'JP': 'www.google.co.jp', # Japan
- 'KR': 'www.google.co.kr', # South Korea
- 'LT': 'www.google.lt', # Lithuania
- 'LV': 'www.google.lv', # Latvia
- 'NO': 'www.google.no', # Norway
- 'NL': 'www.google.nl', # Netherlands
- 'PL': 'www.google.pl', # Poland
- 'BR': 'www.google.com.br', # Brazil
- 'PT': 'www.google.pt', # Portugal
- 'RO': 'www.google.ro', # Romania
- 'RU': 'www.google.ru', # Russia
- 'SK': 'www.google.sk', # Slovakia
- 'SI': 'www.google.si', # Slovenia
- 'SE': 'www.google.se', # Sweden
- 'TH': 'www.google.co.th', # Thailand
- 'TR': 'www.google.com.tr', # Turkey
- 'UA': 'www.google.com.ua', # Ukraine
- # 'CN': 'www.google.cn', # China, only from China ?
- 'HK': 'www.google.com.hk', # Hong Kong
- 'TW': 'www.google.com.tw' # Taiwan
+google_domains = {
+ 'BG': 'google.bg', # Bulgaria
+ 'CZ': 'google.cz', # Czech Republic
+ 'DE': 'google.de', # Germany
+ 'DK': 'google.dk', # Denmark
+ 'AT': 'google.at', # Austria
+ 'CH': 'google.ch', # Switzerland
+ 'GR': 'google.gr', # Greece
+ 'AU': 'google.com.au', # Australia
+ 'CA': 'google.ca', # Canada
+ 'GB': 'google.co.uk', # United Kingdom
+ 'ID': 'google.co.id', # Indonesia
+ 'IE': 'google.ie', # Ireland
+ 'IN': 'google.co.in', # India
+ 'MY': 'google.com.my', # Malaysia
+ 'NZ': 'google.co.nz', # New Zealand
+ 'PH': 'google.com.ph', # Philippines
+ 'SG': 'google.com.sg', # Singapore
+ # 'US': 'google.us', # United States, redirect to .com
+ 'ZA': 'google.co.za', # South Africa
+ 'AR': 'google.com.ar', # Argentina
+ 'CL': 'google.cl', # Chile
+ 'ES': 'google.es', # Spain
+ 'MX': 'google.com.mx', # Mexico
+ 'EE': 'google.ee', # Estonia
+ 'FI': 'google.fi', # Finland
+ 'BE': 'google.be', # Belgium
+ 'FR': 'google.fr', # France
+ 'IL': 'google.co.il', # Israel
+ 'HR': 'google.hr', # Croatia
+ 'HU': 'google.hu', # Hungary
+ 'IT': 'google.it', # Italy
+ 'JP': 'google.co.jp', # Japan
+ 'KR': 'google.co.kr', # South Korea
+ 'LT': 'google.lt', # Lithuania
+ 'LV': 'google.lv', # Latvia
+ 'NO': 'google.no', # Norway
+ 'NL': 'google.nl', # Netherlands
+ 'PL': 'google.pl', # Poland
+ 'BR': 'google.com.br', # Brazil
+ 'PT': 'google.pt', # Portugal
+ 'RO': 'google.ro', # Romania
+ 'RU': 'google.ru', # Russia
+ 'SK': 'google.sk', # Slovakia
+ 'SI': 'google.si', # Slovenia
+ 'SE': 'google.se', # Sweden
+ 'TH': 'google.co.th', # Thailand
+ 'TR': 'google.com.tr', # Turkey
+ 'UA': 'google.com.ua', # Ukraine
+ # 'CN': 'google.cn', # China, only from China ?
+ 'HK': 'google.com.hk', # Hong Kong
+ 'TW': 'google.com.tw' # Taiwan
}
-# osm
-url_map = 'https://www.openstreetmap.org/'\
- + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
-
-# search-url
-search_path = '/search'
-search_url = ('https://{hostname}' +
- search_path +
- '?{query}&start={offset}&gws_rd=cr&gbv=1&lr={lang}&hl={lang_short}&ei=x')
-
-time_range_search = "&tbs=qdr:{range}"
-time_range_dict = {'day': 'd',
- 'week': 'w',
- 'month': 'm',
- 'year': 'y'}
-
-# other URLs
-map_hostname_start = 'maps.google.'
-maps_path = '/maps'
-redirect_path = '/url'
-images_path = '/images'
-supported_languages_url = 'https://www.google.com/preferences?#languages'
+time_range_dict = {
+ 'day': 'd',
+ 'week': 'w',
+ 'month': 'm',
+ 'year': 'y'
+}
+
+# Filter results. 0: None, 1: Moderate, 2: Strict
+filter_mapping = {
+ 0: 'off',
+ 1: 'medium',
+ 2: 'high'
+}
# specific xpath variables
-results_xpath = '//div[contains(@class, "ZINbbc")]'
-url_xpath = './/div[@class="kCrYT"][1]/a/@href'
-title_xpath = './/div[@class="kCrYT"][1]/a/div[1]'
-content_xpath = './/div[@class="kCrYT"][2]//div[contains(@class, "BNeawe")]//div[contains(@class, "BNeawe")]'
-suggestion_xpath = '//div[contains(@class, "ZINbbc")][last()]//div[@class="rVLSBd"]/a//div[contains(@class, "BNeawe")]'
-spelling_suggestion_xpath = '//div[@id="scc"]//a'
-
-# map : detail location
-map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()'
-map_phone_xpath = './/div[@class="s"]//table//td[2]/span/span'
-map_website_url_xpath = 'h3[2]/a/@href'
-map_website_title_xpath = 'h3[2]'
-
-# map : near the location
-map_near = 'table[@class="ts"]//tr'
-map_near_title = './/h4'
-map_near_url = './/h4/a/@href'
-map_near_phone = './/span[@class="nobr"]'
-
-# images
-images_xpath = './/div/a'
-image_url_xpath = './@href'
-image_img_src_xpath = './img/@src'
-
-# property names
-# FIXME : no translation
-property_address = "Address"
-property_phone = "Phone number"
-
-
-# remove google-specific tracking-url
-def parse_url(url_string, google_hostname):
- # sanity check
- if url_string is None:
- return url_string
-
- # normal case
- parsed_url = urlparse(url_string)
- if (parsed_url.netloc in [google_hostname, '']
- and parsed_url.path == redirect_path):
- query = dict(parse_qsl(parsed_url.query))
- return query['q']
- else:
- return url_string
+# ------------------------
+
+# google results are grouped into <div class="g" ../>
+results_xpath = '//div[@class="g"]'
+
+# google *sections* are not usual *results*, we ignore them
+g_section_with_header = './g-section-with-header'
+
+# the title is a h3 tag relative to the result group
+title_xpath = './/h3[1]'
+
+# in the result group there is <div class="r" ../> its first child is a <a
+# href=...> (on some results, the <a> is the first "descendant", not "child")
+href_xpath = './/div[@class="r"]//a/@href'
+
+# in the result group there is <div class="s" ../> containing the *content*
+content_xpath = './/div[@class="s"]'
+
+# Suggestions are links placed in a *card-section*, we extract only the text
+# from the links not the links itself.
+suggestion_xpath = '//div[contains(@class, "card-section")]//a'
+
+# Since google does *auto-correction* on the first query these are not really
+# *spelling suggestions*, we use them anyway.
+spelling_suggestion_xpath = '//div[@class="med"]/p/a'
-# returns extract_text on the first result selected by the xpath or None
def extract_text_from_dom(result, xpath):
+ """returns extract_text on the first result selected by the xpath or None"""
r = eval_xpath(result, xpath)
if len(r) > 0:
return extract_text(r[0])
return None
-# do search-request
-def request(query, params):
- offset = (params['pageno'] - 1) * 10
-
- if params['language'] == 'all' or params['language'] == 'en-US':
- language = 'en-GB'
- else:
- language = match_language(params['language'], supported_languages, language_aliases)
+def get_lang_country(params, lang_list, custom_aliases):
+ """Returns a tuple with *language* on its first, *country* on its second
+ and the combined *lang_country* on its third position."""
+ language = params['language']
+ if language == 'all':
+ language = 'en-US'
language_array = language.split('-')
- if params['language'].find('-') > 0:
- country = params['language'].split('-')[1]
- elif len(language_array) == 2:
+
+ if len(language_array) == 2:
country = language_array[1]
else:
- country = 'US'
+ country = language_array[0].upper()
- url_lang = 'lang_' + language
+ language = match_language(language, lang_list, custom_aliases)
+ lang_country = '%s-%s' % (language, country)
+ if lang_country == 'en-EN':
+ lang_country = 'en'
- if use_locale_domain:
- google_hostname = country_to_hostname.get(country.upper(), default_hostname)
- else:
- google_hostname = default_hostname
-
- # original format: ID=3e2b6616cee08557:TM=5556667580:C=r:IP=4.1.12.5-:S=23ASdf0soFgF2d34dfgf-_22JJOmHdfgg
- params['cookies']['GOOGLE_ABUSE_EXEMPTION'] = 'x'
- params['url'] = search_url.format(offset=offset,
- query=urlencode({'q': query}),
- hostname=google_hostname,
- lang=url_lang,
- lang_short=language)
- if params['time_range'] in time_range_dict:
- params['url'] += time_range_search.format(range=time_range_dict[params['time_range']])
+ return language, country, lang_country
- params['headers']['Accept-Language'] = language + ',' + language + '-' + country
- params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
- params['google_hostname'] = google_hostname
+def request(query, params):
+ """Google search request"""
+
+ offset = (params['pageno'] - 1) * 10
+ language, country, lang_country = get_lang_country(
+ # pylint: disable=undefined-variable
+ params, supported_languages, language_aliases
+ )
+ subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
+
+ # https://www.google.de/search?q=corona&hl=de-DE&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
+ query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+ 'q': query,
+ 'hl': lang_country,
+ 'lr': "lang_" + language,
+ 'ie': "utf8",
+ 'oe': "utf8",
+ 'start': offset,
+ })
+
+ if params['time_range'] in time_range_dict:
+ query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
+ if params['safesearch']:
+ query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
+
+ params['url'] = query_url
+ logger.debug("query_url --> %s", query_url)
+
+ # en-US,en;q=0.8,en;q=0.5
+ params['headers']['Accept-Language'] = (
+ lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
+ )
+ logger.debug("HTTP header Accept-Language --> %s",
+ params['headers']['Accept-Language'])
+ params['headers']['Accept'] = (
+ 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+ )
+ # params['google_subdomain'] = subdomain
return params
-# get response from search-request
def response(resp):
+ """Get response from google's search request"""
results = []
# detect google sorry
@@ -215,68 +216,53 @@ def response(resp):
if resp_url.path.startswith('/sorry'):
raise RuntimeWarning(gettext('CAPTCHA required'))
- # which hostname ?
- google_hostname = resp.search_params.get('google_hostname')
- google_url = "https://" + google_hostname
+ # which subdomain ?
+ # subdomain = resp.search_params.get('google_subdomain')
# convert the text to dom
dom = html.fromstring(resp.text)
- instant_answer = eval_xpath(dom, '//div[@id="_vBb"]//text()')
- if instant_answer:
- results.append({'answer': u' '.join(instant_answer)})
+ # results --> answer
+ answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
+ if answer:
+ results.append({'answer': ' '.join(answer)})
+ else:
+ logger.debug("did not found 'answer'")
+
+ # results --> number_of_results
try:
- results_num = int(eval_xpath(dom, '//div[@id="resultStats"]//text()')[0]
- .split()[1].replace(',', ''))
- results.append({'number_of_results': results_num})
- except:
- pass
+ _txt = eval_xpath(dom, '//div[@id="result-stats"]//text()')[0]
+ _digit = ''.join([n for n in _txt if n.isdigit()])
+ number_of_results = int(_digit)
+ results.append({'number_of_results': number_of_results})
+
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug("did not 'number_of_results'")
+ logger.error(e, exc_info=True)
# parse results
for result in eval_xpath(dom, results_xpath):
+
+ # google *sections*
+ if extract_text(eval_xpath(result, g_section_with_header)):
+ logger.debug("ingoring <g-section-with-header>")
+ continue
+
try:
title = extract_text(eval_xpath(result, title_xpath)[0])
- url = parse_url(extract_url(eval_xpath(result, url_xpath), google_url), google_hostname)
- parsed_url = urlparse(url, google_hostname)
-
- # map result
- if parsed_url.netloc == google_hostname:
- # TODO fix inside links
- continue
- # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
- # print "yooooo"*30
- # x = eval_xpath(result, map_near)
- # if len(x) > 0:
- # # map : near the location
- # results = results + parse_map_near(parsed_url, x, google_hostname)
- # else:
- # # map : detail about a location
- # results = results + parse_map_detail(parsed_url, result, google_hostname)
- # # google news
- # elif parsed_url.path == search_path:
- # # skipping news results
- # pass
-
- # # images result
- # elif parsed_url.path == images_path:
- # # only thumbnail image provided,
- # # so skipping image results
- # # results = results + parse_images(result, google_hostname)
- # pass
-
- else:
- # normal result
- content = extract_text_from_dom(result, content_xpath)
- if content is None:
- continue
-
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content
- })
- except:
- logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
+ url = eval_xpath(result, href_xpath)[0]
+ content = extract_text_from_dom(result, content_xpath)
+ results.append({
+ 'url': url,
+ 'title': title,
+ 'content': content
+ })
+ except Exception as e: # pylint: disable=broad-except
+ logger.error(e, exc_info=True)
+ # from lxml import etree
+ # logger.debug(etree.tostring(result, pretty_print=True))
+ # import pdb
+ # pdb.set_trace()
continue
# parse suggestion
@@ -291,101 +277,16 @@ def response(resp):
return results
-def parse_images(result, google_hostname):
- results = []
- for image in eval_xpath(result, images_xpath):
- url = parse_url(extract_text(eval_xpath(image, image_url_xpath)[0]), google_hostname)
- img_src = extract_text(eval_xpath(image, image_img_src_xpath)[0])
-
- # append result
- results.append({'url': url,
- 'title': '',
- 'content': '',
- 'img_src': img_src,
- 'template': 'images.html'
- })
-
- return results
-
-
-def parse_map_near(parsed_url, x, google_hostname):
- results = []
-
- for result in x:
- title = extract_text_from_dom(result, map_near_title)
- url = parse_url(extract_text_from_dom(result, map_near_url), google_hostname)
- attributes = []
- phone = extract_text_from_dom(result, map_near_phone)
- add_attributes(attributes, property_phone, phone, 'tel:' + phone)
- results.append({'title': title,
- 'url': url,
- 'content': attributes_to_html(attributes)
- })
-
- return results
-
-
-def parse_map_detail(parsed_url, result, google_hostname):
- results = []
-
- # try to parse the geoloc
- m = re.search(r'@([0-9\.]+),([0-9\.]+),([0-9]+)', parsed_url.path)
- if m is None:
- m = re.search(r'll\=([0-9\.]+),([0-9\.]+)\&z\=([0-9]+)', parsed_url.query)
-
- if m is not None:
- # geoloc found (ignored)
- lon = float(m.group(2)) # noqa
- lat = float(m.group(1)) # noqa
- zoom = int(m.group(3)) # noqa
-
- # attributes
- attributes = []
- address = extract_text_from_dom(result, map_address_xpath)
- phone = extract_text_from_dom(result, map_phone_xpath)
- add_attributes(attributes, property_address, address, 'geo:' + str(lat) + ',' + str(lon))
- add_attributes(attributes, property_phone, phone, 'tel:' + phone)
-
- # title / content / url
- website_title = extract_text_from_dom(result, map_website_title_xpath)
- content = extract_text_from_dom(result, content_xpath)
- website_url = parse_url(extract_text_from_dom(result, map_website_url_xpath), google_hostname)
-
- # add a result if there is a website
- if website_url is not None:
- results.append({'title': website_title,
- 'content': (content + '<br />' if content is not None else '')
- + attributes_to_html(attributes),
- 'url': website_url
- })
-
- return results
-
-
-def add_attributes(attributes, name, value, url):
- if value is not None and len(value) > 0:
- attributes.append({'label': name, 'value': value, 'url': url})
-
-
-def attributes_to_html(attributes):
- retval = '<table class="table table-striped">'
- for a in attributes:
- value = a.get('value')
- if 'url' in a:
- value = '<a href="' + a.get('url') + '">' + value + '</a>'
- retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>'
- retval = retval + '</table>'
- return retval
-
-
# get supported languages from their site
def _fetch_supported_languages(resp):
- supported_languages = {}
+ ret_val = {}
dom = html.fromstring(resp.text)
- options = eval_xpath(dom, '//*[@id="langSec"]//input[@name="lr"]')
- for option in options:
- code = eval_xpath(option, './@value')[0].split('_')[-1]
- name = eval_xpath(option, './@data-name')[0].title()
- supported_languages[code] = {"name": name}
- return supported_languages
+ radio_buttons = eval_xpath(dom, '//*[@id="langSec"]//input[@name="lang"]')
+
+ for x in radio_buttons:
+ name = x.get("data-name")
+ code = x.get("value")
+ ret_val[code] = {"name": name}
+
+ return ret_val
diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py
index 636913114..9dd5fad2c 100644
--- a/searx/engines/google_images.py
+++ b/searx/engines/google_images.py
@@ -1,97 +1,225 @@
-"""
- Google (Images)
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Google (Images)
+
+:website: https://images.google.com (redirected to subdomain www.)
+:provide-api: yes (https://developers.google.com/custom-search/)
+:using-api: not the official, since it needs registration to another service
+:results: HTML
+:stable: no
+:template: images.html
+:parse: url, title, content, source, thumbnail_src, img_src
+
+For detailed description of the *REST-full* API see: `Query Parameter
+Definitions`_.
+
+.. admonition:: Content-Security-Policy (CSP)
- @website https://www.google.com
- @provide-api yes (https://developers.google.com/custom-search/)
+ This engine needs to allow images from the `data URLs`_ (prefixed with the
+ ``data:`` scheme).::
+
+ Header set Content-Security-Policy "img-src 'self' data: ;"
+
+.. _Query Parameter Definitions:
+ https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
- @using-api no
- @results HTML chunks with JSON inside
- @stable no
- @parse url, title, img_src
"""
-from datetime import date, timedelta
-from json import loads
+from urllib.parse import urlencode, urlparse, unquote
from lxml import html
-from searx.url_utils import urlencode
+from flask_babel import gettext
+from searx import logger
+from searx.utils import eval_xpath
+from searx.engines.xpath import extract_text
+
+# pylint: disable=unused-import
+from searx.engines.google import (
+ supported_languages_url,
+ _fetch_supported_languages,
+)
+# pylint: enable=unused-import
+
+from searx.engines.google import (
+ get_lang_country,
+ google_domains,
+ time_range_dict,
+)
+
+logger = logger.getChild('google images')
# engine dependent config
+
categories = ['images']
-paging = True
-safesearch = True
+paging = False
+language_support = True
+use_locale_domain = True
time_range_support = True
-number_of_results = 100
+safesearch = True
-search_url = 'https://www.google.com/search'\
- '?{query}'\
- '&tbm=isch'\
- '&yv=2'\
- '&{search_options}'
-time_range_attr = "qdr:{range}"
-time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}"
-time_range_dict = {'day': 'd',
- 'week': 'w',
- 'month': 'm'}
+filter_mapping = {
+ 0: 'images',
+ 1: 'active',
+ 2: 'active'
+}
+
+
+def scrap_out_thumbs(dom):
+ """Scrap out thumbnail data from <script> tags.
+ """
+ ret_val = dict()
+ for script in eval_xpath(dom, '//script[contains(., "_setImgSrc(")]'):
+ _script = script.text
+ # _setImgSrc('0','data:image\/jpeg;base64,\/9j\/4AAQSkZJR ....');
+ _thumb_no, _img_data = _script[len("_setImgSrc("):-2].split(",", 1)
+ _thumb_no = _thumb_no.replace("'", "")
+ _img_data = _img_data.replace("'", "")
+ _img_data = _img_data.replace(r"\/", r"/")
+ ret_val[_thumb_no] = _img_data.replace(r"\x3d", "=")
+ return ret_val
+
+
+def scrap_img_by_id(script, data_id):
+ """Get full image URL by data-id in parent element
+ """
+ img_url = ''
+ _script = script.split('\n')
+ for i, line in enumerate(_script):
+ if 'gstatic.com/images' in line and data_id in line:
+ url_line = _script[i + 1]
+ img_url = url_line.split('"')[1]
+ img_url = unquote(img_url.replace(r'\u00', r'%'))
+ return img_url
-# do search-request
def request(query, params):
- search_options = {
- 'ijn': params['pageno'] - 1,
- 'start': (params['pageno'] - 1) * number_of_results
- }
+ """Google-Images search request"""
+
+ language, country, lang_country = get_lang_country(
+ # pylint: disable=undefined-variable
+ params, supported_languages, language_aliases
+ )
+ subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
+
+ query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+ 'q': query,
+ 'tbm': "isch",
+ 'hl': lang_country,
+ 'lr': "lang_" + language,
+ 'ie': "utf8",
+ 'oe': "utf8",
+ 'num': 30,
+ })
if params['time_range'] in time_range_dict:
- search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
- elif params['time_range'] == 'year':
- now = date.today()
- then = now - timedelta(days=365)
- start = then.strftime('%m/%d/%Y')
- end = now.strftime('%m/%d/%Y')
- search_options['tbs'] = time_range_custom_attr.format(start=start, end=end)
-
- if safesearch and params['safesearch']:
- search_options['safe'] = 'on'
-
- params['url'] = search_url.format(query=urlencode({'q': query}),
- search_options=urlencode(search_options))
-
+ query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
+ if params['safesearch']:
+ query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
+
+ params['url'] = query_url
+ logger.debug("query_url --> %s", query_url)
+
+ params['headers']['Accept-Language'] = (
+ "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
+ logger.debug(
+ "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+ params['headers']['Accept'] = (
+ 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+ )
+ # params['google_subdomain'] = subdomain
return params
-# get response from search-request
def response(resp):
+ """Get response from google's search request"""
results = []
+ # detect google sorry
+ resp_url = urlparse(resp.url)
+ if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
+ raise RuntimeWarning('sorry.google.com')
+
+ if resp_url.path.startswith('/sorry'):
+ raise RuntimeWarning(gettext('CAPTCHA required'))
+
+ # which subdomain ?
+ # subdomain = resp.search_params.get('google_subdomain')
+
+ # convert the text to dom
dom = html.fromstring(resp.text)
+ img_bas64_map = scrap_out_thumbs(dom)
+ img_src_script = eval_xpath(dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text
# parse results
- for result in dom.xpath('//div[contains(@class, "rg_meta")]/text()'):
+ #
+ # root element::
+ # <div id="islmp" ..>
+ # result div per image::
+ # <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
+ # The data-id matches an item in a json-data structure in::
+ # <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
+ # In this structure the link to the origin PNG, JPG or whatever is given
+ # first link per image-div contains a <img> with the data-iid for base64 encoded image data::
+ # <img class="rg_i Q4LuWd" data-iid="0"
+ # second link per image-div is the target link::
+ # <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
+ # the second link also contains two div tags with the *description* and *publisher*::
+ # <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
+ # <div class="fxgdke">en.wikipedia.org</div>
+
+ root = eval_xpath(dom, '//div[@id="islmp"]')
+ if not root:
+ logger.error("did not find root element id='islmp'")
+ return results
+
+ root = root[0]
+ for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):
try:
- metadata = loads(result)
-
- img_format = metadata.get('ity', '')
- img_width = metadata.get('ow', '')
- img_height = metadata.get('oh', '')
- if img_width and img_height:
- img_format += " {0}x{1}".format(img_width, img_height)
-
- source = metadata.get('st', '')
- source_url = metadata.get('isu', '')
- if source_url:
- source += " ({0})".format(source_url)
-
- results.append({'url': metadata['ru'],
- 'title': metadata['pt'],
- 'content': metadata.get('s', ''),
- 'source': source,
- 'img_format': img_format,
- 'thumbnail_src': metadata['tu'],
- 'img_src': metadata['ou'],
- 'template': 'images.html'})
-
- except:
+ img_alt = eval_xpath(img_node, '@alt')[0]
+
+ img_base64_id = eval_xpath(img_node, '@data-iid')
+ if img_base64_id:
+ img_base64_id = img_base64_id[0]
+ thumbnail_src = img_bas64_map[img_base64_id]
+ else:
+ thumbnail_src = eval_xpath(img_node, '@src')
+ if not thumbnail_src:
+ thumbnail_src = eval_xpath(img_node, '@data-src')
+ if thumbnail_src:
+ thumbnail_src = thumbnail_src[0]
+ else:
+ thumbnail_src = ''
+
+ link_node = eval_xpath(img_node, '../../../a[2]')[0]
+ url = eval_xpath(link_node, '@href')[0]
+
+ pub_nodes = eval_xpath(link_node, './div/div')
+ pub_descr = img_alt
+ pub_source = ''
+ if pub_nodes:
+ pub_descr = extract_text(pub_nodes[0])
+ pub_source = extract_text(pub_nodes[1])
+
+ img_src_id = eval_xpath(img_node, '../../../@data-id')[0]
+ src_url = scrap_img_by_id(img_src_script, img_src_id)
+ if not src_url:
+ src_url = thumbnail_src
+
+ results.append({
+ 'url': url,
+ 'title': img_alt,
+ 'content': pub_descr,
+ 'source': pub_source,
+ 'img_src': src_url,
+ # 'img_format': img_format,
+ 'thumbnail_src': thumbnail_src,
+ 'template': 'images.html'
+ })
+ except Exception as e: # pylint: disable=broad-except
+ logger.error(e, exc_info=True)
+ # from lxml import etree
+ # logger.debug(etree.tostring(img_node, pretty_print=True))
+ # import pdb
+ # pdb.set_trace()
continue
return results
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index c9cc75435..08875328c 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -10,9 +10,9 @@
@parse url, title, content, publishedDate
"""
+from urllib.parse import urlencode
from lxml import html
from searx.engines.google import _fetch_supported_languages, supported_languages_url
-from searx.url_utils import urlencode
from searx.utils import match_language
# search-url
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index fd6b2e3be..08af55902 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -12,9 +12,9 @@
from datetime import date, timedelta
from json import loads
+from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode
import re
# engine dependent config
diff --git a/searx/engines/ina.py b/searx/engines/ina.py
index ea509649f..cce580273 100644
--- a/searx/engines/ina.py
+++ b/searx/engines/ina.py
@@ -12,15 +12,12 @@
# @todo embedded (needs some md5 from video page)
from json import loads
+from urllib.parse import urlencode
from lxml import html
from dateutil import parser
+from html.parser import HTMLParser
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode
-try:
- from HTMLParser import HTMLParser
-except:
- from html.parser import HTMLParser
# engine dependent config
categories = ['videos']
diff --git a/searx/engines/invidious.py b/searx/engines/invidious.py
index 8d81691fc..6ea942699 100644
--- a/searx/engines/invidious.py
+++ b/searx/engines/invidious.py
@@ -6,9 +6,9 @@
# @using-api yes
# @results JSON
# @stable yes
-# @parse url, title, content, publishedDate, thumbnail, embedded
+# @parse url, title, content, publishedDate, thumbnail, embedded, author, length
-from searx.url_utils import quote_plus
+from urllib.parse import quote_plus
from dateutil import parser
import time
@@ -84,13 +84,20 @@ def response(resp):
publishedDate = parser.parse(
time.ctime(result.get("published", 0))
)
+ length = time.gmtime(result.get("lengthSeconds"))
+ if length.tm_hour:
+ length = time.strftime("%H:%M:%S", length)
+ else:
+ length = time.strftime("%M:%S", length)
results.append(
{
"url": url,
"title": result.get("title", ""),
"content": result.get("description", ""),
+ 'length': length,
"template": "videos.html",
+ "author": result.get("author"),
"publishedDate": publishedDate,
"embedded": embedded,
"thumbnail": thumbnail,
diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py
index 785b0c490..1e5c39ac4 100644
--- a/searx/engines/json_engine.py
+++ b/searx/engines/json_engine.py
@@ -1,11 +1,8 @@
from collections import Iterable
from json import loads
-from sys import version_info
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
from searx.utils import to_string
-if version_info[0] == 3:
- unicode = str
search_url = None
url_query = None
@@ -37,8 +34,6 @@ def iterate(iterable):
def is_iterable(obj):
if type(obj) == str:
return False
- if type(obj) == unicode:
- return False
return isinstance(obj, Iterable)
diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py
index 5e897c96f..af48d990b 100644
--- a/searx/engines/kickass.py
+++ b/searx/engines/kickass.py
@@ -12,9 +12,9 @@
from lxml import html
from operator import itemgetter
+from urllib.parse import quote, urljoin
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size, convert_str_to_int
-from searx.url_utils import quote, urljoin
# engine dependent config
categories = ['videos', 'music', 'files']
diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py
index 0607ac93b..50ba74efc 100644
--- a/searx/engines/mediawiki.py
+++ b/searx/engines/mediawiki.py
@@ -14,7 +14,7 @@
from json import loads
from string import Formatter
-from searx.url_utils import urlencode, quote
+from urllib.parse import urlencode, quote
# engine dependent config
categories = ['general']
@@ -79,7 +79,7 @@ def response(resp):
if result.get('snippet', '').startswith('#REDIRECT'):
continue
url = base_url.format(language=resp.search_params['language']) +\
- 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))
+ 'wiki/' + quote(result['title'].replace(' ', '_').encode())
# append result
results.append({'url': url,
diff --git a/searx/engines/microsoft_academic.py b/searx/engines/microsoft_academic.py
index 9bac0069c..7426eef7e 100644
--- a/searx/engines/microsoft_academic.py
+++ b/searx/engines/microsoft_academic.py
@@ -12,8 +12,7 @@ Microsoft Academic (Science)
from datetime import datetime
from json import loads
from uuid import uuid4
-
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
from searx.utils import html_to_text
categories = ['images']
diff --git a/searx/engines/mixcloud.py b/searx/engines/mixcloud.py
index 470c007ea..0606350a9 100644
--- a/searx/engines/mixcloud.py
+++ b/searx/engines/mixcloud.py
@@ -12,7 +12,7 @@
from json import loads
from dateutil import parser
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
# engine dependent config
categories = ['music']
diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py
index c57979a5f..ed8897ddc 100644
--- a/searx/engines/nyaa.py
+++ b/searx/engines/nyaa.py
@@ -10,8 +10,8 @@
"""
from lxml import html
+from urllib.parse import urlencode
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode
from searx.utils import get_torrent_size, int_or_zero
# engine dependent config
diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py
index cec10a3c7..5475c7a6d 100644
--- a/searx/engines/openstreetmap.py
+++ b/searx/engines/openstreetmap.py
@@ -10,7 +10,9 @@
@parse url, title
"""
+import re
from json import loads
+from flask_babel import gettext
# engine dependent config
categories = ['map']
@@ -21,10 +23,15 @@ base_url = 'https://nominatim.openstreetmap.org/'
search_string = 'search/{query}?format=json&polygon_geojson=1&addressdetails=1'
result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
+route_url = 'https://graphhopper.com/maps/?point={}&point={}&locale=en-US&vehicle=car&weighting=fastest&turn_costs=true&use_miles=false&layer=Omniscale' # noqa
+route_re = re.compile('(?:from )?(.+) to (.+)')
+
# do search-request
def request(query, params):
- params['url'] = base_url + search_string.format(query=query.decode('utf-8'))
+
+ params['url'] = base_url + search_string.format(query=query)
+ params['route'] = route_re.match(query)
return params
@@ -34,12 +41,18 @@ def response(resp):
results = []
json = loads(resp.text)
+ if resp.search_params['route']:
+ results.append({
+ 'answer': gettext('Get directions'),
+ 'url': route_url.format(*resp.search_params['route'].groups()),
+ })
+
# parse results
for r in json:
if 'display_name' not in r:
continue
- title = r['display_name'] or u''
+ title = r['display_name'] or ''
osm_type = r.get('osm_type', r.get('type'))
url = result_base_url.format(osm_type=osm_type,
osm_id=r['osm_id'])
@@ -51,7 +64,7 @@ def response(resp):
# if no geojson is found and osm_type is a node, add geojson Point
if not geojson and osm_type == 'node':
- geojson = {u'type': u'Point', u'coordinates': [r['lon'], r['lat']]}
+ geojson = {'type': 'Point', 'coordinates': [r['lon'], r['lat']]}
address_raw = r.get('address')
address = {}
diff --git a/searx/engines/peertube.py b/searx/engines/peertube.py
new file mode 100644
index 000000000..58ff38c02
--- /dev/null
+++ b/searx/engines/peertube.py
@@ -0,0 +1,95 @@
+"""
+ peertube (Videos)
+
+ @website https://www.peertube.live
+ @provide-api yes (https://docs.joinpeertube.org/api-rest-reference.html)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, thumbnail, publishedDate, embedded
+
+ @todo implement time range support
+"""
+
+from json import loads
+from datetime import datetime
+from urllib.parse import urlencode
+from searx.utils import html_to_text
+
+# engine dependent config
+categories = ["videos"]
+paging = True
+language_support = True
+base_url = "https://peer.tube/"
+supported_languages_url = base_url + "api/v1/videos/languages"
+
+
+# do search-request
+def request(query, params):
+ pageno = (params["pageno"] - 1) * 15
+ search_url = base_url + "api/v1/search/videos/?pageno={pageno}&{query}"
+ query_dict = {"search": query}
+ language = params["language"].split("-")[0]
+ # pylint: disable=undefined-variable
+ if "all" != language and language in supported_languages:
+ query_dict["languageOneOf"] = language
+ params["url"] = search_url.format(
+ query=urlencode(query_dict), pageno=pageno
+ )
+ return params
+
+
+def _get_offset_from_pageno(pageno):
+ return (pageno - 1) * 15 + 1
+
+
+# get response from search-request
+def response(resp):
+ results = []
+
+ search_res = loads(resp.text)
+
+ embedded_url = (
+ '<iframe width="560" height="315" sandbox="allow-same-origin allow-scripts allow-popups" '
+ + 'src="'
+ + base_url
+ + '{embed_path}" frameborder="0" allowfullscreen></iframe>'
+ )
+ # return empty array if there are no results
+ if "data" not in search_res:
+ return []
+
+ # parse results
+ for res in search_res["data"]:
+ title = res["name"]
+ url = base_url + "/videos/watch/" + res["uuid"]
+ description = res["description"]
+ if description:
+ content = html_to_text(res["description"])
+ else:
+ content = None
+ thumbnail = base_url + res["thumbnailPath"]
+ publishedDate = datetime.strptime(res["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ embedded = embedded_url.format(embed_path=res["embedPath"][1:])
+
+ results.append(
+ {
+ "template": "videos.html",
+ "url": url,
+ "title": title,
+ "content": content,
+ "publishedDate": publishedDate,
+ "embedded": embedded,
+ "thumbnail": thumbnail,
+ }
+ )
+
+ # return results
+ return results
+
+
+def _fetch_supported_languages(resp):
+ ret_val = {}
+ peertube_languages = list(loads(resp.text).keys())
+ return peertube_languages
diff --git a/searx/engines/photon.py b/searx/engines/photon.py
index 15236f680..9201fc168 100644
--- a/searx/engines/photon.py
+++ b/searx/engines/photon.py
@@ -11,8 +11,8 @@
"""
from json import loads
+from urllib.parse import urlencode
from searx.utils import searx_useragent
-from searx.url_utils import urlencode
# engine dependent config
categories = ['map']
diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py
index 2f3f22a97..42866d058 100644
--- a/searx/engines/piratebay.py
+++ b/searx/engines/piratebay.py
@@ -1,44 +1,52 @@
# Piratebay (Videos, Music, Files)
#
-# @website https://thepiratebay.se
-# @provide-api no (nothing found)
+# @website https://thepiratebay.org
+# @provide-api yes (https://apibay.org/)
#
-# @using-api no
-# @results HTML (using search portal)
-# @stable yes (HTML can change)
-# @parse url, title, content, seed, leech, magnetlink
+# @using-api yes
+# @results JSON
+# @stable no (the API is not documented nor versioned)
+# @parse url, title, seed, leech, magnetlink, filesize, publishedDate
-from lxml import html
+from json import loads
+from datetime import datetime
from operator import itemgetter
+
+from urllib.parse import quote, urljoin
from searx.engines.xpath import extract_text
-from searx.url_utils import quote, urljoin
+from searx.utils import get_torrent_size
# engine dependent config
-categories = ['videos', 'music', 'files']
-paging = True
+categories = ["videos", "music", "files"]
# search-url
-url = 'https://thepiratebay.org/'
-search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'
+url = "https://thepiratebay.org/"
+search_url = "https://apibay.org/q.php?q={search_term}&cat={search_type}"
+
+# default trackers provided by thepiratebay
+trackers = [
+ "udp://tracker.coppersurfer.tk:6969/announce",
+ "udp://9.rarbg.to:2920/announce",
+ "udp://tracker.opentrackr.org:1337",
+ "udp://tracker.internetwarriors.net:1337/announce",
+ "udp://tracker.leechers-paradise.org:6969/announce",
+ "udp://tracker.coppersurfer.tk:6969/announce",
+ "udp://tracker.pirateparty.gr:6969/announce",
+ "udp://tracker.cyberia.is:6969/announce",
+]
# piratebay specific type-definitions
-search_types = {'files': '0',
- 'music': '100',
- 'videos': '200'}
-
-# specific xpath variables
-magnet_xpath = './/a[@title="Download this torrent using magnet"]'
-torrent_xpath = './/a[@title="Download this torrent"]'
-content_xpath = './/font[@class="detDesc"]'
+search_types = {"files": "0",
+ "music": "100",
+ "videos": "200"}
# do search-request
def request(query, params):
- search_type = search_types.get(params['category'], '0')
+ search_type = search_types.get(params["category"], "0")
- params['url'] = search_url.format(search_term=quote(query),
- search_type=search_type,
- pageno=params['pageno'] - 1)
+ params["url"] = search_url.format(search_term=quote(query),
+ search_type=search_type)
return params
@@ -47,50 +55,43 @@ def request(query, params):
def response(resp):
results = []
- dom = html.fromstring(resp.text)
-
- search_res = dom.xpath('//table[@id="searchResult"]//tr')
+ search_res = loads(resp.text)
# return empty array if nothing is found
- if not search_res:
+ if search_res[0]["name"] == "No results returned":
return []
# parse results
- for result in search_res[1:]:
- link = result.xpath('.//div[@class="detName"]//a')[0]
- href = urljoin(url, link.attrib.get('href'))
- title = extract_text(link)
- content = extract_text(result.xpath(content_xpath))
- seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]
-
- # convert seed to int if possible
- if seed.isdigit():
- seed = int(seed)
- else:
- seed = 0
-
- # convert leech to int if possible
- if leech.isdigit():
- leech = int(leech)
- else:
- leech = 0
-
- magnetlink = result.xpath(magnet_xpath)[0]
- torrentfile_links = result.xpath(torrent_xpath)
- if torrentfile_links:
- torrentfile_link = torrentfile_links[0].attrib.get('href')
- else:
- torrentfile_link = None
+ for result in search_res:
+ link = url + "description.php?id=" + result["id"]
+ magnetlink = "magnet:?xt=urn:btih:" + result["info_hash"] + "&dn=" + result["name"]\
+ + "&tr=" + "&tr=".join(trackers)
+
+ params = {
+ "url": link,
+ "title": result["name"],
+ "seed": result["seeders"],
+ "leech": result["leechers"],
+ "magnetlink": magnetlink,
+ "template": "torrent.html"
+ }
+
+ # extract and convert creation date
+ try:
+ date = datetime.fromtimestamp(float(result["added"]))
+ params['publishedDate'] = date
+ except:
+ pass
+
+ # let's try to calculate the torrent size
+ try:
+ filesize = get_torrent_size(result["size"], "B")
+ params['filesize'] = filesize
+ except:
+ pass
# append result
- results.append({'url': href,
- 'title': title,
- 'content': content,
- 'seed': seed,
- 'leech': leech,
- 'magnetlink': magnetlink.attrib.get('href'),
- 'torrentfile': torrentfile_link,
- 'template': 'torrent.html'})
+ results.append(params)
# return results sorted by seeder
- return sorted(results, key=itemgetter('seed'), reverse=True)
+ return sorted(results, key=itemgetter("seed"), reverse=True)
diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py
index 055f09226..7eb2e92f9 100644
--- a/searx/engines/pubmed.py
+++ b/searx/engines/pubmed.py
@@ -14,7 +14,7 @@
from flask_babel import gettext
from lxml import etree
from datetime import datetime
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
from searx.poolrequests import get
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
index 54e9dafad..ac918b905 100644
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@@ -12,9 +12,9 @@
from datetime import datetime
from json import loads
-from searx.utils import html_to_text
-from searx.url_utils import urlencode
-from searx.utils import match_language
+from urllib.parse import urlencode
+from searx.utils import html_to_text, match_language
+
# engine dependent config
categories = None
diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py
index d19724906..e732875cb 100644
--- a/searx/engines/reddit.py
+++ b/searx/engines/reddit.py
@@ -12,7 +12,7 @@
import json
from datetime import datetime
-from searx.url_utils import urlencode, urljoin, urlparse
+from urllib.parse import urlencode, urljoin, urlparse
# engine dependent config
categories = ['general', 'images', 'news', 'social media']
diff --git a/searx/engines/scanr_structures.py b/searx/engines/scanr_structures.py
index 7208dcb70..6dbbf4fd9 100644
--- a/searx/engines/scanr_structures.py
+++ b/searx/engines/scanr_structures.py
@@ -11,7 +11,7 @@
"""
from json import loads, dumps
-from searx.utils import html_to_text
+from urllib.parse import html_to_text
# engine dependent config
categories = ['science']
@@ -29,7 +29,7 @@ def request(query, params):
params['url'] = search_url
params['method'] = 'POST'
params['headers']['Content-type'] = "application/json"
- params['data'] = dumps({"query": query.decode('utf-8'),
+ params['data'] = dumps({"query": query,
"searchField": "ALL",
"sortDirection": "ASC",
"sortOrder": "RELEVANCY",
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
index 789e8e7a9..706285814 100644
--- a/searx/engines/searchcode_code.py
+++ b/searx/engines/searchcode_code.py
@@ -11,7 +11,7 @@
"""
from json import loads
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
# engine dependent config
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
index 4b8e9a84a..878d2e792 100644
--- a/searx/engines/searchcode_doc.py
+++ b/searx/engines/searchcode_doc.py
@@ -11,7 +11,7 @@
"""
from json import loads
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
# engine dependent config
categories = ['it']
diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py
index f9b1f99c8..3778abe7b 100644
--- a/searx/engines/seedpeer.py
+++ b/searx/engines/seedpeer.py
@@ -11,7 +11,7 @@
from lxml import html
from json import loads
from operator import itemgetter
-from searx.url_utils import quote, urljoin
+from urllib.parse import quote, urljoin
from searx.engines.xpath import extract_text
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index 284689bf6..5165ea3ea 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -14,14 +14,11 @@ import re
from json import loads
from lxml import html
from dateutil import parser
+from io import StringIO
+from urllib.parse import quote_plus, urlencode
from searx import logger
from searx.poolrequests import get as http_get
-from searx.url_utils import quote_plus, urlencode
-try:
- from cStringIO import StringIO
-except:
- from io import StringIO
# engine dependent config
categories = ['music']
@@ -61,7 +58,7 @@ def get_client_id():
# gets app_js and searches for the clientid
response = http_get(app_js_url)
if response.ok:
- cids = cid_re.search(response.content.decode("utf-8"))
+ cids = cid_re.search(response.content.decode())
if cids is not None and len(cids.groups()):
return cids.groups()[0]
logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py
index 00c395706..74942326e 100644
--- a/searx/engines/spotify.py
+++ b/searx/engines/spotify.py
@@ -11,7 +11,7 @@
"""
from json import loads
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
import requests
import base64
@@ -39,8 +39,8 @@ def request(query, params):
'https://accounts.spotify.com/api/token',
data={'grant_type': 'client_credentials'},
headers={'Authorization': 'Basic ' + base64.b64encode(
- "{}:{}".format(api_client_id, api_client_secret).encode('utf-8')
- ).decode('utf-8')}
+ "{}:{}".format(api_client_id, api_client_secret).encode()
+ ).decode()}
)
j = loads(r.text)
params['headers'] = {'Authorization': 'Bearer {}'.format(j.get('access_token'))}
@@ -59,7 +59,7 @@ def response(resp):
if result['type'] == 'track':
title = result['name']
url = result['external_urls']['spotify']
- content = u'{} - {} - {}'.format(
+ content = '{} - {} - {}'.format(
result['artists'][0]['name'],
result['album']['name'],
result['name'])
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
index 25875aa15..90e4543d7 100644
--- a/searx/engines/stackoverflow.py
+++ b/searx/engines/stackoverflow.py
@@ -10,9 +10,9 @@
@parse url, title, content
"""
+from urllib.parse import urlencode, urljoin
from lxml import html
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode, urljoin
# engine dependent config
categories = ['it']
diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py
index 773212043..9c8774d7c 100644
--- a/searx/engines/tokyotoshokan.py
+++ b/searx/engines/tokyotoshokan.py
@@ -11,10 +11,10 @@
"""
import re
+from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text
from datetime import datetime
-from searx.url_utils import urlencode
from searx.utils import get_torrent_size, int_or_zero
# engine dependent config
diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py
index fd4164a66..fcc8c042c 100644
--- a/searx/engines/torrentz.py
+++ b/searx/engines/torrentz.py
@@ -1,21 +1,21 @@
"""
- Torrentz2.eu (BitTorrent meta-search engine)
+ Torrentz2.is (BitTorrent meta-search engine)
- @website https://torrentz2.eu/
+ @website https://torrentz2.is/
@provide-api no
@using-api no
@results HTML
@stable no (HTML can change, although unlikely,
- see https://torrentz.eu/torrentz.btsearch)
+ see https://torrentz.is/torrentz.btsearch)
@parse url, title, publishedDate, seed, leech, filesize, magnetlink
"""
import re
+from urllib.parse import urlencode
from lxml import html
from datetime import datetime
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode
from searx.utils import get_torrent_size
# engine dependent config
@@ -23,8 +23,8 @@ categories = ['files', 'videos', 'music']
paging = True
# search-url
-# https://torrentz2.eu/search?f=EXAMPLE&p=6
-base_url = 'https://torrentz2.eu/'
+# https://torrentz2.is/search?f=EXAMPLE&p=6
+base_url = 'https://torrentz2.is/'
search_url = base_url + 'search?{query}'
diff --git a/searx/engines/translated.py b/searx/engines/translated.py
index 5c7b17033..a50e7c830 100644
--- a/searx/engines/translated.py
+++ b/searx/engines/translated.py
@@ -9,23 +9,19 @@
@parse url, title, content
"""
import re
-from sys import version_info
from searx.utils import is_valid_lang
-if version_info[0] == 3:
- unicode = str
-
categories = ['general']
-url = u'http://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}'
-web_url = u'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}'
+url = 'https://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}'
+web_url = 'https://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}'
weight = 100
-parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I)
+parser_re = re.compile('.*?([a-z]+)-([a-z]+) (.{2,})$', re.I)
api_key = ''
def request(query, params):
- m = parser_re.match(unicode(query, 'utf8'))
+ m = parser_re.match(query)
if not m:
return params
diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py
index d2a8d2088..549b14e96 100644
--- a/searx/engines/twitter.py
+++ b/searx/engines/twitter.py
@@ -12,10 +12,10 @@
@todo publishedDate
"""
+from urllib.parse import urlencode, urljoin
from lxml import html
from datetime import datetime
from searx.engines.xpath import extract_text
-from searx.url_utils import urlencode, urljoin
# engine dependent config
categories = ['social media']
diff --git a/searx/engines/unsplash.py b/searx/engines/unsplash.py
index 2e8d6fdfc..45c6b30da 100644
--- a/searx/engines/unsplash.py
+++ b/searx/engines/unsplash.py
@@ -10,7 +10,7 @@
@parse url, title, img_src, thumbnail_src
"""
-from searx.url_utils import urlencode, urlparse, urlunparse, parse_qsl
+from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
from json import loads
url = 'https://unsplash.com/'
diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py
index a92271019..fd3abc858 100644
--- a/searx/engines/vimeo.py
+++ b/searx/engines/vimeo.py
@@ -12,9 +12,9 @@
# @todo rewrite to api
# @todo set content-parameter with correct data
+from urllib.parse import urlencode
from json import loads
from dateutil import parser
-from searx.url_utils import urlencode
# engine dependent config
categories = ['videos']
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index e913b3915..ffa3724fd 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -15,9 +15,9 @@ from searx import logger
from searx.poolrequests import get
from searx.engines.xpath import extract_text
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
-from searx.url_utils import urlencode
from searx.utils import match_language, eval_xpath
+from urllib.parse import urlencode
from json import loads
from lxml.html import fromstring
from lxml import etree
@@ -76,7 +76,7 @@ def request(query, params):
def response(resp):
results = []
htmlparser = etree.HTMLParser()
- html = fromstring(resp.content.decode("utf-8"), parser=htmlparser)
+ html = fromstring(resp.content.decode(), parser=htmlparser)
search_results = eval_xpath(html, wikidata_ids_xpath)
if resp.search_params['language'].split('-')[0] == 'all':
@@ -89,7 +89,7 @@ def response(resp):
wikidata_id = search_result.split('/')[-1]
url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
htmlresponse = get(url)
- jsonresponse = loads(htmlresponse.content.decode("utf-8"))
+ jsonresponse = loads(htmlresponse.content.decode())
results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'], htmlparser)
return results
@@ -382,7 +382,7 @@ def add_attribute(attributes, id_cache, property_id, default_label=None, date=Fa
# requires property_id unless it's a wiki link (defined in link_type)
def add_url(urls, result, id_cache, property_id=None, default_label=None, url_prefix=None, results=None,
- link_type=None):
+ link_type=None, only_first=True):
links = []
# wiki links don't have property in wikidata page
@@ -414,11 +414,15 @@ def add_url(urls, result, id_cache, property_id=None, default_label=None, url_pr
# append urls
for url in links:
if url is not None:
- urls.append({'title': default_label or label,
- 'url': url})
+ u = {'title': default_label or label, 'url': url}
+ if property_id == 'P856':
+ u['official'] = True
+ u['domain'] = url.split('/')[2]
+ urls.append(u)
if results is not None:
- results.append({'title': default_label or label,
- 'url': url})
+ results.append(u)
+ if only_first:
+ break
def get_imdblink(result, url_prefix):
@@ -449,16 +453,16 @@ def get_geolink(result):
latitude, longitude = coordinates.split(',')
# convert to decimal
- lat = int(latitude[:latitude.find(u'°')])
+ lat = int(latitude[:latitude.find('°')])
if latitude.find('\'') >= 0:
- lat += int(latitude[latitude.find(u'°') + 1:latitude.find('\'')] or 0) / 60.0
+ lat += int(latitude[latitude.find('°') + 1:latitude.find('\'')] or 0) / 60.0
if latitude.find('"') >= 0:
lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0
if latitude.find('S') >= 0:
lat *= -1
- lon = int(longitude[:longitude.find(u'°')])
+ lon = int(longitude[:longitude.find('°')])
if longitude.find('\'') >= 0:
- lon += int(longitude[longitude.find(u'°') + 1:longitude.find('\'')] or 0) / 60.0
+ lon += int(longitude[longitude.find('°') + 1:longitude.find('\'')] or 0) / 60.0
if longitude.find('"') >= 0:
lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0
if longitude.find('W') >= 0:
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index a216ba886..620ec3c14 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -1,7 +1,7 @@
"""
Wikipedia (Web)
- @website https://{language}.wikipedia.org
+ @website https://en.wikipedia.org/api/rest_v1/
@provide-api yes
@using-api yes
@@ -10,23 +10,13 @@
@parse url, infobox
"""
+from urllib.parse import quote
from json import loads
from lxml.html import fromstring
-from searx.url_utils import quote, urlencode
-from searx.utils import match_language
+from searx.utils import match_language, searx_useragent
# search-url
-base_url = u'https://{language}.wikipedia.org/'
-search_url = base_url + u'w/api.php?'\
- 'action=query'\
- '&format=json'\
- '&{query}'\
- '&prop=extracts|pageimages|pageprops'\
- '&ppprop=disambiguation'\
- '&exintro'\
- '&explaintext'\
- '&pithumbsize=300'\
- '&redirects'
+search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
@@ -41,77 +31,37 @@ def url_lang(lang):
# do search-request
def request(query, params):
if query.islower():
- query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')
+ query = query.title()
- params['url'] = search_url.format(query=urlencode({'titles': query}),
+ params['url'] = search_url.format(title=quote(query),
language=url_lang(params['language']))
- return params
-
-
-# get first meaningful paragraph
-# this should filter out disambiguation pages and notes above first paragraph
-# "magic numbers" were obtained by fine tuning
-def extract_first_paragraph(content, title, image):
- first_paragraph = None
-
- failed_attempts = 0
- for paragraph in content.split('\n'):
-
- starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
- length = len(paragraph)
+ params['headers']['User-Agent'] = searx_useragent()
- if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
- first_paragraph = paragraph
- break
-
- failed_attempts += 1
- if failed_attempts > 3:
- return None
-
- return first_paragraph
+ return params
# get response from search-request
def response(resp):
- results = []
-
- search_result = loads(resp.text)
-
- # wikipedia article's unique id
- # first valid id is assumed to be the requested article
- if 'pages' not in search_result['query']:
- return results
-
- for article_id in search_result['query']['pages']:
- page = search_result['query']['pages'][article_id]
- if int(article_id) > 0:
- break
-
- if int(article_id) < 0 or 'disambiguation' in page.get('pageprops', {}):
+ if not resp.ok:
return []
- title = page.get('title')
-
- image = page.get('thumbnail')
- if image:
- image = image.get('source')
-
- extract = page.get('extract')
+ results = []
+ api_result = loads(resp.text)
- summary = extract_first_paragraph(extract, title, image)
- summary = summary.replace('() ', '')
+ # skip disambiguation pages
+ if api_result['type'] != 'standard':
+ return []
- # link to wikipedia article
- wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
- + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
+ title = api_result['title']
+ wikipedia_link = api_result['content_urls']['desktop']['page']
results.append({'url': wikipedia_link, 'title': title})
results.append({'infobox': title,
'id': wikipedia_link,
- 'content': summary,
- 'img_src': image,
+ 'content': api_result.get('extract', ''),
+ 'img_src': api_result.get('thumbnail', {}).get('source'),
'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
return results
diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py
index 1c58c4a9b..520eaa209 100644
--- a/searx/engines/wolframalpha_api.py
+++ b/searx/engines/wolframalpha_api.py
@@ -9,7 +9,7 @@
# @parse url, infobox
from lxml import etree
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
# search-url
search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}'
@@ -45,15 +45,15 @@ def request(query, params):
# replace private user area characters to make text legible
def replace_pua_chars(text):
- pua_chars = {u'\uf522': u'\u2192', # rigth arrow
- u'\uf7b1': u'\u2115', # set of natural numbers
- u'\uf7b4': u'\u211a', # set of rational numbers
- u'\uf7b5': u'\u211d', # set of real numbers
- u'\uf7bd': u'\u2124', # set of integer numbers
- u'\uf74c': 'd', # differential
- u'\uf74d': u'\u212f', # euler's number
- u'\uf74e': 'i', # imaginary number
- u'\uf7d9': '='} # equals sign
+ pua_chars = {'\uf522': '\u2192', # right arrow
+ '\uf7b1': '\u2115', # set of natural numbers
+ '\uf7b4': '\u211a', # set of rational numbers
+ '\uf7b5': '\u211d', # set of real numbers
+ '\uf7bd': '\u2124', # set of integer numbers
+ '\uf74c': 'd', # differential
+ '\uf74d': '\u212f', # euler's number
+ '\uf74e': 'i', # imaginary number
+ '\uf7d9': '='} # equals sign
for k, v in pua_chars.items():
text = text.replace(k, v)
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
index 387c9fa17..943d4f3fb 100644
--- a/searx/engines/wolframalpha_noapi.py
+++ b/searx/engines/wolframalpha_noapi.py
@@ -10,9 +10,9 @@
from json import loads
from time import time
+from urllib.parse import urlencode
from searx.poolrequests import get as http_get
-from searx.url_utils import urlencode
# search-url
url = 'https://www.wolframalpha.com/'
diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
index f1154b16d..1cb74dbad 100644
--- a/searx/engines/www1x.py
+++ b/searx/engines/www1x.py
@@ -11,7 +11,7 @@
"""
from lxml import html
-from searx.url_utils import urlencode, urljoin
+from urllib.parse import urlencode, urljoin
from searx.engines.xpath import extract_text
# engine dependent config
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index b75896cc7..bd97a93a5 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -1,7 +1,7 @@
+from urllib.parse import unquote, urlencode, urljoin, urlparse
from lxml import html
from lxml.etree import _ElementStringResult, _ElementUnicodeResult
from searx.utils import html_to_text, eval_xpath
-from searx.url_utils import unquote, urlencode, urljoin, urlparse
search_url = None
url_xpath = None
@@ -56,11 +56,15 @@ def extract_url(xpath_results, search_url):
if url.startswith('//'):
# add http or https to this kind of url //example.com/
parsed_search_url = urlparse(search_url)
- url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
+ url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
elif url.startswith('/'):
# fix relative url to the search engine
url = urljoin(search_url, url)
+ # fix relative urls that fall through the crack
+ if '://' not in url:
+ url = urljoin(search_url, url)
+
# normalize url
url = normalize_url(url)
@@ -82,7 +86,7 @@ def normalize_url(url):
p = parsed_url.path
mark = p.find('/**')
if mark != -1:
- return unquote(p[mark + 3:]).decode('utf-8')
+ return unquote(p[mark + 3:]).decode()
return url
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
index 25bc83687..daa151082 100644
--- a/searx/engines/yacy.py
+++ b/searx/engines/yacy.py
@@ -14,7 +14,7 @@
from json import loads
from dateutil import parser
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
from searx.utils import html_to_text
@@ -75,7 +75,7 @@ def response(resp):
for result in search_results[0].get('items', []):
# parse image results
- if result.get('image'):
+ if result.get('image') and result.get('width') and result.get('height'):
result_url = ''
if 'url' in result:
diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py
index 36c1a11f8..0133b57b5 100644
--- a/searx/engines/yahoo.py
+++ b/searx/engines/yahoo.py
@@ -11,9 +11,9 @@
@parse url, title, content, suggestion
"""
+from urllib.parse import unquote, urlencode
from lxml import html
from searx.engines.xpath import extract_text, extract_url
-from searx.url_utils import unquote, urlencode
from searx.utils import match_language, eval_xpath
# engine dependent config
@@ -33,7 +33,7 @@ supported_languages_url = 'https://search.yahoo.com/web/advanced'
results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
-content_xpath = './/div[@class="compText aAbs"]'
+content_xpath = './/div[contains(@class, "compText")]'
suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a"
time_range_dict = {'day': ['1d', 'd'],
diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py
index 9f6a4159b..345e4d91f 100644
--- a/searx/engines/yahoo_news.py
+++ b/searx/engines/yahoo_news.py
@@ -11,13 +11,13 @@
import re
from datetime import datetime, timedelta
+from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text, extract_url
from searx.engines.yahoo import (
parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
)
from dateutil import parser
-from searx.url_utils import urlencode
from searx.utils import match_language
# engine dependent config
@@ -58,7 +58,7 @@ def request(query, params):
def sanitize_url(url):
if ".yahoo.com/" in url:
- return re.sub(u"\\;\\_ylt\\=.+$", "", url)
+ return re.sub("\\;\\_ylt\\=.+$", "", url)
else:
return url
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
index 1c789f6cb..ff1ef5a26 100644
--- a/searx/engines/yandex.py
+++ b/searx/engines/yandex.py
@@ -9,9 +9,9 @@
@parse url, title, content
"""
+from urllib.parse import urlencode
from lxml import html
from searx import logger
-from searx.url_utils import urlencode
logger = logger.getChild('yandex engine')
diff --git a/searx/engines/yggtorrent.py b/searx/engines/yggtorrent.py
new file mode 100644
index 000000000..37bf3b1d9
--- /dev/null
+++ b/searx/engines/yggtorrent.py
@@ -0,0 +1,124 @@
+# Yggtorrent (Videos, Music, Files)
+#
+# @website https://www2.yggtorrent.si
+# @provide-api no (nothing found)
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, seed, leech, publishedDate, filesize
+
+from lxml import html
+from operator import itemgetter
+from datetime import datetime
+from urllib.parse import quote
+from searx.engines.xpath import extract_text
+from searx.utils import get_torrent_size
+from searx.poolrequests import get as http_get
+
+# engine dependent config
+# NOTE(review): presumably read by the searx engine loader -- confirm there
+categories = ['videos', 'music', 'files']
+# request() reads params['pageno'] to build the page parameter
+paging = True
+
+# search-url
+url = 'https://www2.yggtorrent.si/'
+# placeholders are filled in by request(): search_term, pageno, search_type
+search_url = url + 'engine/search?name={search_term}&do=search&page={pageno}&category={search_type}'
+
+# yggtorrent specific type-definitions
+# maps a category name to the value sent as the "category" URL parameter;
+# request() falls back to 'all' for categories not listed here
+search_types = {'files': 'all',
+ 'music': '2139',
+ 'videos': '2145'}
+
+# filled by init() and attached to every outgoing request by request()
+cookies = dict()
+
+
+def init(engine_settings=None):
+ """Request the site's start page and remember the cookies it returns.
+
+ The cookies are merged into the module-level ``cookies`` dict.
+
+ :param engine_settings: not used by this function.
+ """
+ global cookies
+ # initial cookies
+ # NOTE(review): the request is made with allow_redirects=False, so
+ # resp.history is presumably always empty and the loop below never
+ # runs -- confirm against searx.poolrequests / requests behaviour
+ resp = http_get(url, allow_redirects=False)
+ if resp.ok:
+ for r in resp.history:
+ cookies.update(r.cookies)
+ cookies.update(resp.cookies)
+
+
+# do search-request
+def request(query, params):
+    """Fill in the URL and cookies for one search request.
+
+    :param query: search term; it is percent-quoted before being placed
+                  in the URL.
+    :param params: request parameters; ``category`` and ``pageno`` are
+                   read, ``url`` and ``cookies`` are written.
+    :returns: the same ``params`` object.
+    """
+    # categories without an entry in search_types use the 'all' type
+    category_value = search_types.get(params['category'], 'all')
+    # the page value sent to the site advances by 50 per result page
+    page_value = (params['pageno'] - 1) * 50
+
+    params['url'] = search_url.format(
+        search_term=quote(query),
+        pageno=page_value,
+        search_type=category_value,
+    )
+    params['cookies'] = cookies
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    """Parse the HTML result page into a list of torrent results.
+
+    Every result is a dict with the keys ``url``, ``title``, ``seed``,
+    ``leech`` and ``template``; ``filesize`` and ``publishedDate`` are
+    added only when they can be read from the row.
+
+    :param resp: response object; only ``resp.text`` is read.
+    :returns: list of result dicts sorted by ``seed``, highest first
+              (an empty list when no result rows are found).
+    """
+    dom = html.fromstring(resp.text)
+    search_res = dom.xpath('//section[@id="#torrents"]/div/table/tbody/tr')
+
+    # return empty array if nothing is found
+    if not search_res:
+        return []
+
+    # the size cell ends in a French unit suffix; map it to the unit name
+    # handed to get_torrent_size (built once instead of once per row)
+    multiplier_french_to_english = {
+        'to': 'TiB',
+        'go': 'GiB',
+        'mo': 'MiB',
+        'ko': 'KiB'
+    }
+
+    results = []
+
+    # parse results
+    for result in search_res:
+        links = result.xpath('.//a[@id="torrent_name"]')
+        if not links:
+            # no name link -> nothing to report for this row; skipping it
+            # keeps one odd row from raising IndexError for the whole page
+            continue
+        link = links[0]
+
+        # first text node of the cell, or '' when the cell is missing/empty
+        seed = ''.join(result.xpath('.//td[8]/text()')[:1])
+        leech = ''.join(result.xpath('.//td[9]/text()')[:1])
+
+        params = {'url': link.attrib.get('href'),
+                  'title': extract_text(link),
+                  # non-numeric (or missing) counts are reported as 0
+                  'seed': int(seed) if seed.isdigit() else 0,
+                  'leech': int(leech) if leech.isdigit() else 0,
+                  'template': 'torrent.html'}
+
+        # let's try to calculate the torrent size
+        try:
+            filesize_info = result.xpath('.//td[6]/text()')[0]
+            unit = multiplier_french_to_english[filesize_info[-2:].lower()]
+            params['filesize'] = get_torrent_size(filesize_info[:-2], unit)
+        except Exception:
+            # best effort: the size is optional.  get_torrent_size is defined
+            # elsewhere and its failure modes are not visible here, so any
+            # ordinary error just leaves 'filesize' out -- but, unlike a bare
+            # "except:", KeyboardInterrupt and SystemExit still propagate
+            pass
+
+        # extract and convert creation date
+        try:
+            date_ts = result.xpath('.//td[5]/div/text()')[0]
+            params['publishedDate'] = datetime.fromtimestamp(float(date_ts))
+        except (IndexError, ValueError, OverflowError, OSError):
+            # missing cell, non-numeric text or a timestamp the platform
+            # cannot represent: the date is optional, leave it out
+            pass
+
+        # append result
+        results.append(params)
+
+    # return results sorted by seeder
+    return sorted(results, key=itemgetter('seed'), reverse=True)
diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py
index bc4c0d58e..2542169a6 100644
--- a/searx/engines/youtube_api.py
+++ b/searx/engines/youtube_api.py
@@ -10,7 +10,7 @@
from json import loads
from dateutil import parser
-from searx.url_utils import urlencode
+from urllib.parse import urlencode
# engine dependent config
categories = ['videos', 'music']
diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py
index 49d0ae604..fef501458 100644
--- a/searx/engines/youtube_noapi.py
+++ b/searx/engines/youtube_noapi.py
@@ -10,9 +10,9 @@
from functools import reduce
from json import loads
+from urllib.parse import quote_plus
from searx.engines.xpath import extract_text
from searx.utils import list_get
-from searx.url_utils import quote_plus
# engine dependent config
categories = ['videos', 'music']
@@ -70,11 +70,15 @@ def response(resp):
title = get_text_from_json(video.get('title', {}))
content = get_text_from_json(video.get('descriptionSnippet', {}))
embedded = embedded_url.format(videoid=videoid)
+ author = get_text_from_json(video.get('ownerText', {}))
+ length = get_text_from_json(video.get('lengthText', {}))
# append result
results.append({'url': url,
'title': title,
'content': content,
+ 'author': author,
+ 'length': length,
'template': 'videos.html',
'embedded': embedded,
'thumbnail': thumbnail})