Diffstat (limited to 'searx/engines')
-rw-r--r--  searx/engines/__init__.py            |  51
-rw-r--r--  searx/engines/bing_images.py         |  16
-rw-r--r--  searx/engines/blekko_images.py       |   2
-rw-r--r--  searx/engines/btdigg.py              |   2
-rw-r--r--  searx/engines/deviantart.py          |   2
-rw-r--r--  searx/engines/digg.py                |   2
-rw-r--r--  searx/engines/faroo.py               |   2
-rw-r--r--  searx/engines/frinkiac.py            |  44
-rw-r--r--  searx/engines/gigablast.py           |  57
-rw-r--r--  searx/engines/google.py              |  50
-rw-r--r--  searx/engines/mediawiki.py           |  21
-rw-r--r--  searx/engines/searchcode_code.py     |   4
-rw-r--r--  searx/engines/searchcode_doc.py      |   4
-rw-r--r--  searx/engines/soundcloud.py          |  32
-rw-r--r--  searx/engines/stackoverflow.py       |   2
-rw-r--r--  searx/engines/startpage.py           |   8
-rw-r--r--  searx/engines/swisscows.py           |   9
-rw-r--r--  searx/engines/wikidata.py            |   4
-rw-r--r--  searx/engines/wolframalpha_api.py    | 122
-rw-r--r--  searx/engines/wolframalpha_noapi.py  | 116
-rw-r--r--  searx/engines/www1x.py               |   2
-rw-r--r--  searx/engines/xpath.py               |   4
-rw-r--r--  searx/engines/yandex.py              |   7
23 files changed, 442 insertions(+), 121 deletions(-)
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 447138d3b..6d5066733 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -34,6 +34,15 @@ engines = {}
categories = {'general': []}
engine_shortcuts = {}
+engine_default_args = {'paging': False,
+ 'categories': ['general'],
+ 'language_support': True,
+ 'safesearch': False,
+ 'timeout': settings['outgoing']['request_timeout'],
+ 'shortcut': '-',
+ 'disabled': False,
+ 'suspend_end_time': 0,
+ 'continuous_errors': 0}
def load_module(filename):
@@ -62,26 +71,9 @@ def load_engine(engine_data):
continue
setattr(engine, param_name, engine_data[param_name])
- if not hasattr(engine, 'paging'):
- engine.paging = False
-
- if not hasattr(engine, 'categories'):
- engine.categories = ['general']
-
- if not hasattr(engine, 'language_support'):
- engine.language_support = True
-
- if not hasattr(engine, 'safesearch'):
- engine.safesearch = False
-
- if not hasattr(engine, 'timeout'):
- engine.timeout = settings['outgoing']['request_timeout']
-
- if not hasattr(engine, 'shortcut'):
- engine.shortcut = ''
-
- if not hasattr(engine, 'disabled'):
- engine.disabled = False
+ for arg_name, arg_value in engine_default_args.iteritems():
+ if not hasattr(engine, arg_name):
+ setattr(engine, arg_name, arg_value)
# checking required variables
for engine_attr in dir(engine):
@@ -100,18 +92,15 @@ def load_engine(engine_data):
'errors': 0
}
- if hasattr(engine, 'categories'):
- for category_name in engine.categories:
- categories.setdefault(category_name, []).append(engine)
- else:
- categories['general'].append(engine)
+ for category_name in engine.categories:
+ categories.setdefault(category_name, []).append(engine)
+
+ if engine.shortcut in engine_shortcuts:
+ logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
+ sys.exit(1)
+
+ engine_shortcuts[engine.shortcut] = engine.name
- if engine.shortcut:
- if engine.shortcut in engine_shortcuts:
- logger.error('Engine config error: ambigious shortcut: {0}'
- .format(engine.shortcut))
- sys.exit(1)
- engine_shortcuts[engine.shortcut] = engine.name
return engine
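
The refactor above collapses seven hasattr checks into a single defaults table. One behavioural change worth noting: the default shortcut is now '-' instead of '' and the old "if engine.shortcut:" guard is gone, so two engines left without an explicit shortcut will now collide on '-' and abort startup. A minimal sketch of the defaults-table pattern (the Engine class and values here are illustrative, not searx code):

    defaults = {'paging': False, 'timeout': 2.0, 'disabled': False}

    class Engine(object):
        pass

    engine = Engine()
    engine.paging = True                # explicit engine_data values win

    for name, value in defaults.items():
        if not hasattr(engine, name):
            setattr(engine, name, value)

    assert engine.paging is True        # kept from engine_data
    assert engine.timeout == 2.0        # filled in from the table
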
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 06850dfe1..2664b795f 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -17,7 +17,7 @@
from urllib import urlencode
from lxml import html
-from yaml import load
+from json import loads
import re
# engine dependent config
@@ -36,6 +36,9 @@ safesearch_types = {2: 'STRICT',
0: 'OFF'}
+_quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
+
+
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
@@ -65,22 +68,19 @@ def response(resp):
dom = html.fromstring(resp.text)
- # init regex for yaml-parsing
- p = re.compile('({|,)([a-z]+):(")')
-
# parse results
for result in dom.xpath('//div[@class="dg_u"]'):
link = result.xpath('./a')[0]
- # parse yaml-data (it is required to add a space, to make it parsable)
- yaml_data = load(p.sub(r'\1\2: \3', link.attrib.get('m')))
+ # parse json-data (the bare keys must be quoted to make it parsable)
+ json_data = loads(_quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('m')))
title = link.attrib.get('t1')
ihk = link.attrib.get('ihk')
# url = 'http://' + link.attrib.get('t3')
- url = yaml_data.get('surl')
- img_src = yaml_data.get('imgurl')
+ url = json_data.get('surl')
+ img_src = json_data.get('imgurl')
# append result
results.append({'template': 'images.html',
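
Bing's "m" attribute holds a JavaScript-style object literal with unquoted keys, which json.loads rejects; the new module-level regex wraps each bare key in double quotes and keeps a space after the colon. A standalone sketch of that transformation, using an invented stand-in for the attribute value:

    import re
    from json import loads

    _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)

    raw = '{surl:"https://example.com/page",imgurl:"https://example.com/a.jpg"}'
    fixed = _quote_keys_regex.sub(r'\1"\2": \3', raw)
    # fixed == '{"surl": "https://example.com/page","imgurl": "https://example.com/a.jpg"}'
    json_data = loads(fixed)
    assert json_data['imgurl'] == 'https://example.com/a.jpg'
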
diff --git a/searx/engines/blekko_images.py b/searx/engines/blekko_images.py
index 93ac6616b..c0664f390 100644
--- a/searx/engines/blekko_images.py
+++ b/searx/engines/blekko_images.py
@@ -37,7 +37,7 @@ def request(query, params):
c=c)
if params['pageno'] != 1:
- params['url'] += '&page={pageno}'.format(pageno=(params['pageno']-1))
+ params['url'] += '&page={pageno}'.format(pageno=(params['pageno'] - 1))
# let Blekko know we don't want profiling
params['cookies']['tag_lesslogging'] = '1'
diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py
index 192ed6ee9..c2b22f003 100644
--- a/searx/engines/btdigg.py
+++ b/searx/engines/btdigg.py
@@ -29,7 +29,7 @@ search_url = url + '/search?q={search_term}&p={pageno}'
# do search-request
def request(query, params):
params['url'] = search_url.format(search_term=quote(query),
- pageno=params['pageno']-1)
+ pageno=params['pageno'] - 1)
return params
diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py
index 60c8d7ea7..135aeb324 100644
--- a/searx/engines/deviantart.py
+++ b/searx/engines/deviantart.py
@@ -24,7 +24,7 @@ paging = True
# search-url
base_url = 'https://www.deviantart.com/'
-search_url = base_url+'browse/all/?offset={offset}&{query}'
+search_url = base_url + 'browse/all/?offset={offset}&{query}'
# do search-request
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index 000f66ba2..a10b38bb6 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -22,7 +22,7 @@ paging = True
# search-url
base_url = 'https://digg.com/'
-search_url = base_url+'api/search/{query}.json?position={position}&format=html'
+search_url = base_url + 'api/search/{query}.json?position={position}&format=html'
# specific xpath variables
results_xpath = '//article'
diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py
index 43df14eef..9fa244e77 100644
--- a/searx/engines/faroo.py
+++ b/searx/engines/faroo.py
@@ -88,7 +88,7 @@ def response(resp):
for result in search_res['results']:
if result['news']:
# timestamp (milliseconds since 1970)
- publishedDate = datetime.datetime.fromtimestamp(result['date']/1000.0) # noqa
+ publishedDate = datetime.datetime.fromtimestamp(result['date'] / 1000.0) # noqa
# append news result
results.append({'url': result['url'],
diff --git a/searx/engines/frinkiac.py b/searx/engines/frinkiac.py
new file mode 100644
index 000000000..a9383f862
--- /dev/null
+++ b/searx/engines/frinkiac.py
@@ -0,0 +1,44 @@
+"""
+Frinkiac (Images)
+
+@website https://www.frinkiac.com
+@provide-api no
+@using-api no
+@results JSON
+@stable no
+@parse url, title, img_src
+"""
+
+from json import loads
+from urllib import urlencode
+
+categories = ['images']
+
+BASE = 'https://frinkiac.com/'
+SEARCH_URL = '{base}api/search?{query}'
+RESULT_URL = '{base}?{query}'
+THUMB_URL = '{base}img/{episode}/{timestamp}/medium.jpg'
+IMAGE_URL = '{base}img/{episode}/{timestamp}.jpg'
+
+
+def request(query, params):
+ params['url'] = SEARCH_URL.format(base=BASE, query=urlencode({'q': query}))
+ return params
+
+
+def response(resp):
+ results = []
+ response_data = loads(resp.text)
+ for result in response_data:
+ episode = result['Episode']
+ timestamp = result['Timestamp']
+
+ results.append({'template': 'images.html',
+ 'url': RESULT_URL.format(base=BASE,
+ query=urlencode({'p': 'caption', 'e': episode, 't': timestamp})),
+ 'title': episode,
+ 'content': '',
+ 'thumbnail_src': THUMB_URL.format(base=BASE, episode=episode, timestamp=timestamp),
+ 'img_src': IMAGE_URL.format(base=BASE, episode=episode, timestamp=timestamp)})
+
+ return results
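
For reference, the URL templates above expand like this for a single hypothetical API hit (the episode and timestamp values are made up, not real Frinkiac data):

    BASE = 'https://frinkiac.com/'
    episode, timestamp = 'S06E12', 466386

    thumb = '{base}img/{episode}/{timestamp}/medium.jpg'.format(
        base=BASE, episode=episode, timestamp=timestamp)
    image = '{base}img/{episode}/{timestamp}.jpg'.format(
        base=BASE, episode=episode, timestamp=timestamp)
    # thumb == 'https://frinkiac.com/img/S06E12/466386/medium.jpg'
    # image == 'https://frinkiac.com/img/S06E12/466386.jpg'
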
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 3fef102f4..1cc243104 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -10,20 +10,30 @@
@parse url, title, content
"""
-from urllib import urlencode
from cgi import escape
-from lxml import etree
+from json import loads
from random import randint
from time import time
+from urllib import urlencode
# engine dependent config
categories = ['general']
paging = True
-number_of_results = 5
+number_of_results = 10
+language_support = True
+safesearch = True
-# search-url, invalid HTTPS certificate
+# search-url
base_url = 'https://gigablast.com/'
-search_string = 'search?{query}&n={number_of_results}&s={offset}&format=xml&qh=0&rxiyd={rxiyd}&rand={rand}'
+search_string = 'search?{query}'\
+ '&n={number_of_results}'\
+ '&c=main'\
+ '&s={offset}'\
+ '&format=json'\
+ '&qh=0'\
+ '&rxiwd={rxiwd}'\
+ '&qlang={lang}'\
+ '&ff={safesearch}'
# specific xpath variables
results_xpath = '//response//result'
@@ -36,12 +46,23 @@ content_xpath = './/sum'
def request(query, params):
offset = (params['pageno'] - 1) * number_of_results
- search_path = search_string.format(
- query=urlencode({'q': query}),
- offset=offset,
- number_of_results=number_of_results,
- rxiyd=randint(10000, 10000000),
- rand=int(time()))
+ if params['language'] == 'all':
+ language = 'xx'
+ else:
+ language = params['language'][0:2]
+
+ if params['safesearch'] >= 1:
+ safesearch = 1
+ else:
+ safesearch = 0
+
+ search_path = search_string.format(query=urlencode({'q': query}),
+ offset=offset,
+ number_of_results=number_of_results,
+ rxiwd=1,
+ # rand=int(time()),
+ lang=language,
+ safesearch=safesearch)
params['url'] = base_url + search_path
@@ -52,18 +73,14 @@ def request(query, params):
def response(resp):
results = []
- dom = etree.fromstring(resp.content)
-
# parse results
- for result in dom.xpath(results_xpath):
- url = result.xpath(url_xpath)[0].text
- title = result.xpath(title_xpath)[0].text
- content = escape(result.xpath(content_xpath)[0].text)
+ response_json = loads(resp.text)
+ for result in response_json['results']:
# append result
- results.append({'url': url,
- 'title': title,
- 'content': content})
+ results.append({'url': result['url'],
+ 'title': escape(result['title']),
+ 'content': escape(result['sum'])})
# return results
return results
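
The new language and safesearch handling in request() reduces to a small mapping: 'all' becomes Gigablast's wildcard code 'xx', any other locale is cut down to its two-letter prefix, and any safesearch level of 1 or higher turns the ff family filter on. A sketch of just that mapping (the function name is invented for illustration):

    def map_params(params):
        lang = 'xx' if params['language'] == 'all' else params['language'][:2]
        ff = 1 if params['safesearch'] >= 1 else 0
        return lang, ff

    assert map_params({'language': 'fr_FR', 'safesearch': 2}) == ('fr', 1)
    assert map_params({'language': 'all', 'safesearch': 0}) == ('xx', 0)
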
diff --git a/searx/engines/google.py b/searx/engines/google.py
index e82260356..dbca205a1 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -90,7 +90,7 @@ url_map = 'https://www.openstreetmap.org/'\
search_path = '/search'
search_url = ('https://{hostname}' +
search_path +
- '?{query}&start={offset}&gbv=1&gws_rd=cr')
+ '?{query}&start={offset}&gbv=1&gws_rd=ssl')
# other URLs
map_hostname_start = 'maps.google.'
@@ -99,7 +99,7 @@ redirect_path = '/url'
images_path = '/images'
# specific xpath variables
-results_xpath = '//li[@class="g"]'
+results_xpath = '//div[@class="g"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3'
content_xpath = './/span[@class="st"]'
@@ -209,29 +209,29 @@ def response(resp):
parsed_url = urlparse(url, google_hostname)
# map result
- if ((parsed_url.netloc == google_hostname and parsed_url.path.startswith(maps_path))
- or (parsed_url.netloc.startswith(map_hostname_start))):
- x = result.xpath(map_near)
- if len(x) > 0:
- # map : near the location
- results = results + parse_map_near(parsed_url, x, google_hostname)
- else:
- # map : detail about a location
- results = results + parse_map_detail(parsed_url, result, google_hostname)
-
- # google news
- elif (parsed_url.netloc == google_hostname
- and parsed_url.path == search_path):
- # skipping news results
- pass
-
- # images result
- elif (parsed_url.netloc == google_hostname
- and parsed_url.path == images_path):
- # only thumbnail image provided,
- # so skipping image results
- # results = results + parse_images(result, google_hostname)
- pass
+ if parsed_url.netloc == google_hostname:
+ # TODO fix inside links
+ continue
+ # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
+ # print "yooooo"*30
+ # x = result.xpath(map_near)
+ # if len(x) > 0:
+ # # map : near the location
+ # results = results + parse_map_near(parsed_url, x, google_hostname)
+ # else:
+ # # map : detail about a location
+ # results = results + parse_map_detail(parsed_url, result, google_hostname)
+ # # google news
+ # elif parsed_url.path == search_path:
+ # # skipping news results
+ # pass
+
+ # # images result
+ # elif parsed_url.path == images_path:
+ # # only thumbnail image provided,
+ # # so skipping image results
+ # # results = results + parse_images(result, google_hostname)
+ # pass
else:
# normal result
diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py
index 9fb72e830..26d3720d9 100644
--- a/searx/engines/mediawiki.py
+++ b/searx/engines/mediawiki.py
@@ -24,13 +24,13 @@ number_of_results = 1
# search-url
base_url = 'https://{language}.wikipedia.org/'
-search_url = base_url + 'w/api.php?action=query'\
- '&list=search'\
- '&{query}'\
- '&srprop=timestamp'\
- '&format=json'\
- '&sroffset={offset}'\
- '&srlimit={limit}' # noqa
+search_postfix = 'w/api.php?action=query'\
+ '&list=search'\
+ '&{query}'\
+ '&format=json'\
+ '&sroffset={offset}'\
+ '&srlimit={limit}'\
+ '&srwhat=nearmatch' # search for a near match in the title
# do search-request
@@ -48,12 +48,15 @@ def request(query, params):
else:
language = params['language'].split('_')[0]
- if len(format_strings) > 1:
+ # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)]
+ if any(x[1] == 'language' for x in format_strings):
string_args['language'] = language
# write search-language back to params, required in response
params['language'] = language
+ search_url = base_url + search_postfix
+
params['url'] = search_url.format(**string_args)
return params
@@ -71,6 +74,8 @@ def response(resp):
# parse results
for result in search_results['query']['search']:
+ if result.get('snippet', '').startswith('#REDIRECT'):
+ continue
url = base_url.format(language=resp.search_params['language']) +\
'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))
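
The format_strings check leans on string.Formatter().parse, which splits a format string into (literal_text, field_name, format_spec, conversion) tuples; the inline comment above shows the result for base_url. A quick illustration:

    from string import Formatter

    base_url = 'https://{language}.wikipedia.org/'
    format_strings = list(Formatter().parse(base_url))
    # [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)]
    assert any(x[1] == 'language' for x in format_strings)
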
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
index bd5eb71d2..de8cd43be 100644
--- a/searx/engines/searchcode_code.py
+++ b/searx/engines/searchcode_code.py
@@ -20,7 +20,7 @@ paging = True
# search-url
url = 'https://searchcode.com/'
-search_url = url+'api/codesearch_I/?{query}&p={pageno}'
+search_url = url + 'api/codesearch_I/?{query}&p={pageno}'
# special code-endings which are not recognised by the file ending
code_endings = {'cs': 'c#',
@@ -32,7 +32,7 @@ code_endings = {'cs': 'c#',
# do search-request
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
- pageno=params['pageno']-1)
+ pageno=params['pageno'] - 1)
# Disable SSL verification
# error: (60) SSL certificate problem: unable to get local issuer
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
index 9453f31a4..f24fe6f90 100644
--- a/searx/engines/searchcode_doc.py
+++ b/searx/engines/searchcode_doc.py
@@ -19,13 +19,13 @@ paging = True
# search-url
url = 'https://searchcode.com/'
-search_url = url+'api/search_IV/?{query}&p={pageno}'
+search_url = url + 'api/search_IV/?{query}&p={pageno}'
# do search-request
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
- pageno=params['pageno']-1)
+ pageno=params['pageno'] - 1)
# Disable SSL verification
# error: (60) SSL certificate problem: unable to get local issuer
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index 46e17fc81..ac23c1e83 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -10,17 +10,19 @@
@parse url, title, content, publishedDate, embedded
"""
+import re
+from StringIO import StringIO
from json import loads
+from lxml import etree
from urllib import urlencode, quote_plus
from dateutil import parser
+from searx import logger
+from searx.poolrequests import get as http_get
# engine dependent config
categories = ['music']
paging = True
-# api-key
-guest_client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28'
-
# search-url
url = 'https://api.soundcloud.com/'
search_url = url + 'search?{query}'\
@@ -35,6 +37,30 @@ embedded_url = '<iframe width="100%" height="166" ' +\
'data-src="https://w.soundcloud.com/player/?url={uri}"></iframe>'
+def get_client_id():
+ response = http_get("https://soundcloud.com")
+ rx_namespace = {"re": "http://exslt.org/regular-expressions"}
+
+ if response.ok:
+ tree = etree.parse(StringIO(response.content), etree.HTMLParser())
+ script_tags = tree.xpath("//script[re:match(@src, '(.*app.*js)')]", namespaces=rx_namespace)
+ app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None]
+
+ # extracts valid app_js urls from soundcloud.com content
+ for app_js_url in app_js_urls:
+ # gets app_js and searches for the clientid
+ response = http_get(app_js_url)
+ if response.ok:
+ cids = re.search(r'client_id:"([^"]*)"', response.content, re.M | re.I)
+ if cids is not None and len(cids.groups()):
+ return cids.groups()[0]
+ logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
+ return ""
+
+# api-key
+guest_client_id = get_client_id()
+
+
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 20
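
get_client_id replaces the hardcoded API key: it fetches soundcloud.com, collects every script src that looks like an application bundle, and greps each bundle for a client_id literal. The core extraction step in isolation (the JS snippet is invented sample data; the id shown is the old hardcoded key from the removed line):

    import re

    app_js = 'var c={client_id:"b45b1aa10f1ac2941910a7f0d10f8e28",env:"production"};'
    match = re.search(r'client_id:"([^"]*)"', app_js, re.M | re.I)
    if match is not None and len(match.groups()):
        print(match.groups()[0])    # b45b1aa10f1ac2941910a7f0d10f8e28
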
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
index 34ecabae7..fdd3711a9 100644
--- a/searx/engines/stackoverflow.py
+++ b/searx/engines/stackoverflow.py
@@ -22,7 +22,7 @@ paging = True
# search-url
url = 'https://stackoverflow.com/'
-search_url = url+'search?{query}&page={pageno}'
+search_url = url + 'search?{query}&page={pageno}'
# specific xpath variables
results_xpath = '//div[contains(@class,"question-summary")]'
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index a91cafa00..52dd0b92f 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -90,8 +90,8 @@ def response(resp):
# check if search result starts with something like: "2 Sep 2014 ... "
if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
- date_pos = content.find('...')+4
- date_string = content[0:date_pos-5]
+ date_pos = content.find('...') + 4
+ date_string = content[0:date_pos - 5]
published_date = parser.parse(date_string, dayfirst=True)
# fix content string
@@ -99,8 +99,8 @@ def response(resp):
# check if search result starts with something like: "5 days ago ... "
elif re.match("^[0-9]+ days? ago \.\.\. ", content):
- date_pos = content.find('...')+4
- date_string = content[0:date_pos-5]
+ date_pos = content.find('...') + 4
+ date_string = content[0:date_pos - 5]
# calculate datetime
published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
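
Both branches peel the date prefix off content the same way: find the '...' separator, keep everything up to the space before it as the date string, and keep everything after it as the real snippet. With an invented sample snippet:

    content = '2 Sep 2014 ... rest of the result snippet'
    date_pos = content.find('...') + 4       # index just past '... '
    date_string = content[0:date_pos - 5]    # '2 Sep 2014'
    content = content[date_pos:]             # 'rest of the result snippet'
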
diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py
index 2d31264ca..864436a52 100644
--- a/searx/engines/swisscows.py
+++ b/searx/engines/swisscows.py
@@ -10,6 +10,7 @@
@parse url, title, content
"""
+from cgi import escape
from json import loads
from urllib import urlencode, unquote
import re
@@ -77,7 +78,7 @@ def response(resp):
# append result
results.append({'url': result['SourceUrl'],
- 'title': result['Title'],
+ 'title': escape(result['Title']),
'content': '',
'img_src': img_url,
'template': 'images.html'})
@@ -89,8 +90,8 @@ def response(resp):
# append result
results.append({'url': result_url,
- 'title': result_title,
- 'content': result_content})
+ 'title': escape(result_title),
+ 'content': escape(result_content)})
# parse images
for result in json.get('Images', []):
@@ -99,7 +100,7 @@ def response(resp):
# append result
results.append({'url': result['SourceUrl'],
- 'title': result['Title'],
+ 'title': escape(result['Title']),
'content': '',
'img_src': img_url,
'template': 'images.html'})
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index fc840d47c..9f3496b72 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -295,7 +295,7 @@ def get_geolink(claims, propertyName, defaultValue=''):
if precision < 0.0003:
zoom = 19
else:
- zoom = int(15 - precision*8.8322 + precision*precision*0.625447)
+ zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)
url = url_map\
.replace('{latitude}', str(value.get('latitude', 0)))\
@@ -318,6 +318,6 @@ def get_wikilink(result, wikiid):
def get_wiki_firstlanguage(result, wikipatternid):
for k in result.get('sitelinks', {}).keys():
- if k.endswith(wikipatternid) and len(k) == (2+len(wikipatternid)):
+ if k.endswith(wikipatternid) and len(k) == (2 + len(wikipatternid)):
return k[0:2]
return None
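
The zoom expression maps coordinate precision (in degrees) to an OpenStreetMap zoom level with a quadratic fit; only the spacing changed here. A worked value for precision = 1 degree:

    precision = 1.0
    zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)
    # 15 - 8.8322 + 0.625447 == 6.793..., so zoom == 6
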
diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py
new file mode 100644
index 000000000..4526c825f
--- /dev/null
+++ b/searx/engines/wolframalpha_api.py
@@ -0,0 +1,122 @@
+# Wolfram Alpha (Science)
+#
+# @website https://www.wolframalpha.com
+# @provide-api yes (https://api.wolframalpha.com/v2/)
+#
+# @using-api yes
+# @results XML
+# @stable yes
+# @parse url, infobox
+
+from urllib import urlencode
+from lxml import etree
+
+# search-url
+search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}'
+site_url = 'https://www.wolframalpha.com/input/?{query}'
+api_key = '' # defined in settings.yml
+
+# xpath variables
+failure_xpath = '/queryresult[attribute::success="false"]'
+answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext'
+input_xpath = '//pod[starts-with(attribute::id, "Input")]/subpod/plaintext'
+pods_xpath = '//pod'
+subpods_xpath = './subpod'
+pod_id_xpath = './@id'
+pod_title_xpath = './@title'
+plaintext_xpath = './plaintext'
+image_xpath = './img'
+img_src_xpath = './@src'
+img_alt_xpath = './@alt'
+
+# pods to display as image in infobox
+# these pods do return plaintext, but they look better and are more useful as images
+image_pods = {'VisualRepresentation',
+ 'Illustration'}
+
+
+# do search-request
+def request(query, params):
+ params['url'] = search_url.format(query=urlencode({'input': query}),
+ api_key=api_key)
+ params['headers']['Referer'] = site_url.format(query=urlencode({'i': query}))
+
+ return params
+
+
+# replace Private Use Area characters to make text legible
+def replace_pua_chars(text):
+ pua_chars = {u'\uf522': u'\u2192', # right arrow
+ u'\uf7b1': u'\u2115', # set of natural numbers
+ u'\uf7b4': u'\u211a', # set of rational numbers
+ u'\uf7b5': u'\u211d', # set of real numbers
+ u'\uf7bd': u'\u2124', # set of integer numbers
+ u'\uf74c': 'd', # differential
+ u'\uf74d': u'\u212f', # euler's number
+ u'\uf74e': 'i', # imaginary number
+ u'\uf7d9': '='} # equals sign
+
+ for k, v in pua_chars.iteritems():
+ text = text.replace(k, v)
+
+ return text
+
+
+# get response from search-request
+def response(resp):
+ results = []
+
+ search_results = etree.XML(resp.content)
+
+ # return empty array if there are no results
+ if search_results.xpath(failure_xpath):
+ return []
+
+ try:
+ infobox_title = search_results.xpath(input_xpath)[0].text
+ except:
+ infobox_title = None
+
+ pods = search_results.xpath(pods_xpath)
+ result_chunks = []
+ for pod in pods:
+ pod_id = pod.xpath(pod_id_xpath)[0]
+ pod_title = pod.xpath(pod_title_xpath)[0]
+
+ subpods = pod.xpath(subpods_xpath)
+ if not subpods:
+ continue
+
+ # append either text or an image, depending on which is more suitable
+ for subpod in subpods:
+ content = subpod.xpath(plaintext_xpath)[0].text
+ image = subpod.xpath(image_xpath)
+
+ if content and pod_id not in image_pods:
+
+ # if no input pod was found, title is first plaintext pod
+ if not infobox_title:
+ infobox_title = content
+
+ content = replace_pua_chars(content)
+ result_chunks.append({'label': pod_title, 'value': content})
+
+ elif image:
+ result_chunks.append({'label': pod_title,
+ 'image': {'src': image[0].xpath(img_src_xpath)[0],
+ 'alt': image[0].xpath(img_alt_xpath)[0]}})
+
+ if not result_chunks:
+ return []
+
+ # append infobox
+ results.append({'infobox': infobox_title,
+ 'attributes': result_chunks,
+ 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer'].decode('utf8')}]})
+
+ # append link to site
+ results.append({'url': resp.request.headers['Referer'].decode('utf8'),
+ 'title': 'Wolfram|Alpha',
+ 'content': infobox_title})
+
+ return results
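
Wolfram|Alpha's plaintext pods encode math symbols as Private Use Area codepoints, which render as boxes in most fonts; replace_pua_chars maps them back to standard Unicode. For example, using two entries from the table above:

    text = u'x \uf7d9 \uf74d'          # PUA equals sign and euler's number
    text = text.replace(u'\uf7d9', u'=').replace(u'\uf74d', u'\u212f')
    assert text == u'x = \u212f'
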
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
new file mode 100644
index 000000000..59629b833
--- /dev/null
+++ b/searx/engines/wolframalpha_noapi.py
@@ -0,0 +1,116 @@
+# Wolfram|Alpha (Science)
+#
+# @website https://www.wolframalpha.com/
+# @provide-api yes (https://api.wolframalpha.com/v2/)
+#
+# @using-api no
+# @results JSON
+# @stable no
+# @parse url, infobox
+
+from cgi import escape
+from json import loads
+from time import time
+from urllib import urlencode
+from lxml.etree import XML
+
+from searx.poolrequests import get as http_get
+
+# search-url
+url = 'https://www.wolframalpha.com/'
+
+search_url = url + 'input/json.jsp'\
+ '?async=false'\
+ '&banners=raw'\
+ '&debuggingdata=false'\
+ '&format=image,plaintext,imagemap,minput,moutput'\
+ '&formattimeout=2'\
+ '&{query}'\
+ '&output=JSON'\
+ '&parsetimeout=2'\
+ '&proxycode={token}'\
+ '&scantimeout=0.5'\
+ '&sponsorcategories=true'\
+ '&statemethod=deploybutton'
+
+referer_url = url + 'input/?{query}'
+
+token = {'value': '',
+ 'last_updated': None}
+
+# pods to display as image in infobox
+# these pods do return plaintext, but they look better and are more useful as images
+image_pods = {'VisualRepresentation',
+ 'Illustration',
+ 'Symbol'}
+
+
+# wolframalpha seems to reset its token every hour
+def obtain_token():
+ update_time = time() - (time() % 3600)
+ try:
+ token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
+ token['value'] = loads(token_response.text)['code']
+ token['last_updated'] = update_time
+ except:
+ pass
+ return token
+
+
+obtain_token()
+
+
+# do search-request
+def request(query, params):
+ # obtain a new token if the last update was more than an hour ago
+ if time() - token['last_updated'] > 3600:
+ obtain_token()
+ params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
+ params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query}))
+
+ return params
+
+
+# get response from search-request
+def response(resp):
+ results = []
+
+ resp_json = loads(resp.text)
+
+ if not resp_json['queryresult']['success']:
+ return []
+
+ # TODO handle resp_json['queryresult']['assumptions']
+ result_chunks = []
+ infobox_title = None
+ for pod in resp_json['queryresult']['pods']:
+ pod_id = pod.get('id', '')
+ pod_title = pod.get('title', '')
+
+ if 'subpods' not in pod:
+ continue
+
+ if pod_id == 'Input' or not infobox_title:
+ infobox_title = pod['subpods'][0]['plaintext']
+
+ for subpod in pod['subpods']:
+ if subpod['plaintext'] != '' and pod_id not in image_pods:
+ # append only if it is an actual answer
+ if subpod['plaintext'] != '(requires interactivity)':
+ result_chunks.append({'label': pod_title, 'value': subpod['plaintext']})
+
+ elif 'img' in subpod:
+ result_chunks.append({'label': pod_title, 'image': subpod['img']})
+
+ if not result_chunks:
+ return []
+
+ results.append({'infobox': infobox_title,
+ 'attributes': result_chunks,
+ 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer'].decode('utf8')}]})
+
+ results.append({'url': resp.request.headers['Referer'].decode('utf8'),
+ 'title': 'Wolfram|Alpha',
+ 'content': infobox_title})
+
+ return results
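
obtain_token records the start of the current hour rather than the exact fetch time, so the staleness check in request() trips as soon as a new hour (and, apparently, a fresh Wolfram|Alpha token) begins. The bucketing in isolation:

    from time import time

    now = time()
    update_time = now - (now % 3600)              # truncated to the top of the hour
    needs_refresh = time() - update_time > 3600   # True once the next hour starts
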
diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
index ddb79bfea..1269a5422 100644
--- a/searx/engines/www1x.py
+++ b/searx/engines/www1x.py
@@ -22,7 +22,7 @@ paging = False
# search-url
base_url = 'https://1x.com'
-search_url = base_url+'/backend/search.php?{query}'
+search_url = base_url + '/backend/search.php?{query}'
# do search-request
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index 1a599dc0a..f51634be0 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -43,7 +43,7 @@ def extract_url(xpath_results, search_url):
if url.startswith('//'):
# add http or https to this kind of url //example.com/
parsed_search_url = urlparse(search_url)
- url = parsed_search_url.scheme+url
+ url = parsed_search_url.scheme + url
elif url.startswith('/'):
# fix relative url to the search engine
url = urljoin(search_url, url)
@@ -69,7 +69,7 @@ def normalize_url(url):
p = parsed_url.path
mark = p.find('/**')
if mark != -1:
- return unquote(p[mark+3:]).decode('utf-8')
+ return unquote(p[mark + 3:]).decode('utf-8')
return url
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
index edc6ad5f2..be3ec36ce 100644
--- a/searx/engines/yandex.py
+++ b/searx/engines/yandex.py
@@ -9,6 +9,7 @@
@parse url, title, content
"""
+from cgi import escape
from urllib import urlencode
from lxml import html
from searx.search import logger
@@ -38,7 +39,7 @@ content_xpath = './/div[@class="serp-item__text"]//text()'
def request(query, params):
lang = params['language'].split('_')[0]
host = base_url.format(tld=language_map.get(lang) or default_tld)
- params['url'] = host + search_url.format(page=params['pageno']-1,
+ params['url'] = host + search_url.format(page=params['pageno'] - 1,
query=urlencode({'text': query}))
return params
@@ -51,8 +52,8 @@ def response(resp):
for result in dom.xpath(results_xpath):
try:
res = {'url': result.xpath(url_xpath)[0],
- 'title': ''.join(result.xpath(title_xpath)),
- 'content': ''.join(result.xpath(content_xpath))}
+ 'title': escape(''.join(result.xpath(title_xpath))),
+ 'content': escape(''.join(result.xpath(content_xpath)))}
except:
logger.exception('yandex parse crash')
continue