summaryrefslogtreecommitdiff
path: root/searx/search.py
diff options
context:
space:
mode:
Diffstat (limited to 'searx/search.py')
-rw-r--r--searx/search.py133
1 files changed, 76 insertions, 57 deletions
diff --git a/searx/search.py b/searx/search.py
index 5b5cc6200..fbbf3fe41 100644
--- a/searx/search.py
+++ b/searx/search.py
@@ -19,8 +19,9 @@ import requests as requests_lib
import threading
import re
from itertools import izip_longest, chain
-from datetime import datetime
from operator import itemgetter
+from Queue import Queue
+from time import time
from urlparse import urlparse, unquote
from searx.engines import (
categories, engines
@@ -33,82 +34,79 @@ from searx.query import Query
number_of_searches = 0
+def search_request_wrapper(fn, url, engine_name, **kwargs):
+ try:
+ return fn(url, **kwargs)
+ except Exception, e:
+ # increase errors stats
+ engines[engine_name].stats['errors'] += 1
+
+ # print engine name and specific error message
+ print('[E] Error with engine "{0}":\n\t{1}'.format(
+ engine_name, str(e)))
+ return
+
+
def threaded_requests(requests):
- for fn, url, request_args in requests:
+ timeout_limit = max(r[2]['timeout'] for r in requests)
+ search_start = time()
+ for fn, url, request_args, engine_name in requests:
+ request_args['timeout'] = timeout_limit
th = threading.Thread(
- target=fn,
- args=(url,),
+ target=search_request_wrapper,
+ args=(fn, url, engine_name),
kwargs=request_args,
name='search_request',
)
+ th._engine_name = engine_name
th.start()
for th in threading.enumerate():
if th.name == 'search_request':
- th.join()
+ remaining_time = max(0.0, timeout_limit - (time() - search_start))
+ th.join(remaining_time)
+ if th.isAlive():
+ print('engine timeout: {0}'.format(th._engine_name))
# get default reqest parameter
def default_request_params():
return {
- 'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
+ 'method': 'GET',
+ 'headers': {},
+ 'data': {},
+ 'url': '',
+ 'cookies': {},
+ 'verify': True
+ }
# create a callback wrapper for the search engine results
-def make_callback(engine_name,
- results,
- suggestions,
- answers,
- infoboxes,
- callback,
- params):
+def make_callback(engine_name, results_queue, callback, params):
# creating a callback wrapper for the search engine results
def process_callback(response, **kwargs):
- cb_res = []
response.search_params = params
- # callback
- try:
- search_results = callback(response)
- except Exception, e:
- # increase errors stats
+ timeout_overhead = 0.2 # seconds
+ search_duration = time() - params['started']
+ timeout_limit = engines[engine_name].timeout + timeout_overhead
+ if search_duration > timeout_limit:
+ engines[engine_name].stats['page_load_time'] += timeout_limit
engines[engine_name].stats['errors'] += 1
- results[engine_name] = cb_res
-
- # print engine name and specific error message
- print '[E] Error with engine "{0}":\n\t{1}'.format(
- engine_name, str(e))
return
+ # callback
+ search_results = callback(response)
+
# add results
for result in search_results:
result['engine'] = engine_name
- # if it is a suggestion, add it to list of suggestions
- if 'suggestion' in result:
- # TODO type checks
- suggestions.add(result['suggestion'])
- continue
-
- # if it is an answer, add it to list of answers
- if 'answer' in result:
- answers.add(result['answer'])
- continue
-
- # if it is an infobox, add it to list of infoboxes
- if 'infobox' in result:
- infoboxes.append(result)
- continue
-
- # append result
- cb_res.append(result)
-
- results[engine_name] = cb_res
+ results_queue.put_nowait((engine_name, search_results))
# update stats with current page-load-time
- engines[engine_name].stats['page_load_time'] += \
- (datetime.now() - params['started']).total_seconds()
+ engines[engine_name].stats['page_load_time'] += search_duration
return process_callback
@@ -420,6 +418,7 @@ class Search(object):
# init vars
requests = []
+ results_queue = Queue()
results = {}
suggestions = set()
answers = set()
@@ -452,14 +451,13 @@ class Search(object):
request_params = default_request_params()
request_params['headers']['User-Agent'] = user_agent
request_params['category'] = selected_engine['category']
- request_params['started'] = datetime.now()
+ request_params['started'] = time()
request_params['pageno'] = self.pageno
request_params['language'] = self.lang
# update request parameters dependent on
# search-engine (contained in engines folder)
- request_params = engine.request(self.query.encode('utf-8'),
- request_params)
+ engine.request(self.query.encode('utf-8'), request_params)
if request_params['url'] is None:
# TODO add support of offline engines
@@ -468,13 +466,9 @@ class Search(object):
# create a callback wrapper for the search engine results
callback = make_callback(
selected_engine['name'],
- results,
- suggestions,
- answers,
- infoboxes,
+ results_queue,
engine.response,
- request_params
- )
+ request_params)
# create dictionary which contain all
# informations about the request
@@ -482,7 +476,8 @@ class Search(object):
headers=request_params['headers'],
hooks=dict(response=callback),
cookies=request_params['cookies'],
- timeout=engine.timeout
+ timeout=engine.timeout,
+ verify=request_params['verify']
)
# specific type of request (GET or POST)
@@ -497,11 +492,35 @@ class Search(object):
continue
# append request to list
- requests.append((req, request_params['url'], request_args))
+ requests.append((req, request_params['url'],
+ request_args,
+ selected_engine['name']))
+ if not requests:
+ return results, suggestions, answers, infoboxes
# send all search-request
threaded_requests(requests)
+ while not results_queue.empty():
+ engine_name, engine_results = results_queue.get_nowait()
+
+ # TODO type checks
+ [suggestions.add(x['suggestion'])
+ for x in list(engine_results)
+ if 'suggestion' in x
+ and engine_results.remove(x) is None]
+
+ [answers.add(x['answer'])
+ for x in list(engine_results)
+ if 'answer' in x
+ and engine_results.remove(x) is None]
+
+ infoboxes.extend(x for x in list(engine_results)
+ if 'infobox' in x
+ and engine_results.remove(x) is None)
+
+ results[engine_name] = engine_results
+
# update engine-specific stats
for engine_name, engine_results in results.items():
engines[engine_name].stats['search_count'] += 1