summary refs log tree commit diff
path: root/searx/results.py
diff options
context:
space:
mode:
Diffstat (limited to 'searx/results.py')
-rw-r--r-- searx/results.py 110
1 files changed, 74 insertions, 36 deletions
diff --git a/searx/results.py b/searx/results.py
index cb204a682..3b1e4bd62 100644
--- a/searx/results.py
+++ b/searx/results.py
@@ -67,8 +67,9 @@ def merge_two_infoboxes(infobox1, infobox2):
for url2 in infobox2.get('urls', []):
unique_url = True
- for url1 in infobox1.get('urls', []):
- if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
+ parsed_url2 = urlparse(url2.get('url', ''))
+ for url1 in urls1:
+ if compare_urls(urlparse(url1.get('url', '')), parsed_url2):
unique_url = False
break
if unique_url:
@@ -136,6 +137,7 @@ class ResultContainer(object):
self._ordered = False
self.paging = False
self.unresponsive_engines = set()
+ self.timings = []
def extend(self, engine_name, results):
for result in list(results):
@@ -187,8 +189,9 @@ class ResultContainer(object):
add_infobox = True
infobox_id = infobox.get('id', None)
if infobox_id is not None:
+ parsed_url_infobox_id = urlparse(infobox_id)
for existingIndex in self.infoboxes:
- if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
+ if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id):
merge_two_infoboxes(existingIndex, infobox)
add_infobox = False
@@ -196,6 +199,13 @@ class ResultContainer(object):
self.infoboxes.append(infobox)
def _merge_result(self, result, position):
+ if 'url' in result:
+ self.__merge_url_result(result, position)
+ return
+
+ self.__merge_result_no_url(result, position)
+
+ def __merge_url_result(self, result, position):
result['parsed_url'] = urlparse(result['url'])
# if the result has no scheme, use http as default
@@ -209,42 +219,60 @@ class ResultContainer(object):
if result.get('content'):
result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
- # check for duplicates
- duplicated = False
- for merged_result in self._merged_results:
- if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
- and result.get('template') == merged_result.get('template'):
- duplicated = merged_result
- break
-
- # merge duplicates together
+ duplicated = self.__find_duplicated_http_result(result)
if duplicated:
- # using content with more text
- if result_content_len(result.get('content', '')) >\
- result_content_len(duplicated.get('content', '')):
- duplicated['content'] = result['content']
-
- # merge all result's parameters not found in duplicate
- for key in result.keys():
- if not duplicated.get(key):
- duplicated[key] = result.get(key)
-
- # add the new position
- duplicated['positions'].append(position)
-
- # add engine to list of result-engines
- duplicated['engines'].add(result['engine'])
-
- # using https if possible
- if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
- duplicated['url'] = result['parsed_url'].geturl()
- duplicated['parsed_url'] = result['parsed_url']
+ self.__merge_duplicated_http_result(duplicated, result, position)
+ return
# if there is no duplicate found, append result
- else:
- result['positions'] = [position]
- with RLock():
- self._merged_results.append(result)
+ result['positions'] = [position]
+ with RLock():
+ self._merged_results.append(result)
+
+ def __find_duplicated_http_result(self, result):
+ result_template = result.get('template')
+ for merged_result in self._merged_results:
+ if 'parsed_url' not in merged_result:
+ continue
+ if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
+ and result_template == merged_result.get('template'):
+ if result_template != 'images.html':
+ # not an image, same template, same url : it's a duplicate
+ return merged_result
+ else:
+ # it's an image
+ # it's a duplicate only if the parsed_url, template and img_src are all the same
+ if result.get('img_src', '') == merged_result.get('img_src', ''):
+ return merged_result
+ return None
+
+ def __merge_duplicated_http_result(self, duplicated, result, position):
+ # using content with more text
+ if result_content_len(result.get('content', '')) >\
+ result_content_len(duplicated.get('content', '')):
+ duplicated['content'] = result['content']
+
+ # merge all result's parameters not found in duplicate
+ for key in result.keys():
+ if not duplicated.get(key):
+ duplicated[key] = result.get(key)
+
+ # add the new position
+ duplicated['positions'].append(position)
+
+ # add engine to list of result-engines
+ duplicated['engines'].add(result['engine'])
+
+ # using https if possible
+ if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
+ duplicated['url'] = result['parsed_url'].geturl()
+ duplicated['parsed_url'] = result['parsed_url']
+
+ def __merge_result_no_url(self, result, position):
+ result['engines'] = set([result['engine']])
+ result['positions'] = [position]
+ with RLock():
+ self._merged_results.append(result)
def order_results(self):
for result in self._merged_results:
@@ -319,3 +347,13 @@ class ResultContainer(object):
def add_unresponsive_engine(self, engine_error):
self.unresponsive_engines.add(engine_error)
+
+ def add_timing(self, engine_name, engine_time, page_load_time):
+ self.timings.append({
+ 'engine': engines[engine_name].shortcut,
+ 'total': engine_time,
+ 'load': page_load_time
+ })
+
+ def get_timings(self):
+ return self.timings