diff options
Diffstat (limited to 'searx/results.py')
| -rw-r--r-- | searx/results.py | 110 |
1 files changed, 74 insertions, 36 deletions
diff --git a/searx/results.py b/searx/results.py index cb204a682..3b1e4bd62 100644 --- a/searx/results.py +++ b/searx/results.py @@ -67,8 +67,9 @@ def merge_two_infoboxes(infobox1, infobox2): for url2 in infobox2.get('urls', []): unique_url = True - for url1 in infobox1.get('urls', []): - if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))): + parsed_url2 = urlparse(url2.get('url', '')) + for url1 in urls1: + if compare_urls(urlparse(url1.get('url', '')), parsed_url2): unique_url = False break if unique_url: @@ -136,6 +137,7 @@ class ResultContainer(object): self._ordered = False self.paging = False self.unresponsive_engines = set() + self.timings = [] def extend(self, engine_name, results): for result in list(results): @@ -187,8 +189,9 @@ class ResultContainer(object): add_infobox = True infobox_id = infobox.get('id', None) if infobox_id is not None: + parsed_url_infobox_id = urlparse(infobox_id) for existingIndex in self.infoboxes: - if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)): + if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id): merge_two_infoboxes(existingIndex, infobox) add_infobox = False @@ -196,6 +199,13 @@ class ResultContainer(object): self.infoboxes.append(infobox) def _merge_result(self, result, position): + if 'url' in result: + self.__merge_url_result(result, position) + return + + self.__merge_result_no_url(result, position) + + def __merge_url_result(self, result, position): result['parsed_url'] = urlparse(result['url']) # if the result has no scheme, use http as default @@ -209,42 +219,60 @@ class ResultContainer(object): if result.get('content'): result['content'] = WHITESPACE_REGEX.sub(' ', result['content']) - # check for duplicates - duplicated = False - for merged_result in self._merged_results: - if compare_urls(result['parsed_url'], merged_result['parsed_url'])\ - and result.get('template') == merged_result.get('template'): - duplicated = merged_result - break - - # merge duplicates together + duplicated = self.__find_duplicated_http_result(result) if duplicated: - # using content with more text - if result_content_len(result.get('content', '')) >\ - result_content_len(duplicated.get('content', '')): - duplicated['content'] = result['content'] - - # merge all result's parameters not found in duplicate - for key in result.keys(): - if not duplicated.get(key): - duplicated[key] = result.get(key) - - # add the new position - duplicated['positions'].append(position) - - # add engine to list of result-engines - duplicated['engines'].add(result['engine']) - - # using https if possible - if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https': - duplicated['url'] = result['parsed_url'].geturl() - duplicated['parsed_url'] = result['parsed_url'] + self.__merge_duplicated_http_result(duplicated, result, position) + return # if there is no duplicate found, append result - else: - result['positions'] = [position] - with RLock(): - self._merged_results.append(result) + result['positions'] = [position] + with RLock(): + self._merged_results.append(result) + + def __find_duplicated_http_result(self, result): + result_template = result.get('template') + for merged_result in self._merged_results: + if 'parsed_url' not in merged_result: + continue + if compare_urls(result['parsed_url'], merged_result['parsed_url'])\ + and result_template == merged_result.get('template'): + if result_template != 'images.html': + # not an image, same template, same url : it's a duplicate + return merged_result + else: + # it's an image + # it's a duplicate if the parsed_url, template and img_src are differents + if result.get('img_src', '') == merged_result.get('img_src', ''): + return merged_result + return None + + def __merge_duplicated_http_result(self, duplicated, result, position): + # using content with more text + if result_content_len(result.get('content', '')) >\ + result_content_len(duplicated.get('content', '')): + duplicated['content'] = result['content'] + + # merge all result's parameters not found in duplicate + for key in result.keys(): + if not duplicated.get(key): + duplicated[key] = result.get(key) + + # add the new position + duplicated['positions'].append(position) + + # add engine to list of result-engines + duplicated['engines'].add(result['engine']) + + # using https if possible + if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https': + duplicated['url'] = result['parsed_url'].geturl() + duplicated['parsed_url'] = result['parsed_url'] + + def __merge_result_no_url(self, result, position): + result['engines'] = set([result['engine']]) + result['positions'] = [position] + with RLock(): + self._merged_results.append(result) def order_results(self): for result in self._merged_results: @@ -319,3 +347,13 @@ class ResultContainer(object): def add_unresponsive_engine(self, engine_error): self.unresponsive_engines.add(engine_error) + + def add_timing(self, engine_name, engine_time, page_load_time): + self.timings.append({ + 'engine': engines[engine_name].shortcut, + 'total': engine_time, + 'load': page_load_time + }) + + def get_timings(self): + return self.timings |