diff options
| author | Markus Heiser <markus.heiser@darmarIT.de> | 2019-12-24 13:33:07 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2019-12-24 13:33:07 +0100 |
| commit | fb668e2075484084a1f7a9b205ecbe7957ea5e8e (patch) | |
| tree | c6f2e83d9d222d69d79348faac342c07c32dbbf3 /searx/results.py | |
| parent | f407dd8ef4e3f6c82bef31f678139d6db2a4d810 (diff) | |
| parent | 6d232e9b695c2553b7594efe00c4f63aa96fc62d (diff) | |
Merge branch 'master' into libgen
Diffstat (limited to 'searx/results.py')
| -rw-r--r-- | searx/results.py | 82 |
1 files changed, 50 insertions, 32 deletions
diff --git a/searx/results.py b/searx/results.py index be74a836b..3b1e4bd62 100644 --- a/searx/results.py +++ b/searx/results.py @@ -67,8 +67,9 @@ def merge_two_infoboxes(infobox1, infobox2): for url2 in infobox2.get('urls', []): unique_url = True - for url1 in infobox1.get('urls', []): - if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))): + parsed_url2 = urlparse(url2.get('url', '')) + for url1 in urls1: + if compare_urls(urlparse(url1.get('url', '')), parsed_url2): unique_url = False break if unique_url: @@ -188,8 +189,9 @@ class ResultContainer(object): add_infobox = True infobox_id = infobox.get('id', None) if infobox_id is not None: + parsed_url_infobox_id = urlparse(infobox_id) for existingIndex in self.infoboxes: - if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)): + if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id): merge_two_infoboxes(existingIndex, infobox) add_infobox = False @@ -197,6 +199,13 @@ class ResultContainer(object): self.infoboxes.append(infobox) def _merge_result(self, result, position): + if 'url' in result: + self.__merge_url_result(result, position) + return + + self.__merge_result_no_url(result, position) + + def __merge_url_result(self, result, position): result['parsed_url'] = urlparse(result['url']) # if the result has no scheme, use http as default @@ -210,51 +219,60 @@ class ResultContainer(object): if result.get('content'): result['content'] = WHITESPACE_REGEX.sub(' ', result['content']) - # check for duplicates - duplicated = False + duplicated = self.__find_duplicated_http_result(result) + if duplicated: + self.__merge_duplicated_http_result(duplicated, result, position) + return + + # if there is no duplicate found, append result + result['positions'] = [position] + with RLock(): + self._merged_results.append(result) + + def __find_duplicated_http_result(self, result): result_template = result.get('template') for merged_result in self._merged_results: + if 'parsed_url' not in merged_result: + continue if compare_urls(result['parsed_url'], merged_result['parsed_url'])\ and result_template == merged_result.get('template'): if result_template != 'images.html': # not an image, same template, same url : it's a duplicate - duplicated = merged_result - break + return merged_result else: # it's an image # it's a duplicate if the parsed_url, template and img_src are differents if result.get('img_src', '') == merged_result.get('img_src', ''): - duplicated = merged_result - break + return merged_result + return None - # merge duplicates together - if duplicated: - # using content with more text - if result_content_len(result.get('content', '')) >\ - result_content_len(duplicated.get('content', '')): - duplicated['content'] = result['content'] + def __merge_duplicated_http_result(self, duplicated, result, position): + # using content with more text + if result_content_len(result.get('content', '')) >\ + result_content_len(duplicated.get('content', '')): + duplicated['content'] = result['content'] - # merge all result's parameters not found in duplicate - for key in result.keys(): - if not duplicated.get(key): - duplicated[key] = result.get(key) + # merge all result's parameters not found in duplicate + for key in result.keys(): + if not duplicated.get(key): + duplicated[key] = result.get(key) - # add the new position - duplicated['positions'].append(position) + # add the new position + duplicated['positions'].append(position) - # add engine to list of result-engines - duplicated['engines'].add(result['engine']) + # add engine to list of result-engines + duplicated['engines'].add(result['engine']) - # using https if possible - if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https': - duplicated['url'] = result['parsed_url'].geturl() - duplicated['parsed_url'] = result['parsed_url'] + # using https if possible + if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https': + duplicated['url'] = result['parsed_url'].geturl() + duplicated['parsed_url'] = result['parsed_url'] - # if there is no duplicate found, append result - else: - result['positions'] = [position] - with RLock(): - self._merged_results.append(result) + def __merge_result_no_url(self, result, position): + result['engines'] = set([result['engine']]) + result['positions'] = [position] + with RLock(): + self._merged_results.append(result) def order_results(self): for result in self._merged_results: |