diff options
| author | asciimoo <asciimoo@gmail.com> | 2013-10-19 17:36:44 +0200 |
|---|---|---|
| committer | asciimoo <asciimoo@gmail.com> | 2013-10-19 17:36:44 +0200 |
| commit | 70cbc09e9390d02686882786c20c201b3a08edef (patch) | |
| tree | 972e5ea6678470a3a326a7445b599c227a7bf86c | |
| parent | 34941aca3f3c9e204309dbf0426b932e35412238 (diff) | |
[enh] better url comparison
| -rw-r--r-- | searx/engines/__init__.py | 10 |
1 files changed, 9 insertions, 1 deletions
```diff
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index cdf667e3c..078188cd4 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -22,6 +22,7 @@ from imp import load_source
 import grequests
 from itertools import izip_longest, chain
 from operator import itemgetter
+from urlparse import urlparse
 
 engine_dir = dirname(realpath(__file__))
 
@@ -87,16 +88,23 @@ def search(query, request, selected_engines):
     results = []
     # deduplication + scoring
     for i,res in enumerate(flat_res):
+        res['parsed_url'] = urlparse(res['url'])
         score = flat_len - i
         duplicated = False
         for new_res in results:
-            if res['url'] == new_res['url']:
+            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
+               res['parsed_url'].path == new_res['parsed_url'].path:
                 duplicated = new_res
                 break
         if duplicated:
             if len(res.get('content', '')) > len(duplicated.get('content', '')):
                 duplicated['content'] = res['content']
             duplicated['score'] += score
+            if duplicated['parsed_url'].scheme == 'https':
+                continue
+            elif res['parsed_url'].scheme == 'https':
+                duplicated['parsed_url'].scheme == 'https'
+                duplicated['url'] = duplicated['parsed_url'].geturl()
         else:
             res['score'] = score
             results.append(res)
```