diff options
Diffstat (limited to 'searx/search.py')
| -rw-r--r-- | searx/search.py | 184 |
1 file changed, 116 insertions, 68 deletions
diff --git a/searx/search.py b/searx/search.py index c861a795a..064c68844 100644 --- a/searx/search.py +++ b/searx/search.py @@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >. ''' import grequests +import re from itertools import izip_longest, chain from datetime import datetime from operator import itemgetter @@ -25,6 +26,7 @@ from searx.engines import ( ) from searx.languages import language_codes from searx.utils import gen_useragent +from searx.query import Query number_of_searches = 0 @@ -37,17 +39,14 @@ def default_request_params(): # create a callback wrapper for the search engine results -def make_callback(engine_name, results, suggestions, callback, params): +def make_callback(engine_name, results, suggestions, answers, infoboxes, callback, params): # creating a callback wrapper for the search engine results def process_callback(response, **kwargs): cb_res = [] response.search_params = params - # update stats with current page-load-time - engines[engine_name].stats['page_load_time'] += \ - (datetime.now() - params['started']).total_seconds() - + # callback try: search_results = callback(response) except Exception, e: @@ -60,6 +59,7 @@ def make_callback(engine_name, results, suggestions, callback, params): engine_name, str(e)) return + # add results for result in search_results: result['engine'] = engine_name @@ -69,14 +69,37 @@ def make_callback(engine_name, results, suggestions, callback, params): suggestions.add(result['suggestion']) continue + # if it is an answer, add it to list of answers + if 'answer' in result: + answers.add(result['answer']) + continue + + # if it is an infobox, add it to list of infoboxes + if 'infobox' in result: + infoboxes.append(result) + continue + # append result cb_res.append(result) results[engine_name] = cb_res + # update stats with current page-load-time + engines[engine_name].stats['page_load_time'] += \ + (datetime.now() - params['started']).total_seconds() + return process_callback +# 
return the meaningful length of the content for a result +def content_result_len(content): + if isinstance(content, basestring): + content = re.sub('[,;:!?\./\\\\ ()-_]', '', content) + return len(content) + else: + return 0 + + # score results and remove duplications def score_results(results): # calculate scoring parameters @@ -98,8 +121,13 @@ def score_results(results): res['host'] = res['host'].replace('www.', '', 1) res['engines'] = [res['engine']] + weight = 1.0 + # strip multiple spaces and cariage returns from content + if 'content' in res: + res['content'] = re.sub(' +', ' ', res['content'].strip().replace('\n', '')) + # get weight of this engine if possible if hasattr(engines[res['engine']], 'weight'): weight = float(engines[res['engine']].weight) @@ -107,9 +135,8 @@ def score_results(results): # calculate score for that engine score = int((flat_len - i) / engines_len) * weight + 1 - duplicated = False - # check for duplicates + duplicated = False for new_res in results: # remove / from the end of the url if required p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa @@ -126,7 +153,7 @@ def score_results(results): # merge duplicates together if duplicated: # using content with more text - if res.get('content') > duplicated.get('content'): + if content_result_len(res.get('content', '')) > content_result_len(duplicated.get('content', '')): duplicated['content'] = res['content'] # increase result-score @@ -185,6 +212,64 @@ def score_results(results): return gresults +def merge_two_infoboxes(infobox1, infobox2): + if 'urls' in infobox2: + urls1 = infobox1.get('urls', None) + if urls1 == None: + urls1 = [] + infobox1.set('urls', urls1) + + urlSet = set() + for url in infobox1.get('urls', []): + urlSet.add(url.get('url', None)) + + for url in infobox2.get('urls', []): + if url.get('url', None) not in urlSet: + urls1.append(url) + + if 'attributes' in infobox2: + attributes1 = infobox1.get('attributes', 
None) + if attributes1 == None: + attributes1 = [] + infobox1.set('attributes', attributes1) + + attributeSet = set() + for attribute in infobox1.get('attributes', []): + if attribute.get('label', None) not in attributeSet: + attributeSet.add(attribute.get('label', None)) + + for attribute in infobox2.get('attributes', []): + attributes1.append(attribute) + + if 'content' in infobox2: + content1 = infobox1.get('content', None) + content2 = infobox2.get('content', '') + if content1 != None: + if content_result_len(content2) > content_result_len(content1): + infobox1['content'] = content2 + else: + infobox1.set('content', content2) + + +def merge_infoboxes(infoboxes): + results = [] + infoboxes_id = {} + for infobox in infoboxes: + add_infobox = True + infobox_id = infobox.get('id', None) + if infobox_id != None: + existingIndex = infoboxes_id.get(infobox_id, None) + if existingIndex != None: + merge_two_infoboxes(results[existingIndex], infobox) + add_infobox=False + + if add_infobox: + results.append(infobox) + infoboxes_id[infobox_id] = len(results)-1 + + return results + + class Search(object): """Search information container""" @@ -207,6 +292,8 @@ class Search(object): self.results = [] self.suggestions = [] + self.answers = [] + self.infoboxes = [] self.request_data = {} # set specific language if set @@ -224,9 +311,6 @@ class Search(object): if not self.request_data.get('q'): raise Exception('noquery') - # set query - self.query = self.request_data['q'] - # set pagenumber pageno_param = self.request_data.get('pageno', '1') if not pageno_param.isdigit() or int(pageno_param) < 1: @@ -235,7 +319,18 @@ class Search(object): self.pageno = int(pageno_param) # parse query, if tags are set, which change the serch engine or search-language - self.parse_query() + query_obj = Query(self.request_data['q'], self.blocked_engines) + query_obj.parse_query() + + # set query + self.query = query_obj.getSearchQuery() + + # get last selected language in query, if possible + # 
TODO support search with multible languages + if len(query_obj.languages): + self.lang = query_obj.languages[-1] + + self.engines = query_obj.engines self.categories = [] @@ -276,60 +371,6 @@ class Search(object): for x in categories[categ] if not x.name in self.blocked_engines) - # parse query, if tags are set, which change the serch engine or search-language - def parse_query(self): - query_parts = self.query.split() - modified = False - - # check if language-prefix is set - if query_parts[0].startswith(':'): - lang = query_parts[0][1:].lower() - - # check if any language-code is equal with declared language-codes - for lc in language_codes: - lang_id, lang_name, country = map(str.lower, lc) - - # if correct language-code is found, set it as new search-language - if lang == lang_id\ - or lang_id.startswith(lang)\ - or lang == lang_name\ - or lang == country: - self.lang = lang - modified = True - break - - # check if category/engine prefix is set - elif query_parts[0].startswith('!'): - prefix = query_parts[0][1:].replace('_', ' ') - - # check if prefix is equal with engine shortcut - if prefix in engine_shortcuts\ - and not engine_shortcuts[prefix] in self.blocked_engines: - modified = True - self.engines.append({'category': 'none', - 'name': engine_shortcuts[prefix]}) - - # check if prefix is equal with engine name - elif prefix in engines\ - and not prefix in self.blocked_engines: - modified = True - self.engines.append({'category': 'none', - 'name': prefix}) - - # check if prefix is equal with categorie name - elif prefix in categories: - modified = True - # using all engines for that search, which are declared under that categorie name - self.engines.extend({'category': prefix, - 'name': engine.name} - for engine in categories[prefix] - if not engine in self.blocked_engines) - - # if language, category or engine were specificed in this query, search for more tags which does the same - if modified: - self.query = self.query.replace(query_parts[0], '', 
1).strip() - self.parse_query() - # do search-request def search(self, request): global number_of_searches @@ -338,6 +379,8 @@ class Search(object): requests = [] results = {} suggestions = set() + answers = set() + infoboxes = [] # increase number of searches number_of_searches += 1 @@ -382,6 +425,8 @@ class Search(object): selected_engine['name'], results, suggestions, + answers, + infoboxes, engine.response, request_params ) @@ -419,11 +464,14 @@ class Search(object): # score results and remove duplications results = score_results(results) + # merge infoboxes according to their ids + infoboxes = merge_infoboxes(infoboxes) + # update engine stats, using calculated score for result in results: for res_engine in result['engines']: engines[result['engine']]\ .stats['score_count'] += result['score'] - # return results and suggestions - return results, suggestions + # return results, suggestions, answers and infoboxes + return results, suggestions, answers, infoboxes |