Diffstat (limited to 'searx/search.py')
 searx/search.py | 184
 1 file changed, 116 insertions(+), 68 deletions(-)
diff --git a/searx/search.py b/searx/search.py
index c861a795a..064c68844 100644
--- a/searx/search.py
+++ b/searx/search.py
@@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
'''
import grequests
+import re
from itertools import izip_longest, chain
from datetime import datetime
from operator import itemgetter
@@ -25,6 +26,7 @@ from searx.engines import (
)
from searx.languages import language_codes
from searx.utils import gen_useragent
+from searx.query import Query
number_of_searches = 0
@@ -37,17 +39,14 @@ def default_request_params():
# create a callback wrapper for the search engine results
-def make_callback(engine_name, results, suggestions, callback, params):
+def make_callback(engine_name, results, suggestions, answers, infoboxes, callback, params):
# creating a callback wrapper for the search engine results
def process_callback(response, **kwargs):
cb_res = []
response.search_params = params
- # update stats with current page-load-time
- engines[engine_name].stats['page_load_time'] += \
- (datetime.now() - params['started']).total_seconds()
-
+ # call the engine-specific response parser
try:
search_results = callback(response)
except Exception, e:
@@ -60,6 +59,7 @@ def make_callback(engine_name, results, suggestions, callback, params):
engine_name, str(e))
return
+ # add results
for result in search_results:
result['engine'] = engine_name
@@ -69,14 +69,37 @@ def make_callback(engine_name, results, suggestions, callback, params):
suggestions.add(result['suggestion'])
continue
+ # if it is an answer, add it to list of answers
+ if 'answer' in result:
+ answers.add(result['answer'])
+ continue
+
+ # if it is an infobox, add it to list of infoboxes
+ if 'infobox' in result:
+ infoboxes.append(result)
+ continue
+
# append result
cb_res.append(result)
results[engine_name] = cb_res
+ # update stats with current page-load-time
+ engines[engine_name].stats['page_load_time'] += \
+ (datetime.now() - params['started']).total_seconds()
+
return process_callback
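The callback now routes each engine result into one of four buckets, and the page-load-time accounting has moved after parsing, so parse time is charged to the engine's stats. A minimal sketch of the routing, with invented result dicts (nothing below comes from a real engine):

    results, suggestions, answers, infoboxes = {}, set(), set(), []

    search_results = [
        {'suggestion': 'free software'},              # suggestion
        {'answer': '42'},                             # direct answer
        {'infobox': 'GNU', 'id': 'https://gnu.org'},  # kept for merging
        {'url': 'https://gnu.org', 'title': 'GNU'},   # ordinary result
    ]

    cb_res = []
    for result in search_results:
        result['engine'] = 'example'
        if 'suggestion' in result:
            suggestions.add(result['suggestion'])
        elif 'answer' in result:
            answers.add(result['answer'])
        elif 'infobox' in result:
            infoboxes.append(result)
        else:
            cb_res.append(result)
    results['example'] = cb_res   # only ordinary results are scored later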
+# return the meaningful length of the content for a result
+def content_result_len(content):
+ if isinstance(content, basestring):
+ # strip punctuation; '-' must be last in the class to stay a literal, not a range
+ content = re.sub('[,;:!?\./\\\\ ()_-]', '', content)
+ return len(content)
+ else:
+ return 0
+
+
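content_result_len is what the duplicate merge below uses to pick the better snippet, so that punctuation-only content never beats a real sentence. A quick illustration, assuming the corrected character class above:

    >>> content_result_len('Hello, world!')
    10
    >>> content_result_len('... - ;;;')
    0
    >>> content_result_len(None)
    0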
# score results and remove duplications
def score_results(results):
# calculate scoring parameters
@@ -98,8 +121,13 @@ def score_results(results):
res['host'] = res['host'].replace('www.', '', 1)
res['engines'] = [res['engine']]
+
weight = 1.0
+ # collapse runs of spaces and strip newlines from content
+ if 'content' in res:
+ res['content'] = re.sub(' +', ' ', res['content'].strip().replace('\n', ' '))
+
# get weight of this engine if possible
if hasattr(engines[res['engine']], 'weight'):
weight = float(engines[res['engine']].weight)
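This normalization keeps result snippets on a single line; a quick interactive illustration (assuming the replace-newline-with-space variant above):

    >>> import re
    >>> raw = '  first  line\nsecond   line '
    >>> re.sub(' +', ' ', raw.strip().replace('\n', ' '))
    'first line second line'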
@@ -107,9 +135,8 @@ def score_results(results):
# calculate score for that engine
score = int((flat_len - i) / engines_len) * weight + 1
- duplicated = False
-
# check for duplicates
+ duplicated = False
for new_res in results:
# remove / from the end of the url if required
p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
@@ -126,7 +153,7 @@ def score_results(results):
# merge duplicates together
if duplicated:
# using content with more text
- if res.get('content') > duplicated.get('content'):
+ if content_result_len(res.get('content', '')) > content_result_len(duplicated.get('content', '')):
duplicated['content'] = res['content']
# increase result-score
@@ -185,6 +212,64 @@ def score_results(results):
return gresults
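score_results interleaves the per-engine lists (hence the izip_longest import above) and scores by position, so earlier results score higher. A worked example with assumed numbers, 3 engines and 30 interleaved results; note the Python 2 integer division inside int():

    flat_len, engines_len, weight = 30, 3, 1.0
    int((flat_len - 0) / engines_len) * weight + 1    # first result: 11.0
    int((flat_len - 29) / engines_len) * weight + 1   # last result: 1.0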
+def merge_two_infoboxes(infobox1, infobox2):
+ if 'urls' in infobox2:
+ urls1 = infobox1.get('urls', None)
+ if urls1 is None:
+ urls1 = []
+ infobox1['urls'] = urls1
+
+ urlSet = set()
+ for url in infobox1.get('urls', []):
+ urlSet.add(url.get('url', None))
+
+ for url in infobox2.get('urls', []):
+ if url.get('url', None) not in urlSet:
+ urls1.append(url)
+
+ if 'attributes' in infobox2:
+ attributes1 = infobox1.get('attributes', None)
+ if attributes1 is None:
+ attributes1 = []
+ infobox1['attributes'] = attributes1
+
+ attributeSet = set()
+ for attribute in infobox1.get('attributes', []):
+ attributeSet.add(attribute.get('label', None))
+
+ # append only attributes whose label is not already present
+ for attribute in infobox2.get('attributes', []):
+ if attribute.get('label', None) not in attributeSet:
+ attributeSet.add(attribute.get('label', None))
+ attributes1.append(attribute)
+
+ if 'content' in infobox2:
+ content1 = infobox1.get('content', None)
+ content2 = infobox2.get('content', '')
+ if content1 is not None:
+ if content_result_len(content2) > content_result_len(content1):
+ infobox1['content'] = content2
+ else:
+ infobox1['content'] = content2
+
+
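Taken together, the merge rules are: union urls by 'url', union attributes by 'label', and keep the longer 'content'. A small illustration with invented infobox dicts (assumes the corrected item assignments above):

    box1 = {'id': 'https://gnu.org', 'content': 'GNU',
            'urls': [{'url': 'https://gnu.org'}],
            'attributes': [{'label': 'Founded', 'value': '1983'}]}
    box2 = {'id': 'https://gnu.org', 'content': 'GNU is an operating system.',
            'urls': [{'url': 'https://fsf.org'}],
            'attributes': [{'label': 'Founded', 'value': '1983'},
                           {'label': 'License', 'value': 'GPL'}]}

    merge_two_infoboxes(box1, box2)
    # box1 now lists both urls, gains only the 'License' attribute,
    # and takes the longer content from box2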
+def merge_infoboxes(infoboxes):
+ results = []
+ infoboxes_id = {}
+ for infobox in infoboxes:
+ add_infobox = True
+ infobox_id = infobox.get('id', None)
+ if infobox_id is not None:
+ existingIndex = infoboxes_id.get(infobox_id, None)
+ if existingIndex is not None:
+ merge_two_infoboxes(results[existingIndex], infobox)
+ add_infobox = False
+
+ if add_infobox:
+ results.append(infobox)
+ if infobox_id is not None:
+ infoboxes_id[infobox_id] = len(results) - 1
+
+ return results
+
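merge_infoboxes collapses a whole list by id, merging duplicates into the first occurrence; entries without an id are kept as-is. For instance:

    boxes = [{'id': 'https://gnu.org', 'content': 'GNU'},
             {'id': 'https://gnu.org', 'content': 'GNU is an operating system.'},
             {'content': 'no id, kept untouched'}]

    merged = merge_infoboxes(boxes)
    # len(merged) == 2; merged[0]['content'] is the longer description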
+
class Search(object):
"""Search information container"""
@@ -207,6 +292,8 @@ class Search(object):
self.results = []
self.suggestions = []
+ self.answers = []
+ self.infoboxes = []
self.request_data = {}
# set specific language if set
@@ -224,9 +311,6 @@ class Search(object):
if not self.request_data.get('q'):
raise Exception('noquery')
- # set query
- self.query = self.request_data['q']
-
# set pagenumber
pageno_param = self.request_data.get('pageno', '1')
if not pageno_param.isdigit() or int(pageno_param) < 1:
@@ -235,7 +319,18 @@ class Search(object):
self.pageno = int(pageno_param)
# parse query; if special tags are set, they change the search engine or search language
- self.parse_query()
+ query_obj = Query(self.request_data['q'], self.blocked_engines)
+ query_obj.parse_query()
+
+ # set query
+ self.query = query_obj.getSearchQuery()
+
+ # get the last language selected in the query, if any
+ # TODO: support searching in multiple languages
+ if len(query_obj.languages):
+ self.lang = query_obj.languages[-1]
+
+ self.engines = query_obj.engines
self.categories = []
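The inline parse_query below is replaced by the new searx.query.Query class; this diff only exercises its public surface (parse_query(), getSearchQuery(), and the languages and engines attributes). A hedged usage sketch, with the bang/colon syntax taken from the removed code further down and the expected values assumed rather than verified:

    from searx.query import Query

    # ':fr' picks a language, '!ddg' an engine shortcut (assumed to be
    # configured); both tags are stripped from the final search query
    query_obj = Query(':fr !ddg free software', [])
    query_obj.parse_query()

    query_obj.getSearchQuery()  # expected: 'free software'
    query_obj.languages         # expected: ['fr']; the code above keeps the last one
    query_obj.engines           # expected: [{'category': 'none', 'name': 'duckduckgo'}]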
@@ -276,60 +371,6 @@ class Search(object):
for x in categories[categ]
if not x.name in self.blocked_engines)
- # parse query, if tags are set, which change the serch engine or search-language
- def parse_query(self):
- query_parts = self.query.split()
- modified = False
-
- # check if language-prefix is set
- if query_parts[0].startswith(':'):
- lang = query_parts[0][1:].lower()
-
- # check if any language-code is equal with declared language-codes
- for lc in language_codes:
- lang_id, lang_name, country = map(str.lower, lc)
-
- # if correct language-code is found, set it as new search-language
- if lang == lang_id\
- or lang_id.startswith(lang)\
- or lang == lang_name\
- or lang == country:
- self.lang = lang
- modified = True
- break
-
- # check if category/engine prefix is set
- elif query_parts[0].startswith('!'):
- prefix = query_parts[0][1:].replace('_', ' ')
-
- # check if prefix is equal with engine shortcut
- if prefix in engine_shortcuts\
- and not engine_shortcuts[prefix] in self.blocked_engines:
- modified = True
- self.engines.append({'category': 'none',
- 'name': engine_shortcuts[prefix]})
-
- # check if prefix is equal with engine name
- elif prefix in engines\
- and not prefix in self.blocked_engines:
- modified = True
- self.engines.append({'category': 'none',
- 'name': prefix})
-
- # check if prefix is equal with categorie name
- elif prefix in categories:
- modified = True
- # using all engines for that search, which are declared under that categorie name
- self.engines.extend({'category': prefix,
- 'name': engine.name}
- for engine in categories[prefix]
- if not engine in self.blocked_engines)
-
- # if language, category or engine were specificed in this query, search for more tags which does the same
- if modified:
- self.query = self.query.replace(query_parts[0], '', 1).strip()
- self.parse_query()
-
# do search-request
def search(self, request):
global number_of_searches
@@ -338,6 +379,8 @@ class Search(object):
requests = []
results = {}
suggestions = set()
+ answers = set()
+ infoboxes = []
# increase number of searches
number_of_searches += 1
@@ -382,6 +425,8 @@ class Search(object):
selected_engine['name'],
results,
suggestions,
+ answers,
+ infoboxes,
engine.response,
request_params
)
@@ -419,11 +464,14 @@ class Search(object):
# score results and remove duplications
results = score_results(results)
+ # merge infoboxes according to their ids
+ infoboxes = merge_infoboxes(infoboxes)
+
# update engine stats, using calculated score
for result in results:
for res_engine in result['engines']:
engines[result['engine']]\
.stats['score_count'] += result['score']
- # return results and suggestions
- return results, suggestions
+ # return results, suggestions, answers and infoboxes
+ return results, suggestions, answers, infoboxes
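With the extra buckets, callers must now unpack four values instead of two. An illustrative call site (not part of this commit; assumes the Search container is built from the incoming HTTP request, as searx's webapp does):

    search = Search(request)
    results, suggestions, answers, infoboxes = search.search(request)

    # results are scored and deduplicated, infoboxes already merged by id,
    # suggestions and answers are plain sets of strings
    for answer in answers:
        print(answer)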