summaryrefslogtreecommitdiff
path: root/searx/engines/recoll.py
diff options
context:
space:
mode:
authorNoémi Ványi <kvch@users.noreply.github.com>2020-11-30 08:35:15 +0100
committerGitHub <noreply@github.com>2020-11-30 08:35:15 +0100
commit4a36a3044d6e39bc60d026d99ed7a010f6505a5f (patch)
tree7037ea2a1e86ec7d308a5d231eae3aa0897e0f87 /searx/engines/recoll.py
parent93c2603561c039fb43137c251493e77032f91743 (diff)
Add recoll engine (#2325)
recoll is a local search engine based on Xapian: http://www.lesbonscomptes.com/recoll/ By itself recoll does not offer web or API access, this can be achieved using recoll-webui: https://framagit.org/medoc92/recollwebui.git This engine uses a custom 'files' result template set `base_url` to the location where recoll-webui can be reached set `dl_prefix` to a location where the file hierarchy as indexed by recoll can be reached set `search_dir` to the part of the indexed file hierarchy to be searched, use an empty string to search the entire search domain
Diffstat (limited to 'searx/engines/recoll.py')
-rw-r--r--searx/engines/recoll.py104
1 file changed, 104 insertions, 0 deletions
diff --git a/searx/engines/recoll.py b/searx/engines/recoll.py
new file mode 100644
index 000000000..5a956b8bf
--- /dev/null
+++ b/searx/engines/recoll.py
@@ -0,0 +1,104 @@
+"""
+ Recoll (local search engine)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, content, size, abstract, author, mtype, subtype, time, \
+ filename, label, type, embedded
+"""
+
+from datetime import date, timedelta
+from json import loads
+from urllib.parse import urlencode, quote
+
# engine dependent config
time_range_support = True

# parameters from settings.yml (filled in by the engine loader)
base_url = None       # location where recoll-webui is reachable
search_dir = ''       # subtree of the index to search ('' = everything)
mount_prefix = None   # local filesystem prefix of the indexed files
dl_prefix = None      # public URL prefix replacing mount_prefix

# template for the embedded audio/video player markup
embedded_url = (
    '<{ttype} controls height="166px" '
    'src="{url}" type="{mtype}"></{ttype}>'
)
+
+
# helper functions
def get_time_range(time_range):
    """Return the ISO date marking the start of *time_range*.

    'day', 'week', 'month' and 'year' map to today minus 1/7/30/365
    days; any other value (including None) returns '' meaning no
    time restriction.
    """
    days_back = {'day': 1, 'week': 7, 'month': 30, 'year': 365}.get(time_range, 0)
    if not days_back:
        return ''
    return (date.today() - timedelta(days=days_back)).isoformat()
+
+
# do search-request
def request(query, params):
    """Fill params['url'] with the recoll-webui JSON search URL.

    The query string carries the search terms, the optional time
    lower bound (ISO date or '') and the configured search subtree.
    """
    after = get_time_range(params['time_range'])
    query_string = urlencode({
        'query': query,
        'after': after,
        'dir': search_dir,
    })
    params['url'] = base_url + 'json?' + query_string + '&highlight=0'
    return params
+
+
# get response from search-request
def response(resp):
    """Parse the recoll-webui JSON response into searx result dicts.

    Each hit's file:// URL below mount_prefix is rewritten to the
    public dl_prefix, optional metadata fields are copied when present,
    and preview info (embedded player / image source) is added for
    known mime types.  A trailing {'number_of_results': ...} entry is
    appended when the response reports 'nres'.
    """
    results = []

    response_json = loads(resp.text)

    if not response_json:
        return []

    for result in response_json.get('results', []):
        title = result['label']
        # map the local filesystem location to the downloadable URL
        url = result['url'].replace('file://' + mount_prefix, dl_prefix)
        content = '{}'.format(result['snippet'])

        # append result
        item = {'url': url,
                'title': title,
                'content': content,
                'template': 'files.html'}

        # 'size' and the metadata fields below are optional in the
        # recoll JSON — use .get() so a missing key is skipped rather
        # than raising KeyError
        size = result.get('size')
        if size:
            item['size'] = int(size)

        for parameter in ('filename', 'abstract', 'author', 'mtype', 'time'):
            if result.get(parameter):
                item[parameter] = result[parameter]

        # facilitate preview support for known mime types
        mime = result.get('mtype', '')
        if '/' in mime:
            # maxsplit=1 keeps subtypes containing '/' from raising
            (mtype, subtype) = mime.split('/', 1)
            item['mtype'] = mtype
            item['subtype'] = subtype

            if mtype in ('audio', 'video'):
                item['embedded'] = embedded_url.format(
                    ttype=mtype,
                    url=quote(url.encode('utf8'), '/:'),
                    mtype=mime)

            if mtype == 'image' and subtype in ('bmp', 'gif', 'jpeg', 'png'):
                item['img_src'] = url

        results.append(item)

    if 'nres' in response_json:
        results.append({'number_of_results': response_json['nres']})

    return results