summaryrefslogtreecommitdiff
path: root/searx/engines/recoll.py
diff options
context:
space:
mode:
authorNoémi Ványi <kvch@users.noreply.github.com>2020-11-30 08:35:15 +0100
committerGitHub <noreply@github.com>2020-11-30 08:35:15 +0100
commit4a36a3044d6e39bc60d026d99ed7a010f6505a5f (patch)
tree7037ea2a1e86ec7d308a5d231eae3aa0897e0f87 /searx/engines/recoll.py
parent93c2603561c039fb43137c251493e77032f91743 (diff)
Add recoll engine (#2325)
recoll is a local search engine based on Xapian: http://www.lesbonscomptes.com/recoll/ By itself recoll does not offer web or API access, this can be achieved using recoll-webui: https://framagit.org/medoc92/recollwebui.git This engine uses a custom 'files' result template set `base_url` to the location where recoll-webui can be reached set `dl_prefix` to a location where the file hierarchy as indexed by recoll can be reached set `search_dir` to the part of the indexed file hierarchy to be searched, use an empty string to search the entire search domain
Diffstat (limited to 'searx/engines/recoll.py')
-rw-r--r--searx/engines/recoll.py104
1 file changed, 104 insertions, 0 deletions
diff --git a/searx/engines/recoll.py b/searx/engines/recoll.py
new file mode 100644
index 000000000..5a956b8bf
--- /dev/null
+++ b/searx/engines/recoll.py
@@ -0,0 +1,104 @@
+"""
+ Recoll (local search engine)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, content, size, abstract, author, mtype, subtype, time, \
+ filename, label, type, embedded
+"""
+
+from datetime import date, timedelta
+from json import loads
+from urllib.parse import urlencode, quote
+
# engine dependent config
time_range_support = True

# parameters from settings.yml (filled in by the engine loader)
base_url = None       # location where recoll-webui is reachable
search_dir = ''       # subtree of the index to search ('' = everything)
mount_prefix = None   # local filesystem prefix of the indexed files
dl_prefix = None      # public URL prefix replacing mount_prefix

# template for the embedded audio/video player markup
embedded_url = (
    '<{ttype} controls height="166px" '
    'src="{url}" type="{mtype}"></{ttype}>'
)
+
+
# helper functions
def get_time_range(time_range):
    """Return the ISO date marking the start of *time_range*.

    'day', 'week', 'month' and 'year' map to today minus 1/7/30/365
    days; any other value (including None) returns '' meaning no
    time restriction.
    """
    days_back = {'day': 1, 'week': 7, 'month': 30, 'year': 365}.get(time_range, 0)
    if not days_back:
        return ''
    return (date.today() - timedelta(days=days_back)).isoformat()
+
+
# do search-request
def request(query, params):
    """Fill params['url'] with the recoll-webui JSON search URL.

    The query string carries the search terms, the optional time
    lower bound (ISO date or '') and the configured search subtree.
    """
    after = get_time_range(params['time_range'])
    query_string = urlencode({
        'query': query,
        'after': after,
        'dir': search_dir,
    })
    params['url'] = base_url + 'json?' + query_string + '&highlight=0'
    return params
+
+
# get response from search-request
def response(resp):
    """Parse the recoll-webui JSON response into searx result dicts.

    Each hit's file:// URL below mount_prefix is rewritten to the
    public dl_prefix, optional metadata fields are copied when present,
    and preview info (embedded player / image source) is added for
    known mime types.  A trailing {'number_of_results': ...} entry is
    appended when the response reports 'nres'.
    """
    results = []

    response_json = loads(resp.text)

    if not response_json:
        return []

    for result in response_json.get('results', []):
        title = result['label']
        # map the local filesystem location to the downloadable URL
        url = result['url'].replace('file://' + mount_prefix, dl_prefix)
        content = '{}'.format(result['snippet'])

        # append result
        item = {'url': url,
                'title': title,
                'content': content,
                'template': 'files.html'}

        # 'size' and the metadata fields below are optional in the
        # recoll JSON — use .get() so a missing key is skipped rather
        # than raising KeyError
        size = result.get('size')
        if size:
            item['size'] = int(size)

        for parameter in ('filename', 'abstract', 'author', 'mtype', 'time'):
            if result.get(parameter):
                item[parameter] = result[parameter]

        # facilitate preview support for known mime types
        mime = result.get('mtype', '')
        if '/' in mime:
            # maxsplit=1 keeps subtypes containing '/' from raising
            (mtype, subtype) = mime.split('/', 1)
            item['mtype'] = mtype
            item['subtype'] = subtype

            if mtype in ('audio', 'video'):
                item['embedded'] = embedded_url.format(
                    ttype=mtype,
                    url=quote(url.encode('utf8'), '/:'),
                    mtype=mime)

            if mtype == 'image' and subtype in ('bmp', 'gif', 'jpeg', 'png'):
                item['img_src'] = url

        results.append(item)

    if 'nres' in response_json:
        results.append({'number_of_results': response_json['nres']})

    return results