summaryrefslogtreecommitdiff
path: root/utils
diff options
context:
space:
mode:
Diffstat (limited to 'utils')
-rw-r--r--utils/brand.env4
-rw-r--r--utils/build_env.py38
-rw-r--r--utils/fetch_engine_descriptions.py206
-rw-r--r--utils/makefile.python4
-rwxr-xr-xutils/searx.sh8
-rw-r--r--utils/templates/etc/uwsgi/apps-archlinux/searx.ini5
-rw-r--r--utils/templates/etc/uwsgi/apps-available/searx.ini3
7 files changed, 263 insertions, 5 deletions
diff --git a/utils/brand.env b/utils/brand.env
index 2e763159d..2136d278f 100644
--- a/utils/brand.env
+++ b/utils/brand.env
@@ -1,7 +1,9 @@
+export SEARX_URL=''
export GIT_URL='https://github.com/searx/searx'
export GIT_BRANCH='master'
export ISSUE_URL='https://github.com/searx/searx/issues'
-export SEARX_URL='https://searx.me'
export DOCS_URL='https://searx.github.io/searx'
export PUBLIC_INSTANCES='https://searx.space'
export CONTACT_URL=''
+export WIKI_URL='https://github.com/searx/searx/wiki'
+export TWITTER_URL='https://twitter.com/Searx_engine'
diff --git a/utils/build_env.py b/utils/build_env.py
new file mode 100644
index 000000000..ffb2689e9
--- /dev/null
+++ b/utils/build_env.py
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""build environment used by shell scripts
+"""
+
+# set path
+import sys
+import os
+from os.path import realpath, dirname, join, sep, abspath
+
+repo_root = realpath(dirname(realpath(__file__)) + sep + '..')
+sys.path.insert(0, repo_root)
+os.environ['SEARX_SETTINGS_PATH'] = abspath(dirname(__file__) + '/settings.yml')
+
+# Under the assumption that a brand is always a fork, ensure that the settings
+# file from the repository's working tree is used to generate the build_env, not
+# the one from /etc/searx/settings.yml.
+os.environ['SEARX_SETTINGS_PATH'] = abspath(dirname(__file__) + sep + 'settings.yml')
+
+from searx import brand
+
+name_val = [
+ ('SEARX_URL' , brand.SEARX_URL),
+ ('GIT_URL' , brand.GIT_URL),
+ ('GIT_BRANCH' , brand.GIT_BRANCH),
+ ('ISSUE_URL' , brand.ISSUE_URL),
+ ('DOCS_URL' , brand.DOCS_URL),
+ ('PUBLIC_INSTANCES' , brand.PUBLIC_INSTANCES),
+ ('CONTACT_URL' , brand.CONTACT_URL),
+ ('WIKI_URL' , brand.WIKI_URL),
+ ('TWITTER_URL' , brand.TWITTER_URL),
+]
+
+brand_env = 'utils' + sep + 'brand.env'
+
+print('build %s' % brand_env)
+with open(repo_root + sep + brand_env, 'w', encoding='utf-8') as f:
+ for name, val in name_val:
+ print("export %s='%s'" % (name, val), file=f)
diff --git a/utils/fetch_engine_descriptions.py b/utils/fetch_engine_descriptions.py
new file mode 100644
index 000000000..9ca001d45
--- /dev/null
+++ b/utils/fetch_engine_descriptions.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python
+
+import sys
+import json
+from urllib.parse import quote, urlparse
+from os.path import realpath, dirname
+import cld3
+from lxml.html import fromstring
+
+# set path
+sys.path.append(realpath(dirname(realpath(__file__)) + '/../'))
+
+from searx.engines.wikidata import send_wikidata_query
+from searx.utils import extract_text
+import searx
+import searx.search
+import searx.poolrequests
+
+SPARQL_WIKIPEDIA_ARTICLE = """
+SELECT DISTINCT ?item ?name
+WHERE {
+ VALUES ?item { %IDS% }
+ ?article schema:about ?item ;
+ schema:inLanguage ?lang ;
+ schema:name ?name ;
+ schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] .
+ FILTER(?lang in (%LANGUAGES_SPARQL%)) .
+ FILTER (!CONTAINS(?name, ':')) .
+}
+"""
+
+SPARQL_DESCRIPTION = """
+SELECT DISTINCT ?item ?itemDescription
+WHERE {
+ VALUES ?item { %IDS% }
+ ?item schema:description ?itemDescription .
+ FILTER (lang(?itemDescription) in (%LANGUAGES_SPARQL%))
+}
+ORDER BY ?itemLang
+"""
+
+LANGUAGES = searx.settings['locales'].keys()
+LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES)))
+IDS = None
+
+descriptions = {}
+wd_to_engine_name = {}
+
+
+def normalize_description(description):
+ for c in [chr(c) for c in range(0, 31)]:
+ description = description.replace(c, ' ')
+ description = ' '.join(description.strip().split())
+ return description
+
+
+def update_description(engine_name, lang, description, source, replace=True):
+ if replace or lang not in descriptions[engine_name]:
+ descriptions[engine_name][lang] = [normalize_description(description), source]
+
+
+def get_wikipedia_summary(language, pageid):
+ search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
+ url = search_url.format(title=quote(pageid), language=language)
+ try:
+ response = searx.poolrequests.get(url)
+ response.raise_for_status()
+ api_result = json.loads(response.text)
+ return api_result.get('extract')
+ except:
+ return None
+
+
+def detect_language(text):
+ r = cld3.get_language(str(text)) # pylint: disable=E1101
+ if r is not None and r.probability >= 0.98 and r.is_reliable:
+ return r.language
+ return None
+
+
+def get_website_description(url, lang1, lang2=None):
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+ 'DNT': '1',
+ 'Upgrade-Insecure-Requests': '1',
+ 'Sec-GPC': '1',
+ 'Cache-Control': 'max-age=0',
+ }
+ if lang1 is not None:
+ lang_list = [lang1]
+ if lang2 is not None:
+ lang_list.append(lang2)
+ headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8'
+ try:
+ response = searx.poolrequests.get(url, headers=headers, timeout=10)
+ response.raise_for_status()
+ except Exception:
+ return (None, None)
+
+ try:
+ html = fromstring(response.text)
+ except ValueError:
+ html = fromstring(response.content)
+
+ description = extract_text(html.xpath('/html/head/meta[@name="description"]/@content'))
+ if not description:
+ description = extract_text(html.xpath('/html/head/meta[@property="og:description"]/@content'))
+ if not description:
+ description = extract_text(html.xpath('/html/head/title'))
+ lang = extract_text(html.xpath('/html/@lang'))
+ if lang is None and len(lang1) > 0:
+ lang = lang1
+ lang = detect_language(description) or lang or 'en'
+ lang = lang.split('_')[0]
+ lang = lang.split('-')[0]
+ return (lang, description)
+
+
+def initialize():
+ global descriptions, wd_to_engine_name, IDS
+ searx.search.initialize()
+ for engine_name, engine in searx.engines.engines.items():
+ descriptions[engine_name] = {}
+ wikidata_id = getattr(engine, "about", {}).get('wikidata_id')
+ if wikidata_id is not None:
+ wd_to_engine_name.setdefault(wikidata_id, set()).add(engine_name)
+
+ IDS = ' '.join(list(map(lambda wd_id: 'wd:' + wd_id, wd_to_engine_name.keys())))
+
+
+def fetch_wikidata_descriptions():
+ global IDS
+ result = send_wikidata_query(SPARQL_DESCRIPTION
+ .replace('%IDS%', IDS)
+ .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
+ if result is not None:
+ for binding in result['results']['bindings']:
+ wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
+ lang = binding['itemDescription']['xml:lang']
+ description = binding['itemDescription']['value']
+ if ' ' in description: # skip unique word description (like "website")
+ for engine_name in wd_to_engine_name[wikidata_id]:
+ update_description(engine_name, lang, description, 'wikidata')
+
+
+def fetch_wikipedia_descriptions():
+ global IDS
+ result = send_wikidata_query(SPARQL_WIKIPEDIA_ARTICLE
+ .replace('%IDS%', IDS)
+ .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
+ if result is not None:
+ for binding in result['results']['bindings']:
+ wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
+ lang = binding['name']['xml:lang']
+ pageid = binding['name']['value']
+ description = get_wikipedia_summary(lang, pageid)
+ if description is not None and ' ' in description:
+ for engine_name in wd_to_engine_name[wikidata_id]:
+ update_description(engine_name, lang, description, 'wikipedia')
+
+
+def normalize_url(url):
+ url = url.replace('{language}', 'en')
+ url = urlparse(url)._replace(path='/', params='', query='', fragment='').geturl()
+ url = url.replace('https://api.', 'https://')
+ return url
+
+
+def fetch_website_description(engine_name, website):
+ default_lang, default_description = get_website_description(website, None, None)
+ if default_lang is None or default_description is None:
+ return
+ if default_lang not in descriptions[engine_name]:
+ descriptions[engine_name][default_lang] = [normalize_description(default_description), website]
+ for request_lang in ('en-US', 'es-US', 'fr-FR', 'zh', 'ja', 'ru', 'ar', 'ko'):
+ if request_lang.split('-')[0] not in descriptions[engine_name]:
+ lang, desc = get_website_description(website, request_lang, request_lang.split('-')[0])
+ if desc is not None and desc != default_description:
+ update_description(engine_name, lang, desc, website, replace=False)
+ else:
+ break
+
+
+def fetch_website_descriptions():
+ for engine_name, engine in searx.engines.engines.items():
+ website = getattr(engine, "about", {}).get('website')
+ if website is None:
+ website = normalize_url(getattr(engine, "search_url"))
+ if website is None:
+ website = normalize_url(getattr(engine, "base_url"))
+ if website is not None:
+ fetch_website_description(engine_name, website)
+
+
+def main():
+ initialize()
+ fetch_wikidata_descriptions()
+ fetch_wikipedia_descriptions()
+ fetch_website_descriptions()
+
+ sys.stdout.write(json.dumps(descriptions, indent=1, separators=(',', ':'), ensure_ascii=False))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/utils/makefile.python b/utils/makefile.python
index 5d0837e00..345434384 100644
--- a/utils/makefile.python
+++ b/utils/makefile.python
@@ -243,8 +243,10 @@ pyenv-python: pyenvinstall
# [2] https://github.com/pypa/pip/pull/1519
# https://github.com/pypa/twine
-PHONY += upload-pypi
+PHONY += upload-pypi upload-pypi-test
upload-pypi: pyclean pyenvinstall pybuild
@$(PY_ENV_BIN)/twine upload $(PYDIST)/*
+upload-pypi-test: pyclean pyenvinstall pybuild
+ @$(PY_ENV_BIN)/twine upload -r testpypi $(PYDIST)/*
.PHONY: $(PHONY)
diff --git a/utils/searx.sh b/utils/searx.sh
index b7d3b8e1c..f85935fa2 100755
--- a/utils/searx.sh
+++ b/utils/searx.sh
@@ -46,6 +46,7 @@ SEARX_PACKAGES_debian="\
python3-dev python3-babel python3-venv
uwsgi uwsgi-plugin-python3
git build-essential libxslt-dev zlib1g-dev libffi-dev libssl-dev
+libprotobuf-dev protobuf-compiler
shellcheck"
BUILD_PACKAGES_debian="\
@@ -58,6 +59,7 @@ SEARX_PACKAGES_arch="\
python python-pip python-lxml python-babel
uwsgi uwsgi-plugin-python
git base-devel libxml2
+protobuf
shellcheck"
BUILD_PACKAGES_arch="\
@@ -69,7 +71,7 @@ SEARX_PACKAGES_fedora="\
python python-pip python-lxml python-babel
uwsgi uwsgi-plugin-python3
git @development-tools libxml2
-ShellCheck"
+ShellCheck protobuf-compiler protobuf-devel"
BUILD_PACKAGES_fedora="\
firefox graphviz graphviz-gd ImageMagick librsvg2-tools
@@ -82,7 +84,7 @@ SEARX_PACKAGES_centos="\
python36 python36-pip python36-lxml python-babel
uwsgi uwsgi-plugin-python3
git @development-tools libxml2
-ShellCheck"
+ShellCheck protobuf-compiler protobuf-devel"
BUILD_PACKAGES_centos="\
firefox graphviz graphviz-gd ImageMagick librsvg2-tools
@@ -331,6 +333,7 @@ git pull
pip install -U pip
pip install -U setuptools
pip install -U wheel
+pip install -U pyyaml
pip install -U -e .
EOF
install_settings
@@ -501,6 +504,7 @@ EOF
pip install -U pip
pip install -U setuptools
pip install -U wheel
+pip install -U pyyaml
pip install -U -e .
cd ${SEARX_SRC}
pip install -e .
diff --git a/utils/templates/etc/uwsgi/apps-archlinux/searx.ini b/utils/templates/etc/uwsgi/apps-archlinux/searx.ini
index 9dd2e6f2f..71cece3c4 100644
--- a/utils/templates/etc/uwsgi/apps-archlinux/searx.ini
+++ b/utils/templates/etc/uwsgi/apps-archlinux/searx.ini
@@ -82,4 +82,7 @@ http = ${SEARX_INTERNAL_HTTP}
# mkdir -p /run/uwsgi/app/searx
# chown -R ${SERVICE_USER}:${SERVICE_GROUP} /run/uwsgi/app/searx
#
-# socket = /run/uwsgi/app/searx/socket \ No newline at end of file
+# socket = /run/uwsgi/app/searx/socket
+
+# Cache
+cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1
diff --git a/utils/templates/etc/uwsgi/apps-available/searx.ini b/utils/templates/etc/uwsgi/apps-available/searx.ini
index 4d69da0cf..45214ef13 100644
--- a/utils/templates/etc/uwsgi/apps-available/searx.ini
+++ b/utils/templates/etc/uwsgi/apps-available/searx.ini
@@ -82,3 +82,6 @@ http = ${SEARX_INTERNAL_HTTP}
# chown -R ${SERVICE_USER}:${SERVICE_GROUP} /run/uwsgi/app/searx
#
# socket = /run/uwsgi/app/searx/socket
+
+# Cache
+cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1