diff options
Diffstat (limited to 'utils')
| -rw-r--r-- | utils/brand.env | 4 | ||||
| -rw-r--r-- | utils/build_env.py | 38 | ||||
| -rw-r--r-- | utils/fetch_engine_descriptions.py | 206 | ||||
| -rw-r--r-- | utils/makefile.python | 4 | ||||
| -rwxr-xr-x | utils/searx.sh | 8 | ||||
| -rw-r--r-- | utils/templates/etc/uwsgi/apps-archlinux/searx.ini | 5 | ||||
| -rw-r--r-- | utils/templates/etc/uwsgi/apps-available/searx.ini | 3 |
7 files changed, 263 insertions, 5 deletions
diff --git a/utils/brand.env b/utils/brand.env index 2e763159d..2136d278f 100644 --- a/utils/brand.env +++ b/utils/brand.env @@ -1,7 +1,9 @@ +export SEARX_URL='' export GIT_URL='https://github.com/searx/searx' export GIT_BRANCH='master' export ISSUE_URL='https://github.com/searx/searx/issues' -export SEARX_URL='https://searx.me' export DOCS_URL='https://searx.github.io/searx' export PUBLIC_INSTANCES='https://searx.space' export CONTACT_URL='' +export WIKI_URL='https://github.com/searx/searx/wiki' +export TWITTER_URL='https://twitter.com/Searx_engine' diff --git a/utils/build_env.py b/utils/build_env.py new file mode 100644 index 000000000..ffb2689e9 --- /dev/null +++ b/utils/build_env.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""build environment used by shell scripts +""" + +# set path +import sys +import os +from os.path import realpath, dirname, join, sep, abspath + +repo_root = realpath(dirname(realpath(__file__)) + sep + '..') +sys.path.insert(0, repo_root) +os.environ['SEARX_SETTINGS_PATH'] = abspath(dirname(__file__) + '/settings.yml') + +# Under the assumption that a brand is always a fork, ensure that the settings +# file from the repository's working tree is used to generate the build_env, not +# from /etc/searx/settings.yml. 
+os.environ['SEARX_SETTINGS_PATH'] = abspath(dirname(__file__) + sep + 'settings.yml') + +from searx import brand + +name_val = [ + ('SEARX_URL' , brand.SEARX_URL), + ('GIT_URL' , brand.GIT_URL), + ('GIT_BRANCH' , brand.GIT_BRANCH), + ('ISSUE_URL' , brand.ISSUE_URL), + ('DOCS_URL' , brand.DOCS_URL), + ('PUBLIC_INSTANCES' , brand.PUBLIC_INSTANCES), + ('CONTACT_URL' , brand.CONTACT_URL), + ('WIKI_URL' , brand.WIKI_URL), + ('TWITTER_URL' , brand.TWITTER_URL), +] + +brand_env = 'utils' + sep + 'brand.env' + +print('build %s' % brand_env) +with open(repo_root + sep + brand_env, 'w', encoding='utf-8') as f: + for name, val in name_val: + print("export %s='%s'" % (name, val), file=f) diff --git a/utils/fetch_engine_descriptions.py b/utils/fetch_engine_descriptions.py new file mode 100644 index 000000000..9ca001d45 --- /dev/null +++ b/utils/fetch_engine_descriptions.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python + +import sys +import json +from urllib.parse import quote, urlparse +from os.path import realpath, dirname +import cld3 +from lxml.html import fromstring + +# set path +sys.path.append(realpath(dirname(realpath(__file__)) + '/../')) + +from searx.engines.wikidata import send_wikidata_query +from searx.utils import extract_text +import searx +import searx.search +import searx.poolrequests + +SPARQL_WIKIPEDIA_ARTICLE = """ +SELECT DISTINCT ?item ?name +WHERE { + VALUES ?item { %IDS% } + ?article schema:about ?item ; + schema:inLanguage ?lang ; + schema:name ?name ; + schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] . + FILTER(?lang in (%LANGUAGES_SPARQL%)) . + FILTER (!CONTAINS(?name, ':')) . +} +""" + +SPARQL_DESCRIPTION = """ +SELECT DISTINCT ?item ?itemDescription +WHERE { + VALUES ?item { %IDS% } + ?item schema:description ?itemDescription . 
+ FILTER (lang(?itemDescription) in (%LANGUAGES_SPARQL%)) +} +ORDER BY ?itemLang +""" + +LANGUAGES = searx.settings['locales'].keys() +LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES))) +IDS = None + +descriptions = {} +wd_to_engine_name = {} + + +def normalize_description(description): + for c in [chr(c) for c in range(0, 31)]: + description = description.replace(c, ' ') + description = ' '.join(description.strip().split()) + return description + + +def update_description(engine_name, lang, description, source, replace=True): + if replace or lang not in descriptions[engine_name]: + descriptions[engine_name][lang] = [normalize_description(description), source] + + +def get_wikipedia_summary(language, pageid): + search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' + url = search_url.format(title=quote(pageid), language=language) + try: + response = searx.poolrequests.get(url) + response.raise_for_status() + api_result = json.loads(response.text) + return api_result.get('extract') + except: + return None + + +def detect_language(text): + r = cld3.get_language(str(text)) # pylint: disable=E1101 + if r is not None and r.probability >= 0.98 and r.is_reliable: + return r.language + return None + + +def get_website_description(url, lang1, lang2=None): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'DNT': '1', + 'Upgrade-Insecure-Requests': '1', + 'Sec-GPC': '1', + 'Cache-Control': 'max-age=0', + } + if lang1 is not None: + lang_list = [lang1] + if lang2 is not None: + lang_list.append(lang2) + headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8' + try: + response = searx.poolrequests.get(url, headers=headers, timeout=10) + response.raise_for_status() + except Exception: + return (None, None) + + try: + html = fromstring(response.text) + except 
ValueError: + html = fromstring(response.content) + + description = extract_text(html.xpath('/html/head/meta[@name="description"]/@content')) + if not description: + description = extract_text(html.xpath('/html/head/meta[@property="og:description"]/@content')) + if not description: + description = extract_text(html.xpath('/html/head/title')) + lang = extract_text(html.xpath('/html/@lang')) + if lang is None and len(lang1) > 0: + lang = lang1 + lang = detect_language(description) or lang or 'en' + lang = lang.split('_')[0] + lang = lang.split('-')[0] + return (lang, description) + + +def initialize(): + global descriptions, wd_to_engine_name, IDS + searx.search.initialize() + for engine_name, engine in searx.engines.engines.items(): + descriptions[engine_name] = {} + wikidata_id = getattr(engine, "about", {}).get('wikidata_id') + if wikidata_id is not None: + wd_to_engine_name.setdefault(wikidata_id, set()).add(engine_name) + + IDS = ' '.join(list(map(lambda wd_id: 'wd:' + wd_id, wd_to_engine_name.keys()))) + + +def fetch_wikidata_descriptions(): + global IDS + result = send_wikidata_query(SPARQL_DESCRIPTION + .replace('%IDS%', IDS) + .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)) + if result is not None: + for binding in result['results']['bindings']: + wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '') + lang = binding['itemDescription']['xml:lang'] + description = binding['itemDescription']['value'] + if ' ' in description: # skip unique word description (like "website") + for engine_name in wd_to_engine_name[wikidata_id]: + update_description(engine_name, lang, description, 'wikidata') + + +def fetch_wikipedia_descriptions(): + global IDS + result = send_wikidata_query(SPARQL_WIKIPEDIA_ARTICLE + .replace('%IDS%', IDS) + .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)) + if result is not None: + for binding in result['results']['bindings']: + wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', 
'') + lang = binding['name']['xml:lang'] + pageid = binding['name']['value'] + description = get_wikipedia_summary(lang, pageid) + if description is not None and ' ' in description: + for engine_name in wd_to_engine_name[wikidata_id]: + update_description(engine_name, lang, description, 'wikipedia') + + +def normalize_url(url): + url = url.replace('{language}', 'en') + url = urlparse(url)._replace(path='/', params='', query='', fragment='').geturl() + url = url.replace('https://api.', 'https://') + return url + + +def fetch_website_description(engine_name, website): + default_lang, default_description = get_website_description(website, None, None) + if default_lang is None or default_description is None: + return + if default_lang not in descriptions[engine_name]: + descriptions[engine_name][default_lang] = [normalize_description(default_description), website] + for request_lang in ('en-US', 'es-US', 'fr-FR', 'zh', 'ja', 'ru', 'ar', 'ko'): + if request_lang.split('-')[0] not in descriptions[engine_name]: + lang, desc = get_website_description(website, request_lang, request_lang.split('-')[0]) + if desc is not None and desc != default_description: + update_description(engine_name, lang, desc, website, replace=False) + else: + break + + +def fetch_website_descriptions(): + for engine_name, engine in searx.engines.engines.items(): + website = getattr(engine, "about", {}).get('website') + if website is None: + website = normalize_url(getattr(engine, "search_url")) + if website is None: + website = normalize_url(getattr(engine, "base_url")) + if website is not None: + fetch_website_description(engine_name, website) + + +def main(): + initialize() + fetch_wikidata_descriptions() + fetch_wikipedia_descriptions() + fetch_website_descriptions() + + sys.stdout.write(json.dumps(descriptions, indent=1, separators=(',', ':'), ensure_ascii=False)) + + +if __name__ == "__main__": + main() diff --git a/utils/makefile.python b/utils/makefile.python index 5d0837e00..345434384 100644 
--- a/utils/makefile.python +++ b/utils/makefile.python @@ -243,8 +243,10 @@ pyenv-python: pyenvinstall # [2] https://github.com/pypa/pip/pull/1519 # https://github.com/pypa/twine -PHONY += upload-pypi +PHONY += upload-pypi upload-pypi-test upload-pypi: pyclean pyenvinstall pybuild @$(PY_ENV_BIN)/twine upload $(PYDIST)/* +upload-pypi-test: pyclean pyenvinstall pybuild + @$(PY_ENV_BIN)/twine upload -r testpypi $(PYDIST)/* .PHONY: $(PHONY) diff --git a/utils/searx.sh b/utils/searx.sh index b7d3b8e1c..f85935fa2 100755 --- a/utils/searx.sh +++ b/utils/searx.sh @@ -46,6 +46,7 @@ SEARX_PACKAGES_debian="\ python3-dev python3-babel python3-venv uwsgi uwsgi-plugin-python3 git build-essential libxslt-dev zlib1g-dev libffi-dev libssl-dev +libprotobuf-dev protobuf-compiler shellcheck" BUILD_PACKAGES_debian="\ @@ -58,6 +59,7 @@ SEARX_PACKAGES_arch="\ python python-pip python-lxml python-babel uwsgi uwsgi-plugin-python git base-devel libxml2 +protobuf shellcheck" BUILD_PACKAGES_arch="\ @@ -69,7 +71,7 @@ SEARX_PACKAGES_fedora="\ python python-pip python-lxml python-babel uwsgi uwsgi-plugin-python3 git @development-tools libxml2 -ShellCheck" +ShellCheck protobuf-compiler protobuf-devel" BUILD_PACKAGES_fedora="\ firefox graphviz graphviz-gd ImageMagick librsvg2-tools @@ -82,7 +84,7 @@ SEARX_PACKAGES_centos="\ python36 python36-pip python36-lxml python-babel uwsgi uwsgi-plugin-python3 git @development-tools libxml2 -ShellCheck" +ShellCheck protobuf-compiler protobuf-devel" BUILD_PACKAGES_centos="\ firefox graphviz graphviz-gd ImageMagick librsvg2-tools @@ -331,6 +333,7 @@ git pull pip install -U pip pip install -U setuptools pip install -U wheel +pip install -U pyyaml pip install -U -e . EOF install_settings @@ -501,6 +504,7 @@ EOF pip install -U pip pip install -U setuptools pip install -U wheel +pip install -U pyyaml pip install -U -e . cd ${SEARX_SRC} pip install -e . 
diff --git a/utils/templates/etc/uwsgi/apps-archlinux/searx.ini b/utils/templates/etc/uwsgi/apps-archlinux/searx.ini index 9dd2e6f2f..71cece3c4 100644 --- a/utils/templates/etc/uwsgi/apps-archlinux/searx.ini +++ b/utils/templates/etc/uwsgi/apps-archlinux/searx.ini @@ -82,4 +82,7 @@ http = ${SEARX_INTERNAL_HTTP} # mkdir -p /run/uwsgi/app/searx # chown -R ${SERVICE_USER}:${SERVICE_GROUP} /run/uwsgi/app/searx # -# socket = /run/uwsgi/app/searx/socket
\ No newline at end of file +# socket = /run/uwsgi/app/searx/socket + +# Cache +cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1 diff --git a/utils/templates/etc/uwsgi/apps-available/searx.ini b/utils/templates/etc/uwsgi/apps-available/searx.ini index 4d69da0cf..45214ef13 100644 --- a/utils/templates/etc/uwsgi/apps-available/searx.ini +++ b/utils/templates/etc/uwsgi/apps-available/searx.ini @@ -82,3 +82,6 @@ http = ${SEARX_INTERNAL_HTTP} # chown -R ${SERVICE_USER}:${SERVICE_GROUP} /run/uwsgi/app/searx # # socket = /run/uwsgi/app/searx/socket + +# Cache +cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1 |