From b8cd3264644208d7afa1a239f829222d45226334 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Thu, 25 Feb 2021 17:42:52 +0100 Subject: Add searx_extra package Split the utils directory into: * searx_extra contains update scripts, standalone_searx.py * utils contains the files to build and setup searx. --- .github/workflows/data-update.yml | 16 +- Makefile | 4 +- docs/index.rst | 1 + docs/searx_extra/index.rst | 14 ++ docs/searx_extra/standalone_searx.py.rst | 9 + docs/utils/index.rst | 10 +- docs/utils/standalone_searx.py.rst | 11 -- searx_extra/__init__.py | 0 searx_extra/google_search.py | 35 ++++ searx_extra/standalone_searx.py | 217 +++++++++++++++++++++++ searx_extra/update/__init__.py | 0 searx_extra/update/update_ahmia_blacklist.py | 30 ++++ searx_extra/update/update_currencies.py | 150 ++++++++++++++++ searx_extra/update/update_engine_descriptions.py | 202 +++++++++++++++++++++ searx_extra/update/update_external_bangs.py | 157 ++++++++++++++++ searx_extra/update/update_firefox_version.py | 68 +++++++ searx_extra/update/update_languages.py | 205 +++++++++++++++++++++ searx_extra/update/update_translations.sh | 15 ++ searx_extra/update/update_wikidata_units.py | 54 ++++++ setup.py | 2 +- tests/unit/test_standalone_searx.py | 20 +-- utils/fetch_ahmia_blacklist.py | 33 ---- utils/fetch_currencies.py | 151 ---------------- utils/fetch_engine_descriptions.py | 206 --------------------- utils/fetch_external_bangs.py | 161 ----------------- utils/fetch_firefox_version.py | 73 -------- utils/fetch_languages.py | 207 --------------------- utils/fetch_wikidata_units.py | 56 ------ utils/google_search.py | 35 ---- utils/standalone_searx.py | 217 ----------------------- utils/update-translations.sh | 15 -- 31 files changed, 1173 insertions(+), 1201 deletions(-) create mode 100644 docs/searx_extra/index.rst create mode 100644 docs/searx_extra/standalone_searx.py.rst delete mode 100644 docs/utils/standalone_searx.py.rst create mode 100644 searx_extra/__init__.py 
create mode 100644 searx_extra/google_search.py create mode 100755 searx_extra/standalone_searx.py create mode 100644 searx_extra/update/__init__.py create mode 100755 searx_extra/update/update_ahmia_blacklist.py create mode 100755 searx_extra/update/update_currencies.py create mode 100755 searx_extra/update/update_engine_descriptions.py create mode 100755 searx_extra/update/update_external_bangs.py create mode 100755 searx_extra/update/update_firefox_version.py create mode 100755 searx_extra/update/update_languages.py create mode 100755 searx_extra/update/update_translations.sh create mode 100755 searx_extra/update/update_wikidata_units.py delete mode 100755 utils/fetch_ahmia_blacklist.py delete mode 100644 utils/fetch_currencies.py delete mode 100644 utils/fetch_engine_descriptions.py delete mode 100755 utils/fetch_external_bangs.py delete mode 100755 utils/fetch_firefox_version.py delete mode 100644 utils/fetch_languages.py delete mode 100644 utils/fetch_wikidata_units.py delete mode 100644 utils/google_search.py delete mode 100755 utils/standalone_searx.py delete mode 100755 utils/update-translations.sh diff --git a/.github/workflows/data-update.yml b/.github/workflows/data-update.yml index c9c6b29a4..eb9bed8c8 100644 --- a/.github/workflows/data-update.yml +++ b/.github/workflows/data-update.yml @@ -11,12 +11,12 @@ jobs: strategy: matrix: fetch: - - ahmia_blacklist - - currencies - - external_bangs - - firefox_version - - languages - - wikidata_units + - update_ahmia_blacklist.py + - update_currencies.py + - update_external_bangs.py + - update_firefox_version.py + - update_languages.py + - update_wikidata_units.py steps: - name: Checkout uses: actions/checkout@v2 @@ -45,10 +45,10 @@ jobs: - name: Fetch data env: - FETCH_SCRIPT: utils/fetch_${{ matrix.fetch }}.py + FETCH_SCRIPT: ./searx_extra/update/${{ matrix.fetch }} run: | source local/py3/bin/activate - python $FETCH_SCRIPT + $FETCH_SCRIPT - name: Create Pull Request id: cpr diff --git a/Makefile b/Makefile 
index 4e451b7ca..9917da78b 100644 --- a/Makefile +++ b/Makefile @@ -195,8 +195,8 @@ PYLINT_FILES=\ searx/engines/google_videos.py \ searx/engines/google_images.py \ searx/engines/mediathekviewweb.py \ - utils/fetch_external_bangs.py \ - searx/engines/google_scholar.py + searx/engines/google_scholar.py \ + searx_extra/update/update_external_bangs.py test.pylint: pyenvinstall $(call cmd,pylint,$(PYLINT_FILES)) diff --git a/docs/index.rst b/docs/index.rst index 9e590867c..a406da197 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,6 +30,7 @@ anyone, you can set up your own, see :ref:`installation`. user/index admin/index dev/index + searx_extra/index utils/index blog/index diff --git a/docs/searx_extra/index.rst b/docs/searx_extra/index.rst new file mode 100644 index 000000000..64d0b9047 --- /dev/null +++ b/docs/searx_extra/index.rst @@ -0,0 +1,14 @@ +.. _searx_extra: + +====================================================== +Tooling box ``searx_extra`` for developers and users +====================================================== + +In the folder :origin:`searx_extra/` we maintain some tools useful for +developers and users. + +.. toctree:: + :maxdepth: 2 + :caption: Contents + + standalone_searx.py diff --git a/docs/searx_extra/standalone_searx.py.rst b/docs/searx_extra/standalone_searx.py.rst new file mode 100644 index 000000000..ff4b53387 --- /dev/null +++ b/docs/searx_extra/standalone_searx.py.rst @@ -0,0 +1,9 @@ + +.. _standalone_searx.py: + +=================================== +``searx_extra/standalone_searx.py`` +=================================== + +.. automodule:: searx_extra.standalone_searx + :members: diff --git a/docs/utils/index.rst b/docs/utils/index.rst index 28515318f..32baa5704 100644 --- a/docs/utils/index.rst +++ b/docs/utils/index.rst @@ -1,12 +1,11 @@ .. _searx_utils: .. 
_toolboxing: -======================= -Tooling box ``utils/*`` -======================= +======================================== +Tooling box ``utils`` for administrators +======================================== -In the folder :origin:`utils/` we maintain some tools useful for admins and -developers. +In the folder :origin:`utils/` we maintain some tools useful for administrators. .. toctree:: :maxdepth: 2 @@ -16,7 +15,6 @@ developers. filtron.sh morty.sh lxc.sh - standalone_searx.py .. _toolboxing common: diff --git a/docs/utils/standalone_searx.py.rst b/docs/utils/standalone_searx.py.rst deleted file mode 100644 index 557c4b75b..000000000 --- a/docs/utils/standalone_searx.py.rst +++ /dev/null @@ -1,11 +0,0 @@ - -.. _standalone_searx.py: - -============================= -``utils/standalone_searx.py`` -============================= - -.. automodule:: standalone_searx - :members: - - diff --git a/searx_extra/__init__.py b/searx_extra/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/searx_extra/google_search.py b/searx_extra/google_search.py new file mode 100644 index 000000000..cad32eeca --- /dev/null +++ b/searx_extra/google_search.py @@ -0,0 +1,35 @@ +from sys import argv, exit + +if not len(argv) > 1: + print('search query required') + exit(1) + +import requests +from json import dumps +from searx.engines import google +from searx.search import default_request_params + +request_params = default_request_params() +# Possible params +# request_params['headers']['User-Agent'] = '' +# request_params['category'] = '' +request_params['pageno'] = 1 +request_params['language'] = 'en_us' +request_params['time_range'] = '' + +params = google.request(argv[1], request_params) + +request_args = dict( + headers=request_params['headers'], + cookies=request_params['cookies'], +) + +if request_params['method'] == 'GET': + req = requests.get +else: + req = requests.post + request_args['data'] = request_params['data'] + +resp = req(request_params['url'], 
**request_args) +resp.search_params = request_params +print(dumps(google.response(resp))) diff --git a/searx_extra/standalone_searx.py b/searx_extra/standalone_searx.py new file mode 100755 index 000000000..f52b7e80c --- /dev/null +++ b/searx_extra/standalone_searx.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +"""Script to run searx from terminal. + +Getting categories without initializing the engines will only return `['general']` + +>>> import searx.engines +... list(searx.engines.categories.keys()) +['general'] +>>> import searx.search +... searx.search.initialize() +... list(searx.engines.categories.keys()) +['general', 'it', 'science', 'images', 'news', 'videos', 'music', 'files', 'social media', 'map'] + +Example to use this script: + +.. code:: bash + + $ python3 searx_extra/standalone_searx.py rain + +Example to run it from python: + +>>> import importlib +... import json +... import sys +... import searx.engines +... import searx.search +... search_query = 'rain' +... # initialize engines +... searx.search.initialize() +... # load engine categories once instead of each time the function is called +... engine_cs = list(searx.engines.categories.keys()) +... # load module +... spec = importlib.util.spec_from_file_location( +... 'searx_extra.standalone_searx', 'searx_extra/standalone_searx.py') +... sas = importlib.util.module_from_spec(spec) +... spec.loader.exec_module(sas) +... # use function from module +... prog_args = sas.parse_argument([search_query], category_choices=engine_cs) +... search_q = sas.get_search_query(prog_args, engine_categories=engine_cs) +... res_dict = sas.to_dict(search_q) +... sys.stdout.write(json.dumps( +... res_dict, sort_keys=True, indent=4, ensure_ascii=False, +... default=sas.json_serial)) +{ + "answers": [], + "infoboxes": [ {...} ], + "paging": true, + "results": [... ], + "results_number": 820000000.0, + "search": { + "lang": "all", + "pageno": 1, + "q": "rain", + "safesearch": 0, + "timerange": null + }, + "suggestions": [...] 
+} +""" # noqa: E501 +# pylint: disable=pointless-string-statement +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. + +(C) 2016- by Alexandre Flament, +''' +# pylint: disable=wrong-import-position +import argparse +import sys +from datetime import datetime +from json import dumps +from typing import Any, Dict, List, Optional + +import searx +import searx.preferences +import searx.query +import searx.search +import searx.webadapter + +EngineCategoriesVar = Optional[List[str]] + + +def get_search_query( + args: argparse.Namespace, engine_categories: EngineCategoriesVar = None +) -> searx.search.SearchQuery: + """Get search results for the query""" + if engine_categories is None: + engine_categories = list(searx.engines.categories.keys()) + try: + category = args.category.decode('utf-8') + except AttributeError: + category = args.category + form = { + "q": args.query, + "categories": category, + "pageno": str(args.pageno), + "language": args.lang, + "time_range": args.timerange + } + preferences = searx.preferences.Preferences( + ['oscar'], engine_categories, searx.engines.engines, []) + preferences.key_value_settings['safesearch'].parse(args.safesearch) + + search_query = searx.webadapter.get_search_query_from_webapp( + preferences, form)[0] + return search_query + + +def no_parsed_url(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Remove parsed url from dict.""" + for result in 
results: + del result['parsed_url'] + return results + + +def json_serial(obj: Any) -> Any: + """JSON serializer for objects not serializable by default json code. + + :raise TypeError: raised when **obj** is not serializable + """ + if isinstance(obj, datetime): + serial = obj.isoformat() + return serial + if isinstance(obj, bytes): + return obj.decode('utf8') + if isinstance(obj, set): + return list(obj) + raise TypeError("Type ({}) not serializable".format(type(obj))) + + +def to_dict(search_query: searx.search.SearchQuery) -> Dict[str, Any]: + """Get result from parsed arguments.""" + result_container = searx.search.Search(search_query).search() + result_container_json = { + "search": { + "q": search_query.query, + "pageno": search_query.pageno, + "lang": search_query.lang, + "safesearch": search_query.safesearch, + "timerange": search_query.time_range, + }, + "results": no_parsed_url(result_container.get_ordered_results()), + "infoboxes": result_container.infoboxes, + "suggestions": list(result_container.suggestions), + "answers": list(result_container.answers), + "paging": result_container.paging, + "results_number": result_container.results_number() + } + return result_container_json + + +def parse_argument( + args: Optional[List[str]]=None, + category_choices: EngineCategoriesVar=None +) -> argparse.Namespace: + """Parse command line. + + :raise SystemExit: Query argument required on `args` + + Examples: + + >>> import importlib + ... # load module + ... spec = importlib.util.spec_from_file_location( + ... 'searx_extra.standalone_searx', 'searx_extra/standalone_searx.py') + ... sas = importlib.util.module_from_spec(spec) + ... spec.loader.exec_module(sas) + ... 
sas.parse_argument() + usage: ptipython [-h] [--category [{general}]] [--lang [LANG]] [--pageno [PAGENO]] [--safesearch [{0,1,2}]] [--timerange [{day,week,month,year}]] + query + SystemExit: 2 + >>> sas.parse_argument(['rain']) + Namespace(category='general', lang='all', pageno=1, query='rain', safesearch='0', timerange=None) + """ # noqa: E501 + if not category_choices: + category_choices = list(searx.engines.categories.keys()) + parser = argparse.ArgumentParser(description='Standalone searx.') + parser.add_argument('query', type=str, + help='Text query') + parser.add_argument('--category', type=str, nargs='?', + choices=category_choices, + default='general', + help='Search category') + parser.add_argument('--lang', type=str, nargs='?', default='all', + help='Search language') + parser.add_argument('--pageno', type=int, nargs='?', default=1, + help='Page number starting from 1') + parser.add_argument( + '--safesearch', type=str, nargs='?', + choices=['0', '1', '2'], default='0', + help='Safe content filter from none to strict') + parser.add_argument( + '--timerange', type=str, + nargs='?', choices=['day', 'week', 'month', 'year'], + help='Filter by time range') + return parser.parse_args(args) + + +if __name__ == '__main__': + searx.search.initialize() + engine_cs = list(searx.engines.categories.keys()) + prog_args = parse_argument(category_choices=engine_cs) + search_q = get_search_query(prog_args, engine_categories=engine_cs) + res_dict = to_dict(search_q) + sys.stdout.write(dumps( + res_dict, sort_keys=True, indent=4, ensure_ascii=False, + default=json_serial)) diff --git a/searx_extra/update/__init__.py b/searx_extra/update/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/searx_extra/update/update_ahmia_blacklist.py b/searx_extra/update/update_ahmia_blacklist.py new file mode 100755 index 000000000..f645880e6 --- /dev/null +++ b/searx_extra/update/update_ahmia_blacklist.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +# This script saves 
Ahmia's blacklist for onion sites. +# More info in https://ahmia.fi/blacklist/ + +# set path +from os.path import join + +import requests +from searx import searx_dir + +URL = 'https://ahmia.fi/blacklist/banned/' + + +def fetch_ahmia_blacklist(): + resp = requests.get(URL, timeout=3.0) + if resp.status_code != 200: + raise Exception("Error fetching Ahmia blacklist, HTTP code " + resp.status_code) + else: + blacklist = resp.text.split() + return blacklist + + +def get_ahmia_blacklist_filename(): + return join(join(searx_dir, "data"), "ahmia_blacklist.txt") + + +blacklist = fetch_ahmia_blacklist() +with open(get_ahmia_blacklist_filename(), "w") as f: + f.write('\n'.join(blacklist)) diff --git a/searx_extra/update/update_currencies.py b/searx_extra/update/update_currencies.py new file mode 100755 index 000000000..0cfb7a951 --- /dev/null +++ b/searx_extra/update/update_currencies.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python + +import re +import unicodedata +import json + +# set path +from sys import path +from os.path import realpath, dirname, join + +from searx import searx_dir, settings +from searx.engines.wikidata import send_wikidata_query + + +# ORDER BY (with all the query fields) is important to keep a deterministic result order +# so multiple invocations of this script don't change currencies.json +SARQL_REQUEST = """ +SELECT DISTINCT ?iso4217 ?unit ?unicode ?label ?alias WHERE { + ?item wdt:P498 ?iso4217; rdfs:label ?label. + OPTIONAL { ?item skos:altLabel ?alias FILTER (LANG (?alias) = LANG(?label)). } + OPTIONAL { ?item wdt:P5061 ?unit. } + OPTIONAL { ?item wdt:P489 ?symbol. + ?symbol wdt:P487 ?unicode. } + MINUS { ?item wdt:P582 ?end_data . } # Ignore monney with an end date + MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . } # Ignore "former entity" (obsolete currency) + FILTER(LANG(?label) IN (%LANGUAGES_SPARQL%)). 
+} +ORDER BY ?iso4217 ?unit ?unicode ?label ?alias +""" + +# ORDER BY (with all the query fields) is important to keep a deterministic result order +# so multiple invocations of this script don't change currencies.json +SPARQL_WIKIPEDIA_NAMES_REQUEST = """ +SELECT DISTINCT ?iso4217 ?article_name WHERE { + ?item wdt:P498 ?iso4217 . + ?article schema:about ?item ; + schema:name ?article_name ; + schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] + MINUS { ?item wdt:P582 ?end_data . } # Ignore monney with an end date + MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . } # Ignore "former entity" (obsolete currency) + FILTER(LANG(?article_name) IN (%LANGUAGES_SPARQL%)). +} +ORDER BY ?iso4217 ?article_name +""" + + +LANGUAGES = settings['locales'].keys() +LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES))) + + +def remove_accents(name): + return unicodedata.normalize('NFKD', name).lower() + + +def remove_extra(name): + for c in ('(', ':'): + if c in name: + name = name.split(c)[0].strip() + return name + + +def _normalize_name(name): + name = re.sub(' +', ' ', remove_accents(name.lower()).replace('-', ' ')) + name = remove_extra(name) + return name + + +def add_currency_name(db, name, iso4217, normalize_name=True): + db_names = db['names'] + + if normalize_name: + name = _normalize_name(name) + + iso4217_set = db_names.setdefault(name, []) + if iso4217 not in iso4217_set: + iso4217_set.insert(0, iso4217) + + +def add_currency_label(db, label, iso4217, language): + labels = db['iso4217'].setdefault(iso4217, {}) + labels[language] = label + + +def wikidata_request_result_iterator(request): + result = send_wikidata_query(request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)) + if result is not None: + for r in result['results']['bindings']: + yield r + + +def fetch_db(): + db = { + 'names': {}, + 'iso4217': {}, + } + + for r in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST): + iso4217 = r['iso4217']['value'] + article_name = 
r['article_name']['value'] + article_lang = r['article_name']['xml:lang'] + add_currency_name(db, article_name, iso4217) + add_currency_label(db, article_name, iso4217, article_lang) + + for r in wikidata_request_result_iterator(SARQL_REQUEST): + iso4217 = r['iso4217']['value'] + if 'label' in r: + label = r['label']['value'] + label_lang = r['label']['xml:lang'] + add_currency_name(db, label, iso4217) + add_currency_label(db, label, iso4217, label_lang) + + if 'alias' in r: + add_currency_name(db, r['alias']['value'], iso4217) + + if 'unicode' in r: + add_currency_name(db, r['unicode']['value'], iso4217, normalize_name=False) + + if 'unit' in r: + add_currency_name(db, r['unit']['value'], iso4217, normalize_name=False) + + # reduce memory usage: + # replace lists with one item by the item. + # see searx.search.processors.online_currency.name_to_iso4217 + for name in db['names']: + if len(db['names'][name]) == 1: + db['names'][name] = db['names'][name][0] + + return db + + +def get_filename(): + return join(join(searx_dir, "data"), "currencies.json") + + +def main(): + # + db = fetch_db() + # static + add_currency_name(db, "euro", 'EUR') + add_currency_name(db, "euros", 'EUR') + add_currency_name(db, "dollar", 'USD') + add_currency_name(db, "dollars", 'USD') + add_currency_name(db, "peso", 'MXN') + add_currency_name(db, "pesos", 'MXN') + + with open(get_filename(), 'w', encoding='utf8') as f: + json.dump(db, f, ensure_ascii=False, indent=4) + +if __name__ == '__main__': + main() diff --git a/searx_extra/update/update_engine_descriptions.py b/searx_extra/update/update_engine_descriptions.py new file mode 100755 index 000000000..109fdbfa0 --- /dev/null +++ b/searx_extra/update/update_engine_descriptions.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python + +import sys +import json +from urllib.parse import quote, urlparse +import detect_language +from lxml.html import fromstring + +from searx.engines.wikidata import send_wikidata_query +from searx.utils import extract_text 
+import searx +import searx.search +import searx.poolrequests + +SPARQL_WIKIPEDIA_ARTICLE = """ +SELECT DISTINCT ?item ?name +WHERE { + VALUES ?item { %IDS% } + ?article schema:about ?item ; + schema:inLanguage ?lang ; + schema:name ?name ; + schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] . + FILTER(?lang in (%LANGUAGES_SPARQL%)) . + FILTER (!CONTAINS(?name, ':')) . +} +""" + +SPARQL_DESCRIPTION = """ +SELECT DISTINCT ?item ?itemDescription +WHERE { + VALUES ?item { %IDS% } + ?item schema:description ?itemDescription . + FILTER (lang(?itemDescription) in (%LANGUAGES_SPARQL%)) +} +ORDER BY ?itemLang +""" + +LANGUAGES = searx.settings['locales'].keys() +LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES))) +IDS = None + +descriptions = {} +wd_to_engine_name = {} + + +def normalize_description(description): + for c in [chr(c) for c in range(0, 31)]: + description = description.replace(c, ' ') + description = ' '.join(description.strip().split()) + return description + + +def update_description(engine_name, lang, description, source, replace=True): + if replace or lang not in descriptions[engine_name]: + descriptions[engine_name][lang] = [normalize_description(description), source] + + +def get_wikipedia_summary(language, pageid): + search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' + url = search_url.format(title=quote(pageid), language=language) + try: + response = searx.poolrequests.get(url) + response.raise_for_status() + api_result = json.loads(response.text) + return api_result.get('extract') + except: + return None + + +def detect_language(text): + r = cld3.get_language(str(text)) # pylint: disable=E1101 + if r is not None and r.probability >= 0.98 and r.is_reliable: + return r.language + return None + + +def get_website_description(url, lang1, lang2=None): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', + 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'DNT': '1', + 'Upgrade-Insecure-Requests': '1', + 'Sec-GPC': '1', + 'Cache-Control': 'max-age=0', + } + if lang1 is not None: + lang_list = [lang1] + if lang2 is not None: + lang_list.append(lang2) + headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8' + try: + response = searx.poolrequests.get(url, headers=headers, timeout=10) + response.raise_for_status() + except Exception: + return (None, None) + + try: + html = fromstring(response.text) + except ValueError: + html = fromstring(response.content) + + description = extract_text(html.xpath('/html/head/meta[@name="description"]/@content')) + if not description: + description = extract_text(html.xpath('/html/head/meta[@property="og:description"]/@content')) + if not description: + description = extract_text(html.xpath('/html/head/title')) + lang = extract_text(html.xpath('/html/@lang')) + if lang is None and len(lang1) > 0: + lang = lang1 + lang = detect_language(description) or lang or 'en' + lang = lang.split('_')[0] + lang = lang.split('-')[0] + return (lang, description) + + +def initialize(): + global descriptions, wd_to_engine_name, IDS + searx.search.initialize() + for engine_name, engine in searx.engines.engines.items(): + descriptions[engine_name] = {} + wikidata_id = getattr(engine, "about", {}).get('wikidata_id') + if wikidata_id is not None: + wd_to_engine_name.setdefault(wikidata_id, set()).add(engine_name) + + IDS = ' '.join(list(map(lambda wd_id: 'wd:' + wd_id, wd_to_engine_name.keys()))) + + +def fetch_wikidata_descriptions(): + global IDS + result = send_wikidata_query(SPARQL_DESCRIPTION + .replace('%IDS%', IDS) + .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)) + if result is not None: + for binding in result['results']['bindings']: + wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '') + lang = binding['itemDescription']['xml:lang'] + description = 
binding['itemDescription']['value'] + if ' ' in description: # skip unique word description (like "website") + for engine_name in wd_to_engine_name[wikidata_id]: + update_description(engine_name, lang, description, 'wikidata') + + +def fetch_wikipedia_descriptions(): + global IDS + result = send_wikidata_query(SPARQL_WIKIPEDIA_ARTICLE + .replace('%IDS%', IDS) + .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)) + if result is not None: + for binding in result['results']['bindings']: + wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '') + lang = binding['name']['xml:lang'] + pageid = binding['name']['value'] + description = get_wikipedia_summary(lang, pageid) + if description is not None and ' ' in description: + for engine_name in wd_to_engine_name[wikidata_id]: + update_description(engine_name, lang, description, 'wikipedia') + + +def normalize_url(url): + url = url.replace('{language}', 'en') + url = urlparse(url)._replace(path='/', params='', query='', fragment='').geturl() + url = url.replace('https://api.', 'https://') + return url + + +def fetch_website_description(engine_name, website): + default_lang, default_description = get_website_description(website, None, None) + if default_lang is None or default_description is None: + return + if default_lang not in descriptions[engine_name]: + descriptions[engine_name][default_lang] = [normalize_description(default_description), website] + for request_lang in ('en-US', 'es-US', 'fr-FR', 'zh', 'ja', 'ru', 'ar', 'ko'): + if request_lang.split('-')[0] not in descriptions[engine_name]: + lang, desc = get_website_description(website, request_lang, request_lang.split('-')[0]) + if desc is not None and desc != default_description: + update_description(engine_name, lang, desc, website, replace=False) + else: + break + + +def fetch_website_descriptions(): + for engine_name, engine in searx.engines.engines.items(): + website = getattr(engine, "about", {}).get('website') + if website is None: 
+ website = normalize_url(getattr(engine, "search_url")) + if website is None: + website = normalize_url(getattr(engine, "base_url")) + if website is not None: + fetch_website_description(engine_name, website) + + +def main(): + initialize() + fetch_wikidata_descriptions() + fetch_wikipedia_descriptions() + fetch_website_descriptions() + + sys.stdout.write(json.dumps(descriptions, indent=1, separators=(',', ':'), ensure_ascii=False)) + + +if __name__ == "__main__": + main() diff --git a/searx_extra/update/update_external_bangs.py b/searx_extra/update/update_external_bangs.py new file mode 100755 index 000000000..e9dc0ff1d --- /dev/null +++ b/searx_extra/update/update_external_bangs.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python +""" +Update searx/data/external_bangs.json using the duckduckgo bangs. + +https://duckduckgo.com/newbang loads +* a javascript which provides the bang version ( https://duckduckgo.com/bv1.js ) +* a JSON file which contains the bangs ( https://duckduckgo.com/bang.v260.js for example ) + +This script loads the javascript, then the bangs. 
+ +The javascript URL may change in the future ( for example https://duckduckgo.com/bv2.js ), +but most probably it will require an update of RE_BANG_VERSION +""" +# pylint: disable=C0116 + +import json +import re +from os.path import join + +import requests + +from searx import searx_dir # pylint: disable=E0401 C0413 + + +# from https://duckduckgo.com/newbang +URL_BV1 = 'https://duckduckgo.com/bv1.js' +RE_BANG_VERSION = re.compile(r'\/bang\.v([0-9]+)\.js') +HTTPS_COLON = 'https:' +HTTP_COLON = 'http:' + + +def get_bang_url(): + response = requests.get(URL_BV1) + response.raise_for_status() + + r = RE_BANG_VERSION.findall(response.text) + return f'https://duckduckgo.com/bang.v{r[0]}.js', r[0] + + +def fetch_ddg_bangs(url): + response = requests.get(url) + response.raise_for_status() + return json.loads(response.content.decode()) + + +def merge_when_no_leaf(node): + """Minimize the number of nodes + + A -> B -> C + B is child of A + C is child of B + + If no C is equal to '*', then every C is merged into A + + For example: + d -> d -> g -> * (ddg*) + -> i -> g -> * (dig*) + becomes + d -> dg -> * + -> ig -> * + """ + restart = False + if not isinstance(node, dict): + return + + # create a copy of the keys so node can be modified + keys = list(node.keys()) + + for key in keys: + if key == '*': + continue + + value = node[key] + value_keys = list(value.keys()) + if '*' not in value_keys: + for value_key in value_keys: + node[key + value_key] = value[value_key] + merge_when_no_leaf(node[key + value_key]) + del node[key] + restart = True + else: + merge_when_no_leaf(value) + + if restart: + merge_when_no_leaf(node) + + +def optimize_leaf(parent, parent_key, node): + if not isinstance(node, dict): + return + + if len(node) == 1 and '*' in node and parent is not None: + parent[parent_key] = node['*'] + else: + for key, value in node.items(): + optimize_leaf(node, key, value) + + +def parse_ddg_bangs(ddg_bangs): + bang_trie = {} + bang_urls = {} + + for 
bang_definition in ddg_bangs: + # bang_list + bang_url = bang_definition['u'] + if '{{{s}}}' not in bang_url: + # ignore invalid bang + continue + + bang_url = bang_url.replace('{{{s}}}', chr(2)) + + # only for the https protocol: "https://example.com" becomes "//example.com" + if bang_url.startswith(HTTPS_COLON + '//'): + bang_url = bang_url[len(HTTPS_COLON):] + + # + if bang_url.startswith(HTTP_COLON + '//') and bang_url[len(HTTP_COLON):] in bang_urls: + # if the bang_url uses the http:// protocol, and the same URL exists in https:// + # then reuse the https:// bang definition. (written //example.com) + bang_def_output = bang_urls[bang_url[len(HTTP_COLON):]] + else: + # normal use case : new http:// URL or https:// URL (without "https:", see above) + bang_rank = str(bang_definition['r']) + bang_def_output = bang_url + chr(1) + bang_rank + bang_def_output = bang_urls.setdefault(bang_url, bang_def_output) + + bang_urls[bang_url] = bang_def_output + + # bang name + bang = bang_definition['t'] + + # bang_trie + t = bang_trie + for bang_letter in bang: + t = t.setdefault(bang_letter, {}) + t = t.setdefault('*', bang_def_output) + + # optimize the trie + merge_when_no_leaf(bang_trie) + optimize_leaf(None, None, bang_trie) + + return bang_trie + + +def get_bangs_filename(): + return join(join(searx_dir, "data"), "external_bangs.json") + + +if __name__ == '__main__': + bangs_url, bangs_version = get_bang_url() + print(f'fetch bangs from {bangs_url}') + output = { + 'version': bangs_version, + 'trie': parse_ddg_bangs(fetch_ddg_bangs(bangs_url)) + } + with open(get_bangs_filename(), 'w') as fp: + json.dump(output, fp, ensure_ascii=False, indent=4) diff --git a/searx_extra/update/update_firefox_version.py b/searx_extra/update/update_firefox_version.py new file mode 100755 index 000000000..6acfe76ce --- /dev/null +++ b/searx_extra/update/update_firefox_version.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python + +import json +import requests +import re +from os.path import dirname, 
join +from urllib.parse import urlparse, urljoin +from distutils.version import LooseVersion, StrictVersion +from lxml import html +from searx import searx_dir + +URL = 'https://ftp.mozilla.org/pub/firefox/releases/' +RELEASE_PATH = '/pub/firefox/releases/' + +NORMAL_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?$') +# BETA_REGEX = re.compile('.*[0-9]b([0-9\-a-z]+)$') +# ESR_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?esr$') + +# +useragents = { + "versions": (), + "os": ('Windows NT 10.0; WOW64', + 'X11; Linux x86_64'), + "ua": "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}" +} + + +def fetch_firefox_versions(): + resp = requests.get(URL, timeout=2.0) + if resp.status_code != 200: + raise Exception("Error fetching firefox versions, HTTP code " + resp.status_code) + else: + dom = html.fromstring(resp.text) + versions = [] + + for link in dom.xpath('//a/@href'): + url = urlparse(urljoin(URL, link)) + path = url.path + if path.startswith(RELEASE_PATH): + version = path[len(RELEASE_PATH):-1] + if NORMAL_REGEX.match(version): + versions.append(LooseVersion(version)) + + list.sort(versions, reverse=True) + return versions + + +def fetch_firefox_last_versions(): + versions = fetch_firefox_versions() + + result = [] + major_last = versions[0].version[0] + major_list = (major_last, major_last - 1) + for version in versions: + major_current = version.version[0] + if major_current in major_list: + result.append(version.vstring) + + return result + + +def get_useragents_filename(): + return join(join(searx_dir, "data"), "useragents.json") + + +useragents["versions"] = fetch_firefox_last_versions() +with open(get_useragents_filename(), "w") as f: + json.dump(useragents, f, indent=4, ensure_ascii=False) diff --git a/searx_extra/update/update_languages.py b/searx_extra/update/update_languages.py new file mode 100755 index 000000000..e63282586 --- /dev/null +++ b/searx_extra/update/update_languages.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python + +# This script 
generates languages.py from intersecting each engine's supported languages. +# +# Output files: searx/data/engines_languages.json and searx/languages.py + +import json +from pathlib import Path +from pprint import pformat +from babel import Locale, UnknownLocaleError +from babel.languages import get_global + +from searx import settings, searx_dir +from searx.engines import initialize_engines, engines + +# Output files. +engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json' +languages_file = Path(searx_dir) / 'languages.py' + + +# Fetchs supported languages for each engine and writes json file with those. +def fetch_supported_languages(): + + engines_languages = dict() + names = list(engines) + names.sort() + + for engine_name in names: + if hasattr(engines[engine_name], 'fetch_supported_languages'): + engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() + print("fetched %s languages from engine %s" % ( + len(engines_languages[engine_name]), engine_name)) + if type(engines_languages[engine_name]) == list: + engines_languages[engine_name] = sorted(engines_languages[engine_name]) + + # write json file + with open(engines_languages_file, 'w', encoding='utf-8') as f: + json.dump(engines_languages, f, indent=2, sort_keys=True) + + return engines_languages + + +# Get babel Locale object from lang_code if possible. +def get_locale(lang_code): + try: + locale = Locale.parse(lang_code, sep='-') + return locale + except (UnknownLocaleError, ValueError): + return None + + +# Join all language lists. 
+def join_language_lists(engines_languages): + language_list = dict() + for engine_name in engines_languages: + for lang_code in engines_languages[engine_name]: + + # apply custom fixes if necessary + if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values(): + lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items() + if lang_code == alias) + + locale = get_locale(lang_code) + + # ensure that lang_code uses standard language and country codes + if locale and locale.territory: + lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory) + short_code = lang_code.split('-')[0] + + # add language without country if not in list + if short_code not in language_list: + if locale: + # get language's data from babel's Locale object + language_name = locale.get_language_name().title() + english_name = locale.english_name.split(' (')[0] + elif short_code in engines_languages['wikipedia']: + # get language's data from wikipedia if not known by babel + language_name = engines_languages['wikipedia'][short_code]['name'] + english_name = engines_languages['wikipedia'][short_code]['english_name'] + else: + language_name = None + english_name = None + + # add language to list + language_list[short_code] = {'name': language_name, + 'english_name': english_name, + 'counter': set(), + 'countries': dict()} + + # add language with country if not in list + if lang_code != short_code and lang_code not in language_list[short_code]['countries']: + country_name = '' + if locale: + # get country name from babel's Locale object + country_name = locale.get_territory_name() + + language_list[short_code]['countries'][lang_code] = {'country_name': country_name, + 'counter': set()} + + # count engine for both language_country combination and language alone + language_list[short_code]['counter'].add(engine_name) + if lang_code != short_code: + language_list[short_code]['countries'][lang_code]['counter'].add(engine_name) + + 
return language_list + + +# Filter language list so it only includes the most supported languages and countries +def filter_language_list(all_languages): + min_engines_per_lang = 15 + min_engines_per_country = 10 + main_engines = [engine_name for engine_name in engines.keys() + if 'general' in engines[engine_name].categories and + engines[engine_name].supported_languages and + not engines[engine_name].disabled] + + # filter list to include only languages supported by most engines or all default general engines + filtered_languages = {code: lang for code, lang + in all_languages.items() + if (len(lang['counter']) >= min_engines_per_lang or + all(main_engine in lang['counter'] + for main_engine in main_engines))} + + def _copy_lang_data(lang, country_name=None): + new_dict = dict() + new_dict['name'] = all_languages[lang]['name'] + new_dict['english_name'] = all_languages[lang]['english_name'] + if country_name: + new_dict['country_name'] = country_name + return new_dict + + def _country_count(i): + return len(countries[sorted_countries[i]]['counter']) + + # for each language get country codes supported by most engines or at least one country code + filtered_languages_with_countries = dict() + for lang, lang_data in filtered_languages.items(): + countries = lang_data['countries'] + filtered_countries = dict() + + # get language's country codes with enough supported engines + for lang_country, country_data in countries.items(): + if len(country_data['counter']) >= min_engines_per_country: + filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name']) + + # add language without countries too if there's more than one country to choose from + if len(filtered_countries) > 1: + filtered_countries[lang] = _copy_lang_data(lang) + elif len(filtered_countries) == 1: + # if there's only one country per language, it's not necessary to show country name + lang_country = next(iter(filtered_countries)) + filtered_countries[lang_country]['country_name'] = 
None + + # if no country has enough engines try to get most likely country code from babel + if not filtered_countries: + lang_country = None + subtags = get_global('likely_subtags').get(lang) + if subtags: + country_code = subtags.split('_')[-1] + if len(country_code) == 2: + lang_country = "{lang}-{country}".format(lang=lang, country=country_code) + + if lang_country: + filtered_countries[lang_country] = _copy_lang_data(lang) + else: + filtered_countries[lang] = _copy_lang_data(lang) + + filtered_languages_with_countries.update(filtered_countries) + + return filtered_languages_with_countries + + +# Write languages.py. +def write_languages_file(languages): + file_headers = ( + "# -*- coding: utf-8 -*-", + "# list of language codes", + "# this file is generated automatically by utils/fetch_languages.py", + "language_codes =" + ) + + language_codes = tuple([ + ( + code, + languages[code]['name'].split(' (')[0], + languages[code].get('country_name') or '', + languages[code].get('english_name') or '' + ) for code in sorted(languages) + ]) + + with open(languages_file, 'w') as new_file: + file_content = "{file_headers} \\\n{language_codes}".format( + file_headers='\n'.join(file_headers), + language_codes=pformat(language_codes, indent=4) + ) + new_file.write(file_content) + new_file.close() + + +if __name__ == "__main__": + initialize_engines(settings['engines']) + engines_languages = fetch_supported_languages() + all_languages = join_language_lists(engines_languages) + filtered_languages = filter_language_list(all_languages) + write_languages_file(filtered_languages) diff --git a/searx_extra/update/update_translations.sh b/searx_extra/update/update_translations.sh new file mode 100755 index 000000000..240387ae7 --- /dev/null +++ b/searx_extra/update/update_translations.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +# script to easily update translation language files + +# add new language: +# pybabel init -i messages.pot -d searx/translations -l en + +SEARX_DIR='searx' + +pybabel 
extract -F babel.cfg -o messages.pot "$SEARX_DIR" +for f in `ls "$SEARX_DIR"'/translations/'`; do + pybabel update -N -i messages.pot -d "$SEARX_DIR"'/translations/' -l "$f" +done + +echo '[!] update done, edit .po files if required and run pybabel compile -d searx/translations/' diff --git a/searx_extra/update/update_wikidata_units.py b/searx_extra/update/update_wikidata_units.py new file mode 100755 index 000000000..1e6b8b9ca --- /dev/null +++ b/searx_extra/update/update_wikidata_units.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import json +import collections + +# set path +from os.path import join + +from searx import searx_dir +from searx.engines.wikidata import send_wikidata_query + + +# the response contains duplicate ?item with the different ?symbol +# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result +# even if a ?item has different ?symbol of the same rank. +# A deterministic result +# see: +# * https://www.wikidata.org/wiki/Help:Ranking +# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section) +# * https://w.wiki/32BT +# see the result for https://www.wikidata.org/wiki/Q11582 +# there are multiple symbols the same rank +SARQL_REQUEST = """ +SELECT DISTINCT ?item ?symbol +WHERE +{ + ?item wdt:P31/wdt:P279 wd:Q47574 . + ?item p:P5061 ?symbolP . + ?symbolP ps:P5061 ?symbol ; + wikibase:rank ?rank . + FILTER(LANG(?symbol) = "en"). 
+} +ORDER BY ?item DESC(?rank) ?symbol +""" + + +def get_data(): + results = collections.OrderedDict() + response = send_wikidata_query(SARQL_REQUEST) + for unit in response['results']['bindings']: + name = unit['item']['value'].replace('http://www.wikidata.org/entity/', '') + unit = unit['symbol']['value'] + if name not in results: + # ignore duplicate: always use the first one + results[name] = unit + return results + + +def get_wikidata_units_filename(): + return join(join(searx_dir, "data"), "wikidata_units.json") + + +with open(get_wikidata_units_filename(), 'w') as f: + json.dump(get_data(), f, indent=4, ensure_ascii=False) diff --git a/setup.py b/setup.py index 09a3021ee..61227d199 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ setup( author='Adam Tauber', author_email='asciimoo@gmail.com', license='GNU Affero General Public License', - packages=find_packages(exclude=["tests*"]), + packages=find_packages(exclude=["tests*", "searx_extra"]), zip_safe=False, install_requires=requirements, extras_require={ diff --git a/tests/unit/test_standalone_searx.py b/tests/unit/test_standalone_searx.py index 6cc230e6c..a69353c03 100644 --- a/tests/unit/test_standalone_searx.py +++ b/tests/unit/test_standalone_searx.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- """Test utils/standalone_searx.py""" import datetime -import importlib.util import io import sys @@ -10,16 +9,7 @@ from nose2.tools import params from searx.search import SearchQuery, EngineRef, initialize from searx.testing import SearxTestCase - - -def get_standalone_searx_module(): - """Get standalone_searx module.""" - module_name = 'utils.standalone_searx' - filename = 'utils/standalone_searx.py' - spec = importlib.util.spec_from_file_location(module_name, filename) - sas = importlib.util.module_from_spec(spec) - spec.loader.exec_module(sas) - return sas +from searx_extra import standalone_searx as sas class StandaloneSearx(SearxTestCase): @@ -33,7 +23,6 @@ class StandaloneSearx(SearxTestCase): def 
test_parse_argument_no_args(self): """Test parse argument without args.""" - sas = get_standalone_searx_module() with patch.object(sys, 'argv', ['standalone_searx']), \ self.assertRaises(SystemExit): sys.stderr = io.StringIO() @@ -42,7 +31,6 @@ class StandaloneSearx(SearxTestCase): def test_parse_argument_basic_args(self): """Test parse argument with basic args.""" - sas = get_standalone_searx_module() query = 'red box' exp_dict = { 'query': query, 'category': 'general', 'lang': 'all', 'pageno': 1, @@ -56,7 +44,6 @@ class StandaloneSearx(SearxTestCase): def test_to_dict(self): """test to_dict.""" - sas = get_standalone_searx_module() self.assertEqual( sas.to_dict( sas.get_search_query(sas.parse_argument(['red box']))), @@ -72,7 +59,6 @@ class StandaloneSearx(SearxTestCase): def test_to_dict_with_mock(self): """test to dict.""" - sas = get_standalone_searx_module() with patch.object(sas.searx.search, 'Search') as mock_s: m_search = mock_s().search() m_sq = Mock() @@ -97,7 +83,6 @@ class StandaloneSearx(SearxTestCase): def test_get_search_query(self): """test get_search_query.""" - sas = get_standalone_searx_module() args = sas.parse_argument(['rain', ]) search_q = sas.get_search_query(args) self.assertTrue(search_q) @@ -106,7 +91,6 @@ class StandaloneSearx(SearxTestCase): def test_no_parsed_url(self): """test no_parsed_url func""" - sas = get_standalone_searx_module() self.assertEqual( sas.no_parsed_url([{'parsed_url': 'http://example.com'}]), [{}] @@ -119,11 +103,9 @@ class StandaloneSearx(SearxTestCase): ) def test_json_serial(self, arg, exp_res): """test json_serial func""" - sas = get_standalone_searx_module() self.assertEqual(sas.json_serial(arg), exp_res) def test_json_serial_error(self): """test error on json_serial.""" - sas = get_standalone_searx_module() with self.assertRaises(TypeError): sas.json_serial('a') diff --git a/utils/fetch_ahmia_blacklist.py b/utils/fetch_ahmia_blacklist.py deleted file mode 100755 index 3e393edbe..000000000 --- 
a/utils/fetch_ahmia_blacklist.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python - -# This script saves Ahmia's blacklist for onion sites. -# More info in https://ahmia.fi/blacklist/ - -# set path -from sys import path -from os.path import realpath, dirname, join -path.append(realpath(dirname(realpath(__file__)) + '/../')) - -# -import requests -from searx import searx_dir - -URL = 'https://ahmia.fi/blacklist/banned/' - - -def fetch_ahmia_blacklist(): - resp = requests.get(URL, timeout=3.0) - if resp.status_code != 200: - raise Exception("Error fetching Ahmia blacklist, HTTP code " + resp.status_code) - else: - blacklist = resp.text.split() - return blacklist - - -def get_ahmia_blacklist_filename(): - return join(join(searx_dir, "data"), "ahmia_blacklist.txt") - - -blacklist = fetch_ahmia_blacklist() -with open(get_ahmia_blacklist_filename(), "w") as f: - f.write('\n'.join(blacklist)) diff --git a/utils/fetch_currencies.py b/utils/fetch_currencies.py deleted file mode 100644 index 8811049a5..000000000 --- a/utils/fetch_currencies.py +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/env python - -import re -import unicodedata -import json - -# set path -from sys import path -from os.path import realpath, dirname, join -path.append(realpath(dirname(realpath(__file__)) + '/../')) - -from searx import searx_dir, settings -from searx.engines.wikidata import send_wikidata_query - - -# ORDER BY (with all the query fields) is important to keep a deterministic result order -# so multiple invokation of this script doesn't change currencies.json -SARQL_REQUEST = """ -SELECT DISTINCT ?iso4217 ?unit ?unicode ?label ?alias WHERE { - ?item wdt:P498 ?iso4217; rdfs:label ?label. - OPTIONAL { ?item skos:altLabel ?alias FILTER (LANG (?alias) = LANG(?label)). } - OPTIONAL { ?item wdt:P5061 ?unit. } - OPTIONAL { ?item wdt:P489 ?symbol. - ?symbol wdt:P487 ?unicode. } - MINUS { ?item wdt:P582 ?end_data . } # Ignore monney with an end date - MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . 
} # Ignore "former entity" (obsolete currency) - FILTER(LANG(?label) IN (%LANGUAGES_SPARQL%)). -} -ORDER BY ?iso4217 ?unit ?unicode ?label ?alias -""" - -# ORDER BY (with all the query fields) is important to keep a deterministic result order -# so multiple invokation of this script doesn't change currencies.json -SPARQL_WIKIPEDIA_NAMES_REQUEST = """ -SELECT DISTINCT ?iso4217 ?article_name WHERE { - ?item wdt:P498 ?iso4217 . - ?article schema:about ?item ; - schema:name ?article_name ; - schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] - MINUS { ?item wdt:P582 ?end_data . } # Ignore monney with an end date - MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . } # Ignore "former entity" (obsolete currency) - FILTER(LANG(?article_name) IN (%LANGUAGES_SPARQL%)). -} -ORDER BY ?iso4217 ?article_name -""" - - -LANGUAGES = settings['locales'].keys() -LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES))) - - -def remove_accents(name): - return unicodedata.normalize('NFKD', name).lower() - - -def remove_extra(name): - for c in ('(', ':'): - if c in name: - name = name.split(c)[0].strip() - return name - - -def _normalize_name(name): - name = re.sub(' +', ' ', remove_accents(name.lower()).replace('-', ' ')) - name = remove_extra(name) - return name - - -def add_currency_name(db, name, iso4217, normalize_name=True): - db_names = db['names'] - - if normalize_name: - name = _normalize_name(name) - - iso4217_set = db_names.setdefault(name, []) - if iso4217 not in iso4217_set: - iso4217_set.insert(0, iso4217) - - -def add_currency_label(db, label, iso4217, language): - labels = db['iso4217'].setdefault(iso4217, {}) - labels[language] = label - - -def wikidata_request_result_iterator(request): - result = send_wikidata_query(request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)) - if result is not None: - for r in result['results']['bindings']: - yield r - - -def fetch_db(): - db = { - 'names': {}, - 'iso4217': {}, - } - - for r in 
wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST): - iso4217 = r['iso4217']['value'] - article_name = r['article_name']['value'] - article_lang = r['article_name']['xml:lang'] - add_currency_name(db, article_name, iso4217) - add_currency_label(db, article_name, iso4217, article_lang) - - for r in wikidata_request_result_iterator(SARQL_REQUEST): - iso4217 = r['iso4217']['value'] - if 'label' in r: - label = r['label']['value'] - label_lang = r['label']['xml:lang'] - add_currency_name(db, label, iso4217) - add_currency_label(db, label, iso4217, label_lang) - - if 'alias' in r: - add_currency_name(db, r['alias']['value'], iso4217) - - if 'unicode' in r: - add_currency_name(db, r['unicode']['value'], iso4217, normalize_name=False) - - if 'unit' in r: - add_currency_name(db, r['unit']['value'], iso4217, normalize_name=False) - - # reduce memory usage: - # replace lists with one item by the item. - # see searx.search.processors.online_currency.name_to_iso4217 - for name in db['names']: - if len(db['names'][name]) == 1: - db['names'][name] = db['names'][name][0] - - return db - - -def get_filename(): - return join(join(searx_dir, "data"), "currencies.json") - - -def main(): - # - db = fetch_db() - # static - add_currency_name(db, "euro", 'EUR') - add_currency_name(db, "euros", 'EUR') - add_currency_name(db, "dollar", 'USD') - add_currency_name(db, "dollars", 'USD') - add_currency_name(db, "peso", 'MXN') - add_currency_name(db, "pesos", 'MXN') - - with open(get_filename(), 'w', encoding='utf8') as f: - json.dump(db, f, ensure_ascii=False, indent=4) - -if __name__ == '__main__': - main() diff --git a/utils/fetch_engine_descriptions.py b/utils/fetch_engine_descriptions.py deleted file mode 100644 index 9ca001d45..000000000 --- a/utils/fetch_engine_descriptions.py +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/env python - -import sys -import json -from urllib.parse import quote, urlparse -from os.path import realpath, dirname -import cld3 -from lxml.html import 
fromstring - -# set path -sys.path.append(realpath(dirname(realpath(__file__)) + '/../')) - -from searx.engines.wikidata import send_wikidata_query -from searx.utils import extract_text -import searx -import searx.search -import searx.poolrequests - -SPARQL_WIKIPEDIA_ARTICLE = """ -SELECT DISTINCT ?item ?name -WHERE { - VALUES ?item { %IDS% } - ?article schema:about ?item ; - schema:inLanguage ?lang ; - schema:name ?name ; - schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] . - FILTER(?lang in (%LANGUAGES_SPARQL%)) . - FILTER (!CONTAINS(?name, ':')) . -} -""" - -SPARQL_DESCRIPTION = """ -SELECT DISTINCT ?item ?itemDescription -WHERE { - VALUES ?item { %IDS% } - ?item schema:description ?itemDescription . - FILTER (lang(?itemDescription) in (%LANGUAGES_SPARQL%)) -} -ORDER BY ?itemLang -""" - -LANGUAGES = searx.settings['locales'].keys() -LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES))) -IDS = None - -descriptions = {} -wd_to_engine_name = {} - - -def normalize_description(description): - for c in [chr(c) for c in range(0, 31)]: - description = description.replace(c, ' ') - description = ' '.join(description.strip().split()) - return description - - -def update_description(engine_name, lang, description, source, replace=True): - if replace or lang not in descriptions[engine_name]: - descriptions[engine_name][lang] = [normalize_description(description), source] - - -def get_wikipedia_summary(language, pageid): - search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' - url = search_url.format(title=quote(pageid), language=language) - try: - response = searx.poolrequests.get(url) - response.raise_for_status() - api_result = json.loads(response.text) - return api_result.get('extract') - except: - return None - - -def detect_language(text): - r = cld3.get_language(str(text)) # pylint: disable=E1101 - if r is not None and r.probability >= 0.98 and r.is_reliable: - return r.language - return None - - -def 
get_website_description(url, lang1, lang2=None): - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'DNT': '1', - 'Upgrade-Insecure-Requests': '1', - 'Sec-GPC': '1', - 'Cache-Control': 'max-age=0', - } - if lang1 is not None: - lang_list = [lang1] - if lang2 is not None: - lang_list.append(lang2) - headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8' - try: - response = searx.poolrequests.get(url, headers=headers, timeout=10) - response.raise_for_status() - except Exception: - return (None, None) - - try: - html = fromstring(response.text) - except ValueError: - html = fromstring(response.content) - - description = extract_text(html.xpath('/html/head/meta[@name="description"]/@content')) - if not description: - description = extract_text(html.xpath('/html/head/meta[@property="og:description"]/@content')) - if not description: - description = extract_text(html.xpath('/html/head/title')) - lang = extract_text(html.xpath('/html/@lang')) - if lang is None and len(lang1) > 0: - lang = lang1 - lang = detect_language(description) or lang or 'en' - lang = lang.split('_')[0] - lang = lang.split('-')[0] - return (lang, description) - - -def initialize(): - global descriptions, wd_to_engine_name, IDS - searx.search.initialize() - for engine_name, engine in searx.engines.engines.items(): - descriptions[engine_name] = {} - wikidata_id = getattr(engine, "about", {}).get('wikidata_id') - if wikidata_id is not None: - wd_to_engine_name.setdefault(wikidata_id, set()).add(engine_name) - - IDS = ' '.join(list(map(lambda wd_id: 'wd:' + wd_id, wd_to_engine_name.keys()))) - - -def fetch_wikidata_descriptions(): - global IDS - result = send_wikidata_query(SPARQL_DESCRIPTION - .replace('%IDS%', IDS) - .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)) - if result is not None: - for binding in result['results']['bindings']: - 
wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '') - lang = binding['itemDescription']['xml:lang'] - description = binding['itemDescription']['value'] - if ' ' in description: # skip unique word description (like "website") - for engine_name in wd_to_engine_name[wikidata_id]: - update_description(engine_name, lang, description, 'wikidata') - - -def fetch_wikipedia_descriptions(): - global IDS - result = send_wikidata_query(SPARQL_WIKIPEDIA_ARTICLE - .replace('%IDS%', IDS) - .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)) - if result is not None: - for binding in result['results']['bindings']: - wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '') - lang = binding['name']['xml:lang'] - pageid = binding['name']['value'] - description = get_wikipedia_summary(lang, pageid) - if description is not None and ' ' in description: - for engine_name in wd_to_engine_name[wikidata_id]: - update_description(engine_name, lang, description, 'wikipedia') - - -def normalize_url(url): - url = url.replace('{language}', 'en') - url = urlparse(url)._replace(path='/', params='', query='', fragment='').geturl() - url = url.replace('https://api.', 'https://') - return url - - -def fetch_website_description(engine_name, website): - default_lang, default_description = get_website_description(website, None, None) - if default_lang is None or default_description is None: - return - if default_lang not in descriptions[engine_name]: - descriptions[engine_name][default_lang] = [normalize_description(default_description), website] - for request_lang in ('en-US', 'es-US', 'fr-FR', 'zh', 'ja', 'ru', 'ar', 'ko'): - if request_lang.split('-')[0] not in descriptions[engine_name]: - lang, desc = get_website_description(website, request_lang, request_lang.split('-')[0]) - if desc is not None and desc != default_description: - update_description(engine_name, lang, desc, website, replace=False) - else: - break - - -def 
fetch_website_descriptions(): - for engine_name, engine in searx.engines.engines.items(): - website = getattr(engine, "about", {}).get('website') - if website is None: - website = normalize_url(getattr(engine, "search_url")) - if website is None: - website = normalize_url(getattr(engine, "base_url")) - if website is not None: - fetch_website_description(engine_name, website) - - -def main(): - initialize() - fetch_wikidata_descriptions() - fetch_wikipedia_descriptions() - fetch_website_descriptions() - - sys.stdout.write(json.dumps(descriptions, indent=1, separators=(',', ':'), ensure_ascii=False)) - - -if __name__ == "__main__": - main() diff --git a/utils/fetch_external_bangs.py b/utils/fetch_external_bangs.py deleted file mode 100755 index ba6f51e7a..000000000 --- a/utils/fetch_external_bangs.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python -""" -Update searx/data/external_bangs.json using the duckduckgo bangs. - -https://duckduckgo.com/newbang loads -* a javascript which provides the bang version ( https://duckduckgo.com/bv1.js ) -* a JSON file which contains the bangs ( https://duckduckgo.com/bang.v260.js for example ) - -This script loads the javascript, then the bangs. 
- -The javascript URL may change in the future ( for example https://duckduckgo.com/bv2.js ), -but most probably it will requires to update RE_BANG_VERSION -""" -# pylint: disable=C0116 - -import sys -import json -import re -from os.path import realpath, dirname, join - -import requests - -# set path -sys.path.append(realpath(dirname(realpath(__file__)) + '/../')) - -from searx import searx_dir # pylint: disable=E0401 C0413 - - -# from https://duckduckgo.com/newbang -URL_BV1 = 'https://duckduckgo.com/bv1.js' -RE_BANG_VERSION = re.compile(r'\/bang\.v([0-9]+)\.js') -HTTPS_COLON = 'https:' -HTTP_COLON = 'http:' - - -def get_bang_url(): - response = requests.get(URL_BV1) - response.raise_for_status() - - r = RE_BANG_VERSION.findall(response.text) - return f'https://duckduckgo.com/bang.v{r[0]}.js', r[0] - - -def fetch_ddg_bangs(url): - response = requests.get(url) - response.raise_for_status() - return json.loads(response.content.decode()) - - -def merge_when_no_leaf(node): - """Minimize the number of nodes - - A -> B -> C - B is child of A - C is child of B - - If there are no C equals to '*', then each C are merged into A - - For example: - d -> d -> g -> * (ddg*) - -> i -> g -> * (dig*) - becomes - d -> dg -> * - -> ig -> * - """ - restart = False - if not isinstance(node, dict): - return - - # create a copy of the keys so node can be modified - keys = list(node.keys()) - - for key in keys: - if key == '*': - continue - - value = node[key] - value_keys = list(value.keys()) - if '*' not in value_keys: - for value_key in value_keys: - node[key + value_key] = value[value_key] - merge_when_no_leaf(node[key + value_key]) - del node[key] - restart = True - else: - merge_when_no_leaf(value) - - if restart: - merge_when_no_leaf(node) - - -def optimize_leaf(parent, parent_key, node): - if not isinstance(node, dict): - return - - if len(node) == 1 and '*' in node and parent is not None: - parent[parent_key] = node['*'] - else: - for key, value in node.items(): - 
optimize_leaf(node, key, value) - - -def parse_ddg_bangs(ddg_bangs): - bang_trie = {} - bang_urls = {} - - for bang_definition in ddg_bangs: - # bang_list - bang_url = bang_definition['u'] - if '{{{s}}}' not in bang_url: - # ignore invalid bang - continue - - bang_url = bang_url.replace('{{{s}}}', chr(2)) - - # only for the https protocol: "https://example.com" becomes "//example.com" - if bang_url.startswith(HTTPS_COLON + '//'): - bang_url = bang_url[len(HTTPS_COLON):] - - # - if bang_url.startswith(HTTP_COLON + '//') and bang_url[len(HTTP_COLON):] in bang_urls: - # if the bang_url uses the http:// protocol, and the same URL exists in https:// - # then reuse the https:// bang definition. (written //example.com) - bang_def_output = bang_urls[bang_url[len(HTTP_COLON):]] - else: - # normal use case : new http:// URL or https:// URL (without "https:", see above) - bang_rank = str(bang_definition['r']) - bang_def_output = bang_url + chr(1) + bang_rank - bang_def_output = bang_urls.setdefault(bang_url, bang_def_output) - - bang_urls[bang_url] = bang_def_output - - # bang name - bang = bang_definition['t'] - - # bang_trie - t = bang_trie - for bang_letter in bang: - t = t.setdefault(bang_letter, {}) - t = t.setdefault('*', bang_def_output) - - # optimize the trie - merge_when_no_leaf(bang_trie) - optimize_leaf(None, None, bang_trie) - - return bang_trie - - -def get_bangs_filename(): - return join(join(searx_dir, "data"), "external_bangs.json") - - -if __name__ == '__main__': - bangs_url, bangs_version = get_bang_url() - print(f'fetch bangs from {bangs_url}') - output = { - 'version': bangs_version, - 'trie': parse_ddg_bangs(fetch_ddg_bangs(bangs_url)) - } - with open(get_bangs_filename(), 'w') as fp: - json.dump(output, fp, ensure_ascii=False, indent=4) diff --git a/utils/fetch_firefox_version.py b/utils/fetch_firefox_version.py deleted file mode 100755 index 997a752b3..000000000 --- a/utils/fetch_firefox_version.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python 
- -# set path -from sys import path -from os.path import realpath, dirname, join -path.append(realpath(dirname(realpath(__file__)) + '/../')) - -# -import json -import requests -import re -from urllib.parse import urlparse, urljoin -from distutils.version import LooseVersion, StrictVersion -from lxml import html -from searx import searx_dir - -URL = 'https://ftp.mozilla.org/pub/firefox/releases/' -RELEASE_PATH = '/pub/firefox/releases/' - -NORMAL_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?$') -# BETA_REGEX = re.compile('.*[0-9]b([0-9\-a-z]+)$') -# ESR_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?esr$') - -# -useragents = { - "versions": (), - "os": ('Windows NT 10.0; WOW64', - 'X11; Linux x86_64'), - "ua": "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}" -} - - -def fetch_firefox_versions(): - resp = requests.get(URL, timeout=2.0) - if resp.status_code != 200: - raise Exception("Error fetching firefox versions, HTTP code " + resp.status_code) - else: - dom = html.fromstring(resp.text) - versions = [] - - for link in dom.xpath('//a/@href'): - url = urlparse(urljoin(URL, link)) - path = url.path - if path.startswith(RELEASE_PATH): - version = path[len(RELEASE_PATH):-1] - if NORMAL_REGEX.match(version): - versions.append(LooseVersion(version)) - - list.sort(versions, reverse=True) - return versions - - -def fetch_firefox_last_versions(): - versions = fetch_firefox_versions() - - result = [] - major_last = versions[0].version[0] - major_list = (major_last, major_last - 1) - for version in versions: - major_current = version.version[0] - if major_current in major_list: - result.append(version.vstring) - - return result - - -def get_useragents_filename(): - return join(join(searx_dir, "data"), "useragents.json") - - -useragents["versions"] = fetch_firefox_last_versions() -with open(get_useragents_filename(), "w") as f: - json.dump(useragents, f, indent=4, ensure_ascii=False) diff --git a/utils/fetch_languages.py b/utils/fetch_languages.py deleted file 
mode 100644 index 582e0ae00..000000000 --- a/utils/fetch_languages.py +++ /dev/null @@ -1,207 +0,0 @@ -# -*- coding: utf-8 -*- - -# This script generates languages.py from intersecting each engine's supported languages. -# -# Output files: searx/data/engines_languages.json and searx/languages.py - -import json -from pathlib import Path -from pprint import pformat -from sys import path -from babel import Locale, UnknownLocaleError -from babel.languages import get_global - -path.append('../searx') # noqa -from searx import settings, searx_dir -from searx.engines import initialize_engines, engines - -# Output files. -engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json' -languages_file = Path(searx_dir) / 'languages.py' - - -# Fetchs supported languages for each engine and writes json file with those. -def fetch_supported_languages(): - - engines_languages = dict() - names = list(engines) - names.sort() - - for engine_name in names: - if hasattr(engines[engine_name], 'fetch_supported_languages'): - engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() - print("fetched %s languages from engine %s" % ( - len(engines_languages[engine_name]), engine_name)) - if type(engines_languages[engine_name]) == list: - engines_languages[engine_name] = sorted(engines_languages[engine_name]) - - # write json file - with open(engines_languages_file, 'w', encoding='utf-8') as f: - json.dump(engines_languages, f, indent=2, sort_keys=True) - - return engines_languages - - -# Get babel Locale object from lang_code if possible. -def get_locale(lang_code): - try: - locale = Locale.parse(lang_code, sep='-') - return locale - except (UnknownLocaleError, ValueError): - return None - - -# Join all language lists. 
-def join_language_lists(engines_languages): - language_list = dict() - for engine_name in engines_languages: - for lang_code in engines_languages[engine_name]: - - # apply custom fixes if necessary - if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values(): - lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items() - if lang_code == alias) - - locale = get_locale(lang_code) - - # ensure that lang_code uses standard language and country codes - if locale and locale.territory: - lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory) - short_code = lang_code.split('-')[0] - - # add language without country if not in list - if short_code not in language_list: - if locale: - # get language's data from babel's Locale object - language_name = locale.get_language_name().title() - english_name = locale.english_name.split(' (')[0] - elif short_code in engines_languages['wikipedia']: - # get language's data from wikipedia if not known by babel - language_name = engines_languages['wikipedia'][short_code]['name'] - english_name = engines_languages['wikipedia'][short_code]['english_name'] - else: - language_name = None - english_name = None - - # add language to list - language_list[short_code] = {'name': language_name, - 'english_name': english_name, - 'counter': set(), - 'countries': dict()} - - # add language with country if not in list - if lang_code != short_code and lang_code not in language_list[short_code]['countries']: - country_name = '' - if locale: - # get country name from babel's Locale object - country_name = locale.get_territory_name() - - language_list[short_code]['countries'][lang_code] = {'country_name': country_name, - 'counter': set()} - - # count engine for both language_country combination and language alone - language_list[short_code]['counter'].add(engine_name) - if lang_code != short_code: - language_list[short_code]['countries'][lang_code]['counter'].add(engine_name) - - 
return language_list - - -# Filter language list so it only includes the most supported languages and countries -def filter_language_list(all_languages): - min_engines_per_lang = 15 - min_engines_per_country = 10 - main_engines = [engine_name for engine_name in engines.keys() - if 'general' in engines[engine_name].categories and - engines[engine_name].supported_languages and - not engines[engine_name].disabled] - - # filter list to include only languages supported by most engines or all default general engines - filtered_languages = {code: lang for code, lang - in all_languages.items() - if (len(lang['counter']) >= min_engines_per_lang or - all(main_engine in lang['counter'] - for main_engine in main_engines))} - - def _copy_lang_data(lang, country_name=None): - new_dict = dict() - new_dict['name'] = all_languages[lang]['name'] - new_dict['english_name'] = all_languages[lang]['english_name'] - if country_name: - new_dict['country_name'] = country_name - return new_dict - - def _country_count(i): - return len(countries[sorted_countries[i]]['counter']) - - # for each language get country codes supported by most engines or at least one country code - filtered_languages_with_countries = dict() - for lang, lang_data in filtered_languages.items(): - countries = lang_data['countries'] - filtered_countries = dict() - - # get language's country codes with enough supported engines - for lang_country, country_data in countries.items(): - if len(country_data['counter']) >= min_engines_per_country: - filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name']) - - # add language without countries too if there's more than one country to choose from - if len(filtered_countries) > 1: - filtered_countries[lang] = _copy_lang_data(lang) - elif len(filtered_countries) == 1: - # if there's only one country per language, it's not necessary to show country name - lang_country = next(iter(filtered_countries)) - filtered_countries[lang_country]['country_name'] = 
None - - # if no country has enough engines try to get most likely country code from babel - if not filtered_countries: - lang_country = None - subtags = get_global('likely_subtags').get(lang) - if subtags: - country_code = subtags.split('_')[-1] - if len(country_code) == 2: - lang_country = "{lang}-{country}".format(lang=lang, country=country_code) - - if lang_country: - filtered_countries[lang_country] = _copy_lang_data(lang) - else: - filtered_countries[lang] = _copy_lang_data(lang) - - filtered_languages_with_countries.update(filtered_countries) - - return filtered_languages_with_countries - - -# Write languages.py. -def write_languages_file(languages): - file_headers = ( - "# -*- coding: utf-8 -*-", - "# list of language codes", - "# this file is generated automatically by utils/fetch_languages.py", - "language_codes =" - ) - - language_codes = tuple([ - ( - code, - languages[code]['name'].split(' (')[0], - languages[code].get('country_name') or '', - languages[code].get('english_name') or '' - ) for code in sorted(languages) - ]) - - with open(languages_file, 'w') as new_file: - file_content = "{file_headers} \\\n{language_codes}".format( - file_headers='\n'.join(file_headers), - language_codes=pformat(language_codes, indent=4) - ) - new_file.write(file_content) - new_file.close() - - -if __name__ == "__main__": - initialize_engines(settings['engines']) - engines_languages = fetch_supported_languages() - all_languages = join_language_lists(engines_languages) - filtered_languages = filter_language_list(all_languages) - write_languages_file(filtered_languages) diff --git a/utils/fetch_wikidata_units.py b/utils/fetch_wikidata_units.py deleted file mode 100644 index 69ae8ab27..000000000 --- a/utils/fetch_wikidata_units.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python - -import json -import collections - -# set path -from sys import path -from os.path import realpath, dirname, join -path.append(realpath(dirname(realpath(__file__)) + '/../')) - -from searx 
import searx_dir -from searx.engines.wikidata import send_wikidata_query - - -# the response contains duplicate ?item with the different ?symbol -# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result -# even if a ?item has different ?symbol of the same rank. -# A deterministic result -# see: -# * https://www.wikidata.org/wiki/Help:Ranking -# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section) -# * https://w.wiki/32BT -# see the result for https://www.wikidata.org/wiki/Q11582 -# there are multiple symbols the same rank -SARQL_REQUEST = """ -SELECT DISTINCT ?item ?symbol -WHERE -{ - ?item wdt:P31/wdt:P279 wd:Q47574 . - ?item p:P5061 ?symbolP . - ?symbolP ps:P5061 ?symbol ; - wikibase:rank ?rank . - FILTER(LANG(?symbol) = "en"). -} -ORDER BY ?item DESC(?rank) ?symbol -""" - - -def get_data(): - results = collections.OrderedDict() - response = send_wikidata_query(SARQL_REQUEST) - for unit in response['results']['bindings']: - name = unit['item']['value'].replace('http://www.wikidata.org/entity/', '') - unit = unit['symbol']['value'] - if name not in results: - # ignore duplicate: always use the first one - results[name] = unit - return results - - -def get_wikidata_units_filename(): - return join(join(searx_dir, "data"), "wikidata_units.json") - - -with open(get_wikidata_units_filename(), 'w') as f: - json.dump(get_data(), f, indent=4, ensure_ascii=False) diff --git a/utils/google_search.py b/utils/google_search.py deleted file mode 100644 index cad32eeca..000000000 --- a/utils/google_search.py +++ /dev/null @@ -1,35 +0,0 @@ -from sys import argv, exit - -if not len(argv) > 1: - print('search query required') - exit(1) - -import requests -from json import dumps -from searx.engines import google -from searx.search import default_request_params - -request_params = default_request_params() -# Possible params -# request_params['headers']['User-Agent'] = '' -# request_params['category'] = '' 
-request_params['pageno'] = 1 -request_params['language'] = 'en_us' -request_params['time_range'] = '' - -params = google.request(argv[1], request_params) - -request_args = dict( - headers=request_params['headers'], - cookies=request_params['cookies'], -) - -if request_params['method'] == 'GET': - req = requests.get -else: - req = requests.post - request_args['data'] = request_params['data'] - -resp = req(request_params['url'], **request_args) -resp.search_params = request_params -print(dumps(google.response(resp))) diff --git a/utils/standalone_searx.py b/utils/standalone_searx.py deleted file mode 100755 index 89023f41b..000000000 --- a/utils/standalone_searx.py +++ /dev/null @@ -1,217 +0,0 @@ -#!/usr/bin/env python -"""Script to run searx from terminal. - -Getting categories without initiate the engine will only return `['general']` - ->>> import searx.engines -... list(searx.engines.categories.keys()) -['general'] ->>> import searx.search -... searx.search.initialize() -... list(searx.engines.categories.keys()) -['general', 'it', 'science', 'images', 'news', 'videos', 'music', 'files', 'social media', 'map'] - -Example to use this script: - -.. code:: bash - - $ python3 utils/standalone_searx.py rain - -Example to run it from python: - ->>> import importlib -... import json -... import sys -... import searx.engines -... import searx.search -... search_query = 'rain' -... # initialize engines -... searx.search.initialize() -... # load engines categories once instead of each time the function called -... engine_cs = list(searx.engines.categories.keys()) -... # load module -... spec = importlib.util.spec_from_file_location( -... 'utils.standalone_searx', 'utils/standalone_searx.py') -... sas = importlib.util.module_from_spec(spec) -... spec.loader.exec_module(sas) -... # use function from module -... prog_args = sas.parse_argument([search_query], category_choices=engine_cs) -... search_q = sas.get_search_query(prog_args, engine_categories=engine_cs) -... 
res_dict = sas.to_dict(search_q) -... sys.stdout.write(json.dumps( -... res_dict, sort_keys=True, indent=4, ensure_ascii=False, -... default=sas.json_serial)) -{ - "answers": [], - "infoboxes": [ {...} ], - "paging": true, - "results": [... ], - "results_number": 820000000.0, - "search": { - "lang": "all", - "pageno": 1, - "q": "rain", - "safesearch": 0, - "timerange": null - }, - "suggestions": [...] -} -""" # noqa: E501 -# pylint: disable=pointless-string-statement -''' -searx is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -searx is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with searx. If not, see < http://www.gnu.org/licenses/ >. 
- -(C) 2016- by Alexandre Flament, -''' -# pylint: disable=wrong-import-position -import argparse -import sys -from datetime import datetime -from json import dumps -from typing import Any, Dict, List, Optional - -import searx -import searx.preferences -import searx.query -import searx.search -import searx.webadapter - -EngineCategoriesVar = Optional[List[str]] - - -def get_search_query( - args: argparse.Namespace, engine_categories: EngineCategoriesVar = None -) -> searx.search.SearchQuery: - """Get search results for the query""" - if engine_categories is None: - engine_categories = list(searx.engines.categories.keys()) - try: - category = args.category.decode('utf-8') - except AttributeError: - category = args.category - form = { - "q": args.query, - "categories": category, - "pageno": str(args.pageno), - "language": args.lang, - "time_range": args.timerange - } - preferences = searx.preferences.Preferences( - ['oscar'], engine_categories, searx.engines.engines, []) - preferences.key_value_settings['safesearch'].parse(args.safesearch) - - search_query = searx.webadapter.get_search_query_from_webapp( - preferences, form)[0] - return search_query - - -def no_parsed_url(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Remove parsed url from dict.""" - for result in results: - del result['parsed_url'] - return results - - -def json_serial(obj: Any) -> Any: - """JSON serializer for objects not serializable by default json code. 
- - :raise TypeError: raised when **obj** is not serializable - """ - if isinstance(obj, datetime): - serial = obj.isoformat() - return serial - if isinstance(obj, bytes): - return obj.decode('utf8') - if isinstance(obj, set): - return list(obj) - raise TypeError("Type ({}) not serializable".format(type(obj))) - - -def to_dict(search_query: searx.search.SearchQuery) -> Dict[str, Any]: - """Get result from parsed arguments.""" - result_container = searx.search.Search(search_query).search() - result_container_json = { - "search": { - "q": search_query.query, - "pageno": search_query.pageno, - "lang": search_query.lang, - "safesearch": search_query.safesearch, - "timerange": search_query.time_range, - }, - "results": no_parsed_url(result_container.get_ordered_results()), - "infoboxes": result_container.infoboxes, - "suggestions": list(result_container.suggestions), - "answers": list(result_container.answers), - "paging": result_container.paging, - "results_number": result_container.results_number() - } - return result_container_json - - -def parse_argument( - args: Optional[List[str]]=None, - category_choices: EngineCategoriesVar=None -) -> argparse.Namespace: - """Parse command line. - - :raise SystemExit: Query argument required on `args` - - Examples: - - >>> import importlib - ... # load module - ... spec = importlib.util.spec_from_file_location( - ... 'utils.standalone_searx', 'utils/standalone_searx.py') - ... sas = importlib.util.module_from_spec(spec) - ... spec.loader.exec_module(sas) - ... 
sas.parse_argument() - usage: ptipython [-h] [--category [{general}]] [--lang [LANG]] [--pageno [PAGENO]] [--safesearch [{0,1,2}]] [--timerange [{day,week,month,year}]] - query - SystemExit: 2 - >>> sas.parse_argument(['rain']) - Namespace(category='general', lang='all', pageno=1, query='rain', safesearch='0', timerange=None) - """ # noqa: E501 - if not category_choices: - category_choices = list(searx.engines.categories.keys()) - parser = argparse.ArgumentParser(description='Standalone searx.') - parser.add_argument('query', type=str, - help='Text query') - parser.add_argument('--category', type=str, nargs='?', - choices=category_choices, - default='general', - help='Search category') - parser.add_argument('--lang', type=str, nargs='?', default='all', - help='Search language') - parser.add_argument('--pageno', type=int, nargs='?', default=1, - help='Page number starting from 1') - parser.add_argument( - '--safesearch', type=str, nargs='?', - choices=['0', '1', '2'], default='0', - help='Safe content filter from none to strict') - parser.add_argument( - '--timerange', type=str, - nargs='?', choices=['day', 'week', 'month', 'year'], - help='Filter by time range') - return parser.parse_args(args) - - -if __name__ == '__main__': - searx.search.initialize() - engine_cs = list(searx.engines.categories.keys()) - prog_args = parse_argument(category_choices=engine_cs) - search_q = get_search_query(prog_args, engine_categories=engine_cs) - res_dict = to_dict(search_q) - sys.stdout.write(dumps( - res_dict, sort_keys=True, indent=4, ensure_ascii=False, - default=json_serial)) diff --git a/utils/update-translations.sh b/utils/update-translations.sh deleted file mode 100755 index 240387ae7..000000000 --- a/utils/update-translations.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh - -# script to easily update translation language files - -# add new language: -# pybabel init -i messages.pot -d searx/translations -l en - -SEARX_DIR='searx' - -pybabel extract -F babel.cfg -o 
messages.pot "$SEARX_DIR" -for f in `ls "$SEARX_DIR"'/translations/'`; do - pybabel update -N -i messages.pot -d "$SEARX_DIR"'/translations/' -l "$f" -done - -echo '[!] update done, edit .po files if required and run pybabel compile -d searx/translations/' -- cgit v1.2.3