diff options
| author | Markus Heiser <markus.heiser@darmarit.de> | 2025-05-04 20:37:06 +0200 |
|---|---|---|
| committer | Markus Heiser <markus.heiser@darmarIT.de> | 2025-05-31 20:34:59 +0200 |
| commit | a800dd04735c98a293edff00493a5fee3dfeaed7 (patch) | |
| tree | 7ea69e8d6dd525ca6c22c40593e91c3a446f34f0 /searx/wikidata_units.py | |
| parent | cf59ee2efcf5bbebf2899b1f57e892f25d23864d (diff) | |
[mod] implement searx.wikidata_units for unit converters
Diffstat (limited to 'searx/wikidata_units.py')
| -rw-r--r-- | searx/wikidata_units.py | 231 |
1 files changed, 231 insertions, 0 deletions
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Unit conversion on the basis of `SPARQL/WIKIDATA Precision, Units and
Coordinates`_

.. _SPARQL/WIKIDATA Precision, Units and Coordinates:
     https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
"""

__all__ = ["convert_from_si", "convert_to_si", "symbol_to_si"]

import collections

from searx import data
from searx.engines import wikidata

ADDITIONAL_UNITS = [
    {
        "si_name": "Q11579",
        "symbol": "°C",
        "to_si": lambda val: val + 273.15,
        "from_si": lambda val: val - 273.15,
    },
    {
        "si_name": "Q11579",
        "symbol": "°F",
        "to_si": lambda val: (val + 459.67) * 5 / 9,
        "from_si": lambda val: (val * 9 / 5) - 459.67,
    },
]
"""Additional items to convert from a measure unit to a SI unit (and vice
versa).

.. code:: python

    {
        "si_name": "Q11579",                 # Wikidata item ID of the SI unit (Kelvin)
        "symbol": "°C",                      # symbol of the measure unit
        "to_si": lambda val: val + 273.15,   # convert measure value (val) to SI unit
        "from_si": lambda val: val - 273.15, # convert SI value (val) to measure unit
    },
    {
        "si_name": "Q11573",
        "symbol": "mi",
        "to_si": 1609.344,                   # convert measure value (val) to SI unit
        "from_si": 1 / 1609.344              # convert SI value (val) to measure unit
    },

The values of ``to_si`` and ``from_si`` can be of :py:obj:`float` (a multiplier)
or a callable_ (val in / converted value returned).

.. _callable: https://docs.python.org/3/glossary.html#term-callable
"""


ALIAS_SYMBOLS = {
    '°C': ('C',),
    '°F': ('F',),
    'mi': ('L',),
}
"""Alias symbols for known unit of measure symbols / by example::

    '°C': ('C', ...),  # list of alias symbols for °C (Q69362731)
    '°F': ('F', ...),  # list of alias symbols for °F (Q99490479)
    'mi': ('L',),      # list of alias symbols for mi (Q253276)
"""


SYMBOL_TO_SI = []
UNITS_BY_SI_NAME: dict | None = None


def _apply_conversion(conversion, value: float | int) -> float:
    """Apply ``conversion`` to ``value``.

    ``conversion`` is either a number (used as a multiplier) or a callable
    that takes the value as :py:obj:`float` and returns the converted value.
    """
    if isinstance(conversion, (float, int)):
        return float(value) * conversion
    return conversion(float(value))


def convert_from_si(si_name: str, symbol: str, value: float | int) -> float:
    """Convert ``value``, given in the SI unit ``si_name``, into the measure
    unit identified by ``symbol``.

    :param si_name: Wikidata item ID of the SI unit (e.g. ``Q11573`` for metre).
    :param symbol: symbol (or alias symbol) of the target measure unit.
    :param value: the value in the SI unit.
    :raises KeyError: if ``si_name`` is unknown or ``symbol`` is not a unit
        of this SI unit.
    """
    return _apply_conversion(units_by_si_name(si_name)[symbol]["from_si"], value)


def convert_to_si(si_name: str, symbol: str, value: float | int) -> float:
    """Convert ``value``, given in the measure unit identified by ``symbol``,
    into the SI unit ``si_name``.

    :param si_name: Wikidata item ID of the SI unit (e.g. ``Q11573`` for metre).
    :param symbol: symbol (or alias symbol) of the source measure unit.
    :param value: the value in the measure unit.
    :raises KeyError: if ``si_name`` is unknown or ``symbol`` is not a unit
        of this SI unit.
    """
    return _apply_conversion(units_by_si_name(si_name)[symbol]["to_si"], value)


def units_by_si_name(si_name):
    """Return the measure units that can be converted to the SI unit
    ``si_name``.

    The result is a dictionary that maps a symbol to a dictionary with the
    keys ``symbol``, ``si_name``, ``from_si``, ``to_si`` and ``origin`` (the
    symbol the entry was derived from; differs from ``symbol`` for aliases).

    The index over all SI units is built once from :py:func:`symbol_to_si`
    and cached in :py:obj:`UNITS_BY_SI_NAME`.

    :raises KeyError: if there is no unit for ``si_name``.
    """

    global UNITS_BY_SI_NAME  # pylint: disable=global-statement
    if UNITS_BY_SI_NAME is None:
        # Build the index in a local and publish it only when it is complete,
        # a failure while building must not leave a half filled cache behind.
        index: dict = {}
        for symbol, unit_si_name, from_si, to_si, origin in symbol_to_si():
            by_symbol = index.setdefault(unit_si_name, {})
            # symbol_to_si() is ordered by weighting, the first hit for a
            # symbol wins and must not be overwritten by a later one.
            by_symbol.setdefault(
                symbol,
                {
                    "symbol": symbol,
                    "si_name": unit_si_name,
                    "from_si": from_si,
                    "to_si": to_si,
                    "origin": origin,
                },
            )
        UNITS_BY_SI_NAME = index
    return UNITS_BY_SI_NAME[si_name]


def symbol_to_si():
    """Generates a list of tuples, each tuple is a measure unit and the fields
    in the tuple are:

    0. Symbol of the measure unit (e.g. 'mi' for measure unit 'miles' Q253276)

    1. SI name of the measure unit (e.g. Q11573 for SI unit 'metre')

    2. ``from_si``: factor (or callable) to get the measure value from the SI
       value (e.g. SI 100m is equal to 100 divided by 1609.344 miles)

    3. ``to_si``: factor (or callable) to get the SI value from the measure
       value (e.g. 1mi is equal to 1609.344 SI metre)

    4. Symbol of the origin measure unit: identical to field 0, except for
       items created from :py:obj:`ALIAS_SYMBOLS` where it is the symbol the
       alias stands for.

    The returned list is sorted, the first items are created from
    ``WIKIDATA_UNITS``, the second group of items is built from
    :py:obj:`ADDITIONAL_UNITS` and the last group are the items created from
    :py:obj:`ALIAS_SYMBOLS`.

    If you search this list for a symbol, then a match with a symbol from
    Wikidata has the highest weighting (first hit in the list), followed by the
    symbols from the :py:obj:`ADDITIONAL_UNITS` and the lowest weighting is
    given to the symbols resulting from the aliases :py:obj:`ALIAS_SYMBOLS`.

    The list is built on the first call and cached in :py:obj:`SYMBOL_TO_SI`.
    """

    if SYMBOL_TO_SI:
        return SYMBOL_TO_SI

    # filter out units which can't be normalized to a SI unit and filter out
    # units without a symbol / arcsecond does not have a symbol
    # https://www.wikidata.org/wiki/Q829073

    units = [
        (
            item['symbol'],
            item['si_name'],
            1 / item['to_si_factor'],  # from_si
            item['to_si_factor'],  # to_si
            item['symbol'],
        )
        for item in data.WIKIDATA_UNITS.values()
        if item['to_si_factor'] and item['symbol']
    ]

    units.extend(
        (
            item['symbol'],
            item['si_name'],
            item['from_si'],
            item['to_si'],
            item['symbol'],
        )
        for item in ADDITIONAL_UNITS
    )

    alias_items = [
        (
            alias,
            item[1],
            item[2],  # from_si
            item[3],  # to_si
            item[0],  # origin unit
        )
        for item in units
        for alias in ALIAS_SYMBOLS.get(item[0], ())
    ]

    # Fill the cache in one step and only after everything has been built, an
    # exception above must not leave a partial list that later calls would
    # return as if it were complete.
    SYMBOL_TO_SI.extend(units + alias_items)
    return SYMBOL_TO_SI


# The response contains duplicate ?item rows with different ?symbol values.
# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result even if
# an ?item has several ?symbol values of the same rank.
#
# see:
# * https://www.wikidata.org/wiki/Help:Ranking
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
# * https://w.wiki/32BT
# * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
#
# See the result for https://www.wikidata.org/wiki/Q11582: there are multiple
# symbols with the same rank.

SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit
WHERE
{
  ?item wdt:P31/wdt:P279 wd:Q47574 .
  ?item p:P5061 ?symbolP .
  ?symbolP ps:P5061 ?symbol ;
           wikibase:rank ?rank .
  OPTIONAL {
    ?item p:P2370 ?tosistmt .
    ?tosistmt psv:P2370 ?tosinode .
    ?tosinode wikibase:quantityAmount ?tosi .
    ?tosinode wikibase:quantityUnit ?tosiUnit .
  }
  FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item DESC(?rank) ?symbol
"""


def fetch_units():
    """Fetch units from Wikidata.  Function is used to update persistence of
    :py:obj:`searx.data.WIKIDATA_UNITS`.

    Returns an ordered dictionary that maps the Wikidata item ID of a unit to
    a dictionary with the keys ``symbol``, ``si_name`` and ``to_si_factor``;
    the last two are ``None`` when the response has no SI conversion for the
    unit.
    """

    results = collections.OrderedDict()
    response = wikidata.send_wikidata_query(SARQL_REQUEST)
    for unit in response['results']['bindings']:

        symbol = unit['symbol']['value']
        # the values are entity URIs, the item ID is the last path segment
        name = unit['item']['value'].rsplit('/', 1)[1]
        si_name = unit.get('tosiUnit', {}).get('value', '')
        if si_name:
            si_name = si_name.rsplit('/', 1)[1]

        to_si_factor = unit.get('tosi', {}).get('value', '')
        if name not in results:
            # ignore duplicate: always use the first one
            results[name] = {
                'symbol': symbol,
                'si_name': si_name if si_name else None,
                'to_si_factor': float(to_si_factor) if to_si_factor else None,
            }
    return results