summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- docs/dev/engines/online/openalex.rst 100
-rw-r--r-- searx/engines/openalex.py 205
-rw-r--r-- searx/settings.yml 9
3 files changed, 314 insertions, 0 deletions
diff --git a/docs/dev/engines/online/openalex.rst b/docs/dev/engines/online/openalex.rst
new file mode 100644
index 000000000..f04543653
--- /dev/null
+++ b/docs/dev/engines/online/openalex.rst
@@ -0,0 +1,100 @@
+.. _openalex engine:
+
+=========
+OpenAlex
+=========
+
+Overview
+========
+
+The OpenAlex engine integrates the `OpenAlex`_ Works API to return scientific paper
+results using the :origin:`paper.html <searx/templates/simple/result_templates/paper.html>`
+template. It is an "online" JSON engine that uses the official public API and does
+not require an API key.
+
+.. _OpenAlex: https://openalex.org
+.. _OpenAlex API overview: https://docs.openalex.org/how-to-use-the-api/api-overview
+
+Key features
+------------
+
+- Uses the official Works endpoint (JSON)
+- Paging support via ``page`` and ``per-page``
+- Relevance sorting (``sort=relevance_score:desc``)
+- Language filter support (maps SearXNG language to ``filter=language:<iso2>``)
+- Maps fields commonly used in scholarly results: title, authors, abstract
+ (reconstructed from inverted index), journal/venue, publisher, DOI, tags
+ (concepts), PDF/HTML links, pages, volume, issue, published date, and a short
+ citations comment
+- Supports OpenAlex "polite pool" by adding a ``mailto`` parameter
+
+
+Configuration
+=============
+
+Minimal example for :origin:`settings.yml <searx/settings.yml>`:
+
+.. code:: yaml
+
+   - name: openalex
+     engine: openalex
+     shortcut: oa
+     categories: science, scientific publications
+     timeout: 5.0
+     # Recommended by OpenAlex: join the polite pool with an email address
+     mailto: "your-email@example.com"
+
+Notes
+-----
+
+- The ``mailto`` key is optional but recommended by OpenAlex for better service.
+- Language is inherited from the user's UI language; when it is not ``all``, the
+ engine adds ``filter=language:<iso2>`` (e.g. ``language:fr``). If OpenAlex has
+ few results for that language, you may see fewer items.
+- Each result has a main link: the landing page of the work's primary location
+  (often a stable DOI resolver link) or, when OpenAlex reports none, the URL of
+  the OpenAlex work record. When an open access PDF is available it is exposed
+  via the ``PDF`` link, and the landing page via the ``HTML`` link, in the
+  result footer.
+
+
+What is returned
+================
+
+Each result uses the ``paper.html`` template and may include:
+
+- ``title`` and ``content`` (abstract; reconstructed from the inverted index)
+- ``authors`` (display names)
+- ``journal`` (host venue display name) and ``publisher``
+- ``doi`` (normalized to the plain DOI, without the ``https://doi.org/`` prefix)
+- ``tags`` (OpenAlex concepts display names)
+- ``pdf_url`` (Open access PDF if available) and ``html_url`` (landing page)
+- ``publishedDate`` (parsed from ``publication_date``)
+- ``pages``, ``volume``, ``number`` (issue)
+- ``type`` and a brief ``comments`` string with citation count
+
+
+Rate limits & polite pool
+=========================
+
+OpenAlex offers a free public API with generous daily limits. For extra courtesy
+and improved service quality, include a contact email in each request via
+``mailto``. You can set it directly in the engine configuration as shown above.
+See: `OpenAlex API overview`_.
+
+
+Troubleshooting
+===============
+
+- Few or no results in a non-English UI language:
+ Ensure the selected language has sufficient coverage at OpenAlex, or set the
+ UI language to English and retry.
+- Preference changes fail while testing locally:
+ Make sure your ``server.secret_key`` and ``server.base_url`` are set in your
+ instance settings so signed cookies work; see :ref:`settings server`.
+
+
+Implementation
+===============
+
+.. automodule:: searx.engines.openalex
+   :members:
diff --git a/searx/engines/openalex.py b/searx/engines/openalex.py
new file mode 100644
index 000000000..c7bb6839b
--- /dev/null
+++ b/searx/engines/openalex.py
@@ -0,0 +1,205 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# pylint: disable=missing-module-docstring
+#
+# Engine is documented in: docs/dev/engines/online/openalex.rst
+
+from __future__ import annotations
+
+import typing as t
+from datetime import datetime
+from urllib.parse import urlencode
+from searx.result_types import EngineResults
+from searx.extended_types import SXNG_Response
+
+# about
+# Engine metadata: project website, Wikidata item, link to the API
+# documentation and the kind of API access used (official JSON API, no key).
+about = {
+ "website": "https://openalex.org/",
+ "wikidata_id": "Q110718454",
+ "official_api_documentation": "https://docs.openalex.org/how-to-use-the-api/api-overview",
+ "use_official_api": True,
+ "require_api_key": False,
+ "results": "JSON",
+}
+
+
+# engine dependent config
+categories = ["science", "scientific publications"]
+# request() forwards params["pageno"] to the API as the ``page`` argument
+paging = True
+# OpenAlex Works endpoint; request() appends the URL-encoded query string
+search_url = "https://api.openalex.org/works"
+
+# Optional: include your email for OpenAlex polite pool. Can be set from settings.yml
+# engines: - name: openalex; engine: openalex; mailto: "your-email@example.com"
+# When left empty, request() does not send a ``mailto`` argument at all.
+mailto = ""
+
+
+def request(query: str, params: dict[str, t.Any]) -> None:
+    """Assemble the OpenAlex Works API URL and store it in ``params["url"]``.
+
+    The URL always carries the search term, the requested page, a page size
+    of 10 and relevance sorting.  A ``language:<iso2>`` filter is added when
+    ``params["language"]`` is a string other than ``"all"`` whose leading part
+    (before ``-`` or ``_``) is two characters long.  The module-level
+    ``mailto`` value is appended when it is a non-empty string.
+
+    :param query: Search terms entered by the user.
+    :param params: Request parameters; ``pageno`` is read, ``language`` is
+        read if present, and ``url`` is written.
+    """
+    query_args: dict[str, t.Any] = {
+        "search": query,
+        "page": params["pageno"],
+        # OpenAlex defaults to 25 results per page; keep it smaller
+        "per-page": 10,
+        # sorting by relevance is only available together with ``search``
+        "sort": "relevance_score:desc",
+    }
+
+    # Reduce locale tags such as ``fr-FR`` or ``fr_FR`` to the ISO 639-1 part
+    lang = params.get("language")
+    if isinstance(lang, str) and lang != "all":
+        code = lang.partition("-")[0].partition("_")[0]
+        if len(code) == 2:
+            query_args["filter"] = f"language:{code}"
+
+    # Polite pool: only sent when an address has been configured
+    if isinstance(mailto, str) and mailto:
+        query_args["mailto"] = mailto
+
+    params["url"] = search_url + "?" + urlencode(query_args)
+
+
+def response(resp: SXNG_Response) -> EngineResults:
+    """Turn an OpenAlex Works API response into ``paper.html`` results.
+
+    :param resp: HTTP response whose JSON body lists the works under the
+        ``results`` key.
+    :returns: An ``EngineResults`` container holding one ``LegacyResult`` per
+        work found in the response.
+    """
+    data = resp.json()
+    res = EngineResults()
+
+    # dict.get() only falls back to its default when the key is *absent*; a
+    # key that is present with a JSON ``null`` yields ``None``.  Normalise
+    # with ``or`` so a null ``results`` list cannot break the loop and a null
+    # ``title`` is not handed to the template as ``None``.
+    for item in data.get("results") or []:
+        url, html_url, pdf_url = _extract_links(item)
+        title: str = item.get("title") or ""
+        content: str = _reconstruct_abstract(item.get("abstract_inverted_index")) or ""
+        authors = _extract_authors(item)
+        journal, publisher, pages, volume, number, published_date = _extract_biblio(item)
+        doi = _doi_to_plain(item.get("doi"))
+        # an empty tag list is passed on as ``None``
+        tags = _extract_tags(item) or None
+        comments = _extract_comments(item)
+
+        res.add(
+            res.types.LegacyResult(
+                template="paper.html",
+                url=url,
+                title=title,
+                content=content,
+                journal=journal,
+                publisher=publisher,
+                doi=doi,
+                tags=tags,
+                authors=authors,
+                pdf_url=pdf_url,
+                html_url=html_url,
+                publishedDate=published_date,
+                pages=pages,
+                volume=volume,
+                number=number,
+                type=item.get("type"),
+                comments=comments,
+            )
+        )
+
+    return res
+
+
+def _stringify_pages(biblio: dict[str, t.Any]) -> str | None:
+    """Format the page information of a ``biblio`` mapping.
+
+    :param biblio: Mapping that may hold ``first_page`` and ``last_page``.
+    :returns: ``"<first>-<last>"`` when both values are truthy, the single
+        truthy value as a string when only one is, otherwise ``None``.
+    """
+    start = biblio.get("first_page")
+    end = biblio.get("last_page")
+    if start and end:
+        return f"{start}-{end}"
+    # at most one of the two is truthy from here on
+    single = start or end
+    return str(single) if single else None
+
+
+def _parse_date(value: str | None) -> datetime | None:
+    """Parse a ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY`` date string.
+
+    :param value: Date string, or a falsy value when no date is known.
+    :returns: A naive ``datetime`` for the first pattern that matches, or
+        ``None`` when *value* is falsy or matches none of the patterns.
+    """
+    if not value:
+        return None
+    parsed: datetime | None = None
+    # most specific pattern first
+    for pattern in ("%Y-%m-%d", "%Y-%m", "%Y"):
+        try:
+            parsed = datetime.strptime(value, pattern)
+        except ValueError:
+            continue
+        break
+    return parsed
+
+
+def _doi_to_plain(doi_value: str | None) -> str | None:
+    """Strip a leading ``https://doi.org/`` from a DOI.
+
+    :param doi_value: DOI as delivered in the work's ``doi`` field, commonly a
+        full resolver URL such as ``https://doi.org/10.1234/abcd``.
+    :returns: The DOI without the resolver prefix, the value unchanged when it
+        does not start with that prefix, or ``None`` for a falsy value.
+    """
+    resolver_prefix = "https://doi.org/"
+    if not doi_value:
+        return None
+    if doi_value.startswith(resolver_prefix):
+        return doi_value[len(resolver_prefix) :]
+    return doi_value
+
+
+def _reconstruct_abstract(
+    abstract_inverted_index: dict[str, list[int]] | None,
+) -> str | None:
+    """Rebuild the abstract text from an inverted index.
+
+    :param abstract_inverted_index: Mapping ``{token: [positions, ...]}``.
+    :returns: The tokens joined by single spaces in ascending position order,
+        or ``None`` when the index is empty or yields no text.  Positions
+        without a token and empty tokens are skipped; negative positions are
+        ignored.
+    """
+    if not abstract_inverted_index:
+        return None
+    # Invert the index; when a position occurs twice the later token wins.
+    word_at: dict[int, str] = {}
+    for word, offsets in abstract_inverted_index.items():
+        for offset in offsets:
+            word_at[offset] = word
+    words = [
+        word_at[offset]
+        for offset in sorted(word_at)
+        if offset >= 0 and word_at[offset] != ""
+    ]
+    return " ".join(words) or None
+
+
+def _extract_links(item: dict[str, t.Any]) -> tuple[str, str | None, str | None]:
+    """Pick the main, HTML and PDF links of a work.
+
+    :param item: One work object from the API response.
+    :returns: ``(url, html_url, pdf_url)``.  ``url`` is the landing page of
+        the primary location, else the work's ``id``, else ``""``.
+        ``html_url`` is the landing page (or ``None``).  ``pdf_url`` is the
+        primary location's PDF link, else the open access URL (or ``None``).
+    """
+    # ``item.get(key, {})`` would return ``None`` for a key that is present
+    # with a JSON ``null`` and the following ``.get`` would raise
+    # AttributeError; ``or {}`` covers both the absent and the null case.
+    primary_location: dict[str, t.Any] = item.get("primary_location") or {}
+    open_access: dict[str, t.Any] = item.get("open_access") or {}
+
+    landing_page_url: str | None = primary_location.get("landing_page_url")
+    # keep ``url`` a string even when ``id`` is missing or null
+    url: str = landing_page_url or item.get("id") or ""
+    pdf_url: str | None = primary_location.get("pdf_url") or open_access.get("oa_url")
+    return url, landing_page_url, pdf_url
+
+
+def _extract_authors(item: dict[str, t.Any]) -> list[str]:
+    """Collect the display names of a work's authors.
+
+    :param item: One work object from the API response.
+    :returns: The non-empty string ``display_name`` of every entry in
+        ``authorships``, in the order given; an empty list when there is none.
+    """
+    authors: list[str] = []
+    # ``or`` guards against keys that are present with a JSON ``null``: a
+    # null ``authorships`` would fail on iteration and a null ``author`` on
+    # ``.get``.
+    for authorship in item.get("authorships") or []:
+        author = (authorship or {}).get("author") or {}
+        display_name = author.get("display_name")
+        if isinstance(display_name, str) and display_name:
+            authors.append(display_name)
+    return authors
+
+
+def _extract_tags(item: dict[str, t.Any]) -> list[str]:
+    """Collect the display names of a work's concepts.
+
+    :param item: One work object from the API response.
+    :returns: The non-empty string ``display_name`` of every entry in
+        ``concepts``, in the order given; an empty list when there is none.
+    """
+    # ``or []`` also covers a ``concepts`` key that is present with a JSON
+    # ``null``, which ``item.get("concepts", [])`` would hand on as ``None``.
+    names = ((concept or {}).get("display_name") for concept in item.get("concepts") or [])
+    return [name for name in names if isinstance(name, str) and name]
+
+
+def _extract_biblio(
+    item: dict[str, t.Any],
+) -> tuple[str | None, str | None, str | None, str | None, str | None, datetime | None]:
+    """Extract the bibliographic fields of a work.
+
+    :param item: One work object from the API response.
+    :returns: ``(journal, publisher, pages, volume, number, published_date)``.
+        ``number`` is the issue; ``published_date`` is parsed from
+        ``publication_date``.  Any element may be ``None``.
+    """
+    # ``or {}`` covers keys that are absent as well as keys that are present
+    # with a JSON ``null`` (``.get(key, {})`` would return ``None`` there).
+    host_venue: dict[str, t.Any] = item.get("host_venue") or {}
+    biblio: dict[str, t.Any] = item.get("biblio") or {}
+    # OpenAlex deprecated ``host_venue`` in favour of
+    # ``primary_location.source``; use the latter when the former has no value.
+    source: dict[str, t.Any] = (item.get("primary_location") or {}).get("source") or {}
+
+    journal: str | None = host_venue.get("display_name")
+    if journal is None:
+        journal = source.get("display_name")
+    publisher: str | None = host_venue.get("publisher")
+    if publisher is None:
+        publisher = source.get("host_organization_name")
+
+    pages = _stringify_pages(biblio)
+    volume = biblio.get("volume")
+    number = biblio.get("issue")
+    published_date = _parse_date(item.get("publication_date"))
+    return journal, publisher, pages, volume, number, published_date
+
+
+def _extract_comments(item: dict[str, t.Any]) -> str | None:
+    """Build the short citation note shown with a result.
+
+    :param item: One work object from the API response.
+    :returns: ``"<n> citations"`` when ``cited_by_count`` is an ``int``,
+        otherwise ``None``.
+    """
+    count = item.get("cited_by_count")
+    return f"{count} citations" if isinstance(count, int) else None
diff --git a/searx/settings.yml b/searx/settings.yml
index a3455815d..ca5e27df7 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1495,6 +1495,15 @@ engines:
require_api_key: false
results: JSON
+ - name: openalex
+ engine: openalex
+ shortcut: oa
+ # https://docs.searxng.org/dev/engines/online/openalex.html
+ # Recommended by OpenAlex: join the polite pool with an email address
+ # mailto: "your-email@example.com"
+ timeout: 5.0
+ disabled: true
+
- name: openclipart
engine: openclipart
shortcut: ocl