summary | refs | log | tree | commit | diff
path: root/searx/engines/pubmed.py
diff options
context:
space:
mode:
author: Markus Heiser <markus.heiser@darmarit.de> 2025-09-10 16:43:42 +0200
committer: Markus Heiser <markus.heiser@darmarIT.de> 2025-09-20 10:56:46 +0200
commit: bb22bb1831f1a154d365713dfb5d30e86943dade (patch)
tree: 38c7a3b437dcfc72a8a7d5d89efa4bb124a37718 /searx/engines/pubmed.py
parent: 96e63df8ca187136ea37942fb840220dae8c8766 (diff)
[mod] PubMed engine: revision of the engine (Paper result)
Revision of the engine: it now uses the result type Paper and adds type annotations.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines/pubmed.py')
-rw-r--r--searx/engines/pubmed.py195
1 files changed, 107 insertions, 88 deletions
diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py
index be934cdc8..6fcfaa9a3 100644
--- a/searx/engines/pubmed.py
+++ b/searx/engines/pubmed.py
@@ -1,132 +1,151 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
-"""PubMed (Scholar publications)
+"""PubMed_ comprises more than 39 million citations for biomedical literature
+from MEDLINE, life science journals, and online books. Citations may include
+links to full text content from PubMed Central and publisher web sites.
+
+.. _PubMed: https://pubmed.ncbi.nlm.nih.gov/
+
+Configuration
+=============
+
+.. code:: yaml
+
+ - name: pubmed
+ engine: pubmed
+ shortcut: pub
+
+Implementations
+===============
"""
+import typing as t
+
from datetime import datetime
from urllib.parse import urlencode
from lxml import etree
+
+from searx.result_types import EngineResults
from searx.network import get
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
+ ElementType,
)
-# about
+if t.TYPE_CHECKING:
+ from searx.extended_types import SXNG_Response
+ from searx.search.processors import OnlineParams
+
+
about = {
- "website": 'https://www.ncbi.nlm.nih.gov/pubmed/',
- "wikidata_id": 'Q1540899',
+ "website": "https://www.ncbi.nlm.nih.gov/pubmed/",
+ "wikidata_id": "Q1540899",
"official_api_documentation": {
- 'url': 'https://www.ncbi.nlm.nih.gov/home/develop/api/',
- 'comment': 'More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/',
+ "url": "https://www.ncbi.nlm.nih.gov/home/develop/api/",
+ "comment": "More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/",
},
"use_official_api": True,
"require_api_key": False,
- "results": 'XML',
+ "results": "XML",
}
-categories = ['science', 'scientific publications']
+categories = ["science", "scientific publications"]
-base_url = (
- 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
-)
+eutils_api = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# engine dependent config
number_of_results = 10
-pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
-
+pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
def request(query: str, params: "OnlineParams") -> None:
    """Resolve ``query`` to PubMed IDs and prepare the efetch request.

    Two E-utilities calls are needed to get full records:

    1. ``esearch.fcgi`` translates the search term into one page of PMIDs.
       This call is sent right here with :py:obj:`get`.
    2. ``efetch.fcgi`` returns the records of those PMIDs.  Its URL is stored
       in ``params["url"]``; the reply is what :py:obj:`response` parses.

    :param query: search term entered by the user.
    :param params: request parameters; ``params["pageno"]`` (1-based) is read
        and ``params["url"]`` is set.
    """

    args = urlencode(
        {
            "db": "pubmed",
            "term": query,
            "retstart": (params["pageno"] - 1) * number_of_results,
            # The page size parameter of esearch is named ``retmax``.  With any
            # other name the API uses its own default page size (20 according
            # to the E-utilities documentation) while ``retstart`` still moves
            # in steps of ``number_of_results`` --> consecutive pages overlap.
            "retmax": number_of_results,
        }
    )
    esearch_url = f"{eutils_api}/esearch.fcgi?{args}"

    # DTD: https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd
    esearch_resp: "SXNG_Response" = get(esearch_url)
    pmids_results = etree.XML(esearch_resp.content)
    # ``.text`` is None for an empty <Id/>, such items can't be joined / fetched
    pmids: list[str] = [i.text for i in pmids_results.xpath("//eSearchResult/IdList/Id") if i.text]

    # send efetch request with the IDs from esearch response
    # NOTE(review): if esearch finds nothing, ``id`` is an empty string --
    # confirm that efetch answers such a request with a parsable document.
    args = urlencode(
        {
            "db": "pubmed",
            "retmode": "xml",
            "id": ",".join(pmids),
        }
    )
    params["url"] = f"{eutils_api}/efetch.fcgi?{args}"
def _field_txt(xml: ElementType, xpath_str: str) -> str:
    """Text of the first node selected by ``xpath_str`` below ``xml``, or an
    empty string if nothing is selected."""
    elem = eval_xpath_getindex(xml, xpath_str, 0, default="")
    return extract_text(elem, allow_none=True) or ""


def _inner_text(elem: "ElementType | None") -> str:
    """Whole text content of ``elem`` (text of nested inline markup included)
    with runs of whitespace collapsed; an empty string if ``elem`` is ``None``.

    ``elem.text`` alone stops at the first child element and is ``None`` when
    the element starts with a child, e.g. ``<ArticleTitle><i>..</i> ..``.
    """
    if elem is None:
        return ""
    return " ".join("".join(elem.itertext()).split())


def _author_names(medline_citation: ElementType) -> list[str]:
    """Names ("ForeName LastName") of the authors listed in the citation."""
    names: list[str] = []
    for author in eval_xpath_list(medline_citation, "./Article/AuthorList/Author"):
        fore = _inner_text(eval_xpath_getindex(author, "./ForeName", 0, default=None))
        last = _inner_text(eval_xpath_getindex(author, "./LastName", 0, default=None))
        # group authors carry a <CollectiveName> instead of personal names
        name = f"{fore} {last}".strip() or _inner_text(
            eval_xpath_getindex(author, "./CollectiveName", 0, default=None)
        )
        if name:
            names.append(name)
    return names


def _accepted_date(pubmed_article: ElementType) -> "datetime | None":
    """Date at which the article was accepted, or ``None`` if the record has no
    (complete and valid) "accepted" entry in its history."""
    # Query from the article root with a default: a record without
    # <PubmedData> must not abort the parsing of the whole response.
    accepted = eval_xpath_getindex(
        pubmed_article, "./PubmedData/History//PubMedPubDate[@PubStatus='accepted']", 0, default=None
    )
    if accepted is None:
        return None
    try:
        # a missing / empty part yields "" and int("") raises ValueError
        year, month, day = (int(_field_txt(accepted, f"./{part}/text()")) for part in ("Year", "Month", "Day"))
        return datetime(year=year, month=month, day=day)
    except ValueError:
        return None


def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
    """Parse the efetch reply and return one ``Paper`` result per
    ``<PubmedArticle>``.

    Records without a PMID are skipped since no result URL can be built for
    them.  Optional fields (abstract, DOI, journal, ISSN, authors, accepted
    date) are left empty when the record does not provide them.

    :param resp: HTTP response of the efetch request; ``resp.content`` is
        parsed as XML.
    """

    # DTD: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd

    # parse efetch response
    efetch_xml = etree.XML(resp.content)
    res = EngineResults()

    for pubmed_article in eval_xpath_list(efetch_xml, "//PubmedArticle"):

        medline_citation: ElementType = eval_xpath_getindex(pubmed_article, "./MedlineCitation", 0)

        pmid = _field_txt(medline_citation, ".//PMID/text()")
        if not pmid:
            continue

        title = _inner_text(eval_xpath_getindex(medline_citation, ".//Article/ArticleTitle", 0, default=None))

        # An abstract may consist of several (labeled) <AbstractText> sections,
        # each of which may contain inline markup: take all of them, not only
        # the first text node.
        sections = (_inner_text(i) for i in eval_xpath_list(medline_citation, ".//Abstract/AbstractText"))
        content = " ".join(i for i in sections if i)

        issn = _field_txt(medline_citation, "./Article/Journal/ISSN/text()")

        res.add(
            res.types.Paper(
                url=pubmed_url + pmid,
                title=title,
                content=content,
                journal=_field_txt(medline_citation, "./Article/Journal/Title/text()"),
                # no ISSN --> empty list (not a list with an empty string)
                issn=[issn] if issn else [],
                authors=_author_names(medline_citation),
                doi=_field_txt(medline_citation, ".//ELocationID[@EIdType='doi']/text()"),
                publishedDate=_accepted_date(pubmed_article),
            )
        )
    return res