diff options
| author | Markus Heiser <markus.heiser@darmarit.de> | 2025-09-10 16:43:42 +0200 |
|---|---|---|
| committer | Markus Heiser <markus.heiser@darmarIT.de> | 2025-09-20 10:56:46 +0200 |
| commit | bb22bb1831f1a154d365713dfb5d30e86943dade (patch) | |
| tree | 38c7a3b437dcfc72a8a7d5d89efa4bb124a37718 /searx | |
| parent | 96e63df8ca187136ea37942fb840220dae8c8766 (diff) | |
[mod] PubMed engine: revision of the engine (Paper result)
Revision of the engine: use the `Paper` result type, and add type
annotations throughout.
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
| -rw-r--r-- | searx/engines/pubmed.py | 195 | ||||
| -rw-r--r-- | searx/settings.yml | 1 |
2 files changed, 107 insertions, 89 deletions
# SPDX-License-Identifier: AGPL-3.0-or-later
"""PubMed_ comprises more than 39 million citations for biomedical literature
from MEDLINE, life science journals, and online books. Citations may include
links to full text content from PubMed Central and publisher web sites.

.. _PubMed: https://pubmed.ncbi.nlm.nih.gov/

Configuration
=============

.. code:: yaml

  - name: pubmed
    engine: pubmed
    shortcut: pub

Implementations
===============
"""

import typing as t

from datetime import datetime
from urllib.parse import urlencode
from lxml import etree

from searx.result_types import EngineResults
from searx.network import get
from searx.utils import (
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
    ElementType,
)

if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams


about = {
    "website": "https://www.ncbi.nlm.nih.gov/pubmed/",
    "wikidata_id": "Q1540899",
    "official_api_documentation": {
        "url": "https://www.ncbi.nlm.nih.gov/home/develop/api/",
        "comment": "More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/",
    },
    "use_official_api": True,
    "require_api_key": False,
    "results": "XML",
}

categories = ["science", "scientific publications"]

# Base URL of the NCBI Entrez E-utilities (esearch + efetch endpoints).
eutils_api = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# engine dependent config
number_of_results = 10
pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"


def request(query: str, params: "OnlineParams") -> None:
    """Build the URL of the efetch request.

    PubMed is queried in two steps: an *esearch* request (sent synchronously
    from here) turns the query string into a page of PMIDs, and the resulting
    *efetch* URL — which returns the full XML records for those PMIDs — is
    stored in ``params["url"]`` for the regular engine request cycle.
    """
    args = urlencode(
        {
            "db": "pubmed",
            "term": query,
            "retstart": (params["pageno"] - 1) * number_of_results,
            # esearch's page-size parameter is named ``retmax`` (sending it
            # as ``hits`` would be ignored and the API default used instead)
            "retmax": number_of_results,
        }
    )
    esearch_url = f"{eutils_api}/esearch.fcgi?{args}"
    # DTD: https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd
    esearch_resp: "SXNG_Response" = get(esearch_url)
    pmids_results = etree.XML(esearch_resp.content)
    pmids: list[str] = [i.text for i in pmids_results.xpath("//eSearchResult/IdList/Id")]

    # send efetch request with the IDs from esearch response
    args = urlencode(
        {
            "db": "pubmed",
            "retmode": "xml",
            "id": ",".join(pmids),
        }
    )
    efetch_url = f"{eutils_api}/efetch.fcgi?{args}"
    params["url"] = efetch_url


def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
    """Parse the efetch XML response into :py:obj:`Paper <res.types.Paper>` results."""

    # DTD: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd

    # parse efetch response
    efetch_xml = etree.XML(resp.content)
    res = EngineResults()

    def _field_txt(xml: ElementType, xpath_str: str) -> str:
        # Text of the first match of *xpath_str*, or "" when the field is absent.
        elem = eval_xpath_getindex(xml, xpath_str, 0, default="")
        return extract_text(elem, allow_none=True) or ""

    for pubmed_article in eval_xpath_list(efetch_xml, "//PubmedArticle"):

        medline_citation: ElementType = eval_xpath_getindex(pubmed_article, "./MedlineCitation", 0)
        pubmed_data: ElementType = eval_xpath_getindex(pubmed_article, "./PubmedData", 0)

        title: str = eval_xpath_getindex(medline_citation, ".//Article/ArticleTitle", 0).text
        pmid: str = eval_xpath_getindex(medline_citation, ".//PMID", 0).text
        url: str = pubmed_url + pmid
        # NOTE(review): this picks only the first text node of the abstract;
        # multi-paragraph abstracts are truncated — confirm if intended.
        content = _field_txt(medline_citation, ".//Abstract/AbstractText//text()")
        doi = _field_txt(medline_citation, ".//ELocationID[@EIdType='doi']/text()")
        journal = _field_txt(medline_citation, "./Article/Journal/Title/text()")
        issn = _field_txt(medline_citation, "./Article/Journal/ISSN/text()")

        authors: list[str] = []
        for author in eval_xpath_list(medline_citation, "./Article/AuthorList/Author"):
            f = eval_xpath_getindex(author, "./ForeName", 0, default=None)
            l = eval_xpath_getindex(author, "./LastName", 0, default=None)
            author_name = f"{f.text if f is not None else ''} {l.text if l is not None else ''}".strip()
            if author_name:
                authors.append(author_name)

        # publication date: only the "accepted" entry of the history is used
        accepted_date = eval_xpath_getindex(
            pubmed_data, "./History//PubMedPubDate[@PubStatus='accepted']", 0, default=None
        )
        pub_date = None
        if accepted_date is not None:
            year = eval_xpath_getindex(accepted_date, "./Year", 0)
            month = eval_xpath_getindex(accepted_date, "./Month", 0)
            day = eval_xpath_getindex(accepted_date, "./Day", 0)
            try:
                pub_date = datetime(year=int(year.text), month=int(month.text), day=int(day.text))
            except ValueError:
                # malformed / out-of-range date in the record: drop the date
                pass

        res.add(
            res.types.Paper(
                url=url,
                title=title,
                content=content,
                journal=journal,
                issn=[issn],
                authors=authors,
                doi=doi,
                publishedDate=pub_date,
            )
        )
    return res