diff options
| author | Bnyro <bnyro@tutanota.com> | 2025-04-29 22:34:44 +0200 |
|---|---|---|
| committer | Markus Heiser <markus.heiser@darmarIT.de> | 2025-05-02 16:46:38 +0200 |
| commit | 590b21165288127c6f942fe43d261d57d9a6d5c3 (patch) | |
| tree | d5fad888cbe9751efbbc70a2768a439d94485b8e | |
| parent | 41e3a0baa75f349aa287ca98b4524d211804b43f (diff) | |
[fix] semantic scholar: method not allowed / engine doesn't work
Fixes the Semantic Scholar engine by extracting a UI version token.
BTW: remove HTML tags from the content.
Author's checklist:
- they rate-limit very quickly: if you make more than approximately 2 requests per
minute, you have to wait some time before trying again...
- they also have an official API at api.semanticscholar.org, but its rate limits
are even stricter
Closes: https://github.com/searxng/searxng/issues/4685
| -rw-r--r-- | searx/engines/semantic_scholar.py | 38 |
1 files changed, 30 insertions, 8 deletions
diff --git a/searx/engines/semantic_scholar.py b/searx/engines/semantic_scholar.py index cc65ff542..450f3f765 100644 --- a/searx/engines/semantic_scholar.py +++ b/searx/engines/semantic_scholar.py @@ -1,11 +1,14 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -"""Semantic Scholar (Science) -""" +"""Semantic Scholar (Science)""" -from json import dumps, loads +from json import dumps from datetime import datetime +from lxml import html from flask_babel import gettext +from searx.network import get +from searx.utils import eval_xpath_getindex, gen_useragent, html_to_text + about = { "website": 'https://www.semanticscholar.org/', @@ -19,13 +22,31 @@ about = { categories = ['science', 'scientific publications'] paging = True search_url = 'https://www.semanticscholar.org/api/1/search' -paper_url = 'https://www.semanticscholar.org/paper' +base_url = 'https://www.semanticscholar.org' + + +def _get_ui_version(): + resp = get(base_url) + if not resp.ok: + raise RuntimeError("Can't determine Semantic Scholar UI version") + + doc = html.fromstring(resp.text) + ui_version = eval_xpath_getindex(doc, "//meta[@name='s2-ui-version']/@content", 0) + if not ui_version: + raise RuntimeError("Can't determine Semantic Scholar UI version") + + return ui_version def request(query, params): params['url'] = search_url params['method'] = 'POST' - params['headers']['content-type'] = 'application/json' + params['headers'] = { + 'Content-Type': 'application/json', + 'X-S2-UI-Version': _get_ui_version(), + 'X-S2-Client': "webapp-browser", + 'User-Agent': gen_useragent(), + } params['data'] = dumps( { "queryString": query, @@ -43,7 +64,8 @@ def request(query, params): def response(resp): - res = loads(resp.text) + res = resp.json() + results = [] for result in res['results']: url = result.get('primaryPaperLink', {}).get('url') @@ -54,7 +76,7 @@ def response(resp): if alternatePaperLinks: url = alternatePaperLinks[0].get('url') if not url: - url = paper_url + '/%s' % result['id'] + url = base_url 
+ '/paper/%s' % result['id'] # publishedDate if 'pubDate' in result: @@ -88,7 +110,7 @@ def response(resp): 'template': 'paper.html', 'url': url, 'title': result['title']['text'], - 'content': result['paperAbstract']['text'], + 'content': html_to_text(result['paperAbstract']['text']), 'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'), 'doi': result.get('doiInfo', {}).get('doi'), 'tags': result.get('fieldsOfStudy'), |