diff options
| author | Markus Heiser <markus.heiser@darmarit.de> | 2025-09-10 16:39:24 +0200 |
|---|---|---|
| committer | Markus Heiser <markus.heiser@darmarIT.de> | 2025-09-20 10:56:46 +0200 |
| commit | 599d9488c5e363fd01ec9170a5fea795c3f09f5d (patch) | |
| tree | a4095e608dff46a7d5cd5abc744f03aace0f3c2c /searx/engines/google_scholar.py | |
| parent | 078c9fcb68fe0e1b75e5aa64040d892faa83c063 (diff) | |
[mod] Google Scholar engine: revision of the engine (Paper result)
Revision of the engine / use of the result type Paper as well as other
typifications.
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines/google_scholar.py')
| -rw-r--r-- | searx/engines/google_scholar.py | 265 |
1 file changed, 143 insertions, 122 deletions
diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py index 5420a5415..8a82b36ee 100644 --- a/searx/engines/google_scholar.py +++ b/searx/engines/google_scholar.py @@ -1,12 +1,29 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -"""This is the implementation of the Google Scholar engine. +"""Google Scholar is a freely accessible web search engine that indexes the full +text or metadata of scholarly literature across an array of publishing formats +and disciplines. Compared to other Google services the Scholar engine has a simple GET REST-API -and there does not exists `async` API. Even though the API slightly vintage we -can make use of the :ref:`google API` to assemble the arguments of the GET +and there does not exists ``async`` API. Even though the API slightly vintage +we can make use of the :ref:`google API` to assemble the arguments of the GET request. + +Configuration +============= + +.. code:: yaml + + - name: google scholar + engine: google_scholar + shortcut: gos + +Implementations +=============== + """ +import typing as t + from urllib.parse import urlencode from datetime import datetime from lxml import html @@ -16,6 +33,7 @@ from searx.utils import ( eval_xpath_getindex, eval_xpath_list, extract_text, + ElementType, ) from searx.exceptions import SearxEngineCaptchaException @@ -26,18 +44,23 @@ from searx.engines.google import ( time_range_dict, ) -# about +from searx.result_types import EngineResults + +if t.TYPE_CHECKING: + from searx.extended_types import SXNG_Response + from searx.search.processors import OnlineParams + about = { - "website": 'https://scholar.google.com', - "wikidata_id": 'Q494817', - "official_api_documentation": 'https://developers.google.com/custom-search', + "website": "https://scholar.google.com", + "wikidata_id": "Q494817", + "official_api_documentation": "https://developers.google.com/custom-search", "use_official_api": False, "require_api_key": False, - "results": 'HTML', + "results": "HTML", } # 
engine dependent config -categories = ['science', 'scientific publications'] +categories = ["science", "scientific publications"] paging = True max_page = 50 """`Google max 50 pages`_ @@ -50,9 +73,97 @@ safesearch = False send_accept_language_header = True -def time_range_args(params): +def request(query: str, params: "OnlineParams") -> None: + """Google-Scholar search request""" + + google_info = get_google_info(params, traits) + # subdomain is: scholar.google.xy + google_info["subdomain"] = google_info["subdomain"].replace("www.", "scholar.") + + args = { + "q": query, + **google_info["params"], + "start": (params["pageno"] - 1) * 10, + "as_sdt": "2007", # include patents / to disable set "0,5" + "as_vis": "0", # include citations / to disable set "1" + } + args.update(time_range_args(params)) + + params["url"] = "https://" + google_info["subdomain"] + "/scholar?" + urlencode(args) + params["cookies"] = google_info["cookies"] + params["headers"].update(google_info["headers"]) + + +def response(resp: "SXNG_Response") -> EngineResults: # pylint: disable=too-many-locals + """Parse response from Google Scholar""" + + res = EngineResults() + dom = html.fromstring(resp.text) + detect_google_captcha(dom) + + # parse results + for result in eval_xpath_list(dom, "//div[@data-rp]"): + + title = extract_text(eval_xpath(result, ".//h3[1]//a")) + if not title: + # this is a [ZITATION] block + continue + + pub_type: str = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']")) or "" + if pub_type: + pub_type = pub_type[1:-1].lower() + + url: str = eval_xpath_getindex(result, ".//h3[1]//a/@href", 0) + content: str = extract_text(eval_xpath(result, ".//div[@class='gs_rs']")) or "" + authors, journal, publisher, publishedDate = parse_gs_a( + extract_text(eval_xpath(result, ".//div[@class='gs_a']")) + ) + if publisher in url: + publisher = "" + + # cited by + comments: str = ( + extract_text(eval_xpath(result, 
".//div[@class='gs_fl']/a[starts-with(@href,'/scholar?cites=')]")) or "" + ) + + # link to the html or pdf document + html_url: str = "" + pdf_url: str = "" + doc_url = eval_xpath_getindex(result, ".//div[@class='gs_or_ggsm']/a/@href", 0, default=None) + doc_type = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']")) + if doc_type == "[PDF]": + pdf_url = doc_url + else: + html_url = doc_url + + res.add( + res.types.Paper( + type=pub_type, + url=url, + title=title, + authors=authors, + publisher=publisher, + journal=journal, + publishedDate=publishedDate, + content=content, + comments=comments, + html_url=html_url, + pdf_url=pdf_url, + ) + ) + + # parse suggestion + for suggestion in eval_xpath(dom, "//div[contains(@class, 'gs_qsuggest_wrap')]//li//a"): + res.add(res.types.LegacyResult(suggestion=extract_text(suggestion))) + + for correction in eval_xpath(dom, "//div[@class='gs_r gs_pda']/a"): + res.add(res.types.LegacyResult(correction=extract_text(correction))) + return res + + +def time_range_args(params: "OnlineParams") -> dict[str, int]: """Returns a dictionary with a time range arguments based on - ``params['time_range']``. + ``params["time_range"]``. Google Scholar supports a detailed search by year. Searching by *last month* or *last week* (as offered by SearXNG) is uncommon for scientific @@ -60,21 +171,23 @@ def time_range_args(params): To limit the result list when the users selects a range, all the SearXNG ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range - is set an empty dictionary of arguments is returned. Example; when - user selects a time range (current year minus one in 2022): + is set an empty dictionary of arguments is returned. + + Example; when user selects a time range and we find ourselves in the year + 2025 (current year minus one): .. 
code:: python - { 'as_ylo' : 2021 } + { "as_ylo" : 2024 } """ - ret_val = {} - if params['time_range'] in time_range_dict: - ret_val['as_ylo'] = datetime.now().year - 1 + ret_val: dict[str, int] = {} + if params["time_range"] in time_range_dict: + ret_val["as_ylo"] = datetime.now().year - 1 return ret_val -def detect_google_captcha(dom): +def detect_google_captcha(dom: ElementType): """In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is not redirected to ``sorry.google.com``. """ @@ -82,29 +195,7 @@ def detect_google_captcha(dom): raise SearxEngineCaptchaException() -def request(query, params): - """Google-Scholar search request""" - - google_info = get_google_info(params, traits) - # subdomain is: scholar.google.xy - google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.") - - args = { - 'q': query, - **google_info['params'], - 'start': (params['pageno'] - 1) * 10, - 'as_sdt': '2007', # include patents / to disable set '0,5' - 'as_vis': '0', # include citations / to disable set '1' - } - args.update(time_range_args(params)) - - params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args) - params['cookies'] = google_info['cookies'] - params['headers'].update(google_info['headers']) - return params - - -def parse_gs_a(text: str | None): +def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]: """Parse the text written in green. 
Possible formats: @@ -113,98 +204,28 @@ def parse_gs_a(text: str | None): * "{authors} - {publisher}" """ if text is None or text == "": - return None, None, None, None + return [], "", "", None - s_text = text.split(' - ') - authors = s_text[0].split(', ') - publisher = s_text[-1] + s_text = text.split(" - ") + authors: list[str] = s_text[0].split(", ") + publisher: str = s_text[-1] if len(s_text) != 3: - return authors, None, publisher, None + return authors, "", publisher, None # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}" # get journal and year - journal_year = s_text[1].split(', ') + journal_year = s_text[1].split(", ") # journal is optional and may contains some coma if len(journal_year) > 1: - journal = ', '.join(journal_year[0:-1]) - if journal == '…': - journal = None + journal: str = ", ".join(journal_year[0:-1]) + if journal == "…": + journal = "" else: - journal = None + journal = "" # year year = journal_year[-1] try: - publishedDate = datetime.strptime(year.strip(), '%Y') + publishedDate = datetime.strptime(year.strip(), "%Y") except ValueError: publishedDate = None return authors, journal, publisher, publishedDate - - -def response(resp): # pylint: disable=too-many-locals - """Parse response from Google Scholar""" - results = [] - - # convert the text to dom - dom = html.fromstring(resp.text) - detect_google_captcha(dom) - - # parse results - for result in eval_xpath_list(dom, '//div[@data-rp]'): - - title = extract_text(eval_xpath(result, './/h3[1]//a')) - - if not title: - # this is a [ZITATION] block - continue - - pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) - if pub_type: - pub_type = pub_type[1:-1].lower() - - url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0) - content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]')) - authors, journal, publisher, publishedDate = parse_gs_a( - extract_text(eval_xpath(result, './/div[@class="gs_a"]')) - ) - if 
publisher in url: - publisher = None - - # cited by - comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]')) - - # link to the html or pdf document - html_url = None - pdf_url = None - doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None) - doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) - if doc_type == "[PDF]": - pdf_url = doc_url - else: - html_url = doc_url - - results.append( - { - 'template': 'paper.html', - 'type': pub_type, - 'url': url, - 'title': title, - 'authors': authors, - 'publisher': publisher, - 'journal': journal, - 'publishedDate': publishedDate, - 'content': content, - 'comments': comments, - 'html_url': html_url, - 'pdf_url': pdf_url, - } - ) - - # parse suggestion - for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'): - # append suggestion - results.append({'suggestion': extract_text(suggestion)}) - - for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'): - results.append({'correction': extract_text(correction)}) - - return results |