diff options
| author | Alexandre Flament <alex@al-f.net> | 2022-09-24 15:12:05 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-09-24 15:12:05 +0200 |
| commit | bfd6f6184911b5f4e7dcc8ee040daa9b790a28cb (patch) | |
| tree | 24b67e11f2ce75a0b78f061fabcd51271604f7e8 /searx | |
| parent | c808284f4d617225f25b131c577af43c923a647e (diff) | |
| parent | 16443d4f4a4a3b94c8646db48ac3f1ae6f0623c4 (diff) | |
Merge pull request #1804 from return42/fix-core.ac.uk
core.ac.uk: use paper.html template
Diffstat (limited to 'searx')
| -rw-r--r-- | searx/engines/core.py | 74 |
1 file changed, 54 insertions, 20 deletions
diff --git a/searx/engines/core.py b/searx/engines/core.py index 1fcb68f1f..a997343f2 100644 --- a/searx/engines/core.py +++ b/searx/engines/core.py @@ -4,7 +4,6 @@ """ -from json import loads from datetime import datetime from urllib.parse import urlencode @@ -42,39 +41,74 @@ def request(query, params): ) params['url'] = base_url + search_path - logger.debug("query_url --> %s", params['url']) return params def response(resp): results = [] - json_data = loads(resp.text) + json_data = resp.json() for result in json_data['data']: - source = result['_source'] + url = None + if source.get('urls'): + url = source['urls'][0].replace('http://', 'https://', 1) + + if url is None and source.get('doi'): + # use the DOI reference + url = 'https://doi.org/' + source['doi'] + + if url is None and source.get('downloadUrl'): + # use the downloadUrl + url = source['downloadUrl'] + + if url is None and source.get('identifiers'): + # try to find an ark id, see + # https://www.wikidata.org/wiki/Property:P8091 + # and https://en.wikipedia.org/wiki/Archival_Resource_Key + arkids = [ + identifier[5:] # 5 is the length of "ark:/" + for identifier in source.get('identifiers') + if isinstance(identifier, str) and identifier.startswith('ark:/') + ] + if len(arkids) > 0: + url = 'https://n2t.net/' + arkids[0] + + if url is None: + continue + time = source['publishedDate'] or source['depositedDate'] if time: - date = datetime.fromtimestamp(time / 1000) - else: - date = None - - metadata = [] - if source['publisher'] and len(source['publisher']) > 3: - metadata.append(source['publisher']) - if source['topics']: - metadata.append(source['topics'][0]) - if source['doi']: - metadata.append(source['doi']) - metadata = ' / '.join(metadata) + publishedDate = datetime.fromtimestamp(time / 1000) + + # sometimes the 'title' is None / filter None values + journals = [j['title'] for j in (source.get('journals') or []) if j['title']] + + publisher = source['publisher'] + if publisher: + publisher = source['publisher'].strip("'") results.append( { - 'url': source['urls'][0].replace('http://', 'https://', 1), + 'template': 'paper.html', 'title': source['title'], - 'content': source['description'], - 'publishedDate': date, - 'metadata': metadata, + 'url': url, + 'content': source['description'] or '', + # 'comments': '', + 'tags': source['topics'], + 'publishedDate': publishedDate, + 'type': (source['types'] or [None])[0], + 'authors': source['authors'], + 'editor': ', '.join(source['contributors'] or []), + 'publisher': publisher, + 'journal': ', '.join(journals), + # 'volume': '', + # 'pages' : '', + # 'number': '', + 'doi': source['doi'], + 'issn': source['issn'], + 'isbn': source.get('isbn'), # exists in the rawRecordXml + 'pdf_url': source.get('repositoryDocument', {}).get('pdfOrigin'), } ) |