diff options
| -rw-r--r-- | AUTHORS.rst | 1 | ||||
| -rwxr-xr-x | manage.sh | 3 | ||||
| -rw-r--r-- | searx/engines/duckduckgo.py | 30 | ||||
| -rw-r--r-- | searx/engines/scanr_structures.py | 78 | ||||
| -rw-r--r-- | searx/results.py | 10 | ||||
| -rw-r--r-- | searx/settings.yml | 19 | ||||
| -rw-r--r-- | searx/templates/oscar/base.html | 3 | ||||
| -rw-r--r-- | searx/webapp.py | 22 | ||||
| -rw-r--r-- | tests/unit/engines/test_duckduckgo.py | 8 | ||||
| -rw-r--r-- | tests/unit/engines/test_scanr_structures.py | 175 | ||||
| -rw-r--r-- | tests/unit/test_webapp.py | 4 | ||||
| -rw-r--r-- | utils/fetch_currencies.py | 2 |
12 files changed, 324 insertions, 31 deletions
diff --git a/AUTHORS.rst b/AUTHORS.rst index 5bc6807a9..f3e875ada 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -51,3 +51,4 @@ generally made searx better: - Ashutosh Das @pyprism - YuLun Shih @imZack - Dmitry Mikhirev @mikhirev +- David A Roberts `@davidar <https://github.com/davidar>`_ @@ -58,7 +58,8 @@ styles() { build_style themes/courgette/less/style.less themes/courgette/css/style.css build_style themes/courgette/less/style-rtl.less themes/courgette/css/style-rtl.css build_style less/bootstrap/bootstrap.less css/bootstrap.min.css - build_style themes/oscar/less/oscar/oscar.less themes/oscar/css/oscar.min.css + build_style themes/oscar/less/pointhi/oscar.less themes/oscar/css/pointhi.min.css + build_style themes/oscar/less/logicodev/oscar.less themes/oscar/css/logicodev.min.css build_style themes/pix-art/less/style.less themes/pix-art/css/style.css } diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 373ce1b2d..d29e4416a 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -11,13 +11,12 @@ @parse url, title, content @todo rewrite to api - @todo language support - (the current used site does not support language-change) """ from urllib import urlencode from lxml.html import fromstring from searx.engines.xpath import extract_text +from searx.languages import language_codes # engine dependent config categories = ['general'] @@ -39,13 +38,28 @@ def request(query, params): offset = (params['pageno'] - 1) * 30 if params['language'] == 'all': - locale = 'en-us' + locale = None else: - locale = params['language'].replace('_', '-').lower() - - params['url'] = url.format( - query=urlencode({'q': query, 'kl': locale}), - offset=offset) + locale = params['language'].split('_') + if len(locale) == 2: + # country code goes first + locale = locale[1].lower() + '-' + locale[0].lower() + else: + # tries to get a country code from language + locale = locale[0].lower() + lang_codes = [x[0] for x in language_codes] + for lc in lang_codes: + lc = lc.split('_') + if locale == lc[0]: + locale = lc[1].lower() + '-' + lc[0].lower() + break + + if locale: + params['url'] = url.format( + query=urlencode({'q': query, 'kl': locale}), offset=offset) + else: + params['url'] = url.format( + query=urlencode({'q': query}), offset=offset) return params diff --git a/searx/engines/scanr_structures.py b/searx/engines/scanr_structures.py new file mode 100644 index 000000000..ad78155ac --- /dev/null +++ b/searx/engines/scanr_structures.py @@ -0,0 +1,78 @@ +""" + ScanR Structures (Science) + + @website https://scanr.enseignementsup-recherche.gouv.fr + @provide-api yes (https://scanr.enseignementsup-recherche.gouv.fr/api/swagger-ui.html) + + @using-api yes + @results JSON + @stable yes + @parse url, title, content, img_src +""" + +from urllib import urlencode +from json import loads, dumps +from dateutil import parser +from searx.utils import html_to_text + +# engine dependent config +categories = ['science'] +paging = True +page_size = 20 + +# search-url +url = 'https://scanr.enseignementsup-recherche.gouv.fr/' +search_url = url + 'api/structures/search' + + +# do search-request +def request(query, params): + + params['url'] = search_url + params['method'] = 'POST' + params['headers']['Content-type'] = "application/json" + params['data'] = dumps({"query": query, + "searchField": "ALL", + "sortDirection": "ASC", + "sortOrder": "RELEVANCY", + "page": params['pageno'], + "pageSize": page_size}) + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_res = loads(resp.text) + + # return empty array if there are no results + if search_res.get('total') < 1: + return [] + + # parse results + for result in search_res['results']: + if 'id' not in result: + continue + + # is it thumbnail or img_src?? + thumbnail = None + if 'logo' in result: + thumbnail = result['logo'] + if thumbnail[0] == '/': + thumbnail = url + thumbnail + + content = None + if 'highlights' in result: + content = result['highlights'][0]['value'] + + # append result + results.append({'url': url + 'structure/' + result['id'], + 'title': result['label'], + # 'thumbnail': thumbnail, + 'img_src': thumbnail, + 'content': html_to_text(content)}) + + # return results + return results diff --git a/searx/results.py b/searx/results.py index d5d88af6b..1fbbadc38 100644 --- a/searx/results.py +++ b/searx/results.py @@ -100,7 +100,7 @@ class ResultContainer(object): self._infobox_ids = {} self.suggestions = set() self.answers = set() - self.number_of_results = 0 + self._number_of_results = [] def extend(self, engine_name, results): for result in list(results): @@ -114,7 +114,7 @@ class ResultContainer(object): self._merge_infobox(result) results.remove(result) elif 'number_of_results' in result: - self.number_of_results = max(self.number_of_results, result['number_of_results']) + self._number_of_results.append(result['number_of_results']) results.remove(result) with RLock(): @@ -253,3 +253,9 @@ class ResultContainer(object): def results_length(self): return len(self._merged_results) + + def results_number(self): + resultnum_sum = sum(self._number_of_results) + if not resultnum_sum or not self._number_of_results: + return 0 + return resultnum_sum / len(self._number_of_results) diff --git a/searx/settings.yml b/searx/settings.yml index 558898886..e578dacc3 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -200,6 +200,20 @@ engines: engine : google_news shortcut : gon + - name : google scholar + engine : xpath + paging : True + search_url : https://scholar.google.com/scholar?start={pageno}&q={query}&hl=en&as_sdt=0,5&as_vis=1' + results_xpath : //div[@class="gs_r"]/div[@class="gs_ri"] + url_xpath : .//h3/a/@href + title_xpath : .//h3/a + content_xpath : .//div[@class="gs_rs"] + suggestion_xpath : //div[@id="gs_qsuggest"]/ul/li + page_size : 10 + first_page_num : 0 + categories : science + shortcut : gos + - name : google play apps engine : xpath search_url : https://play.google.com/store/search?q={query}&c=apps @@ -314,6 +328,11 @@ engines: engine : kickass shortcut : ka + - name : scanr_structures + shortcut: scs + engine : scanr_structures + disabled : True + - name : soundcloud engine : soundcloud shortcut : sc diff --git a/searx/templates/oscar/base.html b/searx/templates/oscar/base.html index 649d91f4d..a1f1c1a90 100644 --- a/searx/templates/oscar/base.html +++ b/searx/templates/oscar/base.html @@ -90,8 +90,5 @@ {% for script in scripts %} <script src="{{ url_for('static', filename=script) }}"></script> {% endfor %} - <script type="text/javascript"> - $(function() { $('a[data-toggle="modal"]').attr('href', '#'); }); - </script> </body> </html> diff --git a/searx/webapp.py b/searx/webapp.py index 00a203636..e9d27a0db 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -380,7 +380,9 @@ def index(): plugins.call('post_search', request, locals()) - for result in search.result_container.get_ordered_results(): + results = search.result_container.get_ordered_results() + + for result in results: plugins.call('on_result', request, locals()) if not search.paging and engines[result['engine']].paging: @@ -418,16 +420,20 @@ def index(): else: result['publishedDate'] = format_date(result['publishedDate']) + number_of_results = search.result_container.results_number() + if number_of_results < search.result_container.results_length(): + number_of_results = 0 + if search.request_data.get('format') == 'json': return Response(json.dumps({'query': search.query, - 'number_of_results': search.result_container.number_of_results, - 'results': search.result_container.get_ordered_results()}), + 'number_of_results': number_of_results, + 'results': results}), mimetype='application/json') elif search.request_data.get('format') == 'csv': csv = UnicodeWriter(cStringIO.StringIO()) keys = ('title', 'url', 'content', 'host', 'engine', 'score') csv.writerow(keys) - for row in search.result_container.get_ordered_results(): + for row in results: row['host'] = row['parsed_url'].netloc csv.writerow([row.get(key, '') for key in keys]) csv.stream.seek(0) @@ -438,20 +444,20 @@ def index(): elif search.request_data.get('format') == 'rss': response_rss = render( 'opensearch_response_rss.xml', - results=search.result_container.get_ordered_results(), + results=results, q=search.request_data['q'], - number_of_results=search.result_container.number_of_results, + number_of_results=number_of_results, base_url=get_base_url() ) return Response(response_rss, mimetype='text/xml') return render( 'results.html', - results=search.result_container.get_ordered_results(), + results=results, q=search.request_data['q'], selected_categories=search.categories, paging=search.paging, - number_of_results=format_decimal(search.result_container.number_of_results), + number_of_results=format_decimal(number_of_results), pageno=search.pageno, base_url=get_base_url(), suggestions=search.result_container.suggestions, diff --git a/tests/unit/engines/test_duckduckgo.py b/tests/unit/engines/test_duckduckgo.py index 8f99dc9cb..90cdc9d9e 100644 --- a/tests/unit/engines/test_duckduckgo.py +++ b/tests/unit/engines/test_duckduckgo.py @@ -11,16 +11,12 @@ class TestDuckduckgoEngine(SearxTestCase): query = 'test_query' dicto = defaultdict(dict) dicto['pageno'] = 1 - dicto['language'] = 'fr_FR' + dicto['language'] = 'de_CH' params = duckduckgo.request(query, dicto) self.assertIn('url', params) self.assertIn(query, params['url']) self.assertIn('duckduckgo.com', params['url']) - self.assertIn('fr-fr', params['url']) - - dicto['language'] = 'all' - params = duckduckgo.request(query, dicto) - self.assertIn('en-us', params['url']) + self.assertIn('ch-de', params['url']) def test_response(self): self.assertRaises(AttributeError, duckduckgo.response, None) diff --git a/tests/unit/engines/test_scanr_structures.py b/tests/unit/engines/test_scanr_structures.py new file mode 100644 index 000000000..a7b9e9185 --- /dev/null +++ b/tests/unit/engines/test_scanr_structures.py @@ -0,0 +1,175 @@ +from collections import defaultdict +import mock +from searx.engines import scanr_structures +from searx.testing import SearxTestCase + + +class TestScanrStructuresEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dicto = defaultdict(dict) + dicto['pageno'] = 1 + params = scanr_structures.request(query, dicto) + self.assertIn('url', params) + self.assertIn(query, params['data']) + self.assertIn('scanr.enseignementsup-recherche.gouv.fr', params['url']) + + def test_response(self): + self.assertRaises(AttributeError, scanr_structures.response, None) + self.assertRaises(AttributeError, scanr_structures.response, []) + self.assertRaises(AttributeError, scanr_structures.response, '') + self.assertRaises(AttributeError, scanr_structures.response, '[]') + + response = mock.Mock(text='{}') + self.assertEqual(scanr_structures.response(response), []) + + response = mock.Mock(text='{"data": []}') + self.assertEqual(scanr_structures.response(response), []) + + json = u""" + { + "request": + { + "query":"test_query", + "page":1, + "pageSize":20, + "sortOrder":"RELEVANCY", + "sortDirection":"ASC", + "searchField":"ALL", + "from":0 + }, + "total":2471, + "results":[ + { + "id":"200711886U", + "label":"Laboratoire d'Informatique de Grenoble", + "kind":"RNSR", + "publicEntity":true, + "address":{"city":"Grenoble","departement":"38"}, + "logo":"/static/logos/200711886U.png", + "acronym":"LIG", + "type":{"code":"UR","label":"Unit\xe9 de recherche"}, + "level":2, + "institutions":[ + { + "id":"193819125", + "label":"Grenoble INP", + "acronym":"IPG", + "code":"UMR 5217" + }, + { + "id":"130021397", + "label":"Universit\xe9 de Grenoble Alpes", + "acronym":"UGA", + "code":"UMR 5217" + }, + { + "id":"180089013", + "label":"Centre national de la recherche scientifique", + "acronym":"CNRS", + "code":"UMR 5217" + }, + { + "id":"180089047", + "label":"Institut national de recherche en informatique et en automatique", + "acronym":"Inria", + "code":"UMR 5217" + } + ], + "highlights":[ + { + "type":"projects", + "value":"linguicielles d\xe9velopp\xe9s jusqu'ici par le GETALP\ + du <strong>LIG</strong> en tant que prototypes op\xe9rationnels.\ +\\r\\nDans le contexte" + }, + { + "type":"acronym", + "value":"<strong>LIG</strong>" + }, + { + "type":"websiteContents", + "value":"S\xe9lection\\nListe structures\\nD\xe9tail\\n\ + Accueil\\n200711886U : <strong>LIG</strong>\ + Laboratoire d'Informatique de Grenoble Unit\xe9 de recherche"}, + { + "type":"publications", + "value":"de noms. Nous avons d'abord d\xe9velopp\xe9 LOOV \ + (pour <strong>Lig</strong> Overlaid OCR in Vid\xe9o), \ + un outil d'extraction des" + } + ] + }, + { + "id":"199511665F", + "label":"Laboratoire Bordelais de Recherche en Informatique", + "kind":"RNSR", + "publicEntity":true, + "address":{"city":"Talence","departement":"33"}, + "logo":"/static/logos/199511665F.png", + "acronym":"LaBRI", + "type":{"code":"UR","label":"Unit\xe9 de recherche"}, + "level":2, + "institutions":[ + { + "id":"130006356", + "label":"Institut polytechnique de Bordeaux", + "acronym":"IPB", + "code":"UMR 5800" + }, + { + "id":"130018351", + "label":"Universit\xe9 de Bordeaux", + "acronym":null, + "code":"UMR 5800" + }, + { + "id":"180089013", + "label":"Centre national de la recherche scientifique", + "acronym":"CNRS", + "code":"UMR 5800" + }, + { + "id":"180089047", + "label":"Institut national de recherche en informatique et en automatique", + "acronym":"Inria", + "code":"UMR 5800" + } + ], + "highlights":[ + { + "type":"websiteContents", + "value":"Samia Kerdjoudj\\n2016-07-05\\nDouble-exponential\ + and <strong>triple</strong>-exponential bounds for\ + choosability problems parameterized" + }, + { + "type":"publications", + "value":"de cam\xe9ras install\xe9es dans les lieux publiques \ + a <strong>tripl\xe9</strong> en 2009, passant de 20 000 \ + \xe0 60 000. Malgr\xe9 le" + } + ] + } + ] + } + """ + response = mock.Mock(text=json) + results = scanr_structures.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertEqual(results[0]['title'], u"Laboratoire d'Informatique de Grenoble") + self.assertEqual(results[0]['url'], 'https://scanr.enseignementsup-recherche.gouv.fr/structure/200711886U') + self.assertEqual(results[0]['content'], + u"linguicielles d\xe9velopp\xe9s jusqu'ici par le GETALP " + u"du LIG en tant que prototypes " + u"op\xe9rationnels. Dans le contexte") + self.assertEqual(results[1]['img_src'], + 'https://scanr.enseignementsup-recherche.gouv.fr//static/logos/199511665F.png') + self.assertEqual(results[1]['content'], + "Samia Kerdjoudj 2016-07-05 Double-exponential and" + " triple-exponential bounds for " + "choosability problems parameterized") + self.assertEqual(results[1]['url'], 'https://scanr.enseignementsup-recherche.gouv.fr/structure/199511665F') + self.assertEqual(results[1]['title'], u"Laboratoire Bordelais de Recherche en Informatique") diff --git a/tests/unit/test_webapp.py b/tests/unit/test_webapp.py index cbf0da471..1762d66b6 100644 --- a/tests/unit/test_webapp.py +++ b/tests/unit/test_webapp.py @@ -38,7 +38,7 @@ class ViewsTestCase(SearxTestCase): suggestions=set(), infoboxes=[], results=self.test_results, - number_of_results=len(self.test_results), + results_number=lambda: 3, results_length=lambda: len(self.test_results)) webapp.Search.search = search_mock @@ -96,7 +96,7 @@ class ViewsTestCase(SearxTestCase): ) self.assertIn( - '<opensearch:totalResults>2</opensearch:totalResults>', + '<opensearch:totalResults>3</opensearch:totalResults>', result.data ) diff --git a/utils/fetch_currencies.py b/utils/fetch_currencies.py index 3ca8fcfd7..47f732163 100644 --- a/utils/fetch_currencies.py +++ b/utils/fetch_currencies.py @@ -142,7 +142,7 @@ def wd_query(query, offset=0): qlist.append(r.get('title', '')) fetch_data_batch(qlist) -## fetch ## +# fetch # for q in wmflabs_queries: wdq_query(q) |