summaryrefslogtreecommitdiff
path: root/searx/engines
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarIT.de>2020-04-29 12:02:39 +0000
committerGitHub <noreply@github.com>2020-04-29 12:02:39 +0000
commit0f4dbc4eca848d910fb4433e3d1c61fa7fcfa8de (patch)
treed3b37c89b57aca471cddecaea44f8e9218cc487f /searx/engines
parent678d41d75bcdf14b17fa13f69aa323a9f8716faa (diff)
parent7342806987aec05c50f12e149683609640ba66a0 (diff)
Merge branch 'master' into uwsgi_static
Diffstat (limited to 'searx/engines')
-rw-r--r--searx/engines/bing.py23
-rw-r--r--searx/engines/bing_images.py21
-rw-r--r--searx/engines/bing_news.py4
-rw-r--r--searx/engines/bing_videos.py6
-rw-r--r--searx/engines/faroo.py96
-rw-r--r--searx/engines/google_news.py2
-rw-r--r--searx/engines/startpage.py7
7 files changed, 30 insertions, 129 deletions
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index b193f7c60..afb776acd 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -110,13 +110,18 @@ def response(resp):
# get supported languages from their site
def _fetch_supported_languages(resp):
- supported_languages = []
+ lang_tags = set()
+
+ setmkt = re.compile('setmkt=([^&]*)')
dom = html.fromstring(resp.text)
- options = eval_xpath(dom, '//div[@id="limit-languages"]//input')
- for option in options:
- code = eval_xpath(option, './@id')[0].replace('_', '-')
- if code == 'nb':
- code = 'no'
- supported_languages.append(code)
-
- return supported_languages
+ lang_links = eval_xpath(dom, "//li/a[contains(@href, 'setmkt')]")
+
+ for a in lang_links:
+ href = eval_xpath(a, './@href')[0]
+ match = setmkt.search(href)
+ l_tag = match.groups()[0]
+ _lang, _nation = l_tag.split('-', 1)
+ l_tag = _lang.lower() + '-' + _nation.upper()
+ lang_tags.add(l_tag)
+
+ return list(lang_tags)
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 44e2c3bbc..138ed11c6 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -18,6 +18,8 @@ import re
from searx.url_utils import urlencode
from searx.utils import match_language
+from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
+
# engine dependent config
categories = ['images']
paging = True
@@ -103,22 +105,3 @@ def response(resp):
continue
return results
-
-
-# get supported languages from their site
-def _fetch_supported_languages(resp):
- supported_languages = []
- dom = html.fromstring(resp.text)
-
- regions_xpath = '//div[@id="region-section-content"]' \
- + '//ul[@class="b_vList"]/li/a/@href'
-
- regions = dom.xpath(regions_xpath)
- for region in regions:
- code = re.search('setmkt=[^\&]+', region).group()[7:]
- if code == 'nb-NO':
- code = 'no-NO'
-
- supported_languages.append(code)
-
- return supported_languages
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py
index 669130c42..d13be777c 100644
--- a/searx/engines/bing_news.py
+++ b/searx/engines/bing_news.py
@@ -15,9 +15,10 @@ from datetime import datetime
from dateutil import parser
from lxml import etree
from searx.utils import list_get, match_language
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
from searx.url_utils import urlencode, urlparse, parse_qsl
+from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
+
# engine dependent config
categories = ['news']
paging = True
@@ -58,6 +59,7 @@ def _get_url(query, language, offset, time_range):
offset=offset,
interval=time_range_dict[time_range])
else:
+ # e.g. setmkt=de-de&setlang=de
search_path = search_string.format(
query=urlencode({'q': query, 'setmkt': language}),
offset=offset)
diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py
index f1e636819..f048f0d8e 100644
--- a/searx/engines/bing_videos.py
+++ b/searx/engines/bing_videos.py
@@ -12,10 +12,10 @@
from json import loads
from lxml import html
-from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode
from searx.utils import match_language
+from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
categories = ['videos']
paging = True
@@ -67,6 +67,10 @@ def request(query, params):
if params['time_range'] in time_range_dict:
params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
+ # bing videos did not like "older" versions < 70.0.1 when selecting other
+ # languages than 'en' .. very strange ?!?!
+ params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1'
+
return params
diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py
deleted file mode 100644
index a36ecf778..000000000
--- a/searx/engines/faroo.py
+++ /dev/null
@@ -1,96 +0,0 @@
-"""
- Faroo (Web, News)
-
- @website http://www.faroo.com
- @provide-api yes (http://www.faroo.com/hp/api/api.html), require API-key
-
- @using-api no
- @results JSON
- @stable yes
- @parse url, title, content, publishedDate, img_src
-"""
-
-from json import loads
-import datetime
-from searx.utils import searx_useragent
-from searx.url_utils import urlencode
-
-# engine dependent config
-categories = ['general', 'news']
-paging = True
-language_support = True
-number_of_results = 10
-
-# search-url
-url = 'http://www.faroo.com/'
-search_url = url + 'instant.json?{query}'\
- '&start={offset}'\
- '&length={number_of_results}'\
- '&l={language}'\
- '&src={categorie}'\
- '&i=false'\
- '&c=false'
-
-search_category = {'general': 'web',
- 'news': 'news'}
-
-
-# do search-request
-def request(query, params):
- offset = (params['pageno'] - 1) * number_of_results + 1
- categorie = search_category.get(params['category'], 'web')
-
- if params['language'] == 'all':
- language = 'en'
- else:
- language = params['language'].split('-')[0]
-
- # if language is not supported, put it in english
- if language != 'en' and\
- language != 'de' and\
- language != 'zh':
- language = 'en'
-
- params['url'] = search_url.format(offset=offset,
- number_of_results=number_of_results,
- query=urlencode({'q': query}),
- language=language,
- categorie=categorie)
-
- params['headers']['Referer'] = url
-
- return params
-
-
-# get response from search-request
-def response(resp):
- # HTTP-Code 429: rate limit exceeded
- if resp.status_code == 429:
- raise Exception("rate limit has been exceeded!")
-
- results = []
-
- search_res = loads(resp.text)
-
- # return empty array if there are no results
- if not search_res.get('results', {}):
- return []
-
- # parse results
- for result in search_res['results']:
- publishedDate = None
- result_json = {'url': result['url'], 'title': result['title'],
- 'content': result['kwic']}
- if result['news']:
- result_json['publishedDate'] = \
- datetime.datetime.fromtimestamp(result['date'] / 1000.0)
-
- # append image result if image url is set
- if result['iurl']:
- result_json['template'] = 'videos.html'
- result_json['thumbnail'] = result['iurl']
-
- results.append(result_json)
-
- # return results
- return results
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 9c837b45b..c9cc75435 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -54,7 +54,7 @@ def request(query, params):
if params['language'] != 'all':
language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
if language:
- params['url'] += '&lr=lang_' + language
+ params['url'] += '&hl=' + language
return params
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 76567396f..953734934 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -99,11 +99,14 @@ def response(resp):
if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
date_pos = content.find('...') + 4
date_string = content[0:date_pos - 5]
- published_date = parser.parse(date_string, dayfirst=True)
-
# fix content string
content = content[date_pos:]
+ try:
+ published_date = parser.parse(date_string, dayfirst=True)
+ except ValueError:
+ pass
+
# check if search result starts with something like: "5 days ago ... "
elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
date_pos = content.find('...') + 4