summaryrefslogtreecommitdiff
path: root/searx
diff options
context:
space:
mode:
Diffstat (limited to 'searx')
-rw-r--r--searx/engines/quark.py383
-rw-r--r--searx/settings.yml14
2 files changed, 397 insertions, 0 deletions
diff --git a/searx/engines/quark.py b/searx/engines/quark.py
new file mode 100644
index 000000000..957ef0d97
--- /dev/null
+++ b/searx/engines/quark.py
@@ -0,0 +1,383 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Quark (Shenma) search engine for searxng"""
+
+from urllib.parse import urlencode
+from datetime import datetime
+import re
+import json
+
+from searx.utils import html_to_text, gen_useragent
+from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
+
# Engine metadata
about = dict(
    website="https://m.quark.cn/",
    wikidata_id="Q48816502",
    use_official_api=False,
    require_api_key=False,
    results="HTML",
    language="zh",
)

# Engine configuration
categories = []
paging = True
results_per_page = 10

# Selects which Quark backend is queried; init() only accepts 'general' or 'images'.
quark_category = 'general'

# Time range filtering: request() sends the mapped value as "tl_request"
# (general category only).
time_range_support = True
time_range_dict = dict(day='4', week='3', month='2', year='1')
+
CAPTCHA_PATTERN = r'\{[^{]*?"action"\s*:\s*"captcha"\s*,\s*"url"\s*:\s*"([^"]+)"[^{]*?\}'


def is_alibaba_captcha(html):
    """Tell whether *html* carries an Alibaba X5SEC CAPTCHA challenge.

    The check looks for a JSON-like object holding ``"action": "captcha"``
    followed by a non-empty ``"url"`` value.

    Quark may return such a challenge after 9 requests in a short period;
    the ban typically lasts around 15 minutes.
    """
    captcha_match = re.search(CAPTCHA_PATTERN, html)
    return captcha_match is not None
+
+
def init(_):
    """Validate the configured ``quark_category``; the argument is ignored.

    Raises:
        SearxEngineAPIException: if ``quark_category`` is neither
            ``'general'`` nor ``'images'``.
    """
    supported_categories = ('general', 'images')
    if quark_category in supported_categories:
        return
    raise SearxEngineAPIException(f"Unsupported category: {quark_category}")
+
+
def request(query, params):
    """Fill *params* with the URL and headers of a Quark search request.

    The endpoint and query arguments depend on ``quark_category``.  For the
    ``'general'`` category a ``tl_request`` argument is added when
    ``params['time_range']`` has an entry in ``time_range_dict``.

    Returns:
        The same *params* dict, with ``"url"`` and ``"headers"`` set.
    """
    page = params["pageno"]

    arguments = {
        'general': {
            "q": query,
            "layout": "html",
            "page": page,
        },
        'images': {
            "query": query,
            "limit": results_per_page,
            "start": (page - 1) * results_per_page,
        },
    }
    endpoints = {
        'general': 'https://m.quark.cn/s',
        'images': 'https://vt.sm.cn/api/pic/list',
    }

    args = arguments[quark_category]
    endpoint = endpoints[quark_category]

    # Time filtering is only sent for the general search.
    time_limit = time_range_dict.get(params['time_range'])
    if time_limit and quark_category == 'general':
        args["tl_request"] = time_limit

    params["url"] = endpoint + "?" + urlencode(args)
    params["headers"] = {"User-Agent": gen_useragent()}
    return params
+
+
def response(resp):
    """Turn a Quark HTTP response into a list of result dicts.

    Args:
        resp: response object; only its ``text`` attribute is read.

    Returns:
        A list of result dicts.  For the ``'images'`` category the body is
        parsed as JSON; for ``'general'`` the embedded hydrate ``<script>``
        JSON blocks are parsed.  Any other category yields an empty list.

    Raises:
        SearxEngineCaptchaException: when the body contains an Alibaba
            CAPTCHA challenge (the engine is suspended for 900 seconds).
    """
    text = resp.text

    if is_alibaba_captcha(text):
        raise SearxEngineCaptchaException(
            suspended_time=900, message="Alibaba CAPTCHA detected. Please try again later."
        )

    if quark_category == 'images':
        return _parse_images_response(text)
    if quark_category == 'general':
        return _parse_general_response(text)
    return []


def _dig(mapping, *keys):
    """Follow *keys* through nested dicts; return None once a level is missing or not a dict."""
    node = mapping
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def _parse_images_response(text):
    """Build image results from the JSON body returned by the image endpoint."""
    results = []
    # ``or []`` also covers an explicit JSON null anywhere along the path.
    for item in _dig(json.loads(text), 'data', 'hit', 'imgInfo', 'item') or []:
        try:
            published_date = datetime.fromtimestamp(int(item.get("publish_time")))
        except (ValueError, TypeError, OverflowError, OSError):
            # Missing, non-numeric or out-of-range timestamps carry no usable date.
            published_date = None

        # Items without dimensions must not abort the whole result page.
        width, height = item.get("width"), item.get("height")
        resolution = f"{width} x {height}" if width is not None and height is not None else None

        results.append(
            {
                "template": "images.html",
                "url": item.get("imgUrl"),
                "thumbnail_src": item.get("img"),
                "img_src": item.get("bigPicUrl"),
                "title": item.get("title"),
                "source": item.get("site"),
                "resolution": resolution,
                "publishedDate": published_date,
            }
        )
    return results


def _parse_general_response(text):
    """Build web results from the hydrate JSON blocks embedded in the HTML page."""
    # Quark returns a variety of different sc values on a single page, depending on the query type.
    source_category_parsers = {
        'addition': parse_addition,
        'ai_page': parse_ai_page,
        'baike_sc': parse_baike_sc,
        'finance_shuidi': parse_finance_shuidi,
        'kk_yidian_all': parse_kk_yidian_all,
        'life_show_general_image': parse_life_show_general_image,
        'med_struct': parse_med_struct,
        'music_new_song': parse_music_new_song,
        'nature_result': parse_nature_result,
        'news_uchq': parse_news_uchq,
        'ss_note': parse_ss_note,
        # ss_kv, ss_pic, ss_text, ss_video, baike, structure_web_novel use the same struct as ss_doc
        'ss_doc': parse_ss_doc,
        'ss_kv': parse_ss_doc,
        'ss_pic': parse_ss_doc,
        'ss_text': parse_ss_doc,
        'ss_video': parse_ss_doc,
        'baike': parse_ss_doc,
        'structure_web_novel': parse_ss_doc,
        'travel_dest_overview': parse_travel_dest_overview,
        'travel_ranking_list': parse_travel_ranking_list,
    }

    results = []
    pattern = r'<script\s+type="application/json"\s+id="s-data-[^"]+"\s+data-used-by="hydrate">(.*?)</script>'

    for match in re.findall(pattern, text, re.DOTALL):
        data = json.loads(match)
        initial_data = _dig(data, 'data', 'initialData') or {}
        source_category = _dig(data, 'extraData', 'sc')

        # Blocks with an unknown (or non-string) sc value are ignored.
        parser = source_category_parsers.get(source_category) if isinstance(source_category, str) else None
        if not parser:
            continue

        parsed_results = parser(initial_data)
        if isinstance(parsed_results, list):
            # Parsers for list-like blocks return several results at once.
            results.extend(parsed_results)
        else:
            results.append(parsed_results)

    return results
+
+
def parse_addition(data):
    """Build one result from an ``addition`` block.

    Title and content come from the ``content`` field of ``title`` and
    ``summary``; the link comes from ``source.url``.
    """
    title_node = data.get('title', {})
    source_node = data.get('source', {})
    summary_node = data.get('summary', {})

    title = html_to_text(title_node.get('content'))
    url = source_node.get('url')
    content = html_to_text(summary_node.get('content'))
    return {"title": title, "url": url, "content": content}
+
+
def parse_ai_page(data):
    """Build one result per entry of the ``list`` field of an ``ai_page`` block.

    An entry's ``content`` may be a list (joined with ``" | "``) or a single
    value; a missing ``content`` yields empty text rather than the literal
    string ``"None"``.  ``source.time`` is read as a Unix timestamp; when it
    is missing or unusable ``publishedDate`` is ``None``.

    Returns:
        A list of result dicts (possibly empty).
    """
    results = []
    for item in data.get('list') or []:
        raw_content = item.get('content')
        if isinstance(raw_content, list):
            content = " | ".join(map(str, raw_content))
        elif raw_content is None:
            content = ""
        else:
            content = str(raw_content)

        try:
            published_date = datetime.fromtimestamp(int((item.get('source') or {}).get('time')))
        except (ValueError, TypeError, OverflowError, OSError):
            # Missing, non-numeric or out-of-range timestamps carry no usable date.
            published_date = None

        results.append(
            {
                "title": html_to_text(item.get('title')),
                "url": item.get('url'),
                "content": html_to_text(content),
                "publishedDate": published_date,
            }
        )
    return results
+
+
def parse_baike_sc(data):
    """Build one result from a ``baike_sc`` block.

    All fields are read from the nested ``data`` object.  The thumbnail URL
    is upgraded from ``http://`` to ``https://``; when no image is present
    the thumbnail is ``None`` instead of raising ``AttributeError``.
    """
    entry = data.get('data') or {}
    image = entry.get('img')
    return {
        "title": html_to_text(entry.get('title')),
        "url": entry.get('url'),
        "content": html_to_text(entry.get('abstract')),
        "thumbnail": image.replace("http://", "https://") if isinstance(image, str) else None,
    }
+
+
def parse_finance_shuidi(data):
    """Build one result from a ``finance_shuidi`` block.

    The content is the ``" | "``-joined list of the company detail fields
    that are present and non-empty.
    """
    detail_keys = (
        'establish_time',
        'company_status',
        'controled_type',
        'company_type',
        'capital',
        'address',
        'business_scope',
    )
    details = [data.get(key) for key in detail_keys]
    # filter(None, ...) drops missing and empty values.
    content = " | ".join(filter(None, details))

    title = html_to_text(data.get('company_name'))
    url = data.get('title_url')
    return {"title": title, "url": url, "content": html_to_text(content)}
+
+
def parse_kk_yidian_all(data):
    """Build one result from a ``kk_yidian_all`` block.

    The content is the space-joined ``dot_text`` of every entry found in the
    two-level ``list_container`` structure.
    """
    dot_texts = [
        entry['dot_text']
        for section in data.get('list_container', [])
        for entry in section.get('list_container', [])
        if 'dot_text' in entry
    ]

    title = html_to_text(data.get('title'))
    url = data.get('title_url')
    content = html_to_text(' '.join(dot_texts))
    return {"title": title, "url": url, "content": content}
+
+
def parse_life_show_general_image(data):
    """Build one image result per entry of the ``image`` field.

    ``publish_time`` is read as a Unix timestamp; when it is missing or
    unusable ``publishedDate`` is ``None``.  ``resolution`` is
    ``"<width> x <height>"`` when both dimensions are present and ``None``
    otherwise, so sparse items no longer raise ``KeyError``.

    Returns:
        A list of result dicts using the ``images.html`` template (possibly
        empty).
    """
    results = []
    for item in data.get('image') or []:
        try:
            published_date = datetime.fromtimestamp(int(item.get("publish_time")))
        except (ValueError, TypeError, OverflowError, OSError):
            # Missing, non-numeric or out-of-range timestamps carry no usable date.
            published_date = None

        width, height = item.get("width"), item.get("height")
        resolution = f"{width} x {height}" if width is not None and height is not None else None

        results.append(
            {
                "template": "images.html",
                "url": item.get("imgUrl"),
                "thumbnail_src": item.get("img"),
                "img_src": item.get("bigPicUrl"),
                "title": item.get("title"),
                "source": item.get("site"),
                "resolution": resolution,
                "publishedDate": published_date,
            }
        )
    return results
+
+
def parse_med_struct(data):
    """Build one result from a ``med_struct`` block.

    The link is ``message.statistics.nu`` and the text is
    ``message.content_text``.  The thumbnail (``message.video_img``) is
    upgraded to ``https://``; when it is absent the thumbnail is ``None``
    instead of raising ``AttributeError``.
    """
    message = data.get('message') or {}
    video_image = message.get('video_img')
    return {
        "title": html_to_text(data.get('title')),
        "url": (message.get('statistics') or {}).get('nu'),
        "content": html_to_text(message.get('content_text')),
        "thumbnail": video_image.replace("http://", "https://") if isinstance(video_image, str) else None,
    }
+
+
def parse_music_new_song(data):
    """Build one result per entry of the ``hit3`` field of a ``music_new_song`` block.

    The title is ``"<song_name> | <song_singer>"``; a missing part is left
    out instead of raising ``KeyError``.  The cover image URL is upgraded to
    ``https://`` and is ``None`` when absent.

    Returns:
        A list of result dicts (possibly empty).
    """
    results = []
    for item in data.get('hit3') or []:
        title_parts = [str(part) for part in (item.get('song_name'), item.get('song_singer')) if part]
        image = item.get("image_url")
        results.append(
            {
                "title": " | ".join(title_parts),
                "url": item.get("play_url"),
                "content": html_to_text(item.get("lyrics")),
                "thumbnail": image.replace("http://", "https://") if isinstance(image, str) else None,
            }
        )
    return results
+
+
def parse_nature_result(data):
    """Build one result from a ``nature_result`` block (``title``, ``url``, ``desc``)."""
    title = html_to_text(data.get('title'))
    url = data.get('url')
    content = html_to_text(data.get('desc'))
    return {
        "title": title,
        "url": url,
        "content": content,
    }
+
+
def parse_news_uchq(data):
    """Build one result per entry of the ``feed`` field of a ``news_uchq`` block.

    ``time`` is parsed as ``YYYY-MM-DD``; any other value yields a
    ``publishedDate`` of ``None``.  The image URL is upgraded to
    ``https://`` and is ``None`` when the entry has no image, instead of
    raising ``AttributeError``.

    Returns:
        A list of result dicts (possibly empty).
    """
    results = []
    for item in data.get('feed') or []:
        try:
            published_date = datetime.strptime(item.get('time'), "%Y-%m-%d")
        except (ValueError, TypeError):
            # Sometimes Quark returns a non-standard format such as "1天前" (one day ago).
            published_date = None

        image = item.get('image')
        results.append(
            {
                "title": html_to_text(item.get('title')),
                "url": item.get('url'),
                "content": html_to_text(item.get('summary')),
                "thumbnail": image.replace("http://", "https://") if isinstance(image, str) else None,
                "publishedDate": published_date,
            }
        )
    return results
+
+
def parse_ss_doc(data):
    """Build one result from an ``ss_doc``-shaped block.

    The same parser also handles the ``ss_kv`` variants, whose title, link
    and text live under different keys; each field falls back through the
    known alternatives in order.

    ``sourceProps.time`` is read as a Unix timestamp; ``0``, missing or
    unusable values yield a ``publishedDate`` of ``None``.  The thumbnail is
    the ``src`` of the first ``picListProps`` entry, upgraded to
    ``https://``, or ``None`` when there is no usable entry.
    """
    source_props = data.get('sourceProps') or {}

    published_date = None
    try:
        timestamp = int(source_props.get('time'))
        # Sometimes Quark returns 0; treat it as "no date".
        if timestamp != 0:
            published_date = datetime.fromtimestamp(timestamp)
    except (ValueError, TypeError, OverflowError, OSError):
        published_date = None

    # A missing list, an empty list, a non-dict entry or a missing "src" all mean "no thumbnail".
    thumbnail = None
    pictures = data.get('picListProps')
    if isinstance(pictures, list) and pictures and isinstance(pictures[0], dict):
        src = pictures[0].get('src')
        if isinstance(src, str):
            thumbnail = src.replace("http://", "https://")

    return {
        "title": html_to_text(
            (data.get('titleProps') or {}).get('content')
            # ss_kv variant 1 & 2
            or data.get('title')
        ),
        "url": source_props.get('dest_url')
        # ss_kv variant 1
        or data.get('normal_url')
        # ss_kv variant 2
        or data.get('url'),
        "content": html_to_text(
            (data.get('summaryProps') or {}).get('content')
            # ss_doc variant 1
            or (data.get('message') or {}).get('replyContent')
            # ss_kv variant 1
            or data.get('show_body')
            # ss_kv variant 2
            or data.get('desc')
        ),
        "publishedDate": published_date,
        "thumbnail": thumbnail,
    }
+
+
def parse_ss_note(data):
    """Build one result from an ``ss_note`` block.

    ``source.time`` is read as a Unix timestamp; missing, non-numeric or
    out-of-range values yield a ``publishedDate`` of ``None``.  ``title``,
    ``source`` and ``summary`` are treated as empty when they are absent or
    null.
    """
    source = data.get('source') or {}

    try:
        published_date = datetime.fromtimestamp(int(source.get('time')))
    except (ValueError, TypeError, OverflowError, OSError):
        published_date = None

    return {
        "title": html_to_text((data.get('title') or {}).get('content')),
        "url": source.get('dest_url'),
        "content": html_to_text((data.get('summary') or {}).get('content')),
        "publishedDate": published_date,
    }
+
+
def parse_travel_dest_overview(data):
    """Build one result from a ``travel_dest_overview`` block.

    Every field is read from the nested ``strong`` object.
    """
    strong = data.get('strong', {})

    title = html_to_text(strong.get('title'))
    url = strong.get('baike_url')
    content = html_to_text(strong.get('baike_text'))
    return {"title": title, "url": url, "content": content}
+
+
def parse_travel_ranking_list(data):
    """Build one result from a ``travel_ranking_list`` block.

    Every field is read from the nested ``title`` object.
    """
    heading = data.get('title', {})

    title = html_to_text(heading.get('text'))
    url = heading.get('url')
    content = html_to_text(heading.get('title_tag'))
    return {"title": title, "url": url, "content": content}
diff --git a/searx/settings.yml b/searx/settings.yml
index 0ce50f25f..55893f8a1 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1661,6 +1661,20 @@ engines:
shortcut: pypi
engine: pypi
+ - name: quark
+ quark_category: general
+ categories: [general]
+ engine: quark
+ shortcut: qk
+ disabled: true
+
+ - name: quark images
+ quark_category: images
+ categories: [images]
+ engine: quark
+ shortcut: qki
+ disabled: true
+
- name: qwant
qwant_categ: web
engine: qwant