- client.py: inherit crawler_core.http_client.HTTPClient and use crawler_core.boss.sign.BossSign
- api.py: use crawler_core.base.Result/BaseFetcher/BaseSearcher; fix self._http -> self.http_client
- main.py: import BaseFetcher/BaseSearcher and BossSign from crawler_core
- sign.py: replace with a backward-compat stub re-exporting BossSign from crawler_core

Satisfies ARCH-03
123 lines
3.8 KiB
Python
123 lines
3.8 KiB
Python
"""
|
||
Boss Zhipin HTTP client.

Layers Boss-specific headers and Traceid injection on top of the generic HTTPClient.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from typing import Any, Optional
|
||
|
||
from crawler_core.http_client import HTTPClient
|
||
from crawler_core.boss.sign import BossSign
|
||
|
||
# Site root used as the client's base URL.
BASE_URL = "https://www.zhipin.com"

# Default request headers specific to the Boss mini program.
BOSS_HEADERS = {
    "content-type": "application/x-www-form-urlencoded",
    # The user-agent is assembled from its space-separated product tokens.
    "user-agent": " ".join(
        (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
            "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
            "MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI",
            "MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.7(0x13080712)",
            "UnifiedPCMacWechat(0xf2641702) XWEB/18788",
        )
    ),
    "x-requested-with": "XMLHttpRequest",
    "xweb_xhr": "1",
    "zp_app_id": "10002",
    "zp_product_id": "10002",
    "ver": "14.0400",
    "mini_ver": "14.0400",
    "platform": "zhipin/mac",
    "ua": '{"model":"Mac16,8","platform":"mac"}',
    "scene": "1256",
    "sec-fetch-site": "cross-site",
    "sec-fetch-mode": "cors",
    "sec-fetch-dest": "empty",
    "referer": "https://servicewechat.com/wxa8da525af05281f3/601/page-frame.html",
    "accept-language": "zh-CN,zh;q=0.9",
}
|
||
|
||
|
||
class BossClient(HTTPClient):
    """HTTP client for Boss Zhipin.

    Extends the generic ``HTTPClient``; every ``post`` / ``get`` call adds the
    signer's ``mpt`` and ``wt2`` values plus a freshly generated ``Traceid``.

    Args:
        signer: ``BossSign`` instance (optional). A default ``BossSign()`` is
            created when omitted.
        tunnel_proxy: Tunnel proxy address (switches IP on every request).
        proxy: Fixed proxy address.
        proxy_pool: List of proxy addresses.
        timeout: Request timeout in seconds.
    """

    def __init__(
        self,
        signer: Optional[BossSign] = None,
        tunnel_proxy: Optional[str] = None,
        proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        super().__init__(
            base_url=BASE_URL,
            # Pass a shallow copy so the module-level BOSS_HEADERS dict is never
            # aliased by a client instance (the base class is not visible here,
            # so it may or may not copy on its own).
            default_headers=dict(BOSS_HEADERS),
            tunnel_proxy=tunnel_proxy,
            proxy=proxy,
            proxy_pool=proxy_pool,
            timeout=timeout,
        )
        # Explicit None check: a caller-supplied signer is kept even if it
        # happens to evaluate as falsy.
        self.signer = signer if signer is not None else BossSign()

    def _boss_headers(self) -> dict:
        """Build the Boss headers that must be refreshed on every request."""
        return {
            "mpt": self.signer.mpt,
            "wt2": self.signer.wt2,
            "Traceid": BossSign.generate_traceid("M-W"),
        }

    def _merged_headers(self, headers: Optional[dict]) -> dict:
        """Return the per-request Boss headers with caller ``headers`` laid on top."""
        merged = self._boss_headers()
        if headers:
            # Caller-supplied values win over the generated ones.
            merged.update(headers)
        return merged

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """POST request with the Boss headers injected automatically."""
        return super().post(path, body, self._merged_headers(headers))

    def get(
        self,
        path: str,
        params: Optional[dict] = None,
        headers: Optional[dict] = None,
    ) -> tuple[int, Any]:
        """GET request with the Boss headers injected automatically."""
        return super().get(path, params, self._merged_headers(headers))

    def batch(self, sub_reqs: list[dict]) -> tuple[int, Any]:
        """Send a batch request to ``/wapi/batch/requests``.

        Args:
            sub_reqs: List of sub-requests, each shaped like
                ``{"path": "/wapi/...", "method": "GET", "query": "key=val&..."}``.

        Returns:
            ``(http_code, response_json)``
        """
        body = {"subReqs": sub_reqs, "appId": 10002}
        # NOTE(review): a JSON content-type is sent for this call; presumably
        # HTTPClient.post serializes the body accordingly - confirm.
        return self.post(
            "/wapi/batch/requests",
            body,
            headers={"content-type": "application/json"},
        )
|
||
|
||
|
||
def create_client(
    signer: Optional[BossSign] = None,
    tunnel_proxy: Optional[str] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
    timeout: int = 10,
) -> BossClient:
    """Create a Boss client.

    Convenience factory that forwards every argument to ``BossClient``.

    Args:
        signer: ``BossSign`` instance (optional).
        tunnel_proxy: Tunnel proxy address.
        proxy: Fixed proxy address.
        proxy_pool: List of proxy addresses.
        timeout: Request timeout in seconds; defaults to 10, the same default
            ``BossClient`` uses, so existing callers are unaffected.

    Returns:
        A new ``BossClient``.
    """
    return BossClient(
        signer=signer,
        tunnel_proxy=tunnel_proxy,
        proxy=proxy,
        proxy_pool=proxy_pool,
        timeout=timeout,
    )
|