job51 (spiderJobs/platforms/job51/): - client.py: HTTPClient+Job51Sign from crawler_core - api.py: ApiResult→Result, self._http→self.http_client, _request() POST overrides - main.py: BaseFetcher/BaseSearcher from crawler_core - sign.py: backward-compatible stub re-exporting crawler_core.qcwy.sign.Job51Sign zhilian (spiderJobs/platforms/zhilian/): - client.py: HTTPClient+ZhilianSign from crawler_core - api.py: add _parse_zhilian_response (HTTP 200=success), add _parse()/_request() to all classes (GET fetchers + POST searcher overrides) - main.py: BaseFetcher/BaseSearcher from crawler_core - sign.py: backward-compatible stub re-exporting crawler_core.zhilian.sign.ZhilianSign tests: 34 new mock tests (17 job51 + 17 zhilian) Full regression: 98 passed (job51:17 + zhilian:17 + boss:22 + crawler_core:41 + 1)
170 lines
5.8 KiB
Python
170 lines
5.8 KiB
Python
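"""
HTTP client for 51job (前程无忧).

Layers the 51job-specific request signature and default headers on top of
the generic HTTPClient.

Unlike Boss/Zhilian, the 51job sign depends on the complete URL path plus the
request body, so the post/get methods build the signature first and only then
assemble the final URL.
"""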
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
from typing import Any, Optional
|
||
from urllib.parse import quote
|
||
|
||
from crawler_core.http_client import HTTPClient
|
||
from crawler_core.qcwy.sign import Job51Sign
|
||
|
||
BASE_URL = "https://cupid.51job.com"

# Default request headers specific to the 51job mini program.
# Key order is kept exactly as originally declared.
JOB51_HEADERS = {
    # The user-agent is assembled from its space-separated product tokens.
    "user-agent": " ".join(
        (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
            "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
            "MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI",
            "MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.7(0x13080712)",
            "UnifiedPCMacWechat(0xf2641702) XWEB/18788",
        )
    ),
    "xweb_xhr": "1",
    "from-domain": "51job_weixin_wxapp",
    "sec-fetch-site": "cross-site",
    "sec-fetch-mode": "cors",
    "sec-fetch-dest": "empty",
    "referer": "https://servicewechat.com/wx1131e5c71e668b5d/426/page-frame.html",
    "accept-language": "zh-CN,zh;q=0.9",
    "priority": "u=1, i",
}
|
||
|
||
|
||
class Job51Client(HTTPClient):
    """
    HTTP client for 51job (前程无忧).

    Extends the generic HTTPClient; every request sent through ``post`` or
    ``get`` is signed via the configured signer (described in the original
    notes as an HMAC-SHA256 signature) and carries the 51job-specific headers.

    Args:
        signer: Job51Sign instance (optional; a default one is created when omitted)
        tunnel_proxy: tunnel proxy address (per the original notes, the IP
            changes automatically on every request)
        proxy: fixed proxy address
        proxy_pool: list of proxy addresses
        timeout: request timeout in seconds
    """

    def __init__(
        self,
        signer: Optional[Job51Sign] = None,
        tunnel_proxy: Optional[str] = None,
        proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        super().__init__(
            base_url=BASE_URL,
            # Hand the base class its own copy so that nothing it does to the
            # headers can leak into the module-level template shared by all
            # clients.
            default_headers=dict(JOB51_HEADERS),
            tunnel_proxy=tunnel_proxy,
            proxy=proxy,
            proxy_pool=proxy_pool,
            timeout=timeout,
        )
        # Only fall back to a default signer when none was supplied; a supplied
        # signer is always honoured, even if it happens to evaluate as falsy.
        self.signer = signer if signer is not None else Job51Sign()
        # One identifier per client instance, reused as both the "uuid" header
        # and the "distinct_id" field of the "property" header.
        self._uuid = Job51Sign.generate_uuid()

    def _job51_headers(self, sign: str) -> dict:
        """Build the 51job-specific headers attached to every request.

        Args:
            sign: signature string for the request being sent.

        Returns:
            A new dict with the sign, uuid and the URL-quoted compact-JSON
            "property" header; callers may mutate it freely.
        """
        property_obj = {
            "frompageUrl": "",
            "pageUrl": "pages/index/index",
            "isLogin": "否",
            "accountid": "",
            "resumeId": "",
            "firstFrompageUrl": "",
            "distinct_id": self._uuid,
        }
        # Compact JSON (no spaces), then percent-encode every reserved character.
        property_json = json.dumps(property_obj, ensure_ascii=False, separators=(",", ":"))
        return {
            "sign": sign,
            "partner": "",
            "property": quote(property_json, safe=""),
            "uuid": self._uuid,
            "user-token": "",
            "account-id": "",
        }

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """
        Send a POST request with an automatically computed signature.

        Note: ``path`` is the endpoint (e.g.
        open/noauth/recommend/job-tab-dynamic-wx-mini); after signing it is
        expanded to /endpoint?api_key=51job&timestamp=xxx.

        Important: the body must be sent as compact JSON (no spaces) so that it
        matches the signed string exactly. The ``json=`` parameter of requests
        must not be used, because it serializes with the default separators
        (which include spaces).

        Args:
            path: endpoint to call.
            body: JSON-serializable request payload.
            headers: optional extra headers; they override the generated ones.

        Returns:
            ``(status_code, parsed_json_body)``.
        """
        url_path, sign = self.signer.build_sign_path(path, "POST", body=body)

        job51_h = self._job51_headers(sign)
        job51_h["Content-Type"] = "application/json"
        if headers:
            job51_h.update(headers)

        # Must be compact JSON (consistent with the signature); it is sent
        # pre-serialized through _post_raw.
        raw_body = json.dumps(body, ensure_ascii=False, separators=(",", ":"))
        return self._post_raw(url_path, raw_body, job51_h)

    def _post_raw(self, path: str, raw_body: str, headers: dict) -> tuple[int, Any]:
        """Send a pre-serialized POST request (``data=`` rather than ``json=``).

        Args:
            path: signed path (including query string) appended to ``base_url``.
            raw_body: request body, already serialized to text.
            headers: per-request headers, merged via ``_merge_headers``.

        Returns:
            ``(status_code, parsed_json_body)``.
        """
        merged_headers = self._merge_headers(headers)
        url = f"{self.base_url}{path}"
        # Encode once; both transport paths below send the same bytes.
        payload = raw_body.encode("utf-8")

        if self._tunnel_proxy:
            # Tunnel mode: use a fresh session for this single request and
            # always close it afterwards.
            s = self._new_session()
            try:
                resp = s.post(
                    url,
                    data=payload,
                    headers=merged_headers,
                    proxies={"http": self._tunnel_proxy, "https": self._tunnel_proxy},
                    timeout=self.timeout,
                )
                return resp.status_code, resp.json()
            finally:
                s.close()

        proxies = self._get_proxies()
        kwargs: dict[str, Any] = {
            "data": payload,
            "headers": merged_headers,
            "timeout": self.timeout,
        }
        # Only pass "proxies" when there is something to pass.
        if proxies:
            kwargs["proxies"] = proxies
        resp = self._session.post(url, **kwargs)
        return resp.status_code, resp.json()

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """
        Send a GET request with an automatically computed signature.

        Note: ``params`` are encoded into the query string of the signed path.

        Args:
            path: endpoint to call.
            params: optional query parameters.
            headers: optional extra headers; they override the generated ones.

        Returns:
            ``(status_code, response_body)`` as produced by the base-class ``get``.
        """
        url_path, sign = self.signer.build_sign_path(path, "GET", params=params)

        job51_h = self._job51_headers(sign)
        job51_h["content-type"] = "application/x-www-form-urlencoded"
        if headers:
            job51_h.update(headers)

        # The GET parameters are already part of url_path, so they are not
        # passed again as params.
        return super().get(url_path, params=None, headers=job51_h)
|
||
|
||
|
||
def create_client(
    signer: Optional[Job51Sign] = None,
    tunnel_proxy: Optional[str] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
    timeout: int = 10,
) -> Job51Client:
    """Create a 51job client.

    Thin factory around ``Job51Client``; every argument is forwarded unchanged.

    Args:
        signer: Job51Sign instance (optional).
        tunnel_proxy: tunnel proxy address.
        proxy: fixed proxy address.
        proxy_pool: list of proxy addresses.
        timeout: request timeout in seconds; the default of 10 mirrors the
            ``Job51Client`` constructor, so existing callers are unaffected.

    Returns:
        A configured ``Job51Client``.
    """
    return Job51Client(
        signer=signer,
        tunnel_proxy=tunnel_proxy,
        proxy=proxy,
        proxy_pool=proxy_pool,
        timeout=timeout,
    )
|