win 8c2c2d29d7 feat(03): migrate job51+zhilian to crawler_core (ARCH-04/05)
job51 (spiderJobs/platforms/job51/):
- client.py: HTTPClient+Job51Sign from crawler_core
- api.py: ApiResult→Result, self._http→self.http_client, _request() POST overrides
- main.py: BaseFetcher/BaseSearcher from crawler_core
- sign.py: backward-compatible stub re-exporting crawler_core.qcwy.sign.Job51Sign

zhilian (spiderJobs/platforms/zhilian/):
- client.py: HTTPClient+ZhilianSign from crawler_core
- api.py: add _parse_zhilian_response (HTTP 200=success), add _parse()/_request()
  to all classes (GET fetchers + POST searcher overrides)
- main.py: BaseFetcher/BaseSearcher from crawler_core
- sign.py: backward-compatible stub re-exporting crawler_core.zhilian.sign.ZhilianSign

tests: 34 new mock tests (17 job51 + 17 zhilian)
Full regression: 98 passed (job51:17 + zhilian:17 + boss:22 + crawler_core:41 + 1)
2026-03-21 19:18:22 +08:00

170 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
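51job (Qiancheng Wuyou) HTTP client.

Layers the 51job-specific request signature and default headers on top of the
generic HTTPClient. Unlike Boss/Zhilian, the 51job sign depends on the complete
URL path plus the body, so post/get must build the signature first and only
then assemble the final URL.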
"""
from __future__ import annotations
import json
from typing import Any, Optional
from urllib.parse import quote
from crawler_core.http_client import HTTPClient
from crawler_core.qcwy.sign import Job51Sign
# Root URL every signed 51job endpoint path is appended to.
BASE_URL = "https://cupid.51job.com"

# Default request headers specific to the 51job WeChat mini program.
# The user-agent is assembled from fragments purely to keep lines short;
# each fragment carries its own trailing space, so a plain "" join is used.
JOB51_HEADERS = {
    "user-agent": "".join(
        (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ",
            "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 ",
            "MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI ",
            "MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.7(0x13080712) ",
            "UnifiedPCMacWechat(0xf2641702) XWEB/18788",
        )
    ),
    "xweb_xhr": "1",
    "from-domain": "51job_weixin_wxapp",
    "sec-fetch-site": "cross-site",
    "sec-fetch-mode": "cors",
    "sec-fetch-dest": "empty",
    "referer": "https://servicewechat.com/wx1131e5c71e668b5d/426/page-frame.html",
    "accept-language": "zh-CN,zh;q=0.9",
    "priority": "u=1, i",
}
class Job51Client(HTTPClient):
    """HTTP client for 51job (Qiancheng Wuyou).

    Extends the generic ``HTTPClient`` so that every request is signed through
    the configured ``Job51Sign`` and carries the 51job-specific headers. The
    original notes describe the signature as HMAC-SHA256; the algorithm itself
    lives in ``Job51Sign``, not in this class.

    Args:
        signer: ``Job51Sign`` instance (optional); a default one is created
            when omitted.
        tunnel_proxy: Tunnel proxy address (the exit IP changes on every
            request).
        proxy: Fixed proxy address.
        proxy_pool: List of proxy addresses forming a pool.
        timeout: Request timeout in seconds.
    """

    def __init__(
        self,
        signer: Optional[Job51Sign] = None,
        tunnel_proxy: Optional[str] = None,
        proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        super().__init__(
            base_url=BASE_URL,
            default_headers=JOB51_HEADERS,
            tunnel_proxy=tunnel_proxy,
            proxy=proxy,
            proxy_pool=proxy_pool,
            timeout=timeout,
        )
        self.signer = signer or Job51Sign()
        # One uuid per client instance; it is reused as both the "uuid" header
        # and the "distinct_id" of the "property" header.
        self._uuid = Job51Sign.generate_uuid()

    def _job51_headers(self, sign: str) -> dict:
        """Build the 51job-specific headers sent with every request.

        Args:
            sign: Signature computed for the request being sent.

        Returns:
            A fresh dict holding the sign, the client uuid and the
            URL-quoted, compact-JSON ``property`` header.
        """
        property_obj = {
            "frompageUrl": "",
            "pageUrl": "pages/index/index",
            "isLogin": "",
            "accountid": "",
            "resumeId": "",
            "firstFrompageUrl": "",
            "distinct_id": self._uuid,
        }
        # Compact JSON, then percent-encode everything (safe="").
        property_json = json.dumps(property_obj, ensure_ascii=False, separators=(",", ":"))
        return {
            "sign": sign,
            "partner": "",
            "property": quote(property_json, safe=""),
            "uuid": self._uuid,
            "user-token": "",
            "account-id": "",
        }

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """Send a POST request, computing the signature automatically.

        Note: ``path`` is the endpoint, e.g.
        ``open/noauth/recommend/job-tab-dynamic-wx-mini``; after signing it is
        turned into ``/endpoint?api_key=51job&timestamp=xxx``.

        Important: the body must be sent as compact JSON (no whitespace),
        exactly matching the string that was signed. The ``json=`` parameter
        of requests must not be used because its default serialization
        inserts spaces.

        Args:
            path: Endpoint to call.
            body: JSON-serializable request payload.
            headers: Extra headers; they override the generated ones.

        Returns:
            ``(status_code, parsed_json_response)``.
        """
        url_path, sign = self.signer.build_sign_path(path, "POST", body=body)
        job51_h = self._job51_headers(sign)
        job51_h["Content-Type"] = "application/json"
        if headers:
            job51_h.update(headers)
        # Compact JSON is mandatory so the wire bytes agree with the signature;
        # the pre-serialized body is sent through _post_raw.
        raw_body = json.dumps(body, ensure_ascii=False, separators=(",", ":"))
        return self._post_raw(url_path, raw_body, job51_h)

    def _post_raw(self, path: str, raw_body: str, headers: dict) -> tuple[int, Any]:
        """Send a pre-serialized POST body (``data=`` rather than ``json=``).

        Args:
            path: Signed path (including query string) appended to the base URL.
            raw_body: Already serialized request body.
            headers: Per-request headers, merged via ``_merge_headers``.

        Returns:
            ``(status_code, parsed_json_response)``.
        """
        merged_headers = self._merge_headers(headers)
        url = f"{self.base_url}{path}"
        # Encode once; both transport branches send the same bytes.
        payload = raw_body.encode("utf-8")

        if self._tunnel_proxy:
            # Tunnel mode: a dedicated session per request, always closed.
            session = self._new_session()
            try:
                resp = session.post(
                    url,
                    data=payload,
                    headers=merged_headers,
                    proxies={"http": self._tunnel_proxy, "https": self._tunnel_proxy},
                    timeout=self.timeout,
                )
                return resp.status_code, resp.json()
            finally:
                session.close()

        proxies = self._get_proxies()
        kwargs: dict[str, Any] = {
            "data": payload,
            "headers": merged_headers,
            "timeout": self.timeout,
        }
        if proxies:
            kwargs["proxies"] = proxies
        resp = self._session.post(url, **kwargs)
        return resp.status_code, resp.json()

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """Send a GET request, computing the signature automatically.

        Note: ``params`` are encoded into the query string of the signed path.

        Args:
            path: Endpoint to call.
            params: Query parameters to include in the signed path.
            headers: Extra headers; they override the generated ones.

        Returns:
            ``(status_code, parsed_response)`` as returned by the parent
            ``HTTPClient.get``.
        """
        url_path, sign = self.signer.build_sign_path(path, "GET", params=params)
        job51_h = self._job51_headers(sign)
        job51_h["content-type"] = "application/x-www-form-urlencoded"
        if headers:
            job51_h.update(headers)
        # The GET parameters are already part of url_path, so none are passed on.
        return super().get(url_path, params=None, headers=job51_h)
def create_client(
    signer: Optional[Job51Sign] = None,
    tunnel_proxy: Optional[str] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
    timeout: int = 10,
) -> Job51Client:
    """Create a 51job client.

    Args:
        signer: ``Job51Sign`` instance (optional).
        tunnel_proxy: Tunnel proxy address.
        proxy: Fixed proxy address.
        proxy_pool: List of proxy addresses forming a pool.
        timeout: Request timeout in seconds; the default matches
            ``Job51Client``'s own default of 10.

    Returns:
        A ``Job51Client`` built from the given arguments.
    """
    return Job51Client(
        signer=signer,
        tunnel_proxy=tunnel_proxy,
        proxy=proxy,
        proxy_pool=proxy_pool,
        timeout=timeout,
    )