feat(01-02): port sign algorithms to crawler_core/ platform directories

- Add crawler_core/boss/sign.py: BossSign traceid generator (pure stdlib)
- Add crawler_core/qcwy/sign.py: Job51Sign HMAC-SHA256 signing (pure stdlib)
- Add crawler_core/zhilian/sign.py: ZhilianSign header/param signing (pure stdlib)
- Add __init__.py for all three crawler_core platform directories
- Updated module docstrings to reference crawler_core; all logic unchanged
- No imports from spiderJobs or app; no HTTP dependencies
This commit is contained in:
win 2026-03-21 18:08:53 +08:00
parent 4932177f7c
commit bd1e50e410
3 changed files with 281 additions and 0 deletions

107
crawler_core/boss/sign.py Normal file
View File

@ -0,0 +1,107 @@
"""
Boss Zhipin Traceid generation algorithm (crawler_core).
Translated from miniprogram_npm/trace-id/index.js.
Traceid format: {prefix}{hex_timestamp_13}{random_6}{checksum_3}
Example: M-W0019d0a8af5f32gtVvnD4M
"""
from __future__ import annotations
import random
import time
# base62 字符集(与 JS 端一致)
_CHARS = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
def _to_u32(n: int) -> int:
"""模拟 JS 的 >>> 0无符号 32 位截断)"""
return n & 0xFFFFFFFF
def _compute_checksum(uuid_str: str) -> str:
    """
    Derive the 3-character base62 checksum for a UUID string.

    Port of the JS ``computeChecksum()``. Three rolling hashes, each kept to
    32 unsigned bits, are accumulated over the characters of ``uuid_str``:

    - forward hash:  ``h = h * 31 + code`` (JS: ``(h << 5) - h + code``)
    - weighted hash: ``h = h * 127 + code * (index + 1)``
      (JS: ``(h << 7) - h + ...``), walking the string from last to first
    - centred hash:  ``h = h * 7 + code * (|index - mid| + 1)``
      (JS: ``(h << 3) - h + ...``), where ``mid = len(uuid_str) // 2``

    The hashes are XOR-ed pairwise, each pair is run through a
    multiply / xor-shift mixer, and the result modulo 62 indexes ``_CHARS``.

    Args:
        uuid_str: the string to checksum (this module passes the 19-character
            UUID produced by ``_generate_uuid``).

    Returns:
        A three-character string drawn from ``_CHARS``.

    NOTE(review): the original port remarks that JS applies ``Math.abs`` to
    the XOR result and skips it because the value is treated as unsigned
    here; if the JS accumulators are signed int32 the two can differ —
    confirm against the JS source.
    NOTE(review): the original port quotes the JS mixer as
    ``s = (2654435761 * s >>> 0) ^ (s >>> 16) >>> 0``; read literally that
    shifts the pre-multiplication value, whereas this code shifts the
    post-multiplication value. Behaviour is kept as-is — confirm.
    """
    mask = 0xFFFFFFFF
    codes = [ord(ch) for ch in uuid_str]
    length = len(codes)
    centre = length // 2

    forward = 0
    for code in codes:
        forward = (forward * 31 + code) & mask

    weighted = 0
    for pos in reversed(range(length)):
        weighted = (weighted * 127 + codes[pos] * (pos + 1)) & mask

    centred = 0
    for pos, code in enumerate(codes):
        centred = (centred * 7 + code * (abs(pos - centre) + 1)) & mask

    def _mix(seed: int, first_mul: int, second_mul: int) -> str:
        """Run one multiply / xor-shift round pair and map it to a base62 char."""
        value = (first_mul * seed) & mask
        value ^= value >> 16
        value = (second_mul * value) & mask
        value ^= value >> 13
        return _CHARS[value % 62]

    return "".join(
        (
            _mix(forward ^ weighted, 2654435761, 2246822507),
            _mix(weighted ^ centred, 3266489909, 2654435761),
            _mix(centred ^ forward, 668265261, 2246822507),
        )
    )
def _generate_uuid() -> str:
    """
    Build the 19-character UUID used inside a Traceid.

    Layout: 13 lowercase hex characters of the current millisecond timestamp
    (left-padded with zeros, keeping the last 13 if longer) followed by
    6 characters picked at random from ``_CHARS``.
    """
    millis = int(time.time() * 1000)
    stamp = f"{millis:x}".lower()[-13:].zfill(13)
    suffix = [random.choice(_CHARS) for _ in range(6)]
    return stamp + "".join(suffix)
class BossSign:
    """
    Request-signing helper for Boss Zhipin.

    Its only signing feature is Traceid generation.

    Args:
        mpt: login credential; empty string when not logged in.
        wt2: login credential; empty string when not logged in.
    """

    def __init__(self, *, mpt: str = "", wt2: str = ""):
        # Both credentials are stored as given; nothing in this class reads them.
        self.mpt, self.wt2 = mpt, wt2

    @staticmethod
    def generate_traceid(prefix: str = "M-W") -> str:
        """Return a fresh Traceid laid out as ``{prefix}{uuid}{checksum}``."""
        body = _generate_uuid()
        return "{}{}{}".format(prefix, body, _compute_checksum(body))

88
crawler_core/qcwy/sign.py Normal file
View File

@ -0,0 +1,88 @@
"""
51Job (Qiancheng Wuyou) signing algorithm (crawler_core).
Translated from the mini-program sources utils/cupid.js and server/request/index.js.

Signing logic:
1. Build path = /{endpoint}?api_key=51job&timestamp={ts}
2. GET:  message = path + &param1=val1&param2=val2
   POST: message = path + JSON.stringify(body)
3. sign = HMAC-SHA256(message, SIGN_KEY).hex()
"""
from __future__ import annotations
import hmac
import hashlib
import time
import random
from urllib.parse import quote
# Hard-coded to match the mini-program (server/config/index.js).
SIGN_KEY = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"


class Job51Sign:
    """
    Request signer for 51Job.

    Produces the signed URL path and the ``sign`` value from an endpoint,
    HTTP method and request parameters.

    Args:
        sign_key: HMAC key; defaults to the key built into the mini-program.
    """

    def __init__(self, *, sign_key: str = SIGN_KEY):
        self.sign_key = sign_key

    @staticmethod
    def generate_uuid() -> str:
        """
        Generate an id intended for use as ``distinct_id`` / ``uuid``.

        Returns:
            The current millisecond timestamp (13 digits at present)
            followed by a random 10-digit number, as one decimal string.
        """
        ts = str(int(time.time() * 1000))
        rand = str(random.randint(1000000000, 9999999999))
        return ts + rand

    def build_sign_path(
        self,
        endpoint: str,
        method: str = "GET",
        params: dict | None = None,
        body: dict | None = None,
        *,
        timestamp: int | None = None,
    ) -> tuple[str, str]:
        """
        Build the signed URL path and its signature.

        Args:
            endpoint: API path without the domain and without a leading
                slash, e.g. ``open/noauth/jobs/detail/base/170651439``.
            method: HTTP method, ``GET`` or ``POST`` (case-insensitive).
            params: extra query parameters; only used for GET requests.
                Keys and values are stringified and percent-encoded.
            body: JSON body; only used for POST requests, where its compact
                JSON serialisation is appended to the signed message.
            timestamp: Unix timestamp in seconds to embed in the path.
                Defaults to the current time; pass a value to reproduce a
                signature or to align with a server-provided clock.

        Returns:
            ``(url_path, sign_hex)`` where ``url_path`` is the full path
            including ``api_key``, ``timestamp`` and any GET parameters, and
            ``sign_hex`` is the hex HMAC-SHA256 of the signed message.
        """
        # Kept as a local import: json is not among this module's top-level imports.
        import json

        verb = method.upper()
        ts = int(time.time()) if timestamp is None else timestamp
        path = f"/{endpoint}?api_key=51job&timestamp={ts}"

        if verb == "GET" and params:
            encoded = "&".join(
                f"{quote(str(key), safe='')}={quote(str(value), safe='')}"
                for key, value in params.items()
            )
            path += "&" + encoded

        # The signed message is the path, plus the serialised body for POST.
        message = path
        if verb == "POST" and body is not None:
            # Compact separators and raw non-ASCII are intended to mirror
            # the JS side's JSON.stringify(body).
            message += json.dumps(body, ensure_ascii=False, separators=(",", ":"))

        sign_hex = hmac.new(
            self.sign_key.encode("utf-8"),
            message.encode("utf-8"),
            hashlib.sha256,
        ).hexdigest()
        return path, sign_hex

View File

@ -0,0 +1,86 @@
"""
Zhilian Zhaopin signing algorithm (crawler_core).
Responsibility: parameter construction and signing only; no HTTP requests are made here.
"""
from __future__ import annotations
import math
import random
from typing import Optional
class ZhilianSign:
    """
    Request signer for Zhilian Zhaopin.

    Feature 1: supplies the values needed to sign a request
    (device_id, action_id, at, rt).
    Feature 2: builds either signing headers (cgate interfaces) or signing
    query parameters (capi interfaces).

    Args:
        at: Access Token obtained after login; empty when not logged in.
        rt: Refresh Token obtained after login; empty when not logged in.
        device_id: device id; a UUID is generated when none is given.
        version: mini-program version string.
        channel: channel identifier.
        platform: platform id (12 = WeChat mini-program).
    """

    def __init__(
        self,
        *,
        at: str = "",
        rt: str = "",
        device_id: Optional[str] = None,
        version: str = "4.1.259",
        channel: str = "wxxiaochengxu",
        platform: str = "12",
    ):
        self.at, self.rt = at, rt
        # Any falsy device_id (None or "") is replaced by a generated UUID.
        self.device_id = device_id if device_id else self.generate_uuid()
        self.version, self.channel, self.platform = version, channel, platform

    # -- Algorithm: UUID generation (stated to match the mini-program) --
    @staticmethod
    def generate_uuid() -> str:
        """Return a random 36-character uppercase-hex UUID string (version-4 layout)."""
        hex_digits = "0123456789ABCDEF"
        slots = [hex_digits[math.floor(random.random() * 16)] for _ in range(36)]
        slots[14] = "4"
        # Force the top two bits of this digit to binary 10, giving 8, 9, A or B.
        variant = (int(slots[19], 16) & 0x3) | 0x8
        slots[19] = hex_digits[variant]
        for dash_at in (8, 13, 18, 23):
            slots[dash_at] = "-"
        return "".join(slots)

    # -- cgate signing headers --
    def sign_headers(self, page_code: str = "0") -> dict:
        """Build the signing request headers for cgate interfaces.

        A new action id is generated on every call.
        """
        action_id = self.generate_uuid()
        headers = {
            "x-zp-at": self.at,
            "x-zp-rt": self.rt,
            "x-zp-action-id": action_id,
            "x-zp-page-code": page_code,
            "x-zp-version": self.version,
            "x-zp-channel": self.channel,
            "x-zp-platform": self.platform,
            "x-zp-device-id": self.device_id,
            "x-zp-business-system": "73",
        }
        return headers

    # -- capi signing parameters --
    def sign_params(self) -> dict:
        """Build the signing query parameters for capi interfaces."""
        return dict(
            at=self.at,
            rt=self.rt,
            channel=self.channel,
            platform=self.platform,
            version=self.version,
            d=self.device_id,
        )