# Source file: crawler_core/boss/sign.py
# (from patch "feat(01-02): port sign algorithms to crawler_core/ platform directories")
"""
Boss Zhipin Traceid generation algorithm (crawler_core).

Translated from miniprogram_npm/trace-id/index.js.

Traceid format: {prefix}{hex_timestamp_13}{random_6}{checksum_3}
Example: M-W0019d0a8af5f32gtVvnD4M
"""

import random
import time


# Base62 alphabet (same ordering as the JS side).
_CHARS = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

_U32_MASK = 0xFFFFFFFF


def _to_u32(n: int) -> int:
    """Truncate *n* to an unsigned 32-bit value (emulates JS ``n >>> 0``)."""
    return n & _U32_MASK


def _mix(value: int, first_multiplier: int, second_multiplier: int) -> int:
    """Scramble a u32 with two multiply / xor-shift rounds (shifts 16 and 13).

    Each round multiplies modulo 2**32 and then xors the result with itself
    shifted right. *value* must already be in the range [0, 2**32).

    NOTE(review): the JS expression quoted by the original port was
    ``s = (K * s >>> 0) ^ (s >>> 16) >>> 0``. Two things should be confirmed
    against the JS source: (1) whether ``s >>> 16`` there reads the value
    before or after the multiplication (this port uses the value *after*),
    and (2) whether the JS multiplies with ``Math.imul`` or with plain ``*``
    (plain ``*`` loses precision above 2**53, whereas Python is exact).
    """
    value = _to_u32(first_multiplier * value)
    value ^= value >> 16  # stays within 32 bits: both operands are u32
    value = _to_u32(second_multiplier * value)
    value ^= value >> 13
    return value


def _compute_checksum(uuid_str: str) -> str:
    """Return the 3-character base62 checksum for *uuid_str*.

    Intended for the 19-character UUID produced by ``_generate_uuid`` but
    works on a string of any length (an empty string yields ``"000"``).

    Three rolling hashes are computed over the character codes, each kept
    to unsigned 32 bits:

    - forward:  ``h = (h << 5) - h + code`` in string order
    - backward: ``h = (h << 7) - h + code * (index + 1)`` from last to first
    - centered: ``h = (h << 3) - h + code * (|index - mid| + 1)``

    The hashes are xor-ed pairwise, scrambled by ``_mix`` and reduced
    modulo 62 to index into the base62 alphabet.

    NOTE(review): the original port states that the JS side applies
    ``Math.abs`` to signed 32-bit intermediates; this port keeps every
    intermediate unsigned instead. Confirm the two agree for hashes with
    the top bit set.
    """
    codes = [ord(ch) for ch in uuid_str]
    mid = len(codes) // 2

    forward = 0
    for code in codes:
        forward = _to_u32((forward << 5) - forward + code)

    backward = 0
    for index, code in reversed(list(enumerate(codes))):
        backward = _to_u32((backward << 7) - backward + code * (index + 1))

    centered = 0
    for index, code in enumerate(codes):
        weight = abs(index - mid) + 1
        centered = _to_u32((centered << 3) - centered + code * weight)

    first = _CHARS[_mix(forward ^ backward, 2654435761, 2246822507) % 62]
    second = _CHARS[_mix(backward ^ centered, 3266489909, 2654435761) % 62]
    third = _CHARS[_mix(centered ^ forward, 668265261, 2246822507) % 62]
    return first + second + third


def _generate_uuid() -> str:
    """Return a 19-character id: 13 hex timestamp chars + 6 base62 chars.

    The timestamp is the current time in milliseconds rendered as lowercase
    hex, truncated to its last 13 digits and left-padded with zeros.
    """
    millis = int(time.time() * 1000)
    hex_ts = f"{millis:x}"[-13:].zfill(13)
    rand_part = "".join(random.choice(_CHARS) for _ in range(6))
    return hex_ts + rand_part


class BossSign:
    """
    Boss Zhipin request signing.

    Purpose: generate the Traceid header value.

    Args:
        mpt: login credential (empty string when not logged in).
        wt2: login credential (empty string when not logged in).

    Both credentials are only stored on the instance; the signing code in
    this module does not read them.
    """

    def __init__(self, *, mpt: str = "", wt2: str = ""):
        self.mpt = mpt
        self.wt2 = wt2

    @staticmethod
    def generate_traceid(prefix: str = "M-W") -> str:
        """Return a fresh Traceid formatted as ``{prefix}{uuid}{checksum}``."""
        uuid_str = _generate_uuid()
        checksum = _compute_checksum(uuid_str)
        return f"{prefix}{uuid_str}{checksum}"
# Source file: crawler_core/qcwy/sign.py
"""
51Job (Qiancheng Wuyou) request-signing algorithm (crawler_core).

Translated from the mini-program sources utils/cupid.js and
server/request/index.js.

Signing steps:
    1. Build path = /{endpoint}?api_key=51job&timestamp={ts}
    2. GET:  message = path + &param1=val1&param2=val2
       POST: message = path + JSON.stringify(body)
    3. sign = HMAC-SHA256(message, SIGN_KEY).hex()
"""

import hashlib
import hmac
import json
import random
import time
from typing import Optional, Tuple
from urllib.parse import quote


# Same value as the one hard-coded in the mini-program (server/config/index.js).
SIGN_KEY = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"


class Job51Sign:
    """
    51Job request signing.

    Purpose: from the request path, method and parameters, produce the
    ``sign`` value and the URL path that was signed.

    Args:
        sign_key: HMAC key (defaults to the key built into the mini-program).
    """

    def __init__(self, *, sign_key: str = SIGN_KEY):
        self.sign_key = sign_key

    @staticmethod
    def generate_uuid() -> str:
        """Return millisecond timestamp digits followed by a random 10-digit number.

        Per the original port this value is used as distinct_id / uuid.
        """
        ts = str(int(time.time() * 1000))
        rand = str(random.randint(1000000000, 9999999999))
        return ts + rand

    def build_sign_path(
        self,
        endpoint: str,
        method: str = "GET",
        params: Optional[dict] = None,
        body: Optional[dict] = None,
        *,
        timestamp: Optional[int] = None,
    ) -> Tuple[str, str]:
        """
        Build the URL path to request and its HMAC-SHA256 signature.

        Args:
            endpoint: API path without domain or leading slash
                (e.g. ``open/noauth/jobs/detail/base/170651439``).
            method: HTTP method, compared case-insensitively (GET / POST).
            params: extra query parameters; appended (percent-encoded) only
                for GET requests and ignored otherwise.
            body: JSON body; appended to the signed message only for POST
                requests. It is never part of the returned path.
            timestamp: Unix time in seconds to embed in the path. Defaults
                to the current time; pass a fixed value for reproducible
                signatures.

        Returns:
            ``(url_path, sign_hex)`` where ``url_path`` is the path with
            ``api_key`` and ``timestamp`` (plus GET params) and ``sign_hex``
            is the hex HMAC-SHA256 of the signed message.
        """
        verb = method.upper()
        ts = int(time.time()) if timestamp is None else int(timestamp)

        # The separator must be a literal "&timestamp="; an HTML-entity
        # mangled "×tamp" would drop the timestamp query parameter entirely.
        path = f"/{endpoint}?api_key=51job&timestamp={ts}"

        if verb == "GET" and params:
            encoded_pairs = (
                f"{quote(str(key), safe='')}={quote(str(value), safe='')}"
                for key, value in params.items()
            )
            path += "&" + "&".join(encoded_pairs)

        # Signed message: the path, plus the compact JSON body for POST.
        message = path
        if verb == "POST" and body is not None:
            # Compact separators and raw non-ASCII mimic JSON.stringify output.
            message += json.dumps(body, ensure_ascii=False, separators=(",", ":"))

        sign_hex = hmac.new(
            self.sign_key.encode("utf-8"),
            message.encode("utf-8"),
            hashlib.sha256,
        ).hexdigest()

        return path, sign_hex
# Source file: crawler_core/zhilian/sign.py
"""
Zhilian Zhaopin signing helpers (crawler_core).

Responsibility: parameter construction and signing only; no HTTP requests.
"""

import math
import random
from typing import Dict, Optional


_HEX_DIGITS = "0123456789ABCDEF"
_UUID_LENGTH = 36
_DASH_POSITIONS = (8, 13, 18, 23)


class ZhilianSign:
    """
    Zhilian Zhaopin request signing.

    Feature 1: generate the values a request needs (device id, action id,
    at, rt, ...).
    Feature 2: build the signing headers (cgate endpoints) or signing query
    parameters (capi endpoints).

    Args:
        at: access token (obtained after login; empty when not logged in).
        rt: refresh token (obtained after login; empty when not logged in).
        device_id: device id; a UUID is generated when omitted or empty.
        version: mini-program version string.
        channel: channel identifier.
        platform: platform id (12 = WeChat mini-program, per the original port).
    """

    def __init__(
        self,
        *,
        at: str = "",
        rt: str = "",
        device_id: Optional[str] = None,
        version: str = "4.1.259",
        channel: str = "wxxiaochengxu",
        platform: str = "12",
    ):
        self.at = at
        self.rt = rt
        # Any falsy value (None or "") triggers generation of a new id.
        self.device_id = device_id or self.generate_uuid()
        self.version = version
        self.channel = channel
        self.platform = platform

    # -- UUID generation (ported from the mini-program) ------------------

    @staticmethod
    def generate_uuid() -> str:
        """Return a random uppercase UUID-v4-shaped string of 36 characters.

        Every slot is first filled with a random hex digit (one
        ``random.random()`` draw per slot, in order); then the version
        digit, the variant digit and the four dashes are overwritten.
        """
        slots = [
            _HEX_DIGITS[math.floor(16 * random.random())]
            for _ in range(_UUID_LENGTH)
        ]
        slots[14] = "4"  # version nibble
        # Variant nibble: keep the low two random bits, force the top bits to 10.
        slots[19] = _HEX_DIGITS[(int(slots[19], 16) & 0x3) | 0x8]
        for position in _DASH_POSITIONS:
            slots[position] = "-"
        return "".join(slots)

    # -- cgate signing headers -------------------------------------------

    def sign_headers(self, page_code: str = "0") -> Dict[str, str]:
        """Build the signing headers for cgate endpoints.

        A new action id is generated on every call; the remaining values
        come from the instance.
        """
        return {
            "x-zp-at": self.at,
            "x-zp-rt": self.rt,
            "x-zp-action-id": self.generate_uuid(),
            "x-zp-page-code": page_code,
            "x-zp-version": self.version,
            "x-zp-channel": self.channel,
            "x-zp-platform": self.platform,
            "x-zp-device-id": self.device_id,
            "x-zp-business-system": "73",
        }

    # -- capi signing parameters -----------------------------------------

    def sign_params(self) -> Dict[str, str]:
        """Build the signing query parameters for capi endpoints."""
        return {
            "at": self.at,
            "rt": self.rt,
            "channel": self.channel,
            "platform": self.platform,
            "version": self.version,
            "d": self.device_id,
        }