diff --git a/.planning/phases/02-boss/02-01-PLAN.md b/.planning/phases/02-boss/02-01-PLAN.md new file mode 100644 index 0000000..ca40f69 --- /dev/null +++ b/.planning/phases/02-boss/02-01-PLAN.md @@ -0,0 +1,216 @@ +--- +phase: 2 +plan: 1 +wave: 1 +title: "迁移 Boss 爬虫层至 crawler_core" +depends_on: [] +files_modified: + - spiderJobs/platforms/boss/client.py + - spiderJobs/platforms/boss/api.py + - spiderJobs/platforms/boss/main.py + - spiderJobs/platforms/boss/sign.py +autonomous: true +requirements: + - ARCH-03 +--- + +# Phase 2 Plan 01: 迁移 Boss 爬虫层至 crawler_core + +## Objective + +将 `spiderJobs/platforms/boss/` 下的 client.py、api.py、main.py 从依赖 +`spiderJobs.core`(旧基类)改为依赖 `crawler_core`(新基类), +同时将 spiderJobs 版的冗余 `sign.py` 替换为向后兼容桩(仅从 crawler_core 重新导出 `BossSign`,不再保留签名实现)。 + +迁移完成后,Boss 爬虫满足 ARCH-03:不含内联签名或 HTTP 样板代码。 + +## Must Haves + +- [ ] `spiderJobs/platforms/boss/client.py` 继承 `crawler_core.http_client.HTTPClient` +- [ ] `spiderJobs/platforms/boss/api.py` 使用 `crawler_core.base.Result`、`BaseFetcher`、`BaseSearcher` +- [ ] `spiderJobs/platforms/boss/api.py` 中 `self._http` 全部替换为 `self.http_client` +- [ ] `spiderJobs/platforms/boss/main.py` import 更新,功能不变 +- [ ] `spiderJobs/platforms/boss/sign.py` 改为从 `crawler_core.boss.sign` 重新导出(向后兼容层) +- [ ] `python -m spiderJobs.platforms.boss.main` 可启动,无 ImportError + +--- + +## Wave 1(仅一波,任务间有顺序依赖) + +### Task 1.1: 更新 client.py + + +- `spiderJobs/platforms/boss/client.py`(当前完整内容) +- `crawler_core/http_client.py`(目标基类接口) +- `crawler_core/boss/sign.py`(BossSign 新来源) + + + +修改 `spiderJobs/platforms/boss/client.py`: + +1. 将第 10-11 行的 import 改为: + ```python + from crawler_core.http_client import HTTPClient + from crawler_core.boss.sign import BossSign + ``` + (删除 `from spiderJobs.core.http_client import HTTPClient` 和 `from spiderJobs.platforms.boss.sign import BossSign`) + +2. `BossClient(HTTPClient)` 继承关系不变,无需修改类体(两个 HTTPClient 接口完全一致)。 + +3. 
`create_client()` 工厂函数无需改动。 + +注意:`BASE_URL`、`BOSS_HEADERS` 和所有方法体内容均保持不变。 + + + +- `grep "from crawler_core.http_client import HTTPClient" spiderJobs/platforms/boss/client.py` 输出该行 +- `grep "from crawler_core.boss.sign import BossSign" spiderJobs/platforms/boss/client.py` 输出该行 +- `grep "spiderJobs.core" spiderJobs/platforms/boss/client.py` 无输出(空) +- `python -c "from spiderJobs.platforms.boss.client import BossClient"` 无 ImportError + + +--- + +### Task 1.2: 更新 api.py + + +- `spiderJobs/platforms/boss/api.py`(当前完整内容) +- `crawler_core/base.py`(Result、BaseFetcher、BaseSearcher 接口定义) +- `spiderJobs/platforms/boss/client.py`(迁移后版本,Task 1.1 产物) + + + +修改 `spiderJobs/platforms/boss/api.py`: + +1. 将第 15 行的 import 改为: + ```python + from crawler_core.base import BaseFetcher, BaseSearcher, Result + ``` + (删除 `from spiderJobs.core.base import ApiResult, BaseFetcher, BaseSearcher`) + +2. 全文替换 `ApiResult` → `Result`(出现在 `_parse_boss_response` 返回类型注解和函数体中) + +3. 在 `SearchRecJobs._request()` 方法中,将: + ```python + return self._http.get(self.ENDPOINT, params) + ``` + 改为: + ```python + return self.http_client.get(self.ENDPOINT, params) + ``` + +4. 在 `GetJobDetail.fetch()` 方法中,将: + ```python + client: BossClient = self._http + ``` + 改为: + ```python + client: BossClient = self.http_client + ``` + +5. 
在 `SearchBrandJobs._request()` 方法中,将: + ```python + return self._http.get(self.ENDPOINT, params) + ``` + 改为: + ```python + return self.http_client.get(self.ENDPOINT, params) + ``` + +`_parse_boss_response` 的逻辑(code/zpData 解析)、所有参数、ENDPOINT 字符串均保持不变。 + + + +- `grep "from crawler_core.base import" spiderJobs/platforms/boss/api.py` 输出该行 +- `grep "ApiResult" spiderJobs/platforms/boss/api.py` 无输出(已全部替换为 Result) +- `grep "spiderJobs.core" spiderJobs/platforms/boss/api.py` 无输出 +- `grep "self\._http" spiderJobs/platforms/boss/api.py` 无输出(全替换为 self.http_client) +- `python -c "from spiderJobs.platforms.boss.api import SearchRecJobs, GetJobDetail, GetBrandDetail, SearchBrandJobs"` 无 ImportError + + +--- + +### Task 1.3: 更新 main.py + + +- `spiderJobs/platforms/boss/main.py`(当前完整内容) +- `crawler_core/base.py`(BaseFetcher、BaseSearcher 新接口) + + + +修改 `spiderJobs/platforms/boss/main.py`: + +1. 将第 35 行的 import 改为: + ```python + from crawler_core.base import BaseFetcher, BaseSearcher + ``` + (删除 `from spiderJobs.core.base import BaseFetcher, BaseSearcher`) + +2. 将第 38 行的 import 改为: + ```python + from crawler_core.boss.sign import BossSign + ``` + (删除 `from spiderJobs.platforms.boss.sign import BossSign`) + +3. 
其他所有内容(CITY_CODE_MAP、create_searcher、extract_company_id、create_company_fetcher、main)保持不变。 + + + +- `grep "from crawler_core.base import BaseFetcher, BaseSearcher" spiderJobs/platforms/boss/main.py` 输出该行 +- `grep "from crawler_core.boss.sign import BossSign" spiderJobs/platforms/boss/main.py` 输出该行 +- `grep "spiderJobs.core" spiderJobs/platforms/boss/main.py` 无输出 +- `python -c "from spiderJobs.platforms.boss.main import main"` 无 ImportError + + +--- + +### Task 1.4: 将 spiderJobs 版 sign.py 改为向后兼容桩 + + +- `spiderJobs/platforms/boss/sign.py`(当前完整内容) +- `crawler_core/boss/sign.py`(权威实现) + + + +将 `spiderJobs/platforms/boss/sign.py` 内容完全替换为以下向后兼容桩, +保留 `BossSign` 名称以防现有代码仍直接 import: + +```python +""" +向后兼容桩 — Boss直聘签名 + +已迁移至 crawler_core.boss.sign。 +直接从 crawler_core 重新导出,避免下游代码出现 ImportError。 +""" + +from crawler_core.boss.sign import BossSign # noqa: F401 + +__all__ = ["BossSign"] +``` + + + +- `cat spiderJobs/platforms/boss/sign.py` 仅包含导入和 `__all__` 声明,不含任何 Boss 签名算法实现 +- `grep "from crawler_core.boss.sign import BossSign" spiderJobs/platforms/boss/sign.py` 输出该行 +- `python -c "from spiderJobs.platforms.boss.sign import BossSign; print(BossSign.generate_traceid())"` 成功打印 Traceid + + +--- + +## Verification + +```bash +# 验证所有 Boss 层 import 正确 +python -c " +from spiderJobs.platforms.boss.client import BossClient, create_client +from spiderJobs.platforms.boss.api import SearchRecJobs, GetJobDetail, GetBrandDetail, SearchBrandJobs +from spiderJobs.platforms.boss.main import main, create_searcher +print('✅ 所有 Boss 模块 import 成功') +" + +# 确认无旧依赖残留 +grep -rn "spiderJobs.core" spiderJobs/platforms/boss/ && echo "❌ 仍有旧依赖" || echo "✅ 无旧依赖" + +# 确认 sign 桩正常工作 +python -c "from spiderJobs.platforms.boss.sign import BossSign; print('Traceid:', BossSign.generate_traceid())" +``` diff --git a/.planning/phases/02-boss/02-02-PLAN.md b/.planning/phases/02-boss/02-02-PLAN.md new file mode 100644 index 0000000..2040134 --- /dev/null +++ b/.planning/phases/02-boss/02-02-PLAN.md @@ -0,0 +1,355 
@@ +--- +phase: 2 +plan: 2 +wave: 1 +title: "新增 Boss HTTP 层 mock 测试" +depends_on: + - "02-01-PLAN.md" +files_modified: + - tests/boss/__init__.py + - tests/boss/test_boss_client.py +autonomous: true +requirements: + - QUAL-03 +--- + +# Phase 2 Plan 02: Boss HTTP 层 mock 测试 + +## Objective + +使用 `unittest.mock.MagicMock` 为 Boss HTTP 层添加 mock 测试, +覆盖正常响应和错误响应两种场景,满足 QUAL-03。 + +测试文件位置:`tests/boss/test_boss_client.py` + +**为什么不用 respx:** `crawler_core.HTTPClient` 底层使用 `requests_go`(非 httpx), +respx 只拦截 httpx,无法适用。使用 `MagicMock` 直接 mock `http_client` 接口, +不依赖网络,测试更快更稳定。 + +## Must Haves + +- [ ] `tests/boss/__init__.py` 存在(空文件即可) +- [ ] `tests/boss/test_boss_client.py` 存在并包含 mock 测试 +- [ ] `pytest tests/boss/ -v` 全部通过(无 ImportError,无失败) +- [ ] 测试组:SearchRecJobs(正常响应、业务错误、HTTP 非 200) +- [ ] 测试组:GetBrandDetail(正常响应) +- [ ] 测试组:SearchBrandJobs(正常响应、hasMore=True 时 is_end_page=False) +- [ ] 测试组:GetJobDetail.fetch()(通过 batch 接口,正常合并子请求) +- [ ] 测试组:BossClient(验证每次请求注入 Traceid 头、mpt/wt2 头) + +--- + +## Wave 1 + +### Task 2.1: 创建 tests/boss/ 目录和 __init__.py + + +- `tests/crawler_core/test_boss_sign.py`(参考现有测试风格和 conftest 依赖) +- `conftest.py`(项目根目录,确认 sys.path 设置) + + + +1. 
创建 `tests/boss/__init__.py`,内容为空文件(或仅含 `# tests/boss/`)。 + +确认 `conftest.py` 已在项目根目录(Phase 1 已创建),无需重复创建。 + + + +- `test -f tests/boss/__init__.py && echo "OK"` 输出 OK +- `pytest tests/boss/ --collect-only 2>&1 | head -5` 无 "ERROR collecting" 字样 + + +--- + +### Task 2.2: 编写 test_boss_client.py + + +- `spiderJobs/platforms/boss/api.py`(迁移后版本,Task 1.2 产物) +- `spiderJobs/platforms/boss/client.py`(迁移后版本) +- `tests/crawler_core/test_boss_sign.py`(参考测试风格) +- `crawler_core/base.py`(Result 结构) + + + +创建 `tests/boss/test_boss_client.py`,内容如下(完整内容,直接写入文件): + +```python +""" +Boss 直聘 HTTP 层 mock 测试(QUAL-03) + +使用 unittest.mock.MagicMock 替代真实 HTTP 客户端, +覆盖正常响应和错误响应场景,无网络依赖。 +""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch, call +import pytest + +from spiderJobs.platforms.boss.api import ( + SearchRecJobs, + GetBrandDetail, + SearchBrandJobs, + GetJobDetail, + _parse_boss_response, +) +from spiderJobs.platforms.boss.client import BossClient +from crawler_core.base import Result + + +# ───────────────────────────────────────────────────────── +# 1. 
_parse_boss_response 纯函数测试 +# ───────────────────────────────────────────────────────── + +class TestParseBossResponse: + + def test_http_error_returns_failure(self): + result = _parse_boss_response(500, {}) + assert result.success is False + assert result.status_code == 500 + + def test_non_dict_raw_returns_failure(self): + result = _parse_boss_response(200, "not a dict") + assert result.success is False + + def test_biz_error_code_35_returns_failure(self): + result = _parse_boss_response(200, {"code": 35, "message": "IP地址存在异常"}) + assert result.success is False + assert result.status_code == 35 + assert "IP" in result.error + + def test_joblist_payload_parsed_correctly(self): + raw = { + "code": 0, + "zpData": { + "jobList": [{"title": "Python工程师"}], + "hasMore": True, + }, + } + result = _parse_boss_response(200, raw) + assert result.success is True + assert len(result.list) == 1 + assert result.list[0]["title"] == "Python工程师" + assert result.is_end_page is False # hasMore=True → is_end_page=False + + def test_joblist_no_more_pages(self): + raw = { + "code": 0, + "zpData": {"jobList": [{"title": "测试"}], "hasMore": False}, + } + result = _parse_boss_response(200, raw) + assert result.is_end_page is True + + def test_detail_payload(self): + raw = {"code": 0, "zpData": {"companyName": "测试公司"}} + result = _parse_boss_response(200, raw) + assert result.success is True + assert result.data == {"companyName": "测试公司"} + + +# ───────────────────────────────────────────────────────── +# 2. 
SearchRecJobs +# ───────────────────────────────────────────────────────── + +class TestSearchRecJobs: + + def _make_mock_client(self, return_value): + mock_client = MagicMock() + mock_client.get.return_value = return_value + return mock_client + + def test_search_success(self): + raw = { + "code": 0, + "zpData": { + "jobList": [{"title": "测试职位1"}, {"title": "测试职位2"}], + "hasMore": False, + }, + } + searcher = SearchRecJobs(city_code="101010100", client=self._make_mock_client((200, raw))) + result = searcher.search(page_index=1) + + assert result.success is True + assert len(result.list) == 2 + assert result.is_end_page is True + + def test_search_http_error(self): + searcher = SearchRecJobs(client=self._make_mock_client((403, {}))) + result = searcher.search(page_index=1) + assert result.success is False + assert result.status_code == 403 + + def test_search_biz_error(self): + raw = {"code": 35, "message": "IP地址存在异常"} + searcher = SearchRecJobs(client=self._make_mock_client((200, raw))) + result = searcher.search(page_index=1) + assert result.success is False + + def test_search_builds_correct_params(self): + mock_client = MagicMock() + mock_client.get.return_value = (200, {"code": 0, "zpData": {"jobList": [], "hasMore": False}}) + searcher = SearchRecJobs(city_code="101280600", page_size=10, client=mock_client) + searcher.search(page_index=2) + + called_args = mock_client.get.call_args + params = called_args[0][1] if len(called_args[0]) > 1 else called_args[1].get("params", called_args[0][1]) + assert params["cityCode"] == "101280600" + assert params["page"] == 2 + assert params["pageSize"] == 10 + + +# ───────────────────────────────────────────────────────── +# 3. 
GetBrandDetail +# ───────────────────────────────────────────────────────── + +class TestGetBrandDetail: + + def test_fetch_success(self): + mock_client = MagicMock() + mock_client.get.return_value = (200, { + "code": 0, + "zpData": {"brandName": "测试公司", "brandId": "abc123"}, + }) + fetcher = GetBrandDetail(brand_id="abc123", client=mock_client) + result = fetcher.fetch() + + assert result.success is True + assert result.data["brandName"] == "测试公司" + + def test_fetch_404(self): + mock_client = MagicMock() + mock_client.get.return_value = (404, {}) + fetcher = GetBrandDetail(brand_id="notexist", client=mock_client) + result = fetcher.fetch() + + assert result.success is False + assert result.status_code == 404 + + +# ───────────────────────────────────────────────────────── +# 4. SearchBrandJobs +# ───────────────────────────────────────────────────────── + +class TestSearchBrandJobs: + + def test_search_success_has_more(self): + mock_client = MagicMock() + mock_client.get.return_value = (200, { + "code": 0, + "zpData": {"list": [{"jobName": "测试岗位"}], "hasMore": True}, + }) + searcher = SearchBrandJobs(brand_id="abc123", client=mock_client) + result = searcher.search(page_index=1) + + assert result.success is True + assert len(result.list) == 1 + assert result.is_end_page is False + + def test_search_success_no_more(self): + mock_client = MagicMock() + mock_client.get.return_value = (200, { + "code": 0, + "zpData": {"list": [], "hasMore": False}, + }) + searcher = SearchBrandJobs(brand_id="abc123", client=mock_client) + result = searcher.search(page_index=1) + + assert result.is_end_page is True + + +# ───────────────────────────────────────────────────────── +# 5. 
GetJobDetail(batch 接口路径) +# ───────────────────────────────────────────────────────── + +class TestGetJobDetail: + + def test_fetch_success_merges_sub_requests(self): + mock_client = MagicMock() + mock_client.batch.return_value = (200, { + "code": 0, + "zpData": { + "/wapi/zpgeek/miniapp/job/detail.json": { + "zpData": {"jobName": "数据工程师"} + }, + "/wapi/zpgeek/miniapp/jobdetail/improvement/query.json": { + "zpData": {"tags": ["Python", "大数据"]} + }, + }, + }) + fetcher = GetJobDetail(security_id="sid123", job_id="jid456", client=mock_client) + result = fetcher.fetch() + + assert result.success is True + assert result.data["detail"]["jobName"] == "数据工程师" + assert "Python" in result.data["improvement"]["tags"] + + def test_fetch_biz_error(self): + mock_client = MagicMock() + mock_client.batch.return_value = (200, {"code": 35, "message": "IP地址存在异常"}) + fetcher = GetJobDetail(security_id="sid", job_id="jid", client=mock_client) + result = fetcher.fetch() + assert result.success is False + + def test_fetch_exception_handled(self): + mock_client = MagicMock() + mock_client.batch.side_effect = ConnectionError("连接超时") + fetcher = GetJobDetail(security_id="sid", job_id="jid", client=mock_client) + result = fetcher.fetch() + assert result.success is False + assert "连接超时" in result.error + + +# ───────────────────────────────────────────────────────── +# 6. 
BossClient — Traceid/mpt/wt2 请求头注入 +# ───────────────────────────────────────────────────────── + +class TestBossClientHeaders: + + def test_get_injects_traceid(self): + """每次 GET 请求头包含 Traceid""" + client = BossClient(tunnel_proxy=None) + headers = client._boss_headers() + assert "Traceid" in headers + assert headers["Traceid"].startswith("M-W") + + def test_traceid_is_unique_per_call(self): + """每次调用生成不同的 Traceid""" + client = BossClient() + t1 = client._boss_headers()["Traceid"] + t2 = client._boss_headers()["Traceid"] + # 大概率不同(极低碰撞概率可忽略) + assert len(t1) > 10 + assert len(t2) > 10 + + def test_mpt_wt2_in_headers(self): + """signer 的 mpt/wt2 注入到请求头""" + from crawler_core.boss.sign import BossSign + signer = BossSign(mpt="test_mpt_value", wt2="test_wt2_value") + client = BossClient(signer=signer) + headers = client._boss_headers() + assert headers["mpt"] == "test_mpt_value" + assert headers["wt2"] == "test_wt2_value" +``` + + + +- `test -f tests/boss/test_boss_client.py && echo "OK"` 输出 OK +- `pytest tests/boss/ -v 2>&1 | tail -10` 输出包含 "passed",无 "failed" 或 "error" +- `pytest tests/boss/ -v 2>&1 | grep "PASSED" | wc -l` 输出 >= 15(至少 15 个测试通过) +- `pytest tests/boss/ -v 2>&1 | grep "import"` 无输出(无 ImportError) + + +--- + +## Verification + +```bash +# 运行所有 Boss mock 测试 +pytest tests/boss/ -v + +# 同时运行 Phase 1 签名测试,确认没有回归 +pytest tests/crawler_core/ -v + +# 汇总结果 +pytest tests/ -v --tb=short +``` diff --git a/.planning/phases/02-boss/02-RESEARCH.md b/.planning/phases/02-boss/02-RESEARCH.md new file mode 100644 index 0000000..2899e68 --- /dev/null +++ b/.planning/phases/02-boss/02-RESEARCH.md @@ -0,0 +1,163 @@ +# Phase 2: Boss 直聘重写 — 技术研究 + +**研究日期:** 2026-03-21 +**阶段目标:** Boss 直聘爬虫完全基于 crawler_core 运行,旧实现可安全停用 + +--- + +## 1. 
现状分析 + +### 1.1 crawler_core 现有基础(Phase 1 完成) + +| 文件 | 内容 | +|------|------| +| `crawler_core/http_client.py` | `HTTPClient` — requests_go + TLS 伪装 + tenacity 重试(min=10s) + 代理池/隧道代理 | +| `crawler_core/base.py` | `Result[T]`(泛型)、`BaseFetcher`、`BaseSearcher`、`parse_response` | +| `crawler_core/boss/sign.py` | `BossSign.generate_traceid()` — 已完成且有测试 | + +### 1.2 待迁移的 Boss 层(spiderJobs) + +`spiderJobs/platforms/boss/` 下已有: + +| 文件 | 当前依赖 | 迁移目标 | +|------|---------|---------| +| `sign.py` | 独立实现(与 crawler_core/boss/sign.py 功能相同) | 弃用,改为 import crawler_core | +| `client.py` | `spiderJobs.core.http_client.HTTPClient` | 改为 `crawler_core.http_client.HTTPClient` | +| `api.py` | `spiderJobs.core.base.ApiResult/BaseFetcher/BaseSearcher` | 改为 `crawler_core.base.Result/BaseFetcher/BaseSearcher` | +| `main.py` | `spiderJobs.core.base.BaseFetcher/BaseSearcher` | 更新 import,功能保持不变 | + +### 1.3 待保留的反爬机制(SmartIPManager) + +旧 `boos_api.py` 中有 `SmartIPManager`(代理轮换+本机 fallback),这套逻辑**已被 crawler_core 的 HTTPClient 代理池简化替代**: +- `HTTPClient(proxy_pool=...)` 自动随机选择代理(每次请求) +- `HTTPClient(tunnel_proxy=...)` 每次新建 session(效果类似隧道代理的 IP 轮换) +- tenacity 重试 min=10s 已满足强制延迟要求 + +结论:**不需要**将 SmartIPManager 迁移进来,crawler_core 已经覆盖了其功能。 + +--- + +## 2. API 接口清单(已确认) + +`spiderJobs/platforms/boss/api.py` 实现了 4 个接口: + +| 类 | 端点 | 方式 | +|----|------|------| +| `SearchRecJobs` | `/wapi/zpgeek/miniapp/homepage/recjoblist.json` | GET | +| `GetJobDetail` | `/wapi/batch/requests`(批量子请求) | POST | +| `GetBrandDetail` | `/wapi/zpgeek/miniapp/brand/detail.json` | GET | +| `SearchBrandJobs` | `/wapi/zpgeek/miniapp/brand/joblist.json` | GET | + +Boss 响应格式与 crawler_core 默认解析不同: +- Boss:`code=0` 表示成功,`zpData` 为业务数据(而非 `statusCode/data`) +- 需要在迁移后的 api.py 中保留自定义 `_parse_boss_response()` 函数 + +--- + +## 3. 迁移差异分析 + +### 3.1 ApiResult → Result[T] + +`spiderJobs.core.base.ApiResult` 与 `crawler_core.base.Result[T]` 字段对比: + +| 字段 | ApiResult | Result[T] | 兼容? 
| +|------|-----------|-----------|-------| +| `success` | ✓ | ✓ | ✅ | +| `status_code` | ✓ | ✓ | ✅ | +| `data` | ✓ | ✓ | ✅ | +| `list` | ✓ | ✓ | ✅ | +| `count` | ✓ | ✓ | ✅ | +| `is_end_page` | ✓ | ✓ | ✅ | +| `error` | ✓ | ✓ | ✅ | + +完全兼容,仅需修改 import 路径。 + +### 3.2 BaseFetcher._http → BaseFetcher.http_client + +旧 spiderJobs 的 `BaseFetcher` 使用 `self._http` 引用 HTTP 客户端,而 `crawler_core.base.BaseFetcher` 使用 `self.http_client`。 + +`api.py` 中三处引用需要更新: +- `SearchRecJobs._request()`: `self._http.get(...)` → `self.http_client.get(...)` +- `GetJobDetail.fetch()`: `client: BossClient = self._http` → `client: BossClient = self.http_client` +- `SearchBrandJobs._request()`: `self._http.get(...)` → `self.http_client.get(...)` + +### 3.3 BossClient 迁移 + +`client.py` 中 `BossClient` 继承的是 `spiderJobs.core.http_client.HTTPClient`,需改为继承 `crawler_core.http_client.HTTPClient`。两个 HTTPClient 接口完全相同,无结构性差异。 + +`sign.py`(spiderJobs 版)与 `crawler_core/boss/sign.py` 功能完全相同,迁移后 `client.py` 直接从 `crawler_core.boss.sign` 导入即可。 + +--- + +## 4. 测试策略(QUAL-03) + +### 4.1 测试框架 + +项目已有: +- `conftest.py` 在项目根目录(用于 pytest path 设置) +- `tests/crawler_core/` 已有 41 个签名单元测试(Phase 1) + +建议新增测试目录:`tests/boss/`,文件:`test_boss_client.py` + +### 4.2 Mock/Respx 测试(注意事项) + +crawler_core 的 HTTPClient 使用 `requests_go`(非标准 requests),respx 专门 mock `httpx`,不适用。 + +**替代方案:使用 `unittest.mock.MagicMock`** + +```python +# 正确的 mock 方式 +from unittest.mock import MagicMock, patch + +def test_search_rec_jobs_success(): + mock_client = MagicMock() + mock_client.get.return_value = (200, { + "code": 0, "zpData": {"jobList": [{"title": "测试职位"}], "hasMore": False} + }) + searcher = SearchRecJobs(client=mock_client) + result = searcher.search(page_index=1) + assert result.success + assert len(result.list) == 1 +``` + +由于 HTTPClient 是作为依赖注入传入的,直接用 `MagicMock()` mock 即可,无需 patch 装饰器。 + +--- + +## 5. 
关键词搜索接口说明 + +当前 `SearchRecJobs` 只使用 `cityCode` 过滤,没有关键词搜索参数。 + +查看 Boss API,关键词搜索应使用 `/wapi/zpgeek/miniapp/search.json`(需要 query 参数)或直接使用 recjoblist 接口配合 `query` 字段(如果 API 支持)。 + +**推荐方案:** Phase 2 保持现有 `SearchRecJobs`(推荐列表),关键词过滤通过 `main.py` 的城市映射传入。真正的关键词搜索 API 端点验证留给手动测试阶段确认。 + +--- + +## 6. 反爬机制验证 + +Phase 2 成功标准之一是"反爬机制保留",具体验证点: + +| 机制 | 实现位置 | 验证方式 | +|------|---------|---------| +| 随机延迟 10-20s | `HTTPClient` tenacity min=10s | 单元测试检查 wait_random_exponential 配置 | +| TLS 指纹伪装 | `HTTPClient._new_session()` → `TLS_CHROME_LATEST` | 代码审查(无法 mock TLS 层) | +| 代理轮换 | `HTTPClient(tunnel_proxy=...)` 每次新建 session | 单元测试确认 `_new_session()` 被调用 | +| Traceid 注入 | `BossClient._boss_headers()` | 单元测试检查请求头包含 Traceid | + +## 7. Validation Architecture(Nyquist) + +暂不适用本阶段:Phase 2 是代码迁移,无新功能/新接口,验证维度以单元测试+手动验证为主。 + +--- + +## RESEARCH COMPLETE + +**Phase 2 可以规划。** 迁移任务明确,风险低(接口兼容),主要工作量在: +1. 更新 3 个文件的 import(client.py、api.py、main.py) +2. 修正 3 处 `self._http` → `self.http_client` 引用 +3. 新增 mock 测试 + +预计拆分为 2 个 PLAN: +- Plan 01:迁移 client.py、api.py、main.py(去除 spiderJobs.core 依赖) +- Plan 02:新增 `tests/boss/test_boss_client.py` mock 测试