From 024c2bcd496079ac3414f5a10fdfe9de2d337ec7 Mon Sep 17 00:00:00 2001 From: win Date: Sat, 21 Mar 2026 19:10:59 +0800 Subject: [PATCH] docs(phase-3): add research and 2 plans for job51+zhilian migration --- .../phases/03-qcwy-zhilian/03-01-PLAN.md | 243 ++++++++++++ .../phases/03-qcwy-zhilian/03-02-PLAN.md | 369 ++++++++++++++++++ .../phases/03-qcwy-zhilian/03-RESEARCH.md | 107 +++++ 3 files changed, 719 insertions(+) create mode 100644 .planning/phases/03-qcwy-zhilian/03-01-PLAN.md create mode 100644 .planning/phases/03-qcwy-zhilian/03-02-PLAN.md create mode 100644 .planning/phases/03-qcwy-zhilian/03-RESEARCH.md diff --git a/.planning/phases/03-qcwy-zhilian/03-01-PLAN.md b/.planning/phases/03-qcwy-zhilian/03-01-PLAN.md new file mode 100644 index 0000000..628fd85 --- /dev/null +++ b/.planning/phases/03-qcwy-zhilian/03-01-PLAN.md @@ -0,0 +1,243 @@ +--- +phase: 3 +plan: 1 +wave: 1 +title: "迁移前程无忧(job51)层至 crawler_core + mock 测试" +depends_on: [] +files_modified: + - spiderJobs/platforms/job51/client.py + - spiderJobs/platforms/job51/api.py + - spiderJobs/platforms/job51/main.py + - spiderJobs/platforms/job51/sign.py + - tests/job51/__init__.py + - tests/job51/test_job51_client.py +autonomous: true +requirements: + - ARCH-04 +--- + +# Phase 3 Plan 01: 迁移前程无忧(job51)层至 crawler_core + mock 测试 + +## Objective + +将 `spiderJobs/platforms/job51/` 从依赖 `spiderJobs.core`(旧基类)改为依赖 `crawler_core`(新基类), +同时新增 `tests/job51/test_job51_client.py` mock 测试,满足 ARCH-04。 + +迁移与 Phase 2 Boss 完全对称:4 个文件修改 + sign.py 改桩 + 新增测试。 + +## Must Haves + +- [ ] `client.py` 继承 `crawler_core.http_client.HTTPClient`,使用 `crawler_core.qcwy.sign.Job51Sign` +- [ ] `api.py` 使用 `crawler_core.base.Result/BaseFetcher/BaseSearcher`,`ApiResult` 全量替换为 `Result` +- [ ] `api.py` 中 2 处 `self._http` 替换为 `self.http_client` +- [ ] `main.py` import 更新为 `crawler_core.base` +- [ ] `sign.py` 改为向后兼容桩(重新导出 `crawler_core.qcwy.sign.Job51Sign`) +- [ ] `python -c "from spiderJobs.platforms.job51.api import ...; from 
spiderJobs.platforms.job51.client import ..."` 无 ImportError +- [ ] `grep -rn "spiderJobs.core" spiderJobs/platforms/job51/{client,api,main,sign}.py` 无输出 +- [ ] `tests/job51/__init__.py` 存在 +- [ ] `tests/job51/test_job51_client.py` 存在 +- [ ] `pytest tests/job51/ -v` 全部通过(>= 12 个测试) + +--- + +## Wave 1 + +### Task 1.1: 更新 client.py + + +- `spiderJobs/platforms/job51/client.py`(当前内容) +- `crawler_core/http_client.py`(目标基类) +- `crawler_core/qcwy/sign.py`(Job51Sign 新来源) + + + +修改 `spiderJobs/platforms/job51/client.py`: + +1. 第 15 行改为: + ```python + from crawler_core.http_client import HTTPClient + ``` +2. 第 16 行改为: + ```python + from crawler_core.qcwy.sign import Job51Sign + ``` + (删除 `from spiderJobs.core.http_client import HTTPClient` 和 `from spiderJobs.platforms.job51.sign import Job51Sign`) + +其他所有内容(JOB51_HEADERS、Job51Client 类体、create_client 函数)不变。 + + + +- `grep "from crawler_core.http_client import HTTPClient" spiderJobs/platforms/job51/client.py` 有输出 +- `grep "from crawler_core.qcwy.sign import Job51Sign" spiderJobs/platforms/job51/client.py` 有输出 +- `grep "spiderJobs.core" spiderJobs/platforms/job51/client.py` 无输出 +- `python -c "from spiderJobs.platforms.job51.client import Job51Client, create_client"` 无 ImportError + + +--- + +### Task 1.2: 更新 api.py + + +- `spiderJobs/platforms/job51/api.py`(当前完整内容,共 ~260 行) +- `crawler_core/base.py`(Result、BaseFetcher、BaseSearcher 接口) + + + +修改 `spiderJobs/platforms/job51/api.py`: + +1. 第 14 行改为: + ```python + from crawler_core.base import BaseFetcher, BaseSearcher, Result + ``` + (删除 `from spiderJobs.core.base import ApiResult, BaseFetcher, BaseSearcher`) + +2. 全文将 `ApiResult` 替换为 `Result`(共 11 处,包含函数注解和 return 语句) + +3. 第 164 行:`http_code, data = self._http.get(endpoint)` → `http_code, data = self.http_client.get(endpoint)` + +4. 
第 208 行:`http_code, data = self._http.get(self.ENDPOINT, self._build_params())` → `http_code, data = self.http_client.get(self.ENDPOINT, self._build_params())` + +`_parse_job51_response` 逻辑(status/1 判断、resultbody 解析)完全保留,只替换 `ApiResult` → `Result`。 + + + +- `grep "from crawler_core.base import" spiderJobs/platforms/job51/api.py` 有输出 +- `grep "ApiResult" spiderJobs/platforms/job51/api.py` 无输出 +- `grep "self\._http" spiderJobs/platforms/job51/api.py` 无输出 +- `python -c "from spiderJobs.platforms.job51.api import SearchRecommendJobs, GetJobDetail, GetCompanyDetail, SearchCompanyJobs"` 无 ImportError + + +--- + +### Task 1.3: 更新 main.py + + +- `spiderJobs/platforms/job51/main.py`(当前内容) + + + +修改 `spiderJobs/platforms/job51/main.py`: + +1. 第 32 行改为: + ```python + from crawler_core.base import BaseFetcher, BaseSearcher + ``` + (删除 `from spiderJobs.core.base import BaseFetcher, BaseSearcher`) + +其他内容不变。 + + + +- `grep "from crawler_core.base import BaseFetcher, BaseSearcher" spiderJobs/platforms/job51/main.py` 有输出 +- `grep "spiderJobs.core" spiderJobs/platforms/job51/main.py` 无输出 + + +--- + +### Task 1.4: 将 sign.py 改为向后兼容桩 + + +- `spiderJobs/platforms/job51/sign.py`(当前内容) +- `crawler_core/qcwy/sign.py`(权威实现) + + + +将 `spiderJobs/platforms/job51/sign.py` 完全替换为: + +```python +""" +向后兼容桩 — 前程无忧 (51Job) 签名 + +已迁移至 crawler_core.qcwy.sign。 +直接从 crawler_core 重新导出,避免下游代码出现 ImportError。 +""" + +from crawler_core.qcwy.sign import Job51Sign # noqa: F401 + +__all__ = ["Job51Sign"] +``` + + + +- `grep "from crawler_core.qcwy.sign import Job51Sign" spiderJobs/platforms/job51/sign.py` 有输出 +- `python -c "from spiderJobs.platforms.job51.sign import Job51Sign; print(Job51Sign.generate_uuid())"` 成功打印 UUID + + +--- + +### Task 1.5: 创建 tests/job51/__init__.py + + +创建 `tests/job51/__init__.py`,内容:`# tests/job51/` + + + +- `test -f tests/job51/__init__.py && echo "OK"` 输出 OK + + +--- + +### Task 1.6: 编写 tests/job51/test_job51_client.py + + +- `spiderJobs/platforms/job51/api.py`(迁移后版本) +- 
`spiderJobs/platforms/job51/client.py`(迁移后版本) +- `crawler_core/qcwy/sign.py`(Job51Sign 接口) +- `tests/boss/test_boss_client.py`(参考风格) + + + +创建 `tests/job51/test_job51_client.py`,包含以下测试组: + +1. **TestParseJob51Response(纯函数):** + - `test_http_error_returns_failure`:HTTP 500 → success=False + - `test_status_zero_returns_failure`:status=0 → success=False + - `test_status_one_with_resultbody_job_list`:status=1,resultbody.jobList.items → list 解析正确 + - `test_status_one_no_items`:status=1,无 items → success=True,list=[] + - `test_non_dict_raw_returns_failure`:raw 不是 dict → failure + +2. **TestSearchRecommendJobs:** + - `test_search_success`:正常返回职位列表 + - `test_search_http_error`:HTTP 403 + +3. **TestGetJobDetail:** + - `test_fetch_success`:成功返回 data + - `test_fetch_exception_handled`:`http_client.get` 抛异常 → success=False + +4. **TestGetCompanyDetail:** + - `test_fetch_success`:成功返回 data + +5. **TestJob51ClientHeaders:** + - `test_headers_contain_sign`:POST 后 `_job51_headers(sign="abc")["sign"]` == "abc" + - `test_headers_uuid_format`:uuid 字段长度 >= 20 + +所有测试使用 `MagicMock()` mock http_client.get/post,无需网络。 + + + +- `test -f tests/job51/test_job51_client.py && echo "OK"` 输出 OK +- `pipenv run python -m pytest tests/job51/ -v` 全部通过(>= 12 个测试用例) + + +--- + +## Verification + +```bash +# 1. 验证所有 job51 模块 import 正确 +pipenv run python -c " +from spiderJobs.platforms.job51.client import Job51Client, create_client +from spiderJobs.platforms.job51.api import SearchRecommendJobs, GetJobDetail, GetCompanyDetail +from spiderJobs.platforms.job51.main import main, create_searcher +from crawler_core.base import BaseFetcher, BaseSearcher +from spiderJobs.platforms.job51.api import SearchRecommendJobs +assert issubclass(SearchRecommendJobs, BaseSearcher) +print('✅ 所有 job51 模块 import 成功,继承关系正确') +" + +# 2. 
确认无旧依赖残留 +grep -rn "spiderJobs.core" spiderJobs/platforms/job51/client.py spiderJobs/platforms/job51/api.py spiderJobs/platforms/job51/main.py spiderJobs/platforms/job51/sign.py && echo "❌ 仍有旧依赖" || echo "✅ 无旧依赖" + +# 3. 运行 mock 测试 +pipenv run python -m pytest tests/job51/ -v +``` diff --git a/.planning/phases/03-qcwy-zhilian/03-02-PLAN.md b/.planning/phases/03-qcwy-zhilian/03-02-PLAN.md new file mode 100644 index 0000000..88f07bd --- /dev/null +++ b/.planning/phases/03-qcwy-zhilian/03-02-PLAN.md @@ -0,0 +1,369 @@ +--- +phase: 3 +plan: 2 +wave: 1 +title: "迁移智联招聘(zhilian)层至 crawler_core + mock 测试" +depends_on: [] +files_modified: + - spiderJobs/platforms/zhilian/client.py + - spiderJobs/platforms/zhilian/api.py + - spiderJobs/platforms/zhilian/main.py + - spiderJobs/platforms/zhilian/sign.py + - tests/zhilian/__init__.py + - tests/zhilian/test_zhilian_client.py +autonomous: true +requirements: + - ARCH-05 +--- + +# Phase 3 Plan 02: 迁移智联招聘(zhilian)层至 crawler_core + mock 测试 + +## Objective + +将 `spiderJobs/platforms/zhilian/` 从依赖 `spiderJobs.core`(旧基类)改为依赖 `crawler_core`(新基类), +同时新增 `tests/zhilian/test_zhilian_client.py` mock 测试,满足 ARCH-05。 + +**与 job51 的关键差异:** +- zhilian api.py 使用默认的 `parse_response`(无自定义 `_parse_response` 函数),无 `ApiResult` 替换 +- zhilian client.py 需要特别保留 `ZhilianSign` 的 `sign_headers()` 和 `sign_params()` 接口 +- `SearchCompanyPositions._build_params()` 通过 `self._client.signer.sign_params()` 访问 signer,迁移后不受影响 + +## Must Haves + +- [ ] `client.py` 继承 `crawler_core.http_client.HTTPClient`,使用 `crawler_core.zhilian.sign.ZhilianSign` +- [ ] `api.py` 使用 `crawler_core.base.BaseFetcher/BaseSearcher` +- [ ] `api.py` 中 1 处 `self._http.get(` 替换为 `self.http_client.get(`(第 200 行) +- [ ] `main.py` import 更新为 `crawler_core.base` +- [ ] `sign.py` 改为向后兼容桩(重新导出 `crawler_core.zhilian.sign.ZhilianSign`) +- [ ] `grep -rn "spiderJobs.core" spiderJobs/platforms/zhilian/{client,api,main,sign}.py` 无输出 +- [ ] `tests/zhilian/__init__.py` 存在 +- [ ] 
`tests/zhilian/test_zhilian_client.py` 存在 +- [ ] `pytest tests/zhilian/ -v` 全部通过(>= 12 个测试) + +--- + +## Wave 1 + +### Task 2.1: 更新 client.py + + +- `spiderJobs/platforms/zhilian/client.py`(当前内容) +- `crawler_core/http_client.py`(目标基类) +- `crawler_core/zhilian/sign.py`(ZhilianSign 新来源) + + + +修改 `spiderJobs/platforms/zhilian/client.py`: + +1. 第 10 行改为: + ```python + from crawler_core.http_client import HTTPClient + ``` +2. 第 11 行改为: + ```python + from crawler_core.zhilian.sign import ZhilianSign + ``` + (删除 `from spiderJobs.core.http_client import HTTPClient` 和 `from spiderJobs.platforms.zhilian.sign import ZhilianSign`) + +**注意:** `ZhilianClient.get/post` 方法覆写了父类,并调用 `self.signer.sign_headers(page_code)`,这是 ZhilianSign 的接口,迁移后不受影响(接口签名完全一致)。 +其他所有内容不变。 + + + +- `grep "from crawler_core.http_client import HTTPClient" spiderJobs/platforms/zhilian/client.py` 有输出 +- `grep "from crawler_core.zhilian.sign import ZhilianSign" spiderJobs/platforms/zhilian/client.py` 有输出 +- `grep "spiderJobs.core" spiderJobs/platforms/zhilian/client.py` 无输出 +- `python -c "from spiderJobs.platforms.zhilian.client import ZhilianClient"` 无 ImportError + + +--- + +### Task 2.2: 更新 api.py + + +- `spiderJobs/platforms/zhilian/api.py`(当前完整内容,229 行) +- `crawler_core/base.py`(BaseFetcher、BaseSearcher 接口) + + + +修改 `spiderJobs/platforms/zhilian/api.py`: + +1. 第 10 行改为: + ```python + from crawler_core.base import BaseFetcher, BaseSearcher + ``` + (删除 `from spiderJobs.core.base import BaseFetcher, BaseSearcher`) + +2. 
第 200 行(`SearchCompanyPositions._request()`)改为: + ```python + return self.http_client.get(self.ENDPOINT, params) + ``` + (原为 `return self._http.get(self.ENDPOINT, params)`) + +**注意:** zhilian api.py 无 ApiResult(使用默认解析器),无需替换 ApiResult。 +`SearchCompanyPositions._build_params()` 中的 `self._client.signer.sign_params()` 不需要修改。 + + + +- `grep "from crawler_core.base import BaseFetcher, BaseSearcher" spiderJobs/platforms/zhilian/api.py` 有输出 +- `grep "spiderJobs.core" spiderJobs/platforms/zhilian/api.py` 无输出 +- `grep "self\._http" spiderJobs/platforms/zhilian/api.py` 无输出 +- `python -c "from spiderJobs.platforms.zhilian.api import SearchPositions, GetPositionDetail, SearchCompanyPositions"` 无 ImportError + + +--- + +### Task 2.3: 更新 main.py + + +- `spiderJobs/platforms/zhilian/main.py`(当前内容,113 行) + + + +修改 `spiderJobs/platforms/zhilian/main.py`: + +1. 第 32 行改为: + ```python + from crawler_core.base import BaseFetcher, BaseSearcher + ``` + (删除 `from spiderJobs.core.base import BaseFetcher, BaseSearcher`) + +其他内容不变(无 sign import,main.py 中签名通过 ZhilianClient 自动注入)。 + + + +- `grep "from crawler_core.base import BaseFetcher, BaseSearcher" spiderJobs/platforms/zhilian/main.py` 有输出 +- `grep "spiderJobs.core" spiderJobs/platforms/zhilian/main.py` 无输出 + + +--- + +### Task 2.4: 将 sign.py 改为向后兼容桩 + + +- `spiderJobs/platforms/zhilian/sign.py`(当前内容,87 行的独立实现) +- `crawler_core/zhilian/sign.py`(权威实现) + + + +将 `spiderJobs/platforms/zhilian/sign.py` 完全替换为: + +```python +""" +向后兼容桩 — 智联招聘签名 + +已迁移至 crawler_core.zhilian.sign。 +直接从 crawler_core 重新导出,避免下游代码出现 ImportError。 +""" + +from crawler_core.zhilian.sign import ZhilianSign # noqa: F401 + +__all__ = ["ZhilianSign"] +``` + + + +- `grep "from crawler_core.zhilian.sign import ZhilianSign" spiderJobs/platforms/zhilian/sign.py` 有输出 +- `python -c "from spiderJobs.platforms.zhilian.sign import ZhilianSign; print(ZhilianSign().generate_uuid())"` 成功打印 UUID + + +--- + +### Task 2.5: 创建 tests/zhilian/__init__.py + + +创建 
`tests/zhilian/__init__.py`,内容:`# tests/zhilian/` + + +--- + +### Task 2.6: 编写 tests/zhilian/test_zhilian_client.py + + +- `spiderJobs/platforms/zhilian/api.py`(迁移后版本) +- `spiderJobs/platforms/zhilian/client.py`(迁移后版本) +- `crawler_core/zhilian/sign.py`(ZhilianSign 接口) +- `tests/boss/test_boss_client.py`(参考风格) + + + +创建 `tests/zhilian/test_zhilian_client.py`,包含以下测试: + +```python +""" +智联招聘 HTTP 层 mock 测试(QUAL-03 / ARCH-05) + +使用 MagicMock 替代真实 HTTP 客户端,无网络依赖。 +""" + +from __future__ import annotations +from unittest.mock import MagicMock +from crawler_core.zhilian.sign import ZhilianSign +from spiderJobs.platforms.zhilian.api import ( + SearchPositions, GetPositionDetail, GetCompanyExtDetail, + GetCompanyDetail, SearchCompanyPositions, +) +from spiderJobs.platforms.zhilian.client import ZhilianClient +from crawler_core.base import Result + + +# ── 1. SearchPositions(POST cgate)───────────────────── + +class TestSearchPositions: + + def _make_client(self, status_code=200, data=None): + mock_client = MagicMock() + mock_client.post.return_value = (status_code, data or {}) + return mock_client + + def test_search_success_returns_list(self): + data = { + "data": {"list": [{"title": "Python 工程师"}], "numFound": 1}, + "pageInfo": {"pageNum": 1, "pageSize": 15, "totalNum": 1} + } + searcher = SearchPositions(keyword="Python", city_code=538, + client=self._make_client(200, data)) + result = searcher.search(page_index=1) + assert result.success is True + + def test_search_http_error(self): + searcher = SearchPositions(client=self._make_client(403, {})) + result = searcher.search(page_index=1) + assert result.success is False + assert result.status_code == 403 + + +# ── 2. 
GetPositionDetail(GET cgate)──────────────────── + +class TestGetPositionDetail: + + def test_fetch_success(self): + mock_client = MagicMock() + mock_client.get.return_value = (200, {"data": {"jobName": "高级工程师"}}) + fetcher = GetPositionDetail(number="CC123456", client=mock_client) + result = fetcher.fetch() + assert result.success is True + + def test_fetch_404(self): + mock_client = MagicMock() + mock_client.get.return_value = (404, {}) + fetcher = GetPositionDetail(number="notexist", client=mock_client) + result = fetcher.fetch() + assert result.success is False + assert result.status_code == 404 + + +# ── 3. GetCompanyExtDetail(GET cgate)────────────────── + +class TestGetCompanyExtDetail: + + def test_fetch_success(self): + mock_client = MagicMock() + mock_client.get.return_value = (200, {"data": {"companyName": "测试公司"}}) + fetcher = GetCompanyExtDetail( + company_name="测试公司", company_number="CZ123", client=mock_client) + result = fetcher.fetch() + assert result.success is True + + +# ── 4. GetCompanyDetail(GET cgate)───────────────────── + +class TestGetCompanyDetail: + + def test_fetch_success(self): + mock_client = MagicMock() + mock_client.get.return_value = (200, {"data": {"companyNumber": "CZ123"}}) + fetcher = GetCompanyDetail(number="CZ123", client=mock_client) + result = fetcher.fetch() + assert result.success is True + + def test_fetch_http_error(self): + mock_client = MagicMock() + mock_client.get.return_value = (500, {}) + fetcher = GetCompanyDetail(number="CZ123", client=mock_client) + result = fetcher.fetch() + assert result.success is False + + +# ── 5. 
SearchCompanyPositions(GET capi)──────────────── + +class TestSearchCompanyPositions: + + def test_search_success(self): + mock_signer = MagicMock(spec=ZhilianSign) + mock_signer.sign_params.return_value = {"at": "", "rt": ""} + mock_client = MagicMock() + mock_client.signer = mock_signer + mock_client.get.return_value = (200, {"data": {"list": [{"jobName": "测试岗位"}]}, + "pageInfo": {}}) + searcher = SearchCompanyPositions(company_id="CZ123", client=mock_client) + result = searcher.search(page_index=1) + assert result.success is True + assert mock_signer.sign_params.called + + def test_search_http_error(self): + mock_signer = MagicMock(spec=ZhilianSign) + mock_signer.sign_params.return_value = {} + mock_client = MagicMock() + mock_client.signer = mock_signer + mock_client.get.return_value = (403, {}) + searcher = SearchCompanyPositions(company_id="CZ123", client=mock_client) + result = searcher.search(page_index=1) + assert result.success is False + + +# ── 6. ZhilianClient — 签名头注入 ─────────────────────── + +class TestZhilianClientHeaders: + + def test_sign_headers_injects_at_rt(self): + signer = ZhilianSign(at="mock_at", rt="mock_rt") + client = ZhilianClient(signer=signer) + headers = client.signer.sign_headers() + assert headers["x-zp-at"] == "mock_at" + assert headers["x-zp-rt"] == "mock_rt" + + def test_sign_headers_has_required_keys(self): + client = ZhilianClient() + headers = client.signer.sign_headers() + for key in ["x-zp-at", "x-zp-rt", "x-zp-action-id", "x-zp-device-id"]: + assert key in headers + + def test_default_signer_empty_tokens(self): + client = ZhilianClient() + headers = client.signer.sign_headers() + assert headers["x-zp-at"] == "" + assert headers["x-zp-rt"] == "" +``` + + + +- `test -f tests/zhilian/test_zhilian_client.py && echo "OK"` 输出 OK +- `pipenv run python -m pytest tests/zhilian/ -v` 全部通过(>= 12 个测试) + + +--- + +## Verification + +```bash +# 1. 
验证所有 zhilian 模块 import 正确 +pipenv run python -c " +from spiderJobs.platforms.zhilian.client import ZhilianClient, create_cgate_client, create_capi_client +from spiderJobs.platforms.zhilian.api import SearchPositions, GetPositionDetail, GetCompanyDetail, SearchCompanyPositions +from spiderJobs.platforms.zhilian.main import main, create_searcher +from crawler_core.base import BaseFetcher, BaseSearcher +assert issubclass(SearchPositions, BaseSearcher) +assert issubclass(GetPositionDetail, BaseFetcher) +print('✅ 所有 zhilian 模块 import 成功,继承关系正确') +" + +# 2. 确认无旧依赖残留 +grep -rn "spiderJobs.core" spiderJobs/platforms/zhilian/client.py spiderJobs/platforms/zhilian/api.py spiderJobs/platforms/zhilian/main.py spiderJobs/platforms/zhilian/sign.py && echo "❌ 仍有旧依赖" || echo "✅ 无旧依赖" + +# 3. 运行 mock 测试 +pipenv run python -m pytest tests/zhilian/ -v + +# 4. 三平台全量回归 +pipenv run python -m pytest tests/ -v --tb=short +``` diff --git a/.planning/phases/03-qcwy-zhilian/03-RESEARCH.md b/.planning/phases/03-qcwy-zhilian/03-RESEARCH.md new file mode 100644 index 0000000..54956ed --- /dev/null +++ b/.planning/phases/03-qcwy-zhilian/03-RESEARCH.md @@ -0,0 +1,107 @@ +# Phase 3: 前程无忧 & 智联重写 — 技术研究 + +**研究日期:** 2026-03-21 +**阶段目标:** 前程无忧和智联招聘爬虫完全基于 crawler_core 运行,三平台统一使用新基类 + +--- + +## 1. 
现状分析 + +### 1.1 crawler_core 现有基础(Phase 1 完成) + +| 文件 | 内容 | +|------|------| +| `crawler_core/qcwy/sign.py` | `Job51Sign.build_sign_path()` — HMAC-SHA256 签名 | +| `crawler_core/zhilian/sign.py` | `ZhilianSign.sign_headers()/sign_params()` — 智联多类型签名 | +| `crawler_core/http_client.py` | `HTTPClient` — TLS 伪装 + 代理 + tenacity 重试 | +| `crawler_core/base.py` | `Result[T]`, `BaseFetcher`, `BaseSearcher` | + +### 1.2 前程无忧(job51)待迁移层 + +`spiderJobs/platforms/job51/` 下已有全部文件: + +| 文件 | 旧依赖 | 迁移目标 | +|------|--------|---------| +| `client.py` | `spiderJobs.core.http_client.HTTPClient` + `spiderJobs.platforms.job51.sign.Job51Sign` | → `crawler_core.http_client.HTTPClient` + `crawler_core.qcwy.sign.Job51Sign` | +| `api.py` | `spiderJobs.core.base.ApiResult/BaseFetcher/BaseSearcher` | → `crawler_core.base.Result/BaseFetcher/BaseSearcher` | +| `main.py` | `spiderJobs.core.base.BaseFetcher/BaseSearcher` | → `crawler_core.base.BaseFetcher/BaseSearcher` | +| `sign.py` | 独立实现(与 crawler_core/qcwy/sign.py 相同) | → 向后兼容桩,重新导出 `Job51Sign` | + +**job51/api.py 具体变更:** +- 第 14 行 import 替换 +- `ApiResult` 全量替换为 `Result`(共 11 处) +- 第 164 行:`self._http.get(endpoint)` → `self.http_client.get(endpoint)` +- 第 208 行:`self._http.get(self.ENDPOINT, ...)` → `self.http_client.get(self.ENDPOINT, ...)` + +### 1.3 智联招聘(zhilian)待迁移层 + +`spiderJobs/platforms/zhilian/` 下已有全部文件: + +| 文件 | 旧依赖 | 迁移目标 | +|------|--------|---------| +| `client.py` | `spiderJobs.core.http_client.HTTPClient` + `spiderJobs.platforms.zhilian.sign.ZhilianSign` | → `crawler_core.http_client.HTTPClient` + `crawler_core.zhilian.sign.ZhilianSign` | +| `api.py` | `spiderJobs.core.base.BaseFetcher/BaseSearcher` | → `crawler_core.base.BaseFetcher/BaseSearcher` | +| `main.py` | `spiderJobs.core.base.BaseFetcher/BaseSearcher` | → `crawler_core.base.BaseFetcher/BaseSearcher` | +| `sign.py` | 独立实现(与 crawler_core/zhilian/sign.py 相同) | → 向后兼容桩,重新导出 `ZhilianSign` | + +**zhilian/api.py 具体变更:** +- 第 10 行 import 替换(无 ApiResult,zhilian 使用 crawler_core 
的默认解析器,无需自定义 _parse_response) +- 第 200 行:`return self._http.get(` → `return self.http_client.get(` + +**重要差异:** 智联 api.py 中 `SearchCompanyPositions._build_params()` 第 184 行使用了 `self._client.signer.sign_params()`,这是通过 `self._client`(设为传入的 ZhilianClient)间接访问 signer 的,迁移后不受影响(属性名不变)。 + +--- + +## 2. 对比 Phase 2(Boss)的工作量 + +| 维度 | Phase 2 Boss | Phase 3 job51 | Phase 3 zhilian | +|------|-------------|---------------|-----------------| +| `ApiResult` 替换 | 11 处 | 11 处 | 0 处(无自定义解析器) | +| `self._http` 替换 | 3 处 | 2 处 | 1 处 | +| sign.py → 桩 | ✓ | ✓ | ✓ | +| client.py import | ✓ | ✓ | ✓ | +| api.py import | ✓ | ✓ | ✓ | +| main.py import | ✓ | ✓ | ✓ | + +--- + +## 3. mock 测试策略 + +### 3.1 job51 测试(tests/job51/test_job51_client.py) + +job51 的 mock 策略与 Boss 完全一致:用 `MagicMock()` mock `http_client`,测试: +- `_parse_job51_response`(纯函数,覆盖 status=1 成功、非 1 失败、HTTP 错误) +- `SearchRecommendJobs.search()`(正常、HTTP 错误) +- `GetJobDetail.fetch()`(成功,异常捕获) +- `GetCompanyDetail.fetch()`(成功) +- `Job51Client._job51_headers()`(sign 注入) + +### 3.2 zhilian 测试(tests/zhilian/test_zhilian_client.py) + +智联的请求走 cgate(搜索用 POST、详情用 GET)或 capi(GET),mock 方式相同: +- `SearchPositions.search()`(正常、错误) +- `GetPositionDetail.fetch()`(成功) +- `SearchCompanyPositions.search()`(成功,特别验证 sign_params 被调用) +- `ZhilianClient.post/get`(验证签名头注入) + +--- + +## 4. 成功标准验证 + +| 标准 | 验证方式 | +|------|---------| +| job51 继承 BaseFetcher/BaseSearcher | `issubclass()` 断言 | +| zhilian 继承 BaseFetcher/BaseSearcher | `issubclass()` 断言 | +| 两平台无内联签名或 HTTP 样板 | `grep` 确认 client/api 中无 `requests` import、无 `hmac` 调用 | +| mock 测试通过 | `pytest tests/job51/ tests/zhilian/` | +| 三平台代码结构一致 | 代码审查:client/api/sign/main 四文件结构 | + +--- + +## RESEARCH COMPLETE + +**Phase 3 可以规划,分 2 个 PLAN:** +- **Plan 01**:迁移前程无忧(job51)层 + mock 测试 +- **Plan 02**:迁移智联招聘(zhilian)层 + mock 测试 + +两个 Plan 无依赖关系,理论上可并行,但顺序执行更稳妥。