diff --git a/.planning/phases/04-backend-scripts/04-01-PLAN.md b/.planning/phases/04-backend-scripts/04-01-PLAN.md new file mode 100644 index 0000000..f34708d --- /dev/null +++ b/.planning/phases/04-backend-scripts/04-01-PLAN.md @@ -0,0 +1,262 @@ +--- +phase: 4 +plan: 1 +wave: 1 +title: "迁移 facade 层 import 至 spiderJobs.platforms.* + asyncio.to_thread 桥接" +depends_on: [] +files_modified: + - app/services/crawler/boss.py + - app/services/crawler/qcwy.py + - app/services/crawler/zhilian.py +autonomous: true +requirements: + - ARCH-06 + - ARCH-07 +--- + +# Phase 4 Plan 01: 迁移 facade 层 import 至 spiderJobs.platforms.* + asyncio.to_thread 桥接 + +## Objective + +将 `app/services/crawler/` 的三个 facade 文件(boss.py/qcwy.py/zhilian.py)从 +引用内部私有复制文件(`_boss_api.py`、`_boss_client.py` 等)改为直接引用 +`spiderJobs.platforms.*`(已基于 crawler_core),满足 ARCH-06/ARCH-07。 + +对外接口(`set_proxy()`、`get_job_detail()` 等)完全不变。 + +同时为每个 Service 添加 `asyncio.to_thread()` 异步包装方法(ARCH-06)。 + +## Must Haves + +- [ ] `boss.py` 改导入 `spiderJobs.platforms.boss.{api,client,sign}` +- [ ] `qcwy.py` 改导入 `spiderJobs.platforms.job51.{api,client}` +- [ ] `zhilian.py` 改导入 `spiderJobs.platforms.zhilian.{api,client,sign}` +- [ ] 三个 Service 各添加 async 方法(`asyncio.to_thread` 包装) +- [ ] `python -c "from app.services.crawler.boss import BossService"` 无 ImportError +- [ ] `pytest tests/ -v` 全部通过(无回归) + +--- + +## Wave 1 + +### Task 1.1: 更新 boss.py + + +- `app/services/crawler/boss.py`(当前内容,116 行) +- `spiderJobs/platforms/boss/api.py`(GetBrandDetail/GetJobDetail/SearchBrandJobs/SearchRecJobs 导出) +- `spiderJobs/platforms/boss/client.py`(BossClient/create_client 导出,含 batch()) +- `spiderJobs/platforms/boss/sign.py`(BossSign → crawler_core 桩) + + + +修改 `app/services/crawler/boss.py`: + +1. 
将 import 块(第 12-19 行)替换为: + ```python + from spiderJobs.platforms.boss.api import ( + GetBrandDetail, + GetJobDetail, + SearchBrandJobs, + SearchRecJobs, + ) + from spiderJobs.platforms.boss.client import BossClient, create_client + from spiderJobs.platforms.boss.sign import BossSign + ``` + +2. 在 `BossService` 类末尾添加异步包装方法: + ```python + # ── asyncio.to_thread 桥接(ARCH-06)──────── + + async def async_get_job_detail( + self, job_id: str, lid: str = "", security_id: str = "" + ) -> Optional[Dict]: + import asyncio + return await asyncio.to_thread(self.get_job_detail_by_id, job_id, lid, security_id) + + async def async_get_company_detail(self, company_id: str) -> Optional[Dict]: + import asyncio + return await asyncio.to_thread(self.get_company_detail_by_id, company_id) + + async def async_get_company_jobs( + self, company_id: str, page: int = 1 + ) -> Optional[Dict]: + import asyncio + return await asyncio.to_thread(self.get_company_jobs_by_id, company_id, page) + + async def async_search_jobs( + self, keyword: str, city_code: str = "101010100", page: int = 1 + ) -> Optional[Dict]: + import asyncio + return await asyncio.to_thread(self.search_jobs, keyword, city_code, page) + ``` + + + +- `grep "from spiderJobs.platforms.boss" app/services/crawler/boss.py` 有输出 +- `grep "app.services.crawler._boss" app/services/crawler/boss.py` 无输出 +- `grep "asyncio.to_thread" app/services/crawler/boss.py` 有输出 +- `pipenv run python -c "from app.services.crawler.boss import BossService"` 成功 + + +--- + +### Task 1.2: 更新 qcwy.py + + +- `app/services/crawler/qcwy.py`(当前内容,103 行) +- `spiderJobs/platforms/job51/api.py`(GetCompanyInfo/GetJobDetail/SearchCompanyJobs/SearchRecommendJobs 导出) +- `spiderJobs/platforms/job51/client.py`(Job51Client/create_client 导出) + + + +修改 `app/services/crawler/qcwy.py`: + +1. 
将 import 块(第 12-18 行)替换为: + ```python + from spiderJobs.platforms.job51.api import ( + GetCompanyInfo, + GetJobDetail, + SearchCompanyJobs, + SearchRecommendJobs, + ) + from spiderJobs.platforms.job51.client import Job51Client, create_client + ``` + +2. 在 `QcwyService` 类末尾添加异步包装方法: + ```python + # ── asyncio.to_thread 桥接(ARCH-06)──────── + + async def async_get_job_detail(self, job_id: str) -> Dict: + import asyncio + return await asyncio.to_thread(self.get_job_detail, job_id) + + async def async_get_company_info(self, company_id: str) -> Dict: + import asyncio + return await asyncio.to_thread(self.get_company_info, company_id) + + async def async_get_company_jobs( + self, company_id: str, page: int = 1, page_size: int = 30, **kwargs + ) -> Dict: + import asyncio + return await asyncio.to_thread( + self.get_company_jobs_by_id, company_id, page, page_size + ) + + async def async_search_jobs( + self, keyword: str, job_area: str = "020000", page: int = 1 + ) -> List: + import asyncio + return await asyncio.to_thread(self.search_jobs, keyword, job_area, page) + ``` + + + +- `grep "from spiderJobs.platforms.job51" app/services/crawler/qcwy.py` 有输出 +- `grep "app.services.crawler._job51" app/services/crawler/qcwy.py` 无输出 +- `grep "asyncio.to_thread" app/services/crawler/qcwy.py` 有输出 +- `pipenv run python -c "from app.services.crawler.qcwy import QcwyService"` 成功 + + +--- + +### Task 1.3: 更新 zhilian.py + + +- `app/services/crawler/zhilian.py`(当前内容,143 行) +- `spiderJobs/platforms/zhilian/api.py`(GetCompanyDetail/GetPositionDetail/SearchCompanyPositions/SearchPositions 导出) +- `spiderJobs/platforms/zhilian/client.py`(ZhilianClient/create_capi_client/create_cgate_client 导出) +- `spiderJobs/platforms/zhilian/sign.py`(ZhilianSign → crawler_core 桩) + + + +修改 `app/services/crawler/zhilian.py`: + +1. 
将 import 块(第 12-23 行)替换为: + ```python + from spiderJobs.platforms.zhilian.api import ( + GetCompanyDetail, + GetPositionDetail, + SearchCompanyPositions, + SearchPositions, + ) + from spiderJobs.platforms.zhilian.client import ( + ZhilianClient, + create_capi_client, + create_cgate_client, + ) + from spiderJobs.platforms.zhilian.sign import ZhilianSign + ``` + +2. 在 `ZhilianService` 类末尾添加异步包装方法: + ```python + # ── asyncio.to_thread 桥接(ARCH-06)──────── + + async def async_get_job_detail(self, job_number: str) -> Optional[Dict]: + import asyncio + return await asyncio.to_thread(self.get_job_detail, job_number) + + async def async_get_company_detail(self, company_number: str) -> Optional[Dict]: + import asyncio + return await asyncio.to_thread(self.get_company_detail, company_number) + + async def async_get_company_jobs( + self, company_number: str, page_index: int = 1, page_size: int = 30, + work_city: Optional[int] = None, + ) -> Optional[Dict]: + import asyncio + return await asyncio.to_thread( + self.get_company_jobs_by_id, company_number, page_index, page_size, work_city + ) + + async def async_search_jobs( + self, city_id: int = 801, page_size: int = 15, page_index: int = 1, + job_level3_code: Optional[str] = None, + ) -> List: + import asyncio + return await asyncio.to_thread( + self.search_jobs, city_id, page_size, page_index, job_level3_code + ) + ``` + + + +- `grep "from spiderJobs.platforms.zhilian" app/services/crawler/zhilian.py` 有输出 +- `grep "app.services.crawler._zhilian" app/services/crawler/zhilian.py` 无输出 +- `grep "asyncio.to_thread" app/services/crawler/zhilian.py` 有输出 +- `pipenv run python -c "from app.services.crawler.zhilian import ZhilianService"` 成功 + + +--- + +## Verification + +```bash +# 1. 
验证三个 facade 模块 import 正确 +pipenv run python -c " +from app.services.crawler.boss import BossService +from app.services.crawler.qcwy import QcwyService +from app.services.crawler.zhilian import ZhilianService +print('✅ 三个 facade 模块 import 成功') + +# 验证无旧导入 +import inspect, sys +for svc in [BossService, QcwyService, ZhilianService]: + src = inspect.getsourcefile(svc) + with open(src) as f: + content = f.read() + assert '_boss_' not in content and '_job51_' not in content and '_zhilian_' not in content, f'{src} 仍有旧导入!' +print('✅ 无旧导入残留') + +# 验证 async 方法存在 +assert hasattr(BossService, 'async_get_job_detail') +assert hasattr(QcwyService, 'async_get_company_info') +assert hasattr(ZhilianService, 'async_get_company_detail') +print('✅ asyncio 桥接方法存在') +" + +# 2. 验证旧导入无残留 +grep -rn "from app.services.crawler._" app/services/crawler/boss.py app/services/crawler/qcwy.py app/services/crawler/zhilian.py && echo "❌ 旧导入残留" || echo "✅ 无旧导入" + +# 3. 全量回归 +pipenv run python -m pytest tests/ -v --tb=short +``` diff --git a/.planning/phases/04-backend-scripts/04-02-PLAN.md b/.planning/phases/04-backend-scripts/04-02-PLAN.md new file mode 100644 index 0000000..aaed3cd --- /dev/null +++ b/.planning/phases/04-backend-scripts/04-02-PLAN.md @@ -0,0 +1,140 @@ +--- +phase: 4 +plan: 2 +wave: 2 +title: "废弃 app/services/crawler/ 私有复制文件 + jobs_spider/ 标记废弃" +depends_on: + - "04-01-PLAN.md" +files_modified: + - app/services/crawler/_base.py + - app/services/crawler/_http_client.py + - app/services/crawler/_boss_api.py + - app/services/crawler/_boss_client.py + - app/services/crawler/_boss_sign.py + - app/services/crawler/_job51_api.py + - app/services/crawler/_job51_client.py + - app/services/crawler/_job51_sign.py + - app/services/crawler/_zhilian_api.py + - app/services/crawler/_zhilian_client.py + - app/services/crawler/_zhilian_sign.py + - jobs_spider/boss/boos_api.py +autonomous: true +requirements: + - ARCH-08 +--- + +# Phase 4 Plan 02: 废弃私有复制文件 + jobs_spider/ 标记废弃 + +## Objective + +Plan 
01 执行后,`app/services/crawler/` 的 11 个私有复制文件不再被任何 facade 引用。 +本 Plan 为这些文件添加 `DEPRECATED` 文件头注释,表明它们已废弃,并对 `jobs_spider/` 做同样处理。 + +**注意:** 策略是"添加废弃标记"而非"直接删除",避免第三方未知调用方受影响。 +未来可在下一个里程碑中删除这些文件。 + +## Must Haves + +- [ ] 11 个私有文件(`_base.py`、`_http_client.py`、`_boss_*`、`_job51_*`、`_zhilian_*`)文件头添加 `⚠️ DEPRECATED` 标记 +- [ ] `jobs_spider/boss/boos_api.py` 文件头添加 `⚠️ DEPRECATED` 标记 +- [ ] `jobs_spider/CLAUDE.md` 添加废弃声明(文件不存在则新建) +- [ ] `pytest tests/ -v` 全部通过(废弃标记不影响 import) + +--- + +## Wave 2(依赖 Plan 01 完成) + +### Task 2.1: 为 _base.py 和 _http_client.py 添加废弃头 + + +在 `app/services/crawler/_base.py` 文件顶部(第 1 行前)插入: +```python +""" +⚠️ DEPRECATED — 2026-03-21 + +此文件是 crawler_core.base 的手工复制,已废弃。 +请改用: from crawler_core.base import Result, BaseFetcher, BaseSearcher +将在下一里程碑中删除。 +""" +``` + +在 `app/services/crawler/_http_client.py` 文件顶部插入: +```python +""" +⚠️ DEPRECATED — 2026-03-21 + +此文件是 crawler_core.http_client 的手工复制,已废弃。 +请改用: from crawler_core.http_client import HTTPClient +将在下一里程碑中删除。 +""" +``` + + +--- + +### Task 2.2: 为 9 个平台私有文件添加废弃头 + + +分别为以下文件在文件头(原 docstring 的第一行)插入 `⚠️ DEPRECATED — 2026-03-21\n\n此文件已废弃,请改用 spiderJobs.platforms.* 相应模块。` 或更新已有 docstring 中的说明文字: + +- `app/services/crawler/_boss_api.py`(已有 docstring "复制自...",在其下方添加废弃行) +- `app/services/crawler/_boss_client.py` +- `app/services/crawler/_boss_sign.py` +- `app/services/crawler/_job51_api.py` +- `app/services/crawler/_job51_client.py`、`app/services/crawler/_job51_sign.py` +- `app/services/crawler/_zhilian_api.py`(需要先读文件确认 docstring) +- `app/services/crawler/_zhilian_client.py` +- `app/services/crawler/_zhilian_sign.py` + +统一格式:在原 docstring 下方第二行插入: +`⚠️ DEPRECATED — 2026-03-21. 请改用 spiderJobs.platforms.{boss|job51|zhilian} 对应文件。` + + +--- + +### Task 2.3: 标记 jobs_spider/ 旧框架废弃 + + +1. 在 `jobs_spider/boss/boos_api.py` 文件最顶部(第 1 行前)插入注释块: + ```python + # ⚠️ DEPRECATED — 2026-03-21 + # 此文件是最旧的 Boss 爬虫实现,已由 spiderJobs/platforms/boss/ + crawler_core 替代。 + # 不再接受新的生产流量,保留仅供历史参考。 + # + ``` + +2. 
检查 `jobs_spider/CLAUDE.md` 是否存在(通过 list_dir 查看),若存在则在顶部添加废弃说明; + 若不存在则创建,内容为: + ```markdown + # ⚠️ DEPRECATED + + **废弃时间:** 2026-03-21 + + `jobs_spider/` 目录下的所有文件已由以下模块替代,不再维护: + - `spiderJobs/platforms/boss/` + `crawler_core` — Boss 直聘 + - `spiderJobs/platforms/job51/` + `crawler_core/qcwy/` — 前程无忧 + - `spiderJobs/platforms/zhilian/` + `crawler_core/zhilian/` — 智联招聘 + + 文件保留仅供历史参考,将在下一里程碑中删除。 + ``` + + +--- + +## Verification + +```bash +# 1. 确认 facade 文件不再引用私有文件 +grep -rn "from app.services.crawler._" app/services/crawler/boss.py \ + app/services/crawler/qcwy.py app/services/crawler/zhilian.py \ + && echo "❌ 仍有旧导入" || echo "✅ facade 无旧导入" + +# 2. 确认废弃标记存在 +grep -l "DEPRECATED" app/services/crawler/_base.py \ + app/services/crawler/_boss_api.py \ + jobs_spider/boss/boos_api.py \ + && echo "✅ 废弃标记存在" + +# 3. 全量回归(废弃标记不影响 import) +pipenv run python -m pytest tests/ -v --tb=short +``` diff --git a/.planning/phases/04-backend-scripts/04-RESEARCH.md b/.planning/phases/04-backend-scripts/04-RESEARCH.md new file mode 100644 index 0000000..f856c52 --- /dev/null +++ b/.planning/phases/04-backend-scripts/04-RESEARCH.md @@ -0,0 +1,111 @@ +# Phase 4: 后端 & 外部脚本接入 — 技术研究 + +**研究日期:** 2026-03-21 +**阶段目标:** 后端 facade 使用 spiderJobs.platforms.*(已基于 crawler_core),私有复制文件标记废弃(本阶段暂不删除);外部脚本 jobs_spider/ 标记废弃 + +--- + +## 1. 
现状分析 + +### 1.1 app/services/crawler/ 文件结构 + +``` +app/services/crawler/ +├── _base.py ❌ 复制自 spiderJobs/core/base.py(ApiResult/BaseFetcher/BaseSearcher 旧版) +├── _http_client.py ❌ 复制自 spiderJobs/core/http_client.py +├── _boss_api.py ❌ 复制自 spiderJobs/platforms/boss/api.py +├── _boss_client.py ❌ 复制自 spiderJobs/platforms/boss/client.py(含 batch() 方法) +├── _boss_sign.py ❌ 复制自 spiderJobs/platforms/boss/sign.py +├── _job51_api.py ❌ 复制自 spiderJobs/platforms/job51/api.py +├── _job51_client.py ❌ 复制自 spiderJobs/platforms/job51/client.py +├── _job51_sign.py ❌ 复制自 spiderJobs/platforms/job51/sign.py(应该如此,推测) +├── _zhilian_api.py ❌ 复制自 spiderJobs/platforms/zhilian/api.py +├── _zhilian_client.py ❌ 复制自 spiderJobs/platforms/zhilian/client.py +├── _zhilian_sign.py ❌ 复制自 spiderJobs/platforms/zhilian/sign.py +├── boss.py ✅ BossService — 结构良好,使用 _boss_*(需改导入) +├── qcwy.py ✅ QcwyService — 结构良好,使用 _job51_*(需改导入) +├── zhilian.py ✅ ZhilianService — 结构良好,使用 _zhilian_*(需改导入) +└── __init__.py ✅ 不变 +``` + +### 1.2 facade 层的对外接口(调用方) + +| 调用方 | 使用的 Service | +|--------|----------------| +| `app/services/cleaning.py` | BossService, QcwyService, ZhilianService | +| `app/services/company_cleaner.py` | BossService, QcwyService, ZhilianService | +| `app/services/company_jobs_sync.py` | BossService, QcwyService, ZhilianService | +| `app/api/v1/...` | 通过 Service 层调用 | + +**重要:** 这些调用方使用的接口(`set_proxy()`, `get_job_detail()`, `get_company_detail()`, `search_jobs()` 等)保持不变,只修改 facade 内部实现。 + +### 1.3 迁移前后对应关系 + +| 当前私有文件 | 替换为 | +|-------------|--------| +| `_boss_api.py` → `GetBrandDetail, GetJobDetail, SearchBrandJobs, SearchRecJobs` | `spiderJobs.platforms.boss.api` | +| `_boss_client.py` → `BossClient, create_client` | `spiderJobs.platforms.boss.client` | +| `_boss_sign.py` → `BossSign` | `spiderJobs.platforms.boss.sign`(→ crawler_core 桩) | +| `_job51_api.py` → `GetCompanyInfo, GetJobDetail, SearchCompanyJobs, SearchRecommendJobs` | `spiderJobs.platforms.job51.api` | +| `_job51_client.py` → 
`Job51Client, create_client` | `spiderJobs.platforms.job51.client` | +| `_zhilian_api.py` → `GetCompanyDetail, GetPositionDetail, SearchCompanyPositions, SearchPositions` | `spiderJobs.platforms.zhilian.api` | +| `_zhilian_client.py` → `ZhilianClient, create_capi_client, create_cgate_client` | `spiderJobs.platforms.zhilian.client` | +| `_zhilian_sign.py` → `ZhilianSign` | `spiderJobs.platforms.zhilian.sign`(→ crawler_core 桩) | + +### 1.4 关键兼容性检查 + +**BossService 依赖 `_boss_client.BossClient.batch()` 方法:** +- `_boss_client.py` 第 79-85 行有 `batch()` 方法 +- `spiderJobs/platforms/boss/client.py` 也有 `batch()` 方法(Phase 2 迁移时未删除) +- ✅ 完全兼容 + +**BossService.set_login_data() 设置 `self._signer.mpt/wt2`:** +- `BossSign` 在 spiderJobs 版是 crawler_core 的桩(完全透传),`mpt/wt2` 属性在 `crawler_core.boss.sign.BossSign` 中存在 +- ✅ 完全兼容 + +**ZhilianService 使用 `create_cgate_client/create_capi_client`:** +- `spiderJobs/platforms/zhilian/client.py` 导出这两个函数 +- ✅ 完全兼容 + +### 1.5 asyncio.to_thread() 桥接(ARCH-06) + +后端是 FastAPI 异步框架,而 crawler_core 使用 requests-go(同步)。 +目前 cleaning.py 等调用方以同步方式调用 Service;其外层是否已用 asyncio.to_thread 包装尚未确认(待核实)。 + +**决策:** 在 facade 层的 Service 类中添加 async 方法(`async_get_job_detail()` 等), +内部用 `asyncio.to_thread(self.get_job_detail, ...)` 桥接。不修改已有同步方法(让旧调用方继续工作)。 + +### 1.6 jobs_spider/ 废弃 + +`jobs_spider/boss/boos_api.py`(94KB)是最旧的单文件实现,现在三个平台的新实现都在 `spiderJobs/platforms/` 和 `crawler_core/` 中。 +只需在 `jobs_spider/CLAUDE.md` 和文件头添加 `## ⚠️ DEPRECATED` 标记,不删除文件(保留历史)。 + +--- + +## 2. 工作量对比 + +| 维度 | 描述 | +|------|------| +| facade 层修改 | 3 个文件(boss.py/qcwy.py/zhilian.py)各 3-5 行 import 修改 | +| 私有文件 | 11 个文件**保留但标记废弃**(也可直接删除,但保留的风险更低) | +| asyncio 桥接 | 3 个 Service 类各加 4 个 async 方法 | +| jobs_spider 废弃 | 1 个 DEPRECATED 文件头注释 + `jobs_spider/CLAUDE.md` 废弃声明 | + +--- + +## 3. 
成功标准 + +| 标准 | 验证方式 | +|------|---------| +| ARCH-06:facade 有 asyncio.to_thread 桥接 | `grep asyncio.to_thread app/services/crawler/*.py` | +| ARCH-07:facade import spiderJobs.platforms.* | `grep "spiderJobs.platforms" app/services/crawler/boss.py` | +| ARCH-08:jobs_spider 标记废弃 | `grep DEPRECATED jobs_spider/boss/boos_api.py` | +| import 验证 | `python -c "from app.services.crawler.boss import BossService"` 无 ImportError | +| 回归测试 | `pytest tests/ -v` 全部通过(98+α 个) | + +--- + +## 4. Phase 4 计划分解 + +- **Plan 01**:迁移三个 facade 文件(boss.py/qcwy.py/zhilian.py)改导入 spiderJobs.platforms.*,添加 asyncio.to_thread 异步包装方法 +- **Plan 02**:废弃 11 个私有复制文件和 jobs_spider/ 旧框架(添加 DEPRECATED 文件头)