diff --git a/.planning/phases/06-quality-frontend/06-01-PLAN.md b/.planning/phases/06-quality-frontend/06-01-PLAN.md new file mode 100644 index 0000000..ce9e02b --- /dev/null +++ b/.planning/phases/06-quality-frontend/06-01-PLAN.md @@ -0,0 +1,92 @@ +--- +phase: 6 +plan: 1 +wave: 1 +title: "三平台数据解析函数单元测试(QUAL-02)" +depends_on: [] +files_modified: + - tests/ingest/test_configs_boss.py # NEW + - tests/ingest/test_configs_qcwy.py # NEW + - tests/ingest/test_configs_zhilian.py # NEW +autonomous: true +requirements: + - QUAL-02 +--- + +# Phase 6 Plan 01: 三平台解析函数单元测试(QUAL-02) + +## Objective + +为 `app/services/ingest/configs/` 中的三平台 `_extract_*` 和 `_build_*_push` 函数 +新增单元测试,覆盖正常字段和缺字段场景。 + +去重逻辑测试(dedup.py)已在 Phase 5 完成(6 个测试),本 Plan 仅补充解析函数测试。 + +## Must Haves + +- [ ] `tests/ingest/test_configs_boss.py`:8 个测试,覆盖 `_extract_job_id`、`_extract_company_name`、`_build_boss_push` +- [ ] `tests/ingest/test_configs_qcwy.py`:10 个测试,覆盖 `_extract_job_id`、`_extract_update_dt`、`_extract_company_name`、`_build_qcwy_push`(含 welfare 列表场景) +- [ ] `tests/ingest/test_configs_zhilian.py`:9 个测试,覆盖 `_extract_number`、`_extract_fpt`、`_extract_company_name`、`_build_zhilian_push` +- [ ] `pipenv run python -m pytest tests/ingest/ -v --tb=short` 全部绿色(含原有 dedup 6 个) +- [ ] `pipenv run python -m pytest tests/ -v` 全量通过 + +--- + +## Wave 1 + +### Task 1.1: tests/ingest/test_configs_boss.py + +**测试清单:** +1. `test_extract_job_id_from_jobBaseInfoVO` — 正常嵌套字段 +2. `test_extract_job_id_missing` — 缺 jobBaseInfoVO → None +3. `test_extract_company_name_from_name` — data["name"] 直接取 +4. `test_extract_company_name_from_companyFullInfoVO` — 嵌套字段 +5. `test_extract_company_name_missing` → None +6. `test_build_boss_push_full` — 完整字段,验证 source_type="Boss直聘"、url 含 encryptJobId +7. `test_build_boss_push_partial` — 缺字段不 raise,返回合理降级值 +8. `test_build_boss_push_none_data` — 空 dict,关键字段为 None + +--- + +### Task 1.2: tests/ingest/test_configs_qcwy.py + +**测试清单:** +1. `test_extract_job_id_normal` +2. 
`test_extract_job_id_missing` → None +3. `test_extract_update_dt_normal` +4. `test_extract_update_dt_missing` → None +5. `test_extract_company_name_from_companyName` +6. `test_extract_company_name_from_company_name_fallback` +7. `test_extract_company_name_missing` → None +8. `test_build_qcwy_push_welfare_list` — welfare 为对象列表,提取 chineseTitle +9. `test_build_qcwy_push_welfare_string` — welfare 为字符串 +10. `test_build_qcwy_push_partial` — 缺字段 → 合理降级,source_type="前程无忧" + +--- + +### Task 1.3: tests/ingest/test_configs_zhilian.py + +**测试清单:** +1. `test_extract_number_normal` +2. `test_extract_number_missing` → None +3. `test_extract_fpt_normal` +4. `test_extract_fpt_missing` → None +5. `test_extract_company_name_from_companyName` +6. `test_extract_company_name_from_name_fallback` +7. `test_extract_company_name_missing` → None +8. `test_build_zhilian_push_skill_labels` — skillLabel 列表,提取 value +9. `test_build_zhilian_push_partial` — 缺字段降级,source_type="智联招聘" + +--- + +## Verification + +```bash +# 运行新测试 +pipenv run python -m pytest tests/ingest/ -v --tb=short + +# 全量回归 +pipenv run python -m pytest tests/ -v --tb=short +``` + +**预期:** `tests/ingest/` 下新增 27 个测试(8 + 10 + 9)连同原有 6 个 dedup 测试共 33 个全部通过,全量 ≥ 130 个测试全绿 diff --git a/.planning/phases/06-quality-frontend/06-02-PLAN.md b/.planning/phases/06-quality-frontend/06-02-PLAN.md new file mode 100644 index 0000000..2663fe3 --- /dev/null +++ b/.planning/phases/06-quality-frontend/06-02-PLAN.md @@ -0,0 +1,191 @@ +--- +phase: 6 +plan: 2 +wave: 2 +title: "爬虫入库统计 API + 前端监控区域(QUAL-06/07)" +depends_on: + - "06-01-PLAN.md" +files_modified: + - app/api/v1/job/job.py # 新增 GET /data/stats 端点 + - web/src/views/cleaning/monitor.vue # 新增爬虫统计区域 + - web/src/api/index.js # 新增 getIngestStats API +autonomous: true +requirements: + - QUAL-06 + - QUAL-07 +--- + +# Phase 6 Plan 02: 爬虫入库统计 API + 前端监控(QUAL-06/07) + +## Objective + +### QUAL-07 状态确认(已完成) + +`cleaning/monitor.vue` 已包含: +- ✅ 待清洗公司列表(队列表格) +- ✅ 触发清洗 +- ✅ 查看结果 + +**QUAL-07 无需额外改动。** + +### QUAL-06 缺口 +
+现有监控页面仅展示公司清洗队列状态,**缺少爬虫职位入库的实时统计**: +- 各平台最近抓取时间(ClickHouse `created_at` 最大值) +- 数量趋势(近 7 天每日入库量) +- 错误状态(本 Plan 暂不实现:失败/去重统计暂不通过 ClickHouse 获取,后续可扩展) + +## Must Haves + +- [ ] 后端新增 `GET /api/v1/job/data/stats` 端点,接受 `platform`(可选)和 `days`(默认 7)参数 + - 返回:各平台 `total`、`today`、`last_ingest_at`、`daily_counts`(列表) +- [ ] 前端 `monitor.vue` 在现有 4 个 metric-card 上方新增一个"爬虫入库"统计区域: + - 3 个平台卡片,各显示:总量、今日、最近抓取时间 + - 一个数量趋势表格(近 7 天,按日显示 boss/qcwy/zhilian) +- [ ] `web/src/api/index.js` 新增 `getIngestStats` 函数 +- [ ] 前端 `pnpm dev`(或工具链验证)可正常加载 +- [ ] 全量 pytest 回归 `pipenv run python -m pytest tests/` 无失败 + +--- + +## Wave 2(依赖 Plan 01) + +### Task 2.1: 后端新增 GET /job/data/stats 端点 + + +- `app/api/v1/job/job.py`(当前 123 行) +- `app/core/clickhouse.py`(获取 client 方式) + + + +在 `job.py` 中追加端点: + +```python +@router.get("/data/stats", summary="各平台入库统计") +async def get_ingest_stats( + platform: Optional[PlatformType] = None, + days: int = 7, + service: IngestService = Depends(get_ingest_service), +) -> Dict[str, Any]: + """ + 查询各平台 ClickHouse 入库统计:总量、今日、最近入库时间、近 N 天每日趋势 + """ + from app.core.clickhouse import clickhouse_manager + client = await clickhouse_manager.get_client() + + platforms = [platform.value] if platform else ["boss", "qcwy", "zhilian"] + table_map = {"boss": "boss_job", "qcwy": "qcwy_job", "zhilian": "zhilian_job"} + + result = {} + for p in platforms: + table = f"job_data.{table_map[p]}" + try: + # 总量 + r_total = await client.query(f"SELECT count() FROM {table}") + total = r_total.result_rows[0][0] if r_total.result_rows else 0 + + # 今日 + r_today = await client.query( + f"SELECT count() FROM {table} WHERE toDate(created_at) = today()" + ) + today = r_today.result_rows[0][0] if r_today.result_rows else 0 + + # 最近入库时间 + r_last = await client.query( + f"SELECT max(created_at) FROM {table}" + ) + last_at = str(r_last.result_rows[0][0]) if r_last.result_rows and r_last.result_rows[0][0] else None + + # 近 N 天每日趋势 + r_daily = await client.query( + f"SELECT toDate(created_at) AS day, count() AS 
cnt " + f"FROM {table} " + f"WHERE created_at >= today() - {days} " + f"GROUP BY day ORDER BY day DESC" + ) + daily_counts = [{"date": str(row[0]), "count": row[1]} for row in r_daily.result_rows] + + result[p] = { + "total": total, + "today": today, + "last_ingest_at": last_at, + "daily_counts": daily_counts, + } + except Exception as e: + result[p] = {"error": str(e), "total": 0, "today": 0, "last_ingest_at": None, "daily_counts": []} + + return {"code": 200, "data": result} +``` + + +--- + +### Task 2.2: 前端新增 getIngestStats API + + +在 `web/src/api/index.js` 找到已有 API 函数,追加: + +```js +getIngestStats: (params) => request.get('/job/data/stats', { params }), +``` + + +--- + +### Task 2.3: 前端 monitor.vue 添加爬虫统计区域 + + +在 `monitor.vue` 的 `` **之前** 插入一个新 section: + +```html + + + + {{ p.label }} + {{ p.total.toLocaleString() }} + + 今日 +{{ p.today }} · 最近 {{ p.last_ingest_at || '--' }} + + + + 近 7 天入库趋势 + + + +``` + +对应 `