win 24918a272b feat: 爬虫优化 — company_desc 补全、Boss详情获取、URL修复
- 新增 company_enrichment.py: job 入库时自动补全 company_desc
  (优先查 MySQL,fallback 调平台 API 获取并入库)
- Boss 爬虫: 搜索列表后逐条调 batch 详情接口拿完整数据
  (jobBaseInfoVO/brandComInfoVO),每条获取后立即上报
- Boss push_mapper: 兼容新旧两种 API 格式(扁平/嵌套VO)
- Boss token: 启动时自动从后端 API 读取数据库中的 mpt/wt2
- Boss client: header 值 strip 防止空格导致请求失败
- qcwy URL: 用 jobId/coId 拼接 jobs.51job.com 格式
- 三个平台 max_pages 默认改为 100
2026-03-22 21:54:19 +08:00

109 lines
4.1 KiB
Python

from typing import Dict, Any, Optional
from app.services.ingest.registry import PlatformConfig, DedupFieldSpec, register
from app.services.ingest.remote_push import safe_join
def _extract_job_id(data: Dict[str, Any]) -> Optional[str]:
    """Return the ``jobId`` entry of *data* as a string.

    Yields ``None`` when the key is absent or its value is falsy.
    """
    job_id = data.get("jobId")
    if not job_id:
        return None
    return str(job_id)
def _extract_update_dt(data: Dict[str, Any]) -> Optional[str]:
    """Return the ``updateDateTime`` entry of *data* as a string.

    Yields ``None`` when the key is absent or its value is falsy.
    """
    updated_at = data.get("updateDateTime")
    if not updated_at:
        return None
    return str(updated_at)
def _extract_company_name(data: Dict[str, Any]) -> Optional[str]:
    """Return the company name found in *data* as a string.

    ``companyName`` is preferred; ``company_name`` is consulted only when the
    former is missing or falsy. Yields ``None`` when neither holds a truthy
    value.
    """
    for key in ("companyName", "company_name"):
        candidate = data.get(key)
        if candidate:
            return str(candidate)
    return None
def _qcwy_blank_if_none(value: Any) -> Any:
    """Return *value* unchanged, or ``""`` when it is ``None``."""
    return "" if value is None else value


def _qcwy_dict_or_empty(value: Any) -> Dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _qcwy_welfare_text(welfare: Any) -> str:
    """Flatten the ``jobWelfareCodeDataList`` value into a comma-separated string."""
    if isinstance(welfare, list):
        labels = []
        for item in welfare:
            if not isinstance(item, dict):
                continue
            label = (
                item.get("chineseTitle")
                or item.get("typeTitle")
                or item.get("englishTitle")
                or item.get("code")
            )
            # An entry with no usable title/code would otherwise be rendered
            # as the literal text "None" (or an empty segment) in the output.
            if label is None or label == "":
                continue
            labels.append(str(label))
        return ",".join(labels)
    if isinstance(welfare, str):
        # A pre-serialised list such as "[a,b]": only the brackets are dropped.
        return welfare.replace("[", "").replace("]", "")
    return ""


def _build_qcwy_push(data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Map a raw qcwy job record onto the flat payload used for pushing.

    Args:
        data: Raw job record. Keys read here include ``jobId``, ``coId``,
            ``jobName``, ``companyName``, ``jobWelfareCodeDataList``,
            ``location`` / ``workLocation`` and ``jobAreaString`` /
            ``jobAreaLevelDetail``.

    Returns:
        A new dict holding the push fields. ``base_data`` references *data*
        itself (no copy is made). The annotation allows ``None``, but this
        implementation always returns a dict.
    """
    # Address: prefer the flat "location" value, then fall back to the nested
    # workLocation object. A non-dict workLocation is treated as absent
    # instead of raising AttributeError.
    location = data.get("location") or ""
    if not location:
        work_loc = _qcwy_dict_or_empty(data.get("workLocation"))
        location = work_loc.get("workAddress") or work_loc.get("address") or ""

    # Area: prefer "jobAreaString", then rebuild it from city + landmark.
    area = data.get("jobAreaString") or ""
    if not area:
        detail = _qcwy_dict_or_empty(data.get("jobAreaLevelDetail"))
        city = detail.get("cityString") or ""
        landmark = detail.get("landMarkString") or ""
        area = f"{city}{landmark}".strip()

    job_id = data.get("jobId") or ""
    co_id = data.get("coId") or ""
    # Use the hrefs supplied in the record when present; otherwise build a
    # jobs.51job.com URL from the ids.
    job_url = data.get("jobHref") or (
        f"https://jobs.51job.com/all/{job_id}.html" if job_id else ""
    )
    company_url = data.get("companyHref") or (
        f"https://jobs.51job.com/all/co{co_id}.html" if co_id else ""
    )

    # dict.get(key, "") still returns None when the key is present with a
    # null value, which would be formatted as the text "None"; blank it out.
    # NOTE(review): the salary string is built as max-min (not min-max); the
    # order is kept as-is because consumers are not visible here -- confirm.
    salary_max = _qcwy_blank_if_none(data.get("jobSalaryMax"))
    salary_min = _qcwy_blank_if_none(data.get("jobSalaryMin"))
    major_1 = _qcwy_blank_if_none(data.get("major1Str"))
    major_2 = _qcwy_blank_if_none(data.get("major2Str"))

    return {
        "source_type": "前程无忧",
        "name": data.get("companyName"),
        "title": data.get("jobName"),
        "title_addr": data.get("jobName"),
        "description": data.get("jobDescribe"),
        "age": "",
        "sex": "",
        "number": "",
        "education": data.get("degreeString"),
        "skill": safe_join(data.get("jobTagsForOrder")),
        "welfare": _qcwy_welfare_text(data.get("jobWelfareCodeDataList")),
        "years": data.get("workYearString"),
        "salary": f"{salary_max}-{salary_min}",
        "location": location or "位置信息未找到",
        "position": area or "位置信息未找到",
        "date": data.get("confirmDateString"),
        "start_date": data.get("confirmDateString"),
        "end_date": "",
        "job_type": data.get("termStr"),
        "size": data.get("companySizeString"),
        "employer_type": data.get("companyTypeString"),
        "industry": f"{major_1}-{major_2}",
        "job_1st_class": "",
        "job_2nd_class": "",
        "job_3rd_class": "",
        "job_4th_class": "",
        "url": job_url,
        "company_id": co_id,
        "company_name": data.get("fullCompanyName"),
        "company_url": company_url,
        "company_desc": data.get("company_desc", ""),
        "base_data": data,
    }
# qcwy job records from the "mini" channel: stored in qcwy_job, with job_id
# and update_date_time declared as dedup fields and _build_qcwy_push as the
# push mapper.
register(
    PlatformConfig(
        platform="qcwy",
        channel="mini",
        data_type="job",
        table="qcwy_job",
        dedup_fields=(
            DedupFieldSpec(column="job_id", extractor=_extract_job_id),
            DedupFieldSpec(column="update_date_time", extractor=_extract_update_dt),
        ),
        push_mapper=_build_qcwy_push,
    )
)
# qcwy company records from the "mini" channel: stored in qcwy_company, with
# company_name as the only declared dedup field. No push mapper is supplied.
register(
    PlatformConfig(
        platform="qcwy",
        channel="mini",
        data_type="company",
        table="qcwy_company",
        dedup_fields=(
            DedupFieldSpec(column="company_name", extractor=_extract_company_name),
        ),
    )
)
# Jobs associated with a company (written via company_jobs_sync; kept distinct
# from the "mini" channel used for search-result jobs). Same table and dedup
# fields as the mini job registration, but no push mapper is supplied.
register(
    PlatformConfig(
        platform="qcwy",
        channel="company",
        data_type="job",
        table="qcwy_job",
        dedup_fields=(
            DedupFieldSpec(column="job_id", extractor=_extract_job_id),
            DedupFieldSpec(column="update_date_time", extractor=_extract_update_dt),
        ),
    )
)