Plan 01 - DATA-01: 30-day window dedup fix:
- dedup.py: both single-field and double-field SQL queries now include `AND created_at > now() - INTERVAL 30 DAY`
- tests/ingest/test_dedup.py: 6 mock tests validating the 30-day window

Plan 02 - DATA-04: company vs. search job channel separation:
- schemas/ingest.py: ChannelType.COMPANY = 'company'
- configs/boss.py: register channel='company' config
- configs/qcwy.py: register channel='company' config
- configs/zhilian.py: register channel='company' config
- company_jobs_sync.py: store_batch(..., 'mini', ...) → (..., 'company', ...)

DATA-02: confirmed already complete (job.py has the /data/batch-async endpoint)
DATA-03: confirmed already complete (company_cleaner.py has the full pipeline)

Full regression: 112 passed (106 existing + 6 new)
71 lines
3.0 KiB
Python
71 lines
3.0 KiB
Python
from typing import Dict, Any, Optional
|
|
|
|
from app.services.ingest.registry import PlatformConfig, DedupFieldSpec, register
|
|
from app.services.ingest.remote_push import safe_get, safe_join
|
|
|
|
|
|
def _extract_job_id(data: Dict[str, Any]) -> Optional[str]:
    """Return the ``jobId`` stored under ``jobBaseInfoVO`` as a string.

    Args:
        data: Raw record; the id is looked up at ``data["jobBaseInfoVO"]["jobId"]``.

    Returns:
        ``str(jobId)``, or ``None`` when ``jobBaseInfoVO`` is absent or falsy,
        or when ``jobId`` itself is missing or falsy (``None``, ``0``, ``""``).
    """
    job_info = data.get("jobBaseInfoVO", {})
    if not job_info:
        # Covers both a missing key and an explicit null/empty value.
        return None

    job_id = job_info.get("jobId")
    if not job_id:
        return None
    return str(job_id)
|
|
|
|
|
|
def _extract_company_name(data: Dict[str, Any]) -> Optional[str]:
    """Return the company name from a raw record as a string.

    The top-level ``name`` wins when it is truthy; otherwise the ``name``
    inside ``companyFullInfoVO`` is used.

    Args:
        data: Raw company record.

    Returns:
        The name converted with ``str()``, or ``None`` when neither location
        holds a truthy value.
    """
    top_level_name = data.get("name")
    if top_level_name:
        return str(top_level_name)

    # A missing or null companyFullInfoVO is treated as an empty dict.
    full_info = data.get("companyFullInfoVO") or {}
    nested_name = full_info.get("name")
    if nested_name:
        return str(nested_name)
    return None
|
|
|
|
|
|
def _build_boss_push(data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Build the push payload dict for a raw Boss直聘 job record.

    Values are read from the ``bossBaseInfoVO``, ``jobBaseInfoVO`` and
    ``brandComInfoVO`` sub-objects of ``data``; a missing or falsy sub-object
    is treated as an empty dict. The unmodified input is attached under
    ``base_data``.

    Args:
        data: Raw job record.

    Returns:
        The payload dict. The annotation allows ``None``, but this
        implementation always returns a dict.
    """
    boss_info = data.get("bossBaseInfoVO") or {}
    job_base = data.get("jobBaseInfoVO") or {}
    brand = data.get("brandComInfoVO") or {}

    # Keys are inserted in a fixed order so the payload layout stays stable.
    payload: Dict[str, Any] = {"source_type": "Boss直聘"}

    # Company naming: "name" comes from the brand object, "common_name" from
    # the boss object.
    payload["name"] = safe_get(brand, "brandName")
    payload["common_name"] = safe_get(boss_info, "brandName")

    # Job details. "title" and "title_addr" are both filled from positionName.
    payload["title"] = safe_get(job_base, "positionName")
    payload["title_addr"] = safe_get(job_base, "positionName")
    payload["description"] = safe_get(job_base, "jobDesc")
    payload["education"] = safe_get(job_base, "degreeName")
    payload["skill"] = safe_join(job_base.get("requiredSkills"))
    payload["welfare"] = safe_join(job_base.get("salaryWelfareInfo"))
    payload["years"] = safe_get(job_base, "experienceName")
    payload["salary"] = f'{safe_get(job_base, "lowSalary")}-{safe_get(job_base, "highSalary")}'
    # The third argument is presumably safe_get's fallback value — confirm
    # against remote_push.safe_get.
    payload["location"] = safe_get(job_base, "locationName", "位置信息未找到")
    payload["position"] = safe_get(job_base, "locationDesc", "位置信息未找到")
    payload["job_type"] = "全职"
    payload["size"] = safe_get(brand, "scaleName")
    payload["employer_type"] = "全职"
    payload["industry"] = safe_get(brand, "industryName")

    # Fields this mapper does not populate are sent as empty strings.
    for blank_key in (
        "job_1st_class", "job_2nd_class", "job_3rd_class", "job_4th_class",
        "date", "start_date", "end_date",
        "age", "sex", "number",
    ):
        payload[blank_key] = ""

    # Links and company details.
    payload["url"] = f"https://www.zhipin.com/job_detail/{safe_get(job_base, 'encryptJobId')}.html"
    payload["company_id"] = safe_get(brand, "encryptBrandId")
    payload["company_name"] = safe_get(brand, "brandName")
    payload["company_url"] = f"https://www.zhipin.com/gongsi/{safe_get(brand, 'encryptBrandId')}.html"
    payload["company_desc"] = safe_get(brand, "introduce")

    # Keep the original record alongside the mapped fields.
    payload["base_data"] = data
    return payload
|
|
|
|
|
|
# boss / mini / job: stored in table "boss_job", deduplicated on the job_id
# column (value taken from _extract_job_id), with _build_boss_push as the
# push mapper.
register(
    PlatformConfig(
        platform="boss",
        channel="mini",
        data_type="job",
        table="boss_job",
        dedup_fields=(
            DedupFieldSpec(column="job_id", extractor=_extract_job_id),
        ),
        push_mapper=_build_boss_push,
    )
)
|
|
|
|
# boss / mini / company: stored in table "boss_company", deduplicated on the
# company_name column (value taken from _extract_company_name). No push
# mapper is configured for this entry.
register(
    PlatformConfig(
        platform="boss",
        channel="mini",
        data_type="company",
        table="boss_company",
        dedup_fields=(
            DedupFieldSpec(
                column="company_name",
                extractor=_extract_company_name,
            ),
        ),
    )
)
|
|
|
|
# Company-linked jobs (written via company_jobs_sync), kept distinct from the
# search-result jobs registered on the "mini" channel. They share the
# "boss_job" table and the job_id dedup column with the "mini" job entry.
# NOTE(review): unlike the "mini" job entry, no push_mapper is set here —
# confirm that company-channel jobs are intentionally not pushed.
register(
    PlatformConfig(
        platform="boss",
        channel="company",
        data_type="job",
        table="boss_job",
        dedup_fields=(
            DedupFieldSpec(column="job_id", extractor=_extract_job_id),
        ),
    )
)
|