win 24918a272b feat: 爬虫优化 — company_desc 补全、Boss详情获取、URL修复
- 新增 company_enrichment.py: job 入库时自动补全 company_desc
  (优先查 MySQL,fallback 调平台 API 获取并入库)
- Boss 爬虫: 搜索列表后逐条调 batch 详情接口拿完整数据
  (jobBaseInfoVO/brandComInfoVO),每条获取后立即上报
- Boss push_mapper: 兼容新旧两种 API 格式(扁平/嵌套VO)
- Boss token: 启动时自动从后端 API 读取数据库中的 mpt/wt2
- Boss client: header 值 strip 防止空格导致请求失败
- qcwy URL: 用 jobId/coId 拼接 jobs.51job.com 格式
- 三个平台 max_pages 默认改为 100
2026-03-22 21:54:19 +08:00

112 lines
3.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
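Zhilian Zhaopin mini-program crawler entry point.

Features:
1. Fetch keywords from the backend (priority: resume from checkpoint > retry failed > fresh).
2. Crawl the paginated job list via SearchPositions.
3. Upload data and report progress after every page.
4. Support resuming from the checkpointed page number.
5. Optional: fetch company details alongside each job search.

Usage:
    python -m spiderJobs.platforms.zhilian.main

Environment variables:
    API_BASE_URL       Backend address (default http://124.222.106.226:9999)
    MAX_PAGES          Max pages per keyword (default 100, the value main() passes as max_pages)
    SLEEP_MIN_SECONDS  Minimum delay in seconds (default 10)
    SLEEP_MAX_SECONDS  Maximum delay in seconds (default 20)
    INLINE_COMPANY     Whether to fetch companies inline (default 1; set to 0 to disable)
    PROXY_URL          Optional proxy; when non-empty it is forwarded as client_kwargs["proxy"]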
"""
from __future__ import annotations
import os
import sys
from typing import Optional
# Put the directory three levels above this file's directory at the front of
# sys.path so the absolute `crawler_core` / `spiderJobs` imports that follow
# resolve no matter where the script is launched from.
_project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir)
)
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)
from crawler_core.base import BaseFetcher, BaseSearcher
from spiderJobs.platforms.zhilian.api import GetCompanyDetail, SearchPositions
from spiderJobs.platforms.zhilian.client import ZhilianClient, create_cgate_client
from spiderJobs.runner.loop import run_crawl_loop
# Zhilian city-code mapping: Chinese city name -> city code.
# create_searcher() looks up keyword["city"] here to obtain the `city_code`
# it hands to SearchPositions.
# Note the mixed value types: "全国" (nationwide) maps to an empty string while
# every other city maps to an int.
# NOTE(review): presumably an empty code means "no city filter" on the API
# side — confirm against SearchPositions.
CITY_CODE_MAP = {
    "全国": "",
    "北京": 530,
    "上海": 538,
    "广州": 763,
    "深圳": 765,
    "杭州": 653,
    "成都": 801,
    "南京": 635,
    "武汉": 736,
    "西安": 854,
    "长沙": 749,
    "重庆": 551,
    "苏州": 639,
    "天津": 531,
    "厦门": 682,
    "郑州": 719,
    "合肥": 664,
    "济南": 703,
    "青岛": 704,
    "大连": 600,
    "东莞": 769,
    "佛山": 766,
    "珠海": 768,
    "无锡": 636,
    "宁波": 654,
}
def create_searcher(keyword: dict, http_client: ZhilianClient) -> BaseSearcher:
    """Build the Zhilian position searcher for one keyword record.

    Args:
        keyword: Keyword record. The optional ``"job"`` entry supplies the
            search term and the optional ``"city"`` entry the city name;
            each defaults to ``""`` when absent.
        http_client: Client passed to the searcher as ``client``.

    Returns:
        A ``SearchPositions`` instance configured with the search term and
        the city code resolved through ``CITY_CODE_MAP``.
    """
    search_term = keyword.get("job", "")
    city_name = keyword.get("city", "")
    # NOTE(review): any name not present in CITY_CODE_MAP (including a missing
    # or empty city) resolves to 538, the code listed for "上海" — confirm that
    # this fallback is intended rather than the nationwide entry.
    resolved_code = CITY_CODE_MAP.get(city_name, 538)
    return SearchPositions(
        keyword=search_term,
        city_code=resolved_code,
        client=http_client,
    )
def extract_company_id(job: dict) -> Optional[str]:
    """Extract the company ID (companyNumber) from a Zhilian job dict.

    The top-level ``"companyNumber"`` entry is preferred. When it is missing
    or falsy, the ``"number"`` entry of the nested ``"company"`` dict is used
    instead.

    Args:
        job: One job record.

    Returns:
        The company number converted to ``str``, or ``None`` when neither
        location holds a truthy value.
    """
    company_number = job.get("companyNumber")
    if not company_number:
        # "company" can be present but null (or not a dict at all). A plain
        # job.get("company", {}) only covers the *absent* key, so guard the
        # nested lookup instead of raising AttributeError on such records.
        company = job.get("company")
        if isinstance(company, dict):
            company_number = company.get("number")
    return str(company_number) if company_number else None
def create_company_fetcher(company_id: str, http_client: ZhilianClient) -> BaseFetcher:
    """Build the Zhilian company-detail fetcher for one company.

    Args:
        company_id: Company number, passed to the fetcher as ``number``.
        http_client: Client passed to the fetcher as ``client``.

    Returns:
        The constructed ``GetCompanyDetail`` instance.
    """
    fetcher = GetCompanyDetail(client=http_client, number=company_id)
    return fetcher
def main():
    """Run the crawl loop for the "zhilian" platform.

    Reads the optional ``PROXY_URL`` environment variable; a non-empty value
    is forwarded to ``run_crawl_loop`` as ``client_kwargs["proxy"]``, otherwise
    ``client_kwargs`` stays empty. The loop is started with ``max_pages=100``,
    ``data_type="job"`` and this module's searcher / company-fetcher factories.
    """
    proxy_url = os.environ.get("PROXY_URL", "")
    # Only forward a proxy setting when one is actually configured.
    client_kwargs = {"proxy": proxy_url} if proxy_url else {}

    run_crawl_loop(
        platform="zhilian",
        data_type="job",
        max_pages=100,
        create_client_fn=create_cgate_client,
        client_kwargs=client_kwargs,
        create_searcher=create_searcher,
        extract_company_id=extract_company_id,
        create_company_fetcher=create_company_fetcher,
    )


# Run the crawler only when executed as a script / via `python -m`.
if __name__ == "__main__":
    main()