- 新增 company_enrichment.py: job 入库时自动补全 company_desc (优先查 MySQL,fallback 调平台 API 获取并入库) - Boss 爬虫: 搜索列表后逐条调 batch 详情接口拿完整数据 (jobBaseInfoVO/brandComInfoVO),每条获取后立即上报 - Boss push_mapper: 兼容新旧两种 API 格式(扁平/嵌套VO) - Boss token: 启动时自动从后端 API 读取数据库中的 mpt/wt2 - Boss client: header 值 strip 防止空格导致请求失败 - qcwy URL: 用 jobId/coId 拼接 jobs.51job.com 格式 - 三个平台 max_pages 默认改为 100
112 lines
3.0 KiB
Python
112 lines
3.0 KiB
Python
"""
|
||
智联招聘 小程序爬虫入口
|
||
|
||
功能:
|
||
1. 从后端获取关键词(优先断点续爬 > 失败重试 > 全新)
|
||
2. 调用 SearchPositions 分页爬取职位列表
|
||
3. 每页实时上传数据 + 汇报进度
|
||
4. 支持从断点页码恢复
|
||
5. 可选:搜索 job 时顺带抓取公司详情
|
||
|
||
启动:
|
||
python -m spiderJobs.platforms.zhilian.main
|
||
|
||
环境变量:
|
||
API_BASE_URL 后端地址 (默认 http://124.222.106.226:9999)
|
||
MAX_PAGES 每个关键词最大翻页数 (默认 100)
|
||
SLEEP_MIN_SECONDS 最小延迟秒数 (默认 10)
|
||
SLEEP_MAX_SECONDS 最大延迟秒数 (默认 20)
|
||
INLINE_COMPANY 是否内联抓公司 (默认 1,设 0 关闭)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
import sys
|
||
from typing import Optional
|
||
|
||
# Resolve the directory three levels above this file and make sure it is on
# sys.path before the project imports that follow are executed.
_project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir)
)
if _project_root not in sys.path:
    # Prepend so this directory is searched before the existing entries.
    sys.path.insert(0, _project_root)
|
||
|
||
from crawler_core.base import BaseFetcher, BaseSearcher
|
||
from spiderJobs.platforms.zhilian.api import GetCompanyDetail, SearchPositions
|
||
from spiderJobs.platforms.zhilian.client import ZhilianClient, create_cgate_client
|
||
from spiderJobs.runner.loop import run_crawl_loop
|
||
|
||
|
||
# 智联城市代码映射
|
||
CITY_CODE_MAP = {
|
||
"全国": "",
|
||
"北京": 530,
|
||
"上海": 538,
|
||
"广州": 763,
|
||
"深圳": 765,
|
||
"杭州": 653,
|
||
"成都": 801,
|
||
"南京": 635,
|
||
"武汉": 736,
|
||
"西安": 854,
|
||
"长沙": 749,
|
||
"重庆": 551,
|
||
"苏州": 639,
|
||
"天津": 531,
|
||
"厦门": 682,
|
||
"郑州": 719,
|
||
"合肥": 664,
|
||
"济南": 703,
|
||
"青岛": 704,
|
||
"大连": 600,
|
||
"东莞": 769,
|
||
"佛山": 766,
|
||
"珠海": 768,
|
||
"无锡": 636,
|
||
"宁波": 654,
|
||
}
|
||
|
||
|
||
def create_searcher(keyword: dict, http_client: ZhilianClient) -> BaseSearcher:
    """Build the Zhilian position searcher for one keyword record.

    Args:
        keyword: Keyword record. Its "job" entry becomes the search term and
            its "city" entry is looked up in CITY_CODE_MAP; both default to
            an empty string when absent.
        http_client: Client instance handed to the searcher.

    Returns:
        A SearchPositions object configured with the search term, the
        resolved city code and the given client.
    """
    # Any city name that is not a key of CITY_CODE_MAP (including a missing
    # or empty one) falls back to 538, the code the map lists for "上海".
    resolved_code = CITY_CODE_MAP.get(keyword.get("city", ""), 538)
    search_term = keyword.get("job", "")

    return SearchPositions(
        keyword=search_term,
        city_code=resolved_code,
        client=http_client,
    )
|
||
|
||
|
||
def extract_company_id(job: dict) -> Optional[str]:
    """Extract the company ID (companyNumber) from a Zhilian job dict.

    The top-level "companyNumber" entry is preferred. When it is missing or
    falsy, the "number" entry of the nested "company" dict is used instead.

    Args:
        job: Job record as a dict.

    Returns:
        The company number converted to ``str``, or ``None`` when no truthy
        company number can be found.
    """
    company_number = job.get("companyNumber")
    if not company_number:
        company = job.get("company")
        # The "company" key may be present but hold None (or another
        # non-dict value); a plain ``job.get("company", {})`` default does
        # not cover that case and would raise AttributeError on ``.get``.
        if isinstance(company, dict):
            company_number = company.get("number")
    return str(company_number) if company_number else None
|
||
|
||
|
||
def create_company_fetcher(company_id: str, http_client: ZhilianClient) -> BaseFetcher:
    """Build the Zhilian company-detail fetcher for one company.

    Args:
        company_id: Company identifier, passed to the fetcher as ``number``.
        http_client: Client instance handed to the fetcher.

    Returns:
        A GetCompanyDetail object bound to the given company and client.
    """
    detail_fetcher = GetCompanyDetail(number=company_id, client=http_client)
    return detail_fetcher
|
||
|
||
|
||
def main():
    """Start the Zhilian crawl loop.

    Reads the PROXY_URL environment variable and, when it is non-empty,
    forwards it to the client factory as the ``proxy`` keyword argument.
    The searcher factory, company-ID extractor and company-fetcher factory
    defined in this module are handed to ``run_crawl_loop``.
    """
    proxy_url = os.environ.get("PROXY_URL", "")
    # Only pass a proxy option when one is actually configured.
    extra_client_args = {"proxy": proxy_url} if proxy_url else {}

    run_crawl_loop(
        platform="zhilian",
        create_searcher=create_searcher,
        create_client_fn=create_cgate_client,
        max_pages=100,
        data_type="job",
        client_kwargs=extra_client_args,
        extract_company_id=extract_company_id,
        create_company_fetcher=create_company_fetcher,
    )
|
||
|
||
|
||
# Script entry point: run the crawler only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|