win 24918a272b feat: 爬虫优化 — company_desc 补全、Boss详情获取、URL修复
- 新增 company_enrichment.py: job 入库时自动补全 company_desc
  (优先查 MySQL,fallback 调平台 API 获取并入库)
- Boss 爬虫: 搜索列表后逐条调 batch 详情接口拿完整数据
  (jobBaseInfoVO/brandComInfoVO),每条获取后立即上报
- Boss push_mapper: 兼容新旧两种 API 格式(扁平/嵌套VO)
- Boss token: 启动时自动从后端 API 读取数据库中的 mpt/wt2
- Boss client: header 值 strip 防止空格导致请求失败
- qcwy URL: 用 jobId/coId 拼接 jobs.51job.com 格式
- 三个平台 max_pages 默认改为 100
2026-03-22 21:54:19 +08:00

169 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
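Boss Zhipin mini-program crawler entry point.

Features:
1. Fetch keywords from the backend (priority: resume from checkpoint > retry failed > fresh).
2. Crawl the job list page by page through SearchRecJobs.
3. Upload data and report progress in real time for every page.
4. Support resuming from the checkpointed page number.
5. Optional: fetch company details alongside the job search.

Usage:
    python -m spiderJobs.platforms.boss.main

Environment variables:
    API_BASE_URL       Backend address (default http://124.222.106.226:9999)
    MAX_PAGES          Max pages per keyword (default 100, the value main() passes as max_pages)
    SLEEP_MIN_SECONDS  Minimum delay in seconds (default 10)
    SLEEP_MAX_SECONDS  Maximum delay in seconds (default 20)
    BOSS_MPT           Boss token (mpt); when unset, main() asks the backend API for the stored tokens
    BOSS_WT2           Boss token (wt2)
    INLINE_COMPANY     Whether to fetch company details inline (default 1; set to 0 to disable)
    PROXY_TUNNEL       Tunnel proxy address; when set, main() passes a tunnel_proxy URL to the client
    PROXY_SCHEME       Scheme used for the tunnel proxy URL (default http)
    PROXY_USERNAME     Proxy username (embedded in the URL only when the password is also set)
    PROXY_PASSWORD     Proxy password (embedded in the URL only when the username is also set)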
"""
from __future__ import annotations
import os
import sys
from typing import Optional
# Make sure the project root (three levels above this file's directory) is on
# sys.path. This has to run before the project-local imports that follow.
_project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "..", "..")
)
if _project_root not in sys.path:
    # Prepend so the project's own packages are found first.
    sys.path.insert(0, _project_root)
from crawler_core.base import BaseFetcher, BaseSearcher
from spiderJobs.platforms.boss.api import GetBrandDetail, GetJobDetail, SearchRecJobs
from spiderJobs.platforms.boss.client import BossClient, create_client
from crawler_core.boss.sign import BossSign
from spiderJobs.runner.loop import run_crawl_loop
# Boss city-code mapping (city name found in the keyword -> Boss cityCode).
# Keys are the Chinese city names; values are the cityCode strings that
# create_searcher hands to SearchRecJobs.
CITY_CODE_MAP: dict[str, str] = {
    "全国": "100010000",
    "北京": "101010100",
    "上海": "101020100",
    "广州": "101280100",
    "深圳": "101280600",
    "杭州": "101210100",
    "成都": "101270100",
    "南京": "101190100",
    "武汉": "101200100",
    "西安": "101110100",
    "长沙": "101250100",
    "重庆": "101040100",
    "苏州": "101190400",
    "天津": "101030100",
    "厦门": "101230200",
    "郑州": "101180100",
    "合肥": "101220100",
    "济南": "101120100",
    "青岛": "101120200",
    "大连": "101070200",
    "东莞": "101281600",
    "佛山": "101280800",
    "珠海": "101280700",
    "无锡": "101190200",
    "宁波": "101210400",
}
def create_searcher(keyword: dict, http_client: BossClient) -> BaseSearcher:
    """Build the Boss job searcher for one keyword.

    Args:
        keyword: Keyword dict; its optional ``city`` entry is a city name
            that is looked up in ``CITY_CODE_MAP``.
        http_client: Client handed to ``SearchRecJobs``.

    Returns:
        A ``SearchRecJobs`` instance configured with the resolved city code.
        Unknown or missing cities fall back to ``"101280600"`` (the code
        ``CITY_CODE_MAP`` assigns to 深圳).
    """
    city = keyword.get("city") or ""
    if isinstance(city, str):
        # Tolerate stray whitespace around the city name.
        city = city.strip()

    city_code = CITY_CODE_MAP.get(city)
    if city_code is None and isinstance(city, str) and city.endswith("市"):
        # Accept the "<name>市" spelling (e.g. "北京市") for cities that the
        # map lists without the suffix.
        city_code = CITY_CODE_MAP.get(city[:-1])

    return SearchRecJobs(
        city_code=city_code or "101280600",
        client=http_client,
    )
def extract_company_id(job: dict) -> Optional[str]:
    """Extract the company id (``brandId``) from a Boss job dict.

    Args:
        job: Job dict; only its ``brandId`` entry is read.

    Returns:
        ``brandId`` converted to ``str``, or ``None`` when the entry is
        missing or falsy (``None``, ``""``, ``0``).
    """
    brand_id = job.get("brandId")
    if not brand_id:
        return None
    return str(brand_id)
def create_company_fetcher(company_id: str, http_client: BossClient) -> BaseFetcher:
    """Create the Boss company-detail fetcher.

    Args:
        company_id: Company id, passed to ``GetBrandDetail`` as ``brand_id``.
        http_client: Client handed to ``GetBrandDetail``.

    Returns:
        A ``GetBrandDetail`` instance for the given company.
    """
    return GetBrandDetail(brand_id=company_id, client=http_client)
def enrich_boss_job(job: dict, http_client: BossClient) -> Optional[dict]:
    """Fetch the full job + company payload through the batch detail endpoint.

    The list endpoint only returns basic fields; the detail endpoint returns
    the complete nested data (jobBaseInfoVO / brandComInfoVO / bossBaseInfoVO
    and so on).

    Args:
        job: Job dict from the search list. ``securityId`` and
            ``encryptJobId`` are required; ``lid`` is optional.
        http_client: Client handed to ``GetJobDetail``.

    Returns:
        The ``detail`` dict from the response, or ``None`` when a required
        id is missing, the request fails, or the response carries no detail.
    """
    security_id = job.get("securityId", "")
    encrypt_job_id = job.get("encryptJobId", "")
    if not security_id or not encrypt_job_id:
        # Without both ids the detail endpoint cannot be queried.
        return None

    result = GetJobDetail(
        security_id=security_id,
        job_id=encrypt_job_id,
        lid=job.get("lid", ""),
        client=http_client,
    ).fetch()
    if not result.success or not result.data:
        print(f" [详情] {encrypt_job_id} 获取失败: {result.error}")
        return None

    data = result.data
    detail = data.get("detail") if isinstance(data, dict) else None
    if not detail:
        # A response without a detail payload is a failure, not a success:
        # report it and return None like the other failure paths instead of
        # handing the caller an empty dict.
        print(f" [详情] {encrypt_job_id} 获取失败: 响应中缺少 detail")
        return None

    print(f" [详情] {encrypt_job_id} 获取成功")
    return detail
def _resolve_boss_tokens() -> tuple[str, str]:
    """Return ``(mpt, wt2)``, preferring the environment over the backend."""
    # Strip the env values too: the tokens end up in request headers, and the
    # backend-provided values below are stripped for the same reason.
    mpt = os.environ.get("BOSS_MPT", "").strip()
    wt2 = os.environ.get("BOSS_WT2", "").strip()
    if mpt:
        return mpt, wt2

    # Imported lazily so the runner API client is only loaded when the
    # environment does not already provide a token.
    from spiderJobs.runner.api_client import RunnerAPIClient

    token_data = RunnerAPIClient(platform="boss").fetch_token()
    if token_data:
        mpt = (token_data.get("mpt") or "").strip()
        wt2 = (token_data.get("wt2") or "").strip()

    if mpt:
        print(f"[boss] 从后端获取 Token 成功: mpt={mpt[:10]}...")
    else:
        # Also covers a backend answer that carries no usable mpt, which
        # must not be reported as a success.
        print("[boss] 警告: 未获取到 Token签名可能失败")
    return mpt, wt2


def _tunnel_proxy_from_env() -> Optional[str]:
    """Build the tunnel proxy URL from PROXY_* env vars, or None if unset."""
    tunnel = os.environ.get("PROXY_TUNNEL", "")
    if not tunnel:
        return None

    scheme = os.environ.get("PROXY_SCHEME", "http")
    username = os.environ.get("PROXY_USERNAME", "")
    password = os.environ.get("PROXY_PASSWORD", "")
    if username and password:
        # NOTE(review): credentials are interpolated verbatim; values that
        # contain '@', ':' or '/' must already be percent-encoded.
        return f"{scheme}://{username}:{password}@{tunnel}"
    return f"{scheme}://{tunnel}"


def main() -> None:
    """Configure the Boss client from the environment and run the crawl loop.

    Tokens come from ``BOSS_MPT`` / ``BOSS_WT2``; when ``BOSS_MPT`` is not
    set they are requested from the backend API instead. A signer is attached
    only when at least one token is available, and a tunnel proxy only when
    ``PROXY_TUNNEL`` is set.
    """
    mpt, wt2 = _resolve_boss_tokens()

    client_kwargs = {}
    if mpt or wt2:
        client_kwargs["signer"] = BossSign(mpt=mpt, wt2=wt2)

    tunnel_proxy = _tunnel_proxy_from_env()
    if tunnel_proxy:
        client_kwargs["tunnel_proxy"] = tunnel_proxy

    run_crawl_loop(
        platform="boss",
        create_searcher=create_searcher,
        create_client_fn=create_client,
        max_pages=100,
        data_type="job",
        client_kwargs=client_kwargs,
        extract_company_id=extract_company_id,
        create_company_fetcher=create_company_fetcher,
        enrich_job=enrich_boss_job,
    )
if __name__ == "__main__":
    # Start the crawler only when this module is executed as a script
    # (e.g. `python -m spiderJobs.platforms.boss.main`), not on import.
    main()