"""
Entry point for the Boss直聘 mini-program crawler.

Features:
    1. Fetch keywords from the backend (priority: resume from checkpoint > retry failed > fresh)
    2. Call SearchRecJobs to crawl the job list page by page
    3. Upload data and report progress after every page
    4. Resume from the checkpointed page number
    5. Optional: fetch company details while searching jobs

Run:
    python -m spiderJobs.platforms.boss.main

Environment variables:
    API_BASE_URL       Backend address (default http://124.222.106.226:9999)
    MAX_PAGES          Maximum number of pages per keyword (default 3)
    SLEEP_MIN_SECONDS  Minimum delay in seconds (default 10)
    SLEEP_MAX_SECONDS  Maximum delay in seconds (default 20)
    BOSS_MPT           Boss token (mpt)
    BOSS_WT2           Boss token (wt2)
    INLINE_COMPANY     Whether to fetch companies inline (default 1; set 0 to disable)
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
import sys
|
||
from typing import Optional
|
||
|
||
# Make the project root (three directory levels above this file) importable
# before the project-local imports below are resolved.
_project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir)
)
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)
|
||
|
||
from crawler_core.base import BaseFetcher, BaseSearcher
|
||
from spiderJobs.platforms.boss.api import GetBrandDetail, GetJobDetail, SearchRecJobs
|
||
from spiderJobs.platforms.boss.client import BossClient, create_client
|
||
from crawler_core.boss.sign import BossSign
|
||
from spiderJobs.runner.loop import run_crawl_loop
|
||
|
||
|
||
# Boss city-code lookup: city name as it appears in a keyword -> Boss cityCode.
CITY_CODE_MAP: dict[str, str] = {
    "全国": "100010000",
    "北京": "101010100",
    "上海": "101020100",
    "广州": "101280100",
    "深圳": "101280600",
    "杭州": "101210100",
    "成都": "101270100",
    "南京": "101190100",
    "武汉": "101200100",
    "西安": "101110100",
    "长沙": "101250100",
    "重庆": "101040100",
    "苏州": "101190400",
    "天津": "101030100",
    "厦门": "101230200",
    "郑州": "101180100",
    "合肥": "101220100",
    "济南": "101120100",
    "青岛": "101120200",
    "大连": "101070200",
    "东莞": "101281600",
    "佛山": "101280800",
    "珠海": "101280700",
    "无锡": "101190200",
    "宁波": "101210400",
}
|
||
|
||
|
||
def create_searcher(keyword: dict, http_client: BossClient) -> BaseSearcher:
    """Build the Boss job-list searcher for one keyword record.

    Args:
        keyword: Keyword record; only its ``"city"`` entry is read here.
        http_client: Client passed through to the searcher unchanged.

    Returns:
        A ``SearchRecJobs`` instance bound to the resolved city code.
    """
    city_name = keyword.get("city", "")

    if city_name in CITY_CODE_MAP:
        resolved_code = CITY_CODE_MAP[city_name]
    else:
        # Missing or unknown city names silently fall back to "101280600",
        # the code CITY_CODE_MAP assigns to 深圳.
        resolved_code = "101280600"

    return SearchRecJobs(city_code=resolved_code, client=http_client)
|
||
|
||
|
||
def extract_company_id(job: dict) -> Optional[str]:
    """Return the company ID (``brandId``) of a Boss job dict as a string.

    Args:
        job: Job dict as returned by the list API.

    Returns:
        ``str(brandId)`` when the value is truthy; ``None`` when the key is
        missing or its value is falsy (``None``, ``0``, ``""``).
    """
    raw_id = job.get("brandId")
    if not raw_id:
        return None
    return str(raw_id)
|
||
|
||
|
||
def create_company_fetcher(company_id: str, http_client: BossClient) -> BaseFetcher:
    """Build the fetcher for one Boss company (brand) detail.

    Args:
        company_id: Brand ID, passed to ``GetBrandDetail`` as ``brand_id``.
        http_client: Client passed through to the fetcher unchanged.

    Returns:
        A ``GetBrandDetail`` instance for the given company.
    """
    fetcher = GetBrandDetail(brand_id=company_id, client=http_client)
    return fetcher
|
||
|
||
|
||
def enrich_boss_job(job: dict, http_client: BossClient) -> Optional[dict]:
    """Fetch the full job + company information through the detail API.

    Per the original author: the list API only returns basic fields, while
    the detail API returns the complete nested data such as
    jobBaseInfoVO / brandComInfoVO / bossBaseInfoVO.

    Args:
        job: Job dict from the list API; ``securityId``, ``encryptJobId``
            and ``lid`` are read from it.
        http_client: Client passed through to ``GetJobDetail``.

    Returns:
        The ``"detail"`` entry of the response data (``{}`` when that entry
        is missing or falsy), or ``None`` when the job lacks the required
        IDs or the request did not succeed.
    """
    sec_id = job.get("securityId", "")
    job_key = job.get("encryptJobId", "")

    # Both identifiers are mandatory; without them no request is made.
    if not (sec_id and job_key):
        return None

    detail_request = GetJobDetail(
        security_id=sec_id,
        job_id=job_key,
        lid=job.get("lid", ""),
        client=http_client,
    )
    outcome = detail_request.fetch()

    if outcome.success and outcome.data:
        # NOTE(review): an absent "detail" entry still yields {} and is
        # logged as success - confirm callers expect that.
        payload = outcome.data.get("detail") or {}
        print(f" [详情] {job_key} 获取成功")
        return payload

    print(f" [详情] {job_key} 获取失败: {outcome.error}")
    return None
|
||
|
||
|
||
def main():
    """Resolve Boss tokens and proxy settings, then start the crawl loop.

    Tokens are taken from the ``BOSS_MPT`` / ``BOSS_WT2`` environment
    variables when ``BOSS_MPT`` is set; otherwise both are requested from
    the backend through ``RunnerAPIClient.fetch_token()``. A ``BossSign``
    signer is attached to the client kwargs when at least one token is
    available. A tunnel proxy is attached when ``PROXY_TUNNEL`` (or the
    package default) is set to something other than ``"none"``.
    """
    # Environment variables take priority. They are stripped the same way the
    # backend-provided values are, so stray whitespace never reaches the signer.
    mpt = os.environ.get("BOSS_MPT", "").strip()
    wt2 = os.environ.get("BOSS_WT2", "").strip()

    if not mpt:
        # Imported lazily: only needed when the environment supplies no mpt.
        from spiderJobs.runner.api_client import RunnerAPIClient

        token_data = RunnerAPIClient(platform="boss").fetch_token()
        if token_data:
            mpt = (token_data.get("mpt") or "").strip()
            wt2 = (token_data.get("wt2") or "").strip()

        # Report success only when an mpt was actually obtained; a payload
        # without a usable mpt is reported with the warning instead.
        if mpt:
            print(f"[boss] 从后端获取 Token 成功: mpt={mpt[:10]}...")
        else:
            print("[boss] 警告: 未获取到 Token,签名可能失败")

    client_kwargs = {}
    if mpt or wt2:
        client_kwargs["signer"] = BossSign(mpt=mpt, wt2=wt2)

    from spiderJobs import DEFAULT_TUNNEL_PROXY

    tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
    if tunnel_proxy and tunnel_proxy.lower() != "none":
        client_kwargs["tunnel_proxy"] = tunnel_proxy
        # Print only the part after the last "@" - presumably to keep proxy
        # credentials out of the output.
        print(f"[boss] 隧道代理: {tunnel_proxy.rpartition('@')[2]}")

    # NOTE(review): the module docstring documents MAX_PAGES (default 3) while
    # 100 is passed here; how run_crawl_loop combines the two is not visible
    # from this file - confirm.
    run_crawl_loop(
        platform="boss",
        create_searcher=create_searcher,
        create_client_fn=create_client,
        max_pages=100,
        data_type="job",
        client_kwargs=client_kwargs,
        extract_company_id=extract_company_id,
        create_company_fetcher=create_company_fetcher,
        enrich_job=enrich_boss_job,
    )


if __name__ == "__main__":
    main()
|