win 46883cef8a feat(02-01): migrate Boss spider layer from spiderJobs.core to crawler_core
- client.py: inherit crawler_core.http_client.HTTPClient, use crawler_core.boss.sign.BossSign
- api.py: use crawler_core.base.Result/BaseFetcher/BaseSearcher, fix self._http -> self.http_client
- main.py: import BaseFetcher/BaseSearcher and BossSign from crawler_core
- sign.py: replace with backward-compat stub re-exporting BossSign from crawler_core
Satisfies ARCH-03
2026-03-21 19:00:30 +08:00

127 lines
3.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Boss直聘 小程序爬虫入口
功能:
1. 从后端获取关键词(优先断点续爬 > 失败重试 > 全新)
2. 调用 SearchRecJobs 分页爬取职位列表
3. 每页实时上传数据 + 汇报进度
4. 支持从断点页码恢复
5. 可选:搜索 job 时顺带抓取公司详情
启动:
python -m spiderJobs.platforms.boss.main
环境变量:
API_BASE_URL 后端地址 (默认 http://124.222.106.226:9999)
MAX_PAGES 每个关键词最大翻页数 (默认 3)
SLEEP_MIN_SECONDS 最小延迟秒数 (默认 10)
SLEEP_MAX_SECONDS 最大延迟秒数 (默认 20)
BOSS_MPT Boss Token (mpt)
BOSS_WT2 Boss Token (wt2)
INLINE_COMPANY 是否内联抓公司 (默认 1,设 0 关闭)
"""
from __future__ import annotations
import os
import sys
from typing import Optional
# Make sure the project root (three directory levels above this file) is
# importable, so the absolute ``spiderJobs`` / ``crawler_core`` imports below
# resolve when this module is launched directly.
_project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir)
)
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)
from crawler_core.base import BaseFetcher, BaseSearcher
from spiderJobs.platforms.boss.api import GetBrandDetail, SearchRecJobs
from spiderJobs.platforms.boss.client import BossClient, create_client
from crawler_core.boss.sign import BossSign
from spiderJobs.runner.loop import run_crawl_loop
# Boss city-code lookup table: city name as it appears in a keyword record
# -> Boss ``cityCode`` string. "全国" is the nationwide entry.
CITY_CODE_MAP = {
    "全国": "100010000",
    "北京": "101010100",
    "上海": "101020100",
    "广州": "101280100",
    "深圳": "101280600",
    "杭州": "101210100",
    "成都": "101270100",
    "南京": "101190100",
    "武汉": "101200100",
    "西安": "101110100",
    "长沙": "101250100",
    "重庆": "101040100",
    "苏州": "101190400",
    "天津": "101030100",
    "厦门": "101230200",
    "郑州": "101180100",
    "合肥": "101220100",
    "济南": "101120100",
    "青岛": "101120200",
    "大连": "101070200",
    "东莞": "101281600",
    "佛山": "101280800",
    "珠海": "101280700",
    "无锡": "101190200",
    "宁波": "101210400",
}
def create_searcher(keyword: dict, http_client: BossClient) -> BaseSearcher:
    """Build the Boss job searcher for one keyword record.

    The record's ``"city"`` value is looked up in ``CITY_CODE_MAP``. When the
    key is absent or the city is not in the map, the code ``"101280600"``
    (the map's entry for Shenzhen, 深圳) is used instead.

    Args:
        keyword: Keyword record; only its ``"city"`` entry is read here.
        http_client: Client handed to the searcher as ``client``.

    Returns:
        A ``SearchRecJobs`` instance bound to the resolved city code.
    """
    city_name = keyword.get("city", "")
    try:
        resolved_code = CITY_CODE_MAP[city_name]
    except KeyError:
        # Unknown or missing city: fall back to the fixed default code.
        resolved_code = "101280600"
    return SearchRecJobs(city_code=resolved_code, client=http_client)
def extract_company_id(job: dict) -> Optional[str]:
    """Return the company ID (``brandId``) of a Boss job dict as a string.

    Args:
        job: Job record; only its ``"brandId"`` entry is read.

    Returns:
        ``str(brandId)`` when the value is truthy, otherwise ``None``
        (missing key, ``None``, empty string, ``0``).
    """
    raw_id = job.get("brandId")
    if not raw_id:
        return None
    return str(raw_id)
def create_company_fetcher(company_id: str, http_client: BossClient) -> BaseFetcher:
    """Build the Boss company-detail fetcher for one company.

    Args:
        company_id: Company identifier, passed to the fetcher as ``brand_id``.
        http_client: Client handed to the fetcher as ``client``.

    Returns:
        A ``GetBrandDetail`` instance for the given company.
    """
    detail_fetcher = GetBrandDetail(client=http_client, brand_id=company_id)
    return detail_fetcher
def main():
    """Build client options from the environment and start the Boss crawl loop.

    Environment variables read here:
        BOSS_MPT / BOSS_WT2: Boss tokens. When at least one is non-empty, a
            ``BossSign`` is built from both and passed to the client as
            ``signer``.
        PROXY_TUNNEL: Proxy endpoint placed after ``scheme://`` in the proxy
            URL. When empty, no proxy option is set.
        PROXY_SCHEME: Proxy URL scheme (default ``"http"``).
        PROXY_USERNAME / PROXY_PASSWORD: Proxy credentials. They are only
            embedded in the proxy URL when both are non-empty.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    environ = os.environ
    client_kwargs = {}

    mpt = environ.get("BOSS_MPT", "")
    wt2 = environ.get("BOSS_WT2", "")
    if mpt or wt2:
        client_kwargs["signer"] = BossSign(mpt=mpt, wt2=wt2)

    tunnel = environ.get("PROXY_TUNNEL", "")
    if tunnel:
        scheme = environ.get("PROXY_SCHEME", "http")
        username = environ.get("PROXY_USERNAME", "")
        password = environ.get("PROXY_PASSWORD", "")
        if username and password:
            # Credentials containing "@", ":", "/" or "#" would otherwise
            # corrupt the URL's userinfo part, so percent-encode them.
            # "%" is left untouched so values that were already
            # percent-encoded in the environment are not encoded twice.
            user_part = quote(username, safe="%")
            secret_part = quote(password, safe="%")
            client_kwargs["tunnel_proxy"] = (
                f"{scheme}://{user_part}:{secret_part}@{tunnel}"
            )
        else:
            client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"

    # NOTE(review): the module docstring advertises a MAX_PAGES environment
    # variable, but a literal 3 is passed here — confirm that run_crawl_loop
    # applies the environment override itself.
    run_crawl_loop(
        platform="boss",
        create_searcher=create_searcher,
        create_client_fn=create_client,
        max_pages=3,
        data_type="job",
        client_kwargs=client_kwargs,
        extract_company_id=extract_company_id,
        create_company_fetcher=create_company_fetcher,
    )


if __name__ == "__main__":
    main()