2026-03-24 01:49:31 +08:00

113 lines
3.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
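Entry point for the 51Job (前程无忧) mini-program crawler.

Features:
    1. Fetch keywords from the backend (priority: resume from checkpoint > retry failed > fresh).
    2. Call SearchRecommendJobs to crawl the job list page by page.
    3. Upload data and report progress in real time after every page.
    4. Support resuming from the checkpointed page number.
    5. Optional: fetch company details inline while searching for jobs.

Usage:
    python -m spiderJobs.platforms.job51.main

Environment variables:
    API_BASE_URL        Backend address (default http://124.222.106.226:9999)
    MAX_PAGES           Maximum number of pages per keyword (default 3)
    SLEEP_MIN_SECONDS   Minimum delay in seconds (default 10)
    SLEEP_MAX_SECONDS   Maximum delay in seconds (default 20)
    INLINE_COMPANY      Whether to fetch companies inline (default 1; set to 0 to disable)
"""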
from __future__ import annotations
import os
import sys
from typing import Optional
# Put the directory three levels above this file's own directory at the front
# of sys.path so that the absolute `crawler_core` / `spiderJobs` imports that
# follow can be resolved when this module is executed directly.
_project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), *([os.pardir] * 3))
)
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)
from crawler_core.base import BaseFetcher, BaseSearcher
from spiderJobs.platforms.job51.api import GetCompanyInfo, SearchRecommendJobs
from spiderJobs.platforms.job51.client import Job51Client, create_client
from spiderJobs.runner.loop import run_crawl_loop
# 51job city-code mapping: city name (the value found under a keyword's
# "city" key) -> six-digit area code passed to the searcher as `job_area`.
CITY_CODE_MAP: dict[str, str] = {
    "全国": "000000",  # nationwide
    "北京": "010000",  # Beijing
    "上海": "020000",  # Shanghai
    "广州": "030200",  # Guangzhou
    "深圳": "040000",  # Shenzhen
    "杭州": "080200",  # Hangzhou
    "成都": "090200",  # Chengdu
    "南京": "070200",  # Nanjing
    "武汉": "180200",  # Wuhan
    "西安": "200200",  # Xi'an
    "长沙": "190200",  # Changsha
    "重庆": "060000",  # Chongqing
    "苏州": "070300",  # Suzhou
    "天津": "050000",  # Tianjin
    "厦门": "110300",  # Xiamen
    "郑州": "170200",  # Zhengzhou
    "合肥": "150200",  # Hefei
    "济南": "120200",  # Jinan
    "青岛": "120300",  # Qingdao
    "大连": "230300",  # Dalian
    "东莞": "030800",  # Dongguan
    "佛山": "030600",  # Foshan
    "珠海": "030500",  # Zhuhai
    "无锡": "070400",  # Wuxi
    "宁波": "080300",  # Ningbo
}
def create_searcher(keyword: dict, http_client: Job51Client) -> BaseSearcher:
    """Build the 51job job-list searcher for one keyword record.

    Only the record's ``"city"`` entry is read here: it is looked up in
    ``CITY_CODE_MAP`` and the resulting area code is handed to
    ``SearchRecommendJobs`` together with ``http_client``.

    Args:
        keyword: Keyword record; ``keyword.get("city", "")`` selects the area.
        http_client: Client forwarded to the searcher as ``client``.

    Returns:
        The ``SearchRecommendJobs`` instance configured for that area.
    """
    city_name = keyword.get("city", "")
    # A missing or unmapped city falls back to "020000", the code that
    # CITY_CODE_MAP lists for 上海 (Shanghai).
    area_code = CITY_CODE_MAP.get(city_name, "020000")
    return SearchRecommendJobs(job_area=area_code, client=http_client)
def extract_company_id(job: dict) -> Optional[str]:
    """Return the company ID stored under ``"coId"`` in a 51job job dict.

    Args:
        job: Job record as a dict.

    Returns:
        The ``"coId"`` value converted with ``str`` when the key is present
        and truthy; ``None`` when it is missing or falsy (``None``, ``""``,
        ``0``).
    """
    raw_id = job.get("coId")
    if not raw_id:
        return None
    return str(raw_id)
def create_company_fetcher(company_id: str, http_client: Job51Client) -> BaseFetcher:
    """Build the fetcher for one 51job company's details.

    Args:
        company_id: Company ID forwarded to ``GetCompanyInfo`` as ``company_id``.
        http_client: Client forwarded to ``GetCompanyInfo`` as ``client``.

    Returns:
        The constructed ``GetCompanyInfo`` instance.
    """
    fetcher = GetCompanyInfo(company_id=company_id, client=http_client)
    return fetcher
def main():
    """Configure the optional tunnel proxy and start the 51job crawl loop.

    The proxy is read from the ``PROXY_TUNNEL`` environment variable and
    falls back to ``spiderJobs.DEFAULT_TUNNEL_PROXY``. An empty value, or the
    string "none" in any letter case, leaves the client without a proxy.
    """
    # Function-scope import, kept where the original placed it.
    from spiderJobs import DEFAULT_TUNNEL_PROXY

    proxy_url = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
    proxy_enabled = bool(proxy_url) and proxy_url.lower() != "none"

    client_kwargs = {}
    if proxy_enabled:
        client_kwargs["tunnel_proxy"] = proxy_url
        # Print only what follows the last "@", so a "user:password@" prefix
        # never reaches the output; a value without "@" is printed unchanged.
        shown = proxy_url.rpartition("@")[2]
        print(f"[qcwy] 隧道代理: {shown}")

    # NOTE(review): the module docstring documents a MAX_PAGES environment
    # variable (default 3), while 100 is passed here. Whether run_crawl_loop
    # lets the variable override this argument is not visible in this file;
    # confirm.
    run_crawl_loop(
        platform="qcwy",
        create_searcher=create_searcher,
        create_client_fn=create_client,
        max_pages=100,
        data_type="job",
        client_kwargs=client_kwargs,
        extract_company_id=extract_company_id,
        create_company_fetcher=create_company_fetcher,
    )


# Run the crawler only when this file is executed as a script or via
# `python -m`, not when it is imported.
if __name__ == "__main__":
    main()