2026-03-22 23:22:30 +08:00

60 lines
1.8 KiB
Python

"""
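Standalone company-crawler entry point for 51Job (前程无忧).

Fetches the 51job companies waiting in the pending_company queue, then calls
GetCompanyInfo for each one to retrieve its details and upload them.

Usage:
    python -m spiderJobs.platforms.job51.company_main

Environment variables:
    API_BASE_URL        Backend address (default http://124.222.106.226:9999)
    COMPANY_BATCH_SIZE  Number of companies fetched per batch (default 10)
    SLEEP_MIN_SECONDS   Minimum delay in seconds (default 10)
    SLEEP_MAX_SECONDS   Maximum delay in seconds (default 20)
    PROXY_TUNNEL        Tunnel proxy address; no proxy is configured when empty or unset
    PROXY_SCHEME        Scheme used for the proxy URL (default http)
    PROXY_USERNAME      Proxy username (used only together with PROXY_PASSWORD)
    PROXY_PASSWORD      Proxy password (used only together with PROXY_USERNAME)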
"""
from __future__ import annotations
import os
import sys
# Put the directory three levels above this file at the front of sys.path
# (unless it is already listed) before the spiderJobs imports below run.
_this_dir = os.path.dirname(__file__)
_project_root = os.path.abspath(
    os.path.join(_this_dir, os.pardir, os.pardir, os.pardir)
)
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)
from spiderJobs.core.base import BaseFetcher
from spiderJobs.platforms.job51.api import GetCompanyInfo
from spiderJobs.platforms.job51.client import Job51Client, create_client
from spiderJobs.runner.company_loop import run_company_loop
def create_company_fetcher(company_id: str, http_client: Job51Client) -> BaseFetcher:
    """Build the fetcher used to retrieve one 51job company's details.

    Args:
        company_id: Company identifier, forwarded to ``GetCompanyInfo`` as
            ``company_id``.
        http_client: 51job client, forwarded to ``GetCompanyInfo`` as
            ``client``.

    Returns:
        The object produced by ``GetCompanyInfo`` for this company and client.
    """
    fetcher = GetCompanyInfo(company_id=company_id, client=http_client)
    return fetcher
def main():
    """Run the 51job company crawl loop, optionally through a tunnel proxy.

    Proxy settings are read from the environment:

    * ``PROXY_TUNNEL``: tunnel proxy address. When it is empty or unset, no
      ``tunnel_proxy`` entry is added to the client keyword arguments.
    * ``PROXY_SCHEME``: scheme of the proxy URL. Falls back to ``http`` when
      unset or empty.
    * ``PROXY_USERNAME`` / ``PROXY_PASSWORD``: credentials. They are embedded
      in the proxy URL only when both are non-empty.

    The resulting proxy URL is passed to ``run_company_loop`` as the
    ``tunnel_proxy`` entry of ``client_kwargs``.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    from urllib.parse import quote

    client_kwargs = {}
    tunnel = os.environ.get("PROXY_TUNNEL", "")
    if tunnel:
        # "or" also covers a variable that is set but empty, which would
        # otherwise produce a scheme-less URL such as "://host".
        scheme = os.environ.get("PROXY_SCHEME") or "http"
        username = os.environ.get("PROXY_USERNAME", "")
        password = os.environ.get("PROXY_PASSWORD", "")
        if username and password:
            # Percent-encode the credentials so reserved characters such as
            # "@", ":" or "/" cannot be mistaken for URL delimiters. "%" is
            # left untouched so values that are already percent-encoded are
            # not encoded a second time.
            safe_username = quote(username, safe="%")
            safe_password = quote(password, safe="%")
            client_kwargs["tunnel_proxy"] = (
                f"{scheme}://{safe_username}:{safe_password}@{tunnel}"
            )
        else:
            client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"

    # NOTE(review): "qcwy" is presumably the platform key for 51job
    # (Qian Cheng Wu You) -- confirm against run_company_loop.
    run_company_loop(
        platform="qcwy",
        create_company_fetcher=create_company_fetcher,
        create_client_fn=create_client,
        client_kwargs=client_kwargs,
    )
# Start the crawler only when this file is executed as a script
# (for example via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()