""" runner.loop - 通用爬虫主循环 提供 run_crawl_loop() 作为所有平台的统一入口。 各平台只需提供 create_searcher(keyword, client) 工厂函数。 可选:传入 extract_company_id / create_company_fetcher 实现搜索 job 时顺带抓取公司详情(维度1)。 """ from __future__ import annotations import os import random import time import traceback from typing import Any, Callable, Optional from spiderJobs.core.base import ApiResult, BaseFetcher, BaseSearcher from spiderJobs.runner.api_client import RunnerAPIClient def sleep_random(min_s: float = 10, max_s: float = 20) -> None: """反爬随机延迟""" delay = random.uniform(min_s, max_s) print(f"[延迟] 等待 {delay:.1f}s ...") time.sleep(delay) def _crawl_companies_from_jobs( jobs: list[dict], *, extract_company_id: Callable[[dict], Optional[str]], create_company_fetcher: Callable[[str, Any], BaseFetcher], http_client: Any, api: RunnerAPIClient, seen_companies: set[str], sleep_min: float, sleep_max: float, ) -> None: """从 job 结果中提取公司 ID 并抓取公司详情(内联公司爬取)""" new_ids: list[str] = [] for job in jobs: cid = extract_company_id(job) if cid and cid not in seen_companies: seen_companies.add(cid) new_ids.append(cid) if not new_ids: return print(f" [公司] 发现 {len(new_ids)} 个新公司,开始抓取详情...") ok = 0 for cid in new_ids: sleep_random(sleep_min, sleep_max) try: fetcher = create_company_fetcher(cid, http_client) result = fetcher.fetch() if result.success and result.data: data = result.data if isinstance(result.data, dict) else {"raw": result.data} api.upload_data([data], data_type="company") ok += 1 else: print(f" [公司] {cid} 获取失败: {result.error}") except Exception as e: print(f" [公司] {cid} 异常: {e}") print(f" [公司] 批次完成: {ok}/{len(new_ids)} 成功") def run_crawl_loop( *, platform: str, create_searcher: Callable[[dict, Any], BaseSearcher], create_client_fn: Callable[..., Any], max_pages: int = 3, sleep_min: float = 10, sleep_max: float = 20, data_type: str = "job", api_base_url: str = "", client_kwargs: dict | None = None, # ── 可选:内联公司爬取 ── extract_company_id: Callable[[dict], Optional[str]] | None = None, create_company_fetcher: Callable[[str, Any], BaseFetcher] | None = None, # ── 可选:获取详情enrichment ── enrich_job: Callable[[dict, Any], Optional[dict]] | None = None, ) -> None: """通用爬虫主循环 Args: platform: 平台标识 (boss/qcwy/zhilian) create_searcher: 工厂函数 (keyword_dict, http_client) -> BaseSearcher create_client_fn: 平台 HTTP client 工厂 max_pages: 每个关键词最大翻页数 sleep_min/max: 请求间随机延迟范围(秒) data_type: 数据类型 (job/company) api_base_url: 后端 API 地址 client_kwargs: 传给 create_client_fn 的额外参数 extract_company_id: 从单条 job dict 提取公司 ID (可选) create_company_fetcher: 创建公司详情 fetcher (company_id, http_client) -> BaseFetcher (可选) enrich_job: 从列表 job dict 获取完整详情 (job_dict, http_client) -> 详情 dict (可选) """ max_pages = int(os.environ.get("MAX_PAGES", str(max_pages))) sleep_min = float(os.environ.get("SLEEP_MIN_SECONDS", str(sleep_min))) sleep_max = float(os.environ.get("SLEEP_MAX_SECONDS", str(sleep_max))) inline_company = bool( extract_company_id and create_company_fetcher and os.environ.get("INLINE_COMPANY", "1") != "0" ) api = RunnerAPIClient( base_url=api_base_url, platform=platform, ) print(f"[{platform}] 爬虫启动 | crawler_id={api.crawler_id}") print(f"[{platform}] API: {api.base_url} | max_pages={max_pages} | delay={sleep_min}-{sleep_max}s") if inline_company: print(f"[{platform}] 内联公司爬取: 已启用 (INLINE_COMPANY=0 可关闭)") http_client = create_client_fn(**(client_kwargs or {})) # 会话级公司 ID 去重集合 seen_companies: set[str] = set() while True: try: # 1. 获取关键词 keywords = api.fetch_keyword(limit=1) if not keywords: print(f"[{platform}] 无可用关键词,等待 60s 后重试...") time.sleep(60) continue kw = keywords[0] kw_id = kw["id"] city = kw.get("city", "") job = kw.get("job", "") start_page = (kw.get("last_completed_page") or 0) + 1 print(f"\n[{platform}] 开始爬取: city={city} job={job} (id={kw_id}, 从第{start_page}页)") # 2. 创建 searcher searcher = create_searcher(kw, http_client) # 3. 带断点续爬的分页抓取 all_jobs: list[dict] = [] last_page = start_page - 1 error_occurred = False for page_index in range(start_page, start_page + max_pages): sleep_random(sleep_min, sleep_max) try: result = searcher.search(page_index=page_index) except Exception as e: print(f"[{platform}] 第{page_index}页请求异常: {e}") error_occurred = True api.report_crawl_complete( kw_id, status="failed", error_message=str(e) ) break if not result.success: print(f"[{platform}] 第{page_index}页失败: {result.error}") error_occurred = True api.report_crawl_complete( kw_id, status="failed", error_message=result.error or "unknown", ) break page_jobs = result.list all_jobs.extend(page_jobs) last_page = page_index # 汇报进度 api.report_page_progress( keyword_id=kw_id, page=page_index, jobs_found=len(page_jobs), ) print( f"[{platform}] 第{page_index}页: {len(page_jobs)}条 | " f"累计: {len(all_jobs)}条 | is_end={result.is_end_page}" ) # 上传本页数据(实时推送,不积攒) if page_jobs: if enrich_job: # 逐条获取详情并立即上报,避免批量失败浪费 for j, job in enumerate(page_jobs): try: detail = enrich_job(job, http_client) upload_job = detail if detail else job except Exception as e: print(f" [详情] 第{j+1}条获取失败: {e}") upload_job = job api.upload_data([upload_job], data_type=data_type) if j < len(page_jobs) - 1: sleep_random(sleep_min, sleep_max) else: api.upload_data(page_jobs, data_type=data_type) # 内联公司爬取:从本页 job 中提取公司并抓取详情 if inline_company and page_jobs: _crawl_companies_from_jobs( page_jobs, extract_company_id=extract_company_id, create_company_fetcher=create_company_fetcher, http_client=http_client, api=api, seen_companies=seen_companies, sleep_min=sleep_min, sleep_max=sleep_max, ) if result.is_end_page: break # 4. 完成 if not error_occurred: api.report_crawl_complete(kw_id, status="completed") print(f"[{platform}] 关键词 {city}/{job} 完成,共{len(all_jobs)}条") except KeyboardInterrupt: print(f"\n[{platform}] 收到中断信号,退出...") break except Exception as e: print(f"[{platform}] 主循环异常: {e}") traceback.print_exc() time.sleep(30)