JobData/spiderJobs/runner/api_client.py
2026-03-22 23:22:30 +08:00

215 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
runner.api_client - 爬虫与后端 API 的通信层
提供关键词获取、进度汇报、数据上传等功能。
爬虫主循环通过此模块与后端交互,实现状态管理。
"""
from __future__ import annotations
import json
import os
import time
import uuid
from typing import Any, Optional
import requests
class RunnerAPIClient:
    """Backend API client used by the crawler main loop.

    Covers keyword scheduling, progress reporting, batch data upload,
    platform-token lookup and the company queue. All requests go through a
    single ``requests.Session`` that carries the API token in the ``token``
    header.

    Transport and decoding failures never propagate: the low-level helpers
    retry up to ``max_retries`` times and then return ``{}``; the ``fetch_*``
    methods translate that into an empty list or ``None``.
    """

    # Retry / timeout knobs. Override on the class or on an instance.
    max_retries: int = 3
    retry_backoff: float = 2.0  # seconds; multiplied by the attempt number
    get_timeout: float = 15
    post_timeout: float = 30

    def __init__(
        self,
        base_url: str = "",
        api_token: str = "dev",
        platform: str = "",
        crawler_id: str = "",
    ):
        """Create a client bound to one platform.

        Args:
            base_url: Backend root URL. When empty, falls back to the
                ``API_BASE_URL`` environment variable, then to
                ``http://127.0.0.1:9999``. Trailing slashes are stripped.
            api_token: Value sent in the ``token`` request header.
            platform: Platform identifier sent as ``source`` / ``platform``
                in request payloads.
            crawler_id: Identifier of this crawler instance. When empty, one
                is generated as ``"<platform>-<8 hex chars>"``.
        """
        self.base_url = (
            base_url or os.environ.get("API_BASE_URL", "http://127.0.0.1:9999")
        ).rstrip("/")
        # NOTE(review): the default "dev" is truthy, so the API_TOKEN
        # environment fallback is only reached when a caller explicitly
        # passes an empty token. Confirm whether the env var was meant to
        # take precedence over the default before changing this.
        self.api_token = api_token or os.environ.get("API_TOKEN", "dev")
        self.platform = platform
        self.crawler_id = crawler_id or f"{platform}-{uuid.uuid4().hex[:8]}"
        self._session = requests.Session()
        self._session.headers.update({"token": self.api_token})

    # ─────────────────────────────────────────────
    # Keyword scheduling
    # ─────────────────────────────────────────────
    def fetch_keyword(self, limit: int = 1) -> list[dict]:
        """Fetch up to ``limit`` available keywords for this platform.

        The request is sent with ``reserve=true`` and this client's
        ``crawler_id``; per the original author's note the backend then
        atomically locks the returned keywords into the ``crawling`` state
        (server-side behavior, not verifiable from this module).

        Returns:
            The ``data.items`` list of the response, or ``[]`` when the
            request failed, the response code is not 200, or the payload has
            no items. Each item is expected to carry ``id``, ``city``,
            ``job``, ``last_completed_page`` and ``crawl_status``.
        """
        resp = self._get(
            "/api/v1/keyword/available",
            params={
                "source": self.platform,
                "limit": limit,
                "reserve": "true",
                "crawler_id": self.crawler_id,
            },
        )
        if resp.get("code") != 200:
            return []
        data = resp.get("data")
        # "data" may be JSON null; only dig into it when it is an object.
        items = data.get("items") if isinstance(data, dict) else None
        return items or []

    def report_page_progress(
        self,
        keyword_id: int,
        page: int,
        total_pages: int = 0,
        jobs_found: int = 0,
    ) -> dict:
        """Report the progress of a single crawled page.

        Returns:
            The decoded backend response, or ``{}`` if every attempt failed.
        """
        return self._post(
            "/api/v1/keyword/page-progress",
            body={
                "source": self.platform,
                "keyword_id": keyword_id,
                "page": page,
                "total_pages": total_pages,
                "jobs_found": jobs_found,
            },
        )

    def report_crawl_complete(
        self,
        keyword_id: int,
        status: str = "completed",
        error_message: str = "",
    ) -> dict:
        """Report that crawling a keyword finished or failed.

        Returns:
            The decoded backend response, or ``{}`` if every attempt failed.
        """
        return self._post(
            "/api/v1/keyword/crawl-complete",
            body={
                "source": self.platform,
                "keyword_id": keyword_id,
                "status": status,
                "error_message": error_message,
            },
        )

    # ─────────────────────────────────────────────
    # Data upload
    # ─────────────────────────────────────────────
    def upload_data(
        self,
        data_list: list[dict],
        data_type: str = "job",
        channel: str = "mini",
    ) -> dict:
        """Upload a batch of records to the async batch-store endpoint.

        An empty ``data_list`` is skipped without any request and yields a
        synthetic success response. Otherwise a summary line is printed
        before the request and another one after it.

        Returns:
            The decoded backend response, or ``{}`` if every attempt failed.
        """
        if not data_list:
            return {"code": 200, "message": "空数据跳过"}

        endpoint = "/api/v1/universal/data/batch-store-async"
        print(
            f"[上报] {self.platform}/{data_type} | "
            f"条数={len(data_list)} | channel={channel} | "
            f"目标={self.base_url}{endpoint}"
        )
        resp = self._post(
            endpoint,
            body={
                "data_list": data_list,
                "data_type": data_type,
                "platform": self.platform,
                "channel": channel,
            },
        )
        code = resp.get("code", "?")
        msg = resp.get("msg") or resp.get("message", "")
        data = resp.get("data")
        stored = data.get("stored") if isinstance(data, dict) else None
        # Compare against "missing" explicitly so a real count of 0 is shown.
        stored_part = f"stored={stored}" if stored not in (None, "") else ""
        print(f"[上报] 响应: code={code} msg={msg} {stored_part}")
        return resp

    # ─────────────────────────────────────────────
    # Token management (needed by the Boss platform)
    # ─────────────────────────────────────────────
    def fetch_token(self) -> Optional[dict]:
        """Fetch one usable platform token.

        Returns:
            The first element of the response's ``data`` list, or ``None``
            when the request failed, the code is not 200, or ``data`` is not
            a non-empty list.
        """
        resp = self._get(
            "/api/v1/token/tokens",
            params={"platform": self.platform},
        )
        if resp.get("code") != 200:
            return None
        tokens = resp.get("data")
        # Indexing with [0] is only meaningful for a list payload.
        if isinstance(tokens, list) and tokens:
            return tokens[0]
        return None

    # ─────────────────────────────────────────────
    # Company queue
    # ─────────────────────────────────────────────
    def fetch_pending_companies(
        self,
        limit: int = 10,
        status: str = "pending",
    ) -> list[dict]:
        """Fetch companies waiting to be crawled.

        ``limit`` is sent as the ``page_size`` query parameter.

        Returns:
            The response's ``data`` value, or ``[]`` when the request failed,
            the code is not 200, or ``data`` is missing/null/empty. Each
            element is expected to carry ``source``, ``company_id``,
            ``company_name``, ``status`` and ``error_msg``.
        """
        resp = self._get(
            "/api/v1/cleaning/companies",
            params={
                "source": self.platform,
                "status": status,
                "page_size": limit,
            },
        )
        if resp.get("code") != 200:
            return []
        return resp.get("data") or []

    def update_company_status(
        self,
        company_id: str,
        status: str = "done",
        error_message: str = "",
    ) -> dict:
        """Update a company's crawl status (e.g. ``done`` / ``failed``).

        Returns:
            The decoded backend response, or ``{}`` if every attempt failed.
        """
        return self._post(
            "/api/v1/cleaning/update-company-status",
            body={
                "source": self.platform,
                "company_id": company_id,
                "status": status,
                "error_message": error_message,
            },
        )

    # ─────────────────────────────────────────────
    # HTTP plumbing
    # ─────────────────────────────────────────────
    def _get(self, path: str, params: Optional[dict] = None) -> dict:
        """GET ``path`` with query ``params``; return the JSON object or ``{}``."""
        return self._request(
            "GET", path, self._session.get, params=params, timeout=self.get_timeout
        )

    def _post(self, path: str, body: dict) -> dict:
        """POST ``body`` as JSON to ``path``; return the JSON object or ``{}``."""
        return self._request(
            "POST", path, self._session.post, json=body, timeout=self.post_timeout
        )

    def _request(self, verb: str, path: str, send, **kwargs) -> dict:
        """Call ``send(url, **kwargs)`` with retries and decode a JSON object.

        Args:
            verb: HTTP verb, used only in the log line.
            path: Path appended to ``base_url``.
            send: Bound session method (``get`` or ``post``) that performs
                the request.
            **kwargs: Forwarded to ``send`` unchanged.

        Returns:
            The decoded JSON object, or ``{}`` after ``max_retries`` failed
            attempts. A response that decodes to something other than a JSON
            object counts as a failed attempt, so callers can always use
            ``.get`` on the result.
        """
        url = f"{self.base_url}{path}"
        for attempt in range(1, self.max_retries + 1):
            try:
                payload = send(url, **kwargs).json()
                if not isinstance(payload, dict):
                    raise ValueError(
                        f"响应不是 JSON 对象: {type(payload).__name__}"
                    )
                return payload
            except Exception as exc:
                # Deliberately broad: this is the best-effort network
                # boundary and the crawler must keep running on any failure.
                print(f"[API] {verb} {path} 第{attempt}次失败: {exc}")
            # Back off only when another attempt will actually follow.
            if attempt < self.max_retries:
                time.sleep(self.retry_backoff * attempt)
        return {}