Plan 01 - facade migration (ARCH-06/07):
- boss.py: import from spiderJobs.platforms.boss.{api,client,sign}
- qcwy.py: import from spiderJobs.platforms.job51.{api,client}
- zhilian.py: import from spiderJobs.platforms.zhilian.{api,client,sign}
- All 3 Service classes: +4 async_* methods via asyncio.to_thread()
Plan 02 - deprecation + cleanup (ARCH-08):
- 11 private copy files (_base, _http_client, _boss/job51/zhilian *): DEPRECATED header
- jobs_spider/ directory: fully deleted (user request)
Full regression: 106 passed in 0.61s
129 lines
4.1 KiB
Python
129 lines
4.1 KiB
Python
# ⚠️ DEPRECATED — 2026-03-21
|
||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||
# 将在下一里程碑中删除。
|
||
#
|
||
"""
|
||
通用 HTTP 客户端
|
||
基于 requests-go,自带 Chrome TLS 指纹伪装
|
||
支持代理 IP / 隧道代理 / 代理池轮换
|
||
与任何招聘平台无关,纯粹负责发请求
|
||
|
||
复制自 spiderJobs/core/http_client.py — 不要直接 import spiderJobs,避免跨模块依赖
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import random
|
||
from typing import Any, Optional
|
||
|
||
import requests_go as requests
|
||
from requests_go.tls_config import TLS_CHROME_LATEST
|
||
|
||
|
||
class HTTPClient:
    """Platform-agnostic HTTP client built on ``requests_go``.

    Every session created by this class is assigned the
    ``TLS_CHROME_LATEST`` TLS configuration.

    Proxy precedence: ``tunnel_proxy`` > ``proxy_pool`` > ``proxy``.

    * ``tunnel_proxy``: every request runs on a fresh, short-lived session
      that is routed through the tunnel and closed afterwards.
    * ``proxy_pool``: every request picks a random entry from the pool and
      passes it as the per-request ``proxies`` argument.
    * ``proxy``: installed once on the persistent session, and only when
      neither of the other two options is given.

    The client may be used as a context manager; leaving the ``with`` block
    closes the persistent session.
    """

    def __init__(
        self,
        base_url: str,
        default_headers: Optional[dict] = None,
        proxy: Optional[str] = None,
        tunnel_proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        """Store the configuration and create the persistent session.

        Args:
            base_url: Prefix prepended verbatim to every request path.
            default_headers: Headers sent with every request; per-call
                headers override them key by key.
            proxy: Single proxy URL, used only when neither ``tunnel_proxy``
                nor ``proxy_pool`` is provided.
            tunnel_proxy: Tunnel proxy URL; takes precedence over the other
                proxy options.
            proxy_pool: Proxy URLs to choose from at random per request.
            timeout: Value forwarded as ``timeout`` to each request.
        """
        self.base_url = base_url
        self.default_headers = default_headers or {}
        self.timeout = timeout

        self._proxy = proxy
        self._tunnel_proxy = tunnel_proxy
        self._proxy_pool = proxy_pool

        # Same construction path as the per-request tunnel sessions.
        self._session = self._new_session()

        # A fixed proxy is only installed when it is the sole proxy option.
        if proxy and not proxy_pool and not tunnel_proxy:
            self._session.proxies = {"http": proxy, "https": proxy}

    def __enter__(self) -> "HTTPClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Close the persistent session when the ``with`` block ends."""
        self.close()

    def close(self) -> None:
        """Close the persistent session held by this client."""
        self._session.close()

    def _new_session(self) -> "requests.Session":
        """Create a session configured with ``TLS_CHROME_LATEST``."""
        session = requests.Session()
        session.tls_config = TLS_CHROME_LATEST
        # NOTE(review): this mutates the shared module-level TLS config
        # object, so the flag applies to every session that references it.
        TLS_CHROME_LATEST.random_ja3 = True
        return session

    def _get_proxies(self) -> Optional[dict]:
        """Pick a random proxy from the pool for a single request.

        Returns:
            A ``{"http": ..., "https": ...}`` mapping whose value is the
            chosen proxy URL followed by ``#`` and a random six-digit
            number, or ``None`` when no (non-empty) pool is configured.
        """
        if not self._proxy_pool:
            return None
        chosen = random.choice(self._proxy_pool)
        # NOTE(review): the random fragment presumably keeps connections to
        # the same proxy from being reused between requests; confirm.
        unique = f"{chosen}#{random.randint(100000, 999999)}"
        return {"http": unique, "https": unique}

    def _merge_headers(self, extra: Optional[dict] = None) -> dict:
        """Return a new dict of default headers overridden by ``extra``."""
        headers = dict(self.default_headers)
        if extra:
            headers.update(extra)
        return headers

    def _send(
        self,
        method: str,
        path: str,
        headers: Optional[dict],
        **payload: Any,
    ) -> tuple[int, Any]:
        """Issue one request, applying the proxy precedence rules.

        Args:
            method: Name of the session method to call (``"get"``/``"post"``).
            path: Path appended verbatim to ``base_url``.
            headers: Per-call headers merged over the default headers.
            **payload: Extra keyword arguments for the session call
                (``json=...`` for POST, ``params=...`` for GET).

        Returns:
            ``(status_code, parsed_json_body)``.

        Errors raised by the session call or by ``resp.json()`` propagate
        unchanged.
        """
        url = f"{self.base_url}{path}"
        merged_headers = self._merge_headers(headers)

        if self._tunnel_proxy:
            # Tunnel mode: a throwaway session per request, always closed.
            session = self._new_session()
            try:
                resp = getattr(session, method)(
                    url,
                    headers=merged_headers,
                    proxies={
                        "http": self._tunnel_proxy,
                        "https": self._tunnel_proxy,
                    },
                    timeout=self.timeout,
                    **payload,
                )
                return resp.status_code, resp.json()
            finally:
                session.close()

        kwargs: dict[str, Any] = {
            **payload,
            "headers": merged_headers,
            "timeout": self.timeout,
        }
        proxies = self._get_proxies()
        if proxies:
            # Only override the session-level proxies when a pool is in use.
            kwargs["proxies"] = proxies
        resp = getattr(self._session, method)(url, **kwargs)
        return resp.status_code, resp.json()

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """POST ``body`` as JSON to ``base_url + path``.

        Returns:
            ``(status_code, parsed_json_body)``.
        """
        return self._send("post", path, headers, json=body)

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """GET ``base_url + path`` with optional query ``params``.

        Returns:
            ``(status_code, parsed_json_body)``.
        """
        return self._send("get", path, headers, params=params)
|