JobData/app/services/crawler/_http_client.py
win 3aadbd128b feat(04): migrate facade to spiderJobs.platforms.* + asyncio bridge; delete jobs_spider/
Plan 01 - facade migration (ARCH-06/07):
- boss.py: import from spiderJobs.platforms.boss.{api,client,sign}
- qcwy.py: import from spiderJobs.platforms.job51.{api,client}
- zhilian.py: import from spiderJobs.platforms.zhilian.{api,client,sign}
- All 3 Service classes: +4 async_* methods via asyncio.to_thread()

Plan 02 - deprecation + cleanup (ARCH-08):
- 11 private copy files (_base, _http_client, _boss/job51/zhilian *): DEPRECATED header
- jobs_spider/ directory: fully deleted (user request)

Full regression: 106 passed in 0.61s
2026-03-21 19:36:24 +08:00

129 lines
4.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ⚠️ DEPRECATED — 2026-03-21
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
# 将在下一里程碑中删除。
#
"""
通用 HTTP 客户端
基于 requests-go, 自带 Chrome TLS 指纹伪装
支持代理 IP / 隧道代理 / 代理池轮换
与任何招聘平台无关,纯粹负责发请求
复制自 spiderJobs/core/http_client.py — 不要直接 import spiderJobs, 避免跨模块依赖
"""
from __future__ import annotations
import random
from typing import Any, Optional
import requests_go as requests
from requests_go.tls_config import TLS_CHROME_LATEST
class HTTPClient:
    """Generic HTTP client that sends JSON requests through requests-go sessions.

    Proxy precedence: tunnel_proxy > proxy_pool > proxy.

    * ``tunnel_proxy`` -- every request uses a fresh, short-lived session
      routed through the tunnel; the session is closed after the response
      body has been parsed.
    * ``proxy_pool`` -- every request goes through the persistent session
      with one proxy picked at random from the pool.
    * ``proxy`` -- installed once on the persistent session, and only when
      neither of the other two options is given.

    The persistent session can be released with :meth:`close`, or by using
    the client as a context manager.
    """

    def __init__(
        self,
        base_url: str,
        default_headers: Optional[dict] = None,
        proxy: Optional[str] = None,
        tunnel_proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        """Store the configuration and create the persistent session.

        Args:
            base_url: Prefix concatenated verbatim with ``path`` on each call.
            default_headers: Headers sent with every request; per-call
                headers override them key by key.
            proxy: Single proxy URL used for both ``http`` and ``https``.
            tunnel_proxy: Proxy URL used with a fresh session per request.
            proxy_pool: Proxy URLs to choose from at random per request.
            timeout: Passed through as ``timeout`` to every request call.
        """
        self.base_url = base_url
        self.default_headers = default_headers or {}
        self.timeout = timeout
        self._proxy = proxy
        self._tunnel_proxy = tunnel_proxy
        self._proxy_pool = proxy_pool
        # Same setup as every other session this client creates.
        self._session = self._new_session()
        # A static proxy is the lowest-priority option: only install it on
        # the session when no pool and no tunnel were configured.
        if proxy and not proxy_pool and not tunnel_proxy:
            self._session.proxies = {"http": proxy, "https": proxy}

    def _new_session(self) -> "requests.Session":
        """Create a session configured with the ``TLS_CHROME_LATEST`` profile."""
        session = requests.Session()
        session.tls_config = TLS_CHROME_LATEST
        # NOTE: TLS_CHROME_LATEST is a module-level object, so this flag is
        # set on the shared config, not on a per-session copy.
        # Presumably it makes requests-go vary the JA3 fingerprint -- confirm
        # against the requests-go documentation.
        TLS_CHROME_LATEST.random_ja3 = True
        return session

    def _get_proxies(self) -> Optional[dict]:
        """Return a per-request proxies mapping drawn from the pool.

        Returns:
            ``None`` when no pool is configured (or it is empty); otherwise a
            dict mapping both ``http`` and ``https`` to the chosen proxy URL
            with a random ``#<6 digits>`` suffix appended.
        """
        if not self._proxy_pool:
            return None
        chosen = random.choice(self._proxy_pool)
        # NOTE(review): the random fragment makes every proxy URL string
        # unique; presumably this stops the HTTP layer from reusing a pooled
        # connection to the same proxy -- confirm before changing.
        unique = f"{chosen}#{random.randint(100000, 999999)}"
        return {"http": unique, "https": unique}

    def _merge_headers(self, extra: Optional[dict] = None) -> dict:
        """Return a new dict of default headers overlaid with ``extra``."""
        headers = {**self.default_headers}
        if extra:
            headers.update(extra)
        return headers

    def _send(
        self,
        verb: str,
        path: str,
        headers: Optional[dict],
        **payload: Any,
    ) -> tuple[int, Any]:
        """Issue one request and return ``(status_code, parsed_json)``.

        Shared implementation behind :meth:`get` and :meth:`post`.

        Args:
            verb: Name of the session method to call (``"get"``/``"post"``).
            path: Appended verbatim to ``base_url``.
            headers: Per-call headers merged over the defaults.
            **payload: Verb-specific keyword arguments (``json=`` or
                ``params=``) forwarded unchanged to the session method.

        Raises:
            Whatever the session call or ``resp.json()`` raises; nothing is
            caught here.
        """
        url = f"{self.base_url}{path}"
        kwargs: dict[str, Any] = {
            **payload,
            "headers": self._merge_headers(headers),
            "timeout": self.timeout,
        }

        if self._tunnel_proxy:
            # Tunnel mode: a brand-new session per request, always closed.
            kwargs["proxies"] = {
                "http": self._tunnel_proxy,
                "https": self._tunnel_proxy,
            }
            session = self._new_session()
            try:
                resp = getattr(session, verb)(url, **kwargs)
                # Parse the body before the session is closed.
                return resp.status_code, resp.json()
            finally:
                session.close()

        # Pool mode overrides proxies per request; otherwise whatever is
        # configured on the persistent session (static proxy or none) applies.
        proxies = self._get_proxies()
        if proxies:
            kwargs["proxies"] = proxies
        resp = getattr(self._session, verb)(url, **kwargs)
        return resp.status_code, resp.json()

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """POST ``body`` as JSON to ``base_url + path``.

        Returns:
            ``(status_code, parsed_json_response)``.
        """
        return self._send("post", path, headers, json=body)

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """GET ``base_url + path`` with optional query ``params``.

        Returns:
            ``(status_code, parsed_json_response)``.
        """
        return self._send("get", path, headers, params=params)

    def close(self) -> None:
        """Close the persistent session held by this client."""
        self._session.close()

    def __enter__(self) -> "HTTPClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Close the persistent session; exceptions are not suppressed."""
        self.close()