373 lines
14 KiB
Python
373 lines
14 KiB
Python
import requests
|
|
import time
|
|
import json
|
|
import uuid
|
|
from typing import Dict, Any, Optional, List
|
|
from app.core.algorithms.antispider import IPStrategyConfig, IPAnomalyDetector, SmartIPManager, generate_boss_trace_id, generate_token
|
|
from loguru import logger
|
|
import os
|
|
from urllib.parse import urlencode
|
|
|
|
class BossService:
    """HTTP client for the zhipin.com ``wapi`` job/company endpoints.

    The service owns a single ``requests.Session`` that carries the cookies,
    the proxy route and the default headers used by every request. All public
    fetch methods are best-effort: they log the failure and return ``None``
    instead of raising.
    """

    # Header names (lower-case) whose values are credentials; they are masked
    # before anything is written to the logs.
    _SENSITIVE_HEADERS = frozenset({
        "mpt",
        "wt2",
        "cookie",
        "set-cookie",
        "authorization",
        "proxy-authorization",
    })
    _REDACTED = "***"

    # Timeout, in seconds, applied to every outgoing HTTP request.
    _REQUEST_TIMEOUT = 30

    def __init__(self, proxy_pool: Optional[List[Dict[str, str]]] = None):
        """Create the session, pick the initial route and seed the cookies.

        Args:
            proxy_pool: Optional list of proxy configurations handed to
                ``SmartIPManager``; its exact schema is defined by that class.
        """
        self.app_id = 10002
        self.zp_product_id = 10002
        self.serve_domain = "https://www.zhipin.com"
        self.api_domain = "https://wxapp.zhipin.com"

        self.session = requests.Session()
        # Ignore proxy/CA settings coming from environment variables.
        self.session.trust_env = False
        # NOTE(review): this adds an HTTP request header literally named
        # ``no_proxy``; it does not configure any proxy bypass. Kept as-is to
        # avoid changing what is sent on the wire - confirm it is intended.
        self.session.headers.update({'no_proxy': '10.0.0.0/16,example.com,.example.com'})

        # Initialize IP strategy
        self.ip_cfg = IPStrategyConfig()
        self.ip_detector = IPAnomalyDetector(self.ip_cfg)
        self.ip_manager = SmartIPManager(proxy_pool, self.ip_cfg)

        # Initial route: only install proxies when the manager selects one.
        route_mode, route_cfg = self.ip_manager.current_route()
        if route_mode == 'proxy' and route_cfg:
            self.session.proxies = route_cfg

        self.device_id = str(uuid.uuid4())
        self.wx_version = "8.0.43"
        self.mini_version = "1.0.0"
        self.scene = 1001

        self.default_headers = {
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded",
            "Host": "www.zhipin.com",
            "Referer": "https://servicewechat.com/wxa8da525af05281f3/571/page-frame.html",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.10(0x13080a10) XWEB/1227",
            "X-Requested-With": "XMLHttpRequest",
            "platform": "zhipin/mac",
            "zp_app_id": str(self.app_id),
            "ver": "100.0000",
            "mini_ver": "100.0000",
            "ua": json.dumps({"model": "Mac16,8", "platform": "mac"}),
            "zp_product_id": str(self.zp_product_id),
            "scene": "1006",
            "xweb_xhr": "1",
            "sec-fetch-site": "cross-site",
            "sec-fetch-mode": "cors",
            "sec-fetch-dest": "empty",
        }

        self.login_data = {
            "mpt": "",  # Needs to be filled via login/token logic if required
            "wt2": "",
            "openId": "",
            "traceid": "F-77d05bnXuMVrHIB3",
        }

        self.current_token_id: Optional[int] = None
        self.init_cookies()

    def init_cookies(self):
        """Seed the session cookie jar for the ``.zhipin.com`` domain."""
        # One timestamp for all time-derived cookies so they agree with each
        # other even when the call straddles a second boundary.
        now = time.time()
        now_seconds = str(int(now))
        cookies = {
            '__zp_stoken__': generate_token(),
            'Hm_lvt_194df3105ad7148dcf2b98a91b5e727a': now_seconds,
            'Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a': now_seconds,
            '__c': self.device_id[:8],
            '__g': '-',
            '__l': 'l=%2Fwww.zhipin.com%2F&r=&friend_source=0&s=3&friend_source=0',
            'lastCity': '101010100',
            'cityName': '%E5%8C%97%E4%BA%AC',
            '__zp_sseed__': 'btHZ0bjBq8m//WNwlVrPUnVcIvini5J5P5LQUbflM24=',
            '__zp_sname__': '3998243a',
            '__zp_sts__': str(int(now * 1000)),
        }

        for name, value in cookies.items():
            self.session.cookies.set(name, value, domain='.zhipin.com')

    def set_login_data(self, mpt: str, wt2: str, open_id: str = "") -> None:
        """Store login credentials and mirror the non-empty ones as cookies.

        Args:
            mpt: Value stored under ``login_data["mpt"]`` and sent as the
                ``mpt`` header/cookie.
            wt2: Value stored under ``login_data["wt2"]`` and sent as the
                ``wt2`` header/cookie.
            open_id: Value stored under ``login_data["openId"]``.
        """
        self.login_data.update(
            {
                "mpt": mpt,
                "wt2": wt2,
                "openId": open_id,
            }
        )
        if wt2:
            self.session.cookies.set("wt2", wt2, domain=".zhipin.com")
        if mpt:
            self.session.cookies.set("mpt", mpt, domain=".zhipin.com")

    @staticmethod
    def _normalize_proxy(proxy: Optional[str]) -> str:
        """Return ``proxy`` without surrounding whitespace/backticks, or ``""``."""
        if not proxy:
            return ""
        # Strip again after removing backticks so "` http://host `" is cleaned
        # completely instead of keeping the inner whitespace.
        return proxy.strip().strip("`").strip()

    def set_proxy(self, proxy: Optional[str]) -> None:
        """Route the session through ``proxy`` or fall back to the default route.

        Args:
            proxy: Proxy URL used for both ``http`` and ``https``. ``None``,
                an empty string, or a string that is empty once surrounding
                whitespace and backticks are removed resets the session to
                the route chosen by the IP manager.
        """
        cleaned = self._normalize_proxy(proxy)
        if not cleaned:
            self.session.proxies = {}
            route_mode, route_cfg = self.ip_manager.current_route()
            if route_mode == "proxy" and route_cfg:
                self.session.proxies = route_cfg
            logger.info("BossService proxy reset to default route")
            return

        proxies = {"http": cleaned, "https": cleaned}
        self.session.proxies = proxies
        logger.info(f"BossService using user proxy: {proxies}")

    def build_request_headers(self, custom_headers: Optional[Dict] = None) -> Dict[str, str]:
        """Return the headers for one request.

        Starts from a copy of ``default_headers``, adds the current login
        values, a fresh trace id and a millisecond timestamp, then applies
        ``custom_headers`` last so the caller's values win.
        """
        headers = self.default_headers.copy()
        headers.update({
            "mpt": self.login_data.get("mpt", ""),
            "scene": "1006",
            "wt2": self.login_data.get("wt2", ""),
            "Traceid": generate_boss_trace_id(),
        })
        headers["timestamp"] = str(int(time.time() * 1000))
        if custom_headers:
            headers.update(custom_headers)
        return headers

    def _sanitize_headers(self, headers: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of ``headers`` that is safe to write to the logs.

        Non-empty values of credential-bearing headers (see
        ``_SENSITIVE_HEADERS``, matched case-insensitively) are replaced by a
        fixed marker. Empty values are kept so the log still shows that the
        credential was not set. The input mapping is never modified.
        """
        if not headers:
            return {}
        sanitized: Dict[str, Any] = {}
        for name, value in headers.items():
            is_secret = str(name).lower() in self._SENSITIVE_HEADERS
            sanitized[name] = self._REDACTED if (is_secret and value) else value
        return sanitized

    @staticmethod
    def _shell_quote(value: Any) -> str:
        """Wrap ``value`` in single quotes, escaping embedded single quotes."""
        return "'" + str(value).replace("'", "'\"'\"'") + "'"

    @classmethod
    def _build_curl_command(
        cls,
        method: str,
        url: str,
        headers: Dict[str, Any],
        params: Optional[Dict[str, Any]] = None,
        json_body: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Render an equivalent ``curl`` command line for debugging.

        ``params`` are URL-encoded and appended to ``url``; ``json_body`` is
        serialised with ``json.dumps`` and passed through ``--data``. The URL,
        every header and the body are single-quoted for the shell.
        """
        full_url = url
        if params and isinstance(params, dict):
            query_string = urlencode(params)
            if query_string:
                separator = "&" if "?" in full_url else "?"
                full_url = f"{full_url}{separator}{query_string}"

        header_parts = [
            f"-H {cls._shell_quote(f'{key}: {value}')}"
            for key, value in headers.items()
        ]

        data_part = ""
        if json_body is not None:
            body_str = json.dumps(json_body, ensure_ascii=False)
            data_part = f" --data {cls._shell_quote(body_str)}"

        return f"curl -X {method} {cls._shell_quote(full_url)} " + " ".join(header_parts) + data_part

    def _log_request_response(
        self,
        label: str,
        method: str,
        url: str,
        headers: Dict[str, Any],
        params: Optional[Dict[str, Any]] = None,
        json_body: Optional[Dict[str, Any]] = None,
        response: Optional["requests.Response"] = None,
    ) -> None:
        """Log one request (plus a curl equivalent) and, if given, its response.

        Headers are passed through ``_sanitize_headers`` before logging, and
        at most the first 1000 characters of the response body are logged.
        """
        safe_headers = self._sanitize_headers(headers)
        current_proxies = getattr(self.session, "proxies", None)
        proxy_info = current_proxies if current_proxies else {}
        login_flags = {
            "mpt_set": bool(self.login_data.get("mpt")),
            "wt2_set": bool(self.login_data.get("wt2")),
        }
        logger.info(
            f"[Boss-{label}] request method={method} url={url} headers={safe_headers} "
            f"params={params} json={json_body} proxies={proxy_info} login={login_flags}"
        )

        # The curl line is a debugging aid only; never let it break a request.
        try:
            curl_cmd = self._build_curl_command(method, url, safe_headers, params, json_body)
            logger.info(f"[Boss-{label}] curl_debug {curl_cmd}")
        except Exception as e:
            logger.debug(f"[Boss-{label}] build curl error: {e}")

        if response is not None:
            try:
                text_sample = (response.text or "")[:1000]
            except Exception:
                text_sample = "<unreadable>"
            logger.info(
                f"[Boss-{label}] response status={response.status_code} "
                f"headers={self._sanitize_headers(dict(response.headers))} "
                f"body_sample={text_sample}"
            )

    def build_request_data(self, data: Optional[Dict] = None) -> Dict[str, Any]:
        """Return the common request parameters merged with ``data``.

        The base parameters are ``appId``, ``scene`` and a millisecond
        ``timestamp``; keys in ``data`` override them.
        """
        request_data = {
            "appId": self.app_id,
            "scene": self.scene,
            "timestamp": int(time.time() * 1000),
        }
        if data:
            request_data.update(data)
        return request_data

    def _get_json(
        self,
        label: str,
        path: str,
        params: Dict[str, Any],
        referer: str,
        failure_message: str,
    ) -> Optional[Dict]:
        """GET ``serve_domain + path`` and return the decoded JSON body.

        Any exception (header building, transport error, HTTP error status,
        invalid JSON) is logged as ``"<failure_message>: <error>"`` and turned
        into a ``None`` return value.
        """
        url = f"{self.serve_domain}{path}"
        try:
            headers = self.build_request_headers({"Referer": referer})
            request_data = self.build_request_data(params)
            response = self.session.get(
                url,
                headers=headers,
                params=request_data,
                timeout=self._REQUEST_TIMEOUT,
            )
            # Log before raise_for_status so failing responses are recorded too.
            self._log_request_response(
                label,
                "GET",
                url,
                headers,
                params=request_data,
                json_body=None,
                response=response,
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"{failure_message}: {e}")
            return None

    def get_job_detail_by_id(self, job_id: str, lid: str = "", security_id: str = "") -> Optional[Dict]:
        """Fetch the detail of a job posting by its id.

        The request is sent to the batch endpoint with two sub-requests (job
        detail and the "improvement" query).

        Args:
            job_id: Job identifier sent as ``jobId``.
            lid: Value sent as ``lid``.
            security_id: Value sent as ``securityId``.

        Returns:
            The job-detail entry of ``zpData`` when the response has
            ``code == 0`` and contains it, otherwise the whole decoded
            response; ``None`` when the request fails.
        """
        logger.info(f"🔍 获取招聘详情: {job_id}")

        job_detail_path = "/wapi/zpgeek/miniapp/job/detail.json"
        url = f"{self.serve_domain}/wapi/batch/requests"

        # Batch request simulation
        sub_reqs = [
            {
                "path": job_detail_path,
                "method": "GET",
                "query": urlencode({
                    "securityId": security_id,
                    "jobId": job_id,
                    "lid": lid,
                    "source": "10",
                }),
            },
            {
                "path": "/wapi/zpgeek/miniapp/jobdetail/improvement/query.json",
                "method": "GET",
                "query": urlencode({
                    "securityId": security_id,
                    "jobId": job_id,
                    "lid": lid,
                }),
            },
        ]
        post_data = {
            "subReqs": sub_reqs,
            "appId": self.app_id,
        }
        headers = self.build_request_headers({
            "Content-Type": "application/json",
            "Referer": "https://servicewechat.com/wxa8da525af05281f3/585/page-frame.html",
        })

        try:
            response = self.session.post(
                url,
                json=post_data,
                headers=headers,
                timeout=self._REQUEST_TIMEOUT,
            )
            self._log_request_response(
                "job-detail",
                "POST",
                url,
                headers,
                params=None,
                json_body=post_data,
                response=response,
            )
            response.raise_for_status()
            data = response.json()

            # Prefer the job-detail part of the batch response when present;
            # otherwise hand the whole payload back to the caller.
            zp_data = data.get("zpData")
            if data.get("code") == 0 and zp_data and job_detail_path in zp_data:
                return zp_data[job_detail_path]
            return data
        except Exception as e:
            logger.error(f"Failed to fetch job detail: {e}")
            return None

    def get_company_detail_by_id(self, company_id: str) -> Optional[Dict]:
        """Fetch the detail of a company by its id (sent as ``brandId``).

        Returns:
            The decoded JSON response, or ``None`` when the request fails.
        """
        logger.info(f"🏢 获取公司详情: {company_id}")
        params = {
            "brandId": company_id,
            "appId": str(self.app_id),
        }
        return self._get_json(
            "company-detail",
            "/wapi/zpgeek/miniapp/brand/detail.json",
            params,
            "https://servicewechat.com/wxa8da525af05281f3/574/page-frame.html",
            "Failed to fetch company detail",
        )

    def get_company_jobs_by_id(self, company_id: str, page: int = 1) -> Optional[Dict]:
        """Fetch one page of the job list of a company.

        Args:
            company_id: Company identifier sent as ``brandId``.
            page: Page number sent as ``page``.

        Returns:
            The decoded JSON response, or ``None`` when the request fails.
        """
        logger.info(f"📄 获取公司职位列表: {company_id}, page={page}")
        params = {
            "brandId": company_id,
            "query": "",
            "page": page,
            "hasMore": "true",
            "positionLv1": 0,
            "city": "",
            "experience": "",
            "salary": "",
            "appId": str(self.app_id),
        }
        return self._get_json(
            "company-joblist",
            "/wapi/zpgeek/miniapp/brand/joblist.json",
            params,
            "https://servicewechat.com/wxa8da525af05281f3/587/page-frame.html",
            "Failed to fetch company job list",
        )

    def search_jobs(self, keyword: str, city_code: str = "101010100", page: int = 1) -> Optional[Dict]:
        """Search job postings by keyword.

        Args:
            keyword: Search text sent as ``query``.
            city_code: City code sent as ``city``.
            page: Page number sent as ``page`` (page size is fixed at 15).

        Returns:
            The decoded JSON response, or ``None`` when the request fails.
        """
        params = {
            'pageSize': 15,
            'query': keyword,
            'city': city_code,
            'page': page,
            'appId': str(self.app_id),
        }
        return self._get_json(
            "search-jobs",
            "/wapi/zpgeek/miniapp/search/joblist.json",
            params,
            "https://www.zhipin.com/web/geek/job",
            "Search failed",
        )
|