# Listing metadata: 234 lines, 9.1 KiB, Python.
import httpx
|
|
import time
|
|
import random
|
|
import json
|
|
import os
|
|
from typing import Dict, Any, Optional, List
|
|
from urllib.parse import quote
|
|
from loguru import logger
|
|
from app.core.algorithms.signature import SignatureGenerator
|
|
from jobs_spider.qcwy import search_company_jobs as qcwy_spider
|
|
|
|
class QcwyService:
    """HTTP client wrapper for the 51job ("Qcwy") job API at cupid.51job.com.

    Requests carry a fixed set of headers (see ``base_headers``) plus a
    per-request ``sign`` header produced by ``SignatureGenerator`` and a
    URL-quoted ``property`` header built by :meth:`build_property`.
    Company-level lookups are delegated to ``qcwy_spider``.

    Optional environment variables read at construction time:
        QCWY_ACCOUNT_ID: value for the ``account-id`` header.
        QCWY_USER_TOKEN: value for the ``user-token`` header.
    """

    # Key handed to SignatureGenerator for request signing.
    _SIGNATURE_KEY = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"

    # Header names (lower-case) whose values must never reach the logs.
    # "user-token" is the credential this service itself sends.
    _REDACTED_HEADERS = frozenset(
        {"authorization", "proxy-authorization", "cookie", "set-cookie", "user-token"}
    )

    # Values of the API envelope's "status" field that are treated as success.
    _OK_STATUSES = ("1", 1)

    def __init__(self, proxy_url: Optional[str] = None):
        """Prepare signing, default headers and the underlying HTTP client.

        Args:
            proxy_url: Optional proxy URL for all requests; falsy means a
                direct connection.
        """
        self.signature_generator = SignatureGenerator(self._SIGNATURE_KEY)
        self.base_url = "https://cupid.51job.com"
        self.api_key = "51job"

        # NOTE(review): "uuid" and "timestamp" are computed once here and then
        # reused for every request made through this instance.
        now_ms = str(int(time.time() * 1000))
        self.base_headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.10(0x13080a10) XWEB/1227",
            "Connection": "keep-alive",
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate, br",
            "Content-Type": "application/json",
            "account-id": "",
            "From-Domain": "51job_weixin_wxapp",
            "xweb_xhr": "1",
            "user-token": "",
            "uuid": now_ms + str(random.randint(10000000, 99999999)),
            "partner": "",
            "timestamp": str(int(time.time() * 1000)),
            "Sec-Fetch-Site": "cross-site",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Dest": "empty",
            "Referer": "https://servicewechat.com/wx1131e5c71e668b5d/391/page-frame.html",
            "Accept-Language": "zh-CN,zh;q=0.9",
        }

        # Environment values override the empty defaults only when non-blank.
        env_account_id = os.getenv("QCWY_ACCOUNT_ID", "").strip()
        env_user_token = os.getenv("QCWY_USER_TOKEN", "").strip()
        if env_account_id:
            self.base_headers["account-id"] = env_account_id
        if env_user_token:
            self.base_headers["user-token"] = env_user_token

        self.client = self._build_client(proxy_url)

    @staticmethod
    def _build_client(proxy_url: Optional[str]) -> "httpx.Client":
        """Create an httpx client with the settings shared by init and set_proxy."""
        client_kwargs: Dict[str, Any] = {
            "timeout": 30.0,
            "verify": True,
            "trust_env": False,
        }
        if proxy_url:
            client_kwargs["proxy"] = proxy_url
        return httpx.Client(**client_kwargs)

    def set_proxy(self, proxy_url: Optional[str]) -> None:
        """Swap the HTTP client for one using ``proxy_url`` (falsy = no proxy).

        The replacement client is created first; the previous one is closed
        afterwards on a best-effort basis.
        """
        old_client = getattr(self, "client", None)
        self.client = self._build_client(proxy_url)
        if old_client is not None:
            try:
                old_client.close()
            except Exception as exc:
                # Best effort: a failing close must not break the proxy switch.
                logger.debug(f"Qcwy failed to close previous HTTP client: {exc}")

    def close(self) -> None:
        """Close the underlying HTTP client, if one exists."""
        client = getattr(self, "client", None)
        if client is not None:
            client.close()

    def _sanitize_headers(self, headers: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of ``headers`` with sensitive values replaced by ``***``.

        Matching is case-insensitive against ``_REDACTED_HEADERS``.
        """
        return {
            name: "***" if str(name).lower() in self._REDACTED_HEADERS else value
            for name, value in headers.items()
        }

    def _log_request_response(
        self,
        label: str,
        method: str,
        url: str,
        headers: Dict[str, Any],
        params: Optional[Dict[str, Any]] = None,
        json_body: Optional[Dict[str, Any]] = None,
        response: Optional["httpx.Response"] = None,
    ) -> None:
        """Log one request and, when given, the first 1000 chars of its response.

        Request and response headers are passed through
        :meth:`_sanitize_headers` before being logged.
        """
        safe_headers = self._sanitize_headers(headers)
        logger.info(
            f"[Qcwy-{label}] request method={method} url={url} headers={safe_headers} "
            f"params={params} json={json_body}"
        )
        if response is None:
            return

        try:
            text_sample = (response.text or "")[:1000]
        except Exception:
            # Decoding the body can fail; logging must not raise because of it.
            text_sample = "<unreadable>"
        logger.info(
            f"[Qcwy-{label}] response status={response.status_code} "
            f"headers={self._sanitize_headers(dict(response.headers))} "
            f"body_sample={text_sample}"
        )

    def build_property(self, page_code: str = "home|hotjob|jobfxlist") -> str:
        """Build the URL-quoted JSON value sent in the ``property`` header.

        Args:
            page_code: Value used for both ``pageCode`` and ``shortPageCode``.

        Returns:
            Compact JSON (non-ASCII kept as-is) percent-encoded with ``quote``.
        """
        distinct_id = str(int(time.time() * 1000)) + str(random.randint(100000, 999999))
        property_data = {
            "frompageUrl": "",
            "pageUrl": "pages/index/index",
            "isLogin": "否",
            "accountid": "",
            "resumeId": "",
            "firstFrompageUrl": "",
            "distinct_id": distinct_id,
            "pageCode": page_code,
            "shortPageCode": page_code,
            "policyType": "推荐",
        }
        return quote(json.dumps(property_data, ensure_ascii=False, separators=(",", ":")))

    def _build_url_path(self, api_path: str, timestamp: int) -> str:
        """Return the path and query string that is both signed and requested.

        The separator before ``timestamp`` must be a literal ``&``; the
        previous text had it corrupted into ``×tamp``.
        """
        return f"/{api_path}?api_key={self.api_key}&timestamp={timestamp}"

    @classmethod
    def _extract_result_body(cls, response: Any) -> Dict[str, Any]:
        """Return ``resultbody`` from a successful API envelope, else ``{}``.

        Non-dict responses, non-success ``status`` values and a missing, null
        or non-dict ``resultbody`` all yield an empty dict.
        """
        if not isinstance(response, dict):
            return {}
        if response.get("status") not in cls._OK_STATUSES:
            return {}
        body = response.get("resultbody")
        return body if isinstance(body, dict) else {}

    def _make_request(
        self,
        url: str,
        data: Optional[Dict[str, Any]] = None,
        headers: Optional[Dict[str, str]] = None,
        method: str = "POST",
    ) -> Optional[Dict[str, Any]]:
        """Send a GET or JSON POST and return the decoded JSON body.

        Args:
            url: Absolute request URL.
            data: JSON body for non-GET requests; ignored for GET.
            headers: Headers to send; ``None`` means no extra headers.
            method: ``"GET"`` (case-insensitive) issues a GET; anything else
                issues a POST.

        Returns:
            The parsed JSON on HTTP 200, otherwise ``None``. Any exception
            (transport error, invalid JSON, ...) is logged and also yields
            ``None`` so callers never have to catch.
        """
        verb = method.upper()
        is_get = verb == "GET"
        try:
            local_headers: Dict[str, str] = headers or {}
            if is_get:
                response = self.client.get(url, headers=local_headers)
            else:
                response = self.client.post(url, headers=local_headers, json=data)

            self._log_request_response(
                "request",
                verb,
                url,
                local_headers,
                params=None,
                json_body=None if is_get else data,
                response=response,
            )

            if response.status_code == 200:
                return response.json()

            logger.warning(f"Request failed: {response.status_code} - {response.text}")
            return None
        except Exception as e:
            logger.error(f"Request exception: {e}")
            return None

    def get_job_detail(self, job_id: str) -> Dict[str, Any]:
        """Fetch the base detail record of one job.

        Returns:
            The ``resultbody`` dict of a successful response, else ``{}``.
        """
        timestamp = int(time.time())
        api_path = f"open/noauth/jobs/detail/base/{job_id}"
        url_path = self._build_url_path(api_path, timestamp)
        full_url = f"{self.base_url}{url_path}"

        headers = self.base_headers.copy()
        headers["sign"] = self.signature_generator.generate_signature(url_path)
        headers["property"] = self.build_property(page_code="pages/jobs/jobdetail/jobdetail")
        headers["Content-Type"] = "application/x-www-form-urlencoded"

        response = self._make_request(full_url, None, headers, method="GET")
        return self._extract_result_body(response)

    def get_company_info(self, company_id: str) -> Dict[str, Any]:
        """Return company info via ``qcwy_spider``; ``{}`` if the spider raises."""
        try:
            return qcwy_spider.get_company_info(company_id)
        except Exception as e:
            logger.error(f"Qcwy get_company_info failed: {e}")
            return {}

    def search_jobs(self, keyword: str, job_area: str = "020000", page: int = 1) -> List[Dict[str, Any]]:
        """Query the recommend endpoint and return its job items.

        This calls ``open/noauth/recommend/job-tab-dynamic-wx-mini`` with
        ``type="recommend"``; ``keyword`` is sent along, but a dedicated
        keyword-search endpoint is not implemented here.

        Args:
            keyword: Sent as ``keyword`` in the request body.
            job_area: Sent as ``jobArea``.
            page: Sent as ``pageNo``; the page size is fixed at 20.

        Returns:
            ``resultbody.jobList.items`` of a successful response, else ``[]``.
        """
        timestamp = int(time.time())
        data: Dict[str, Any] = {
            "pageNo": page,
            "pageSize": 20,
            "keyword": keyword,
            "jobArea": job_area,
            "type": "recommend",
            "isTouristMode": True,
            "specialPageCode": True,
        }

        api_path = "open/noauth/recommend/job-tab-dynamic-wx-mini"
        url_path = self._build_url_path(api_path, timestamp)
        full_url = f"{self.base_url}{url_path}"

        # The signature is computed over the body while booleans are still
        # Python bools; they are turned into "true"/"false" strings only
        # afterwards. NOTE(review): this ordering is kept from the original —
        # confirm it matches what SignatureGenerator expects.
        headers = self.base_headers.copy()
        headers["sign"] = self.signature_generator.generate_signature(url_path, data)
        headers["property"] = self.build_property()

        for key, value in data.items():
            if isinstance(value, bool):
                data[key] = "true" if value else "false"

        response = self._make_request(full_url, data, headers, method="POST")
        job_list = self._extract_result_body(response).get("jobList")
        if not isinstance(job_list, dict):
            return []
        items = job_list.get("items")
        return items if isinstance(items, list) else []

    def get_company_jobs_by_id(
        self,
        company_id: str,
        page: int = 1,
        page_size: int = 30,
        job_area: str = "",
        function: str = "",
        salary_type: str = "",
    ) -> Dict[str, Any]:
        """Return one page of a company's jobs via ``qcwy_spider``.

        Only ``company_id``, ``page`` and ``page_size`` are forwarded;
        ``job_area``, ``function`` and ``salary_type`` are accepted for
        interface compatibility but currently unused.

        Returns:
            The spider's result, or ``{}`` if it raises.
        """
        try:
            return qcwy_spider.company_jobs_by_id(
                co_id=company_id,
                page=page,
                size=page_size,
            )
        except Exception as e:
            logger.error(f"Qcwy get_company_jobs_by_id failed: {e}")
            return {}
|