# 2026-01-24 17:07:34 +08:00
#
# 328 lines
# 12 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import hashlib
import hmac
import json
import execjs
import re
import time
import uuid
import requests
from urllib.parse import unquote, quote
from typing import Optional, Dict
import os
class SignGenerator:
    """Build the HMAC-SHA256 request signatures and the ``acw_sc__v2`` cookie.

    Every signature is the hex HMAC-SHA256 digest of a request string, keyed
    with ``self.secret_key``.
    """

    # Signing key; per the original author's note it was taken from the
    # site's JS code.
    _DEFAULT_SECRET_KEY = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"

    def __init__(self):
        self.secret_key = self._DEFAULT_SECRET_KEY
        self.secret_key_bytes = self.secret_key.encode('utf-8')

    @staticmethod
    def _serialize_data(data):
        """Return the request body as the text that is appended to the URL.

        ``None`` becomes ``""``, a dict becomes compact JSON (no spaces,
        non-ASCII kept as-is); anything else is returned unchanged.
        """
        if data is None:
            return ""
        if isinstance(data, dict):
            return json.dumps(data, ensure_ascii=False, separators=(',', ':'))
        return data

    def hmac_sha256(self, message, key):
        """Return the hex HMAC-SHA256 digest of *message* under *key*.

        Both arguments may be ``str`` (encoded as UTF-8) or ``bytes``.
        """
        key_bytes = key.encode('utf-8') if isinstance(key, str) else key
        message_bytes = message.encode('utf-8') if isinstance(message, str) else message
        return hmac.new(key_bytes, message_bytes, hashlib.sha256).hexdigest()

    def generate_signature(self, t):
        """Sign a request description.

        Per the original note this mirrors JS function ``A``:
        ``signature = hmacSHA256(url + (data || ""), secret_key)``.

        Args:
            t: Mapping with an optional ``"url"`` (default ``""``) and an
                optional ``"data"`` entry. A missing or ``None`` ``data``
                counts as the empty string; a dict is serialised to compact
                JSON first.

        Returns:
            The hex digest string.
        """
        # Delegate so that both entry points serialise ``data`` identically;
        # previously ``data=None`` or ``data={}`` raised TypeError here.
        return self.generate_signature_from_components(t.get("url", ""), t.get("data"))

    def generate_signature_from_components(self, url, data=None):
        """Sign ``url + data`` directly (simpler interface).

        Args:
            url: Request path including the query string.
            data: Optional body; ``None`` means no body, a dict is serialised
                to compact JSON, a string is used verbatim.

        Returns:
            The hex digest string.
        """
        message = url + self._serialize_data(data)
        return self.hmac_sha256(message, self.secret_key)

    def generate_acw_sc__v2(self, arg1):
        """Compute the ``acw_sc__v2`` cookie value for *arg1*.

        Reads ``04.js`` from the directory containing this file, compiles it
        with ``execjs`` and calls its function ``l`` with *arg1*.

        Returns:
            The value returned by the JS function, or ``None`` if it is falsy.
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        js_file_path = os.path.join(current_dir, '04.js')
        with open(js_file_path, 'r', encoding='utf-8') as f:
            js = f.read()
        return execjs.compile(js).call('l', arg1) or None

    def generate_company_detail(self, cid: str) -> dict:
        """Sign a company-info request for the encrypted company id *cid*.

        Returns:
            ``{"signature": <hex digest>, "timestamp": <int seconds>}``; the
            same timestamp must be sent in the request that uses the signature
            because it is part of the signed string.
        """
        timestamp = int(time.time())
        # String to be signed: request path plus query string.
        message = f"/open/noauth/company-info/pc-info?api_key=51job&timestamp={timestamp}&encryCompanyId={cid}"
        # Reuse the instance key instead of a second hard-coded copy.
        signature = self.hmac_sha256(message, self.secret_key)
        return {"signature": signature, "timestamp": timestamp}
def _build_search_query(params: Dict) -> str:
    """Render *params* as the query string that is signed.

    Only the ``keyword`` value is percent-encoded; other values are emitted
    verbatim and falsy values are emitted as ``name=``.
    """
    parts = []
    for name, value in params.items():
        if not value:
            parts.append(f'{name}=')
        elif name == 'keyword':
            # The keyword is URL-encoded for signing (original note: this
            # matches browser behaviour).
            parts.append(f'{name}={quote(str(value))}')
        else:
            parts.append(f'{name}={str(value)}')
    return '&'.join(parts)


def _find_company(payload: Dict, keyword: str) -> Optional[Dict]:
    """Return the first search hit whose full company name equals *keyword*."""
    if payload.get('status') != '1' or 'resultbody' not in payload:
        return None
    resultbody = payload['resultbody']
    if 'job' not in resultbody or 'items' not in resultbody['job']:
        return None
    wanted = keyword.strip()
    for item in resultbody['job']['items']:
        # ``or ''`` guards against JSON nulls, which would otherwise make
        # ``.strip()`` raise and abort the whole scan.
        full_company_name = (item.get('fullCompanyName') or '').strip()
        if full_company_name == wanted:
            return {
                'fullCompanyName': full_company_name,
                'companyName': (item.get('companyName') or '').strip(),
                'companyHref': (item.get('companyHref') or '').strip(),
            }
    return None


def search_company(keyword: str, job_area: str = "000000") -> Optional[Dict]:
    """Search for a company by keyword and return the exact-name match.

    Sends a signed GET to ``https://we.51job.com/api/job/search-pc``. If the
    response body contains an ``arg1`` challenge, an ``acw_sc__v2`` cookie is
    generated and the request is repeated twice: once with the generated
    cookies, then again with those cookies plus any the server set.

    Args:
        keyword: Search keyword (company name). A hit is accepted only when
            its ``fullCompanyName`` equals this value after stripping
            surrounding whitespace.
        job_area: Job-area code; the default ``"000000"`` means nationwide.

    Returns:
        A dict with ``fullCompanyName``, ``companyName`` and ``companyHref``
        for the first matching item, or ``None`` when nothing matches, the
        status code is not 200, the body is not JSON, or the request fails
        (failures are printed, not raised).
    """
    signer = SignGenerator()
    timestamp = str(int(time.time()))
    params = {
        'api_key': '51job',
        'timestamp': timestamp,
        'keyword': keyword,
        'searchType': '2',  # 2 = search for companies
        'function': '',
        'industry': '',
        'jobArea': job_area,
        'jobArea2': '',
        'landmark': '',
        'metro': '',
        'salary': '',
        'workYear': '',
        'degree': '',
        'companyType': '',
        'companySize': '',
        'jobType': '',
        'issueDate': '',
        'sortType': '0',
        'pageNum': '1',
        'requestId': '',
        'pageSize': '20',
        'source': '1',
        'accountId': '',
        'pageCode': 'sou|sou|soulb',
        'scene': '7'
    }
    search_url = 'https://we.51job.com/api/job/search-pc'
    # The signature covers the path plus the query string.
    full_url = f"/api/job/search-pc?{_build_search_query(params)}"
    sign = signer.generate_signature_from_components(full_url)
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'From-Domain': '51job_web',
        'Pragma': 'no-cache',
        'Referer': f'https://we.51job.com/pc/search?keyword={quote(keyword)}&searchType=2&sortType=0&metro=',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'account-id': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3D' + quote(
            keyword) + '%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D',
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sign': sign,
        'user-token': '',
        'uuid': uuid.uuid4().hex,
    }
    try:
        # The context manager closes the session's pooled connections.
        with requests.Session() as session:

            def fetch(cookies=None):
                # NOTE(review): verify=False disables TLS certificate checks;
                # kept as in the original, confirm it is really required.
                return session.get(
                    search_url,
                    params=params,
                    headers=headers,
                    cookies=cookies,
                    verify=False,
                    timeout=30
                )

            # The first response may carry an arg1 challenge instead of data.
            response = fetch()
            challenge = re.search(r"arg1='(.*?)';", response.text, re.S) if 'arg1' in response.text else None
            if challenge:
                acw_sc__v2 = signer.generate_acw_sc__v2(challenge.group(1))
                if acw_sc__v2:
                    cookies = {
                        'guid': uuid.uuid4().hex,
                        'acw_sc__v2': acw_sc__v2
                    }
                    # Second request: send the generated cookies and collect
                    # whatever cookies the server sets in return.
                    cookies.update(fetch(cookies).cookies.get_dict())
                    # Third request: use the complete cookie set.
                    response = fetch(cookies)

            if response.status_code != 200:
                return None
            try:
                data = response.json()
            except ValueError:
                # ValueError covers both json's and simplejson's decode error.
                print(f"[错误] 响应不是有效的JSON: {response.text[:200]}")
                return None
            return _find_company(data, keyword)
    except Exception as e:
        # Best-effort lookup: report the failure and signal "not found".
        print(f"[错误] 请求失败: {e}")
        import traceback
        print(traceback.format_exc())
        return None
def parse_json_company_desc(uri: str) -> dict:
    """Fetch a company's description and address from the company-info JSON API.

    The encrypted company id is the last path segment of *uri* with a trailing
    ``.html`` and a leading ``co`` removed, e.g.
    ``https://jobs.51job.com/all/coUT9QPQdhBzEGY1A1VjQ.html`` gives
    ``UT9QPQdhBzEGY1A1VjQ``. A signed GET is then sent to
    ``https://cupid.51job.com/open/noauth/company-info/pc-info``.

    Args:
        uri: Company page URL.

    Returns:
        On success a dict with ``company_desc``, ``company_location`` and
        ``encryCompanyId``. If the HTTP status is not OK, a dict whose
        ``company_desc`` and ``company_location`` are both ``"请求失败"``; on
        any exception both are ``"解析失败"`` (the error is printed).
    """
    # Drop any query string or fragment, then take the last path segment.
    page_name = uri.split("#", 1)[0].split("?", 1)[0].split("/")[-1]
    # Strip only the suffix and prefix: str.replace would also delete any
    # ".html" or "co" occurring inside the encrypted id and corrupt it.
    if page_name.endswith(".html"):
        page_name = page_name[:-len(".html")]
    cid = page_name[len("co"):] if page_name.startswith("co") else page_name

    signer = SignGenerator()
    generate_company_detail_info = signer.generate_company_detail(cid)
    headers = {
        'Host': 'cupid.51job.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'sign': generate_company_detail_info["signature"],
        'uuid': '1e6151f7bc3ce8d7e526c88d7d6592cd',
        'From-Domain': '51job_web',
        'account-id': '',
        'user-token': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2Fall%2Fco4194496.html%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountId%22%3A%22%22%2C%22shortPageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%2C%22pageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%7D',
        'Origin': 'https://jobs.51job.com',
        'Connection': 'keep-alive',
        'Referer': 'https://jobs.51job.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'TE': 'trailers',
    }
    try:
        # The timestamp in the URL must be the one that was signed.
        desc_url = f"https://cupid.51job.com/open/noauth/company-info/pc-info?api_key=51job&timestamp={generate_company_detail_info['timestamp']}&encryCompanyId={cid}"
        # NOTE(review): verify=False disables TLS certificate checks; kept as
        # in the original. The timeout prevents hanging on a stalled server.
        res = requests.get(url=desc_url, headers=headers, verify=False, timeout=30)
        if not res.ok:
            return {"company_desc": "请求失败", "company_location": "请求失败"}
        coinfo = res.json()["resultbody"]["coinfo"]
        return {"company_desc": coinfo["coinfo"], "company_location": coinfo["caddr"],"encryCompanyId":coinfo["encryCompanyId"]}
    except Exception as e:
        # Best-effort: report the failure and return placeholder values.
        print(f"解析HTML失败: {e}")
        return {"company_desc": "解析失败", "company_location": "解析失败"}
# Usage example
if __name__ == "__main__":
    # Demo 1: look a company up by its exact full name and print the hit.
    keyword = "华为技术有限公司"
    result = search_company(keyword)
    if not result:
        print(f"未找到匹配的公司: {keyword}")
    else:
        print("找到匹配的公司:")
        print(f" 全称: {result['fullCompanyName']}")
        print(f" 简称: {result['companyName']}")
        print(f" 链接: {result['companyHref']}")
    # Demo 2: fetch description and address for a known company page URL.
    print(parse_json_company_desc("https://jobs.51job.com/all/coA2RXNgBnATgPaQJn.html"))