328 lines
12 KiB
Python
328 lines
12 KiB
Python
import hashlib
|
||
import hmac
|
||
import json
|
||
import execjs
|
||
import re
|
||
import time
|
||
import uuid
|
||
import requests
|
||
from urllib.parse import unquote, quote
|
||
from typing import Optional, Dict
|
||
import os
|
||
|
||
|
||
class SignGenerator:
    """Build the HMAC-SHA256 request signatures and the ``acw_sc__v2`` cookie.

    Every signature is keyed with ``secret_key`` and computed over a
    request path (including its query string), optionally followed by the
    request body.
    """

    def __init__(self):
        # Signing key (taken from the site's JS code).
        self.secret_key = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"
        self.secret_key_bytes = self.secret_key.encode('utf-8')

    def hmac_sha256(self, message, key):
        """Return the hex HMAC-SHA256 digest of ``message`` under ``key``.

        Args:
            message: Text or bytes to sign; ``str`` is encoded as UTF-8.
            key: Signing key as text or bytes; ``str`` is encoded as UTF-8.

        Returns:
            The digest as a lowercase hexadecimal string.
        """
        key_bytes = key.encode('utf-8') if isinstance(key, str) else key
        message_bytes = message.encode('utf-8') if isinstance(message, str) else message
        return hmac.new(key_bytes, message_bytes, hashlib.sha256).hexdigest()

    def generate_signature(self, t):
        """Sign a request description (counterpart of function ``A`` in the JS).

        JS logic: ``signature = hmacSHA256(url + (data || ""), secret_key)``.

        Args:
            t: Mapping with a ``"url"`` entry and an optional ``"data"``
                entry (a string, or a dict that is serialized to compact
                JSON). A missing or ``None`` ``"data"`` signs the URL alone.

        Returns:
            The hex signature string.
        """
        # Delegating keeps both entry points in agreement and makes a
        # ``None`` body behave like the JS ``data || ""`` instead of raising.
        return self.generate_signature_from_components(t.get("url", ""), t.get("data"))

    def generate_signature_from_components(self, url, data=None):
        """Sign ``url`` followed by ``data`` (simpler interface).

        Args:
            url: Request path including the query string.
            data: Request body. ``None`` signs the URL alone; a dict is
                serialized to compact JSON (no spaces, non-ASCII characters
                kept as-is); any other value is concatenated unchanged.

        Returns:
            The hex signature string.
        """
        if data is None:
            data = ""
        elif isinstance(data, dict):
            data = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
        return self.hmac_sha256(url + data, self.secret_key)

    def generate_acw_sc__v2(self, arg1):
        """Compute the ``acw_sc__v2`` cookie value for a challenge ``arg1``.

        Loads ``04.js`` from the directory containing this file and calls
        its function ``l`` with ``arg1`` through ``execjs``.

        Returns:
            The value produced by the script, or ``None`` if it is falsy.
        """
        # Resolve the script relative to this file, not the working directory.
        current_dir = os.path.dirname(os.path.abspath(__file__))
        js_file_path = os.path.join(current_dir, '04.js')
        with open(js_file_path, 'r', encoding='utf-8') as f:
            js = f.read()
        acw_sc__v2 = execjs.compile(js).call('l', arg1)
        return acw_sc__v2 or None

    def generate_company_detail(self, cid: str) -> dict:
        """Sign a company-info request for the encrypted company id ``cid``.

        Returns:
            ``{"signature": <hex digest>, "timestamp": <int seconds>}``. The
            timestamp is the one embedded in the signed string, so the
            caller must send the same value in the request URL.
        """
        timestamp = int(time.time())
        # String to sign. The separator before ``timestamp`` must be a
        # literal ``&``; otherwise the signed text has no timestamp parameter.
        message = (
            "/open/noauth/company-info/pc-info"
            f"?api_key=51job&timestamp={timestamp}&encryCompanyId={cid}"
        )
        signature = self.hmac_sha256(message, self.secret_key)
        return {"signature": signature, "timestamp": timestamp}
|
||
|
||
|
||
def _build_search_sign_path(params: Dict[str, str]) -> str:
    """Return the ``/api/job/search-pc?...`` string that gets signed.

    Parameters are joined in dict order. Only ``keyword`` is percent-encoded
    (to match the browser); other values are inserted verbatim and empty
    values are emitted as ``key=``.
    """
    parts = []
    for key, value in params.items():
        if not value:
            parts.append(f'{key}=')
        elif key == 'keyword':
            parts.append(f'{key}={quote(str(value))}')
        else:
            parts.append(f'{key}={value}')
    return '/api/job/search-pc?' + '&'.join(parts)


def _find_exact_company(items, keyword: str) -> Optional[Dict]:
    """Return the first item whose ``fullCompanyName`` equals ``keyword``.

    Names are compared after stripping surrounding whitespace. Missing or
    ``None`` fields are treated as empty strings so one incomplete item
    cannot abort the scan.
    """
    wanted = keyword.strip()
    for item in items:
        full_company_name = (item.get('fullCompanyName') or '').strip()
        if full_company_name == wanted:
            return {
                'fullCompanyName': full_company_name,
                'companyName': (item.get('companyName') or '').strip(),
                'companyHref': (item.get('companyHref') or '').strip(),
            }
    return None


def search_company(keyword: str, job_area: str = "000000") -> Optional[Dict]:
    """Search 51job for a company whose full name exactly matches ``keyword``.

    Args:
        keyword: Search keyword (the company name).
        job_area: Work-area code; the default ``"000000"`` means nationwide.

    Returns:
        A dict with ``fullCompanyName``, ``companyName`` and ``companyHref``
        for the first exact match, or ``None`` when nothing matches, the
        response is not usable, or the request fails (errors are printed).
    """
    api_url = 'https://we.51job.com/api/job/search-pc'
    signer = SignGenerator()

    # Request parameters; their order is also the order used for signing.
    params = {
        'api_key': '51job',
        'timestamp': str(int(time.time())),
        'keyword': keyword,
        'searchType': '2',  # 2 = search companies
        'function': '',
        'industry': '',
        'jobArea': job_area,
        'jobArea2': '',
        'landmark': '',
        'metro': '',
        'salary': '',
        'workYear': '',
        'degree': '',
        'companyType': '',
        'companySize': '',
        'jobType': '',
        'issueDate': '',
        'sortType': '0',
        'pageNum': '1',
        'requestId': '',
        'pageSize': '20',
        'source': '1',
        'accountId': '',
        'pageCode': 'sou|sou|soulb',
        'scene': '7'
    }

    sign = signer.generate_signature_from_components(_build_search_sign_path(params))

    encoded_keyword = quote(keyword)
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'From-Domain': '51job_web',
        'Pragma': 'no-cache',
        'Referer': f'https://we.51job.com/pc/search?keyword={encoded_keyword}&searchType=2&sortType=0&metro=',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'account-id': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3D' + encoded_keyword + '%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D',
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sign': sign,
        'user-token': '',
        'uuid': uuid.uuid4().hex,
    }

    # NOTE(review): verify=False turns off TLS certificate verification, so
    # the connection can be intercepted. Kept to preserve existing behavior;
    # confirm whether it is still required.
    request_kwargs = {
        'params': params,
        'headers': headers,
        'verify': False,
        'timeout': 30,
    }

    try:
        # The context manager releases the session's pooled connections.
        with requests.Session() as session:
            # First request; it may return a challenge page containing arg1.
            response = session.get(api_url, **request_kwargs)

            if 'arg1' in response.text:
                arg1_match = re.search(r"arg1='(.*?)';", response.text, re.S)
                acw_sc__v2 = (
                    signer.generate_acw_sc__v2(arg1_match.group(1))
                    if arg1_match else None
                )
                if acw_sc__v2:
                    cookies = {
                        'guid': uuid.uuid4().hex,
                        'acw_sc__v2': acw_sc__v2
                    }
                    # Second request carries the computed cookie; merge
                    # whatever cookies the server sets in reply.
                    challenge_reply = session.get(api_url, cookies=cookies, **request_kwargs)
                    cookies.update(challenge_reply.cookies.get_dict())
                    # Third request uses the complete cookie set.
                    response = session.get(api_url, cookies=cookies, **request_kwargs)

            if response.status_code != 200:
                return None

            try:
                data = response.json()
            except ValueError:
                # ValueError covers every JSON decode error that
                # Response.json() can raise across requests versions.
                print(f"[错误] 响应不是有效的JSON: {response.text[:200]}")
                return None

        if data.get('status') != '1':
            return None
        job_section = (data.get('resultbody') or {}).get('job') or {}
        return _find_exact_company(job_section.get('items') or [], keyword)

    except Exception as e:
        # Top-level boundary: report the failure and signal "not found".
        print(f"[错误] 请求失败: {e}")
        import traceback
        print(traceback.format_exc())
        return None
|
||
|
||
|
||
def parse_json_company_desc(uri: str) -> dict:
    """Fetch a company's description and address from the company-info API.

    The encrypted company id is taken from the last path segment of
    ``uri`` (``.html`` removed, a leading ``co`` stripped), e.g.
    ``https://jobs.51job.com/all/coUT9QPQdhBzEGY1A1VjQ.html``.

    Args:
        uri: Company page URL.

    Returns:
        On success, a dict with ``company_desc``, ``company_location`` and
        ``encryCompanyId``. If the HTTP status indicates an error, both
        ``company_desc`` and ``company_location`` are ``"请求失败"``; if any
        exception occurs they are ``"解析失败"``.
    """
    page_name = uri.split("/")[-1].replace(".html", "")
    # Strip only the leading "co"; the id itself may contain "co".
    cid = page_name[2:] if page_name.startswith("co") else page_name

    # Build the signed path once and reuse it for the request URL, so the
    # signature always covers exactly what is sent.
    timestamp = int(time.time())
    sign_path = (
        "/open/noauth/company-info/pc-info"
        f"?api_key=51job&timestamp={timestamp}&encryCompanyId={cid}"
    )
    signature = SignGenerator().generate_signature_from_components(sign_path)

    headers = {
        'Host': 'cupid.51job.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'sign': signature,
        'uuid': '1e6151f7bc3ce8d7e526c88d7d6592cd',
        'From-Domain': '51job_web',
        'account-id': '',
        'user-token': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2Fall%2Fco4194496.html%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountId%22%3A%22%22%2C%22shortPageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%2C%22pageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%7D',
        'Origin': 'https://jobs.51job.com',
        'Connection': 'keep-alive',
        'Referer': 'https://jobs.51job.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'TE': 'trailers',
    }

    try:
        # NOTE(review): verify=False turns off TLS certificate verification;
        # kept to preserve existing behavior. The timeout prevents an
        # unresponsive server from blocking the caller indefinitely.
        res = requests.get(
            url=f"https://cupid.51job.com{sign_path}",
            headers=headers,
            verify=False,
            timeout=30,
        )
        if not res.ok:
            return {"company_desc": "请求失败", "company_location": "请求失败"}

        coinfo = res.json()["resultbody"]["coinfo"]
        return {
            "company_desc": coinfo["coinfo"],
            "company_location": coinfo["caddr"],
            "encryCompanyId": coinfo["encryCompanyId"],
        }

    except Exception as e:
        # Covers network errors, invalid JSON and missing keys alike.
        print(f"解析HTML失败: {e}")
        return {"company_desc": "解析失败", "company_location": "解析失败"}
|
||
|
||
|
||
# Usage example
if __name__ == "__main__":
    # Exercise the company search.
    keyword = "华为技术有限公司"
    result = search_company(keyword)

    if not result:
        print(f"未找到匹配的公司: {keyword}")
    else:
        print("找到匹配的公司:")
        # Label / result-key pairs, printed in a fixed order.
        for label, field in (
            ("全称", 'fullCompanyName'),
            ("简称", 'companyName'),
            ("链接", 'companyHref'),
        ):
            print(f" {label}: {result[field]}")

    # Exercise the company-detail lookup with a fixed sample page.
    sample_page = "https://jobs.51job.com/all/coA2RXNgBnATgPaQJn.html"
    print(parse_json_company_desc(sample_page))
|