856 lines
33 KiB
Python
856 lines
33 KiB
Python
import os
|
||
import time
|
||
import json
|
||
import pprint
|
||
import random
|
||
import uuid
|
||
import hashlib
|
||
from typing import Any, Dict, Optional, Callable
|
||
|
||
try:
|
||
import requests
|
||
except Exception:
|
||
requests = None
|
||
import ssl
|
||
from urllib.request import Request, urlopen, ProxyHandler, build_opener, HTTPSHandler
|
||
from urllib.parse import urlencode
|
||
import sqlite3
|
||
# File (next to this module) that _record_success appends keywords to.
SUCCESS_LOG_PATH = os.path.join(os.path.dirname(__file__), "success.txt")
# Keywords already appended to SUCCESS_LOG_PATH by this process.
_SUCCESS_WRITTEN: set = set()
# httpx is optional: when the import fails the name is bound to None and the
# request helpers fall back to other transports.
try:
    import httpx
except Exception:
    httpx = None
# Base URL taken from the API_BASE_URL environment variable.
# NOTE(review): not referenced by any function visible in this file - confirm
# whether it is still needed.
API_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:9999")
|
||
|
||
def _build_proxy() -> Optional[Dict[str, str]]:
|
||
"""构造代理配置字典(requests/httpx/urllib 兼容)。
|
||
|
||
从环境变量读取:
|
||
- ZP_PROXY_URL: 完整代理URL,如 http://user:pass@host:port
|
||
或组合:
|
||
- ZP_PROXY_USERNAME, ZP_PROXY_PASSWORD, ZP_PROXY_TUNNEL
|
||
|
||
Returns:
|
||
Optional[Dict[str, str]]: {'http': url, 'https': url} 或 None。
|
||
"""
|
||
|
||
url ="http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818"
|
||
return {"http": url, "https": url}
|
||
|
||
def _get_user_agent(mobile: bool = True) -> str:
|
||
try:
|
||
from fake_useragent import UserAgent
|
||
ua = UserAgent(platforms=['mobile'] if mobile else None)
|
||
return ua.random
|
||
except Exception:
|
||
if mobile:
|
||
return "Mozilla/5.0 (Linux; Android 10; Mobile) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Mobile Safari/537.36"
|
||
return "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
|
||
|
||
def gen_page_request_id() -> str:
    """Return a page request id: fixed prefix, millisecond timestamp, random int.

    The three parts are joined with ``-``; the random part is an integer in
    ``[0, 999999]``.
    """
    millis = int(time.time() * 1000)
    nonce = random.randint(0, 999999)
    return f"cf1e3b3e655b4eb5a306110a83c77c29-{millis}-{nonce}"
|
||
|
||
def gen_client_id() -> str:
    """Return a random UUID-v4-shaped client id (lowercase hex).

    The id follows the template ``xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx``:
    every ``x`` becomes a random hex digit and ``y`` becomes one of
    ``8``, ``9``, ``a`` or ``b``.
    """
    seed = int(time.time() * 1000)
    try:
        seed += int(time.perf_counter() * 1000)
    except Exception:
        pass

    pieces = []
    for symbol in "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx":
        if symbol not in 'xy':
            # Dashes and the literal version digit are copied through.
            pieces.append(symbol)
            continue
        nibble = int((seed + random.random() * 16) % 16)
        if symbol != 'x':
            # Variant position: force the two high bits to binary 10.
            nibble = (nibble & 0x3) | 0x8
        pieces.append(hex(nibble)[2:])
    return ''.join(pieces)
|
||
|
||
def gen_v() -> float:
    """Return a random float in ``[0, 1]`` rounded to 8 decimal places."""
    sample = random.random()
    return round(sample, 8)
|
||
|
||
def build_headers_miniapp(user_agent: str) -> Dict[str, str]:
    """Build the header set sent with mini-program style requests.

    Args:
        user_agent: Value for the ``User-Agent`` header.

    Returns:
        A header dict. ``x-zp-rt`` (MD5 hex of a fresh UUID plus the current
        time) and ``x-zp-device-id`` (upper-cased UUID4) are regenerated on
        every call; all other values are fixed.
    """
    rt_seed = f"{uuid.uuid4()}-{time.time()}"
    rt_token = hashlib.md5(rt_seed.encode("utf-8")).hexdigest()
    device_id = str(uuid.uuid4()).upper()

    headers: Dict[str, str] = {}
    headers['User-Agent'] = user_agent
    headers['x-zp-page-code'] = "7020"
    headers['x-zp-rt'] = rt_token
    headers['x-zp-device-id'] = device_id
    headers['content-type'] = "application/json"
    headers['x-zp-version'] = "0.0.0"
    headers['x-zp-business-system'] = "73"
    headers['x-zp-action-id'] = ""
    headers['xweb_xhr'] = "1"
    headers['x-zp-channel'] = "wxxiaochengxu"
    headers['x-zp-platform'] = "12"
    headers['sec-fetch-site'] = "cross-site"
    headers['sec-fetch-mode'] = "cors"
    headers['sec-fetch-dest'] = "empty"
    headers['referer'] = "https://servicewechat.com/wxb7718fb9257e4fd2/529/page-frame.html"
    headers['accept-language'] = "zh-CN,zh;q=0.9"
    return headers
|
||
|
||
def _get_db_path() -> str:
|
||
"""返回默认 SQLite 数据库文件路径。"""
|
||
base_dir = os.path.dirname(__file__)
|
||
return os.path.join(base_dir, "zhilian_raw.sqlite3")
|
||
|
||
|
||
def _init_db(db_path: str) -> None:
|
||
"""初始化 SQLite 数据库并创建表。"""
|
||
try:
|
||
con = sqlite3.connect(db_path)
|
||
cur = con.cursor()
|
||
cur.execute(
|
||
"""
|
||
CREATE TABLE IF NOT EXISTS responses (
|
||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||
sou_full_index TEXT,
|
||
page INTEGER,
|
||
created_at INTEGER,
|
||
payload TEXT
|
||
)
|
||
"""
|
||
)
|
||
cur.execute(
|
||
"""
|
||
CREATE TABLE IF NOT EXISTS company_details (
|
||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||
number TEXT,
|
||
created_at INTEGER,
|
||
payload TEXT
|
||
)
|
||
"""
|
||
)
|
||
cur.execute(
|
||
"""
|
||
CREATE UNIQUE INDEX IF NOT EXISTS idx_responses_sou_page
|
||
ON responses(sou_full_index, page)
|
||
"""
|
||
)
|
||
cur.execute(
|
||
"""
|
||
CREATE UNIQUE INDEX IF NOT EXISTS idx_company_details_number
|
||
ON company_details(number)
|
||
"""
|
||
)
|
||
con.commit()
|
||
con.close()
|
||
except Exception:
|
||
pass
|
||
|
||
|
||
def _save_search_response(db_path: str, sou_full_index: str, page: int, raw_payload: str) -> None:
|
||
"""保存职位搜索的原始响应。"""
|
||
try:
|
||
con = sqlite3.connect(db_path)
|
||
cur = con.cursor()
|
||
cur.execute(
|
||
"INSERT OR IGNORE INTO responses(sou_full_index, page, created_at, payload) VALUES(?, ?, ?, ?)",
|
||
(sou_full_index, int(page), int(time.time()), raw_payload),
|
||
)
|
||
con.commit()
|
||
con.close()
|
||
except Exception:
|
||
pass
|
||
|
||
|
||
def _save_company_detail(db_path: str, number: str, raw_payload: str) -> None:
|
||
"""保存公司详情的原始响应。"""
|
||
try:
|
||
con = sqlite3.connect(db_path)
|
||
cur = con.cursor()
|
||
cur.execute(
|
||
"INSERT OR IGNORE INTO company_details(number, created_at, payload) VALUES(?, ?, ?)",
|
||
(number, int(time.time()), raw_payload),
|
||
)
|
||
con.commit()
|
||
con.close()
|
||
except Exception:
|
||
pass
|
||
def _has_company_detail(db_path: str, number: str) -> bool:
|
||
"""检查公司详情是否已存在(按职位编号 number)。"""
|
||
|
||
try:
|
||
con = sqlite3.connect(db_path)
|
||
cur = con.cursor()
|
||
cur.execute("SELECT 1 FROM company_details WHERE number=? LIMIT 1", (number,))
|
||
row = cur.fetchone()
|
||
con.close()
|
||
return row is not None
|
||
except Exception:
|
||
return False
|
||
def _has_page_record(db_path: str, sou_full_index: str, page: int) -> bool:
|
||
"""检查指定关键词与页码是否已经存在。"""
|
||
|
||
try:
|
||
con = sqlite3.connect(db_path)
|
||
cur = con.cursor()
|
||
cur.execute("SELECT 1 FROM responses WHERE sou_full_index=? AND page=? LIMIT 1", (sou_full_index, int(page)))
|
||
row = cur.fetchone()
|
||
con.close()
|
||
return row is not None
|
||
except Exception:
|
||
return False
|
||
|
||
def _sleep_between_requests(min_seconds: float = 0.3, max_seconds: float = 0.8) -> None:
|
||
"""在请求间进行随机休眠。"""
|
||
|
||
try:
|
||
dur = random.uniform(min_seconds, max_seconds)
|
||
time.sleep(dur)
|
||
except Exception:
|
||
time.sleep(min_seconds)
|
||
|
||
def _has_keyword_record(db_path: str, sou_full_index: str) -> bool:
|
||
"""检查指定关键词是否已有任意页记录。"""
|
||
|
||
try:
|
||
con = sqlite3.connect(db_path)
|
||
cur = con.cursor()
|
||
cur.execute("SELECT 1 FROM responses WHERE sou_full_index=? LIMIT 1", (sou_full_index,))
|
||
row = cur.fetchone()
|
||
con.close()
|
||
return row is not None
|
||
except Exception:
|
||
return False
|
||
def _request_json(method: str, url: str, headers: Dict[str, str], params: Optional[Dict[str, Any]] = None,
|
||
json_body: Optional[Dict[str, Any]] = None, timeout: int = 30, max_retries: int = 3,
|
||
proxies: Optional[Dict[str, str]] = None, raw_sink: Optional[Callable[[str], None]] = None) -> Optional[Dict[str, Any]]:
|
||
for attempt in range(max_retries):
|
||
try:
|
||
time.sleep(random.uniform(0.8, 2.5))
|
||
if httpx is not None:
|
||
use_http2 = os.getenv("ZP_HTTP2", "1") == "1"
|
||
debug = os.getenv("ZP_DEBUG", "0") == "1"
|
||
kwargs: Dict[str, Any] = {"http2": use_http2, "timeout": timeout, "headers": headers, "trust_env": False}
|
||
if proxies:
|
||
px = proxies.get("https") or proxies.get("http")
|
||
if px:
|
||
kwargs["proxies"] = px
|
||
with httpx.Client(**kwargs) as client:
|
||
method_u = method.upper()
|
||
if method_u == "GET":
|
||
resp = client.get(url, params=params)
|
||
else:
|
||
resp = client.post(url, json=json_body)
|
||
if debug:
|
||
print({"_request_json": {"method": "POST", "status": resp.status_code}})
|
||
if resp.status_code == 405:
|
||
merged = params or {}
|
||
if json_body:
|
||
merged = {**merged, **{k: str(v) for k, v in json_body.items()}}
|
||
resp = client.get(url, params=merged)
|
||
if debug:
|
||
print({"_request_json": {"fallback": "GET", "status": resp.status_code}})
|
||
if raw_sink:
|
||
try:
|
||
raw_sink(resp.text)
|
||
except Exception:
|
||
pass
|
||
try:
|
||
return resp.json()
|
||
except ValueError:
|
||
return json.loads(resp.text)
|
||
else:
|
||
if requests:
|
||
resp = requests.request(
|
||
method.upper(), url,
|
||
headers=headers, params=params, json=json_body,
|
||
timeout=timeout, proxies=proxies
|
||
)
|
||
resp.raise_for_status()
|
||
if raw_sink:
|
||
try:
|
||
raw_sink(resp.text)
|
||
except Exception:
|
||
pass
|
||
return resp.json()
|
||
if method.upper() == 'GET':
|
||
full_url = url
|
||
if params:
|
||
qs = urlencode(params)
|
||
full_url = f"{url}?{qs}"
|
||
req = Request(full_url, headers=headers, method='GET')
|
||
else:
|
||
data_bytes = json.dumps(json_body or {}).encode('utf-8')
|
||
req = Request(url, headers=headers, data=data_bytes, method='POST')
|
||
ctx = ssl.create_default_context()
|
||
opener = None
|
||
if proxies and isinstance(proxies, dict) and (proxies.get("http") or proxies.get("https")):
|
||
try:
|
||
ph = ProxyHandler(proxies)
|
||
opener = build_opener(ph, HTTPSHandler(context=ctx))
|
||
except Exception:
|
||
opener = None
|
||
if opener:
|
||
with opener.open(req, timeout=timeout) as r:
|
||
raw = r.read()
|
||
else:
|
||
with urlopen(req, context=ctx, timeout=timeout) as r:
|
||
raw = r.read()
|
||
if raw_sink:
|
||
try:
|
||
raw_sink(raw.decode("utf-8"))
|
||
except Exception:
|
||
pass
|
||
return json.loads(raw)
|
||
except Exception:
|
||
if attempt == max_retries - 1:
|
||
return None
|
||
time.sleep(1.2 * (attempt + 1))
|
||
return None
|
||
|
||
def fetch_company_desc_by_job(number: str, db_path: Optional[str] = None) -> Optional[str]:
    """Fetch the company description attached to a job number.

    The desktop endpoint is queried first and its ``companyDescription`` is
    returned when it is a non-empty string; otherwise the mini-program
    endpoint is queried for ``companyDescWithHtml``. When ``db_path`` is
    given, raw responses are handed to ``_save_company_detail`` and a job
    number that already has a stored row is skipped.

    Args:
        number: Job number; must be a non-blank string.
        db_path: Optional SQLite path used for de-duplication and storage.

    Returns:
        The description text, or ``None`` when the number is invalid, already
        stored, or neither endpoint yields a description.
    """
    if not (isinstance(number, str) and number.strip()):
        return None
    if db_path and _has_company_detail(db_path, number):
        return None

    def _persist_raw(raw: str) -> None:
        # Shared sink for both endpoints.
        if db_path:
            _save_company_detail(db_path, number, raw)

    # --- Attempt 1: desktop web endpoint ---------------------------------
    client_id = gen_client_id()
    pc_endpoint = "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new"
    pc_query = {
        "number": number,
        "_v": gen_v(),
        "x-zp-page-request-id": gen_page_request_id(),
        "x-zp-client-id": client_id,
    }
    pc_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "identity",
        "sec-ch-ua-platform": "macOS",
        "x-zp-business-system": "1",
        "x-zp-page-code": "4019",
        "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\"",
        "sec-ch-ua-mobile": "?0",
        "x-zp-platform": "13",
        "origin": "https://www.zhaopin.com",
        "sec-fetch-site": "same-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "referer": "https://www.zhaopin.com/",
        "accept-language": "zh-CN,zh;q=0.9",
        "priority": "u=1, i",
        "Cookie": f"x-zp-client-id={client_id}"
    }
    pc_response = _request_json(
        "GET", pc_endpoint, pc_headers,
        params=pc_query, proxies=_build_proxy(), raw_sink=_persist_raw,
    )
    if pc_response and isinstance(pc_response, dict):
        pc_data = pc_response.get("data") or {}
        company = pc_data.get("detailedCompany") or {}
        pc_description = company.get("companyDescription")
        if isinstance(pc_description, str) and pc_description:
            return pc_description

    # --- Attempt 2: mini-program endpoint --------------------------------
    mini_endpoint = "https://cgate.zhaopin.com/positionbusiness/exposure/companyDetail"
    mini_headers_ua = _get_user_agent(True)
    mini_query = {
        "number": number,
        "platform": "12",
        "version": "0.0.0",
    }
    mini_headers = build_headers_miniapp(mini_headers_ua)
    mini_response = _request_json(
        "GET", mini_endpoint, mini_headers,
        params=mini_query, proxies=_build_proxy(), raw_sink=_persist_raw,
    )
    if mini_response and isinstance(mini_response, dict):
        mini_data = mini_response.get("data") or {}
        company_base = mini_data.get("companyBase") or {}
        mini_description = company_base.get("companyDescWithHtml")
        if isinstance(mini_description, str) and mini_description:
            return mini_description
    return None
|
||
|
||
|
||
def build_headers(
    at: str,
    rt: str,
    device_id: str,
    channel: str = "miniapp",
    platform: str = "miniapp",
    version: str = "1.0.0",
    business_system: str = "zpfe-miniapp",
    page_code: Optional[str] = None,
    action_id: Optional[str] = None,
    user_agent: Optional[str] = None,
    referer: Optional[str] = None,
) -> Dict[str, str]:
    """Build the request headers for the search API.

    Args:
        at: Access token; sent as ``x-zp-at`` when non-empty.
        rt: Refresh/auxiliary token; sent as ``x-zp-rt`` when non-empty.
        device_id: Device identifier (``x-zp-device-id``).
        channel: Channel identifier (``x-zp-channel``).
        platform: Platform identifier (``x-zp-platform``).
        version: Version string (``x-zp-version``).
        business_system: Business-system identifier.
        page_code: Page code; sent as ``x-zp-page-code`` when non-empty.
        action_id: Action id; sent as ``x-zp-action-id`` whenever it is not
            ``None`` (an empty string is still sent).
        user_agent: User-Agent; defaults to a WeChat mini-program UA.
        referer: Referer; defaults to the mini-program page-frame URL.

    Returns:
        The header dictionary.
    """
    default_user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 "
        "MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF "
        "MacWechat/3.8.10(0x13080a10) XWEB/1227"
    )
    default_referer = "https://servicewechat.com/wxb7718fb9257e4fd2/602/page-frame.html"

    headers: Dict[str, str] = {
        "accept": "*/*",
        "content-type": "application/json",
        "x-zp-version": version,
        "x-zp-channel": channel,
        "x-zp-platform": platform,
        "x-zp-device-id": device_id,
        "x-zp-business-system": business_system,
        "xweb_xhr": "1",
        "sec-fetch-site": "cross-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "accept-language": "zh-CN,zh;q=0.9",
    }

    # Optional headers, in emission order, each with its inclusion test.
    optional = (
        ("x-zp-at", at, bool(at)),
        ("x-zp-rt", rt, bool(rt)),
        ("x-zp-page-code", page_code, bool(page_code)),
        ("x-zp-action-id", action_id, action_id is not None),
    )
    for header_name, header_value, wanted in optional:
        if wanted:
            headers[header_name] = header_value

    headers["User-Agent"] = user_agent or default_user_agent
    headers["referer"] = referer if referer else default_referer
    return headers
|
||
|
||
|
||
def base_url_for(path: str, env: str = "prod") -> str:
    """Pick the base domain for a relative API path.

    Args:
        path: Relative path; its prefix selects the host.
        env: ``"pre"`` selects the pre-release host; any other value selects
            production.

    Returns:
        The base URL (scheme and host, no trailing slash).
    """
    pre_release = env == "pre"
    # (path prefix, host name, suffix appended for the pre-release host).
    # Note the capi host uses "pre" without a hyphen.
    routes = (
        ("/capi", "capi", "pre"),
        ("/api", "m", "-pre"),
        ("/weex", "zhibo", "-pre"),
    )
    host, pre_suffix = "cgate", "-pre"
    for prefix, route_host, route_suffix in routes:
        if path.startswith(prefix):
            host, pre_suffix = route_host, route_suffix
            break
    suffix = pre_suffix if pre_release else ""
    return f"https://{host}{suffix}.zhaopin.com"
|
||
|
||
|
||
def build_payload(
    page_index: int = 1,
    page_size: int = 10,
    city_id: Optional[int] = None,
    event_scenario: Optional[str] = None,
    sou_expand: Optional[str] = None,
    sou_full_index: Optional[str] = None,
    sort_type: Optional[str] = None,
    resume_number: Optional[str] = None,
    filter_min_salary: Optional[int] = None,
) -> Dict[str, Any]:
    """Build the JSON body of a job-search request.

    ``pageIndex`` and ``pageSize`` are always present. String options are
    included only when non-empty; ``filter_min_salary`` and ``city_id`` are
    included whenever they are not ``None`` (so ``0`` is kept).

    Returns:
        The request body dictionary.
    """
    # (body key, value, include?) in the order the keys are emitted.
    optional_fields = (
        ("eventScenario", event_scenario, bool(event_scenario)),
        ("filterMinSalary", filter_min_salary, filter_min_salary is not None),
        ("S_SOU_EXPAND", sou_expand, bool(sou_expand)),
        ("S_SOU_FULL_INDEX", sou_full_index, bool(sou_full_index)),
        ("S_SOU_WORK_CITY", city_id, city_id is not None),
        ("sortType", sort_type, bool(sort_type)),
        ("resumeNumber", resume_number, bool(resume_number)),
    )
    body: Dict[str, Any] = {"pageIndex": page_index, "pageSize": page_size}
    body.update({key: value for key, value, wanted in optional_fields if wanted})
    return body
|
||
|
||
|
||
def call(
    page_index: int = 1,
    page_size: int = 10,
    city_id: Optional[int] = None,
    env: str = "prod",
    timeout: float = 10.0,
    sou_full_index: Optional[str] = None,
) -> Any:
    """Run one job-search request (POST, with a GET fallback on HTTP 405).

    Headers and most body fields come from ``ZP_*`` environment variables.
    When ``ZP_BODY_JSON`` is set it replaces the generated body (an
    unparsable value yields an empty body); ``sou_full_index`` is applied on
    top of either body. On the httpx path the raw response text is also
    published in the ``ZP_LAST_RAW`` environment variable.

    Args:
        page_index: Page number to request.
        page_size: Number of results per page.
        city_id: Optional city filter.
        env: ``"prod"`` or ``"pre"``; selects the host.
        timeout: Request timeout in seconds.
        sou_full_index: Search keyword; overrides ``ZP_SOU_FULL_INDEX``.

    Returns:
        The decoded JSON response, the raw response text when it is not JSON,
        or ``{"error": "httpx not available"}`` when neither httpx nor
        requests can be used.
    """
    path = "/positionbusiness/searchrecommend/searchPositions"
    base = base_url_for(path, env=env)
    url = f"{base}{path}"

    # Drop the previous call's raw text so a failure to publish the new one
    # can never leave a stale value for the caller to read.
    os.environ.pop("ZP_LAST_RAW", None)

    headers = build_headers(
        at=os.getenv("ZP_AT", ""),
        rt=os.getenv("ZP_RT", ""),
        device_id=os.getenv("ZP_DEVICE_ID", ""),
        channel=os.getenv("ZP_CHANNEL", "wxxiaochengxu"),
        platform=os.getenv("ZP_PLATFORM", "12"),
        version=os.getenv("ZP_VERSION", "0.0.0"),
        business_system=os.getenv("ZP_BUSINESS_SYSTEM", "73"),
        page_code=os.getenv("ZP_PAGE_CODE", "7019"),
        action_id=os.getenv("ZP_ACTION_ID", ""),
        user_agent=os.getenv("ZP_USER_AGENT"),
        referer=os.getenv("ZP_REFERER"),
    )

    body_env = os.getenv("ZP_BODY_JSON")
    if body_env:
        try:
            body = json.loads(body_env)
        except Exception:
            body = {}
    else:
        body = build_payload(
            page_index=page_index,
            page_size=page_size,
            city_id=city_id,
            event_scenario=os.getenv("ZP_EVENT_SCENARIO", "wxmpZhaopinSearchV2"),
            sou_expand=os.getenv("ZP_SOU_EXPAND", "SOU_COMPANY_ID"),
            sou_full_index=sou_full_index or os.getenv("ZP_SOU_FULL_INDEX"),
            sort_type=os.getenv("ZP_SORT_TYPE", "DEFAULT"),
            resume_number=os.getenv("ZP_RESUME_NUMBER"),
            filter_min_salary=int(os.getenv("ZP_FILTER_MIN_SALARY", "1")),
        )
    if sou_full_index:
        body["S_SOU_FULL_INDEX"] = sou_full_index

    use_http2 = os.getenv("ZP_HTTP2", "1") == "1"
    proxies = _build_proxy()
    debug = os.getenv("ZP_DEBUG", "0") == "1"

    if httpx is not None:
        kwargs: Dict[str, Any] = {"http2": use_http2, "timeout": timeout, "headers": headers, "trust_env": False}
        proxy_url = (proxies.get("https") or proxies.get("http")) if proxies else None
        if proxy_url:
            # Newer httpx releases only accept ``proxy=``; older ones only
            # accept ``proxies=``. Try the current keyword first.
            try:
                client_cm = httpx.Client(proxy=proxy_url, **kwargs)
            except TypeError:
                client_cm = httpx.Client(proxies=proxy_url, **kwargs)
        else:
            client_cm = httpx.Client(**kwargs)
        with client_cm as client:
            resp = client.post(url, json=body)
            if debug:
                print({"method": "POST", "status": resp.status_code})
            if resp.status_code == 405:
                # POST rejected: retry as GET with the body as query string.
                params = {k: str(v) for k, v in body.items()}
                resp = client.get(url, params=params)
                if debug:
                    print({"fallback": "GET", "status": resp.status_code})
            try:
                os.environ["ZP_LAST_RAW"] = resp.text
            except Exception:
                pass
            try:
                return resp.json()
            except ValueError:
                return resp.text
    elif requests is not None:
        resp = requests.post(url, json=body, headers=headers, timeout=timeout, proxies=proxies)
        if debug:
            try:
                print({"method": "POST", "status": resp.status_code})
            except Exception:
                pass
        if getattr(resp, "status_code", None) == 405:
            params = {k: str(v) for k, v in body.items()}
            resp = requests.get(url, params=params, headers=headers, timeout=timeout, proxies=proxies)
            if debug:
                try:
                    print({"fallback": "GET", "status": resp.status_code})
                except Exception:
                    pass
        try:
            return resp.json()
        except ValueError:
            return resp.text
    return {"error": "httpx not available"}
|
||
|
||
|
||
def _load_lines() -> list:
|
||
base_dir = os.path.dirname(__file__)
|
||
candidates = [
|
||
os.path.join(base_dir, "company.txt"),
|
||
os.path.join(base_dir, "conpany.txt"),
|
||
]
|
||
for fp in candidates:
|
||
if os.path.exists(fp):
|
||
try:
|
||
with open(fp, "r", encoding="utf-8") as f:
|
||
lines = [ln.strip() for ln in f.readlines()]
|
||
return [ln for ln in lines if ln]
|
||
except Exception:
|
||
continue
|
||
return []
|
||
|
||
|
||
def _record_success(name: str) -> None:
    """Append a keyword to the success log, once per process.

    Blank names and names already recorded in ``_SUCCESS_WRITTEN`` are
    ignored. Any error while writing is swallowed.

    Args:
        name: Keyword to record; surrounding whitespace is stripped.
    """
    try:
        keyword = (name or "").strip()
        if keyword and keyword not in _SUCCESS_WRITTEN:
            with open(SUCCESS_LOG_PATH, "a", encoding="utf-8") as log_file:
                log_file.write(f"{keyword}\n")
            _SUCCESS_WRITTEN.add(keyword)
    except Exception:
        pass
|
||
|
||
def _extract_job_items_from_result(result: Any) -> list:
|
||
"""从调用结果中提取职位列表 items。"""
|
||
|
||
try:
|
||
if not isinstance(result, dict):
|
||
return []
|
||
data = result.get("data")
|
||
if isinstance(data, dict):
|
||
lst = data.get("list")
|
||
return lst if isinstance(lst, list) else []
|
||
return []
|
||
except Exception:
|
||
return []
|
||
|
||
import requests
|
||
|
||
def _report_universal(items: list, data_type: str = "job") -> bool:
|
||
"""Post items list to universal batch-store-async endpoint.
|
||
|
||
Args:
|
||
items (list): Data list to send.
|
||
data_type (str): Logical data type label.
|
||
|
||
Returns:
|
||
bool: True when accepted, else False.
|
||
"""
|
||
return False
|
||
|
||
|
||
|
||
|
||
def _dig(obj: Any, *keys: str, default: Any = None) -> Any:
    """Follow ``keys`` through nested dicts; return ``default`` on any miss."""
    current = obj
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current


def _card_custom(item: Dict[str, Any]) -> Dict[str, Any]:
    """Decode the item's ``cardCustomJson`` string; ``{}`` when absent or invalid."""
    raw = item.get("cardCustomJson")
    if not isinstance(raw, str):
        return {}
    try:
        parsed = json.loads(raw)
    except Exception:
        return {}
    return parsed if isinstance(parsed, dict) else {}


def _collect_tags(values: Any, keys: tuple) -> list:
    """Flatten a list of tag dicts/strings into a list of tag strings."""
    tags: list = []
    if not isinstance(values, (list, tuple)):
        return tags
    for entry in values:
        if isinstance(entry, dict):
            label = next((entry.get(k) for k in keys if entry.get(k)), None)
            if label:
                tags.append(str(label))
        elif isinstance(entry, str):
            tags.append(entry)
    return tags


def _clean_url(u: Any) -> str:
    """Return ``u`` as a string without surrounding whitespace or backticks."""
    return str(u or "").strip().strip("`").strip()


def _parse_salary_range(raw_salary: Any) -> tuple:
    """Split a ``"low-high"`` salary string into ``(min, max)`` strings."""
    if not isinstance(raw_salary, str):
        return "", ""
    text = raw_salary
    for unit in ("元", "/月", "/天", "/年"):
        text = text.replace(unit, "")
    parts = [part for part in text.split("-") if part.strip()]
    if len(parts) != 2:
        return "", ""
    try:
        low, high = int(parts[0]), int(parts[1])
    except Exception:
        # Non-numeric bounds (e.g. "8千") are passed through as text.
        return parts[0].strip(), parts[1].strip()
    return str(min(low, high)), str(max(low, high))


def _enrich_job_item(item: Dict[str, Any], keyword: str, db_path: str) -> None:
    """Add the normalised output fields to one search-result item, in place."""
    item["companyName"] = keyword or item.get("companyName", "")
    item["jobName"] = (
        item.get("jobName", "")
        or item.get("name", "")
        or _dig(item, "position", "base", "positionName", default="")
    )
    item["jobDescribe"] = item.get("jobSummary", "") or _dig(item, "position", "desc", "description", default="")
    item["degreeString"] = item.get("education", "") or _dig(item, "position", "base", "education", default="")
    item["jobTagsForOrder"] = (
        _collect_tags(item.get("showSkillTags"), ("tag", "value", "name"))
        + _collect_tags(item.get("skillLabel"), ("value", "name"))
    )

    # Experience / education
    item["workYearString"] = item.get("workingExp", "") or _dig(item, "position", "base", "positionWorkingExp", default="")
    item["jobExperience"] = item.get("jobExperience", "")
    item["jobEducation"] = item.get("jobEducation", "")

    # Employment type
    item["termStr"] = item.get("workType", "") or _dig(item, "position", "base", "workType", default="")

    # Location: the address inside cardCustomJson is the fallback.
    address = _dig(item, "workLocation", "workAddress") or _card_custom(item).get("address")
    city = item.get("workCity", "")
    district = item.get("cityDistrict", "")
    street = item.get("streetName", "")
    item["location"] = address or ""
    item["jobAreaString"] = f"{city}{district}{street}".strip()

    # Publication time
    item["confirmDateString"] = item.get("publishTime", "") or item.get("firstPublishTime", "")

    # Company size / ownership type
    item["companySizeString"] = item.get("companySize", "")
    item["companyTypeString"] = item.get("propertyName", "")

    # Industry
    item["major1Str"] = item.get("industryName", "")
    item["major2Str"] = ""

    # Links / ids / company info
    job_url = item.get("positionUrl") or item.get("positionURL") or _dig(item, "position", "base", "positionUrl")
    item["jobHref"] = _clean_url(job_url)
    item["companyHref"] = _clean_url(item.get("companyUrl"))
    item["coId"] = item.get("companyId")
    # Reads back the companyName assigned at the top of this function.
    item["fullCompanyName"] = item.get("companyName", "")

    # Salary: salaryReal first, then salary, then cardCustomJson.salary60.
    raw_salary = item.get("salaryReal") or item.get("salary") or _card_custom(item).get("salary60")
    item["jobSalaryMin"], item["jobSalaryMax"] = _parse_salary_range(raw_salary)

    number = item.get("number")
    if isinstance(number, str) and number:
        try:
            description = fetch_company_desc_by_job(number, db_path=db_path)
        except Exception:
            description = None
        if isinstance(description, str) and description:
            item["companyDesc"] = description
            item["company_desc"] = description


def main() -> None:
    """Crawl search results for every keyword in the keyword file.

    Settings come from ``ZP_ENV``, ``ZP_PAGE_INDEX``, ``ZP_PAGE_SIZE`` and
    ``ZP_DEMO_CITY_ID``. For each keyword that has no stored page yet, pages
    are requested until an empty page or the server's end-of-pages flag; each
    page is stored in SQLite and its items are enriched in place. When no
    keyword file is available a single search request is made instead.
    """
    env = os.getenv("ZP_ENV", "prod")
    page_index = int(os.getenv("ZP_PAGE_INDEX", "1"))
    page_size = int(os.getenv("ZP_PAGE_SIZE", "15"))
    city_env = os.getenv("ZP_DEMO_CITY_ID")
    city_id = int(city_env) if city_env and city_env.isdigit() else None

    keywords = _load_lines()
    if not keywords:
        result = call(page_index=page_index, page_size=page_size, city_id=city_id, env=env)
        if not isinstance(result, dict):
            print(str(result)[:200])
        return

    try:
        from tqdm import tqdm
    except Exception:
        tqdm = None
    seq = tqdm(keywords, desc="S_SOU_FULL_INDEX") if tqdm else keywords
    db_path = _get_db_path()
    _init_db(db_path)

    for keyword in seq:
        # A keyword with any stored page is skipped entirely to avoid
        # repeating requests; checked before a progress bar is opened so no
        # bar is left unclosed.
        if _has_keyword_record(db_path, keyword):
            continue

        total_items = 0
        page = page_index
        pages_bar = tqdm(desc=f"{keyword}", leave=False) if tqdm else None
        try:
            while True:
                if _has_page_record(db_path, keyword, page):
                    if pages_bar:
                        pages_bar.update(1)
                        pages_bar.set_postfix({"page": page, "skipped": True})
                    _sleep_between_requests()
                    page += 1
                    continue

                result = call(
                    page_index=page,
                    page_size=page_size,
                    city_id=city_id,
                    env=env,
                    sou_full_index=keyword,
                )
                data = result.get("data") if isinstance(result, dict) else None

                # Persist the page before the items are enriched.
                try:
                    raw = os.getenv("ZP_LAST_RAW", "")
                    page_items = _extract_job_items_from_result(result)
                    payload_obj = {
                        "keyword": keyword,
                        "page": page,
                        "count": len(page_items),
                        "items": page_items,
                        "data": data,
                        "raw": raw or (json.dumps(result, ensure_ascii=False) if isinstance(result, dict) else str(result)),
                    }
                    _save_search_response(db_path, keyword, page, json.dumps(payload_obj, ensure_ascii=False))
                except Exception:
                    pass

                lst = data.get("list") if isinstance(data, dict) else None
                if not isinstance(lst, list):
                    lst = []
                is_end = isinstance(data, dict) and str(data.get("isEndPage", "")).strip() in ("1", "true", "True")

                for item in lst:
                    if isinstance(item, dict):
                        _enrich_job_item(item, keyword, db_path)

                cur_items = len(lst)
                total_items += cur_items
                if pages_bar:
                    pages_bar.update(1)
                    pages_bar.set_postfix({"page": page, "items": cur_items})

                # Record the keyword the first time a page yields items.
                if cur_items > 0 and total_items == cur_items:
                    _record_success(keyword)
                # Stop on an empty page or when the server flags the end.
                if cur_items == 0 or is_end:
                    break
                page += 1
                _sleep_between_requests()
        finally:
            if pages_bar:
                pages_bar.close()
        if tqdm:
            seq.set_postfix({"total": total_items})
||
|
||
if __name__ == "__main__":
    # Proxy credentials are intentionally not set here: export ZP_PROXY_URL,
    # or ZP_PROXY_USERNAME / ZP_PROXY_PASSWORD / ZP_PROXY_TUNNEL, in the shell
    # instead of committing secrets to the source file.
    main()
|