JobData/jobs_spider/zhilian/company_spider.py

856 lines
33 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import hashlib
import json
import logging
import os
import pprint
import random
import sqlite3
import ssl
import time
import uuid
from typing import Any, Dict, Optional, Callable
from urllib.parse import urlencode
from urllib.request import Request, urlopen, ProxyHandler, build_opener, HTTPSHandler

try:
    import requests
except Exception:
    requests = None
SUCCESS_LOG_PATH = os.path.join(os.path.dirname(__file__), "success.txt")
_SUCCESS_WRITTEN: set = set()
try:
import httpx
except Exception:
httpx = None
API_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:9999")
def _build_proxy() -> Optional[Dict[str, str]]:
    """Build a proxy mapping usable by requests, httpx and urllib.

    The configuration is read from the environment so that no credentials
    have to live in the source code:

    - ZP_PROXY_URL: a complete proxy URL, e.g. http://user:pass@host:port
    - otherwise the combination of ZP_PROXY_USERNAME, ZP_PROXY_PASSWORD and
      ZP_PROXY_TUNNEL (``host:port``, optionally prefixed with a scheme).

    Returns:
        Optional[Dict[str, str]]: ``{'http': url, 'https': url}``, or None
        when neither ZP_PROXY_URL nor ZP_PROXY_TUNNEL is set.
    """
    from urllib.parse import quote

    url = (os.getenv("ZP_PROXY_URL") or "").strip()
    if not url:
        tunnel = (os.getenv("ZP_PROXY_TUNNEL") or "").strip()
        if not tunnel:
            return None
        # Accept both "host:port" and "scheme://host:port".
        scheme, _, host_port = tunnel.rpartition("://")
        scheme = scheme or "http"
        username = os.getenv("ZP_PROXY_USERNAME") or ""
        password = os.getenv("ZP_PROXY_PASSWORD") or ""
        if username:
            # Percent-encode so characters such as '@' or ':' in the
            # credentials cannot corrupt the URL structure.
            auth = f"{quote(username, safe='')}:{quote(password, safe='')}@"
        else:
            auth = ""
        url = f"{scheme}://{auth}{host_port}"
    return {"http": url, "https": url}
def _get_user_agent(mobile: bool = True) -> str:
    """Return a User-Agent string, random when fake_useragent is usable.

    Args:
        mobile: When true a mobile UA is requested, otherwise a generic one.

    Returns:
        A random UA from ``fake_useragent``; if that package cannot be
        imported or fails, a fixed Android (mobile) or macOS Chrome UA.
    """
    if mobile:
        fallback = "Mozilla/5.0 (Linux; Android 10; Mobile) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Mobile Safari/537.36"
    else:
        fallback = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
    try:
        from fake_useragent import UserAgent
        wanted_platforms = ['mobile'] if mobile else None
        return UserAgent(platforms=wanted_platforms).random
    except Exception:
        # Optional dependency missing or broken: use the static UA.
        return fallback
def gen_page_request_id() -> str:
    """Return a page-request id: fixed hex prefix, epoch millis, random 0-999999."""
    millis = int(time.time() * 1000)
    nonce = random.randint(0, 999999)
    return "-".join(("cf1e3b3e655b4eb5a306110a83c77c29", str(millis), str(nonce)))
def gen_client_id() -> str:
    """Return a random UUID-v4-shaped client id (lowercase hex).

    Every ``x`` of the template becomes a random hex digit, the ``y``
    becomes one of 8/9/a/b, and the literal ``4`` and dashes are kept.
    """
    seed = int(time.time() * 1000)
    try:
        seed += int(time.perf_counter() * 1000)
    except Exception:
        pass
    pieces = []
    for ch in "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx":
        if ch not in ("x", "y"):
            pieces.append(ch)
            continue
        nibble = int((seed + random.random() * 16) % 16)
        if ch == "y":
            # Force the two top bits to binary 10.
            nibble = (nibble & 0x3) | 0x8
        pieces.append(format(nibble, "x"))
    return "".join(pieces)
def gen_v() -> float:
    """Return a random float from ``random.random()`` rounded to 8 decimals."""
    sample = random.random()
    return round(sample, 8)
def build_headers_miniapp(user_agent: str) -> Dict[str, str]:
    """Build the header set sent to the mini-program company-detail endpoint.

    Args:
        user_agent: Value for the ``User-Agent`` header.

    Returns:
        A new dict of headers. ``x-zp-rt`` (MD5 hex of a fresh uuid4 plus the
        current time) and ``x-zp-device-id`` (upper-cased uuid4) are
        regenerated on every call; all other values are constants.
    """
    request_token = hashlib.md5(f"{uuid.uuid4()}-{time.time()}".encode("utf-8")).hexdigest()
    device_id = str(uuid.uuid4()).upper()
    pairs = [
        ('User-Agent', user_agent),
        ('x-zp-page-code', "7020"),
        ('x-zp-rt', request_token),
        ('x-zp-device-id', device_id),
        ('content-type', "application/json"),
        ('x-zp-version', "0.0.0"),
        ('x-zp-business-system', "73"),
        ('x-zp-action-id', ""),
        ('xweb_xhr', "1"),
        ('x-zp-channel', "wxxiaochengxu"),
        ('x-zp-platform', "12"),
        ('sec-fetch-site', "cross-site"),
        ('sec-fetch-mode', "cors"),
        ('sec-fetch-dest', "empty"),
        ('referer', "https://servicewechat.com/wxb7718fb9257e4fd2/529/page-frame.html"),
        ('accept-language', "zh-CN,zh;q=0.9"),
    ]
    return dict(pairs)
def _get_db_path() -> str:
    """Return the default SQLite file path, located next to this module."""
    return os.path.join(os.path.dirname(__file__), "zhilian_raw.sqlite3")
def _init_db(db_path: str) -> None:
    """Create the SQLite tables and unique indexes if they do not exist yet.

    Creates ``responses`` (unique on ``sou_full_index, page``) and
    ``company_details`` (unique on ``number``). The call is idempotent.

    Args:
        db_path: Path of the SQLite database file.

    Failures are logged and swallowed (best effort); the connection is
    always closed, including when a statement fails.
    """
    statements = (
        """
        CREATE TABLE IF NOT EXISTS responses (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            sou_full_index TEXT,
            page INTEGER,
            created_at INTEGER,
            payload TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS company_details (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            number TEXT,
            created_at INTEGER,
            payload TEXT
        )
        """,
        """
        CREATE UNIQUE INDEX IF NOT EXISTS idx_responses_sou_page
        ON responses(sou_full_index, page)
        """,
        """
        CREATE UNIQUE INDEX IF NOT EXISTS idx_company_details_number
        ON company_details(number)
        """,
    )
    con = None
    try:
        con = sqlite3.connect(db_path)
        cur = con.cursor()
        for ddl in statements:
            cur.execute(ddl)
        con.commit()
    except sqlite3.Error as exc:
        logging.getLogger(__name__).warning(
            "Could not initialise SQLite schema at %s: %s", db_path, exc
        )
    finally:
        # Close on every path; the original leaked the handle on errors.
        if con is not None:
            con.close()
def _save_search_response(db_path: str, sou_full_index: str, page: int, raw_payload: str) -> None:
    """Store the raw payload of one job-search page.

    Uses ``INSERT OR IGNORE``, so a row that violates a uniqueness
    constraint on the table is skipped instead of raising.

    Args:
        db_path: Path of the SQLite database file.
        sou_full_index: Search keyword the page belongs to.
        page: Page number (coerced with ``int``).
        raw_payload: Text to store in the ``payload`` column.

    Failures are logged and swallowed (best effort); the connection is
    always closed.
    """
    con = None
    try:
        con = sqlite3.connect(db_path)
        con.execute(
            "INSERT OR IGNORE INTO responses(sou_full_index, page, created_at, payload) VALUES(?, ?, ?, ?)",
            (sou_full_index, int(page), int(time.time()), raw_payload),
        )
        con.commit()
    except (sqlite3.Error, TypeError, ValueError) as exc:
        logging.getLogger(__name__).warning(
            "Could not save search response %r page %r: %s", sou_full_index, page, exc
        )
    finally:
        if con is not None:
            con.close()
def _save_company_detail(db_path: str, number: str, raw_payload: str) -> None:
    """Store the raw payload of one company-detail response.

    Uses ``INSERT OR IGNORE``, so a row that violates a uniqueness
    constraint on the table is skipped instead of raising.

    Args:
        db_path: Path of the SQLite database file.
        number: Job number the detail was fetched for.
        raw_payload: Text to store in the ``payload`` column.

    Failures are logged and swallowed (best effort); the connection is
    always closed.
    """
    con = None
    try:
        con = sqlite3.connect(db_path)
        con.execute(
            "INSERT OR IGNORE INTO company_details(number, created_at, payload) VALUES(?, ?, ?)",
            (number, int(time.time()), raw_payload),
        )
        con.commit()
    except sqlite3.Error as exc:
        logging.getLogger(__name__).warning(
            "Could not save company detail for %r: %s", number, exc
        )
    finally:
        if con is not None:
            con.close()
def _has_company_detail(db_path: str, number: str) -> bool:
    """Return True when a company-detail row exists for job ``number``.

    Args:
        db_path: Path of the SQLite database file.
        number: Job number to look up.

    Returns:
        True if a matching row exists; False if not or if the lookup
        failed (the failure is logged). The connection is always closed.
    """
    con = None
    try:
        con = sqlite3.connect(db_path)
        row = con.execute(
            "SELECT 1 FROM company_details WHERE number=? LIMIT 1", (number,)
        ).fetchone()
        return row is not None
    except sqlite3.Error as exc:
        logging.getLogger(__name__).warning(
            "Company-detail lookup failed for %r: %s", number, exc
        )
        return False
    finally:
        if con is not None:
            con.close()
def _has_page_record(db_path: str, sou_full_index: str, page: int) -> bool:
    """Return True when the given keyword/page pair is already stored.

    Args:
        db_path: Path of the SQLite database file.
        sou_full_index: Search keyword.
        page: Page number (coerced with ``int``).

    Returns:
        True if a matching row exists; False if not or if the lookup
        failed (the failure is logged). The connection is always closed.
    """
    con = None
    try:
        con = sqlite3.connect(db_path)
        row = con.execute(
            "SELECT 1 FROM responses WHERE sou_full_index=? AND page=? LIMIT 1",
            (sou_full_index, int(page)),
        ).fetchone()
        return row is not None
    except (sqlite3.Error, TypeError, ValueError) as exc:
        logging.getLogger(__name__).warning(
            "Page lookup failed for %r page %r: %s", sou_full_index, page, exc
        )
        return False
    finally:
        if con is not None:
            con.close()
def _sleep_between_requests(min_seconds: float = 0.3, max_seconds: float = 0.8) -> None:
    """Pause for a random duration between ``min_seconds`` and ``max_seconds``.

    If drawing or sleeping the random duration raises, sleep for
    ``min_seconds`` instead.
    """
    try:
        time.sleep(random.uniform(min_seconds, max_seconds))
    except Exception:
        time.sleep(min_seconds)
def _has_keyword_record(db_path: str, sou_full_index: str) -> bool:
    """Return True when any page is already stored for the keyword.

    Args:
        db_path: Path of the SQLite database file.
        sou_full_index: Search keyword.

    Returns:
        True if at least one row exists; False if not or if the lookup
        failed (the failure is logged). The connection is always closed.
    """
    con = None
    try:
        con = sqlite3.connect(db_path)
        row = con.execute(
            "SELECT 1 FROM responses WHERE sou_full_index=? LIMIT 1", (sou_full_index,)
        ).fetchone()
        return row is not None
    except sqlite3.Error as exc:
        logging.getLogger(__name__).warning(
            "Keyword lookup failed for %r: %s", sou_full_index, exc
        )
        return False
    finally:
        if con is not None:
            con.close()
def _request_json(method: str, url: str, headers: Dict[str, str], params: Optional[Dict[str, Any]] = None,
                  json_body: Optional[Dict[str, Any]] = None, timeout: int = 30, max_retries: int = 3,
                  proxies: Optional[Dict[str, str]] = None, raw_sink: Optional[Callable[[str], None]] = None) -> Optional[Dict[str, Any]]:
    """Send an HTTP request and return the decoded JSON body, with retries.

    The transport is chosen per attempt: httpx when importable, otherwise
    requests, otherwise the standard-library urllib.

    Args:
        method: HTTP method; "GET" (case-insensitive) sends ``params`` as a
            query string, anything else is sent as a POST with ``json_body``.
        url: Target URL.
        headers: Request headers.
        params: Query parameters for GET requests.
        json_body: JSON body for non-GET requests.
        timeout: Timeout passed to the transport.
        max_retries: Number of attempts.
        proxies: Mapping with "http"/"https" proxy URLs, or None.
        raw_sink: Optional callback that receives the raw response text;
            exceptions raised by it are ignored.

    Returns:
        The parsed JSON, or None when every attempt raised.

    Environment:
        ZP_HTTP2 ("1" enables HTTP/2 in httpx, default "1") and
        ZP_DEBUG ("1" prints status codes, default "0").
    """
    for attempt in range(max_retries):
        try:
            # Random pause before every attempt, including the first one.
            time.sleep(random.uniform(0.8, 2.5))
            if httpx is not None:
                use_http2 = os.getenv("ZP_HTTP2", "1") == "1"
                debug = os.getenv("ZP_DEBUG", "0") == "1"
                kwargs: Dict[str, Any] = {"http2": use_http2, "timeout": timeout, "headers": headers, "trust_env": False}
                if proxies:
                    px = proxies.get("https") or proxies.get("http")
                    if px:
                        # NOTE(review): recent httpx releases replaced the
                        # `proxies` argument with `proxy` - confirm against
                        # the pinned httpx version.
                        kwargs["proxies"] = px
                with httpx.Client(**kwargs) as client:
                    method_u = method.upper()
                    if method_u == "GET":
                        resp = client.get(url, params=params)
                    else:
                        resp = client.post(url, json=json_body)
                        if debug:
                            print({"_request_json": {"method": "POST", "status": resp.status_code}})
                        if resp.status_code == 405:
                            # POST rejected: retry as GET with the JSON body
                            # flattened into string query parameters.
                            merged = params or {}
                            if json_body:
                                merged = {**merged, **{k: str(v) for k, v in json_body.items()}}
                            resp = client.get(url, params=merged)
                            if debug:
                                print({"_request_json": {"fallback": "GET", "status": resp.status_code}})
                    # No status check on this path: the body of a non-2xx
                    # response is also passed to the sink and parsed.
                    if raw_sink:
                        try:
                            raw_sink(resp.text)
                        except Exception:
                            pass
                    try:
                        return resp.json()
                    except ValueError:
                        return json.loads(resp.text)
            else:
                if requests:
                    resp = requests.request(
                        method.upper(), url,
                        headers=headers, params=params, json=json_body,
                        timeout=timeout, proxies=proxies
                    )
                    # Unlike the httpx path, HTTP errors raise here and
                    # therefore trigger a retry.
                    resp.raise_for_status()
                    if raw_sink:
                        try:
                            raw_sink(resp.text)
                        except Exception:
                            pass
                    return resp.json()
                # Last resort: plain urllib.
                if method.upper() == 'GET':
                    full_url = url
                    if params:
                        qs = urlencode(params)
                        full_url = f"{url}?{qs}"
                    req = Request(full_url, headers=headers, method='GET')
                else:
                    data_bytes = json.dumps(json_body or {}).encode('utf-8')
                    req = Request(url, headers=headers, data=data_bytes, method='POST')
                ctx = ssl.create_default_context()
                opener = None
                if proxies and isinstance(proxies, dict) and (proxies.get("http") or proxies.get("https")):
                    try:
                        ph = ProxyHandler(proxies)
                        opener = build_opener(ph, HTTPSHandler(context=ctx))
                    except Exception:
                        # Proxy setup failed: fall back to a direct request.
                        opener = None
                if opener:
                    with opener.open(req, timeout=timeout) as r:
                        raw = r.read()
                else:
                    with urlopen(req, context=ctx, timeout=timeout) as r:
                        raw = r.read()
                if raw_sink:
                    try:
                        raw_sink(raw.decode("utf-8"))
                    except Exception:
                        pass
                return json.loads(raw)
        except Exception:
            # Any failure (network, HTTP status, JSON decoding) is retried
            # with a linearly growing delay; the last failure yields None.
            if attempt == max_retries - 1:
                return None
            time.sleep(1.2 * (attempt + 1))
    # Reached only when max_retries is zero or negative.
    return None
def fetch_company_desc_by_job(number: str, db_path: Optional[str] = None) -> Optional[str]:
    """Fetch the company description that belongs to a job posting.

    The PC endpoint is queried first; when it yields no description the
    mini-program endpoint is tried. Every raw response body is handed to
    ``_save_company_detail`` when ``db_path`` is given.

    Args:
        number: Job number; must be a non-blank string.
        db_path: Optional SQLite path. When a detail row for ``number``
            already exists there, nothing is fetched and None is returned.

    Returns:
        The description text/HTML, or None when the input is invalid, the
        job is already stored, or neither endpoint returned a description.
    """
    if not isinstance(number, str) or not number.strip():
        return None
    if db_path and _has_company_detail(db_path, number):
        return None

    def _sink(raw: str) -> None:
        # Shared by both requests: persist the raw body when a DB is set.
        if db_path:
            _save_company_detail(db_path, number, raw)

    def _dig(payload: Any, *keys: str) -> Optional[str]:
        # Walk nested dicts defensively. A non-dict at any level (error
        # string, list, None) yields None instead of raising, so a
        # malformed PC response can no longer abort the fallback request.
        node = payload
        for key in keys:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node if isinstance(node, str) and node else None

    client_id = gen_client_id()
    url_pc = "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new"
    params_pc = {
        "number": number,
        "_v": gen_v(),
        "x-zp-page-request-id": gen_page_request_id(),
        "x-zp-client-id": client_id,
    }
    headers_pc = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "identity",
        "sec-ch-ua-platform": "macOS",
        "x-zp-business-system": "1",
        "x-zp-page-code": "4019",
        "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\"",
        "sec-ch-ua-mobile": "?0",
        "x-zp-platform": "13",
        "origin": "https://www.zhaopin.com",
        "sec-fetch-site": "same-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "referer": "https://www.zhaopin.com/",
        "accept-language": "zh-CN,zh;q=0.9",
        "priority": "u=1, i",
        # The cookie carries the same client id as the query string.
        "Cookie": f"x-zp-client-id={client_id}"
    }
    data_pc = _request_json("GET", url_pc, headers_pc, params=params_pc, proxies=_build_proxy(), raw_sink=_sink)
    desc_pc = _dig(data_pc, "data", "detailedCompany", "companyDescription")
    if desc_pc:
        return desc_pc

    # Fallback: mini-program endpoint.
    ua = _get_user_agent(True)
    url_mini = "https://cgate.zhaopin.com/positionbusiness/exposure/companyDetail"
    params_mini = {
        "number": number,
        "platform": "12",
        "version": "0.0.0",
    }
    headers_mini = build_headers_miniapp(ua)
    data_mini = _request_json("GET", url_mini, headers_mini, params=params_mini, proxies=_build_proxy(), raw_sink=_sink)
    return _dig(data_mini, "data", "companyBase", "companyDescWithHtml")
def build_headers(
    at: str,
    rt: str,
    device_id: str,
    channel: str = "miniapp",
    platform: str = "miniapp",
    version: str = "1.0.0",
    business_system: str = "zpfe-miniapp",
    page_code: Optional[str] = None,
    action_id: Optional[str] = None,
    user_agent: Optional[str] = None,
    referer: Optional[str] = None,
) -> Dict[str, str]:
    """Build the request headers for the job-search API.

    Args:
        at: Access token; sent as ``x-zp-at`` when non-empty.
        rt: Refresh/auxiliary token; sent as ``x-zp-rt`` when non-empty.
        device_id: Device identifier (``x-zp-device-id``).
        channel: Channel identifier (``x-zp-channel``).
        platform: Platform identifier (``x-zp-platform``).
        version: Version string (``x-zp-version``).
        business_system: Business-system identifier.
        page_code: Sent as ``x-zp-page-code`` when non-empty.
        action_id: Sent as ``x-zp-action-id`` whenever it is not None,
            including the empty string.
        user_agent: User-Agent; a WeChat mini-program UA is used when empty.
        referer: Referer; a servicewechat.com page is used when empty.

    Returns:
        A new header dict.
    """
    result: Dict[str, str] = {}
    result["accept"] = "*/*"
    result["content-type"] = "application/json"
    result["x-zp-version"] = version
    result["x-zp-channel"] = channel
    result["x-zp-platform"] = platform
    result["x-zp-device-id"] = device_id
    result["x-zp-business-system"] = business_system
    result["xweb_xhr"] = "1"
    result["sec-fetch-site"] = "cross-site"
    result["sec-fetch-mode"] = "cors"
    result["sec-fetch-dest"] = "empty"
    result["accept-language"] = "zh-CN,zh;q=0.9"

    # (header name, value, whether to send it)
    optional = (
        ("x-zp-at", at, bool(at)),
        ("x-zp-rt", rt, bool(rt)),
        ("x-zp-page-code", page_code, bool(page_code)),
        ("x-zp-action-id", action_id, action_id is not None),
    )
    for header_name, header_value, wanted in optional:
        if wanted:
            result[header_name] = header_value

    default_ua = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 "
        "MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF "
        "MacWechat/3.8.10(0x13080a10) XWEB/1227"
    )
    result["User-Agent"] = user_agent if user_agent else default_ua
    result["referer"] = referer if referer else "https://servicewechat.com/wxb7718fb9257e4fd2/602/page-frame.html"
    return result
def base_url_for(path: str, env: str = "prod") -> str:
    """Pick the base URL (scheme + host) for a relative API path.

    Args:
        path: Relative path; its prefix (/capi, /api, /weex) selects the host.
        env: "pre" selects the pre-release host, anything else production.

    Returns:
        The base URL; paths without a known prefix map to the cgate host.
    """
    pre = env == "pre"
    # (path prefix, host label, suffix appended in the "pre" environment)
    routes = (
        ("/capi", "capi", "pre"),
        ("/api", "m", "-pre"),
        ("/weex", "zhibo", "-pre"),
    )
    for prefix, host, pre_suffix in routes:
        if path.startswith(prefix):
            return f"https://{host}{pre_suffix if pre else ''}.zhaopin.com"
    return f"https://cgate{'-pre' if pre else ''}.zhaopin.com"
def build_payload(
    page_index: int = 1,
    page_size: int = 10,
    city_id: Optional[int] = None,
    event_scenario: Optional[str] = None,
    sou_expand: Optional[str] = None,
    sou_full_index: Optional[str] = None,
    sort_type: Optional[str] = None,
    resume_number: Optional[str] = None,
    filter_min_salary: Optional[int] = None,
) -> Dict[str, Any]:
    """Build the JSON body of a job-search request.

    ``pageIndex`` and ``pageSize`` are always present. String options are
    added only when non-empty; ``city_id`` and ``filter_min_salary`` are
    added whenever they are not None, so 0 is kept.
    """
    body: Dict[str, Any] = {"pageIndex": page_index, "pageSize": page_size}
    # (body key, value, whether to include it), in emission order.
    optional_fields = (
        ("eventScenario", event_scenario, bool(event_scenario)),
        ("filterMinSalary", filter_min_salary, filter_min_salary is not None),
        ("S_SOU_EXPAND", sou_expand, bool(sou_expand)),
        ("S_SOU_FULL_INDEX", sou_full_index, bool(sou_full_index)),
        ("S_SOU_WORK_CITY", city_id, city_id is not None),
        ("sortType", sort_type, bool(sort_type)),
        ("resumeNumber", resume_number, bool(resume_number)),
    )
    for key, value, include in optional_fields:
        if include:
            body[key] = value
    return body
def call(
    page_index: int = 1,
    page_size: int = 10,
    city_id: Optional[int] = None,
    env: str = "prod",
    timeout: float = 10.0,
    sou_full_index: Optional[str] = None,
) -> Any:
    """Run one job-search request (POST, with a GET fallback on HTTP 405).

    Headers are configured through ZP_* environment variables (ZP_AT, ZP_RT,
    ZP_DEVICE_ID, ZP_CHANNEL, ZP_PLATFORM, ZP_VERSION, ZP_BUSINESS_SYSTEM,
    ZP_PAGE_CODE, ZP_ACTION_ID, ZP_USER_AGENT, ZP_REFERER). The body is
    ZP_BODY_JSON when set ({} if it is not valid JSON), otherwise it is built
    from the arguments plus ZP_EVENT_SCENARIO, ZP_SOU_EXPAND,
    ZP_SOU_FULL_INDEX, ZP_SORT_TYPE, ZP_RESUME_NUMBER, ZP_FILTER_MIN_SALARY.

    Args:
        page_index: Page number to request.
        page_size: Page size.
        city_id: Optional city filter.
        env: Environment passed to ``base_url_for``.
        timeout: Request timeout.
        sou_full_index: Search keyword; overrides the keyword in the body.

    Returns:
        The parsed JSON, the raw text when the body is not JSON, or
        ``{"error": "httpx not available"}`` when neither httpx nor requests
        can be used. On the httpx path the raw text is also written to the
        ZP_LAST_RAW environment variable.
    """
    path = "/positionbusiness/searchrecommend/searchPositions"
    url = f"{base_url_for(path, env=env)}{path}"
    headers = build_headers(
        at=os.getenv("ZP_AT", ""),
        rt=os.getenv("ZP_RT", ""),
        device_id=os.getenv("ZP_DEVICE_ID", ""),
        channel=os.getenv("ZP_CHANNEL", "wxxiaochengxu"),
        platform=os.getenv("ZP_PLATFORM", "12"),
        version=os.getenv("ZP_VERSION", "0.0.0"),
        business_system=os.getenv("ZP_BUSINESS_SYSTEM", "73"),
        page_code=os.getenv("ZP_PAGE_CODE", "7019"),
        action_id=os.getenv("ZP_ACTION_ID", ""),
        user_agent=os.getenv("ZP_USER_AGENT"),
        referer=os.getenv("ZP_REFERER"),
    )

    body_override = os.getenv("ZP_BODY_JSON")
    if not body_override:
        body = build_payload(
            page_index=page_index,
            page_size=page_size,
            city_id=city_id,
            event_scenario=os.getenv("ZP_EVENT_SCENARIO", "wxmpZhaopinSearchV2"),
            sou_expand=os.getenv("ZP_SOU_EXPAND", "SOU_COMPANY_ID"),
            sou_full_index=sou_full_index or os.getenv("ZP_SOU_FULL_INDEX"),
            sort_type=os.getenv("ZP_SORT_TYPE", "DEFAULT"),
            resume_number=os.getenv("ZP_RESUME_NUMBER"),
            filter_min_salary=int(os.getenv("ZP_FILTER_MIN_SALARY", "1")),
        )
    else:
        try:
            body = json.loads(body_override)
        except Exception:
            body = {}
    # An explicit keyword always wins, also over ZP_BODY_JSON.
    if sou_full_index:
        body["S_SOU_FULL_INDEX"] = sou_full_index

    use_http2 = os.getenv("ZP_HTTP2", "1") == "1"
    proxies = _build_proxy()
    debug = os.getenv("ZP_DEBUG", "0") == "1"

    def _as_query(payload: Dict[str, Any]) -> Dict[str, str]:
        # Flatten the JSON body into string query parameters.
        return {key: str(value) for key, value in payload.items()}

    def _decode(response: Any) -> Any:
        # JSON when possible, otherwise the raw text.
        try:
            return response.json()
        except ValueError:
            return response.text

    if httpx is not None:
        client_opts: Dict[str, Any] = {"http2": use_http2, "timeout": timeout, "headers": headers, "trust_env": False}
        proxy_url = (proxies.get("https") or proxies.get("http")) if proxies else None
        if proxy_url:
            client_opts["proxies"] = proxy_url
        with httpx.Client(**client_opts) as client:
            resp = client.post(url, json=body)
            if debug:
                print({"method": "POST", "status": resp.status_code})
            if resp.status_code == 405:
                resp = client.get(url, params=_as_query(body))
                if debug:
                    print({"fallback": "GET", "status": resp.status_code})
            try:
                os.environ["ZP_LAST_RAW"] = resp.text
            except Exception:
                pass
            return _decode(resp)
    else:
        resp = None
        if requests is not None:
            resp = requests.post(url, json=body, headers=headers, timeout=timeout, proxies=proxies)
        if resp is not None:
            if debug:
                try:
                    print({"method": "POST", "status": resp.status_code})
                except Exception:
                    pass
            if getattr(resp, "status_code", None) == 405:
                resp = requests.get(url, params=_as_query(body), headers=headers, timeout=timeout, proxies=proxies)
                if debug:
                    try:
                        print({"fallback": "GET", "status": resp.status_code})
                    except Exception:
                        pass
            return _decode(resp)
    return {"error": "httpx not available"}
def _load_lines() -> list:
    """Load the non-empty, stripped lines of the keyword file.

    Looks for ``company.txt`` and then ``conpany.txt`` next to this module
    and returns the lines of the first one that exists and can be read.
    Returns an empty list when neither file is usable.
    """
    here = os.path.dirname(__file__)
    for filename in ("company.txt", "conpany.txt"):
        candidate = os.path.join(here, filename)
        if not os.path.exists(candidate):
            continue
        try:
            with open(candidate, "r", encoding="utf-8") as handle:
                stripped = [raw.strip() for raw in handle.readlines()]
            return [entry for entry in stripped if entry]
        except Exception:
            # Unreadable file: try the next candidate.
            continue
    return []
def _record_success(name: str) -> None:
    """Append a keyword to the success log once per process.

    Blank names and names already present in ``_SUCCESS_WRITTEN`` are
    skipped. Any error while writing is ignored.
    """
    try:
        cleaned = (name or "").strip()
        if cleaned and cleaned not in _SUCCESS_WRITTEN:
            with open(SUCCESS_LOG_PATH, "a", encoding="utf-8") as log_file:
                log_file.write(f"{cleaned}\n")
            # Remember it only after the write succeeded.
            _SUCCESS_WRITTEN.add(cleaned)
    except Exception:
        pass
def _extract_job_items_from_result(result: Any) -> list:
    """Return ``result["data"]["list"]`` when it is a list, else ``[]``.

    Any other shape (non-dict result, non-dict ``data``, non-list ``list``)
    yields an empty list.
    """
    try:
        payload = result.get("data") if isinstance(result, dict) else None
        candidate = payload.get("list") if isinstance(payload, dict) else None
        if isinstance(candidate, list):
            return candidate
        return []
    except Exception:
        return []
import requests
def _report_universal(items: list, data_type: str = "job") -> bool:
    """Placeholder for reporting items to a batch-store endpoint.

    The body is a stub: nothing is sent and both arguments are ignored.

    Args:
        items (list): Data list that would be sent.
        data_type (str): Logical data type label.

    Returns:
        bool: Always False.
    """
    return False
def main() -> None:
    """Crawl job-search result pages for every keyword and store them.

    Keywords come from ``_load_lines()``. For each keyword the pages are
    requested one after another, starting at ZP_PAGE_INDEX, until a page is
    empty or the server marks it as the last one. Each page is stored via
    ``_save_search_response`` and, for every job with a ``number``, the
    company description is fetched via ``fetch_company_desc_by_job``.
    Keywords that already have any stored page are skipped.

    When no keyword file is found a single search request is issued.

    Environment: ZP_ENV, ZP_PAGE_INDEX, ZP_PAGE_SIZE, ZP_DEMO_CITY_ID.
    """
    env = os.getenv("ZP_ENV", "prod")
    page_index = int(os.getenv("ZP_PAGE_INDEX", "1"))
    page_size = int(os.getenv("ZP_PAGE_SIZE", "15"))
    city_env = os.getenv("ZP_DEMO_CITY_ID")
    city_id = int(city_env) if city_env and city_env.isdigit() else None
    items = _load_lines()
    if items:
        # tqdm is optional; without it the loops run without progress bars.
        try:
            from tqdm import tqdm
        except Exception:
            tqdm = None
        seq = tqdm(items, desc="S_SOU_FULL_INDEX") if tqdm else items
        db_path = _get_db_path()
        _init_db(db_path)
        for x in seq:
            total_items = 0
            p = page_index
            pages_bar = tqdm(desc=f"{x}", leave=False) if tqdm else None
            # If any page is already stored for this keyword, skip the whole
            # keyword to avoid duplicate requests.
            # NOTE(review): pages_bar is not closed on this path.
            if _has_keyword_record(db_path, x):
                if pages_bar:
                    pages_bar.update(0)
                    pages_bar.set_postfix({"keyword": x, "skipped": True})
                continue
            while True:
                # Skip individual pages that are already stored.
                if _has_page_record(db_path, x, p):
                    if pages_bar:
                        pages_bar.update(1)
                        pages_bar.set_postfix({"page": p, "skipped": True})
                    _sleep_between_requests()
                    p += 1
                    continue
                result = call(
                    page_index=p,
                    page_size=page_size,
                    city_id=city_id,
                    env=env,
                    sou_full_index=x,
                )
                try:
                    # ZP_LAST_RAW is written by call() on its httpx path.
                    raw = os.getenv("ZP_LAST_RAW", "")
                    # Rebinds the name `items`; the keyword iteration is not
                    # affected because `seq` was built before the loop.
                    items = _extract_job_items_from_result(result)
                    payload_obj = {
                        "keyword": x,
                        "page": p,
                        "count": len(items),
                        "items": items,
                        "data": result.get("data") if isinstance(result, dict) else None,
                        "raw": raw or (json.dumps(result, ensure_ascii=False) if isinstance(result, dict) else str(result)),
                    }
                    _save_search_response(db_path, x, p, json.dumps(payload_obj, ensure_ascii=False))
                except Exception:
                    pass
                data = result.get("data") if isinstance(result, dict) else None
                lst = data.get("list") if isinstance(data, dict) else None
                is_end = (isinstance(data, dict) and str(data.get("isEndPage", "")).strip() in ("1", "true", "True"))
                count_val = None
                # Normalise the fields of every item.
                # NOTE(review): the page was already serialised and saved
                # above, and nothing below reads these derived fields again;
                # confirm whether they are meant to be persisted or reported.
                if lst and isinstance(lst, list):
                    for item in lst:
                        # The search keyword takes precedence over the
                        # company name returned by the API.
                        item["companyName"] = x or item.get("companyName", "")
                        item["jobName"] = item.get("jobName", "") or item.get("name", "") or item.get("position", {}).get("base", {}).get("positionName", "")
                        item["jobDescribe"] = item.get("jobSummary", "") or item.get("position", {}).get("desc", {}).get("description", "")
                        item["degreeString"] = item.get("education", "") or item.get("position", {}).get("base", {}).get("education", "")
                        # Collect tags from both tag fields; entries may be
                        # dicts or plain strings.
                        tags = []
                        for t in item.get("showSkillTags", []) or []:
                            if isinstance(t, dict):
                                v = t.get("tag") or t.get("value") or t.get("name")
                                if v:
                                    tags.append(str(v))
                            elif isinstance(t, str):
                                tags.append(t)
                        for t in item.get("skillLabel", []) or []:
                            if isinstance(t, dict):
                                v = t.get("value") or t.get("name")
                                if v:
                                    tags.append(str(v))
                            elif isinstance(t, str):
                                tags.append(t)
                        item["jobTagsForOrder"] = tags
                        # Experience / education
                        item["workYearString"] = item.get("workingExp", "") or item.get("position", {}).get("base", {}).get("positionWorkingExp", "")
                        item["jobExperience"] = item.get("jobExperience", "")
                        item["jobEducation"] = item.get("jobEducation", "")
                        # Employment type
                        item["termStr"] = item.get("workType", "") or item.get("position", {}).get("base", {}).get("workType", "")
                        # Location / area
                        addr = (item.get("workLocation", {}) or {}).get("workAddress")
                        city = item.get("workCity", "")
                        district = item.get("cityDistrict", "")
                        street = item.get("streetName", "")
                        if not addr:
                            # Fall back to the address inside the card JSON.
                            card_json = item.get("cardCustomJson")
                            try:
                                card_obj = json.loads(card_json) if isinstance(card_json, str) else {}
                            except Exception:
                                card_obj = {}
                            addr = card_obj.get("address")
                        item["location"] = addr or ""  # detailed address
                        item["jobAreaString"] = f"{city}{district}{street}".strip()
                        # Publication time
                        item["confirmDateString"] = item.get("publishTime", "") or item.get("firstPublishTime", "")
                        # Company size / ownership type
                        item["companySizeString"] = item.get("companySize", "")
                        item["companyTypeString"] = item.get("propertyName", "")
                        # Industry
                        item["major1Str"] = item.get("industryName", "")
                        item["major2Str"] = ""
                        # Links / IDs / company info
                        def _clean_url(u: Any) -> str:
                            # Strip whitespace and surrounding backticks.
                            s = str(u or "").strip()
                            if s:
                                s = s.strip().strip("`").strip()
                            return s
                        job_url = item.get("positionUrl") or item.get("positionURL") or (item.get("position", {}) or {}).get("base", {}).get("positionUrl")
                        item["jobHref"] = _clean_url(job_url)
                        item["companyHref"] = _clean_url(item.get("companyUrl"))
                        item["coId"] = item.get("companyId")
                        # companyName was overwritten above, so this is the
                        # keyword whenever the keyword is non-empty.
                        item["fullCompanyName"] = item.get("companyName", "")
                        # Salary: prefer salaryReal, then salary, then
                        # cardCustomJson.salary60.
                        raw_salary = item.get("salaryReal") or item.get("salary")
                        if not raw_salary:
                            _cj = item.get("cardCustomJson")
                            try:
                                _cj_obj = json.loads(_cj) if isinstance(_cj, str) else {}
                            except Exception:
                                _cj_obj = {}
                            raw_salary = _cj_obj.get("salary60")
                        min_val = ""
                        max_val = ""
                        if isinstance(raw_salary, str):
                            # NOTE(review): the first replace() has an empty
                            # search string and is a no-op; presumably a
                            # character was lost from the source - confirm.
                            s = raw_salary.replace("", "").replace("/月", "").replace("/天", "").replace("/年", "")
                            parts = [p for p in s.split("-") if p.strip()]
                            if len(parts) == 2:
                                try:
                                    a = int(parts[0])
                                    b = int(parts[1])
                                    min_val = str(min(a, b))
                                    max_val = str(max(a, b))
                                except Exception:
                                    # Non-numeric bounds are kept as text.
                                    min_val = parts[0].strip()
                                    max_val = parts[1].strip()
                        item["jobSalaryMin"] = min_val
                        item["jobSalaryMax"] = max_val
                        # Fetch the company description for this job.
                        num = item.get("number")
                        if isinstance(num, str) and num:
                            try:
                                desc_html = fetch_company_desc_by_job(num, db_path=db_path)
                            except Exception:
                                desc_html = None
                            if isinstance(desc_html, str) and desc_html:
                                item["companyDesc"] = desc_html
                                item["company_desc"] = desc_html
                if lst and isinstance(lst, list):
                    pass
                try:
                    count_val = data.get("count") if isinstance(data, dict) else None
                except Exception:
                    count_val = None
                cur_items = len(lst or [])
                total_items += cur_items
                if pages_bar:
                    pages_bar.update(1)
                    pages_bar.set_postfix({"page": p, "items": cur_items})
                # Record the keyword as successful on the first non-empty
                # page (total equals the current page's count).
                if cur_items > 0 and total_items == cur_items:
                    _record_success(x)
                # Stop condition: empty list or server marked the last page.
                if cur_items == 0 or is_end:
                    break
                p += 1
                _sleep_between_requests()
            if pages_bar:
                pages_bar.close()
            if tqdm:
                seq.set_postfix({"total": total_items})
    else:
        # No keyword file: issue a single request.
        result = call(page_index=page_index, page_size=page_size, city_id=city_id, env=env)
        if isinstance(result, dict):
            data = result.get("data")
        else:
            print(str(result)[:200])
if __name__ == "__main__":
    # NOTE(review): these proxy credentials are committed to source control.
    # They should be rotated and supplied through the environment or a
    # secret store instead of being hard-coded here.
    # setdefault keeps values already exported by the operator instead of
    # unconditionally overwriting them.
    os.environ.setdefault('ZP_PROXY_USERNAME', 't13319619426654')
    os.environ.setdefault('ZP_PROXY_PASSWORD', 'ln8aj9nl')
    os.environ.setdefault('ZP_PROXY_TUNNEL', 's432.kdltps.com:15818')
    main()