282 lines
8.5 KiB
Python
282 lines
8.5 KiB
Python
import os
|
||
import json
|
||
import time
|
||
import uuid
|
||
import random
|
||
from typing import Any, Dict, Optional
|
||
import sqlite3
|
||
|
||
import requests
|
||
try:
|
||
import httpx
|
||
except Exception:
|
||
httpx = None
|
||
|
||
# Base URL taken from the API_BASE_URL environment variable, falling back to a
# local address on port 9999.
# NOTE(review): nothing in the visible code reads this constant — presumably it
# belonged to the now-disabled reporting feature; confirm before removing.
API_BASE_URL = os.getenv('API_BASE_URL', 'http://127.0.0.1:9999')
|
||
|
||
|
||
def _gen_traceid() -> str:
    """Build a lightweight trace id.

    Returns:
        The prefix ``"M-"`` followed by the first 12 hex characters of a
        freshly generated random UUID4.
    """
    return "M-" + uuid.uuid4().hex[:12]
|
||
|
||
|
||
def report_universal(items: list, data_type: str = "job") -> bool:
    """Stub for the disabled reporting feature.

    Args:
        items: Ignored.
        data_type: Ignored.

    Returns:
        Always ``False``; nothing is reported.
    """
    # Reporting is switched off, so both arguments are intentionally unused.
    return False
|
||
|
||
|
||
|
||
|
||
def build_headers(
    user_agent: Optional[str] = None,
    referer: Optional[str] = None,
    cookies: Optional[str] = None,
    extra: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """Assemble the HTTP request headers used for the search call.

    Args:
        user_agent: Overrides the built-in WeChat mini-program User-Agent
            when truthy.
        referer: Overrides the built-in ``servicewechat.com`` referer when
            truthy.
        cookies: When truthy, sent verbatim as the ``Cookie`` header.
        extra: Additional headers merged over the defaults. Entries whose
            value is falsy (``None``, ``""``) are skipped, so they never
            erase a default.

    Returns:
        A new dict of header name to value. The ``content-type`` value comes
        from the ``BOSS_CT`` environment variable, defaulting to
        ``application/x-www-form-urlencoded``.
    """
    fallback_ua = "".join(
        (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ",
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 ",
            "MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF ",
            "MacWechat/3.8.10(0x13080a10) XWEB/1227",
        )
    )
    fallback_referer = "https://servicewechat.com/wxa8da525af05281f3/586/page-frame.html"

    # Insertion order is kept identical to the historical header order.
    headers: Dict[str, str] = {
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "connection": "keep-alive",
        "sec-fetch-site": "cross-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "xweb_xhr": "1",
        # NOTE(review): "wt2" and "mpt" are hard-coded values that look like
        # session-bound tokens; a truthy entry in `extra` replaces them.
        "wt2": "Epwo8bHXTy5wLU5ETExV2Ss5OwloFG3eJ0Pfe6T3FyIdDJhEyGkcxea9wI5VSqX-tafKQcVQJTI2szwdO0xQz3A~~",
        "mpt": "21728a788201acffa22d876d1fc0e8d7",
        "x-requested-with": "XMLHttpRequest",
        "User-Agent": user_agent if user_agent else fallback_ua,
        "referer": referer if referer else fallback_referer,
        "content-type": os.getenv("BOSS_CT", "application/x-www-form-urlencoded"),
    }

    # Merge caller-supplied headers, ignoring unset (falsy) values.
    for name, value in (extra or {}).items():
        if value:
            headers[name] = value

    if cookies:
        headers["Cookie"] = cookies
    return headers
|
||
|
||
|
||
def _decode_response(resp: Any) -> Any:
    """Return the JSON-decoded body of ``resp``, or its raw text if it is not JSON."""
    try:
        return resp.json()
    except ValueError:
        # Both requests and httpx raise ValueError subclasses on invalid JSON.
        return resp.text


def call(
    query: str,
    city: str = "101020100",
    page: int = 1,
    page_size: int = 15,
    use_http2: bool = False,
    timeout: float = 10.0,
) -> Any:
    """Call the Boss Zhipin mini-program job-list search and return the result.

    Header values and a few query parameters can be overridden through
    ``BOSS_*`` environment variables (``BOSS_APP_ID``, ``BOSS_MINI_VER``,
    ``BOSS_UA_JSON``, ``BOSS_WT2``, ``BOSS_ZP_APP_ID``, ``BOSS_TRACEID``,
    ``BOSS_MPT``, ``BOSS_SCENE_HEADER``, ``BOSS_ZP_PRODUCT_ID``,
    ``BOSS_PLATFORM``, ``BOSS_VER``, ``BOSS_USER_AGENT``, ``BOSS_REFERER``,
    ``BOSS_COOKIES``, ``BOSS_ENCRYPT_EXPECT_ID``). Setting
    ``BOSS_SKIP_VERIFY=1`` disables TLS certificate verification.

    Args:
        query: Search keyword.
        city: City code sent as the ``city`` parameter.
        page: 1-based page number sent as the ``page`` parameter.
        page_size: Number of results requested per page.
        use_http2: Use ``httpx`` with HTTP/2 when ``httpx`` could be imported;
            otherwise the request silently falls back to ``requests``.
        timeout: Request timeout in seconds.

    Returns:
        The decoded JSON body when the response is valid JSON, otherwise the
        response text.

    Raises:
        Transport errors from ``requests`` / ``httpx`` are not caught here and
        propagate to the caller.
    """
    url = "https://www.zhipin.com/wapi/zpgeek/miniapp/search/joblist.json"
    app_id = os.getenv("BOSS_APP_ID", "10002")

    # Unset variables yield None; build_headers drops falsy values so the
    # built-in defaults (e.g. for "wt2" and "mpt") stay in effect.
    extra_headers = {
        "mini_ver": os.getenv("BOSS_MINI_VER", "100.0000"),
        "ua": os.getenv("BOSS_UA_JSON", '{"model":"Mac16,8","platform":"mac"}'),
        "wt2": os.getenv("BOSS_WT2"),
        "zp_app_id": os.getenv("BOSS_ZP_APP_ID", app_id),
        "traceid": os.getenv("BOSS_TRACEID") or _gen_traceid(),
        "mpt": os.getenv("BOSS_MPT"),
        "scene": os.getenv("BOSS_SCENE_HEADER", "1089"),
        "zp_product_id": os.getenv("BOSS_ZP_PRODUCT_ID", app_id),
        "platform": os.getenv("BOSS_PLATFORM", "zhipin/mac"),
        "ver": os.getenv("BOSS_VER", "100.0000"),
    }
    headers = build_headers(
        user_agent=os.getenv("BOSS_USER_AGENT"),
        referer=os.getenv("BOSS_REFERER"),
        cookies=os.getenv("BOSS_COOKIES"),
        extra=extra_headers,
    )

    params = {
        "pageSize": str(page_size),
        "query": query,
        "city": city,
        "source": "1",
        "sortType": "0",
        "isSupplySearch": "true",
        "page": str(page),
        "appId": app_id,
    }
    enc_expect = os.getenv("BOSS_ENCRYPT_EXPECT_ID")
    if enc_expect:
        params["encryptExpectId"] = enc_expect

    verify = os.getenv("BOSS_SKIP_VERIFY", "0") != "1"

    # trust_env=False on both transports: proxy settings and other
    # environment-derived configuration are deliberately ignored.
    if use_http2 and httpx is not None:
        with httpx.Client(
            http2=True,
            headers=headers,
            timeout=timeout,
            verify=verify,
            trust_env=False,
        ) as client:
            return _decode_response(client.get(url, params=params))

    # The session is a context manager so its connection pool is always
    # released, even when the request raises.
    with requests.Session() as session:
        session.trust_env = False
        resp = session.get(
            url,
            params=params,
            headers=headers,
            timeout=timeout,
            verify=verify,
        )
        return _decode_response(resp)
|
||
|
||
def _load_keywords(path: str) -> list:
    """Load keywords from a UTF-8 text file, one per line.

    Args:
        path (str): File path.

    Returns:
        list: Non-empty lines with surrounding whitespace removed, in file
        order. An empty list is returned when the file cannot be opened or
        decoded.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM,
        # which str.strip() would otherwise leave glued to the first keyword.
        with open(path, "r", encoding="utf-8-sig") as f:
            stripped = (line.strip() for line in f)
            return [line for line in stripped if line]
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path string. Loading stays
        # best-effort: the caller simply gets no keywords.
        return []
|
||
|
||
|
||
def _progress_iter(seq: list, desc: str = "", total: Optional[int] = None):
    """Yield the items of ``seq`` while drawing a one-line console progress bar.

    Before each item is yielded the bar is redrawn in place (carriage return,
    no newline). A final newline is printed once the sequence is exhausted.

    Args:
        seq (list): Items to iterate.
        desc (str): Label printed in front of the bar.
        total (Optional[int]): Denominator for the counter and percentage;
            ``len(seq)`` is used when omitted.

    Yields:
        Any: Items from ``seq``, in order.
    """
    m = len(seq) if total is None else total
    bar_len = 24
    for n, item in enumerate(seq, start=1):
        if m:
            filled = int(bar_len * n / m)
            pct = int(100 * n / m)
        else:
            # Unknown/zero total: keep the bar empty and report 100%.
            filled, pct = 0, 100
        bar = "".join(("#" * filled, "-" * (bar_len - filled)))
        print(f"\r{desc} [{bar}] {n}/{m} {pct}%", end="", flush=True)
        yield item
    # Terminate the in-place bar line.
    print("", flush=True)
|
||
|
||
|
||
def main(query: str) -> None:
    """Fetch the first result pages for one keyword and store the raw responses.

    The city code, page range (pages 1-3), page size and HTTP/2 flag are
    fixed constants inside this function. If the keyword already has at
    least one stored response the function prints a ``skip`` JSON line and
    returns without issuing any request.

    Args:
        query: Search keyword.
    """
    city = "101020100"
    first_page = 1
    page_count = 3
    page_size = 15
    use_http2 = True

    db_path = _get_db_path()
    _init_db(db_path)

    if _has_keyword(db_path, query):
        print(json.dumps({"skip": True, "keyword": query}, ensure_ascii=False))
        return

    for p in range(first_page, first_page + page_count):
        _sleep_between_requests(0.3, 0.8)
        result = call(
            query=query,
            city=city,
            page=p,
            page_size=page_size,
            use_http2=use_http2,
        )

        # Dict results are wrapped with their page number; anything else
        # (e.g. a non-JSON body) is stored as its plain string form.
        raw = (
            json.dumps({"page": p, "data": result}, ensure_ascii=False)
            if isinstance(result, dict)
            else str(result)
        )
        print(raw)

        # NOTE(review): the source's indentation was lost; this long pause is
        # assumed to run after every page rather than only for non-dict
        # results — confirm against the original file.
        time.sleep(random.uniform(10, 20))

        try:
            _save_raw_response(db_path, query, p, raw)
        except Exception as e:
            print(f"Error saving raw response for {query} page {p}: {e}")
|
||
|
||
|
||
|
||
def _get_db_path() -> str:
    """Return the default SQLite database file path.

    Returns:
        The path of ``boss_raw.sqlite3`` located in the same directory as
        this source file.
    """
    return os.path.join(os.path.dirname(__file__), "boss_raw.sqlite3")
|
||
|
||
|
||
def _init_db(db_path: str) -> None:
    """Create the SQLite database and its ``responses`` table if missing.

    Initialisation is best-effort: SQLite errors (unwritable path, locked or
    corrupt file, ...) are suppressed so the caller can carry on.

    Args:
        db_path: Path of the SQLite database file.
    """
    try:
        con = sqlite3.connect(db_path)
    except sqlite3.Error:
        # Could not open/create the database file; nothing to clean up.
        return
    try:
        # `with con` commits on success and rolls back on failure.
        with con:
            con.execute(
                """
                CREATE TABLE IF NOT EXISTS responses (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    keyword TEXT NOT NULL,
                    page INTEGER NOT NULL,
                    created_at INTEGER NOT NULL,
                    payload TEXT NOT NULL
                )
                """
            )
    except sqlite3.Error:
        # Deliberately ignored: table creation stays best-effort.
        pass
    finally:
        # Always release the connection, including when execute() fails.
        con.close()
|
||
|
||
|
||
def _save_raw_response(db_path: str, keyword: str, page: int, raw_payload: str) -> None:
    """Insert one raw response row into the ``responses`` table.

    The row stores the keyword, the page number, the current Unix time in
    whole seconds and the payload text.

    Args:
        db_path: Path of the SQLite database file.
        keyword: Search keyword the response belongs to.
        page: Page number; coerced with ``int()``.
        raw_payload: Response body to store.

    Raises:
        sqlite3.Error: If the database cannot be opened or the insert fails.
            Failures are no longer swallowed here so that the caller's own
            error handling can report the lost row.
        ValueError: If ``page`` cannot be converted to ``int``.
    """
    con = sqlite3.connect(db_path)
    try:
        # `with con` commits on success and rolls back if the INSERT fails.
        with con:
            con.execute(
                "INSERT INTO responses(keyword, page, created_at, payload) VALUES(?, ?, ?, ?)",
                (keyword, int(page), int(time.time()), raw_payload),
            )
    finally:
        # Always release the connection, including on failure.
        con.close()
|
||
def _sleep_between_requests(min_seconds: float = 0.5, max_seconds: float = 1.5) -> None:
    """Pause for a random duration between requests to lower the risk of
    tripping the remote side's risk control.

    Args:
        min_seconds (float): Lower bound of the pause, in seconds.
        max_seconds (float): Upper bound of the pause, in seconds.

    Returns:
        None
    """
    try:
        time.sleep(random.uniform(min_seconds, max_seconds))
    except Exception:
        # If drawing or sleeping the random duration fails, fall back to the
        # minimum pause.
        time.sleep(min_seconds)
|
||
|
||
|
||
def _has_keyword(db_path: str, keyword: str) -> bool:
    """Tell whether the given keyword already has a row in the database.

    Args:
        db_path (str): SQLite database path.
        keyword (str): Keyword string.

    Returns:
        bool: ``True`` if at least one ``responses`` row has this keyword;
        ``False`` otherwise, including when the database cannot be opened or
        queried (for example because the table does not exist yet).
    """
    try:
        con = sqlite3.connect(db_path)
    except sqlite3.Error:
        return False
    try:
        row = con.execute(
            "SELECT 1 FROM responses WHERE keyword=? LIMIT 1", (keyword,)
        ).fetchone()
    except sqlite3.Error:
        # A failed lookup is treated as "not seen yet".
        return False
    finally:
        # Always release the connection, including when the query fails.
        con.close()
    return row is not None
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: read one keyword per line from company.txt (resolved
    # against the current working directory) and process each in turn, echoing
    # the keyword before its search runs.
    keywords = _load_keywords("company.txt")
    progress = _progress_iter(keywords, desc="Keywords", total=len(keywords))
    for keyword in progress:
        print(keyword)
        main(keyword)