JobData/jobs_spider/boss/company_spider.py

282 lines
8.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import json
import time
import uuid
import random
from typing import Any, Dict, Optional
import sqlite3
import requests
try:
import httpx
except Exception:
httpx = None
# Base URL of an API service, overridable through the API_BASE_URL environment
# variable (defaults to a local endpoint on port 9999).
# NOTE(review): no other code visible in this file references this constant.
API_BASE_URL = os.getenv('API_BASE_URL', 'http://127.0.0.1:9999')
def _gen_traceid() -> str:
    """Generate a simple trace id.

    Returns:
        str: ``"M-"`` followed by the first 12 hex characters of a random
        UUID4.
    """
    random_hex = uuid.uuid4().hex
    return "M-" + random_hex[:12]
def report_universal(items: list, data_type: str = "job") -> bool:
    """Reporting stub: the upload feature is disabled.

    Args:
        items (list): Records that would have been reported; ignored.
        data_type (str): Record category label; ignored.

    Returns:
        bool: Always ``False``.
    """
    reported = False  # nothing is ever sent
    return reported
def build_headers(
    user_agent: Optional[str] = None,
    referer: Optional[str] = None,
    cookies: Optional[str] = None,
    extra: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """Build the request headers, including the signature headers.

    Args:
        user_agent (Optional[str]): Value for ``User-Agent``; a built-in
            default string is used when falsy.
        referer (Optional[str]): Value for ``referer``; a built-in default URL
            is used when falsy.
        cookies (Optional[str]): Raw cookie string; sent as ``Cookie`` only
            when truthy.
        extra (Optional[Dict[str, str]]): Additional headers. Entries with a
            falsy value are skipped, so they never overwrite a default.

    Returns:
        Dict[str, str]: Header mapping. ``content-type`` comes from the
        ``BOSS_CT`` environment variable when it is set.
    """
    default_user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 "
        "MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF "
        "MacWechat/3.8.10(0x13080a10) XWEB/1227"
    )
    default_referer = "https://servicewechat.com/wxa8da525af05281f3/586/page-frame.html"

    if not user_agent:
        user_agent = default_user_agent
    if not referer:
        referer = default_referer

    # Key order is kept as-is; dicts preserve insertion order.
    headers: Dict[str, str] = {
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "connection": "keep-alive",
        "sec-fetch-site": "cross-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "xweb_xhr": "1",
        # NOTE(review): the hard-coded "wt2" and "mpt" values look like
        # account/session tokens committed to source -- confirm, and consider
        # supplying them only through `extra`.
        'wt2': "Epwo8bHXTy5wLU5ETExV2Ss5OwloFG3eJ0Pfe6T3FyIdDJhEyGkcxea9wI5VSqX-tafKQcVQJTI2szwdO0xQz3A~~",
        "mpt": "21728a788201acffa22d876d1fc0e8d7",
        "x-requested-with": "XMLHttpRequest",
        "User-Agent": user_agent,
        "referer": referer,
        "content-type": os.getenv("BOSS_CT", "application/x-www-form-urlencoded"),
    }

    # Only truthy extras are applied; None/"" leave the defaults untouched.
    for name, value in (extra or {}).items():
        if value:
            headers[name] = value

    if cookies:
        headers["Cookie"] = cookies
    return headers
def _json_or_text(resp: Any) -> Any:
    """Return the decoded JSON body of *resp*, or its raw text if not JSON."""
    try:
        return resp.json()
    except ValueError:
        return resp.text


def call(
    query: str,
    city: str = "101020100",
    page: int = 1,
    page_size: int = 15,
    use_http2: bool = False,
    timeout: float = 10.0,
) -> Any:
    """Call the Boss job-list search endpoint and return the result.

    Request headers and some parameters are taken from ``BOSS_*`` environment
    variables: ``BOSS_APP_ID``, ``BOSS_MINI_VER``, ``BOSS_UA_JSON``,
    ``BOSS_WT2``, ``BOSS_ZP_APP_ID``, ``BOSS_TRACEID``, ``BOSS_MPT``,
    ``BOSS_SCENE_HEADER``, ``BOSS_ZP_PRODUCT_ID``, ``BOSS_PLATFORM``,
    ``BOSS_VER``, ``BOSS_USER_AGENT``, ``BOSS_REFERER``, ``BOSS_COOKIES``,
    ``BOSS_ENCRYPT_EXPECT_ID`` and ``BOSS_SKIP_VERIFY``.

    Args:
        query (str): Search keyword.
        city (str): City code sent as the ``city`` parameter.
        page (int): Page number to request.
        page_size (int): Number of results per page.
        use_http2 (bool): Use ``httpx`` with HTTP/2 when ``httpx`` could be
            imported; otherwise the request falls back to ``requests``.
        timeout (float): Request timeout in seconds.

    Returns:
        Any: The decoded JSON body, or the response text when the body is not
        valid JSON.

    Raises:
        Network/transport errors from ``httpx``/``requests`` are not caught
        and propagate to the caller.
    """
    url = "https://www.zhipin.com/wapi/zpgeek/miniapp/search/joblist.json"
    app_id = os.getenv("BOSS_APP_ID", "10002")

    # Unset variables yield None here; build_headers() skips falsy extras.
    extra_headers = {
        "mini_ver": os.getenv("BOSS_MINI_VER", "100.0000"),
        "ua": os.getenv("BOSS_UA_JSON", '{"model":"Mac16,8","platform":"mac"}'),
        "wt2": os.getenv("BOSS_WT2"),
        "zp_app_id": os.getenv("BOSS_ZP_APP_ID", app_id),
        "traceid": os.getenv("BOSS_TRACEID") or _gen_traceid(),
        "mpt": os.getenv("BOSS_MPT"),
        "scene": os.getenv("BOSS_SCENE_HEADER", "1089"),
        "zp_product_id": os.getenv("BOSS_ZP_PRODUCT_ID", app_id),
        "platform": os.getenv("BOSS_PLATFORM", "zhipin/mac"),
        "ver": os.getenv("BOSS_VER", "100.0000"),
    }
    headers = build_headers(
        user_agent=os.getenv("BOSS_USER_AGENT"),
        referer=os.getenv("BOSS_REFERER"),
        cookies=os.getenv("BOSS_COOKIES"),
        extra=extra_headers,
    )

    params = {
        "pageSize": str(page_size),
        "query": query,
        "city": city,
        "source": "1",
        "sortType": "0",
        "isSupplySearch": "true",
        "page": str(page),
        "appId": app_id,
    }
    enc_expect = os.getenv("BOSS_ENCRYPT_EXPECT_ID")
    if enc_expect:
        params["encryptExpectId"] = enc_expect

    # TLS verification is on unless BOSS_SKIP_VERIFY is exactly "1".
    verify = os.getenv("BOSS_SKIP_VERIFY", "0") != "1"

    if use_http2 and httpx is not None:
        with httpx.Client(
            http2=True,
            headers=headers,
            timeout=timeout,
            verify=verify,
            trust_env=False,
        ) as client:
            return _json_or_text(client.get(url, params=params))

    # The session is used as a context manager so its pooled connections are
    # released after every call instead of leaking.
    with requests.Session() as session:
        session.trust_env = False  # ignore proxy settings from the environment
        resp = session.get(url, params=params, headers=headers, timeout=timeout, verify=verify)
        return _json_or_text(resp)
def _load_keywords(path: str) -> list:
    """Load keywords from a UTF-8 text file, one per line.

    A leading UTF-8 byte-order mark is ignored, so it does not end up glued
    to the first keyword.

    Args:
        path (str): File path.

    Returns:
        list: Non-empty trimmed lines, in file order. An empty list when the
        file cannot be opened or decoded; the reason is printed to stderr.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 unchanged and drops a leading BOM,
        # which str.strip() would not remove.
        with open(path, "r", encoding="utf-8-sig") as f:
            trimmed = (line.strip() for line in f)
            return [line for line in trimmed if line]
    except (OSError, ValueError) as exc:  # UnicodeDecodeError is a ValueError
        import sys

        print(f"Could not load keywords from {path!r}: {exc}", file=sys.stderr)
        return []
def _progress_iter(seq: list, desc: str = "", total: Optional[int] = None):
    """Yield the items of ``seq`` while drawing a one-line console progress bar.

    The bar is redrawn (using a carriage return) before each item is yielded,
    and a newline is printed once the sequence is exhausted.

    Args:
        seq (list): Items to iterate.
        desc (str): Label printed in front of the bar.
        total (Optional[int]): Expected item count used for the percentage;
            ``len(seq)`` is used when omitted.

    Yields:
        Any: Items from seq, in order.
    """
    width = 24
    expected = len(seq) if total is None else total
    for done, element in enumerate(seq, start=1):
        if expected:
            ticks = int(width * done / expected)
            percent = int(100 * done / expected)
        else:
            # No usable total: show an empty bar and report 100%.
            ticks, percent = 0, 100
        gauge = "#" * ticks + "-" * (width - ticks)
        print(f"\r{desc} [{gauge}] {done}/{expected} {percent}%", end="", flush=True)
        yield element
    print("", flush=True)
def main(query: str) -> None:
    """Fetch the first result pages for one keyword and store the raw responses.

    If the database already contains a row for ``query``, a
    ``{"skip": true, "keyword": ...}`` JSON line is printed and nothing is
    requested. Otherwise pages 1 to 3 are requested with fixed settings
    (city code ``101020100``, 15 results per page, HTTP/2 preferred); each
    response is printed and handed to ``_save_raw_response``.

    Args:
        query (str): Search keyword.
    """
    city = "101020100"
    first_page = 1
    page_count = 3
    page_size = 15
    use_http2 = True

    db_path = _get_db_path()
    _init_db(db_path)

    if _has_keyword(db_path, query):
        print(json.dumps({"skip": True, "keyword": query}, ensure_ascii=False))
        return

    for page_no in range(first_page, first_page + page_count):
        _sleep_between_requests(0.3, 0.8)
        result = call(query=query, city=city, page=page_no, page_size=page_size, use_http2=use_http2)

        # JSON results are wrapped with their page number; anything else
        # (the raw response text) is stored as-is.
        if isinstance(result, dict):
            raw = json.dumps({"page": page_no, "data": result}, ensure_ascii=False)
        else:
            raw = str(result)
        print(raw)

        # NOTE(review): the original indentation of this pause was lost; it is
        # assumed to run after every page rather than only for non-JSON
        # responses -- confirm against the upstream file.
        time.sleep(random.uniform(10, 20))

        try:
            _save_raw_response(db_path, query, page_no, raw)
        except Exception as e:
            print(f"Error saving raw response for {query} page {page_no}: {e}")
def _get_db_path() -> str:
    """Return the default SQLite database file path.

    Returns:
        str: ``boss_raw.sqlite3`` located in the directory of this module.
    """
    return os.path.join(os.path.dirname(__file__), "boss_raw.sqlite3")
def _init_db(db_path: str) -> None:
    """Initialise the SQLite database and create the ``responses`` table.

    The call is idempotent (``CREATE TABLE IF NOT EXISTS``) and best-effort:
    a SQLite failure is reported on stderr instead of being raised, so the
    caller keeps running.

    Args:
        db_path (str): Path of the SQLite database file.

    Returns:
        None
    """
    schema = """
        CREATE TABLE IF NOT EXISTS responses (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            keyword TEXT NOT NULL,
            page INTEGER NOT NULL,
            created_at INTEGER NOT NULL,
            payload TEXT NOT NULL
        )
    """
    try:
        con = sqlite3.connect(db_path)
        try:
            con.execute(schema)
            con.commit()
        finally:
            # Close even when execute/commit fails so the handle is not leaked.
            con.close()
    except sqlite3.Error as exc:
        import sys

        print(f"Could not initialise database {db_path!r}: {exc}", file=sys.stderr)
def _save_raw_response(db_path: str, keyword: str, page: int, raw_payload: str) -> None:
    """Save one raw response into the ``responses`` table.

    Failures are no longer swallowed: they propagate so that the caller's
    own error handling can report which keyword/page was lost.

    Args:
        db_path (str): Path of the SQLite database file.
        keyword (str): Search keyword the response belongs to.
        page (int): Page number of the response; converted with ``int()``.
        raw_payload (str): Response body to store.

    Returns:
        None

    Raises:
        sqlite3.Error: If the database cannot be opened or the insert fails
            (for example when the ``responses`` table does not exist).
        ValueError: If ``page`` cannot be converted to an integer.
    """
    # created_at is the current Unix time in whole seconds.
    row = (keyword, int(page), int(time.time()), raw_payload)
    con = sqlite3.connect(db_path)
    try:
        con.execute(
            "INSERT INTO responses(keyword, page, created_at, payload) VALUES(?, ?, ?, ?)",
            row,
        )
        con.commit()
    finally:
        # Close even when the insert fails so the handle is not leaked.
        con.close()
def _sleep_between_requests(min_seconds: float = 0.5, max_seconds: float = 1.5) -> None:
    """Sleep for a random time between requests to lower the risk of being flagged.

    Args:
        min_seconds (float): Minimum sleep time in seconds.
        max_seconds (float): Maximum sleep time in seconds.

    Returns:
        None
    """
    try:
        duration = random.uniform(min_seconds, max_seconds)
    except TypeError:
        # Bounds that cannot be combined arithmetically: use the minimum.
        duration = min_seconds
    # time.sleep() raises ValueError for negative values, so a negative draw
    # is treated as "no pause".
    time.sleep(max(0.0, duration))
def _has_keyword(db_path: str, keyword: str) -> bool:
    """Tell whether the given keyword already has a row in the database.

    Args:
        db_path (str): Path of the SQLite database file.
        keyword (str): Keyword to look up.

    Returns:
        bool: True when at least one ``responses`` row has this keyword;
        False when there is none or when the lookup fails with a SQLite
        error (for example because the table is missing).
    """
    try:
        con = sqlite3.connect(db_path)
        try:
            row = con.execute(
                "SELECT 1 FROM responses WHERE keyword=? LIMIT 1", (keyword,)
            ).fetchone()
        finally:
            # Close even when the query fails so the handle is not leaked.
            con.close()
    except sqlite3.Error:
        return False
    return row is not None
if __name__ == "__main__":
    # Script entry point: read keywords from "company.txt" (resolved against
    # the current working directory) and process them one by one.
    keywords = _load_keywords("company.txt")
    progress = _progress_iter(keywords, desc="Keywords", total=len(keywords))
    for keyword in progress:
        print(keyword)
        main(keyword)