job51 (spiderJobs/platforms/job51/): - client.py: HTTPClient+Job51Sign from crawler_core - api.py: ApiResult→Result, self._http→self.http_client, _request() POST overrides - main.py: BaseFetcher/BaseSearcher from crawler_core - sign.py: backward-compatible stub re-exporting crawler_core.qcwy.sign.Job51Sign zhilian (spiderJobs/platforms/zhilian/): - client.py: HTTPClient+ZhilianSign from crawler_core - api.py: add _parse_zhilian_response (HTTP 200=success), add _parse()/_request() to all classes (GET fetchers + POST searcher overrides) - main.py: BaseFetcher/BaseSearcher from crawler_core - sign.py: backward-compatible stub re-exporting crawler_core.zhilian.sign.ZhilianSign tests: 34 new mock tests (17 job51 + 17 zhilian) Full regression: 98 passed (job51:17 + zhilian:17 + boss:22 + crawler_core:41 + 1)
117 lines
3.3 KiB
Python
117 lines
3.3 KiB
Python
"""
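Entry point for the 51Job (前程无忧) mini-program crawler.

What it does:
    1. Fetch keywords from the backend
       (priority: resume from checkpoint > retry failed > fresh keyword).
    2. Crawl job listings page by page via SearchRecommendJobs.
    3. Upload data and report progress after every page.
    4. Support resuming from the checkpointed page number.
    5. Optional: fetch company details while searching jobs.

Run:
    python -m spiderJobs.platforms.job51.main

Environment variables:
    API_BASE_URL        Backend address (default http://124.222.106.226:9999)
    MAX_PAGES           Maximum pages per keyword (default 3)
    SLEEP_MIN_SECONDS   Minimum delay in seconds (default 10)
    SLEEP_MAX_SECONDS   Maximum delay in seconds (default 20)
    INLINE_COMPANY      Fetch company details inline (default 1, set 0 to disable)
    PROXY_TUNNEL        Tunnel proxy address; the proxy is used only when set
    PROXY_SCHEME        Proxy URL scheme (default http)
    PROXY_USERNAME      Proxy username (used only together with PROXY_PASSWORD)
    PROXY_PASSWORD      Proxy password (used only together with PROXY_USERNAME)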
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
import sys
|
||
from typing import Optional
|
||
|
||
# Put the directory three levels above this file at the front of sys.path
# (unless it is already listed) so the absolute imports below resolve when
# the module is launched directly.
_project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir)
)
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)
|
||
|
||
from crawler_core.base import BaseFetcher, BaseSearcher
|
||
from spiderJobs.platforms.job51.api import GetCompanyInfo, SearchRecommendJobs
|
||
from spiderJobs.platforms.job51.client import Job51Client, create_client
|
||
from spiderJobs.runner.loop import run_crawl_loop
|
||
|
||
|
||
# Maps a city name (as it appears in a keyword's "city" field) to the
# six-digit area code passed to the 51job search API as ``job_area``.
CITY_CODE_MAP = {
    "全国": "000000",  # nationwide
    "北京": "010000",  # Beijing
    "上海": "020000",  # Shanghai
    "广州": "030200",  # Guangzhou
    "深圳": "040000",  # Shenzhen
    "杭州": "080200",  # Hangzhou
    "成都": "090200",  # Chengdu
    "南京": "070200",  # Nanjing
    "武汉": "180200",  # Wuhan
    "西安": "200200",  # Xi'an
    "长沙": "190200",  # Changsha
    "重庆": "060000",  # Chongqing
    "苏州": "070300",  # Suzhou
    "天津": "050000",  # Tianjin
    "厦门": "110300",  # Xiamen
    "郑州": "170200",  # Zhengzhou
    "合肥": "150200",  # Hefei
    "济南": "120200",  # Jinan
    "青岛": "120300",  # Qingdao
    "大连": "230300",  # Dalian
    "东莞": "030800",  # Dongguan
    "佛山": "030600",  # Foshan
    "珠海": "030500",  # Zhuhai
    "无锡": "070400",  # Wuxi
    "宁波": "080300",  # Ningbo
}
|
||
|
||
|
||
def create_searcher(keyword: dict, http_client: Job51Client) -> BaseSearcher:
    """Build the 51job searcher for one keyword record.

    Args:
        keyword: Keyword record; only its optional ``"city"`` entry is read
            here.
        http_client: Client handed to the searcher as ``client``.

    Returns:
        A ``SearchRecommendJobs`` instance configured with the area code of
        the keyword's city.
    """
    # NOTE(review): the keyword text itself is not forwarded to the searcher
    # here — confirm it is supplied elsewhere (e.g. by the crawl loop).
    city_name = keyword.get("city", "")
    # A city missing from CITY_CODE_MAP (including the empty default) falls
    # back to "020000", the code the map lists for Shanghai.
    area_code = CITY_CODE_MAP.get(city_name, "020000")
    return SearchRecommendJobs(job_area=area_code, client=http_client)
|
||
|
||
|
||
def extract_company_id(job: dict) -> Optional[str]:
    """Return the company ID (``coId``) of a 51job job record as a string.

    Args:
        job: Job record as a dict.

    Returns:
        ``str(job["coId"])`` when the value is present and truthy; ``None``
        when the key is missing or its value is falsy (``None``, ``""``,
        ``0``).
    """
    raw_id = job.get("coId")
    if not raw_id:
        return None
    return str(raw_id)
|
||
|
||
|
||
def create_company_fetcher(company_id: str, http_client: Job51Client) -> BaseFetcher:
    """Build the 51job company-detail fetcher for one company.

    Args:
        company_id: Company ID passed to the fetcher as ``company_id``.
        http_client: Client handed to the fetcher as ``client``.

    Returns:
        A ``GetCompanyInfo`` instance for the given company.
    """
    fetcher = GetCompanyInfo(client=http_client, company_id=company_id)
    return fetcher
|
||
|
||
|
||
def main() -> None:
    """Start the 51job crawl loop, optionally routing through a tunnel proxy.

    Environment variables read here:
        PROXY_TUNNEL: Proxy address appended after ``<scheme>://``
            (presumably ``host:port`` — confirm against the proxy provider).
            The proxy is configured only when this is non-empty.
        PROXY_SCHEME: Proxy URL scheme (default ``http``).
        PROXY_USERNAME / PROXY_PASSWORD: Proxy credentials. They are embedded
            in the proxy URL only when BOTH are non-empty; otherwise the
            proxy URL carries no credentials.
    """
    # Local import keeps this block self-contained; only needed for the proxy.
    from urllib.parse import quote

    client_kwargs = {}

    tunnel = os.environ.get("PROXY_TUNNEL", "")
    if tunnel:
        scheme = os.environ.get("PROXY_SCHEME", "http")
        username = os.environ.get("PROXY_USERNAME", "")
        password = os.environ.get("PROXY_PASSWORD", "")
        if username and password:
            # Percent-encode the credentials: a raw "@", ":", "/", "#" or "?"
            # in either value would otherwise corrupt the URL's userinfo part
            # and make the proxy URL unparseable or point at the wrong host.
            user = quote(username, safe="")
            secret = quote(password, safe="")
            client_kwargs["tunnel_proxy"] = f"{scheme}://{user}:{secret}@{tunnel}"
        else:
            client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"

    run_crawl_loop(
        platform="qcwy",
        create_searcher=create_searcher,
        create_client_fn=create_client,
        max_pages=3,
        data_type="job",
        client_kwargs=client_kwargs,
        extract_company_id=extract_company_id,
        create_company_fetcher=create_company_fetcher,
    )
|
||
|
||
|
||
# Start the crawler only when this module is executed as a script
# (e.g. ``python -m spiderJobs.platforms.job51.main``), not when imported.
if __name__ == "__main__":
    main()
|