"""
crawler_core.base — shared base classes and data structures.

Provides the pieces common to every recruitment-platform crawler:
``Result``, ``BaseFetcher``, ``BaseSearcher`` and ``parse_response``.
Contains no platform-specific code.

The package ``__init__`` re-exports these four names together with
``HTTPClient``, which it imports from ``crawler_core.http_client``.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar

if TYPE_CHECKING:
    # HTTPClient is referenced only in annotations, and annotations are never
    # evaluated at runtime under ``from __future__ import annotations``.
    from crawler_core.http_client import HTTPClient

T = TypeVar("T")

_logger = logging.getLogger("crawler_core.base")


# ─────────────────────────────────────────────
# Shared data structures
# ─────────────────────────────────────────────

@dataclass
class Result(Generic[T]):
    """Typed result wrapper returned by BaseFetcher and BaseSearcher methods.

    Callers annotate as ``Result[MyJobModel]`` etc.
    """

    # True when the request was accepted and parsed.
    success: bool
    # Business status code when the body carried one, otherwise the HTTP code.
    status_code: int
    # Raw payload of a successful response.
    data: Optional[T] = None
    # Items of a list response. The field name shadows the ``list`` builtin
    # for the rest of this class body, so no later field may call ``list``.
    list: list[T] = field(default_factory=list)
    # Value of the payload's "count" key (presumably the total hit count —
    # confirm against the platform API).
    count: int = 0
    # True when no further page should be requested.
    is_end_page: bool = True
    # Human-readable failure description; None on success.
    error: Optional[str] = None


# ─────────────────────────────────────────────
# Default response parsing (overridable)
# ─────────────────────────────────────────────

def parse_response(http_code: int, raw: Any) -> Result[Any]:
    """Parse a response with the default algorithm.

    Platforms whose response format differs can supply their own parsing in
    the subclass ``_parse`` hook instead of calling this function.

    Args:
        http_code: HTTP status code of the response.
        raw: Decoded response body. A ``dict`` is inspected for the keys
            ``statusCode``, ``statusDescription``, ``message`` and ``data``;
            any other type is treated as a body without business fields.

    Returns:
        A failed ``Result`` unless both the HTTP code and the business code
        are 200. On success ``data`` holds the ``data`` payload; when that
        payload is a dict containing ``list``, the ``list``, ``count`` and
        ``isEndPage`` entries are copied into the result as well.
    """
    is_mapping = isinstance(raw, dict)
    # A non-dict body has no business code, so the HTTP code stands in for it.
    biz_code = raw.get("statusCode") if is_mapping else http_code

    if http_code != 200 or biz_code != 200:
        # A dict body without "statusCode" leaves biz_code as None; report
        # the HTTP code then rather than the text "None".
        shown_code = http_code if biz_code is None else biz_code
        if is_mapping:
            error = (
                raw.get("statusDescription")
                or raw.get("message")
                or f"请求失败: {shown_code}"
            )
        else:
            error = f"请求失败: {http_code}"
        return Result(
            success=False,
            status_code=biz_code or http_code,
            error=error,
        )

    payload = (raw.get("data") or {}) if is_mapping else {}

    if isinstance(payload, dict) and "list" in payload:
        # JSON null values would otherwise reach callers as None and break
        # list handling such as BaseSearcher.load_all().
        end_flag = payload.get("isEndPage")
        return Result(
            success=True,
            status_code=200,
            data=payload,
            list=payload.get("list") or [],
            count=payload.get("count") or 0,
            is_end_page=True if end_flag is None else end_flag,
        )

    return Result(success=True, status_code=200, data=payload)


# ─────────────────────────────────────────────
# Base fetcher (GET, single-item detail)
# ─────────────────────────────────────────────

class BaseFetcher:
    """Template-method base class for single-item fetchers.

    Required overrides: _build_params(), _parse()
    Optional overrides: _build_headers(), _check_blocked()

    ``http_client`` must offer ``get(endpoint, params=..., headers=...)``
    returning a ``(http_code, raw)`` pair.
    """

    ENDPOINT: str = ""

    def __init__(self, http_client: HTTPClient) -> None:
        self.http_client = http_client

    # --- Required template methods ---

    def _build_params(self) -> dict:
        """Build query parameters for the request. MUST be overridden."""
        raise NotImplementedError(
            f"{type(self).__name__} must implement _build_params()"
        )

    def _parse(self, http_code: int, raw: Any) -> Result:
        """Parse the HTTP response into a Result. MUST be overridden."""
        raise NotImplementedError(
            f"{type(self).__name__} must implement _parse()"
        )

    # --- Optional template methods ---

    def _build_headers(self) -> dict:
        """Build extra request headers.

        Override to add platform-specific headers. The default returns ``{}``,
        which fetch() forwards to the client as ``headers=None``.
        """
        return {}

    def _check_blocked(self, status_code: int, body: str) -> bool:
        """Detect platform-specific anti-crawl blocks.

        Override to inspect the status code and the body text for block
        signals. The default returns False (assume not blocked).
        """
        return False

    # --- Orchestration ---

    def fetch(self) -> Result:
        """Run one fetch: build params → request → check blocked → parse.

        Returns:
            ``Result(success=False, error="blocked")`` when _check_blocked()
            reports a block, otherwise whatever _parse() returns.
        """
        params = self._build_params()
        extra_headers = self._build_headers()
        http_code, raw = self.http_client.get(
            self.ENDPOINT, params=params, headers=extra_headers or None
        )
        # _check_blocked() works on text; _parse() still gets the raw body.
        body_text = raw if isinstance(raw, str) else str(raw)
        if self._check_blocked(http_code, body_text):
            return Result(success=False, status_code=http_code, error="blocked")
        return self._parse(http_code, raw)


# ─────────────────────────────────────────────
# Base searcher (search + pagination)
# ─────────────────────────────────────────────

class BaseSearcher:
    """Template-method base class for paginated list searchers.

    Required overrides: _build_params(), _parse()
    Optional overrides: _build_headers(), _check_blocked()

    ``page_size`` is stored for subclasses; nothing in this base class reads
    it. ``http_client`` may be omitted at construction but must be set before
    the first request.
    """

    ENDPOINT: str = ""

    def __init__(
        self, page_size: int = 15, http_client: Optional[HTTPClient] = None
    ) -> None:
        self.page_size = page_size
        self.http_client = http_client

    # --- Required template methods ---

    def _build_params(self, page_index: int) -> dict:
        """Build pagination query params. MUST be overridden."""
        raise NotImplementedError(
            f"{type(self).__name__} must implement _build_params()"
        )

    def _parse(self, http_code: int, raw: Any) -> Result:
        """Parse the HTTP response into a Result. MUST be overridden."""
        raise NotImplementedError(
            f"{type(self).__name__} must implement _parse()"
        )

    # --- Optional template methods ---

    def _build_headers(self) -> dict:
        """Build extra request headers.

        Override for platform-specific headers. The default returns ``{}``,
        which _request() forwards to the client as ``headers=None``.
        """
        return {}

    def _check_blocked(self, status_code: int, body: str) -> bool:
        """Detect platform-specific anti-crawl blocks.

        Override to inspect the status code and the body text for block
        signals. The default returns False (assume not blocked).
        """
        return False

    # --- Orchestration ---

    def _request(self, params: dict) -> tuple[int, Any]:
        """Execute a single HTTP request with the headers of _build_headers().

        Raises:
            RuntimeError: if no ``http_client`` has been configured.
        """
        if self.http_client is None:
            # The constructor default is None; fail with an actionable message
            # instead of "'NoneType' object has no attribute 'get'".
            raise RuntimeError(
                f"{type(self).__name__} has no http_client; "
                "pass one to __init__ before searching"
            )
        extra_headers = self._build_headers()
        return self.http_client.get(
            self.ENDPOINT, params=params, headers=extra_headers or None
        )

    def search(self, page_index: int = 1) -> Result:
        """Fetch one page: build params → request → check blocked → parse.

        Returns:
            ``Result(success=False, error="blocked")`` when _check_blocked()
            reports a block, otherwise whatever _parse() returns.
        """
        params = self._build_params(page_index)
        http_code, raw = self._request(params)
        # _check_blocked() works on text; _parse() still gets the raw body.
        body_text = raw if isinstance(raw, str) else str(raw)
        if self._check_blocked(http_code, body_text):
            return Result(success=False, status_code=http_code, error="blocked")
        return self._parse(http_code, raw)

    def load_all(
        self,
        max_pages: int = 10,
        on_page: Optional[Callable[[int, Result], None]] = None,
    ) -> list:
        """Collect items page by page, starting at page 1.

        Stops after ``max_pages`` pages, at the first page whose result has
        ``is_end_page`` set, or at the first failed page (which is logged as
        a warning and contributes no items).

        Args:
            max_pages: Upper bound on the number of pages requested.
            on_page: Optional callback invoked as ``on_page(page_index,
                result)`` after each successful page.

        Returns:
            All items gathered from the successful pages, in page order.
        """
        all_items: list = []
        for page_index in range(1, max_pages + 1):
            result = self.search(page_index)
            if not result.success:
                _logger.warning("第 %d 页失败: %s", page_index, result.error)
                break
            # A custom _parse() may leave ``list`` as None.
            all_items.extend(result.list or [])
            if on_page is not None:
                on_page(page_index, result)
            if result.is_end_page:
                break
        return all_items