import csv
import io
import re
from typing import Any, Dict, List, Optional, Union

from fastapi import UploadFile
from loguru import logger

from app.services.crawler.boss import BossService
from app.services.crawler.qcwy import QcwyService
from app.services.crawler.zhilian import ZhilianService
from app.services.job import DataRouterService, DataType, PlatformType
from app.core.clickhouse import clickhouse_manager
from app.models.token import BossToken
from jobs_spider.qcwy.search_company_jobs import _extract_items as qcwy_extract_items


# Patterns that pull a platform-specific identifier out of a pasted URL (or
# prefixed ID). Compiled once here so every method shares the same definition.
_BOSS_JOB_URL_RE = re.compile(r'job_detail/([^.]+)\.html')
_BOSS_COMPANY_URL_RE = re.compile(r'gongsi/([^.]+)\.html')
_QCWY_JOB_URL_RE = re.compile(r'/(\d+)\.html')
_QCWY_COMPANY_ID_RE = re.compile(r'^co(\d+)$')
_ZHILIAN_JOB_URL_RE = re.compile(r'jobs\.zhaopin\.com/(\w+)\.htm')


def _extract_id(pattern: "re.Pattern", text: str) -> str:
    """Return the first capture group of *pattern* in *text*, or *text* itself.

    Inputs that do not look like a URL are assumed to already be a bare ID and
    are passed through unchanged.
    """
    match = pattern.search(text)
    return match.group(1) if match else text


class CleaningService:
    """Fetch job/company data from Boss, 51job (qcwy) and Zhilian and store it.

    Each ``clean_*`` method fetches data through one of the crawler services
    and hands it to ``DataRouterService.store_data``. They return either the
    dict produced by ``store_data`` (with the fetched payload attached under
    ``"original_data"``) or ``False`` when nothing was fetched or stored.

    NOTE(review): the crawler service methods are called without ``await``;
    presumably they are synchronous. If they perform blocking network I/O they
    will stall the event loop while running -- confirm against the services.
    """

    def __init__(self):
        self.boss_service = BossService()
        self.qcwy_service = QcwyService()
        self.zhilian_service = ZhilianService()
        # Created lazily by get_data_router() because it needs an async client.
        self.data_router: Optional[DataRouterService] = None
        self._boss_token_loaded = False

    def _apply_proxy(self, proxy: Optional[str]) -> None:
        """Forward *proxy* (possibly ``None``) to all three crawler services."""
        self.boss_service.set_proxy(proxy)
        self.qcwy_service.set_proxy(proxy)
        self.zhilian_service.set_proxy(proxy)

    async def _ensure_boss_token_loaded(self) -> None:
        """Load the most recently updated active ``BossToken`` into the Boss service.

        Does nothing when a token was already loaded and the Boss service still
        holds a non-empty ``"mpt"`` login value. Logs a warning and returns when
        no active token row exists.
        """
        if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
            return
        token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
        if not token_obj:
            logger.warning("BossToken not found or inactive")
            return
        # The second argument is passed as an empty string; its meaning is
        # defined by BossService.set_login_data (not visible here).
        self.boss_service.set_login_data(token_obj.mpt or "", "")
        self._boss_token_loaded = True

    async def get_data_router(self) -> DataRouterService:
        """Return the shared ``DataRouterService``, creating it on first use."""
        if self.data_router is None:
            client = await clickhouse_manager.get_client()
            self.data_router = DataRouterService(client)
        return self.data_router

    async def parse_file(self, file: UploadFile) -> List[str]:
        """Read an uploaded file and return its non-empty targets.

        Files whose name ends in ``.csv`` (case-insensitive) are parsed as CSV
        and the first column of every non-empty row is used. Any other file,
        including one without a filename, is treated as one target per line.

        Raises:
            UnicodeDecodeError: if the upload is not valid UTF-8.
        """
        content = await file.read()
        # "utf-8-sig" drops a leading BOM when present, for CSV and plain text
        # alike; str.strip() would not remove it from the first target.
        text = content.decode('utf-8-sig')
        # filename may be None for uploads sent without one.
        filename = (file.filename or "").lower()

        if filename.endswith('.csv'):
            reader = csv.reader(io.StringIO(text))
            targets = [row[0].strip() for row in reader if row]
        else:
            targets = [line.strip() for line in text.splitlines()]

        return [t for t in targets if t]

    async def _run_clean(
        self, target: str, clean_type: str, platform: str
    ) -> Union[bool, Dict[str, Any], None]:
        """Route *target* to the method selected by *clean_type* and *platform*.

        Returns ``None`` for combinations that have no handler.
        """
        if clean_type == "auto":
            return await self.clean_target_auto(target)

        if clean_type == "clean_url":
            if platform == "auto":
                return await self.clean_target_auto(target)
            if platform == "boss":
                return await self._process_boss_url(target)
            if platform == "qcwy":
                return await self._process_qcwy_url(target)
            if platform == "zhilian":
                return await self._process_zhilian_url(target)
            return None

        if clean_type == "job_id":
            return await self.clean_by_job_id(target, platform)
        if clean_type == "company_name":
            return await self.clean_by_company_name(target, platform)
        if clean_type == "company_id":
            return await self.clean_by_company_id(target, platform)

        if clean_type == "company_jobs":
            if platform == "boss":
                return await self.clean_boss_company_jobs(target)
            if platform == "qcwy":
                return await self.clean_qcwy_company_jobs(target)
            if platform == "zhilian":
                return await self.clean_zhilian_company_jobs(target)

        return None

    async def process_single_item(
        self,
        target: str,
        clean_type: str = "auto",
        platform: str = "auto",
        proxy: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Process one target and return a normalized status dict.

        Never raises: any exception is logged and reported in the returned
        dict with ``storage_status == "error"``.

        Args:
            target: URL, ID or company name to process.
            clean_type: ``"auto"``, ``"clean_url"``, ``"job_id"``,
                ``"company_name"``, ``"company_id"`` or ``"company_jobs"``.
            platform: ``"auto"``, ``"boss"``, ``"qcwy"`` or ``"zhilian"``.
            proxy: proxy string applied to all crawler services before the run.

        Returns:
            A dict with at least ``success``, ``target``, ``error``,
            ``storage_status`` and ``remote_sent``.
        """
        try:
            await self._ensure_boss_token_loaded()
            self._apply_proxy(proxy)

            result = await self._run_clean(target, clean_type, platform)

            if not result:
                return {
                    "success": False,
                    "target": target,
                    "error": "No data found or operation failed",
                    "storage_status": "failed",
                    "remote_sent": False
                }

            # Some legacy paths return a plain boolean instead of the
            # store_data dict; wrap it so callers always see the same shape.
            if isinstance(result, bool):
                return {
                    "success": result,
                    "target": target,
                    "error": None if result else "Operation failed",
                    "storage_status": "unknown",
                    "remote_sent": False
                }

            # Otherwise it is the dict returned by DataRouterService.store_data.
            succeeded = result.get("success", False)
            if result.get("duplicate"):
                storage_status = "duplicate"
            elif succeeded:
                storage_status = "saved"
            else:
                # A failed store must not be reported as "saved".
                storage_status = "failed"

            return {
                "success": succeeded,
                "target": target,
                "error": result.get("message") if not succeeded else None,
                "storage_status": storage_status,
                "remote_sent": result.get("remote_sent", False),
                "data_summary": result.get("data_summary"),  # Optional: summary of data
                "original_data": result.get("original_data")
            }

        except Exception as e:
            # Top-level boundary for a single item: keep the traceback in the
            # log and convert the failure into a status dict.
            logger.exception(f"Error processing item {target}: {e}")
            return {
                "success": False,
                "target": target,
                "error": str(e),
                "storage_status": "error",
                "remote_sent": False
            }

    async def clean_target_auto(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Pick a handler from the domain found in *target*.

        Targets containing none of the known domains are treated as a company
        name and searched on Boss.
        """
        if "zhipin.com" in target:
            return await self._process_boss_url(target)
        if "51job.com" in target:
            return await self._process_qcwy_url(target)
        if "zhaopin.com" in target:
            return await self._process_zhilian_url(target)
        return await self._process_search_company(target)

    async def _store_jobs(
        self,
        router: DataRouterService,
        jobs: Any,
        platform_type: Any,
        original_data: Any,
    ) -> Any:
        """Store every job in *jobs* and return the result of the last store.

        Only the last ``store_data`` result is reported; when it is a non-empty
        dict, *original_data* (the full fetched response) is attached to it
        under ``"original_data"``. Returns ``None`` when *jobs* is empty.
        """
        last_result = None
        for job in jobs:
            last_result = await router.store_data(job, DataType.JOB, platform_type)
        if last_result and isinstance(last_result, dict):
            last_result["original_data"] = original_data
        return last_result

    @staticmethod
    def _dict_or_false(result: Any) -> Union[bool, Dict[str, Any]]:
        """Return *result* when it is a non-empty dict, otherwise ``False``."""
        if result and isinstance(result, dict):
            return result
        return False

    async def clean_by_job_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Fetch one job by ID (or by a URL containing the ID) and store it.

        Returns the ``store_data`` dict with ``"original_data"`` attached, or
        ``False`` for an unknown platform, an empty fetch or a non-dict result.
        """
        router = await self.get_data_router()

        # Accept a full URL as well as a bare ID.
        if platform == "boss":
            data = self.boss_service.get_job_detail_by_id(_extract_id(_BOSS_JOB_URL_RE, target))
            platform_type = PlatformType.BOSS
        elif platform == "qcwy":
            data = self.qcwy_service.get_job_detail(_extract_id(_QCWY_JOB_URL_RE, target))
            platform_type = PlatformType.QCWY
        elif platform == "zhilian":
            data = self.zhilian_service.get_job_detail(_extract_id(_ZHILIAN_JOB_URL_RE, target))
            platform_type = PlatformType.ZHILIAN
        else:
            return False

        if not data:
            return False

        result = await router.store_data(data, DataType.JOB, platform_type)
        if result and isinstance(result, dict):
            result['original_data'] = data
            return result
        return False

    async def clean_by_company_name(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Search jobs by company name on *platform* and store every hit.

        A search may return several jobs; all are stored, but only the result
        of the last store is returned as an indicative status, with the full
        search response attached as ``"original_data"``.
        """
        router = await self.get_data_router()

        if platform == "boss":
            res = self.boss_service.search_jobs(target)
            if res and res.get('zpData') and res['zpData'].get('list'):
                last_result = await self._store_jobs(
                    router, res['zpData']['list'], PlatformType.BOSS, res
                )
                return last_result or False

        elif platform == "qcwy":
            res = self.qcwy_service.search_jobs(target)
            if res:
                # presumably a list of job dicts -- verify against QcwyService
                last_result = await self._store_jobs(router, res, PlatformType.QCWY, res)
                return last_result or False

        elif platform == "zhilian":
            res = self.zhilian_service.search_company_jobs_by_name(target)
            if res and isinstance(res, dict):
                payload = res.get("data") or {}
                items = payload.get("list") or []
                if not isinstance(items, list):
                    items = []
                last_result = await self._store_jobs(router, items, PlatformType.ZHILIAN, res)
                return last_result or False

        return False

    async def clean_by_company_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Fetch one company by ID and store it.

        For qcwy a ``co<digits>`` ID is reduced to its digits before the fetch.
        Returns the ``store_data`` dict with ``"original_data"`` attached, or
        ``False`` for an unknown platform, an empty fetch or a non-dict result.
        """
        router = await self.get_data_router()

        if platform == "boss":
            data = self.boss_service.get_company_detail_by_id(target)
            platform_type = PlatformType.BOSS
        elif platform == "qcwy":
            data = self.qcwy_service.get_company_info(_extract_id(_QCWY_COMPANY_ID_RE, target))
            platform_type = PlatformType.QCWY
        elif platform == "zhilian":
            data = self.zhilian_service.get_company_detail(target)
            platform_type = PlatformType.ZHILIAN
        else:
            return False

        if not data:
            return False

        result = await router.store_data(data, DataType.COMPANY, platform_type)
        if result and isinstance(result, dict):
            result['original_data'] = data
            return result
        return False

    async def clean_boss_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Fetch and store all jobs of a Boss company given its ID or page URL.

        Jobs are read from ``zpData["jobList"]`` or, failing that,
        ``zpData["list"]``; the first of the two that is a list is used.
        """
        router = await self.get_data_router()
        company_id = _extract_id(_BOSS_COMPANY_URL_RE, target)

        data = self.boss_service.get_company_jobs_by_id(company_id)
        if not data:
            return False

        jobs: List[Any] = []
        zp_data = data.get("zpData") if isinstance(data, dict) else None
        if isinstance(zp_data, dict):
            for key in ("jobList", "list"):
                candidate = zp_data.get(key)
                if isinstance(candidate, list):
                    jobs = candidate
                    break

        if not jobs:
            return False

        last_result = await self._store_jobs(router, jobs, PlatformType.BOSS, data)
        return self._dict_or_false(last_result)

    async def clean_qcwy_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Fetch and store all jobs of a 51job company given its ID.

        A ``co<digits>`` ID is reduced to its digits before the fetch.
        """
        router = await self.get_data_router()
        company_id = _extract_id(_QCWY_COMPANY_ID_RE, target)

        data = self.qcwy_service.get_company_jobs_by_id(company_id)
        if not data:
            return False

        jobs_list = qcwy_extract_items(data)
        jobs: List[Dict[str, Any]] = jobs_list if isinstance(jobs_list, list) else []
        if not jobs:
            return False

        last_result = await self._store_jobs(router, jobs, PlatformType.QCWY, data)
        return self._dict_or_false(last_result)

    async def clean_zhilian_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Fetch and store all jobs of a Zhilian company given its ID.

        Jobs are read from ``data["list"]`` of the fetched response.
        """
        router = await self.get_data_router()

        data = self.zhilian_service.get_company_jobs_by_id(target)
        if not data or not isinstance(data, dict):
            return False

        data_field = data.get("data") or {}
        jobs = data_field.get("list") or []
        if not isinstance(jobs, list) or not jobs:
            return False

        last_result = await self._store_jobs(router, jobs, PlatformType.ZHILIAN, data)
        return self._dict_or_false(last_result)

    async def _process_boss_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a Boss URL: job detail page, company page, or bare job ID."""
        job_match = _BOSS_JOB_URL_RE.search(url)
        if job_match:
            return await self.clean_by_job_id(job_match.group(1), "boss")

        company_match = _BOSS_COMPANY_URL_RE.search(url)
        if company_match:
            return await self.clean_by_company_id(company_match.group(1), "boss")

        # Fallback: assume it's a job ID
        return await self.clean_by_job_id(url, "boss")

    async def _process_qcwy_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a 51job job URL; anything else is assumed to be a job ID."""
        return await self.clean_by_job_id(_extract_id(_QCWY_JOB_URL_RE, url), "qcwy")

    async def _process_zhilian_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a Zhilian job URL; anything else is assumed to be a job ID."""
        return await self.clean_by_job_id(_extract_id(_ZHILIAN_JOB_URL_RE, url), "zhilian")

    async def _process_search_company(self, name: str) -> Union[bool, Dict[str, Any]]:
        """Treat *name* as a company name and search it on Boss."""
        return await self.clean_by_company_name(name, "boss")