# JobData/app/services/cleaning.py
#
# 364 lines
# 15 KiB
# Python

import csv
import io
import re
from typing import List, Dict, Any, Union, Optional
from fastapi import UploadFile
from loguru import logger
from app.services.crawler.boss import BossService
from app.services.crawler.qcwy import QcwyService
from app.services.crawler.zhilian import ZhilianService
from app.services.job import DataRouterService, DataType, PlatformType
from app.core.clickhouse import clickhouse_manager
from app.models.token import BossToken
from jobs_spider.qcwy.search_company_jobs import _extract_items as qcwy_extract_items
class CleaningService:
    """Fetch job/company records from the platform crawlers and store them.

    Every ``clean_*`` coroutine obtains a raw payload from one of the platform
    services (``boss``, ``qcwy``, ``zhilian``) and hands it to
    ``DataRouterService.store_data``.  A handler returns ``False`` when nothing
    was fetched or stored; otherwise it returns whatever ``store_data``
    returned, with the raw payload attached under ``"original_data"`` when that
    value is a dict.

    NOTE(review): the crawler-service methods are called without ``await``
    from inside coroutines; if they perform blocking I/O they will stall the
    event loop -- confirm against the service implementations.
    """

    # Patterns that pull a job id out of a job-detail URL, keyed by platform.
    _JOB_ID_PATTERNS = {
        "boss": re.compile(r'job_detail/([^.]+)\.html'),
        "qcwy": re.compile(r'/(\d+)\.html'),
        "zhilian": re.compile(r'jobs\.zhaopin\.com/(\w+)\.htm'),
    }
    # Pattern that pulls a company id out of a boss company-page URL.
    _BOSS_COMPANY_PATTERN = re.compile(r'gongsi/([^.]+)\.html')
    # qcwy company ids may be written as "co<digits>"; only the digits are used.
    _QCWY_COMPANY_ID_PATTERN = re.compile(r"^co(\d+)$")

    def __init__(self):
        """Create the three crawler services; the data router is built lazily."""
        self.boss_service = BossService()
        self.qcwy_service = QcwyService()
        self.zhilian_service = ZhilianService()
        self.data_router = None
        self._boss_token_loaded = False

    # ------------------------------------------------------------------
    # Setup helpers
    # ------------------------------------------------------------------

    def _apply_proxy(self, proxy: Optional[str]) -> None:
        """Set ``proxy`` (possibly ``None``) on all three crawler services."""
        for service in (self.boss_service, self.qcwy_service, self.zhilian_service):
            service.set_proxy(proxy)

    async def _ensure_boss_token_loaded(self) -> None:
        """Load the most recently updated active ``BossToken`` into the boss service.

        Does nothing when a token was already loaded and the service still
        holds a non-empty ``mpt`` value.  When no active token exists a warning
        is logged and the service is left untouched.
        """
        if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
            return
        token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
        if not token_obj:
            logger.warning("BossToken not found or inactive")
            return
        self.boss_service.set_login_data(token_obj.mpt or "", "")
        self._boss_token_loaded = True

    async def get_data_router(self) -> "DataRouterService":
        """Return the shared ``DataRouterService``, creating it on first use."""
        if not self.data_router:
            client = await clickhouse_manager.get_client()
            self.data_router = DataRouterService(client)
        return self.data_router

    # ------------------------------------------------------------------
    # Input parsing
    # ------------------------------------------------------------------

    async def parse_file(self, file: "UploadFile") -> List[str]:
        """Read an uploaded file and return its non-empty, stripped targets.

        Files whose name ends in ``.csv`` (case-insensitive) are parsed with
        the ``csv`` module and contribute the first column of every row; any
        other file contributes one target per line.

        Raises:
            UnicodeDecodeError: if the content is not valid UTF-8.
        """
        content = await file.read()
        # ``filename`` may be None for uploads sent without a name.
        filename = (file.filename or "").lower()
        # "utf-8-sig" drops a leading BOM for both CSV and plain-text input.
        text = content.decode("utf-8-sig")
        if filename.endswith(".csv"):
            # newline="" lets the csv module see the original line endings,
            # as the csv documentation recommends.
            rows = csv.reader(io.StringIO(text, newline=""))
            candidates = [row[0] for row in rows if row]
        else:
            candidates = text.splitlines()
        stripped = (candidate.strip() for candidate in candidates)
        return [item for item in stripped if item]

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    async def process_single_item(self, target: str, clean_type: str = "auto", platform: str = "auto", proxy: Optional[str] = None) -> Dict[str, Any]:
        """Process one target and return a normalized status report.

        Args:
            target: URL, id or company name, depending on ``clean_type``.
            clean_type: one of ``"auto"``, ``"clean_url"``, ``"job_id"``,
                ``"company_name"``, ``"company_id"``, ``"company_jobs"``.
            platform: ``"auto"``, ``"boss"``, ``"qcwy"`` or ``"zhilian"``.
                Combinations with no handler yield a "failed" report.
            proxy: proxy applied to all crawler services before the fetch.

        Returns:
            A dict with ``success``, ``target``, ``error``, ``storage_status``
            and ``remote_sent``; dict results from the store additionally carry
            ``data_summary`` and ``original_data``.  Exceptions are caught,
            logged and reported with ``storage_status == "error"``.
        """
        try:
            await self._ensure_boss_token_loaded()
            self._apply_proxy(proxy)
            result = await self._dispatch(target, clean_type, platform)
            return self._build_report(target, result)
        except Exception as e:
            # Top-level boundary: log with traceback and report the failure
            # instead of propagating it to the caller.
            logger.exception(f"Error processing item {target}: {e}")
            return {
                "success": False,
                "target": target,
                "error": str(e),
                "storage_status": "error",
                "remote_sent": False
            }

    async def _dispatch(self, target: str, clean_type: str, platform: str) -> Any:
        """Route ``target`` to the handler for ``clean_type``/``platform``.

        Returns ``None`` when the combination has no handler.
        """
        if clean_type == "auto":
            return await self.clean_target_auto(target)
        if clean_type == "clean_url":
            if platform == "auto":
                return await self.clean_target_auto(target)
            if platform == "boss":
                return await self._process_boss_url(target)
            if platform == "qcwy":
                return await self._process_qcwy_url(target)
            if platform == "zhilian":
                return await self._process_zhilian_url(target)
            return None
        if clean_type == "job_id":
            return await self.clean_by_job_id(target, platform)
        if clean_type == "company_name":
            return await self.clean_by_company_name(target, platform)
        if clean_type == "company_id":
            return await self.clean_by_company_id(target, platform)
        if clean_type == "company_jobs":
            if platform == "boss":
                return await self.clean_boss_company_jobs(target)
            if platform == "qcwy":
                return await self.clean_qcwy_company_jobs(target)
            if platform == "zhilian":
                return await self.clean_zhilian_company_jobs(target)
        return None

    @staticmethod
    def _build_report(target: str, result: Any) -> Dict[str, Any]:
        """Turn a handler result (falsy, bool or store dict) into a report."""
        if not result:
            return {
                "success": False,
                "target": target,
                "error": "No data found or operation failed",
                "storage_status": "failed",
                "remote_sent": False
            }
        # Some legacy paths return a bare boolean instead of a store dict.
        if isinstance(result, bool):
            return {
                "success": result,
                "target": target,
                "error": None if result else "Operation failed",
                "storage_status": "unknown",
                "remote_sent": False
            }
        # Dict returned by DataRouterService.store_data.
        success = result.get("success", False)
        if result.get("duplicate"):
            storage_status = "duplicate"
        elif success:
            storage_status = "saved"
        else:
            # A store that did not succeed must not be reported as saved.
            storage_status = "failed"
        return {
            "success": success,
            "target": target,
            "error": None if success else result.get("message"),
            "storage_status": storage_status,
            "remote_sent": result.get("remote_sent", False),
            "data_summary": result.get("data_summary"),
            "original_data": result.get("original_data")
        }

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @classmethod
    def _extract_job_id(cls, target: str, platform: str) -> str:
        """Return the job id embedded in a job URL, or ``target`` unchanged."""
        pattern = cls._JOB_ID_PATTERNS.get(platform)
        match = pattern.search(target) if pattern else None
        return match.group(1) if match else target

    @classmethod
    def _normalize_qcwy_company_id(cls, target: str) -> str:
        """Strip the ``co`` prefix from a qcwy company id such as ``co123``."""
        match = cls._QCWY_COMPANY_ID_PATTERN.match(target)
        return match.group(1) if match else target

    @staticmethod
    def _zhilian_job_list(payload: Any) -> List[Any]:
        """Return ``payload["data"]["list"]`` when it is a list, else ``[]``."""
        if not isinstance(payload, dict):
            return []
        data = payload.get("data")
        if not isinstance(data, dict):
            return []
        jobs = data.get("list")
        return jobs if isinstance(jobs, list) else []

    @staticmethod
    def _with_original(result: Any, original_data: Any) -> Union[bool, Dict[str, Any]]:
        """Attach the raw payload to a store result; falsy results become False."""
        if not result:
            return False
        if isinstance(result, dict):
            result["original_data"] = original_data
        return result

    async def _store_jobs(self, router: Any, jobs: Any, platform_type: Any, original_data: Any) -> Union[bool, Dict[str, Any]]:
        """Store every job in ``jobs`` and return the result of the last store.

        Only the last store result is reported; earlier results are discarded,
        so the returned status is indicative rather than a per-job summary.
        """
        last_result = None
        for job in jobs:
            last_result = await router.store_data(job, DataType.JOB, platform_type)
        return self._with_original(last_result, original_data)

    # ------------------------------------------------------------------
    # Handlers
    # ------------------------------------------------------------------

    async def clean_target_auto(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Pick a handler from the domain in ``target``.

        Targets containing none of the known domains are treated as a company
        name and searched on boss.
        """
        if "zhipin.com" in target:
            return await self._process_boss_url(target)
        if "51job.com" in target:
            return await self._process_qcwy_url(target)
        if "zhaopin.com" in target:
            return await self._process_zhilian_url(target)
        return await self._process_search_company(target)

    async def clean_by_job_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Fetch one job by id (or job URL) and store it.

        Returns ``False`` for an unknown platform or when no data is fetched.
        """
        router = await self.get_data_router()
        # Accept a full job URL as well as a bare id.
        job_id = self._extract_job_id(target, platform)
        if platform == "boss":
            data = self.boss_service.get_job_detail_by_id(job_id)
            platform_type = PlatformType.BOSS
        elif platform == "qcwy":
            data = self.qcwy_service.get_job_detail(job_id)
            platform_type = PlatformType.QCWY
        elif platform == "zhilian":
            data = self.zhilian_service.get_job_detail(job_id)
            platform_type = PlatformType.ZHILIAN
        else:
            return False
        if not data:
            return False
        result = await router.store_data(data, DataType.JOB, platform_type)
        return self._with_original(result, data)

    async def clean_by_company_name(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Search jobs by company name and store every hit.

        The full search response is attached as ``original_data`` to the
        result of the last stored job.  Returns ``False`` for an unknown
        platform or an empty search.
        """
        router = await self.get_data_router()
        if platform == "boss":
            res = self.boss_service.search_jobs(target)
            zp_data = res.get("zpData") if isinstance(res, dict) else None
            jobs = zp_data.get("list") if isinstance(zp_data, dict) else None
            if not jobs:
                return False
            return await self._store_jobs(router, jobs, PlatformType.BOSS, res)
        if platform == "qcwy":
            res = self.qcwy_service.search_jobs(target)
            if not res:
                return False
            return await self._store_jobs(router, res, PlatformType.QCWY, res)
        if platform == "zhilian":
            res = self.zhilian_service.search_company_jobs_by_name(target)
            jobs = self._zhilian_job_list(res)
            if not jobs:
                return False
            return await self._store_jobs(router, jobs, PlatformType.ZHILIAN, res)
        return False

    async def clean_by_company_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Fetch one company by id and store it.

        Returns ``False`` for an unknown platform or when no data is fetched.
        """
        router = await self.get_data_router()
        if platform == "boss":
            data = self.boss_service.get_company_detail_by_id(target)
            platform_type = PlatformType.BOSS
        elif platform == "qcwy":
            company_id = self._normalize_qcwy_company_id(target)
            data = self.qcwy_service.get_company_info(company_id)
            platform_type = PlatformType.QCWY
        elif platform == "zhilian":
            data = self.zhilian_service.get_company_detail(target)
            platform_type = PlatformType.ZHILIAN
        else:
            return False
        if not data:
            return False
        result = await router.store_data(data, DataType.COMPANY, platform_type)
        return self._with_original(result, data)

    async def clean_boss_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Fetch and store all jobs of a boss company (id or company URL)."""
        router = await self.get_data_router()
        match = self._BOSS_COMPANY_PATTERN.search(target)
        company_id = match.group(1) if match else target
        data = self.boss_service.get_company_jobs_by_id(company_id)
        if not data:
            return False
        jobs: List[Any] = []
        zp_data = data.get("zpData") if isinstance(data, dict) else None
        if isinstance(zp_data, dict):
            # The first of these keys that holds a list wins, even if empty.
            for key in ("jobList", "list"):
                candidate = zp_data.get(key)
                if isinstance(candidate, list):
                    jobs = candidate
                    break
        if not jobs:
            return False
        return await self._store_jobs(router, jobs, PlatformType.BOSS, data)

    async def clean_qcwy_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Fetch and store all jobs of a qcwy company (``co<digits>`` or digits)."""
        router = await self.get_data_router()
        company_id = self._normalize_qcwy_company_id(target)
        data = self.qcwy_service.get_company_jobs_by_id(company_id)
        if not data:
            return False
        jobs = qcwy_extract_items(data)
        if not isinstance(jobs, list) or not jobs:
            return False
        return await self._store_jobs(router, jobs, PlatformType.QCWY, data)

    async def clean_zhilian_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Fetch and store all jobs of a zhilian company id."""
        router = await self.get_data_router()
        data = self.zhilian_service.get_company_jobs_by_id(target)
        jobs = self._zhilian_job_list(data)
        if not jobs:
            return False
        return await self._store_jobs(router, jobs, PlatformType.ZHILIAN, data)

    # ------------------------------------------------------------------
    # URL handlers
    # ------------------------------------------------------------------

    async def _process_boss_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a boss URL: job page, then company page, else treat as job id."""
        job_match = self._JOB_ID_PATTERNS["boss"].search(url)
        if job_match:
            return await self.clean_by_job_id(job_match.group(1), "boss")
        company_match = self._BOSS_COMPANY_PATTERN.search(url)
        if company_match:
            return await self.clean_by_company_id(company_match.group(1), "boss")
        # Fallback: assume the input already is a job id.
        return await self.clean_by_job_id(url, "boss")

    async def _process_qcwy_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a qcwy job URL; an unmatched input is treated as a job id."""
        return await self.clean_by_job_id(self._extract_job_id(url, "qcwy"), "qcwy")

    async def _process_zhilian_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a zhilian job URL; an unmatched input is treated as a job id."""
        return await self.clean_by_job_id(self._extract_job_id(url, "zhilian"), "zhilian")

    async def _process_search_company(self, name: str) -> Union[bool, Dict[str, Any]]:
        """Search ``name`` as a company name on boss."""
        return await self.clean_by_company_name(name, "boss")