# Listing metadata from the source viewer: 364 lines, 15 KiB, Python.
import csv
|
|
import io
|
|
import re
|
|
from typing import List, Dict, Any, Union, Optional
|
|
from fastapi import UploadFile
|
|
from loguru import logger
|
|
from app.services.crawler.boss import BossService
|
|
from app.services.crawler.qcwy import QcwyService
|
|
from app.services.crawler.zhilian import ZhilianService
|
|
from app.services.job import DataRouterService, DataType, PlatformType
|
|
from app.core.clickhouse import clickhouse_manager
|
|
from app.models.token import BossToken
|
|
from jobs_spider.qcwy.search_company_jobs import _extract_items as qcwy_extract_items
|
|
|
|
class CleaningService:
    """Fetch job/company records from Boss, 51job (qcwy) and Zhilian and store them.

    Each ``clean_*`` method asks one of the platform crawler services for
    data and hands whatever comes back to ``DataRouterService.store_data``.
    ``process_single_item`` is the entry point that picks the right
    ``clean_*`` method and converts its outcome into a uniform report dict.

    NOTE(review): the crawler service calls are made without ``await``, so
    they run synchronously inside these coroutines — confirm that blocking
    the event loop for their duration is acceptable.
    """

    # Patterns used to pull an identifier out of a pasted URL / prefixed id.
    _BOSS_JOB_URL_RE = re.compile(r'job_detail/([^.]+)\.html')
    _BOSS_COMPANY_URL_RE = re.compile(r'gongsi/([^.]+)\.html')
    _QCWY_JOB_URL_RE = re.compile(r'/(\d+)\.html')
    _QCWY_COMPANY_ID_RE = re.compile(r'^co(\d+)$')
    _ZHILIAN_JOB_URL_RE = re.compile(r'jobs\.zhaopin\.com/(\w+)\.htm')

    def __init__(self):
        """Create one crawler service per platform; the data router is lazy."""
        self.boss_service = BossService()
        self.qcwy_service = QcwyService()
        self.zhilian_service = ZhilianService()
        # Built on first use by get_data_router().
        self.data_router = None
        self._boss_token_loaded = False

    def _apply_proxy(self, proxy: Optional[str]) -> None:
        """Forward ``proxy`` (possibly ``None``) to all three crawler services."""
        for service in (self.boss_service, self.qcwy_service, self.zhilian_service):
            service.set_proxy(proxy)

    async def _ensure_boss_token_loaded(self) -> None:
        """Load the Boss login token from storage unless one is already set.

        Queries ``BossToken`` for active rows ordered by ``-updated_at`` and
        passes the first row's ``mpt`` to the Boss service. Logs a warning
        and leaves the service untouched when no row is found.
        """
        if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
            return
        token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
        if not token_obj:
            logger.warning("BossToken not found or inactive")
            return
        self.boss_service.set_login_data(token_obj.mpt or "", "")
        self._boss_token_loaded = True

    async def get_data_router(self) -> "DataRouterService":
        """Return the shared ``DataRouterService``, creating it on first call."""
        if not self.data_router:
            client = await clickhouse_manager.get_client()
            self.data_router = DataRouterService(client)
        return self.data_router

    async def parse_file(self, file: "UploadFile") -> List[str]:
        """Read an uploaded file and return its non-empty targets.

        Files whose name ends in ``.csv`` (any letter case) are parsed as CSV
        and the first column of every row is used; anything else is read one
        target per line. Values are stripped and blanks are dropped.

        Raises:
            UnicodeDecodeError: if the payload is not valid UTF-8.
        """
        content = await file.read()
        # utf-8-sig drops a leading BOM, so the first target is clean for
        # plain-text uploads as well as CSV ones.
        text = content.decode('utf-8-sig')
        # The upload may carry no filename at all; treat that as plain text.
        filename = (file.filename or "").lower()

        if filename.endswith('.csv'):
            targets = [row[0].strip() for row in csv.reader(io.StringIO(text)) if row]
        else:
            targets = [line.strip() for line in text.splitlines()]

        return [t for t in targets if t]

    async def process_single_item(self, target: str, clean_type: str = "auto", platform: str = "auto", proxy: Optional[str] = None) -> Dict[str, Any]:
        """Clean one target and describe the outcome as a report dict.

        Args:
            target: URL, id or company name to process.
            clean_type: ``auto``, ``clean_url``, ``job_id``, ``company_name``,
                ``company_id`` or ``company_jobs``.
            platform: ``auto``, ``boss``, ``qcwy`` or ``zhilian``; which values
                are honoured depends on ``clean_type``.
            proxy: proxy string applied to every crawler service before the
                request, or ``None``.

        Returns:
            A dict with at least ``success``, ``target``, ``error``,
            ``storage_status`` and ``remote_sent``. Exceptions are never
            propagated; they are logged and reported with
            ``storage_status == "error"``.
        """
        try:
            await self._ensure_boss_token_loaded()
            self._apply_proxy(proxy)
            result = await self._dispatch_clean(target, clean_type, platform)
            return self._build_report(target, result)
        except Exception as e:
            logger.error(f"Error processing item {target}: {e}")
            return {
                "success": False,
                "target": target,
                "error": str(e),
                "storage_status": "error",
                "remote_sent": False
            }

    async def _dispatch_clean(self, target: str, clean_type: str, platform: str) -> Any:
        """Run the clean routine selected by ``clean_type``/``platform``.

        Returns ``None`` when the combination is not supported.
        """
        if clean_type == "auto":
            return await self.clean_target_auto(target)
        if clean_type == "job_id":
            return await self.clean_by_job_id(target, platform)
        if clean_type == "company_name":
            return await self.clean_by_company_name(target, platform)
        if clean_type == "company_id":
            return await self.clean_by_company_id(target, platform)

        if clean_type == "clean_url":
            handlers = {
                "auto": self.clean_target_auto,
                "boss": self._process_boss_url,
                "qcwy": self._process_qcwy_url,
                "zhilian": self._process_zhilian_url,
            }
        elif clean_type == "company_jobs":
            handlers = {
                "boss": self.clean_boss_company_jobs,
                "qcwy": self.clean_qcwy_company_jobs,
                "zhilian": self.clean_zhilian_company_jobs,
            }
        else:
            return None

        handler = handlers.get(platform)
        return await handler(target) if handler else None

    @staticmethod
    def _build_report(target: str, result: Any) -> Dict[str, Any]:
        """Convert a clean routine's return value into the public report dict."""
        if not result:
            return {
                "success": False,
                "target": target,
                "error": "No data found or operation failed",
                "storage_status": "failed",
                "remote_sent": False
            }

        # Some paths return a bare boolean instead of a store_data dict.
        if isinstance(result, bool):
            return {
                "success": result,
                "target": target,
                "error": None if result else "Operation failed",
                "storage_status": "unknown",
                "remote_sent": False
            }

        # Otherwise it is the dict returned by DataRouterService.store_data.
        success = result.get("success", False)
        if result.get("duplicate"):
            storage_status = "duplicate"
        elif success:
            storage_status = "saved"
        else:
            # A store that reported failure must not be labelled "saved".
            storage_status = "failed"

        return {
            "success": success,
            "target": target,
            "error": result.get("message") if not success else None,
            "storage_status": storage_status,
            "remote_sent": result.get("remote_sent", False),
            "data_summary": result.get("data_summary"),
            "original_data": result.get("original_data")
        }

    @staticmethod
    async def _store_record(router: Any, data: Any, data_type: Any, platform_type: Any) -> Union[bool, Dict[str, Any]]:
        """Store one record; return the store dict with ``original_data`` set, else ``False``."""
        result = await router.store_data(data, data_type, platform_type)
        if result and isinstance(result, dict):
            result['original_data'] = data
            return result
        return False

    @staticmethod
    async def _store_jobs(router: Any, jobs: Any, platform_type: Any, original: Any) -> Any:
        """Store every job in ``jobs`` and return the result of the LAST store.

        Only the final ``store_data`` outcome is reported, so it stands in for
        the whole batch. ``original`` (the full upstream response) is attached
        as ``original_data`` when that outcome is a non-empty dict. Returns
        ``None`` when ``jobs`` is empty.
        """
        last_result = None
        for job in jobs:
            last_result = await router.store_data(job, DataType.JOB, platform_type)
        if last_result and isinstance(last_result, dict):
            last_result["original_data"] = original
        return last_result

    async def clean_target_auto(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Pick a handler from the domain found in ``target``.

        Targets that contain none of the known domains are treated as a
        company name and searched for on Boss.
        """
        if "zhipin.com" in target:
            return await self._process_boss_url(target)
        if "51job.com" in target:
            return await self._process_qcwy_url(target)
        if "zhaopin.com" in target:
            return await self._process_zhilian_url(target)
        return await self._process_search_company(target)

    async def clean_by_job_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Fetch one job by id (or by a URL containing the id) and store it.

        Returns the store result dict with ``original_data`` attached, or
        ``False`` when the platform is unknown, nothing was fetched, or the
        store returned nothing usable.
        """
        router = await self.get_data_router()

        if platform == "boss":
            url_pattern = self._BOSS_JOB_URL_RE
            fetch = self.boss_service.get_job_detail_by_id
            platform_type = PlatformType.BOSS
        elif platform == "qcwy":
            url_pattern = self._QCWY_JOB_URL_RE
            fetch = self.qcwy_service.get_job_detail
            platform_type = PlatformType.QCWY
        elif platform == "zhilian":
            url_pattern = self._ZHILIAN_JOB_URL_RE
            fetch = self.zhilian_service.get_job_detail
            platform_type = PlatformType.ZHILIAN
        else:
            return False

        # Accept a full URL as well as a bare id.
        match = url_pattern.search(target)
        if match:
            target = match.group(1)

        data = fetch(target)
        if not data:
            return False
        return await self._store_record(router, data, DataType.JOB, platform_type)

    async def clean_by_company_name(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Search a platform for jobs by company name and store every hit.

        Returns the outcome of the last store (see ``_store_jobs``) or
        ``False`` when the platform is unknown or the search gave no jobs.
        """
        router = await self.get_data_router()

        if platform == "boss":
            res = self.boss_service.search_jobs(target)
            if res and res.get('zpData') and res['zpData'].get('list'):
                last_result = await self._store_jobs(router, res['zpData']['list'], PlatformType.BOSS, res)
                return last_result if last_result else False
        elif platform == "qcwy":
            res = self.qcwy_service.search_jobs(target)
            if res:
                last_result = await self._store_jobs(router, res, PlatformType.QCWY, res)
                return last_result if last_result else False
        elif platform == "zhilian":
            res = self.zhilian_service.search_company_jobs_by_name(target)
            if res and isinstance(res, dict):
                payload = res.get("data")
                # Guard against a non-dict "data" payload instead of raising.
                items = payload.get("list") if isinstance(payload, dict) else None
                if not isinstance(items, list):
                    items = []
                last_result = await self._store_jobs(router, items, PlatformType.ZHILIAN, res)
                return last_result if last_result else False

        return False

    async def clean_by_company_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Fetch one company by id and store it.

        For qcwy a ``co<digits>`` id is reduced to its digits first. Returns
        the store result dict with ``original_data`` attached, or ``False``.
        """
        router = await self.get_data_router()

        if platform == "boss":
            data = self.boss_service.get_company_detail_by_id(target)
            platform_type = PlatformType.BOSS
        elif platform == "qcwy":
            match = self._QCWY_COMPANY_ID_RE.match(target)
            company_id = match.group(1) if match else target
            data = self.qcwy_service.get_company_info(company_id)
            platform_type = PlatformType.QCWY
        elif platform == "zhilian":
            data = self.zhilian_service.get_company_detail(target)
            platform_type = PlatformType.ZHILIAN
        else:
            return False

        if not data:
            return False
        return await self._store_record(router, data, DataType.COMPANY, platform_type)

    async def clean_boss_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Fetch and store all jobs of a Boss company given its id or ``gongsi`` URL.

        Jobs are read from ``zpData.jobList`` or, failing that, ``zpData.list``.
        Returns the last store result dict (see ``_store_jobs``) or ``False``.
        """
        router = await self.get_data_router()

        match = self._BOSS_COMPANY_URL_RE.search(target)
        company_id = match.group(1) if match else target

        data = self.boss_service.get_company_jobs_by_id(company_id)
        if not data:
            return False

        jobs = []
        zp_data = data.get("zpData") if isinstance(data, dict) else None
        if isinstance(zp_data, dict):
            # The first key that holds a list wins, even when that list is empty.
            for key in ("jobList", "list"):
                candidate = zp_data.get(key)
                if isinstance(candidate, list):
                    jobs = candidate
                    break
        if not jobs:
            return False

        last_result = await self._store_jobs(router, jobs, PlatformType.BOSS, data)
        if last_result and isinstance(last_result, dict):
            return last_result
        return False

    async def clean_qcwy_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Fetch and store all jobs of a 51job company given its id (``co`` prefix allowed).

        Returns the last store result dict (see ``_store_jobs``) or ``False``.
        """
        router = await self.get_data_router()

        match = self._QCWY_COMPANY_ID_RE.match(target)
        company_id = match.group(1) if match else target

        data = self.qcwy_service.get_company_jobs_by_id(company_id)
        if not data:
            return False

        extracted = qcwy_extract_items(data)
        jobs: List[Dict[str, Any]] = extracted if isinstance(extracted, list) else []
        if not jobs:
            return False

        last_result = await self._store_jobs(router, jobs, PlatformType.QCWY, data)
        if last_result and isinstance(last_result, dict):
            return last_result
        return False

    async def clean_zhilian_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Fetch and store all jobs of a Zhilian company given its id.

        Jobs are read from ``data.list`` of the service response. Returns the
        last store result dict (see ``_store_jobs``) or ``False``.
        """
        router = await self.get_data_router()

        data = self.zhilian_service.get_company_jobs_by_id(target)
        if not data or not isinstance(data, dict):
            return False

        payload = data.get("data")
        # Guard against a non-dict "data" payload instead of raising.
        jobs = payload.get("list") if isinstance(payload, dict) else None
        if not isinstance(jobs, list) or not jobs:
            return False

        last_result = await self._store_jobs(router, jobs, PlatformType.ZHILIAN, data)
        if last_result and isinstance(last_result, dict):
            return last_result
        return False

    async def _process_boss_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a Boss URL: job detail page, company page, else treat as job id."""
        job_match = self._BOSS_JOB_URL_RE.search(url)
        if job_match:
            return await self.clean_by_job_id(job_match.group(1), "boss")

        company_match = self._BOSS_COMPANY_URL_RE.search(url)
        if company_match:
            return await self.clean_by_company_id(company_match.group(1), "boss")

        # Fallback: assume the whole string is a job id.
        return await self.clean_by_job_id(url, "boss")

    async def _process_qcwy_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a 51job URL: numeric job page, else treat as job id."""
        job_match = self._QCWY_JOB_URL_RE.search(url)
        job_id = job_match.group(1) if job_match else url
        return await self.clean_by_job_id(job_id, "qcwy")

    async def _process_zhilian_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a Zhilian URL: ``jobs.zhaopin.com`` job page, else treat as job id."""
        job_match = self._ZHILIAN_JOB_URL_RE.search(url)
        job_id = job_match.group(1) if job_match else url
        return await self.clean_by_job_id(job_id, "zhilian")

    async def _process_search_company(self, name: str) -> Union[bool, Dict[str, Any]]:
        """Treat ``name`` as a company name and search for it on Boss."""
        return await self.clean_by_company_name(name, "boss")
|