JobData/app/services/job.py

import hashlib
import time
from typing import Dict, Any, Optional, List
from enum import Enum
import json
from datetime import datetime

import requests
from clickhouse_connect.driver import AsyncClient
from app.log import logger

from tenacity import retry, stop_after_attempt, wait_exponential


class DataType(str, Enum):
    """数据类型枚举"""
    JOB = "job"
    COMPANY = "company"


class PlatformType(str, Enum):
    """平台类型枚举"""
    BOSS = "boss"
    QCWY = "qcwy"
    ZHILIAN = "zhilian"


class DataRouterService:
    """通用数据路由服务 - 根据数据类型和平台自动选择对应的表进行存储"""

    def __init__(self, clickhouse_client: AsyncClient):
        self.clickhouse_client = clickhouse_client
        # 移除平台特定仓库引用，改用通用数据接口

    # 安全获取列表数据的辅助函数
    async def safe_join(self, data, default=""):
        """安全地将列表数据转换为逗号分隔的字符串"""
        if data is None:
            return default
        if isinstance(data, list):
            return ",".join(str(item) for item in data if item)
        return str(data) if data else default

    # 安全获取字符串数据的辅助函数
    async def safe_get(self, obj, key, default=""):
        """安全地获取字典中的值"""
        value = obj.get(key) if obj else None
        return str(value) if value is not None else default

    async def store_data(self,
                         data: Dict[str, Any],
                         data_type: DataType,
                         platform: PlatformType,
                         check_duplicate: bool = True) -> Dict[str, Any]:
        """通用数据存储方法 - 使用JSON存储方案

        Args:
            data: 要存储的数据
            data_type: 数据类型 (job/company)
            platform: 平台类型 (boss/qcwy/zhilian)
            check_duplicate: 是否检查重复数据

        Returns:
            存储结果信息
        """
        try:
            return await self._store_data_as_json(data, data_type, platform, check_duplicate)

        except Exception as e:
            logger.error(f"{platform} {data_type} 数据存储失败: {str(e)}")
            return {
                "success": False,
                "message": f"数据存储失败: {str(e)}",
                "duplicate": False,
                "error": str(e)
            }

    def _get_json_table_name(self, data_type: DataType, platform: PlatformType) -> str:
        """根据数据类型和平台获取对应的JSON表名"""
        return f"{platform.value}_{data_type.value}"

    async def _store_data_as_json(self, data: Dict[str, Any], data_type: DataType, platform: PlatformType,
                                  check_duplicate: bool = True) -> Dict[str, Any]:
        """使用JSON存储方案存储数据"""
        try:
            # 获取对应的JSON表名
            json_table_name = self._get_json_table_name(data_type, platform)
            remote_data = await self._prepare_remote_push_data(data, data_type, platform)
            if remote_data:
                await self.send_to_remote_server(remote_data)

            # QCWY平台重复检查
            if platform == PlatformType.QCWY and data_type == DataType.JOB:
                job_id = data.get('jobId')
                update_date_time = data.get('updateDateTime')
                if job_id and update_date_time:
                    duplicate_record = await self._check_qcwy_duplicate(json_table_name, job_id, update_date_time)
                    if duplicate_record:
                        logger.info(f"QCWY职位数据重复，跳过插入: jobId={job_id}, updateDateTime={update_date_time}")
                        return {
                            "success": True,
                            "message": "数据重复，跳过插入",
                            "duplicate": True,
                            "table_name": json_table_name,
                            "storage_type": "json"
                        }
            # BOSS平台重复检查: JonId
            if platform == PlatformType.BOSS and data_type == DataType.JOB:
                job_base_info = data.get('jobBaseInfoVO', {})
                job_id = job_base_info.get('jobId')

                if job_id:
                    duplicate_record = await self._check_boss_duplicate(json_table_name, job_id)
                    if duplicate_record:
                        logger.info(f"BOSS职位数据重复，跳过插入: jobId={job_id}")
                        return {
                            "success": True,
                            "message": "数据重复，跳过插入",
                            "duplicate": True,
                            "table_name": json_table_name,
                            "storage_type": "json"
                        }

            # 智联平台重复检查: number + firstPublishTime
            if platform == PlatformType.ZHILIAN and data_type == DataType.JOB:
                number = data.get('number')
                first_publish_time = data.get('firstPublishTime')
                if number and first_publish_time:
                    duplicate_record = await self._check_zhilian_duplicate(json_table_name, number, first_publish_time)
                    if duplicate_record:
                        logger.info(
                            f"智联职位数据重复，跳过插入: number={number}, firstPublishTime={first_publish_time}")
                        return {
                            "success": True,
                            "message": "数据重复，跳过插入",
                            "duplicate": True,
                            "table_name": json_table_name,
                            "storage_type": "json"
                        }

            # BOSS平台公司重复检查: 按公司名称
            if platform == PlatformType.BOSS and data_type == DataType.COMPANY:
                company_name = data.get('name') or data.get('companyFullInfoVO', {}).get('name')
                if company_name:
                    duplicate_record = await self._check_boss_company_duplicate(json_table_name, company_name)
                    if duplicate_record:
                        logger.info(f"BOSS公司数据重复，跳过插入: companyName={company_name}")
                        return {
                            "success": True,
                            "message": "数据重复，跳过插入",
                            "duplicate": True,
                            "table_name": json_table_name,
                            "storage_type": "json"
                        }

            # QCWY平台公司重复检查: 按公司名称
            if platform == PlatformType.QCWY and data_type == DataType.COMPANY:
                company_name = data.get('companyName') or data.get('company_name')
                if company_name:
                    duplicate_record = await self._check_qcwy_company_duplicate(json_table_name, company_name)
                    if duplicate_record:
                        logger.info(f"QCWY公司数据重复，跳过插入: companyName={company_name}")
                        return {
                            "success": True,
                            "message": "数据重复，跳过插入",
                            "duplicate": True,
                            "table_name": json_table_name,
                            "storage_type": "json"
                        }

            # 智联平台公司重复检查: companyName
            if platform == PlatformType.ZHILIAN and data_type == DataType.COMPANY:
                company_name = data.get('companyName') or data.get('name')
                if company_name:
                    duplicate_record = await self._check_zhilian_company_duplicate(json_table_name, company_name)
                    if duplicate_record:
                        logger.info(f"智联公司数据重复，跳过插入: companyName={company_name}")
                        return {
                            "success": True,
                            "message": "数据重复，跳过插入",
                            "duplicate": True,
                            "table_name": json_table_name,
                            "storage_type": "json"
                        }

            # 准备JSON存储数据
            current_time = datetime.now()
            json_data = {
                'id': 0,  # <20><>动生成
                'json_data': json.dumps(data, ensure_ascii=False),
                'created_at': current_time,
                'updated_at': current_time
            }

            # 根据平台和数据类型添加去重字段
            if platform == PlatformType.BOSS and data_type == DataType.JOB:
                # BOSS平台职位数据：添加job_id字段
                job_base_info = data.get('jobBaseInfoVO', {})
                if job_base_info and 'jobId' in job_base_info:
                    json_data['job_id'] = str(job_base_info['jobId'])

            elif platform == PlatformType.QCWY and data_type == DataType.JOB:
                # QCWY平台职位数据：添加job_id和update_date_time字段
                if 'jobId' in data:
                    json_data['job_id'] = str(data['jobId'])
                if 'updateDateTime' in data:
                    json_data['update_date_time'] = str(data['updateDateTime'])

            elif platform == PlatformType.ZHILIAN and data_type == DataType.JOB:
                # 智联平台职位数据：添加number和first_publish_time字段
                if 'number' in data:
                    json_data['number'] = str(data['number'])
                if 'firstPublishTime' in data:
                    json_data['first_publish_time'] = str(data['firstPublishTime'])

            # 根据平台和数据类型添加公司去重字段
            elif platform == PlatformType.BOSS and data_type == DataType.COMPANY:
                # BOSS平台公司数据：添加company_name字段
                company_name = data.get('name') or data.get('companyFullInfoVO', {}).get('name')
                if company_name:
                    json_data['company_name'] = str(company_name)

            elif platform == PlatformType.QCWY and data_type == DataType.COMPANY:
                # QCWY平台公司数据：添加company_name字段
                company_name = data.get('companyName') or data.get('company_name')
                if company_name:
                    json_data['company_name'] = str(company_name)

            elif platform == PlatformType.ZHILIAN and data_type == DataType.COMPANY:
                # 智联平台公司数据：添加company_name字段
                company_name = data.get('companyName') or data.get('name')
                if company_name:
                    json_data['company_name'] = str(company_name)

            # 插入到对应的JSON表
            await self._insert_data_to_clickhouse(json_table_name, json_data)

            logger.info(f"{platform} {data_type} 数据以JSON格式存储成功到表 {json_table_name}")
            return {
                "success": True,
                "message": "JSON数据存储成功",
                "duplicate": False,
                "table_name": json_table_name,
                "storage_type": "json"
            }

        except Exception as e:
            logger.error(f"JSON数据存储失败: {str(e)}")
            raise e

    async def query_json_data(self,
                              platform: PlatformType,
                              data_type: DataType,
                              json_fields: Optional[Dict[str, str]] = None,
                              limit: int = 100,
                              offset: int = 0) -> Dict[str, Any]:
        """查询JSON存储的数据

        Args:
            platform: 平台类型 (必需)
            data_type: 数据类型 (必需)
            json_fields: 要提取的JSON字段映射 {alias: json_path}
            limit: 返回记录数限制

        Returns:
            查询结果
        """
        try:
            # 获取对应的JSON表名
            json_table_name = self._get_json_table_name(data_type, platform)

            # 获取总数
            count_query = f"SELECT count() FROM job_data.{json_table_name}"
            count_result = await self.clickhouse_client.query(count_query)
            total_count = count_result.result_rows[0][0] if count_result.result_rows else 0

            # 构建查询
            if json_fields:
                select_fields = ['created_at']
                for alias, json_path in json_fields.items():
                    select_fields.append(f"JSONExtractString(json_data, '{json_path}') as {alias}")
                query = f"SELECT {', '.join(select_fields)} FROM job_data.{json_table_name}"
            else:
                # 如果没有指定字段，查询所有字段
                query = f"SELECT * FROM job_data.{json_table_name}"

            query += f" ORDER BY created_at DESC LIMIT {limit} OFFSET {offset}"

            # 执行查询
            result = await self.clickhouse_client.query(query)

            # 将结果转换为字典列表
            data = []
            for row in result.result_rows:
                item = dict(zip(result.column_names, row))
                # 尝试解析json_data
                if 'json_data' in item and isinstance(item['json_data'], str):
                    try:
                        json_content = json.loads(item['json_data'])
                        if isinstance(json_content, dict):
                            item.update(json_content)
                    except:
                        pass
                data.append(item)

            logger.info(f"JSON数据查询成功，从表 {json_table_name} 返回 {len(result.result_rows)} 条记录")
            return {
                "success": True,
                "data": data,
                "columns": result.column_names,
                "count": total_count,
                "table_name": json_table_name
            }

        except Exception as e:
            logger.error(f"JSON数据查询失败: {str(e)}")
            return {
                "success": False,
                "message": f"查询失败: {str(e)}",
                "error": str(e)
            }

    async def _insert_data_to_clickhouse(self, table_name: str, data: Dict[str, Any]) -> None:
        """向ClickHouse表插入数据

        Args:
            table_name: 表名
            data: 要插入的数据字典
        """
        try:
            columns = list(data.keys())
            values = [[data.get(col) for col in columns]]
            await self.clickhouse_client.insert(f"job_data.{table_name}", values, column_names=columns)

        except Exception as e:
            logger.error(f"向表 {table_name} 插入数据失败: {str(e)}")
            raise e

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def _check_qcwy_duplicate(self, table_name: str, job_id: str, update_date_time: str) -> Optional[
        Dict[str, Any]]:
        """检查QCWY平台重复数据 - 基于job_id和update_date_time字段"""
        try:
            query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE job_id = {job_id:String}
              AND update_date_time = {udt:String}
            LIMIT 1
            """
            result = await self.clickhouse_client.query(query, parameters={"job_id": str(job_id), "udt": str(update_date_time)})

            if result.result_rows:
                logger.info(f"发现QCWY重复数据: jobId={job_id}, updateDateTime={update_date_time}")
                return {
                    "id": result.result_rows[0][0],
                    "created_at": result.result_rows[0][1]
                }
            return None

        except Exception as e:
            logger.error(f"检查QCWY重复数据失败: {str(e)}")
            return None

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def _check_zhilian_duplicate(self, table_name: str, number: str, first_publish_time: str) -> Optional[
        Dict[str, Any]]:
        """检查智联平台重复数据 - 基于number和first_publish_time字段"""
        try:
            query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE number = {number:String}
              AND first_publish_time = {fpt:String}
            LIMIT 1
            """
            result = await self.clickhouse_client.query(query, parameters={"number": str(number), "fpt": str(first_publish_time)})

            if result.result_rows:
                logger.info(f"发现智联重复数据: number={number}, firstPublishTime={first_publish_time}")
                return {
                    "id": result.result_rows[0][0],
                    "created_at": result.result_rows[0][1],
                    "number": number,
                    "first_publish_time": first_publish_time
                }
            return None

        except Exception as e:
            logger.error(f"检查智联重复数据失败: {str(e)}")
            return None

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def _check_boss_duplicate(self, table_name: str, job_id: any) -> Optional[Dict[str, Any]]:
        """检查BOSS平台重复数据 - 基于job_id字段"""
        try:
            query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE job_id = {job_id:String}
            LIMIT 1
            """
            result = await self.clickhouse_client.query(query, parameters={"job_id": str(job_id)})
            if result.result_rows:
                logger.info(f"发现BOSS重复数据: jobId={job_id}")
                return {
                    "id": result.result_rows[0][0],
                    "created_at": result.result_rows[0][1]
                }

            return None

        except Exception as e:
            logger.error(f"检查BOSS重复数据失败: {str(e)}")
            return None

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def _check_boss_company_duplicate(self, table_name: str, company_name: str) -> Optional[Dict[str, Any]]:
        """检查BOSS平台公司重复数据 - 基于company_name字段"""
        try:
            query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {company_name:String}
            LIMIT 1
            """
            result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
            if result.result_rows:
                logger.info(f"发现BOSS公司重复数据: companyName={company_name}")
                return {
                    "id": result.result_rows[0][0],
                    "created_at": result.result_rows[0][1]
                }

            return None

        except Exception as e:
            logger.error(f"检查BOSS公司重复数据失败: {str(e)}")
            return None

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def _check_qcwy_company_duplicate(self, table_name: str, company_name: str) -> Optional[Dict[str, Any]]:
        """检查QCWY平台公司重复数据 - 基于company_name字段"""
        try:
            query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {company_name:String}
            LIMIT 1
            """
            result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
            if result.result_rows:
                logger.info(f"发现QCWY公司重复数据: companyName={company_name}")
                return {
                    "id": result.result_rows[0][0],
                    "created_at": result.result_rows[0][1]
                }

            return None

        except Exception as e:
            logger.error(f"检查QCWY公司重复数据失败: {str(e)}")
            return None

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def _check_qcwy_company_duplicate_by_name(self, table_name: str, company_name: str) -> Optional[
        Dict[str, Any]]:
        """检查QCWY平台公司重复数据 - 基于company_name字段"""
        try:
            query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {company_name:String}
            LIMIT 1
            """
            result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
            if result.result_rows:
                logger.info(f"发现QCWY公司重复数据: companyName={company_name}")
                return {
                    "id": result.result_rows[0][0],
                    "created_at": result.result_rows[0][1]
                }

            return None

        except Exception as e:
            logger.error(f"检查QCWY公司重复数据失败: {str(e)}")
            return None

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def _check_zhilian_company_duplicate(self, table_name: str, company_name: str) -> Optional[Dict[str, Any]]:
        """检查智联平台公司重复数据 - 基于company_name字段"""
        try:
            query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {company_name:String}
            LIMIT 1
            """
            result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
            if result.result_rows:
                logger.info(f"发现智联公司重复数据: companyName={company_name}")
                return {
                    "id": result.result_rows[0][0],
                    "created_at": result.result_rows[0][1]
                }

            return None

        except Exception as e:
            logger.error(f"检查智联公司重复数据失败: {str(e)}")
            return None

    async def send_to_remote_server(self, data: Dict[str, Any]) -> bool:
        """
        发送数据到远程服务器（简化版）
        直接接收body数据并发送

        Args:
            data: 要发送的数据字典

        Returns:
            bool: 发送成功返回True，失败返回False
        """
        # 打印关键词日志
        source_type = data.get('source_type', '未知平台')
        title = data.get('title', '未知职位')
        company_name = data.get('company_name', data.get('name', '未知公司'))
        logger.info(f"📤 上报数据: [{source_type}] {title} - {company_name}")

        try:
            # 构建认证参数
            from_id = 9910056
            timestamp = int(time.time())
            salt = 'jWcIqJK6QlR2syb6HQgpel9iOoOkj01G5MDFNtQLaTxhddHUTEnURsMe2RxCTYC8'

            # 生成token
            token_string = salt + str(timestamp)
            token = hashlib.md5(token_string.encode()).hexdigest()

            url = f'http://external-data.qixin.com/extend/extend_data_push?from={from_id}&token={token}&time={timestamp}'
            headers = {
                'Content-Type': 'application/json',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
            }
            # 直接发送原始数据
            response = requests.post(url, json=data, headers=headers, timeout=30)
            # print(response.text)
            if response.status_code == 200:
                return True
            else:
                logger.error(f"❌ 数据发送失败: {response.status_code} - {response.text[:100]}")
                return False

        except Exception as e:
            logger.error(f"❌ 发送异常: {str(e)}")
            return False

    async def batch_store_data(self,
                               data_list: List[Dict[str, Any]],
                               data_type: DataType,
                               platform: PlatformType,
                               check_duplicate: bool = True) -> Dict[str, Any]:
        """批量存储数据 - 优化版本，使用批量插入

        Args:
            data_list: 要存储的数据列表
            data_type: 数据类型 (job/company)
            platform: 平台类型 (boss/qcwy/zhilian)
            check_duplicate: 是否检查重复数据

        Returns:
            批量存储结果信息
        """
        results = {
            "total": len(data_list),
            "success": 0,
            "failed": 0,
            "duplicate": 0,
            "errors": []
        }

        if not data_list:
            return results

        try:
            # 获取表名
            json_table_name = self._get_json_table_name(data_type, platform)

            # 批量处理数据 - 直接准备插入数据，在插入时处理重复
            valid_data_list = []
            remote_push_data_list = []

            # 第一步：准备所有数据
            for i, data in enumerate(data_list):
                try:
                    # 准备插入数据
                    current_time = datetime.now()
                    json_data = {
                        'id': 0,  # 自动生成
                        'json_data': json.dumps(data, ensure_ascii=False),
                        'created_at': current_time,
                        'updated_at': current_time
                    }

                    # 添加去重字段
                    self._add_dedup_fields(json_data, data, data_type, platform)

                    valid_data_list.append(json_data)

                    # 准备远程推送数据
                    remote_data = await self._prepare_remote_push_data(data, data_type, platform)
                    if remote_data:
                        remote_push_data_list.append(remote_data)

                except Exception as e:
                    results["failed"] += 1
                    results["errors"].append({
                        "index": i,
                        "error": f"数据预处理失败: {str(e)}"
                    })

            # 第二步：批量插入到数据库（在插入时忽略重复数据）
            if valid_data_list:
                try:
                    insert_result = await self._batch_insert_to_clickhouse(json_table_name, valid_data_list,
                                                                           ignore_duplicates=check_duplicate)
                    results["success"] = insert_result["inserted"]
                    results["duplicate"] = insert_result["ignored"]
                    # logger.info(
                    #     f"批量插入完成: {insert_result['inserted']} 条成功, {insert_result['ignored']} 条重复忽略")
                except Exception as e:
                    # 如果批量插入完全失败，记录错误
                    logger.error(f"批量插入失败: {str(e)}")
                    results["failed"] = len(valid_data_list)
                    results["errors"].append({
                        "error": f"批量插入失败: {str(e)}"
                    })

            # 第三步：批量推送到远程服务器
            if remote_push_data_list:
                try:
                    await self._batch_send_to_remote_server(remote_push_data_list)
                    logger.info(f"批量推送到远程服务器成功: {len(remote_push_data_list)} 条数据")
                except Exception as e:
                    logger.warning(f"批量推送到远程服务器失败: {str(e)}")
                    # 远程推送失败不影响主要存储结果

        except Exception as e:
            logger.error(f"批量存储数据失败: {str(e)}")
            # 如果批量处理完全失败，回退到原来的逐个处理方式
            return await self._fallback_individual_store(data_list, data_type, platform, check_duplicate)

        return results

    def _add_dedup_fields(self, json_data: Dict[str, Any], data: Dict[str, Any], data_type: DataType,
                          platform: PlatformType):
        """为JSON数据添加去重字段"""
        if platform == PlatformType.BOSS and data_type == DataType.JOB:
            job_base_info = data.get('jobBaseInfoVO', {})
            if job_base_info and 'jobId' in job_base_info:
                json_data['job_id'] = str(job_base_info['jobId'])

        elif platform == PlatformType.QCWY and data_type == DataType.JOB:
            if 'jobId' in data:
                json_data['job_id'] = str(data['jobId'])
            if 'updateDateTime' in data:
                json_data['update_date_time'] = str(data['updateDateTime'])

        elif platform == PlatformType.ZHILIAN and data_type == DataType.JOB:
            if 'number' in data:
                json_data['number'] = str(data['number'])
            if 'firstPublishTime' in data:
                json_data['first_publish_time'] = str(data['firstPublishTime'])

        elif platform == PlatformType.BOSS and data_type == DataType.COMPANY:
            company_name = data.get('name') or data.get('companyFullInfoVO', {}).get('name')
            if company_name:
                json_data['company_name'] = str(company_name)

        elif platform == PlatformType.QCWY and data_type == DataType.COMPANY:
            company_name = data.get('companyName') or data.get('company_name')
            if company_name:
                json_data['company_name'] = str(company_name)

        elif platform == PlatformType.ZHILIAN and data_type == DataType.COMPANY:
            company_name = data.get('companyName') or data.get('name')
            if company_name:
                json_data['company_name'] = str(company_name)

    async def _prepare_remote_push_data(self, data: Dict[str, Any], data_type: DataType, platform: PlatformType) -> \
    Optional[Dict[str, Any]]:
        """准备远程推送数据"""
        if data_type != DataType.JOB:
            return None

        try:
            if platform == PlatformType.QCWY:
                welfare_list = data.get("jobWelfareCodeDataList")
                if isinstance(welfare_list, list):
                    welfare_str = ",".join(
                        str(item.get("chineseTitle") or item.get("typeTitle") or item.get("englishTitle") or item.get("code"))
                        for item in welfare_list if isinstance(item, dict)
                    )
                elif isinstance(welfare_list, str):
                    welfare_str = welfare_list.replace("[", "").replace("]", "")
                else:
                    welfare_str = ""
                raw_location = data.get("location") or ""
                if not raw_location:
                    work_loc = data.get("workLocation") or {}
                    raw_location = work_loc.get("workAddress") or work_loc.get("address") or ""
                if raw_location:
                    location_val = raw_location
                else:
                    location_val = "位置信息未找到"
                raw_area = data.get("jobAreaString") or ""
                if not raw_area:
                    level_detail = data.get("jobAreaLevelDetail") or {}
                    city_str = level_detail.get("cityString") or ""
                    landmark_str = level_detail.get("landMarkString") or ""
                    raw_area = f"{city_str}{landmark_str}".strip()
                if raw_area:
                    area_val = raw_area
                else:
                    area_val = "位置信息未找到"
                remote_resp =   {
                    'source_type': '前程无忧',
                    'name': data.get("companyName"),
                    'title': data.get("jobName"),
                    'title_addr': data.get("jobName"),
                    'description': data.get("jobDescribe"),
                    'age': "",
                    'sex': "",
                    'number': "",
                    'education': data.get("degreeString"),
                    'skill': await self.safe_join(data.get("jobTagsForOrder")),
                    'welfare': welfare_str,
                    'years': data.get("workYearString"),
                    'salary': f'{data.get("jobSalaryMax", "")}-{data.get("jobSalaryMin", "")}',
                    'location': location_val,
                    'position': area_val,
                    'date': data.get("confirmDateString"),
                    'start_date': data.get("confirmDateString"),
                    'end_date': "",
                    'job_type': data.get("termStr"),
                    'size': data.get("companySizeString"),
                    'employer_type': data.get("companyTypeString"),
                    'industry': f'{data.get("major1Str", "")}-{data.get("major2Str", "")}',
                    'job_1st_class': "",
                    'job_2nd_class': "",
                    'job_3rd_class': "",
                    'job_4th_class': "",
                    'url': data.get("jobHref"),
                    'company_id': data.get("coId"),
                    'company_name': data.get("fullCompanyName"),
                    'company_url': data.get("companyHref"),
                    'company_desc': data.get("company_desc", ""),
                    'base_data':data
                }
                return remote_resp

            elif platform == PlatformType.BOSS:
                bossBaseInfoVO = data.get("bossBaseInfoVO", {})
                jobBaseInfoVO = data.get("jobBaseInfoVO", {})
                brandComInfoVO = data.get("brandComInfoVO", {})
                boss_resp = {
                    'source_type': 'Boss直聘',
                    'name': await self.safe_get(brandComInfoVO, "brandName"),
                    'common_name': await self.safe_get(bossBaseInfoVO, "brandName"),
                    'title': await self.safe_get(jobBaseInfoVO, "positionName"),
                    'title_addr': await self.safe_get(jobBaseInfoVO, "positionName"),
                    'description': await self.safe_get(jobBaseInfoVO, "jobDesc"),
                    'education': await self.safe_get(jobBaseInfoVO, "degreeName"),
                    'skill': await self.safe_join(jobBaseInfoVO.get("requiredSkills") if jobBaseInfoVO else None),
                    'welfare': await self.safe_join(jobBaseInfoVO.get("salaryWelfareInfo") if jobBaseInfoVO else None),
                    'years': await self.safe_get(jobBaseInfoVO, "experienceName"),
                    'salary': f'{await self.safe_get(jobBaseInfoVO, "lowSalary")}-{await self.safe_get(jobBaseInfoVO, "highSalary")}',
                    'location': await self.safe_get(jobBaseInfoVO, "locationName", "位置信息未找到"),
                    'position': await self.safe_get(jobBaseInfoVO, "locationDesc", "位置信息未找到"),
                    'job_type': "全职",
                    'size': await self.safe_get(brandComInfoVO, "scaleName"),
                    'employer_type': "全职",
                    'industry': await self.safe_get(brandComInfoVO, "industryName"),
                    'job_1st_class': "",
                    'job_2nd_class': "",
                    'job_3rd_class': "",
                    'job_4th_class': "",
                    'date': "",
                    'start_date': "",
                    'end_date': "",
                    'age': "",
                    'sex': "",
                    'number': "",
                    'url': f"https://www.zhipin.com/job_detail/{await self.safe_get(jobBaseInfoVO, 'encryptJobId')}.html",
                    'company_id': await self.safe_get(brandComInfoVO, "encryptBrandId"),
                    'company_name': await self.safe_get(brandComInfoVO, "brandName"),
                    'company_url': f"https://www.zhipin.com/gongsi/{await self.safe_get(brandComInfoVO, 'encryptBrandId')}.html",
                    'company_desc': await self.safe_get(brandComInfoVO, "introduce"),
                    'base_data': data
                }
                return boss_resp

            elif platform == PlatformType.ZHILIAN:
                # 智联平台：从原始 data 中提取所需字段
                zhilian_resp = {
                    'source_type': '智联招聘',
                    'name': await self.safe_get(data, 'companyName'),
                    'common_name': await self.safe_get(data, 'companyName'),
                    'title': await self.safe_get(data, 'name'),
                    'title_addr': await self.safe_get(data, 'name'),
                    'description': await self.safe_get(data, 'jobSummary'),
                    'education': await self.safe_get(data, 'education'),
                    'skill': await self.safe_join([tag['value'] for tag in data.get('skillLabel', [])]),
                    'welfare': '',  # 智联该条数据无福利字段
                    'years': await self.safe_get(data, 'workingExp'),
                    'salary': await self.safe_get(data, 'salary60'),
                    'location': f"{await self.safe_get(data, 'workCity')}{await self.safe_get(data, 'cityDistrict')}",
                    'position': f"{await self.safe_get(data, 'workCity')}{await self.safe_get(data, 'cityDistrict')}",
                    'job_type': await self.safe_get(data, 'workType'),
                    'size': await self.safe_get(data, 'companySize'),
                    'employer_type': await self.safe_get(data, 'propertyName'),
                    'industry': await self.safe_get(data, 'industryName'),
                    'job_1st_class': '',
                    'job_2nd_class': '',
                    'job_3rd_class': '',
                    'job_4th_class': '',
                    'date': await self.safe_get(data, 'firstPublishTime'),
                    'start_date': '',
                    'end_date': '',
                    'age': '',
                    'sex': '',
                    'number': str(await self.safe_get(data, 'recruitNumber')),
                    'url': await self.safe_get(data, 'positionURL'),
                    'company_id': str(await self.safe_get(data, 'companyId')),
                    'company_name': await self.safe_get(data, 'companyName'),
                    'company_url': await self.safe_get(data, 'companyUrl'),
                    'company_desc':  await self.safe_get(data, 'companyDesc'),
                    'base_data': data
                }
                return zhilian_resp
        except Exception as e:
            logger.error(f"准备远程推送数据失败: {str(e)}")
            return None

    async def _batch_insert_to_clickhouse(self, table_name: str, data_list: List[Dict[str, Any]],
                                          ignore_duplicates: bool = True) -> Dict[str, int]:
        """批量插入数据到ClickHouse，支持忽略重复数据

        Args:
            table_name: 表名
            data_list: 数据列表
            ignore_duplicates: 是否忽略重复数据

        Returns:
            插入结果统计 {"inserted": 插入数量, "ignored": 忽略数量}
        """
        result = {"inserted": 0, "ignored": 0}

        if not data_list:
            return result

        try:
            columns = list(data_list[0].keys())

            filtered_list = data_list
            if ignore_duplicates:
                dedup_cols = self._get_dedup_columns_for_table(table_name)
                if dedup_cols:
                    if len(dedup_cols) == 1:
                        key_col = dedup_cols[0]
                        candidate_keys = list({str(d.get(key_col, "")) for d in data_list if d.get(key_col)})
                        if candidate_keys:
                            query = f"""
                            SELECT {key_col}
                            FROM job_data.{table_name}
                            WHERE {key_col} IN {{keys:Array(String)}}
                            """
                            existing = await self.clickhouse_client.query(query, parameters={"keys": candidate_keys})
                            existing_set = {str(r[0]) for r in existing.result_rows}
                            filtered_list = [d for d in data_list if str(d.get(key_col, "")) not in existing_set]
                    elif len(dedup_cols) == 2:
                        c1, c2 = dedup_cols
                        candidate_c1 = list({str(d.get(c1, "")) for d in data_list if d.get(c1)})
                        if candidate_c1:
                            query = f"""
                            SELECT {c1}, {c2}
                            FROM job_data.{table_name}
                            WHERE {c1} IN {{keys:Array(String)}}
                            """
                            existing = await self.clickhouse_client.query(query, parameters={"keys": candidate_c1})
                            existing_map = {}
                            for r in existing.result_rows:
                                k = str(r[0])
                                v = str(r[1])
                                existing_map.setdefault(k, set()).add(v)
                            filtered_list = [
                                d for d in data_list
                                if str(d.get(c1, "")) not in existing_map or str(d.get(c2, "")) not in existing_map.get(str(d.get(c1, "")), set())
                            ]

            batch_values = [[item.get(col) for col in columns] for item in filtered_list]
            if batch_values:
                await self.clickhouse_client.insert(f"job_data.{table_name}", batch_values, column_names=columns)
                result["inserted"] = len(batch_values)
            result["ignored"] = len(data_list) - result["inserted"]

        except Exception as e:
            logger.error(f"批量插入到表 {table_name} 失败: {str(e)}")
            raise e

        return result

    def _get_dedup_columns_for_table(self, table_name: str) -> List[str]:
        """获取表的去重列"""
        if table_name == "boss_job":
            return ["job_id"]
        if table_name == "qcwy_job":
            return ["job_id", "update_date_time"]
        if table_name == "zhilian_job":
            return ["number", "first_publish_time"]
        if table_name in ("boss_company", "qcwy_company", "zhilian_company"):
            return ["company_name"]
        return []

    async def _batch_send_to_remote_server(self, data_list: List[Dict[str, Any]]) -> None:
        """批量发送数据到远程服务器"""
        for data in data_list:
            try:
                await self.send_to_remote_server(data)
            except Exception as e:
                logger.error(f"批量推送单条数据失败: {str(e)}")
                # 继续处理下一条数据

    async def _fallback_individual_store(self, data_list: List[Dict[str, Any]], data_type: DataType,
                                         platform: PlatformType, check_duplicate: bool) -> Dict[str, Any]:
        """回退到逐个存储的方法"""
        results = {
            "total": len(data_list),
            "success": 0,
            "failed": 0,
            "duplicate": 0,
            "errors": []
        }

        for i, data in enumerate(data_list):
            try:
                result = await self.store_data(data, data_type, platform, check_duplicate)
                if result["success"]:
                    results["success"] += 1
                elif result.get("duplicate"):
                    results["duplicate"] += 1
                else:
                    results["failed"] += 1
                    results["errors"].append({
                        "index": i,
                        "error": result.get("message", "未知错误")
                    })
            except Exception as e:
                results["failed"] += 1
                results["errors"].append({
                    "index": i,
                    "error": str(e)
                })

        return results


# 创建全局实例的工厂函数
def create_data_router_service(clickhouse_client: AsyncClient) -> DataRouterService:
    return DataRouterService(clickhouse_client)