JobData/app/core/clickhouse_init.py
2026-03-22 23:22:30 +08:00

204 lines
7.4 KiB
Python

from clickhouse_connect.driver import AsyncClient
from app.log import logger
class ClickHouseInitializer:
    """Create the ClickHouse tables and analytics view in the ``job_data`` database.

    Every statement is issued through ``client.command``. The statements use
    ``IF NOT EXISTS`` / ``OR REPLACE``, so running the initializer again
    re-issues them without failing on objects that already exist.

    NOTE(review): nothing here creates the ``job_data`` database itself;
    presumably it is created elsewhere -- confirm.
    """

    # DDL for the six data tables, keyed by table name. Every table includes a
    # ``channel`` column. The SQL text is kept flush-left inside the literals
    # so the statements sent to the server stay exactly as they were.
    _TABLE_DDLS = {
        "boss_job": """
CREATE TABLE IF NOT EXISTS job_data.boss_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
job_id String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
        "boss_company": """
CREATE TABLE IF NOT EXISTS job_data.boss_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
company_name String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
        "qcwy_job": """
CREATE TABLE IF NOT EXISTS job_data.qcwy_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
job_id String DEFAULT '',
update_date_time String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
        "qcwy_company": """
CREATE TABLE IF NOT EXISTS job_data.qcwy_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
company_name String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
        "zhilian_job": """
CREATE TABLE IF NOT EXISTS job_data.zhilian_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
number String DEFAULT '',
first_publish_time String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
        "zhilian_company": """
CREATE TABLE IF NOT EXISTS job_data.zhilian_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
company_name String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
    }

    # Pending-company table: ReplacingMergeTree ordered by (source, company_id)
    # with ``version`` as the engine's version column.
    _PENDING_COMPANY_DDL = """
CREATE TABLE IF NOT EXISTS job_data.pending_company (
source String,
company_id String,
company_name String DEFAULT '',
status String DEFAULT 'pending',
error_msg String DEFAULT '',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now(),
version UInt64 DEFAULT 1
) ENGINE = ReplacingMergeTree(version)
ORDER BY (source, company_id)
SETTINGS index_granularity = 8192;
"""

    # Unified view over the three *_job tables. Display fields are pulled out
    # of ``json_data`` with JSONExtractString; salary_min / salary_max are
    # constant 0.0 placeholders in every branch.
    _JOB_ANALYTICS_VIEW = """
CREATE OR REPLACE VIEW job_data.job_analytics AS
SELECT
'boss' as source,
job_id,
channel,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'brandName') as company_name,
JSONExtractString(json_data, 'salaryDesc') as salary_text,
0.0 as salary_min,
0.0 as salary_max,
JSONExtractString(json_data, 'cityName') as city,
JSONExtractString(json_data, 'experienceName') as experience_required,
JSONExtractString(json_data, 'degreeName') as education,
created_at
FROM job_data.boss_job
UNION ALL
SELECT
'qcwy' as source,
job_id,
channel,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'companyName') as company_name,
JSONExtractString(json_data, 'provideSalaryString') as salary_text,
0.0, 0.0,
JSONExtractString(json_data, 'workCity') as city,
JSONExtractString(json_data, 'workYear') as experience_required,
JSONExtractString(json_data, 'degree') as education,
created_at
FROM job_data.qcwy_job
UNION ALL
SELECT
'zhilian' as source,
number as job_id,
channel,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'companyName') as company_name,
JSONExtractString(json_data, 'salary60') as salary_text,
0.0, 0.0,
JSONExtractString(json_data, 'workCity') as city,
JSONExtractString(json_data, 'workingExp') as experience_required,
JSONExtractString(json_data, 'education') as education,
created_at
FROM job_data.zhilian_job
"""

    # Tables that receive the ``channel`` column migration. Derived from the
    # DDL mapping (same six names, same order) so the two cannot drift apart
    # when a data table is added.
    _CHANNEL_MIGRATION_TABLES = list(_TABLE_DDLS)

    def __init__(self, client: "AsyncClient") -> None:
        """Store the client used to run every statement.

        Args:
            client: Async ClickHouse client; this class only awaits its
                ``command`` method.
        """
        self.client = client

    @staticmethod
    def _channel_migration_sql(table: str) -> str:
        """Return the ALTER statement that adds ``channel`` to ``job_data.<table>``."""
        return (
            f"ALTER TABLE job_data.{table} "
            "ADD COLUMN IF NOT EXISTS channel String DEFAULT 'mini'"
        )

    async def _create_table(self, name: str, ddl: str) -> None:
        """Run one CREATE TABLE statement, logging the outcome.

        Args:
            name: Table name, used only in log messages.
            ddl: Complete DDL statement to execute.

        Raises:
            Exception: Whatever ``client.command`` raised, re-raised after
                being logged.
        """
        try:
            await self.client.command(ddl)
        except Exception as e:
            logger.error(f"创建表 {name} 失败: {e}")
            raise
        # The DDL uses IF NOT EXISTS, so this is also logged when the table
        # was already present.
        logger.info(f"{name} 创建成功")

    async def _create_view(self) -> None:
        """Create or replace the ``job_analytics`` view, logging the outcome.

        Raises:
            Exception: Whatever ``client.command`` raised, re-raised after
                being logged.
        """
        try:
            await self.client.command(self._JOB_ANALYTICS_VIEW)
        except Exception as e:
            logger.error(f"创建招聘数据分析视图失败: {e}")
            raise
        logger.info("招聘数据分析视图 job_analytics 创建成功")

    async def initialize_channel_migration(self) -> None:
        """Add the ``channel`` column to each data table if it is missing.

        Best effort: a failure on one table is logged as a warning and the
        loop moves on to the next table; nothing is raised from here.
        """
        for table in self._CHANNEL_MIGRATION_TABLES:
            try:
                await self.client.command(self._channel_migration_sql(table))
                logger.info(f"{table} channel 列迁移完成")
            except Exception as e:
                logger.warning(f"{table} channel 列迁移跳过: {e}")

    async def initialize_all_tables(self) -> None:
        """Create all tables, run the channel migration, then build the view.

        Order matters: the view selects ``channel`` from the job tables, so it
        is created only after the tables exist and the migration has run.

        Raises:
            Exception: The first failure from table or view creation,
                re-raised after being logged.
        """
        logger.info("开始初始化 ClickHouse 数据库表...")
        try:
            # The six data tables.
            for name, ddl in self._TABLE_DDLS.items():
                await self._create_table(name, ddl)
            # The pending-company table.
            await self._create_table("pending_company", self._PENDING_COMPANY_DDL)
            # Add ``channel`` to tables that existed before the column did.
            await self.initialize_channel_migration()
            # Create / rebuild the unified analytics view (includes ``channel``).
            await self._create_view()
            logger.info("ClickHouse 数据库表初始化完成")
        except Exception as e:
            logger.error(f"ClickHouse 数据库初始化失败: {e}")
            raise