204 lines
7.4 KiB
Python
204 lines
7.4 KiB
Python
from clickhouse_connect.driver import AsyncClient
|
|
from app.log import logger
|
|
|
|
|
|
class ClickHouseInitializer:
    """Bootstrap the ClickHouse schema under the ``job_data`` database.

    The class holds the DDL for six per-source data tables, the
    ``pending_company`` table, and the ``job_analytics`` view, and exposes
    coroutines that execute them through the supplied async client.
    """

    # DDL for the six data tables; every table carries a ``channel`` column.
    _TABLE_DDLS = {
        "boss_job": """
            CREATE TABLE IF NOT EXISTS job_data.boss_job (
                id UInt64 DEFAULT 0,
                json_data String DEFAULT '',
                job_id String DEFAULT '',
                channel String DEFAULT 'mini',
                created_at DateTime DEFAULT now(),
                updated_at DateTime DEFAULT now()
            ) ENGINE = MergeTree()
            ORDER BY created_at
            SETTINGS index_granularity = 8192;
        """,
        "boss_company": """
            CREATE TABLE IF NOT EXISTS job_data.boss_company (
                id UInt64 DEFAULT 0,
                json_data String DEFAULT '',
                company_name String DEFAULT '',
                channel String DEFAULT 'mini',
                created_at DateTime DEFAULT now(),
                updated_at DateTime DEFAULT now()
            ) ENGINE = MergeTree()
            ORDER BY created_at
            SETTINGS index_granularity = 8192;
        """,
        "qcwy_job": """
            CREATE TABLE IF NOT EXISTS job_data.qcwy_job (
                id UInt64 DEFAULT 0,
                json_data String DEFAULT '',
                job_id String DEFAULT '',
                update_date_time String DEFAULT '',
                channel String DEFAULT 'mini',
                created_at DateTime DEFAULT now(),
                updated_at DateTime DEFAULT now()
            ) ENGINE = MergeTree()
            ORDER BY created_at
            SETTINGS index_granularity = 8192;
        """,
        "qcwy_company": """
            CREATE TABLE IF NOT EXISTS job_data.qcwy_company (
                id UInt64 DEFAULT 0,
                json_data String DEFAULT '',
                company_name String DEFAULT '',
                channel String DEFAULT 'mini',
                created_at DateTime DEFAULT now(),
                updated_at DateTime DEFAULT now()
            ) ENGINE = MergeTree()
            ORDER BY created_at
            SETTINGS index_granularity = 8192;
        """,
        "zhilian_job": """
            CREATE TABLE IF NOT EXISTS job_data.zhilian_job (
                id UInt64 DEFAULT 0,
                json_data String DEFAULT '',
                number String DEFAULT '',
                first_publish_time String DEFAULT '',
                channel String DEFAULT 'mini',
                created_at DateTime DEFAULT now(),
                updated_at DateTime DEFAULT now()
            ) ENGINE = MergeTree()
            ORDER BY created_at
            SETTINGS index_granularity = 8192;
        """,
        "zhilian_company": """
            CREATE TABLE IF NOT EXISTS job_data.zhilian_company (
                id UInt64 DEFAULT 0,
                json_data String DEFAULT '',
                company_name String DEFAULT '',
                channel String DEFAULT 'mini',
                created_at DateTime DEFAULT now(),
                updated_at DateTime DEFAULT now()
            ) ENGINE = MergeTree()
            ORDER BY created_at
            SETTINGS index_granularity = 8192;
        """,
    }

    # DDL for the pending-company table, a ReplacingMergeTree versioned by
    # the ``version`` column and ordered by (source, company_id).
    _PENDING_COMPANY_DDL = """
        CREATE TABLE IF NOT EXISTS job_data.pending_company (
            source String,
            company_id String,
            company_name String DEFAULT '',
            status String DEFAULT 'pending',
            error_msg String DEFAULT '',
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now(),
            version UInt64 DEFAULT 1
        ) ENGINE = ReplacingMergeTree(version)
        ORDER BY (source, company_id)
        SETTINGS index_granularity = 8192;
    """

    # View that unions the three *_job tables into one column layout.
    # Fields are pulled out of ``json_data`` with JSONExtractString; the
    # salary_min / salary_max columns are the constant 0.0 in every branch.
    _JOB_ANALYTICS_VIEW = """
        CREATE OR REPLACE VIEW job_data.job_analytics AS
        SELECT
            'boss' as source,
            job_id,
            channel,
            JSONExtractString(json_data, 'jobName') as position_name,
            JSONExtractString(json_data, 'brandName') as company_name,
            JSONExtractString(json_data, 'salaryDesc') as salary_text,
            0.0 as salary_min,
            0.0 as salary_max,
            JSONExtractString(json_data, 'cityName') as city,
            JSONExtractString(json_data, 'experienceName') as experience_required,
            JSONExtractString(json_data, 'degreeName') as education,
            created_at
        FROM job_data.boss_job
        UNION ALL
        SELECT
            'qcwy' as source,
            job_id,
            channel,
            JSONExtractString(json_data, 'jobName') as position_name,
            JSONExtractString(json_data, 'companyName') as company_name,
            JSONExtractString(json_data, 'provideSalaryString') as salary_text,
            0.0, 0.0,
            JSONExtractString(json_data, 'workCity') as city,
            JSONExtractString(json_data, 'workYear') as experience_required,
            JSONExtractString(json_data, 'degree') as education,
            created_at
        FROM job_data.qcwy_job
        UNION ALL
        SELECT
            'zhilian' as source,
            number as job_id,
            channel,
            JSONExtractString(json_data, 'jobName') as position_name,
            JSONExtractString(json_data, 'companyName') as company_name,
            JSONExtractString(json_data, 'salary60') as salary_text,
            0.0, 0.0,
            JSONExtractString(json_data, 'workCity') as city,
            JSONExtractString(json_data, 'workingExp') as experience_required,
            JSONExtractString(json_data, 'education') as education,
            created_at
        FROM job_data.zhilian_job
    """

    # Tables that receive the ``channel`` column during migration.
    _CHANNEL_MIGRATION_TABLES = [
        "boss_job", "boss_company",
        "qcwy_job", "qcwy_company",
        "zhilian_job", "zhilian_company",
    ]

    def __init__(self, client: AsyncClient):
        """Store the async ClickHouse client used to run every statement."""
        self.client = client

    async def _create_table(self, name: str, ddl: str) -> None:
        """Execute one DDL statement and log the outcome.

        Args:
            name: Table name, used only in the log messages.
            ddl: Statement passed to ``client.command``.

        Raises:
            Exception: Whatever the command raised, re-raised after logging.
        """
        try:
            await self.client.command(ddl)
            logger.info(f"表 {name} 创建成功")
        except Exception as exc:
            logger.error(f"创建表 {name} 失败: {exc}")
            raise

    async def _create_analytics_view(self) -> None:
        """Create or replace the ``job_analytics`` view; log and re-raise on failure."""
        try:
            await self.client.command(self._JOB_ANALYTICS_VIEW)
            logger.info("招聘数据分析视图 job_analytics 创建成功")
        except Exception as exc:
            logger.error(f"创建招聘数据分析视图失败: {exc}")
            raise

    async def initialize_channel_migration(self) -> None:
        """Run ``ALTER TABLE ... ADD COLUMN IF NOT EXISTS channel`` on each migration table.

        Best-effort: a failure on one table is logged as a warning and the
        loop moves on to the next table; nothing is raised to the caller.
        """
        for table_name in self._CHANNEL_MIGRATION_TABLES:
            try:
                statement = (
                    f"ALTER TABLE job_data.{table_name} "
                    "ADD COLUMN IF NOT EXISTS channel String DEFAULT 'mini'"
                )
                await self.client.command(statement)
                logger.info(f"表 {table_name} channel 列迁移完成")
            except Exception as exc:
                logger.warning(f"表 {table_name} channel 列迁移跳过: {exc}")

    async def initialize_all_tables(self) -> None:
        """Create every table, apply the channel migration, then build the view.

        Order: the six data tables, ``pending_company``, the ``channel``
        column migration, and finally the ``job_analytics`` view.

        Raises:
            Exception: Any error from table or view creation, re-raised
                after being logged.
        """
        logger.info("开始初始化 ClickHouse 数据库表...")

        try:
            # The six data tables first, followed by the pending-company table.
            table_specs = [
                *self._TABLE_DDLS.items(),
                ("pending_company", self._PENDING_COMPANY_DDL),
            ]
            for table_name, statement in table_specs:
                await self._create_table(table_name, statement)

            # Add the channel column to tables that already existed without it.
            await self.initialize_channel_migration()

            # (Re)build the unified analytics view, which selects channel.
            await self._create_analytics_view()

            logger.info("ClickHouse 数据库表初始化完成")
        except Exception as exc:
            logger.error(f"ClickHouse 数据库初始化失败: {exc}")
            raise
|