JobData/app/core/clickhouse_init.py

241 lines
9.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from clickhouse_connect.driver import AsyncClient
from app.log import logger
class ClickHouseInitializer:
    """Create the ClickHouse tables and the analytics view for scraped job data.

    Every ``create_*`` method sends a single ``CREATE ... IF NOT EXISTS``
    statement through ``self.client.command``, so running it against a schema
    that already exists does not fail (the success message is still logged in
    that case). All objects are created inside the ``job_data`` database;
    nothing in this class creates that database, so it is assumed to exist.
    """

    def __init__(self, client: AsyncClient) -> None:
        """Keep the async ClickHouse client used to run every DDL statement."""
        self.client = client

    async def _execute_ddl(self, ddl: str, success_msg: str, failure_msg: str) -> None:
        """Run one DDL statement and log the outcome.

        Args:
            ddl: The SQL statement passed to ``self.client.command``.
            success_msg: Logged at INFO level once the statement has returned.
            failure_msg: Prefix of the ERROR log line; the exception text is
                appended after ``": "``.

        Raises:
            Exception: Whatever ``self.client.command`` raised, re-raised
                unchanged after it has been logged.
        """
        try:
            await self.client.command(ddl)
        except Exception as e:
            logger.error(f"{failure_msg}: {e}")
            raise
        else:
            logger.info(success_msg)

    async def create_boss_job_json_table(self) -> None:
        """Create ``job_data.boss_job``, the raw-JSON store for BOSS job postings.

        Raises:
            Exception: Propagated from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.boss_job (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            job_id String DEFAULT '', -- BOSS平台去重字段jobBaseInfoVO.jobId
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            "BOSS职位JSON数据表 boss_job 创建成功",
            "创建BOSS职位JSON数据表失败",
        )

    async def create_boss_company_json_table(self) -> None:
        """Create ``job_data.boss_company``, the raw-JSON store for BOSS companies.

        Raises:
            Exception: Propagated from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.boss_company (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            company_name String DEFAULT '', -- 公司名称去重字段
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            "BOSS公司JSON数据表 boss_company 创建成功",
            "创建BOSS公司JSON数据表失败",
        )

    async def create_qcwy_job_json_table(self) -> None:
        """Create ``job_data.qcwy_job``, the raw-JSON store for QCWY job postings.

        Besides ``job_id`` the table carries ``update_date_time``; the SQL
        comments mark both as de-duplication fields.

        Raises:
            Exception: Propagated from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.qcwy_job (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            job_id String DEFAULT '', -- QCWY平台去重字段jobId
            update_date_time String DEFAULT '', -- QCWY平台去重字段updateDateTime
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            "前程无忧职位JSON数据表 qcwy_job 创建成功",
            "创建前程无忧职位JSON数据表失败",
        )

    async def create_qcwy_company_json_table(self) -> None:
        """Create ``job_data.qcwy_company``, the raw-JSON store for QCWY companies.

        Raises:
            Exception: Propagated from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.qcwy_company (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            company_name String DEFAULT '', -- 公司名称去重字段
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            "前程无忧公司JSON数据表 qcwy_company 创建成功",
            "创建前程无忧公司JSON数据表失败",
        )

    async def create_zhilian_job_json_table(self) -> None:
        """Create ``job_data.zhilian_job``, the raw-JSON store for Zhilian job postings.

        The table carries ``number`` and ``first_publish_time``; the SQL
        comments mark both as de-duplication fields.

        Raises:
            Exception: Propagated from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.zhilian_job (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            number String DEFAULT '', -- 智联平台去重字段number
            first_publish_time String DEFAULT '', -- 智联平台去重字段firstPublishTime
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            "智联招聘职位JSON数据表 zhilian_job 创建成功",
            "创建智联招聘职位JSON数据表失败",
        )

    async def create_zhilian_company_json_table(self) -> None:
        """Create ``job_data.zhilian_company``, the raw-JSON store for Zhilian companies.

        Raises:
            Exception: Propagated from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.zhilian_company (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            company_name String DEFAULT '', -- 公司名称去重字段
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            "智联招聘公司JSON数据表 zhilian_company 创建成功",
            "创建智联招聘公司JSON数据表失败",
        )

    async def create_pending_company_table(self) -> None:
        """Create ``job_data.pending_company``, the queue of companies still to process.

        Unlike the raw-JSON tables this one uses ``ReplacingMergeTree(version)``
        ordered by ``(source, company_id)``, so a newer ``version`` of the same
        pair supersedes the older row once ClickHouse merges the parts.

        Raises:
            Exception: Propagated from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.pending_company (
            source String,
            company_id String,
            company_name String DEFAULT '',
            status String DEFAULT 'pending',
            error_msg String DEFAULT '',
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now(),
            version UInt64 DEFAULT 1
        ) ENGINE = ReplacingMergeTree(version)
        ORDER BY (source, company_id)
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            "待处理公司表 pending_company 创建成功",
            "创建待处理公司表失败",
        )

    async def create_job_analytics_view(self) -> None:
        """Create ``job_data.job_analytics``, a unified view over the three job tables.

        The view is a ``UNION ALL`` of ``boss_job``, ``qcwy_job`` and
        ``zhilian_job``, pulling display fields out of ``json_data`` with
        ``JSONExtractString``. ``salary_min`` / ``salary_max`` are constant
        ``0.0`` placeholders in every branch. Only the first branch names those
        two columns; the later branches rely on positional matching.

        Raises:
            Exception: Propagated from the client if the statement fails.
        """
        create_view_sql = """
        CREATE VIEW IF NOT EXISTS job_data.job_analytics AS
        SELECT
            'boss' as source,
            job_id,
            JSONExtractString(json_data, 'jobName') as position_name,
            JSONExtractString(json_data, 'brandName') as company_name,
            JSONExtractString(json_data, 'salaryDesc') as salary_text,
            0.0 as salary_min,
            0.0 as salary_max,
            JSONExtractString(json_data, 'cityName') as city,
            JSONExtractString(json_data, 'experienceName') as experience_required,
            JSONExtractString(json_data, 'degreeName') as education,
            created_at
        FROM job_data.boss_job
        UNION ALL
        SELECT
            'qcwy' as source,
            job_id,
            JSONExtractString(json_data, 'jobName') as position_name,
            JSONExtractString(json_data, 'companyName') as company_name,
            JSONExtractString(json_data, 'provideSalaryString') as salary_text,
            0.0, 0.0,
            JSONExtractString(json_data, 'workCity') as city,
            JSONExtractString(json_data, 'workYear') as experience_required,
            JSONExtractString(json_data, 'degree') as education,
            created_at
        FROM job_data.qcwy_job
        UNION ALL
        SELECT
            'zhilian' as source,
            number as job_id,
            JSONExtractString(json_data, 'jobName') as position_name,
            JSONExtractString(json_data, 'companyName') as company_name,
            JSONExtractString(json_data, 'salary60') as salary_text,
            0.0, 0.0,
            JSONExtractString(json_data, 'workCity') as city,
            JSONExtractString(json_data, 'workingExp') as experience_required,
            JSONExtractString(json_data, 'education') as education,
            created_at
        FROM job_data.zhilian_job
        """
        await self._execute_ddl(
            create_view_sql,
            "招聘数据分析视图 job_analytics 创建成功",
            "创建招聘数据分析视图失败",
        )

    async def initialize_all_tables(self) -> None:
        """Create every table and then the analytics view, stopping at the first failure.

        Raises:
            Exception: The error from the first step that failed. It is logged
                once by that step and once more here before being re-raised.
        """
        logger.info("开始初始化 ClickHouse 数据库表...")
        # Order matters: the view selects from the three *_job tables, so it
        # has to be created after them.
        steps = (
            # BOSS raw-JSON tables
            self.create_boss_job_json_table,
            self.create_boss_company_json_table,
            # QCWY raw-JSON tables
            self.create_qcwy_job_json_table,
            self.create_qcwy_company_json_table,
            # Zhilian raw-JSON tables
            self.create_zhilian_job_json_table,
            self.create_zhilian_company_json_table,
            # Pending-company queue
            self.create_pending_company_table,
            # Unified analytics view
            self.create_job_analytics_view,
        )
        try:
            for step in steps:
                await step()
        except Exception as e:
            logger.error(f"ClickHouse 数据库初始化失败: {e}")
            raise
        else:
            logger.info("ClickHouse 数据库表初始化完成")