241 lines
9.5 KiB
Python
241 lines
9.5 KiB
Python
from clickhouse_connect.driver import AsyncClient
|
||
from app.log import logger
|
||
|
||
|
||
class ClickHouseInitializer:
    """Create the ClickHouse tables and analytics view in the ``job_data`` database.

    Every statement uses ``IF NOT EXISTS``, so the initializer can be run
    repeatedly. Note that the "created successfully" log line is emitted
    whenever the statement completes without raising, which presumably
    includes the case where the object already existed.
    """

    def __init__(self, client: AsyncClient):
        """Store the async ClickHouse client used to run every DDL statement."""
        self.client = client

    async def _execute_ddl(
        self, sql: str, *, success_message: str, failure_label: str
    ) -> None:
        """Run one DDL statement and log the outcome.

        Args:
            sql: The statement passed to ``self.client.command``.
            success_message: Logged at INFO level when the command returns.
            failure_label: Prefix of the ERROR log line; the exception text is
                appended after ``": "``.

        Raises:
            Exception: Whatever ``self.client.command`` raised, re-raised
                unchanged after it has been logged.
        """
        try:
            await self.client.command(sql)
        except Exception as e:
            logger.error(f"{failure_label}: {e}")
            raise
        else:
            # Logged outside the try body so that a logging problem is never
            # reported as a failed DDL statement.
            logger.info(success_message)

    async def create_boss_job_json_table(self) -> None:
        """Create ``job_data.boss_job``, the raw-JSON store for BOSS job postings.

        Raises:
            Exception: Re-raised from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.boss_job (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            job_id String DEFAULT '', -- BOSS平台去重字段:jobBaseInfoVO.jobId
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            success_message="BOSS职位JSON数据表 boss_job 创建成功",
            failure_label="创建BOSS职位JSON数据表失败",
        )

    async def create_boss_company_json_table(self) -> None:
        """Create ``job_data.boss_company``, the raw-JSON store for BOSS companies.

        Raises:
            Exception: Re-raised from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.boss_company (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            company_name String DEFAULT '', -- 公司名称去重字段
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            success_message="BOSS公司JSON数据表 boss_company 创建成功",
            failure_label="创建BOSS公司JSON数据表失败",
        )

    async def create_qcwy_job_json_table(self) -> None:
        """Create ``job_data.qcwy_job``, the raw-JSON store for QCWY (51job) postings.

        Raises:
            Exception: Re-raised from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.qcwy_job (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            job_id String DEFAULT '', -- QCWY平台去重字段:jobId
            update_date_time String DEFAULT '', -- QCWY平台去重字段:updateDateTime
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            success_message="前程无忧职位JSON数据表 qcwy_job 创建成功",
            failure_label="创建前程无忧职位JSON数据表失败",
        )

    async def create_qcwy_company_json_table(self) -> None:
        """Create ``job_data.qcwy_company``, the raw-JSON store for QCWY companies.

        Raises:
            Exception: Re-raised from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.qcwy_company (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            company_name String DEFAULT '', -- 公司名称去重字段
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            success_message="前程无忧公司JSON数据表 qcwy_company 创建成功",
            failure_label="创建前程无忧公司JSON数据表失败",
        )

    async def create_zhilian_job_json_table(self) -> None:
        """Create ``job_data.zhilian_job``, the raw-JSON store for Zhilian postings.

        Raises:
            Exception: Re-raised from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.zhilian_job (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            number String DEFAULT '', -- 智联平台去重字段:number
            first_publish_time String DEFAULT '', -- 智联平台去重字段:firstPublishTime
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            success_message="智联招聘职位JSON数据表 zhilian_job 创建成功",
            failure_label="创建智联招聘职位JSON数据表失败",
        )

    async def create_zhilian_company_json_table(self) -> None:
        """Create ``job_data.zhilian_company``, the raw-JSON store for Zhilian companies.

        Raises:
            Exception: Re-raised from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.zhilian_company (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '', -- 原始JSON数据
            company_name String DEFAULT '', -- 公司名称去重字段
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            success_message="智联招聘公司JSON数据表 zhilian_company 创建成功",
            failure_label="创建智联招聘公司JSON数据表失败",
        )

    async def create_pending_company_table(self) -> None:
        """Create ``job_data.pending_company``, the table of companies awaiting processing.

        Unlike the raw-JSON tables this one uses ``ReplacingMergeTree(version)``
        ordered by ``(source, company_id)``.

        Raises:
            Exception: Re-raised from the client if the statement fails.
        """
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS job_data.pending_company (
            source String,
            company_id String,
            company_name String DEFAULT '',
            status String DEFAULT 'pending',
            error_msg String DEFAULT '',
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now(),
            version UInt64 DEFAULT 1
        ) ENGINE = ReplacingMergeTree(version)
        ORDER BY (source, company_id)
        SETTINGS index_granularity = 8192;
        """
        await self._execute_ddl(
            create_table_sql,
            success_message="待处理公司表 pending_company 创建成功",
            failure_label="创建待处理公司表失败",
        )

    async def create_job_analytics_view(self) -> None:
        """Create ``job_data.job_analytics``, a unified view over the three job tables.

        The view is a ``UNION ALL`` of ``boss_job``, ``qcwy_job`` and
        ``zhilian_job``. Descriptive fields are pulled out of ``json_data`` with
        ``JSONExtractString``; ``salary_min`` and ``salary_max`` are the
        constant ``0.0`` in every branch. Only the first branch names all of its
        columns; the later branches rely on column position.

        Raises:
            Exception: Re-raised from the client if the statement fails.
        """
        create_view_sql = """
        CREATE VIEW IF NOT EXISTS job_data.job_analytics AS
        SELECT
            'boss' as source,
            job_id,
            JSONExtractString(json_data, 'jobName') as position_name,
            JSONExtractString(json_data, 'brandName') as company_name,
            JSONExtractString(json_data, 'salaryDesc') as salary_text,
            0.0 as salary_min,
            0.0 as salary_max,
            JSONExtractString(json_data, 'cityName') as city,
            JSONExtractString(json_data, 'experienceName') as experience_required,
            JSONExtractString(json_data, 'degreeName') as education,
            created_at
        FROM job_data.boss_job
        UNION ALL
        SELECT
            'qcwy' as source,
            job_id,
            JSONExtractString(json_data, 'jobName') as position_name,
            JSONExtractString(json_data, 'companyName') as company_name,
            JSONExtractString(json_data, 'provideSalaryString') as salary_text,
            0.0, 0.0,
            JSONExtractString(json_data, 'workCity') as city,
            JSONExtractString(json_data, 'workYear') as experience_required,
            JSONExtractString(json_data, 'degree') as education,
            created_at
        FROM job_data.qcwy_job
        UNION ALL
        SELECT
            'zhilian' as source,
            number as job_id,
            JSONExtractString(json_data, 'jobName') as position_name,
            JSONExtractString(json_data, 'companyName') as company_name,
            JSONExtractString(json_data, 'salary60') as salary_text,
            0.0, 0.0,
            JSONExtractString(json_data, 'workCity') as city,
            JSONExtractString(json_data, 'workingExp') as experience_required,
            JSONExtractString(json_data, 'education') as education,
            created_at
        FROM job_data.zhilian_job
        """
        await self._execute_ddl(
            create_view_sql,
            success_message="招聘数据分析视图 job_analytics 创建成功",
            failure_label="创建招聘数据分析视图失败",
        )

    async def initialize_all_tables(self) -> None:
        """Create every table and then the analytics view, in a fixed order.

        The view is created last because it selects from the three job tables.
        Steps run sequentially and the first failure aborts the rest.

        Raises:
            Exception: Re-raised from the failing step after it has been logged.
        """
        logger.info("开始初始化 ClickHouse 数据库表...")

        steps = (
            # BOSS raw-JSON tables
            self.create_boss_job_json_table,
            self.create_boss_company_json_table,
            # QCWY (51job) raw-JSON tables
            self.create_qcwy_job_json_table,
            self.create_qcwy_company_json_table,
            # Zhilian raw-JSON tables
            self.create_zhilian_job_json_table,
            self.create_zhilian_company_json_table,
            # Companies awaiting processing
            self.create_pending_company_table,
            # Unified analytics view (depends on the job tables above)
            self.create_job_analytics_view,
        )

        try:
            for step in steps:
                await step()
            logger.info("ClickHouse 数据库表初始化完成")
        except Exception as e:
            logger.error(f"ClickHouse 数据库初始化失败: {e}")
            raise