# JobData/app/repositories/clickhouse_repo.py
#
# 174 lines
# 5.7 KiB
# Python

import math
import re
from collections.abc import Generator
from datetime import datetime
from typing import Optional, Dict, Any, List

from clickhouse_connect.driver import AsyncClient
from clickhouse_connect.driver.query import QueryResult
class ClickHouseBaseRepo:
    """Base repository wrapping an async ClickHouse client for a single table.

    Offers thin helpers for running queries and batch inserts, plus a builder
    for parameterised ``WHERE`` conditions that subclasses can reuse.
    """

    # Column names are interpolated into the SQL text (only values are bound
    # as parameters), so they are restricted to plain or dotted identifiers.
    _COLUMN_NAME_RE = re.compile(
        r"[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)*"
    )

    def __init__(self, clickhouse_client: "AsyncClient", table_name: str):
        """Store the client and the name of the table this repo operates on."""
        self._clickhouse_client = clickhouse_client
        self._table_name = table_name

    async def execute_query(
        self, query: str, parameters: Optional[Dict[str, Any]] = None
    ) -> "QueryResult":
        """Run ``query`` through the client, binding ``parameters`` if given."""
        return await self._clickhouse_client.query(query, parameters=parameters)

    async def execute_insert(self, data: List[Dict[str, Any]]) -> None:
        """Insert a batch of rows into the repository's table.

        The column list is taken from the keys of the first row. Every other
        row must contain those keys (a missing key raises ``KeyError``); keys
        that only appear in later rows are not inserted.

        Args:
            data: Rows to insert, one dict per row. An empty list is a no-op.
        """
        if not data:
            return

        columns = list(data[0])
        values = [[row[col] for col in columns] for row in data]
        await self._clickhouse_client.insert(
            table=self._table_name,
            data=values,
            column_names=columns,
        )

    def _build_where_statements(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> tuple[List[str], Dict[str, Any]]:
        """Build ``WHERE`` conditions and the parameters they bind.

        Args:
            filters: Mapping of column name to required value. Entries whose
                value is ``None`` are skipped.
            from_dt: Inclusive lower bound applied to ``created_at``.
            to_dt: Inclusive upper bound applied to ``created_at``.

        Returns:
            A ``(conditions, params)`` tuple: the individual condition strings
            (to be joined with ``AND`` by the caller) and the parameter dict
            to pass alongside the query.

        Raises:
            ValueError: If a filter key is not a plain or dotted identifier.
        """
        where_statements: List[str] = []
        params: Dict[str, Any] = {}

        for key, value in (filters or {}).items():
            if value is None:
                continue
            # The key becomes part of the SQL text, so reject anything that
            # is not a bare column identifier.
            if not isinstance(key, str) or not self._COLUMN_NAME_RE.fullmatch(key):
                raise ValueError(f"Invalid filter column name: {key!r}")
            where_statements.append(f"{key} = %({key})s")
            params[key] = value

        time_bounds = ((from_dt, ">=", "from_dt"), (to_dt, "<=", "to_dt"))
        for bound, operator, base_name in time_bounds:
            if not bound:
                continue
            # A filter on a column called "from_dt"/"to_dt" already owns that
            # parameter name; pick a free one instead of overwriting its value.
            param_name = base_name
            suffix = 0
            while param_name in params:
                suffix += 1
                param_name = f"{base_name}_{suffix}"
            where_statements.append(f"created_at {operator} %({param_name})s")
            params[param_name] = bound

        return where_statements, params
class JobAnalyticsRepo(ClickHouseBaseRepo):
    """Analytics queries over the ``job_analytics`` table."""

    # Grouping columns are interpolated into the SQL text, so only plain or
    # dotted identifiers are accepted.
    _GROUP_COLUMN_RE = re.compile(
        r"[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)*"
    )

    # Supported trend intervals and the ClickHouse function that truncates a
    # timestamp to the start of each.
    _TIME_BUCKET_FUNCS = {
        "day": "toStartOfDay",
        "hour": "toStartOfHour",
        "week": "toStartOfWeek",
        "month": "toStartOfMonth",
    }

    def __init__(self, clickhouse_client: AsyncClient):
        """Bind the repository to the ``job_analytics`` table."""
        super().__init__(clickhouse_client, "job_analytics")

    def _build_where_clause(
        self,
        filters: Optional[Dict[str, Any]],
        from_dt: Optional[datetime],
        to_dt: Optional[datetime],
    ) -> tuple[str, Dict[str, Any]]:
        """Return the joined ``WHERE`` clause text and its bound parameters."""
        where_statements, params = self._build_where_statements(
            filters=filters, from_dt=from_dt, to_dt=to_dt
        )
        # "1=1" keeps the query valid when there are no conditions.
        return " AND ".join(where_statements) or "1=1", params

    async def get_job_count(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> int:
        """Count the rows matching the given filters and time range.

        Args:
            filters: Column-to-value equality filters; ``None`` values are
                skipped.
            from_dt: Inclusive lower bound on ``created_at``.
            to_dt: Inclusive upper bound on ``created_at``.

        Returns:
            The number of matching rows.
        """
        where_clause, params = self._build_where_clause(filters, from_dt, to_dt)
        query = f"""
            SELECT COUNT(*)
            FROM {self._table_name}
            WHERE {where_clause}
        """
        result = await self.execute_query(query, parameters=params)
        return int(result.result_rows[0][0])

    async def group_jobs_by_column(
        self,
        group_by_column: str,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
        limit: int = 10,
    ) -> List[Dict[str, Any]]:
        """Count rows per distinct value of ``group_by_column``.

        Args:
            group_by_column: Column to group by. Must be a plain or dotted
                identifier because it is interpolated into the SQL text.
            filters: Column-to-value equality filters; ``None`` values are
                skipped.
            from_dt: Inclusive lower bound on ``created_at``.
            to_dt: Inclusive upper bound on ``created_at``.
            limit: Maximum number of groups returned, largest first.

        Returns:
            A list of ``{"category": ..., "job_count": ...}`` dicts ordered by
            ``job_count`` descending.

        Raises:
            ValueError: If ``group_by_column`` is not a valid identifier or
                ``limit`` cannot be converted to an integer.
        """
        if not isinstance(group_by_column, str) or not self._GROUP_COLUMN_RE.fullmatch(
            group_by_column
        ):
            raise ValueError(f"Invalid group-by column name: {group_by_column!r}")
        # Coerce before interpolating so only an integer can reach the SQL.
        row_limit = int(limit)

        where_clause, params = self._build_where_clause(filters, from_dt, to_dt)
        query = f"""
            SELECT
                {group_by_column} AS category,
                COUNT(*) AS job_count
            FROM {self._table_name}
            WHERE {where_clause}
            GROUP BY {group_by_column}
            ORDER BY job_count DESC
            LIMIT {row_limit}
        """
        result = await self.execute_query(query, parameters=params)
        return [
            {"category": category, "job_count": int(job_count)}
            for category, job_count in result.result_rows
        ]

    async def get_volume_trend(
        self,
        interval: str = "day",
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return row counts per time bucket and source.

        Args:
            interval: Bucket size: ``"day"``, ``"hour"``, ``"week"`` or
                ``"month"``. Any other value falls back to daily buckets.
            filters: Column-to-value equality filters; ``None`` values are
                skipped.
            from_dt: Inclusive lower bound on ``created_at``.
            to_dt: Inclusive upper bound on ``created_at``.

        Returns:
            A list of ``{"time": ..., "source": ..., "count": ...}`` dicts
            ordered by bucket start ascending, where ``time`` is the bucket
            start in ISO 8601 format.
        """
        where_clause, params = self._build_where_clause(filters, from_dt, to_dt)
        time_func = self._TIME_BUCKET_FUNCS.get(interval, "toStartOfDay")

        # Convert to Asia/Shanghai before truncating so that buckets are
        # aligned to Beijing time.
        query = f"""
            SELECT
                {time_func}(toTimeZone(created_at, 'Asia/Shanghai')) AS time_bucket,
                source,
                COUNT(*) AS count
            FROM {self._table_name}
            WHERE {where_clause}
            GROUP BY time_bucket, source
            ORDER BY time_bucket ASC
        """
        result = await self.execute_query(query, parameters=params)
        return [
            {
                "time": bucket.isoformat(),
                "source": source,
                "count": int(count),
            }
            for bucket, source, count in result.result_rows
        ]

    async def get_source_distribution(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return row counts grouped by the ``source`` column.

        Delegates to ``group_jobs_by_column`` with its default ``limit``, so
        at most the ten most frequent sources are returned.
        """
        return await self.group_jobs_by_column("source", filters, from_dt, to_dt)