174 lines
5.7 KiB
Python
174 lines
5.7 KiB
Python
import math
import re
from collections.abc import Generator
from datetime import datetime
from typing import Optional, Dict, Any, List

from clickhouse_connect.driver import AsyncClient
from clickhouse_connect.driver.query import QueryResult
|
|
|
|
|
|
class ClickHouseBaseRepo:
    """Base repository that binds an async ClickHouse client to one table.

    Provides thin helpers for running queries, bulk-inserting rows and
    assembling parameterized WHERE conditions.
    """

    # Filter keys are spliced into the SQL text as column names (only the
    # values are bound as parameters), so they must be plain or dotted
    # identifiers. Anything else is rejected to prevent SQL injection.
    _FILTER_COLUMN_RE = re.compile(
        r"[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)*"
    )

    def __init__(self, clickhouse_client: "AsyncClient", table_name: str):
        """Store the client and the table name used by the helpers."""
        self._clickhouse_client = clickhouse_client
        self._table_name = table_name

    async def execute_query(
        self, query: str, parameters: Optional[Dict[str, Any]] = None
    ) -> "QueryResult":
        """Run ``query`` through the client and return its result.

        Args:
            query: SQL text, optionally containing ``%(name)s`` placeholders.
            parameters: Values for the placeholders, forwarded to the client.
        """
        return await self._clickhouse_client.query(query, parameters=parameters)

    async def execute_insert(self, data: List[Dict[str, Any]]) -> None:
        """Bulk-insert ``data`` into the repository's table.

        The column list and its order are taken from the keys of the first
        row; every following row is read with those same keys.

        Args:
            data: Rows to insert, one dict per row. An empty list is a no-op.

        Raises:
            KeyError: If a row lacks one of the first row's keys.
        """
        if not data:
            return

        columns = list(data[0])
        rows = [[record[column] for column in columns] for record in data]

        await self._clickhouse_client.insert(
            table=self._table_name,
            data=rows,
            column_names=columns,
        )

    def _build_where_statements(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> tuple[List[str], Dict[str, Any]]:
        """Build WHERE conditions and their bound parameters.

        Args:
            filters: Mapping of column name to required value. Entries whose
                value is ``None`` are skipped. Each remaining entry becomes an
                equality condition with the value bound as a parameter named
                after the column.
            from_dt: Inclusive lower bound applied to ``created_at``.
            to_dt: Inclusive upper bound applied to ``created_at``.

        Returns:
            A ``(conditions, params)`` pair: the individual condition strings
            (not yet joined) and the parameter dict to pass with the query.

        Raises:
            ValueError: If a filter key is not a plain or dotted identifier.
        """
        where_statements: List[str] = []
        params: Dict[str, Any] = {}

        for column, value in (filters or {}).items():
            if value is None:
                continue
            # The column name ends up in the SQL string itself, so it cannot
            # be protected by parameter binding and has to be validated.
            if not isinstance(column, str) or not self._FILTER_COLUMN_RE.fullmatch(column):
                raise ValueError(f"Invalid filter column name: {column!r}")
            where_statements.append(f"{column} = %({column})s")
            params[column] = value

        if from_dt:
            where_statements.append("created_at >= %(from_dt)s")
            params["from_dt"] = from_dt

        if to_dt:
            where_statements.append("created_at <= %(to_dt)s")
            params["to_dt"] = to_dt

        return where_statements, params
|
|
|
|
|
|
class JobAnalyticsRepo(ClickHouseBaseRepo):
    """Analytics queries over the ``job_analytics`` table."""

    # Column names passed to GROUP BY are spliced into the SQL text, so only
    # plain or dotted identifiers are accepted.
    _GROUP_COLUMN_RE = re.compile(
        r"[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)*"
    )

    # Supported trend intervals and the ClickHouse function that truncates a
    # timestamp to the start of that interval.
    _TIME_BUCKET_FUNCS = {
        "day": "toStartOfDay",
        "hour": "toStartOfHour",
        "week": "toStartOfWeek",
        "month": "toStartOfMonth",
    }

    def __init__(self, clickhouse_client: AsyncClient):
        """Bind the repository to the ``job_analytics`` table."""
        super().__init__(clickhouse_client, "job_analytics")

    def _build_where_clause(
        self,
        filters: Optional[Dict[str, Any]],
        from_dt: Optional[datetime],
        to_dt: Optional[datetime],
    ) -> tuple[str, Dict[str, Any]]:
        """Return the joined WHERE clause (``1=1`` when empty) and its params."""
        conditions, params = self._build_where_statements(
            filters=filters, from_dt=from_dt, to_dt=to_dt
        )
        return (" AND ".join(conditions) if conditions else "1=1"), params

    async def get_job_count(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> int:
        """Count the rows matching the filters and ``created_at`` range.

        Args:
            filters: Column/value equality filters; ``None`` values are skipped.
            from_dt: Inclusive lower bound on ``created_at``.
            to_dt: Inclusive upper bound on ``created_at``.

        Returns:
            The matching row count.
        """
        where_clause, params = self._build_where_clause(filters, from_dt, to_dt)
        query = f"""
            SELECT COUNT(*)
            FROM {self._table_name}
            WHERE {where_clause}
        """

        result = await self.execute_query(query, parameters=params)
        return int(result.result_rows[0][0])

    async def group_jobs_by_column(
        self,
        group_by_column: str,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
        limit: int = 10,
    ) -> List[Dict[str, Any]]:
        """Count rows per distinct value of ``group_by_column``.

        Args:
            group_by_column: Column to group by; must be a plain or dotted
                identifier because it is embedded in the SQL text.
            filters: Column/value equality filters; ``None`` values are skipped.
            from_dt: Inclusive lower bound on ``created_at``.
            to_dt: Inclusive upper bound on ``created_at``.
            limit: Maximum number of groups returned, largest counts first.

        Returns:
            A list of ``{"category": ..., "job_count": int}`` dicts ordered by
            descending count.

        Raises:
            ValueError: If ``group_by_column`` is not a valid identifier, or
                ``limit`` is not a non-negative integer.
        """
        if not isinstance(group_by_column, str) or not self._GROUP_COLUMN_RE.fullmatch(
            group_by_column
        ):
            raise ValueError(f"Invalid group-by column name: {group_by_column!r}")

        # LIMIT is also embedded in the SQL text; force it to a plain integer.
        row_limit = int(limit)
        if row_limit < 0:
            raise ValueError(f"limit must be non-negative, got {limit!r}")

        where_clause, params = self._build_where_clause(filters, from_dt, to_dt)
        query = f"""
            SELECT
                {group_by_column} AS category,
                COUNT(*) AS job_count
            FROM {self._table_name}
            WHERE {where_clause}
            GROUP BY {group_by_column}
            ORDER BY job_count DESC
            LIMIT {row_limit}
        """

        result = await self.execute_query(query, parameters=params)
        return [
            {"category": category, "job_count": int(job_count)}
            for category, job_count in result.result_rows
        ]

    async def get_volume_trend(
        self,
        interval: str = "day",  # day, hour, week or month; anything else means day
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return row counts per time bucket and ``source``.

        Args:
            interval: Bucket size: ``"day"``, ``"hour"``, ``"week"`` or
                ``"month"``. Unrecognized values fall back to ``"day"``.
            filters: Column/value equality filters; ``None`` values are skipped.
            from_dt: Inclusive lower bound on ``created_at``.
            to_dt: Inclusive upper bound on ``created_at``.

        Returns:
            A list of ``{"time": <ISO-8601 string>, "source": ..., "count": int}``
            dicts ordered by ascending bucket time.
        """
        where_clause, params = self._build_where_clause(filters, from_dt, to_dt)
        time_func = self._TIME_BUCKET_FUNCS.get(interval, "toStartOfDay")

        # Convert to Asia/Shanghai before truncating so that bucket boundaries
        # follow Beijing time.
        query = f"""
            SELECT
                {time_func}(toTimeZone(created_at, 'Asia/Shanghai')) AS time_bucket,
                source,
                COUNT(*) AS count
            FROM {self._table_name}
            WHERE {where_clause}
            GROUP BY time_bucket, source
            ORDER BY time_bucket ASC
        """

        result = await self.execute_query(query, parameters=params)
        return [
            {
                "time": row[0].isoformat(),
                "source": row[1],
                "count": int(row[2]),
            }
            for row in result.result_rows
        ]

    async def get_source_distribution(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return row counts grouped by the ``source`` column.

        Delegates to ``group_jobs_by_column`` with its default limit.
        """
        return await self.group_jobs_by_column(
            "source", filters=filters, from_dt=from_dt, to_dt=to_dt
        )
|