JobData/create_ecs_instances.py

319 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import json
import time
from datetime import datetime, timedelta, timezone
from alibabacloud_ecs20140526.client import Client as EcsClient
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_ecs20140526 import models as ecs_models
from alibabacloud_credentials.client import Client as CredentialClient
# Delay between two consecutive instance-status polls: 3 seconds, in milliseconds.
INSTANCE_STATUS_CHECK_INTERVAL_MILLISECOND = 3 * 1000
# Total polling budget before boot is reported as failed: 3 minutes, in milliseconds.
INSTANCE_STATUS_TOTAL_CHECK_TIME_ELAPSE_MILLISECOND = 3 * 60 * 1000
def init_ecs_client() -> EcsClient:
    """Build the ECS client used by every API call in this script.

    The region is read from ``ALIYUN_REGION_ID`` (default ``cn-shanghai``);
    credentials are supplied by a default-constructed ``CredentialClient``.

    Returns:
        An ``EcsClient`` whose endpoint is ``ecs.<region>.aliyuncs.com``.
    """
    region = os.getenv("ALIYUN_REGION_ID", "cn-shanghai")
    client_config = open_api_models.Config(
        credential=CredentialClient(),
        region_id=region,
    )
    # The endpoint is derived from the region rather than configured separately.
    client_config.endpoint = "ecs.{}.aliyuncs.com".format(region)
    return EcsClient(client_config)
def compute_auto_release_time(hours_default=6, minutes_default=0) -> str:
    """Compute the UTC auto-release timestamp for newly created instances.

    The offset from "now" comes from the ``AUTO_RELEASE_HOURS`` and
    ``AUTO_RELEASE_MINUTES`` environment variables when they are set and
    non-empty, otherwise from the defaults. Offsets shorter than 30 minutes
    are raised to 30 minutes.

    The result is rounded UP to the next whole minute. The RunInstances
    reference states that a non-zero seconds field is truncated to the start
    of that minute, which would otherwise pull a "now + 30 min" value below
    the required 30-minute minimum.

    Args:
        hours_default: Hours to add when ``AUTO_RELEASE_HOURS`` is unset/empty.
        minutes_default: Minutes to add when ``AUTO_RELEASE_MINUTES`` is
            unset/empty.

    Returns:
        A timestamp formatted as ``YYYY-MM-DDTHH:MM:00Z``.

    Raises:
        ValueError: If an environment override is not an integer literal.
    """
    hours = int(os.getenv("AUTO_RELEASE_HOURS") or hours_default)
    minutes = int(os.getenv("AUTO_RELEASE_MINUTES") or minutes_default)
    # Never schedule the release earlier than 30 minutes from now.
    delta = max(timedelta(hours=hours, minutes=minutes), timedelta(minutes=30))
    target = datetime.now(timezone.utc) + delta
    # Round up (never down) so the effective release time cannot move earlier
    # than the requested offset once the seconds are dropped.
    if target.second or target.microsecond:
        target = target.replace(second=0, microsecond=0) + timedelta(minutes=1)
    return target.strftime("%Y-%m-%dT%H:%M:%SZ")
def compose_run_instances_request() -> ecs_models.RunInstancesRequest:
    """Assemble the RunInstances request from environment overrides.

    Each ``ALIYUN_*`` variable read below replaces the hard-coded default next
    to it. ``ALIYUN_VSWITCH_ID`` and ``ALIYUN_SECURITY_GROUP_ID`` have no
    default and are passed as ``None`` when unset. The auto-release time is
    taken from ``compute_auto_release_time()``.

    Returns:
        A populated ``RunInstancesRequest``.

    Raises:
        ValueError: If a numeric override (disk size, bandwidth, amount,
            minimum amount) is not an integer literal.
    """
    env = os.getenv
    # Built as a dict literal so values are evaluated in the declared order.
    params = {
        "instance_charge_type": env("ALIYUN_CHARGE_TYPE", "PostPaid"),
        "region_id": env("ALIYUN_REGION_ID", "cn-shanghai"),
        "zone_id": env("ALIYUN_ZONE_ID", "cn-shanghai-b"),
        "instance_type": env("ALIYUN_INSTANCE_TYPE", "ecs.t5-lc1m1.small"),
        "io_optimized": env("ALIYUN_IO_OPTIMIZED", "optimized"),
        "spot_strategy": env("ALIYUN_SPOT_STRATEGY", "SpotAsPriceGo"),
        "spot_interruption_behavior": env("ALIYUN_SPOT_BEHAVIOR", "Terminate"),
        "image_id": env("ALIYUN_IMAGE_ID", "ubuntu_24_04_x64_20G_alibase_20251102.vhd"),
        "security_enhancement_strategy": env("ALIYUN_SECURITY_ENHANCE", "Active"),
        "system_disk": ecs_models.RunInstancesRequestSystemDisk(
            size=int(env("ALIYUN_SYSTEM_DISK_SIZE", "40")),
            category=env("ALIYUN_SYSTEM_DISK_CATEGORY", "cloud_efficiency"),
        ),
        "internet_charge_type": env("ALIYUN_INTERNET_CHARGE_TYPE", "PayByBandwidth"),
        "internet_max_bandwidth_out": int(env("ALIYUN_MAX_BW_OUT", "1")),
        "v_switch_id": env("ALIYUN_VSWITCH_ID"),
        "security_group_id": env("ALIYUN_SECURITY_GROUP_ID"),
        "image_options": ecs_models.RunInstancesRequestImageOptions(
            login_as_non_root=False,
        ),
        "instance_name": env("ALIYUN_INSTANCE_NAME", "launch-advisor-20251121"),
        "private_dns_name_options": ecs_models.RunInstancesRequestPrivateDnsNameOptions(
            hostname_type="Custom",
        ),
        "unique_suffix": False,
        "http_tokens": env("ALIYUN_HTTP_TOKENS", "optional"),
        "tenancy": "default",
        "affinity": "default",
        "amount": int(env("ALIYUN_AMOUNT", "20")),
        "min_amount": int(env("ALIYUN_MIN_AMOUNT", "20")),
        "auto_release_time": compute_auto_release_time(),
    }
    return ecs_models.RunInstancesRequest(**params)
def call_run_instances_api(ecs_client: EcsClient):
    """Submit the RunInstances request built by ``compose_run_instances_request``.

    Args:
        ecs_client: Client used to issue the call.

    Returns:
        The response from ``ecs_client.run_instances`` on success, or ``None``
        when the call raised. On failure the error's ``code`` and ``message``
        are printed (falling back to the exception type and text), followed by
        the ``Recommend`` entry of its ``data`` dict when one is present.
    """
    request = compose_run_instances_request()
    try:
        response = ecs_client.run_instances(request)
    except Exception as error:
        print(getattr(error, "code", str(type(error))))
        print(getattr(error, "message", str(error)))
        payload = getattr(error, "data", None)
        recommend = payload.get("Recommend") if isinstance(payload, dict) else None
        if recommend is not None:
            print(recommend)
        return None
    return response
def call_to_describe_instances(ecs_client: EcsClient, instance_ids):
    """Poll until every instance reports ``Running`` or the time budget runs out.

    One DescribeInstances call is made per polling interval
    (``INSTANCE_STATUS_CHECK_INTERVAL_MILLISECOND``). Instances seen as
    ``Running`` are dropped from the pending list. Polling stops once the list
    is empty or ``INSTANCE_STATUS_TOTAL_CHECK_TIME_ELAPSE_MILLISECOND`` has
    elapsed.

    The time budget is checked on every iteration, including iterations where
    the API call raised, so a persistently failing API cannot keep the loop
    running forever.

    Args:
        ecs_client: Client used to issue DescribeInstances.
        instance_ids: IDs of the instances to watch.

    Returns:
        None. The outcome is only printed.
    """
    start_time = int(time.time() * 1000)
    pending = list(instance_ids)
    region_id = os.getenv("ALIYUN_REGION_ID", "cn-shanghai")
    while True:
        time.sleep(INSTANCE_STATUS_CHECK_INTERVAL_MILLISECOND / 1000.0)
        req = ecs_models.DescribeInstancesRequest(
            region_id=region_id,
            instance_ids=json.dumps(pending),
            # Ask for the largest page so a single response can cover the whole
            # batch instead of only the API's default page size.
            page_size=100,
        )
        try:
            resp = ecs_client.describe_instances(req)
        except Exception as error:
            # Report and fall through to the timeout check below; a bare
            # `continue` here would skip it.
            print(getattr(error, "message", str(error)))
        else:
            instances = resp.body.instances.instance if hasattr(resp, "body") else resp.instances.instance
            for inst in instances:
                if getattr(inst, "status", None) != "Running":
                    continue
                iid = getattr(inst, "instance_id", None) or getattr(inst, "instanceId", None)
                if iid in pending:
                    pending.remove(iid)
                    print(f"Instance boot successfully: {iid}")
            if not pending:
                print("Instances all boot successfully.")
                return
        if int(time.time() * 1000) - start_time > INSTANCE_STATUS_TOTAL_CHECK_TIME_ELAPSE_MILLISECOND:
            print(f"Instances boot failed within {int(INSTANCE_STATUS_TOTAL_CHECK_TIME_ELAPSE_MILLISECOND/60000)} mins: {json.dumps(pending)}")
            return
def list_all_instance_ids(ecs_client: EcsClient) -> list:
    """Collect the IDs of all instances DescribeInstances returns for the region.

    Pages of 100 are requested starting at page 1 until a page comes back
    empty. If a call raises, the error is printed and the IDs gathered so far
    are returned. The listing stays best-effort, but the failure is no longer
    silent, since callers use an empty result to conclude the region is clean.

    Args:
        ecs_client: Client used to issue DescribeInstances.

    Returns:
        A list of instance ID strings (possibly partial after an API error).
    """
    region_id = os.getenv("ALIYUN_REGION_ID", "cn-shanghai")
    ids = []
    page = 1
    while True:
        req = ecs_models.DescribeInstancesRequest(region_id=region_id, page_size=100, page_number=page)
        try:
            resp = ecs_client.describe_instances(req)
        except Exception as error:
            print(f"DescribeInstances failed on page {page}: {getattr(error, 'message', str(error))}")
            break
        body = resp.body if hasattr(resp, "body") else resp
        # Tolerate a response whose `instances` container is absent or None.
        container = getattr(body, "instances", None)
        items = getattr(container, "instance", None) or []
        if not items:
            break
        for it in items:
            iid = getattr(it, "instance_id", None) or getattr(it, "instanceId", None)
            if iid:
                ids.append(iid)
        page += 1
    return ids
def wait_instances_status(ecs_client: EcsClient, instance_ids, target_status, timeout_seconds=600) -> bool:
    """Wait until every listed instance reports ``target_status``.

    DescribeInstances is called every 3 seconds for the instances still
    outstanding. A call that raises is ignored and retried on the next round.

    Args:
        ecs_client: Client used to issue DescribeInstances.
        instance_ids: IDs of the instances to watch.
        target_status: Status string each instance must reach.
        timeout_seconds: Maximum time to keep polling.

    Returns:
        True if no instance is left outstanding when polling stops (including
        when ``instance_ids`` is empty), False otherwise.
    """
    region = os.getenv("ALIYUN_REGION_ID", "cn-shanghai")
    give_up_at = time.time() + timeout_seconds
    outstanding = set(instance_ids)
    while time.time() < give_up_at and outstanding:
        time.sleep(3)
        request = ecs_models.DescribeInstancesRequest(
            region_id=region,
            instance_ids=json.dumps(list(outstanding)),
        )
        try:
            response = ecs_client.describe_instances(request)
        except Exception:
            continue
        payload = response.body if hasattr(response, "body") else response
        described = payload.instances.instance if hasattr(payload, "instances") else []
        # IDs of every described instance that has reached the target status.
        reached = {
            getattr(item, "instance_id", None) or getattr(item, "instanceId", None)
            for item in described
            if getattr(item, "status", None) == target_status
        }
        outstanding -= reached
    return not outstanding
def install_cloud_assistant_and_reboot(ecs_client: EcsClient, instance_ids) -> bool:
    """Request Cloud Assistant installation, reboot the instances, and wait.

    The steps run in order: InstallCloudAssistant, RebootInstances, then a wait
    of up to 600 seconds for every instance to report ``Running``. A failure
    in either API call is printed and ends the sequence.

    Args:
        ecs_client: Client used to issue the calls.
        instance_ids: IDs of the instances to prepare.

    Returns:
        False if ``instance_ids`` is empty or either API call raised;
        otherwise the result of the wait for ``Running``.
    """
    region = os.getenv("ALIYUN_REGION_ID", "cn-shanghai")
    if not instance_ids:
        return False

    try:
        ecs_client.install_cloud_assistant(
            ecs_models.InstallCloudAssistantRequest(region_id=region, instance_id=instance_ids)
        )
    except Exception as e:
        print(f"InstallCloudAssistant 调用失败:{e}")
        return False

    try:
        ecs_client.reboot_instances(
            ecs_models.RebootInstancesRequest(region_id=region, instance_id=instance_ids)
        )
    except Exception as e:
        print(f"RebootInstances 调用失败:{e}")
        return False

    return wait_instances_status(
        ecs_client,
        instance_ids,
        target_status="Running",
        timeout_seconds=600,
    )
def ensure_cloud_assistant_ready(ecs_client: EcsClient, instance_ids) -> bool:
    """Poll DescribeCloudAssistantStatus until every instance reports ready.

    The call is repeated every 5 seconds until the number of ready entries
    equals ``len(instance_ids)`` or the timeout from
    ``CLOUD_ASSISTANT_READY_TIMEOUT_SECONDS`` (default 600) expires. A call
    that raises is retried after the same 5-second pause.

    The status list is looked up under several attribute names. The
    pre-existing lookups are kept, and the names that match the published API
    response (``InstanceCloudAssistantStatusSet`` /
    ``CloudAssistantStatus``) are tried as well.

    Args:
        ecs_client: Client used to issue the call.
        instance_ids: IDs of the instances that must be ready.

    Returns:
        True once all instances are counted ready, False on timeout.
    """
    region_id = os.getenv("ALIYUN_REGION_ID", "cn-shanghai")
    deadline = time.time() + int(os.getenv("CLOUD_ASSISTANT_READY_TIMEOUT_SECONDS", "600"))
    while time.time() < deadline:
        try:
            req = ecs_models.DescribeCloudAssistantStatusRequest(region_id=region_id, instance_id=instance_ids)
            resp = ecs_client.describe_cloud_assistant_status(req)
        except Exception:
            time.sleep(5)
            continue
        body = resp.body if hasattr(resp, "body") else resp
        statuses = getattr(body, "instance_cloud_assistant_status", None)
        if statuses is None:
            # NOTE(review): `instance_cloud_assistant_status_set` mirrors the
            # API reference's InstanceCloudAssistantStatusSet container —
            # confirm the attribute name against the installed SDK.
            for container_name in ("instance_cloud_assistant_status_set", "cloud_assistant"):
                container = getattr(body, container_name, None)
                statuses = getattr(container, "instance_cloud_assistant_status", None)
                if statuses is not None:
                    break
        # NOTE(review): the API exposes paging parameters; confirm that a single
        # response carries an entry for every requested instance.
        ready_count = 0
        for s in statuses or []:
            st = (
                getattr(s, "cloud_assistant_status", None)
                or getattr(s, "status", None)
                or getattr(s, "Status", None)
            )
            is_ready = (st is True) or (isinstance(st, str) and st.lower() in ("true", "enabled", "running"))
            if is_ready:
                ready_count += 1
        if ready_count == len(instance_ids):
            return True
        time.sleep(5)
    return False
def clear_all_instances(ecs_client: EcsClient):
    """Stop and delete every instance that ``list_all_instance_ids`` returns.

    Sequence: list the instances, request a stop, wait up to 600 seconds for
    ``Stopped`` (the wait result is not checked), then force-delete. If the
    batch delete raises, each instance is deleted individually. After a
    5-second pause the instances are listed again and the outcome is printed.

    Args:
        ecs_client: Client used to issue the calls.

    Returns:
        None.
    """
    targets = list_all_instance_ids(ecs_client)
    region = os.getenv("ALIYUN_REGION_ID", "cn-shanghai")
    if not targets:
        print("当前地域无实例,无需清理")
        return

    print(f"准备清理 {len(targets)} 台实例:{json.dumps(targets)}")

    # A failed stop request is reported, but cleanup still proceeds.
    try:
        ecs_client.stop_instances(
            ecs_models.StopInstancesRequest(instance_id=targets, region_id=region)
        )
    except Exception as e:
        print(f"停止实例失败:{e}")
    wait_instances_status(ecs_client, targets, target_status="Stopped", timeout_seconds=600)

    try:
        ecs_client.delete_instances(
            ecs_models.DeleteInstancesRequest(instance_id=targets, force=True, region_id=region)
        )
    except Exception:
        # Batch delete failed: fall back to deleting one instance at a time.
        for iid in targets:
            try:
                ecs_client.delete_instance(
                    ecs_models.DeleteInstanceRequest(instance_id=iid, force=True, region_id=region)
                )
            except Exception as e:
                print(f"删除实例 {iid} 失败:{e}")

    # Pause before re-listing to see what is left.
    time.sleep(5)
    left = list_all_instance_ids(ecs_client)
    if left:
        print(f"仍有实例未删除:{json.dumps(left)}")
    else:
        print("实例已全部清理完毕")
def main():
    """Run the workflow: clean up, create, install Cloud Assistant, confirm ready.

    Existing instances are cleared first unless ``NO_CLEAR`` is ``"1"``. New
    instances are then created and polled for ``Running``, Cloud Assistant is
    installed with a reboot, and readiness is confirmed. The function returns
    early, after printing a message where one is defined, when creation,
    installation/reboot, or the readiness check fails.

    Returns:
        None. On success the created instance IDs are printed as JSON.
    """
    client = init_ecs_client()

    skip_cleanup = os.getenv("NO_CLEAR") == "1"
    if not skip_cleanup:
        clear_all_instances(client)

    response = call_run_instances_api(client)
    if response is None:
        return

    payload = getattr(response, "body", response)
    created_ids = payload.instance_id_sets.instance_id_set
    print(f"Success. Instance creation succeed. InstanceIds: {json.dumps(created_ids)}")

    # The boot poll only prints its outcome; the workflow continues either way.
    call_to_describe_instances(client, created_ids)

    if not install_cloud_assistant_and_reboot(client, created_ids):
        print("Cloud Assistant 安装或重启失败,终止")
        return

    if not ensure_cloud_assistant_ready(client, created_ids):
        print("Cloud Assistant 未全部就绪,终止")
        return

    print(json.dumps(created_ids))
# Run the provisioning workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()