降低iDRAC会话压力并开放重试配置

This commit is contained in:
2026-05-06 10:19:51 +08:00
parent 723240e925
commit 884ca887af
8 changed files with 163 additions and 38 deletions
+6
View File
@@ -2,3 +2,9 @@ HOST=192.168.1.100
USERNAME=root USERNAME=root
PASSWORD=your_idrac_password PASSWORD=your_idrac_password
FAN_SPEED_STEPS=50:20,55:25,60:30,65:40 FAN_SPEED_STEPS=50:20,55:25,60:30,65:40
CONTROL_INTERVAL_SECONDS=120
ERROR_INTERVAL_SECONDS=120
IPMI_RETRY_COUNT=5
IPMI_RETRY_DELAY_SECONDS=20
IPMI_TIMEOUT_SECONDS=60
USE_RAW_FAN_DUTY=false
+1
View File
@@ -10,6 +10,7 @@
- 清理 Python 缓存文件,避免将运行产物提交到仓库。 - 清理 Python 缓存文件,避免将运行产物提交到仓库。
- Docker 运行镜像切换到 Debian slim,只安装 `python3`、`ipmitool` 和时区数据,在降低体积的同时保留更好的IPMI兼容性。 - Docker 运行镜像切换到 Debian slim,只安装 `python3`、`ipmitool` 和时区数据,在降低体积的同时保留更好的IPMI兼容性。
- 新增 `FAN_SPEED_STEPS` 环境变量,允许用户通过 `.env` 自定义温度阈值和风扇转速档位。 - 新增 `FAN_SPEED_STEPS` 环境变量,允许用户通过 `.env` 自定义温度阈值和风扇转速档位。
- 新增轮询间隔、IPMI重试、命令超时和 raw 风扇占空比查询开关,默认减少 iDRAC8 的会话压力。
## Previous Improvements ## Previous Improvements
+16
View File
@@ -69,6 +69,12 @@ HOST=192.168.1.100
USERNAME=root USERNAME=root
PASSWORD=your_idrac_password PASSWORD=your_idrac_password
FAN_SPEED_STEPS=50:20,55:25,60:30,65:40 FAN_SPEED_STEPS=50:20,55:25,60:30,65:40
CONTROL_INTERVAL_SECONDS=120
ERROR_INTERVAL_SECONDS=120
IPMI_RETRY_COUNT=5
IPMI_RETRY_DELAY_SECONDS=20
IPMI_TIMEOUT_SECONDS=60
USE_RAW_FAN_DUTY=false
``` ```
Start the service / 启动服务: Start the service / 启动服务:
@@ -108,11 +114,21 @@ python3 start.py
| `USERNAME` | Yes | iDRAC username with IPMI permission. / 有 IPMI 权限的 iDRAC 用户名。 | | `USERNAME` | Yes | iDRAC username with IPMI permission. / 有 IPMI 权限的 iDRAC 用户名。 |
| `PASSWORD` | Yes | iDRAC password. / iDRAC 密码。 | | `PASSWORD` | Yes | iDRAC password. / iDRAC 密码。 |
| `FAN_SPEED_STEPS` | No | Temperature-to-speed rules. Default: `50:20,55:25,60:30,65:40`. / 温度和风扇转速规则,默认值:`50:20,55:25,60:30,65:40`。 | | `FAN_SPEED_STEPS` | No | Temperature-to-speed rules. Default: `50:20,55:25,60:30,65:40`. / 温度和风扇转速规则,默认值:`50:20,55:25,60:30,65:40`。 |
| `CONTROL_INTERVAL_SECONDS` | No | Normal control interval. Default: `120`. / 正常控制间隔,默认 `120` 秒。 |
| `ERROR_INTERVAL_SECONDS` | No | Wait time after a failed control cycle. Default: same as `CONTROL_INTERVAL_SECONDS`. / 控制周期失败后的等待时间,默认等于正常控制间隔。 |
| `IPMI_RETRY_COUNT` | No | Maximum number of attempts for each IPMI command, including the first try. Default: `5`. / 单条 IPMI 命令的最大尝试次数(含首次执行),默认 `5`。 |
| `IPMI_RETRY_DELAY_SECONDS` | No | Wait time between IPMI retries. Default: `20`. / IPMI 重试间隔,默认 `20` 秒。 |
| `IPMI_TIMEOUT_SECONDS` | No | Subprocess timeout for each IPMI command. Default: `60`. / 单次 IPMI 命令超时时间,默认 `60` 秒。 |
| `USE_RAW_FAN_DUTY` | No | Query raw fan duty before RPM estimation. Default: `false`. / 是否先用 raw 命令读取风扇占空比,默认 `false`。 |
The application does not include default credentials. Missing variables will stop startup with a clear error. The application does not include default credentials. Missing variables will stop startup with a clear error.
程序不内置默认地址、账号或密码。缺少环境变量时会直接停止并输出明确错误。 程序不内置默认地址、账号或密码。缺少环境变量时会直接停止并输出明确错误。
For iDRAC8 systems with unstable IPMI sessions, keep `CONTROL_INTERVAL_SECONDS` at `120` or higher. The default `USE_RAW_FAN_DUTY=false` avoids one extra IPMI session per cycle and estimates fan percentage from the RPM data already returned by `sdr`.
对于 IPMI 会话不稳定的 iDRAC8,建议 `CONTROL_INTERVAL_SECONDS` 保持 `120` 秒或更高。默认 `USE_RAW_FAN_DUTY=false` 会跳过额外的 raw 占空比查询,直接使用同一次 `sdr` 返回的 RPM 估算风扇百分比,减少 iDRAC 会话压力。
## Temperature Policy / 温控策略 ## Temperature Policy / 温控策略
The controller reads all temperature sensors and uses the highest value. The controller reads all temperature sensors and uses the highest value.
+21 -3
View File
@@ -15,13 +15,31 @@ DEFAULT_FAN_SPEED_STEPS = (
class FanController: class FanController:
# 初始化控制器并记录iDRAC连接信息 # 初始化控制器并记录iDRAC连接信息
def __init__(self, host: str, username: str, password: str, fan_speed_steps: str = None): def __init__(
self,
host: str,
username: str,
password: str,
fan_speed_steps: str = None,
ipmi_retry_count: int = 5,
ipmi_retry_delay: int = 20,
ipmi_timeout: int = 60,
use_raw_fan_duty: bool = False,
):
self.host = host self.host = host
self.username = username self.username = username
self.password = password self.password = password
self.ipmi = IpmiTool(self.host, self.username, self.password) self.ipmi = IpmiTool(
self.host,
self.username,
self.password,
retry_count=ipmi_retry_count,
retry_delay=ipmi_retry_delay,
timeout=ipmi_timeout,
)
self.fan_speed_steps = self.parse_fan_speed_steps(fan_speed_steps) self.fan_speed_steps = self.parse_fan_speed_steps(fan_speed_steps)
self.use_raw_fan_duty = use_raw_fan_duty
self.last_set_speed = None # 记录最后设置的风扇速度 self.last_set_speed = None # 记录最后设置的风扇速度
self.is_auto_mode = False # 记录当前是否为自动模式 self.is_auto_mode = False # 记录当前是否为自动模式
@@ -116,7 +134,7 @@ class FanController:
self.is_auto_mode = False self.is_auto_mode = False
# 获取当前风扇转速 # 获取当前风扇转速
current_speed = self.ipmi.get_fan_duty_cycle(sensor_data) current_speed = self.ipmi.get_fan_duty_cycle(sensor_data, use_raw=self.use_raw_fan_duty)
# 只有在当前转速与所需转速不同时才调整 # 只有在当前转速与所需转速不同时才调整
# 如果无法获取当前转速(返回-1),则检查是否已记录之前设置的速度 # 如果无法获取当前转速(返回-1),则检查是否已记录之前设置的速度
+36 -33
View File
@@ -7,31 +7,33 @@ from controller.logger import logger
# IPMI命令封装器:负责调用ipmitool读取传感器并设置Dell风扇 # IPMI命令封装器:负责调用ipmitool读取传感器并设置Dell风扇
class IpmiTool: class IpmiTool:
# 初始化iDRAC连接参数 # 初始化iDRAC连接参数
def __init__(self, host: str, username: str, password: str): def __init__(self, host: str, username: str, password: str, retry_count: int = 5, retry_delay: int = 20, timeout: int = 60):
if not host or not username or not password: if not host or not username or not password:
raise ValueError("host, username and password must be provided") raise ValueError("host, username and password must be provided")
self.host = host self.host = host
self.username = username self.username = username
self.password = password self.password = password
self.retry_count = retry_count
self.retry_delay = retry_delay
self.timeout = timeout
# 执行ipmitool命令并处理重试、超时和会话异常 # 执行ipmitool命令并处理重试、超时和会话异常
def run_cmd(self, cmd: str) -> str: def run_cmd(self, cmd: str) -> str:
basecmd = f'ipmitool -H {self.host} -I lanplus -U {self.username} -P {self.password}' basecmd = f'ipmitool -H {self.host} -I lanplus -U {self.username} -P {self.password}'
command = f'{basecmd} {cmd}' command = f'{basecmd} {cmd}'
retry_count = 5 # 增加重试次数以应对网络波动 for attempt in range(self.retry_count):
for attempt in range(retry_count):
try: try:
# print(f"Executing command: {command}") # 添加调试信息 # print(f"Executing command: {command}") # 添加调试信息
result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=60) # 增加超时时间 result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=self.timeout) # 控制单次命令最长等待时间
if result.returncode != 0: if result.returncode != 0:
# 部分ipmitool版本会把错误输出到stdout,或只返回退出码但stderr为空 # 部分ipmitool版本会把错误输出到stdout,或只返回退出码但stderr为空
error_msg = result.stderr.strip() or result.stdout.strip() or f'命令退出码: {result.returncode}' error_msg = result.stderr.strip() or result.stdout.strip() or f'命令退出码: {result.returncode}'
# 检查是否是网络连接问题 # 检查是否是网络连接问题
if "Unable to establish IPMI" in error_msg or "session" in error_msg: if "Unable to establish IPMI" in error_msg or "session" in error_msg:
logger.warning(f'IPMI会话建立失败 (尝试 {attempt + 1}/{retry_count}): {error_msg}') logger.warning(f'IPMI会话建立失败 (尝试 {attempt + 1}/{self.retry_count}): {error_msg}')
if attempt < retry_count - 1: if attempt < self.retry_count - 1:
time.sleep(10) # 网络问题需要更长的等待时间 time.sleep(self.retry_delay) # 网络问题需要更长的等待时间
continue continue
raise RuntimeError( raise RuntimeError(
f'IPMI 命令执行失败: {cmd}\n错误详情: {error_msg}' # 更清晰的错误提示 f'IPMI 命令执行失败: {cmd}\n错误详情: {error_msg}' # 更清晰的错误提示
@@ -39,16 +41,16 @@ class IpmiTool:
return result.stdout return result.stdout
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
logger.warning(f'命令超时 (尝试 {attempt + 1}/{retry_count})') logger.warning(f'命令超时 (尝试 {attempt + 1}/{self.retry_count})')
if attempt < retry_count - 1: if attempt < self.retry_count - 1:
logger.warning(f'正在重试... (尝试次数 {attempt + 1}/{retry_count})') logger.warning(f'正在重试... (尝试次数 {attempt + 1}/{self.retry_count})')
time.sleep(10) # 每次重试前等待更长时间 time.sleep(self.retry_delay) # 每次重试前等待更长时间
else: else:
raise RuntimeError('IPMI 命令超时。请检查网络连接或服务器状态。') # 更明确的错误提示 raise RuntimeError('IPMI 命令超时。请检查网络连接或服务器状态。') # 更明确的错误提示
except Exception as e: except Exception as e:
logger.warning(f'IPMI命令执行异常 (尝试 {attempt + 1}/{retry_count}): {str(e)}') logger.warning(f'IPMI命令执行异常 (尝试 {attempt + 1}/{self.retry_count}): {str(e)}')
if attempt < retry_count - 1: if attempt < self.retry_count - 1:
time.sleep(10) # 网络问题需要更长的等待时间 time.sleep(self.retry_delay) # 网络问题需要更长的等待时间
else: else:
raise e raise e
@@ -116,29 +118,30 @@ class IpmiTool:
return fan_speeds return fan_speeds
# 获取当前风扇占空比,raw命令不可用时用RPM估算 # 获取当前风扇占空比,raw命令不可用时用RPM估算
def get_fan_duty_cycle(self, sensor_data: str = None) -> int: def get_fan_duty_cycle(self, sensor_data: str = None, use_raw: bool = False) -> int:
""" """
获取当前风扇占空比/百分比 获取当前风扇占空比/百分比
:return: current fan duty cycle in percentage :return: current fan duty cycle in percentage
""" """
try: if use_raw:
# Raw command to get current fan duty cycle try:
result = self.run_cmd('raw 0x30 0x31 0x01') # Raw command to get current fan duty cycle
# Parse the hex result to get duty cycle result = self.run_cmd('raw 0x30 0x31 0x01')
result_parts = result.strip().split() # Parse the hex result to get duty cycle
if result_parts and len(result_parts) >= 1: result_parts = result.strip().split()
# The command should return a hex value representing the duty cycle if result_parts and len(result_parts) >= 1:
duty_cycle_hex = result_parts[-1] # The command should return a hex value representing the duty cycle
duty_cycle = int(duty_cycle_hex, 16) duty_cycle_hex = result_parts[-1]
# Ensure the value is in valid range (0-100) duty_cycle = int(duty_cycle_hex, 16)
if 0 <= duty_cycle <= 100 and duty_cycle != 0: # Ensure the value is in valid range (0-100)
# If we get a reasonable value (not 0), return it if 0 <= duty_cycle <= 100 and duty_cycle != 0:
return duty_cycle # If we get a reasonable value (not 0), return it
elif duty_cycle == 0: return duty_cycle
# Value of 0 might indicate auto mode or that raw command doesn't return duty cycle on this system elif duty_cycle == 0:
logger.info('原始命令返回0,尝试从RPM估算风扇百分比') # Value of 0 might indicate auto mode or that raw command doesn't return duty cycle on this system
except Exception as e: logger.info('原始命令返回0,尝试从RPM估算风扇百分比')
logger.warning(f'获取风扇占空比的原始命令失败: {e}') except Exception as e:
logger.warning(f'获取风扇占空比的原始命令失败: {e}')
# If raw command fails or returns 0, get fan speeds from sensor data and convert to approximate percentage # If raw command fails or returns 0, get fan speeds from sensor data and convert to approximate percentage
try: try:
+46 -2
View File
@@ -5,6 +5,40 @@ import traceback
from controller.client import FanController from controller.client import FanController
from controller.logger import logger from controller.logger import logger
def get_int_env(name: str, default: int, min_value: int = 1) -> int:
    """Read an integer setting from the process environment.

    Args:
        name: Name of the environment variable to look up.
        default: Value returned as-is when the variable is not set at all
            (it is not checked against ``min_value``).
        min_value: Smallest value accepted for a variable that is set.

    Returns:
        The parsed integer, or ``default`` when the variable is unset.

    Raises:
        RuntimeError: If the variable is set but is not a valid integer, or
            if the parsed value is below ``min_value``.
    """
    raw = os.environ.get(name)
    if raw is None:
        # Unset variable: fall back to the caller-supplied default untouched.
        return default

    try:
        number = int(raw)
    except ValueError as exc:
        # Keep the original ValueError attached as the cause for debugging.
        raise RuntimeError(f'{name} 必须是整数') from exc

    if number >= min_value:
        return number
    raise RuntimeError(f'{name} 必须大于等于 {min_value}')
def get_bool_env(name: str, default: bool = False) -> bool:
    """Read a boolean setting from the process environment.

    Accepted spellings (case-insensitive, surrounding whitespace ignored):
    ``1``/``true``/``yes``/``on`` for True and ``0``/``false``/``no``/``off``
    for False.

    Args:
        name: Name of the environment variable to look up.
        default: Value returned when the variable is not set at all.

    Returns:
        The parsed flag, or ``default`` when the variable is unset.

    Raises:
        RuntimeError: If the variable is set to an unrecognised spelling.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default

    # Single lookup table instead of two membership tests.
    spellings = {
        '1': True, 'true': True, 'yes': True, 'on': True,
        '0': False, 'false': False, 'no': False, 'off': False,
    }
    flag = spellings.get(raw.strip().lower())
    if flag is None:
        raise RuntimeError(f'{name} 必须是布尔值: true/false')
    return flag
if __name__ == '__main__': if __name__ == '__main__':
# 从环境变量读取iDRAC连接信息,开源版本不内置任何真实默认凭据 # 从环境变量读取iDRAC连接信息,开源版本不内置任何真实默认凭据
@@ -15,6 +49,12 @@ if __name__ == '__main__':
fan_speed_steps = os.getenv('FAN_SPEED_STEPS') fan_speed_steps = os.getenv('FAN_SPEED_STEPS')
if fan_speed_steps is None: if fan_speed_steps is None:
fan_speed_steps = os.getenv('FAN_SPEED_RULES') fan_speed_steps = os.getenv('FAN_SPEED_RULES')
control_interval = get_int_env('CONTROL_INTERVAL_SECONDS', 120)
error_interval = get_int_env('ERROR_INTERVAL_SECONDS', control_interval)
ipmi_retry_count = get_int_env('IPMI_RETRY_COUNT', 5)
ipmi_retry_delay = get_int_env('IPMI_RETRY_DELAY_SECONDS', 20)
ipmi_timeout = get_int_env('IPMI_TIMEOUT_SECONDS', 60)
use_raw_fan_duty = get_bool_env('USE_RAW_FAN_DUTY', False)
if not host: if not host:
raise RuntimeError('未设置 HOST 环境变量') raise RuntimeError('未设置 HOST 环境变量')
@@ -30,16 +70,20 @@ if __name__ == '__main__':
username=username, username=username,
password=password, password=password,
fan_speed_steps=fan_speed_steps, fan_speed_steps=fan_speed_steps,
ipmi_retry_count=ipmi_retry_count,
ipmi_retry_delay=ipmi_retry_delay,
ipmi_timeout=ipmi_timeout,
use_raw_fan_duty=use_raw_fan_duty,
) )
while True: while True:
try: try:
# 执行一次温度读取和风扇控制周期 # 执行一次温度读取和风扇控制周期
client.run() client.run()
time.sleep(60) time.sleep(control_interval)
except Exception as err: except Exception as err:
logger.error( logger.error(
f'运行控制器失败 {err}. {traceback.format_exc()}' f'运行控制器失败 {err}. {traceback.format_exc()}'
) )
# iDRAC会话异常时等待下一轮,避免连续请求压垮IPMI服务 # iDRAC会话异常时等待下一轮,避免连续请求压垮IPMI服务
time.sleep(60) time.sleep(error_interval)
+5
View File
@@ -49,6 +49,11 @@ class FanControllerConfigTest(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
self.make_controller(invalid_value) self.make_controller(invalid_value)
def test_raw_fan_duty_query_is_disabled_by_default(self):
    # A controller built by make_controller() with no arguments must leave
    # the raw duty-cycle query switched off.
    self.assertFalse(self.make_controller().use_raw_fan_duty)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
+32
View File
@@ -0,0 +1,32 @@
import os
import unittest
from unittest.mock import patch
from start import get_bool_env, get_int_env
class StartConfigTest(unittest.TestCase):
    """Checks for the environment-variable parsers imported from ``start``."""

    # Each test replaces os.environ with a controlled mapping (clear=True) so
    # the host environment cannot influence the outcome.

    @patch.dict(os.environ, {}, clear=True)
    def test_get_int_env_uses_default_when_missing(self):
        result = get_int_env('CONTROL_INTERVAL_SECONDS', 120)
        self.assertEqual(result, 120)

    @patch.dict(os.environ, {'CONTROL_INTERVAL_SECONDS': 'abc'}, clear=True)
    def test_get_int_env_rejects_invalid_value(self):
        self.assertRaises(
            RuntimeError, get_int_env, 'CONTROL_INTERVAL_SECONDS', 120
        )

    def test_get_bool_env_parses_supported_values(self):
        # Pair each raw spelling with the assertion its parsed value must pass.
        expectations = (
            ('true', self.assertTrue),
            ('false', self.assertFalse),
        )
        for raw_text, check in expectations:
            with patch.dict(os.environ, {'USE_RAW_FAN_DUTY': raw_text}, clear=True):
                check(get_bool_env('USE_RAW_FAN_DUTY'))

    @patch.dict(os.environ, {'USE_RAW_FAN_DUTY': 'maybe'}, clear=True)
    def test_get_bool_env_rejects_invalid_value(self):
        self.assertRaises(RuntimeError, get_bool_env, 'USE_RAW_FAN_DUTY')
if __name__ == '__main__':
unittest.main()