优化IPMI失败冷却和手动模式切换

This commit is contained in:
2026-05-06 10:28:05 +08:00
parent c6a49b5f14
commit 844eed34f1
7 changed files with 40 additions and 8 deletions
+1
View File
@@ -4,6 +4,7 @@ PASSWORD=your_idrac_password
FAN_SPEED_STEPS=50:20,55:25,60:30,65:40
CONTROL_INTERVAL_SECONDS=120
ERROR_INTERVAL_SECONDS=120
IPMI_FAILURE_BACKOFF_SECONDS=300
IPMI_RETRY_COUNT=5
IPMI_RETRY_DELAY_SECONDS=20
IPMI_TIMEOUT_SECONDS=60
+1
View File
@@ -12,6 +12,7 @@
- Docker 运行镜像切换到 Debian slim,只安装 `python3`、`ipmitool` 和时区数据,在降低体积的同时保留更好的IPMI兼容性。
- 新增 `FAN_SPEED_STEPS` 环境变量,允许用户通过 `.env` 自定义温度阈值和风扇转速档位。
- 新增轮询间隔、IPMI重试、命令超时和 raw 风扇占空比查询开关,默认减少 iDRAC8 的会话压力。
- 单轮 IPMI 重试全部失败时改为冷却跳过,并避免重复发送手动模式切换 raw 命令。
## Previous Improvements
+5 -1
View File
@@ -75,6 +75,7 @@ PASSWORD=your_idrac_password
FAN_SPEED_STEPS=50:20,55:25,60:30,65:40
CONTROL_INTERVAL_SECONDS=120
ERROR_INTERVAL_SECONDS=120
IPMI_FAILURE_BACKOFF_SECONDS=300
IPMI_RETRY_COUNT=5
IPMI_RETRY_DELAY_SECONDS=20
IPMI_TIMEOUT_SECONDS=60
@@ -120,6 +121,7 @@ python3 start.py
| `FAN_SPEED_STEPS` | No | Temperature-to-speed rules. Default: `50:20,55:25,60:30,65:40`. / 温度和风扇转速规则,默认值:`50:20,55:25,60:30,65:40`。 |
| `CONTROL_INTERVAL_SECONDS` | No | Normal control interval. Default: `120`. / 正常控制间隔,默认 `120` 秒。 |
| `ERROR_INTERVAL_SECONDS` | No | Wait time after a failed control cycle. Default: same as `CONTROL_INTERVAL_SECONDS`. / 控制周期失败后的等待时间,默认等于正常控制间隔。 |
| `IPMI_FAILURE_BACKOFF_SECONDS` | No | Cooldown after all IPMI retries fail. Default: `300`. / 单轮 IPMI 重试全部失败后的冷却时间,默认 `300` 秒。 |
| `IPMI_RETRY_COUNT` | No | Retry count for each IPMI command. Default: `5`. / 单条 IPMI 命令重试次数,默认 `5`。 |
| `IPMI_RETRY_DELAY_SECONDS` | No | Wait time between IPMI retries. Default: `20`. / IPMI 重试间隔,默认 `20` 秒。 |
| `IPMI_TIMEOUT_SECONDS` | No | Subprocess timeout for each IPMI command. Default: `60`. / 单次 IPMI 命令超时时间,默认 `60` 秒。 |
@@ -182,7 +184,9 @@ ipmitool -H 192.168.1.100 -I lanplus -U root -P your_idrac_password sdr
Common issues / 常见问题:
- `Unable to establish IPMI v2 / RMCP+ session`: iDRAC IPMI service may be busy or unstable. Check network latency, duplicate monitoring scripts, and consider resetting iDRAC with `mc reset cold`.
- `Unable to establish IPMI v2 / RMCP+ session`: iDRAC IPMI service may be busy or unstable. Occasional retries are expected on some iDRAC8 systems. If all retries fail, the controller skips that cycle and waits for `IPMI_FAILURE_BACKOFF_SECONDS`.
- `Unable to establish IPMI v2 / RMCP+ session`:iDRAC IPMI 服务可能繁忙或不稳定。部分 iDRAC8 偶发重试是正常现象;如果单轮重试全部失败,控制器会跳过本轮并按 `IPMI_FAILURE_BACKOFF_SECONDS` 冷却等待。
- Frequent IPMI session failures / 频繁 IPMI 会话失败:check network latency, duplicate monitoring scripts or duplicate containers, and consider resetting iDRAC with `mc reset cold`.
- Connection failed / 连接失败:确认容器主机能访问 iDRAC 管理 IP。
- Authentication failed / 认证失败:确认用户名、密码和 IPMI 权限。
- Permission denied / 权限不足:建议使用专用 iDRAC 用户,并授予 IPMI 控制权限。
+9 -1
View File
@@ -42,6 +42,7 @@ class FanController:
self.use_raw_fan_duty = use_raw_fan_duty
self.last_set_speed = None # 记录最后设置的风扇速度
self.is_auto_mode = False # 记录当前是否为自动模式
self.is_manual_mode = False # 记录当前是否已切换到手动模式
# 解析温控规则配置,格式为 "50:20,55:25,60:30,65:40"
def parse_fan_speed_steps(self, steps: str) -> tuple:
@@ -92,7 +93,12 @@ class FanController:
# 设置手动风扇速度
def set_fan_speed(self, speed: int):
    """
    Apply a manual fan speed, switching to manual mode only when needed.

    The manual-mode raw command is sent only while ``self.is_manual_mode``
    is False; afterwards only the speed command is issued, so a steady
    stream of speed changes costs one IPMI command per call.

    :param speed: target fan speed percentage, forwarded unchanged to
        ``self.ipmi.set_fan_speed``
    :raises Exception: whatever the IPMI client raises is propagated
        unchanged to the caller
    """
    logger.info(f'设置风扇速度: {speed}%')
    if not self.is_manual_mode:
        # Switch modes only on first entry into manual control, so that
        # each speed change does not cost an extra raw command.
        self.ipmi.switch_fan_mode(auto=False)
        self.is_manual_mode = True
    try:
        # Manual mode is tracked here, so the client must not switch again.
        self.ipmi.set_fan_speed(speed, ensure_manual=False)
    except Exception:
        # After a failed command the controller-side fan mode is unknown;
        # drop the cached flag so the next attempt re-asserts manual mode.
        self.is_manual_mode = False
        raise
# 根据最高温度计算目标风扇转速
def get_required_fan_speed(self, temperature: int) -> int:
@@ -122,6 +128,7 @@ class FanController:
logger.info(f'切换风扇为自动模式')
self.ipmi.switch_fan_mode(auto=True)
self.is_auto_mode = True
self.is_manual_mode = False
self.last_set_speed = None # 重置手动设置的速度
else:
logger.info(f'当前已是自动模式,无需操作')
@@ -132,6 +139,7 @@ class FanController:
logger.info(f'从自动模式切换到手动模式')
self.ipmi.switch_fan_mode(auto=False)
self.is_auto_mode = False
self.is_manual_mode = True
# 获取当前风扇转速
current_speed = self.ipmi.get_fan_duty_cycle(sensor_data, use_raw=self.use_raw_fan_duty)
+4 -1
View File
@@ -198,10 +198,11 @@ class IpmiTool:
return self.run_cmd(cmd=auto_cmd) if auto else self.run_cmd(cmd=manual_cmd)
# 设置手动风扇速度百分比
def set_fan_speed(self, speed: int):
def set_fan_speed(self, speed: int, ensure_manual: bool = True):
"""
设置风扇速度
:param speed:
:param ensure_manual:
:return:
"""
if speed < 10 or speed > 100:
@@ -209,6 +210,8 @@ class IpmiTool:
'speed must be between 10 and 100'
)
if ensure_manual:
self.switch_fan_mode(auto=False)
base_cmd = 'raw 0x30 0x30 0x02 0xff'
return self.run_cmd(cmd=f'{base_cmd} {hex(speed)}')
+8 -4
View File
@@ -1,6 +1,5 @@
import os
import time
import traceback
from controller.client import FanController
from controller.logger import logger
@@ -51,6 +50,7 @@ if __name__ == '__main__':
# Read the documented FAN_SPEED_STEPS variable first; FAN_SPEED_RULES is kept
# only as a fallback for deployments that already set the old name.
fan_speed_steps = os.getenv('FAN_SPEED_STEPS', os.getenv('FAN_SPEED_RULES'))
# Normal polling interval between control cycles, in seconds.
control_interval = get_int_env('CONTROL_INTERVAL_SECONDS', 120)
# Wait after a non-IPMI failure; defaults to the normal control interval.
error_interval = get_int_env('ERROR_INTERVAL_SECONDS', control_interval)
# Cooldown applied when a control cycle fails with an IPMI RuntimeError.
ipmi_failure_backoff = get_int_env('IPMI_FAILURE_BACKOFF_SECONDS', 300)
# Per-command retry count, delay between retries, and subprocess timeout.
ipmi_retry_count = get_int_env('IPMI_RETRY_COUNT', 5)
ipmi_retry_delay = get_int_env('IPMI_RETRY_DELAY_SECONDS', 20)
ipmi_timeout = get_int_env('IPMI_TIMEOUT_SECONDS', 60)
@@ -81,9 +81,13 @@ if __name__ == '__main__':
# 执行一次温度读取和风扇控制周期
client.run()
time.sleep(control_interval)
except Exception as err:
logger.error(
f'运行控制失败 {err}. {traceback.format_exc()}'
except RuntimeError as err:
logger.warning(
f'本轮IPMI控制失败,跳过本轮并等待 {ipmi_failure_backoff} 秒: {err}'
)
# 连续会话失败时给iDRAC更长恢复窗口,避免马上进入下一轮重试
time.sleep(ipmi_failure_backoff)
except Exception as err:
logger.error(f'运行控制器失败 {err}', exc_info=True)
# iDRAC会话异常时等待下一轮,避免连续请求压垮IPMI服务
time.sleep(error_interval)
+11
View File
@@ -1,4 +1,5 @@
import unittest
from unittest.mock import Mock
from controller.client import FanController
@@ -54,6 +55,16 @@ class FanControllerConfigTest(unittest.TestCase):
self.assertFalse(controller.use_raw_fan_duty)
def test_set_fan_speed_switches_manual_mode_only_once(self):
    """Two consecutive speed changes must switch to manual mode exactly once."""
    controller = self.make_controller()
    fake_ipmi = Mock()
    controller.ipmi = fake_ipmi

    # Issue two speed changes back to back through the controller.
    for requested_speed in (30, 40):
        controller.set_fan_speed(requested_speed)

    # One mode switch in total, but one speed command per request.
    fake_ipmi.switch_fan_mode.assert_called_once_with(auto=False)
    self.assertEqual(fake_ipmi.set_fan_speed.call_count, 2)
if __name__ == '__main__':
unittest.main()