mirror of
https://github.com/lkddi/dell-fans-controller-docker.git
synced 2026-05-18 21:57:29 +08:00
优化IPMI失败冷却和手动模式切换
This commit is contained in:
@@ -4,6 +4,7 @@ PASSWORD=your_idrac_password
|
|||||||
FAN_SPEED_STEPS=50:20,55:25,60:30,65:40
|
FAN_SPEED_STEPS=50:20,55:25,60:30,65:40
|
||||||
CONTROL_INTERVAL_SECONDS=120
|
CONTROL_INTERVAL_SECONDS=120
|
||||||
ERROR_INTERVAL_SECONDS=120
|
ERROR_INTERVAL_SECONDS=120
|
||||||
|
IPMI_FAILURE_BACKOFF_SECONDS=300
|
||||||
IPMI_RETRY_COUNT=5
|
IPMI_RETRY_COUNT=5
|
||||||
IPMI_RETRY_DELAY_SECONDS=20
|
IPMI_RETRY_DELAY_SECONDS=20
|
||||||
IPMI_TIMEOUT_SECONDS=60
|
IPMI_TIMEOUT_SECONDS=60
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
- Docker 运行镜像切换到 Debian slim,只安装 `python3`、`ipmitool` 和时区数据,在降低体积的同时保留更好的IPMI兼容性。
|
- Docker 运行镜像切换到 Debian slim,只安装 `python3`、`ipmitool` 和时区数据,在降低体积的同时保留更好的IPMI兼容性。
|
||||||
- 新增 `FAN_SPEED_STEPS` 环境变量,允许用户通过 `.env` 自定义温度阈值和风扇转速档位。
|
- 新增 `FAN_SPEED_STEPS` 环境变量,允许用户通过 `.env` 自定义温度阈值和风扇转速档位。
|
||||||
- 新增轮询间隔、IPMI重试、命令超时和 raw 风扇占空比查询开关,默认减少 iDRAC8 的会话压力。
|
- 新增轮询间隔、IPMI重试、命令超时和 raw 风扇占空比查询开关,默认减少 iDRAC8 的会话压力。
|
||||||
|
- 单轮 IPMI 重试全部失败时改为冷却跳过,并避免重复发送手动模式切换 raw 命令。
|
||||||
|
|
||||||
## Previous Improvements
|
## Previous Improvements
|
||||||
|
|
||||||
|
|||||||
@@ -75,6 +75,7 @@ PASSWORD=your_idrac_password
|
|||||||
FAN_SPEED_STEPS=50:20,55:25,60:30,65:40
|
FAN_SPEED_STEPS=50:20,55:25,60:30,65:40
|
||||||
CONTROL_INTERVAL_SECONDS=120
|
CONTROL_INTERVAL_SECONDS=120
|
||||||
ERROR_INTERVAL_SECONDS=120
|
ERROR_INTERVAL_SECONDS=120
|
||||||
|
IPMI_FAILURE_BACKOFF_SECONDS=300
|
||||||
IPMI_RETRY_COUNT=5
|
IPMI_RETRY_COUNT=5
|
||||||
IPMI_RETRY_DELAY_SECONDS=20
|
IPMI_RETRY_DELAY_SECONDS=20
|
||||||
IPMI_TIMEOUT_SECONDS=60
|
IPMI_TIMEOUT_SECONDS=60
|
||||||
@@ -120,6 +121,7 @@ python3 start.py
|
|||||||
| `FAN_SPEED_STEPS` | No | Temperature-to-speed rules. Default: `50:20,55:25,60:30,65:40`. / 温度和风扇转速规则,默认值:`50:20,55:25,60:30,65:40`。 |
|
| `FAN_SPEED_STEPS` | No | Temperature-to-speed rules. Default: `50:20,55:25,60:30,65:40`. / 温度和风扇转速规则,默认值:`50:20,55:25,60:30,65:40`。 |
|
||||||
| `CONTROL_INTERVAL_SECONDS` | No | Normal control interval. Default: `120`. / 正常控制间隔,默认 `120` 秒。 |
|
| `CONTROL_INTERVAL_SECONDS` | No | Normal control interval. Default: `120`. / 正常控制间隔,默认 `120` 秒。 |
|
||||||
| `ERROR_INTERVAL_SECONDS` | No | Wait time after a failed control cycle. Default: same as `CONTROL_INTERVAL_SECONDS`. / 控制周期失败后的等待时间,默认等于正常控制间隔。 |
|
| `ERROR_INTERVAL_SECONDS` | No | Wait time after a failed control cycle. Default: same as `CONTROL_INTERVAL_SECONDS`. / 控制周期失败后的等待时间,默认等于正常控制间隔。 |
|
||||||
|
| `IPMI_FAILURE_BACKOFF_SECONDS` | No | Cooldown after all IPMI retries fail. Default: `300`. / 单轮 IPMI 重试全部失败后的冷却时间,默认 `300` 秒。 |
|
||||||
| `IPMI_RETRY_COUNT` | No | Retry count for each IPMI command. Default: `5`. / 单条 IPMI 命令重试次数,默认 `5`。 |
|
| `IPMI_RETRY_COUNT` | No | Retry count for each IPMI command. Default: `5`. / 单条 IPMI 命令重试次数,默认 `5`。 |
|
||||||
| `IPMI_RETRY_DELAY_SECONDS` | No | Wait time between IPMI retries. Default: `20`. / IPMI 重试间隔,默认 `20` 秒。 |
|
| `IPMI_RETRY_DELAY_SECONDS` | No | Wait time between IPMI retries. Default: `20`. / IPMI 重试间隔,默认 `20` 秒。 |
|
||||||
| `IPMI_TIMEOUT_SECONDS` | No | Subprocess timeout for each IPMI command. Default: `60`. / 单次 IPMI 命令超时时间,默认 `60` 秒。 |
|
| `IPMI_TIMEOUT_SECONDS` | No | Subprocess timeout for each IPMI command. Default: `60`. / 单次 IPMI 命令超时时间,默认 `60` 秒。 |
|
||||||
@@ -182,7 +184,9 @@ ipmitool -H 192.168.1.100 -I lanplus -U root -P your_idrac_password sdr
|
|||||||
|
|
||||||
Common issues / 常见问题:
|
Common issues / 常见问题:
|
||||||
|
|
||||||
- `Unable to establish IPMI v2 / RMCP+ session`: iDRAC IPMI service may be busy or unstable. Check network latency, duplicate monitoring scripts, and consider resetting iDRAC with `mc reset cold`.
|
- `Unable to establish IPMI v2 / RMCP+ session`: iDRAC IPMI service may be busy or unstable. Occasional retries are expected on some iDRAC8 systems. If all retries fail, the controller skips that cycle and waits for `IPMI_FAILURE_BACKOFF_SECONDS`.
|
||||||
|
- `Unable to establish IPMI v2 / RMCP+ session`:iDRAC IPMI 服务可能繁忙或不稳定。部分 iDRAC8 偶发重试是正常现象;如果单轮重试全部失败,控制器会跳过本轮并按 `IPMI_FAILURE_BACKOFF_SECONDS` 冷却等待。
|
||||||
|
- Frequent IPMI session failures / 频繁 IPMI 会话失败: Check network latency, look for duplicate monitoring scripts or duplicate containers, and consider resetting iDRAC with `mc reset cold`.
|
||||||
- Connection failed / 连接失败:确认容器主机能访问 iDRAC 管理 IP。
|
- Connection failed / 连接失败:确认容器主机能访问 iDRAC 管理 IP。
|
||||||
- Authentication failed / 认证失败:确认用户名、密码和 IPMI 权限。
|
- Authentication failed / 认证失败:确认用户名、密码和 IPMI 权限。
|
||||||
- Permission denied / 权限不足:建议使用专用 iDRAC 用户,并授予 IPMI 控制权限。
|
- Permission denied / 权限不足:建议使用专用 iDRAC 用户,并授予 IPMI 控制权限。
|
||||||
|
|||||||
@@ -42,6 +42,7 @@ class FanController:
|
|||||||
self.use_raw_fan_duty = use_raw_fan_duty
|
self.use_raw_fan_duty = use_raw_fan_duty
|
||||||
self.last_set_speed = None # 记录最后设置的风扇速度
|
self.last_set_speed = None # 记录最后设置的风扇速度
|
||||||
self.is_auto_mode = False # 记录当前是否为自动模式
|
self.is_auto_mode = False # 记录当前是否为自动模式
|
||||||
|
self.is_manual_mode = False # 记录当前是否已切换到手动模式
|
||||||
|
|
||||||
# 解析温控规则配置,格式为 "50:20,55:25,60:30,65:40"
|
# 解析温控规则配置,格式为 "50:20,55:25,60:30,65:40"
|
||||||
def parse_fan_speed_steps(self, steps: str) -> tuple:
|
def parse_fan_speed_steps(self, steps: str) -> tuple:
|
||||||
@@ -92,7 +93,12 @@ class FanController:
|
|||||||
# 设置手动风扇速度
|
# 设置手动风扇速度
|
||||||
def set_fan_speed(self, speed: int):
|
def set_fan_speed(self, speed: int):
|
||||||
logger.info(f'设置风扇速度: {speed}%')
|
logger.info(f'设置风扇速度: {speed}%')
|
||||||
self.ipmi.set_fan_speed(speed)
|
if not self.is_manual_mode:
|
||||||
|
# 首次进入手动风扇控制时才切换模式,避免每次调速都多发一次raw命令
|
||||||
|
self.ipmi.switch_fan_mode(auto=False)
|
||||||
|
self.is_manual_mode = True
|
||||||
|
|
||||||
|
self.ipmi.set_fan_speed(speed, ensure_manual=False)
|
||||||
|
|
||||||
# 根据最高温度计算目标风扇转速
|
# 根据最高温度计算目标风扇转速
|
||||||
def get_required_fan_speed(self, temperature: int) -> int:
|
def get_required_fan_speed(self, temperature: int) -> int:
|
||||||
@@ -122,6 +128,7 @@ class FanController:
|
|||||||
logger.info(f'切换风扇为自动模式')
|
logger.info(f'切换风扇为自动模式')
|
||||||
self.ipmi.switch_fan_mode(auto=True)
|
self.ipmi.switch_fan_mode(auto=True)
|
||||||
self.is_auto_mode = True
|
self.is_auto_mode = True
|
||||||
|
self.is_manual_mode = False
|
||||||
self.last_set_speed = None # 重置手动设置的速度
|
self.last_set_speed = None # 重置手动设置的速度
|
||||||
else:
|
else:
|
||||||
logger.info(f'当前已是自动模式,无需操作')
|
logger.info(f'当前已是自动模式,无需操作')
|
||||||
@@ -132,6 +139,7 @@ class FanController:
|
|||||||
logger.info(f'从自动模式切换到手动模式')
|
logger.info(f'从自动模式切换到手动模式')
|
||||||
self.ipmi.switch_fan_mode(auto=False)
|
self.ipmi.switch_fan_mode(auto=False)
|
||||||
self.is_auto_mode = False
|
self.is_auto_mode = False
|
||||||
|
self.is_manual_mode = True
|
||||||
|
|
||||||
# 获取当前风扇转速
|
# 获取当前风扇转速
|
||||||
current_speed = self.ipmi.get_fan_duty_cycle(sensor_data, use_raw=self.use_raw_fan_duty)
|
current_speed = self.ipmi.get_fan_duty_cycle(sensor_data, use_raw=self.use_raw_fan_duty)
|
||||||
|
|||||||
+4
-1
@@ -198,10 +198,11 @@ class IpmiTool:
|
|||||||
return self.run_cmd(cmd=auto_cmd) if auto else self.run_cmd(cmd=manual_cmd)
|
return self.run_cmd(cmd=auto_cmd) if auto else self.run_cmd(cmd=manual_cmd)
|
||||||
|
|
||||||
# 设置手动风扇速度百分比
|
# 设置手动风扇速度百分比
|
||||||
def set_fan_speed(self, speed: int):
|
def set_fan_speed(self, speed: int, ensure_manual: bool = True):
|
||||||
"""
|
"""
|
||||||
设置风扇速度
|
设置风扇速度
|
||||||
:param speed:
|
:param speed:
|
||||||
|
:param ensure_manual:
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
if speed < 10 or speed > 100:
|
if speed < 10 or speed > 100:
|
||||||
@@ -209,6 +210,8 @@ class IpmiTool:
|
|||||||
'speed must be between 10 and 100'
|
'speed must be between 10 and 100'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if ensure_manual:
|
||||||
self.switch_fan_mode(auto=False)
|
self.switch_fan_mode(auto=False)
|
||||||
|
|
||||||
base_cmd = 'raw 0x30 0x30 0x02 0xff'
|
base_cmd = 'raw 0x30 0x30 0x02 0xff'
|
||||||
return self.run_cmd(cmd=f'{base_cmd} {hex(speed)}')
|
return self.run_cmd(cmd=f'{base_cmd} {hex(speed)}')
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import traceback
|
|
||||||
|
|
||||||
from controller.client import FanController
|
from controller.client import FanController
|
||||||
from controller.logger import logger
|
from controller.logger import logger
|
||||||
@@ -51,6 +50,7 @@ if __name__ == '__main__':
|
|||||||
fan_speed_steps = os.getenv('FAN_SPEED_RULES')
|
fan_speed_steps = os.getenv('FAN_SPEED_RULES')
|
||||||
control_interval = get_int_env('CONTROL_INTERVAL_SECONDS', 120)
|
control_interval = get_int_env('CONTROL_INTERVAL_SECONDS', 120)
|
||||||
error_interval = get_int_env('ERROR_INTERVAL_SECONDS', control_interval)
|
error_interval = get_int_env('ERROR_INTERVAL_SECONDS', control_interval)
|
||||||
|
ipmi_failure_backoff = get_int_env('IPMI_FAILURE_BACKOFF_SECONDS', 300)
|
||||||
ipmi_retry_count = get_int_env('IPMI_RETRY_COUNT', 5)
|
ipmi_retry_count = get_int_env('IPMI_RETRY_COUNT', 5)
|
||||||
ipmi_retry_delay = get_int_env('IPMI_RETRY_DELAY_SECONDS', 20)
|
ipmi_retry_delay = get_int_env('IPMI_RETRY_DELAY_SECONDS', 20)
|
||||||
ipmi_timeout = get_int_env('IPMI_TIMEOUT_SECONDS', 60)
|
ipmi_timeout = get_int_env('IPMI_TIMEOUT_SECONDS', 60)
|
||||||
@@ -81,9 +81,13 @@ if __name__ == '__main__':
|
|||||||
# 执行一次温度读取和风扇控制周期
|
# 执行一次温度读取和风扇控制周期
|
||||||
client.run()
|
client.run()
|
||||||
time.sleep(control_interval)
|
time.sleep(control_interval)
|
||||||
except Exception as err:
|
except RuntimeError as err:
|
||||||
logger.error(
|
logger.warning(
|
||||||
f'运行控制器失败 {err}. {traceback.format_exc()}'
|
f'本轮IPMI控制失败,跳过本轮并等待 {ipmi_failure_backoff} 秒: {err}'
|
||||||
)
|
)
|
||||||
|
# 连续会话失败时给iDRAC更长恢复窗口,避免马上进入下一轮重试
|
||||||
|
time.sleep(ipmi_failure_backoff)
|
||||||
|
except Exception as err:
|
||||||
|
logger.error(f'运行控制器失败 {err}', exc_info=True)
|
||||||
# iDRAC会话异常时等待下一轮,避免连续请求压垮IPMI服务
|
# iDRAC会话异常时等待下一轮,避免连续请求压垮IPMI服务
|
||||||
time.sleep(error_interval)
|
time.sleep(error_interval)
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import unittest
|
import unittest
|
||||||
|
from unittest.mock import Mock
|
||||||
|
|
||||||
from controller.client import FanController
|
from controller.client import FanController
|
||||||
|
|
||||||
@@ -54,6 +55,16 @@ class FanControllerConfigTest(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertFalse(controller.use_raw_fan_duty)
|
self.assertFalse(controller.use_raw_fan_duty)
|
||||||
|
|
||||||
|
def test_set_fan_speed_switches_manual_mode_only_once(self):
|
||||||
|
controller = self.make_controller()
|
||||||
|
controller.ipmi = Mock()
|
||||||
|
|
||||||
|
controller.set_fan_speed(30)
|
||||||
|
controller.set_fan_speed(40)
|
||||||
|
|
||||||
|
controller.ipmi.switch_fan_mode.assert_called_once_with(auto=False)
|
||||||
|
self.assertEqual(controller.ipmi.set_fan_speed.call_count, 2)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user