diff --git a/.env.example b/.env.example index 501b10a..cc5a88b 100644 --- a/.env.example +++ b/.env.example @@ -4,6 +4,7 @@ PASSWORD=your_idrac_password FAN_SPEED_STEPS=50:20,55:25,60:30,65:40 CONTROL_INTERVAL_SECONDS=120 ERROR_INTERVAL_SECONDS=120 +IPMI_FAILURE_BACKOFF_SECONDS=300 IPMI_RETRY_COUNT=5 IPMI_RETRY_DELAY_SECONDS=20 IPMI_TIMEOUT_SECONDS=60 diff --git a/CHANGES.md b/CHANGES.md index dc5556c..e8a40d6 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -12,6 +12,7 @@ - Docker 运行镜像切换到 Debian slim,只安装 `python3`、`ipmitool` 和时区数据,在降低体积的同时保留更好的IPMI兼容性。 - 新增 `FAN_SPEED_STEPS` 环境变量,允许用户通过 `.env` 自定义温度阈值和风扇转速档位。 - 新增轮询间隔、IPMI重试、命令超时和 raw 风扇占空比查询开关,默认减少 iDRAC8 的会话压力。 +- 单轮 IPMI 重试全部失败时改为冷却跳过,并避免重复发送手动模式切换 raw 命令。 ## Previous Improvements diff --git a/README.md b/README.md index f17d821..372e88b 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ PASSWORD=your_idrac_password FAN_SPEED_STEPS=50:20,55:25,60:30,65:40 CONTROL_INTERVAL_SECONDS=120 ERROR_INTERVAL_SECONDS=120 +IPMI_FAILURE_BACKOFF_SECONDS=300 IPMI_RETRY_COUNT=5 IPMI_RETRY_DELAY_SECONDS=20 IPMI_TIMEOUT_SECONDS=60 @@ -120,6 +121,7 @@ python3 start.py | `FAN_SPEED_STEPS` | No | Temperature-to-speed rules. Default: `50:20,55:25,60:30,65:40`. / 温度和风扇转速规则,默认值:`50:20,55:25,60:30,65:40`。 | | `CONTROL_INTERVAL_SECONDS` | No | Normal control interval. Default: `120`. / 正常控制间隔,默认 `120` 秒。 | | `ERROR_INTERVAL_SECONDS` | No | Wait time after a failed control cycle. Default: same as `CONTROL_INTERVAL_SECONDS`. / 控制周期失败后的等待时间,默认等于正常控制间隔。 | +| `IPMI_FAILURE_BACKOFF_SECONDS` | No | Cooldown after all IPMI retries fail. Default: `300`. / 单轮 IPMI 重试全部失败后的冷却时间,默认 `300` 秒。 | | `IPMI_RETRY_COUNT` | No | Retry count for each IPMI command. Default: `5`. / 单条 IPMI 命令重试次数,默认 `5`。 | | `IPMI_RETRY_DELAY_SECONDS` | No | Wait time between IPMI retries. Default: `20`. / IPMI 重试间隔,默认 `20` 秒。 | | `IPMI_TIMEOUT_SECONDS` | No | Subprocess timeout for each IPMI command. Default: `60`. / 单次 IPMI 命令超时时间,默认 `60` 秒。 | @@ -182,7 +184,9 @@ ipmitool -H 192.168.1.100 -I lanplus -U root -P your_idrac_password sdr Common issues / 常见问题: -- `Unable to establish IPMI v2 / RMCP+ session`: iDRAC IPMI service may be busy or unstable. Check network latency, duplicate monitoring scripts, and consider resetting iDRAC with `mc reset cold`. +- `Unable to establish IPMI v2 / RMCP+ session`: iDRAC IPMI service may be busy or unstable. Occasional retries are expected on some iDRAC8 systems. If all retries fail, the controller skips that cycle and waits for `IPMI_FAILURE_BACKOFF_SECONDS`. +- `Unable to establish IPMI v2 / RMCP+ session`:iDRAC IPMI 服务可能繁忙或不稳定。部分 iDRAC8 偶发重试是正常现象;如果单轮重试全部失败,控制器会跳过本轮并按 `IPMI_FAILURE_BACKOFF_SECONDS` 冷却等待。 +- Frequent IPMI session failures / 频繁 IPMI 会话失败:check network latency, duplicate monitoring scripts or duplicate containers, and consider resetting iDRAC with `mc reset cold`. - Connection failed / 连接失败:确认容器主机能访问 iDRAC 管理 IP。 - Authentication failed / 认证失败:确认用户名、密码和 IPMI 权限。 - Permission denied / 权限不足:建议使用专用 iDRAC 用户,并授予 IPMI 控制权限。 diff --git a/controller/client.py b/controller/client.py index 0808505..f97c1dd 100644 --- a/controller/client.py +++ b/controller/client.py @@ -42,6 +42,7 @@ class FanController: self.use_raw_fan_duty = use_raw_fan_duty self.last_set_speed = None # 记录最后设置的风扇速度 self.is_auto_mode = False # 记录当前是否为自动模式 + self.is_manual_mode = False # 记录当前是否已切换到手动模式 # 解析温控规则配置,格式为 "50:20,55:25,60:30,65:40" def parse_fan_speed_steps(self, steps: str) -> tuple: @@ -92,7 +93,12 @@ class FanController: # 设置手动风扇速度 def set_fan_speed(self, speed: int): logger.info(f'设置风扇速度: {speed}%') - self.ipmi.set_fan_speed(speed) + if not self.is_manual_mode: + # 首次进入手动风扇控制时才切换模式,避免每次调速都多发一次raw命令 + self.ipmi.switch_fan_mode(auto=False) + self.is_manual_mode = True + + self.ipmi.set_fan_speed(speed, ensure_manual=False) # 根据最高温度计算目标风扇转速 def get_required_fan_speed(self, temperature: int) -> int: @@ -122,6 +128,7 @@ class FanController: logger.info(f'切换风扇为自动模式') self.ipmi.switch_fan_mode(auto=True) self.is_auto_mode = True + self.is_manual_mode = False self.last_set_speed = None # 重置手动设置的速度 else: logger.info(f'当前已是自动模式,无需操作') @@ -132,6 +139,7 @@ class FanController: logger.info(f'从自动模式切换到手动模式') self.ipmi.switch_fan_mode(auto=False) self.is_auto_mode = False + self.is_manual_mode = True # 获取当前风扇转速 current_speed = self.ipmi.get_fan_duty_cycle(sensor_data, use_raw=self.use_raw_fan_duty) diff --git a/controller/ipmi.py b/controller/ipmi.py index f227a8e..27fd51d 100644 --- a/controller/ipmi.py +++ b/controller/ipmi.py @@ -198,10 +198,11 @@ class IpmiTool: return self.run_cmd(cmd=auto_cmd) if auto else self.run_cmd(cmd=manual_cmd) # 设置手动风扇速度百分比 - def set_fan_speed(self, speed: int): + def set_fan_speed(self, speed: int, ensure_manual: bool = True): """ 设置风扇速度 :param speed: + :param ensure_manual: :return: """ if speed < 10 or speed > 100: @@ -209,6 +210,8 @@ class IpmiTool: 'speed must be between 10 and 100' ) - self.switch_fan_mode(auto=False) + if ensure_manual: + self.switch_fan_mode(auto=False) + base_cmd = 'raw 0x30 0x30 0x02 0xff' return self.run_cmd(cmd=f'{base_cmd} {hex(speed)}') diff --git a/start.py b/start.py index 430b671..2e2bc5d 100644 --- a/start.py +++ b/start.py @@ -1,6 +1,5 @@ import os import time -import traceback from controller.client import FanController from controller.logger import logger @@ -51,6 +50,7 @@ if __name__ == '__main__': fan_speed_steps = os.getenv('FAN_SPEED_RULES') control_interval = get_int_env('CONTROL_INTERVAL_SECONDS', 120) error_interval = get_int_env('ERROR_INTERVAL_SECONDS', control_interval) + ipmi_failure_backoff = get_int_env('IPMI_FAILURE_BACKOFF_SECONDS', 300) ipmi_retry_count = get_int_env('IPMI_RETRY_COUNT', 5) ipmi_retry_delay = get_int_env('IPMI_RETRY_DELAY_SECONDS', 20) ipmi_timeout = get_int_env('IPMI_TIMEOUT_SECONDS', 60) @@ -81,9 +81,13 @@ if __name__ == '__main__': # 执行一次温度读取和风扇控制周期 client.run() time.sleep(control_interval) - except Exception as err: - logger.error( - f'运行控制器失败 {err}. {traceback.format_exc()}' + except RuntimeError as err: + logger.warning( + f'本轮IPMI控制失败,跳过本轮并等待 {ipmi_failure_backoff} 秒: {err}' ) + # 连续会话失败时给iDRAC更长恢复窗口,避免马上进入下一轮重试 + time.sleep(ipmi_failure_backoff) + except Exception as err: + logger.error(f'运行控制器失败 {err}', exc_info=True) # iDRAC会话异常时等待下一轮,避免连续请求压垮IPMI服务 time.sleep(error_interval) diff --git a/tests/test_fan_controller.py b/tests/test_fan_controller.py index 8a3ed59..7eb70c2 100644 --- a/tests/test_fan_controller.py +++ b/tests/test_fan_controller.py @@ -1,4 +1,5 @@ import unittest +from unittest.mock import Mock from controller.client import FanController @@ -54,6 +55,16 @@ class FanControllerConfigTest(unittest.TestCase): self.assertFalse(controller.use_raw_fan_duty) + def test_set_fan_speed_switches_manual_mode_only_once(self): + controller = self.make_controller() + controller.ipmi = Mock() + + controller.set_fan_speed(30) + controller.set_fan_speed(40) + + controller.ipmi.switch_fan_mode.assert_called_once_with(auto=False) + self.assertEqual(controller.ipmi.set_fan_speed.call_count, 2) + if __name__ == '__main__': unittest.main()