diff --git a/.env.example b/.env.example index 12419db..501b10a 100644 --- a/.env.example +++ b/.env.example @@ -2,3 +2,9 @@ HOST=192.168.1.100 USERNAME=root PASSWORD=your_idrac_password FAN_SPEED_STEPS=50:20,55:25,60:30,65:40 +CONTROL_INTERVAL_SECONDS=120 +ERROR_INTERVAL_SECONDS=120 +IPMI_RETRY_COUNT=5 +IPMI_RETRY_DELAY_SECONDS=20 +IPMI_TIMEOUT_SECONDS=60 +USE_RAW_FAN_DUTY=false diff --git a/CHANGES.md b/CHANGES.md index c195999..7c974be 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,6 +10,7 @@ - 清理 Python 缓存文件,避免将运行产物提交到仓库。 - Docker 运行镜像切换到 Debian slim,只安装 `python3`、`ipmitool` 和时区数据,在降低体积的同时保留更好的IPMI兼容性。 - 新增 `FAN_SPEED_STEPS` 环境变量,允许用户通过 `.env` 自定义温度阈值和风扇转速档位。 +- 新增轮询间隔、IPMI重试、命令超时和 raw 风扇占空比查询开关,默认减少 iDRAC8 的会话压力。 ## Previous Improvements diff --git a/README.md b/README.md index 6c261a7..a2952b8 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,12 @@ HOST=192.168.1.100 USERNAME=root PASSWORD=your_idrac_password FAN_SPEED_STEPS=50:20,55:25,60:30,65:40 +CONTROL_INTERVAL_SECONDS=120 +ERROR_INTERVAL_SECONDS=120 +IPMI_RETRY_COUNT=5 +IPMI_RETRY_DELAY_SECONDS=20 +IPMI_TIMEOUT_SECONDS=60 +USE_RAW_FAN_DUTY=false ``` Start the service / 启动服务: @@ -108,11 +114,21 @@ python3 start.py | `USERNAME` | Yes | iDRAC username with IPMI permission. / 有 IPMI 权限的 iDRAC 用户名。 | | `PASSWORD` | Yes | iDRAC password. / iDRAC 密码。 | | `FAN_SPEED_STEPS` | No | Temperature-to-speed rules. Default: `50:20,55:25,60:30,65:40`. / 温度和风扇转速规则,默认值:`50:20,55:25,60:30,65:40`。 | +| `CONTROL_INTERVAL_SECONDS` | No | Normal control interval. Default: `120`. / 正常控制间隔,默认 `120` 秒。 | +| `ERROR_INTERVAL_SECONDS` | No | Wait time after a failed control cycle. Default: same as `CONTROL_INTERVAL_SECONDS`. / 控制周期失败后的等待时间,默认等于正常控制间隔。 | +| `IPMI_RETRY_COUNT` | No | Maximum number of attempts for each IPMI command, including the first one. Default: `5`. / 单条 IPMI 命令的最大尝试次数,包含首次执行,默认 `5`。 | +| `IPMI_RETRY_DELAY_SECONDS` | No | Wait time between IPMI retries. Default: `20`. 
/ IPMI 重试间隔,默认 `20` 秒。 | +| `IPMI_TIMEOUT_SECONDS` | No | Subprocess timeout for each IPMI command. Default: `60`. / 单次 IPMI 命令超时时间,默认 `60` 秒。 | +| `USE_RAW_FAN_DUTY` | No | Query raw fan duty before RPM estimation. Default: `false`. / 是否先用 raw 命令读取风扇占空比,默认 `false`。 | The application does not include default credentials. Missing variables will stop startup with a clear error. 程序不内置默认地址、账号或密码。缺少环境变量时会直接停止并输出明确错误。 +For iDRAC8 systems with unstable IPMI sessions, keep `CONTROL_INTERVAL_SECONDS` at `120` or higher. The default `USE_RAW_FAN_DUTY=false` avoids one extra IPMI session per cycle and estimates fan percentage from the RPM data already returned by `sdr`. + +对于 IPMI 会话不稳定的 iDRAC8,建议 `CONTROL_INTERVAL_SECONDS` 保持 `120` 秒或更高。默认 `USE_RAW_FAN_DUTY=false` 会跳过额外的 raw 占空比查询,直接使用同一次 `sdr` 返回的 RPM 估算风扇百分比,减少 iDRAC 会话压力。 + ## Temperature Policy / 温控策略 The controller reads all temperature sensors and uses the highest value. diff --git a/controller/client.py b/controller/client.py index 595c85e..0808505 100644 --- a/controller/client.py +++ b/controller/client.py @@ -15,13 +15,31 @@ DEFAULT_FAN_SPEED_STEPS = ( class FanController: # 初始化控制器并记录iDRAC连接信息 - def __init__(self, host: str, username: str, password: str, fan_speed_steps: str = None): + def __init__( + self, + host: str, + username: str, + password: str, + fan_speed_steps: str = None, + ipmi_retry_count: int = 5, + ipmi_retry_delay: int = 20, + ipmi_timeout: int = 60, + use_raw_fan_duty: bool = False, + ): self.host = host self.username = username self.password = password - self.ipmi = IpmiTool(self.host, self.username, self.password) + self.ipmi = IpmiTool( + self.host, + self.username, + self.password, + retry_count=ipmi_retry_count, + retry_delay=ipmi_retry_delay, + timeout=ipmi_timeout, + ) self.fan_speed_steps = self.parse_fan_speed_steps(fan_speed_steps) + self.use_raw_fan_duty = use_raw_fan_duty self.last_set_speed = None # 记录最后设置的风扇速度 self.is_auto_mode = False # 记录当前是否为自动模式 @@ -116,7 +134,7 @@ class 
FanController: self.is_auto_mode = False # 获取当前风扇转速 - current_speed = self.ipmi.get_fan_duty_cycle(sensor_data) + current_speed = self.ipmi.get_fan_duty_cycle(sensor_data, use_raw=self.use_raw_fan_duty) # 只有在当前转速与所需转速不同时才调整 # 如果无法获取当前转速(返回-1),则检查是否已记录之前设置的速度 diff --git a/controller/ipmi.py b/controller/ipmi.py index b736cac..f227a8e 100644 --- a/controller/ipmi.py +++ b/controller/ipmi.py @@ -7,31 +7,33 @@ from controller.logger import logger # IPMI命令封装器:负责调用ipmitool读取传感器并设置Dell风扇 class IpmiTool: # 初始化iDRAC连接参数 - def __init__(self, host: str, username: str, password: str): + def __init__(self, host: str, username: str, password: str, retry_count: int = 5, retry_delay: int = 20, timeout: int = 60): if not host or not username or not password: raise ValueError("host, username and password must be provided") self.host = host self.username = username self.password = password + self.retry_count = retry_count + self.retry_delay = retry_delay + self.timeout = timeout # 执行ipmitool命令并处理重试、超时和会话异常 def run_cmd(self, cmd: str) -> str: basecmd = f'ipmitool -H {self.host} -I lanplus -U {self.username} -P {self.password}' command = f'{basecmd} {cmd}' - retry_count = 5 # 增加重试次数以应对网络波动 - for attempt in range(retry_count): + for attempt in range(self.retry_count): try: # print(f"Executing command: {command}") # 添加调试信息 - result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=60) # 增加超时时间 + result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=self.timeout) # 控制单次命令最长等待时间 if result.returncode != 0: # 部分ipmitool版本会把错误输出到stdout,或只返回退出码但stderr为空 error_msg = result.stderr.strip() or result.stdout.strip() or f'命令退出码: {result.returncode}' # 检查是否是网络连接问题 if "Unable to establish IPMI" in error_msg or "session" in error_msg: - logger.warning(f'IPMI会话建立失败 (尝试 {attempt + 1}/{retry_count}): {error_msg}') - if attempt < retry_count - 1: - time.sleep(10) # 网络问题需要更长的等待时间 + logger.warning(f'IPMI会话建立失败 (尝试 {attempt + 1}/{self.retry_count}): 
{error_msg}') + if attempt < self.retry_count - 1: + time.sleep(self.retry_delay) # 网络问题需要更长的等待时间 continue raise RuntimeError( f'IPMI 命令执行失败: {cmd}\n错误详情: {error_msg}' # 更清晰的错误提示 @@ -39,16 +41,16 @@ class IpmiTool: return result.stdout except subprocess.TimeoutExpired: - logger.warning(f'命令超时 (尝试 {attempt + 1}/{retry_count})') - if attempt < retry_count - 1: - logger.warning(f'正在重试... (尝试次数 {attempt + 1}/{retry_count})') - time.sleep(10) # 每次重试前等待更长时间 + logger.warning(f'命令超时 (尝试 {attempt + 1}/{self.retry_count})') + if attempt < self.retry_count - 1: + logger.warning(f'正在重试... (尝试次数 {attempt + 1}/{self.retry_count})') + time.sleep(self.retry_delay) # 每次重试前等待更长时间 else: raise RuntimeError('IPMI 命令超时。请检查网络连接或服务器状态。') # 更明确的错误提示 except Exception as e: - logger.warning(f'IPMI命令执行异常 (尝试 {attempt + 1}/{retry_count}): {str(e)}') - if attempt < retry_count - 1: - time.sleep(10) # 网络问题需要更长的等待时间 + logger.warning(f'IPMI命令执行异常 (尝试 {attempt + 1}/{self.retry_count}): {str(e)}') + if attempt < self.retry_count - 1: + time.sleep(self.retry_delay) # 网络问题需要更长的等待时间 else: raise e @@ -116,29 +118,30 @@ class IpmiTool: return fan_speeds # 获取当前风扇占空比,raw命令不可用时用RPM估算 - def get_fan_duty_cycle(self, sensor_data: str = None) -> int: + def get_fan_duty_cycle(self, sensor_data: str = None, use_raw: bool = False) -> int: """ 获取当前风扇占空比/百分比 :return: current fan duty cycle in percentage """ - try: - # Raw command to get current fan duty cycle - result = self.run_cmd('raw 0x30 0x31 0x01') - # Parse the hex result to get duty cycle - result_parts = result.strip().split() - if result_parts and len(result_parts) >= 1: - # The command should return a hex value representing the duty cycle - duty_cycle_hex = result_parts[-1] - duty_cycle = int(duty_cycle_hex, 16) - # Ensure the value is in valid range (0-100) - if 0 <= duty_cycle <= 100 and duty_cycle != 0: - # If we get a reasonable value (not 0), return it - return duty_cycle - elif duty_cycle == 0: - # Value of 0 might indicate auto mode or that raw 
command doesn't return duty cycle on this system - logger.info('原始命令返回0,尝试从RPM估算风扇百分比') - except Exception as e: - logger.warning(f'获取风扇占空比的原始命令失败: {e}') + if use_raw: + try: + # Raw command to get current fan duty cycle + result = self.run_cmd('raw 0x30 0x31 0x01') + # Parse the hex result to get duty cycle + result_parts = result.strip().split() + if result_parts and len(result_parts) >= 1: + # The command should return a hex value representing the duty cycle + duty_cycle_hex = result_parts[-1] + duty_cycle = int(duty_cycle_hex, 16) + # Ensure the value is in valid range (0-100) + if 0 <= duty_cycle <= 100 and duty_cycle != 0: + # If we get a reasonable value (not 0), return it + return duty_cycle + elif duty_cycle == 0: + # Value of 0 might indicate auto mode or that raw command doesn't return duty cycle on this system + logger.info('原始命令返回0,尝试从RPM估算风扇百分比') + except Exception as e: + logger.warning(f'获取风扇占空比的原始命令失败: {e}') # If raw command fails or returns 0, get fan speeds from sensor data and convert to approximate percentage try: diff --git a/start.py b/start.py index a4f4a36..430b671 100644 --- a/start.py +++ b/start.py @@ -5,6 +5,40 @@ import traceback from controller.client import FanController from controller.logger import logger + +# 解析整数环境变量,缺省时使用默认值 +def get_int_env(name: str, default: int, min_value: int = 1) -> int: + value = os.getenv(name) + if value is None: + return default + + try: + parsed_value = int(value) + except ValueError as exc: + raise RuntimeError(f'{name} 必须是整数') from exc + + if parsed_value < min_value: + raise RuntimeError(f'{name} 必须大于等于 {min_value}') + + return parsed_value + + +# 解析布尔环境变量,支持 true/false、1/0、yes/no +def get_bool_env(name: str, default: bool = False) -> bool: + value = os.getenv(name) + if value is None: + return default + + normalized_value = value.strip().lower() + if normalized_value in ('1', 'true', 'yes', 'on'): + return True + + if normalized_value in ('0', 'false', 'no', 'off'): + return False + + raise 
RuntimeError(f'{name} 必须是布尔值: true/false') + + if __name__ == '__main__': # 从环境变量读取iDRAC连接信息,开源版本不内置任何真实默认凭据 @@ -15,6 +49,12 @@ if __name__ == '__main__': fan_speed_steps = os.getenv('FAN_SPEED_STEPS') if fan_speed_steps is None: fan_speed_steps = os.getenv('FAN_SPEED_RULES') + control_interval = get_int_env('CONTROL_INTERVAL_SECONDS', 120) + error_interval = get_int_env('ERROR_INTERVAL_SECONDS', control_interval) + ipmi_retry_count = get_int_env('IPMI_RETRY_COUNT', 5) + ipmi_retry_delay = get_int_env('IPMI_RETRY_DELAY_SECONDS', 20) + ipmi_timeout = get_int_env('IPMI_TIMEOUT_SECONDS', 60) + use_raw_fan_duty = get_bool_env('USE_RAW_FAN_DUTY', False) if not host: raise RuntimeError('未设置 HOST 环境变量') @@ -30,16 +70,20 @@ if __name__ == '__main__': username=username, password=password, fan_speed_steps=fan_speed_steps, + ipmi_retry_count=ipmi_retry_count, + ipmi_retry_delay=ipmi_retry_delay, + ipmi_timeout=ipmi_timeout, + use_raw_fan_duty=use_raw_fan_duty, ) while True: try: # 执行一次温度读取和风扇控制周期 client.run() - time.sleep(60) + time.sleep(control_interval) except Exception as err: logger.error( f'运行控制器失败 {err}. 
{traceback.format_exc()}' ) # iDRAC会话异常时等待下一轮,避免连续请求压垮IPMI服务 - time.sleep(60) + time.sleep(error_interval) diff --git a/tests/test_fan_controller.py b/tests/test_fan_controller.py index c758222..8a3ed59 100644 --- a/tests/test_fan_controller.py +++ b/tests/test_fan_controller.py @@ -49,6 +49,11 @@ class FanControllerConfigTest(unittest.TestCase): with self.assertRaises(ValueError): self.make_controller(invalid_value) + def test_raw_fan_duty_query_is_disabled_by_default(self): + controller = self.make_controller() + + self.assertFalse(controller.use_raw_fan_duty) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_start_config.py b/tests/test_start_config.py new file mode 100644 index 0000000..00e55c5 --- /dev/null +++ b/tests/test_start_config.py @@ -0,0 +1,32 @@ +import os +import unittest +from unittest.mock import patch + +from start import get_bool_env, get_int_env + + +class StartConfigTest(unittest.TestCase): + def test_get_int_env_uses_default_when_missing(self): + with patch.dict(os.environ, {}, clear=True): + self.assertEqual(get_int_env('CONTROL_INTERVAL_SECONDS', 120), 120) + + def test_get_int_env_rejects_invalid_value(self): + with patch.dict(os.environ, {'CONTROL_INTERVAL_SECONDS': 'abc'}, clear=True): + with self.assertRaises(RuntimeError): + get_int_env('CONTROL_INTERVAL_SECONDS', 120) + + def test_get_bool_env_parses_supported_values(self): + with patch.dict(os.environ, {'USE_RAW_FAN_DUTY': 'true'}, clear=True): + self.assertTrue(get_bool_env('USE_RAW_FAN_DUTY')) + + with patch.dict(os.environ, {'USE_RAW_FAN_DUTY': 'false'}, clear=True): + self.assertFalse(get_bool_env('USE_RAW_FAN_DUTY')) + + def test_get_bool_env_rejects_invalid_value(self): + with patch.dict(os.environ, {'USE_RAW_FAN_DUTY': 'maybe'}, clear=True): + with self.assertRaises(RuntimeError): + get_bool_env('USE_RAW_FAN_DUTY') + + +if __name__ == '__main__': + unittest.main()