Skip to content

Commit 0b894b6

Browse files
fix: remove IO PSI row — false CRITICAL on GB10 with VLLM idle (v0.2.3)
1 parent 1405776 commit 0b894b6

6 files changed

Lines changed: 13 additions & 34 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## v0.2.3 — 2026-04-23
2+
3+
### Fixed
4+
- Removed the IO PSI row — it reported a false CRITICAL on GB10 while vLLM was idle
5+
- Removed IO PSI references from the anomaly logger and pressure detection
6+
17
## v0.2.2 — 2026-04-19
28

39
### Added

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ Detection is load-gated — evaluation only occurs when GPU utilization confirms
108108

109109
## GB10 Power Rails (spark_hwmon)
110110

111+
v0.2.3 removes the IO PSI row, which reported a false CRITICAL on GB10 while vLLM was idle.
112+
111113
v0.2.2 adds a PWR row for GB10 systems using the spark_hwmon kernel module (https://github.com/antheas/spark_hwmon).
112114

113115
Install:

main.py

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -90,20 +90,17 @@ def build(term_height: int = 40) -> Table:
9090
t.append(f" {temp} {pw} Mem {gi(g['mem_used'])}/{gi(g['mem_total'])}")
9191
if g["is_uma"]:
9292
mem_psi = psi.get("mem", psi)
93-
io_psi = psi.get("io", {})
9493
pwr = power_rails.read() if power_rails.is_available() else None
9594
# red — critical: hardware brake, throttle, high pressure, high temp
9695
uma_red = (
9796
mem_psi.get("level") in ("HIGH", "CRITICAL")
98-
or io_psi.get("level") in ("HIGH", "CRITICAL")
9997
or any(th.get("status") == "THROTTLED" for th in throttle)
10098
or any(g.get("temperature", 0) > 80 for g in gpus)
10199
or (pwr and pwr.prochot)
102100
)
103101
# yellow — warning: approaching limits
104102
uma_yellow = (
105103
mem_psi.get("level") == "MOD"
106-
or io_psi.get("level") == "MOD"
107104
or any(th.get("status") == "LOCKED" for th in throttle)
108105
or any(60 <= g.get("temperature", 0) <= 80 for g in gpus)
109106
or (pwr and pwr.cap_exceeded)
@@ -171,7 +168,6 @@ def build(term_height: int = 40) -> Table:
171168

172169
# ── PSI ──────────────────────────────────────────
173170
mem_psi = psi.get("mem", {})
174-
io_psi = psi.get("io", {})
175171
if mem_psi.get("available"):
176172
level = mem_psi["level"]
177173
color = psi_color(level)
@@ -183,18 +179,7 @@ def build(term_height: int = 40) -> Table:
183179
t.append(f"{level:8s}", style=color + " bold")
184180
t.append(f" some {mem_psi['some_avg10']:.2f} full {mem_psi['full_avg10']:.2f}")
185181
grid.add_row(t)
186-
sep(grid)
187-
if io_psi.get("available"):
188-
level = io_psi["level"]
189-
color = psi_color(level)
190-
score = min(io_psi["some_avg10"] * 100 / 0.30, 100)
191-
t = Text()
192-
t.append(" IO ", style="bold cyan")
193-
t.append(f"{bar(score)} ", style=color)
194-
t.append(f"{level:8s}", style=color + " bold")
195-
t.append(f" some {io_psi['some_avg10']:.2f} full {io_psi['full_avg10']:.2f}")
196-
grid.add_row(t)
197-
if mem_psi.get("available") or io_psi.get("available"):
182+
if mem_psi.get("available"):
198183
sep(grid)
199184

200185
# ── TEMP ────────────────────────────────────────
@@ -303,10 +288,10 @@ def fmt_rate(b: float) -> str:
303288

304289
footer = Text()
305290
if is_logging():
306-
footer.append(" [dim]Ctrl+C to quit sparkview v0.2.2[/dim]")
291+
footer.append(" [dim]Ctrl+C to quit sparkview v0.2.3[/dim]")
307292
footer.append(" ● LOGGING", style="bold red")
308293
else:
309-
footer.append(" [dim]Ctrl+C to quit sparkview v0.2.2[/dim]")
294+
footer.append(" [dim]Ctrl+C to quit sparkview v0.2.3[/dim]")
310295
grid.add_row(footer)
311296
return grid
312297

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "sparkview"
3-
version = "0.2.0"
3+
version = "0.2.3"
44
description = "GB10 Grace Blackwell unified memory GPU monitor"
55
requires-python = ">=3.10"
66
dependencies = [

sparkview/layers/logger.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,8 @@ def _detect_trigger(psi: dict, throttle: list, mem: dict, gpus: list, cpu: dict
3333
if pwr and pwr.prochot:
3434
return "PROCHOT"
3535
mem_psi = psi.get("mem", psi)
36-
io_psi = psi.get("io", {})
3736
if mem_psi.get("level") in ("MOD", "HIGH", "CRITICAL"):
3837
return f"PSI_{mem_psi.get('level')}"
39-
if io_psi.get("level") in ("MOD", "HIGH", "CRITICAL"):
40-
return f"IO_PSI_{io_psi.get('level')}"
4138
for th in throttle:
4239
if th.get("status") in ("THROTTLED", "LOCKED"):
4340
return f"CLOCK_{th.get('status')}"
@@ -56,11 +53,8 @@ def should_log(psi: dict, throttle: list, mem: dict, gpus: list, cpu: dict | Non
5653
if pwr and pwr.prochot:
5754
return True
5855
mem_psi = psi.get("mem", psi)
59-
io_psi = psi.get("io", {})
6056
if mem_psi.get("level") in ("MOD", "HIGH", "CRITICAL"):
6157
return True
62-
if io_psi.get("level") in ("MOD", "HIGH", "CRITICAL"):
63-
return True
6458
for th in throttle:
6559
if th.get("status") in ("THROTTLED", "LOCKED"):
6660
return True
@@ -152,16 +146,10 @@ def write_log(
152146
_log_file.write(f"CLOCK: {status} {clk:.0f}MHz / {clk_max:.0f}MHz {pstate}\n")
153147

154148
mem_psi = psi.get("mem", psi)
155-
io_psi = psi.get("io", {})
156149
level = mem_psi.get("level", "?")
157150
some = mem_psi.get("some_avg10", 0)
158151
full = mem_psi.get("full_avg10", 0)
159152
_log_file.write(f"PSI: {level} some {some:.2f} full {full:.2f}\n")
160-
if io_psi.get("available"):
161-
io_level = io_psi.get("level", "?")
162-
io_some = io_psi.get("some_avg10", 0)
163-
io_full = io_psi.get("full_avg10", 0)
164-
_log_file.write(f"IO: {io_level} some {io_some:.2f} full {io_full:.2f}\n")
165153

166154
pwr = power_rails.read()
167155
if pwr is not None:

sparkview/layers/pressure.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import pathlib
44

55
PSI_MEM_PATH = pathlib.Path("/proc/pressure/memory")
6-
PSI_IO_PATH = pathlib.Path("/proc/pressure/io")
76

87

98
def _parse_psi(path: pathlib.Path) -> dict:
@@ -39,5 +38,4 @@ def _parse_psi(path: pathlib.Path) -> dict:
3938

4039
def get_pressure() -> dict:
4140
mem = _parse_psi(PSI_MEM_PATH)
42-
io = _parse_psi(PSI_IO_PATH)
43-
return {"mem": mem, "io": io}
41+
return {"mem": mem}

0 commit comments

Comments
 (0)