1818
1919import argparse
2020import json
21- import os
2221import pathlib
2322import platform
2423import subprocess
2524import time
2625from datetime import datetime
2726
28-
2927# ── PSI paths ────────────────────────────────────────────────────────────────
30- PSI_MEM = pathlib .Path ("/proc/pressure/memory" )
31- PSI_IO = pathlib .Path ("/proc/pressure/io" )
32- LOG_DIR = pathlib .Path .home () / "sparkview_logs" / "psi_baseline"
28+ PSI_MEM = pathlib .Path ("/proc/pressure/memory" )
29+ PSI_IO = pathlib .Path ("/proc/pressure/io" )
30+ LOG_DIR = pathlib .Path .home () / "sparkview_logs" / "psi_baseline"
3331
3432
3533def _parse_psi (path : pathlib .Path ) -> dict :
@@ -38,7 +36,7 @@ def _parse_psi(path: pathlib.Path) -> dict:
3836 result = {}
3937 for line in lines :
4038 parts = line .split ()
41- kind = parts [0 ] # "some" or "full"
39+ kind = parts [0 ] # "some" or "full"
4240 kv = {p .split ("=" )[0 ]: float (p .split ("=" )[1 ]) for p in parts [1 :]}
4341 result [kind ] = kv
4442 return result
@@ -48,22 +46,26 @@ def _parse_psi(path: pathlib.Path) -> dict:
4846
4947def _system_info () -> dict :
5048 info = {
51- "hostname" : platform .node (),
52- "kernel" : platform .release (),
49+ "hostname" : platform .node (),
50+ "kernel" : platform .release (),
5351 "collected" : datetime .now ().strftime ("%Y-%m-%d %H:%M:%S" ),
5452 }
5553 try :
56- out = subprocess .check_output (
57- ["nvidia-smi" , "--query-gpu=driver_version,name" ,
58- "--format=csv,noheader" ],
59- text = True , timeout = 5
60- ).strip ().splitlines ()[0 ]
54+ out = (
55+ subprocess .check_output (
56+ ["nvidia-smi" , "--query-gpu=driver_version,name" , "--format=csv,noheader" ],
57+ text = True ,
58+ timeout = 5 ,
59+ )
60+ .strip ()
61+ .splitlines ()[0 ]
62+ )
6163 driver , gpu = [x .strip () for x in out .split ("," )]
6264 info ["driver" ] = driver
63- info ["gpu" ] = gpu
65+ info ["gpu" ] = gpu
6466 except Exception :
6567 info ["driver" ] = "unknown"
66- info ["gpu" ] = "unknown"
68+ info ["gpu" ] = "unknown"
6769 try :
6870 mem = pathlib .Path ("/proc/meminfo" ).read_text ()
6971 for line in mem .splitlines ():
@@ -80,18 +82,18 @@ def _stats(vals: list) -> dict:
8082 if not vals :
8183 return {}
8284 return {
83- "min" : round (min (vals ), 4 ),
84- "max" : round (max (vals ), 4 ),
85+ "min" : round (min (vals ), 4 ),
86+ "max" : round (max (vals ), 4 ),
8587 "mean" : round (sum (vals ) / len (vals ), 4 ),
86- "p90" : round (sorted (vals )[int (len (vals ) * 0.90 )], 4 ),
87- "p99" : round (sorted (vals )[int (len (vals ) * 0.99 )], 4 ),
88+ "p90" : round (sorted (vals )[int (len (vals ) * 0.90 )], 4 ),
89+ "p99" : round (sorted (vals )[int (len (vals ) * 0.99 )], 4 ),
8890 }
8991
9092
9193def collect (duration : int , label : str , interval : float = 1.0 ) -> str :
9294 LOG_DIR .mkdir (parents = True , exist_ok = True )
9395
94- print (f "sparkview PSI baseline collector" )
96+ print ("sparkview PSI baseline collector" )
9597 print (f" label: { label } " )
9698 print (f" duration: { duration } s" )
9799 print (f" output: { LOG_DIR } " )
@@ -105,25 +107,25 @@ def collect(duration: int, label: str, interval: float = 1.0) -> str:
105107 print ("ERROR: /proc/pressure/io not found — IO PSI not supported on this kernel" )
106108 return ""
107109
108- samples = []
110+ samples = []
109111 log_lines = []
110112 start = time .monotonic ()
111113 n = 0
112114
113115 try :
114116 while time .monotonic () - start < duration :
115- ts = datetime .now ().strftime ("%Y-%m-%d %H:%M:%S" )
117+ ts = datetime .now ().strftime ("%Y-%m-%d %H:%M:%S" )
116118 mem = _parse_psi (PSI_MEM )
117- io = _parse_psi (PSI_IO )
118- t = round (time .monotonic () - start , 1 )
119+ io = _parse_psi (PSI_IO )
120+ t = round (time .monotonic () - start , 1 )
119121
120122 sample = {"t" : t , "ts" : ts , "mem" : mem , "io" : io }
121123 samples .append (sample )
122124
123125 mem_some = mem .get ("some" , {}).get ("avg10" , 0.0 )
124126 mem_full = mem .get ("full" , {}).get ("avg10" , 0.0 )
125- io_some = io .get ("some" , {}).get ("avg10" , 0.0 )
126- io_full = io .get ("full" , {}).get ("avg10" , 0.0 )
127+ io_some = io .get ("some" , {}).get ("avg10" , 0.0 )
128+ io_full = io .get ("full" , {}).get ("avg10" , 0.0 )
127129
128130 line = (
129131 f"{ ts } t={ t :6.1f} s "
@@ -144,64 +146,70 @@ def collect(duration: int, label: str, interval: float = 1.0) -> str:
144146 # ── Stats ─────────────────────────────────────────────────────────────────
145147 mem_some_vals = [s ["mem" ].get ("some" , {}).get ("avg10" , 0 ) for s in samples ]
146148 mem_full_vals = [s ["mem" ].get ("full" , {}).get ("avg10" , 0 ) for s in samples ]
147- io_some_vals = [s ["io" ].get ("some" , {}).get ("avg10" , 0 ) for s in samples ]
148- io_full_vals = [s ["io" ].get ("full" , {}).get ("avg10" , 0 ) for s in samples ]
149+ io_some_vals = [s ["io" ].get ("some" , {}).get ("avg10" , 0 ) for s in samples ]
150+ io_full_vals = [s ["io" ].get ("full" , {}).get ("avg10" , 0 ) for s in samples ]
149151
150152 summary = {
151153 "mem_some" : _stats (mem_some_vals ),
152154 "mem_full" : _stats (mem_full_vals ),
153- "io_some" : _stats (io_some_vals ),
154- "io_full" : _stats (io_full_vals ),
155+ "io_some" : _stats (io_some_vals ),
156+ "io_full" : _stats (io_full_vals ),
155157 }
156158
157159 # ── Write JSON ────────────────────────────────────────────────────────────
158- ts_file = datetime .now ().strftime ("%Y%m%d_%H%M%S" )
160+ ts_file = datetime .now ().strftime ("%Y%m%d_%H%M%S" )
159161 basename = f"sparkview_psi_baseline_{ label } _{ ts_file } "
160162 json_path = LOG_DIR / f"{ basename } .json"
161- log_path = LOG_DIR / f"{ basename } .log"
163+ log_path = LOG_DIR / f"{ basename } .log"
162164
163165 output = {
164- "tool" : "sparkview_psi_baseline_collector" ,
166+ "tool" : "sparkview_psi_baseline_collector" ,
165167 "version" : "1.0.0" ,
166- "label" : label ,
168+ "label" : label ,
167169 "duration" : duration ,
168- "samples" : n ,
169- "system" : _system_info (),
170+ "samples" : n ,
171+ "system" : _system_info (),
170172 "summary" : summary ,
171- "data" : samples ,
173+ "data" : samples ,
172174 }
173175
174176 with open (json_path , "w" ) as f :
175177 json .dump (output , f , indent = 2 )
176178
177179 # ── Write human-readable log ──────────────────────────────────────────────
178180 with open (log_path , "w" ) as f :
179- f .write (f "sparkview PSI baseline log\n " )
181+ f .write ("sparkview PSI baseline log\n " )
180182 f .write (f"label: { label } \n " )
181183 f .write (f"duration: { duration } s\n " )
182184 f .write (f"samples: { n } \n " )
183185 f .write (f"system: { platform .node ()} / { platform .release ()} \n " )
184- f .write (f"\n " )
185- f .write (f"{ 'timestamp' :<22} { 't' :>7} "
186- f"{ 'mem_some' :>10} { 'mem_full' :>10} "
187- f"{ 'io_some' :>10} { 'io_full' :>10} \n " )
186+ f .write ("\n " )
187+ f .write (
188+ f"{ 'timestamp' :<22} { 't' :>7} "
189+ f"{ 'mem_some' :>10} { 'mem_full' :>10} "
190+ f"{ 'io_some' :>10} { 'io_full' :>10} \n "
191+ )
188192 f .write ("-" * 80 + "\n " )
189193 for line in log_lines :
190194 f .write (line + "\n " )
191195 f .write ("\n " )
192196 f .write ("Summary:\n " )
193197 for key , st in summary .items ():
194- f .write (f" { key :<12} min={ st .get ('min' ,'?' )} max={ st .get ('max' ,'?' )} "
195- f"mean={ st .get ('mean' ,'?' )} p90={ st .get ('p90' ,'?' )} "
196- f"p99={ st .get ('p99' ,'?' )} \n " )
198+ f .write (
199+ f" { key :<12} min={ st .get ('min' , '?' )} max={ st .get ('max' , '?' )} "
200+ f"mean={ st .get ('mean' , '?' )} p90={ st .get ('p90' , '?' )} "
201+ f"p99={ st .get ('p99' , '?' )} \n "
202+ )
197203
198204 print (f"\n json: { json_path } " )
199205 print (f" log: { log_path } " )
200206 print ()
201207 print (" Summary:" )
202208 for key , st in summary .items ():
203- print (f" { key :<12} min={ st .get ('min' ,'?' )} max={ st .get ('max' ,'?' )} "
204- f"mean={ st .get ('mean' ,'?' )} p90={ st .get ('p90' ,'?' )} " )
209+ print (
210+ f" { key :<12} min={ st .get ('min' , '?' )} max={ st .get ('max' , '?' )} "
211+ f"mean={ st .get ('mean' , '?' )} p90={ st .get ('p90' , '?' )} "
212+ )
205213
206214 return str (json_path )
207215
@@ -211,17 +219,17 @@ def collect(duration: int, label: str, interval: float = 1.0) -> str:
211219 description = "sparkview PSI baseline collector — GB10 calibration"
212220 )
213221 parser .add_argument (
214- "--duration" , type = int , default = 120 ,
215- help = "Collection duration in seconds (default: 120)"
222+ "--duration" , type = int , default = 120 , help = "Collection duration in seconds (default: 120)"
216223 )
217224 parser .add_argument (
218- "--label" , type = str , default = "idle" ,
225+ "--label" ,
226+ type = str ,
227+ default = "idle" ,
219228 choices = ["idle" , "vllm_loaded" , "inference_running" , "post_inference" , "custom" ],
220- help = "Workload label for this collection run"
229+ help = "Workload label for this collection run" ,
221230 )
222231 parser .add_argument (
223- "--interval" , type = float , default = 1.0 ,
224- help = "Sample interval in seconds (default: 1.0)"
232+ "--interval" , type = float , default = 1.0 , help = "Sample interval in seconds (default: 1.0)"
225233 )
226234 args = parser .parse_args ()
227235 collect (args .duration , args .label , args .interval )
0 commit comments