2 files changed
@@ -147,15 +147,29 @@ def _lease_entry_matches_running(gpu_id: int, entry: dict) -> bool: | |||
| 147 | 147 | return False | |
| 148 | 148 | ||
| 149 | 149 | ||
# Leases at least this old (seconds) are dropped unless the workspace still
# confirms them: 1 hour.
_LEASE_TTL_SEC = 3600
# A ``claimed_at`` up to this many seconds in the future is tolerated as clock
# skew; anything further ahead is treated as an untrustworthy timestamp.
_LEASE_CLOCK_SKEW_SEC = 60


def _lease_within_ttl(entry: dict, now: float) -> bool:
    """Return True when *entry* carries a usable timestamp younger than the TTL.

    A missing ``claimed_at`` defaults to 0 (i.e. very old).  Non-numeric
    values (``None``, strings, booleans), NaN, and timestamps further in the
    future than the skew tolerance all yield False, so such leases survive
    only if the workspace confirms them.
    """
    claimed_at = entry.get("claimed_at", 0)
    # bool is an int subclass, but True/False are not timestamps.
    if isinstance(claimed_at, bool) or not isinstance(claimed_at, (int, float)):
        return False
    age = now - claimed_at
    # Chained comparison is False for NaN, so corrupt floats are rejected too.
    return -_LEASE_CLOCK_SKEW_SEC <= age < _LEASE_TTL_SEC


def _clean_global_gpu_leases_unlocked(leases: dict[str, dict]) -> dict[str, dict]:
    """Return a copy of *leases* with stale or malformed entries removed.

    An entry is kept when either
      * its ``claimed_at`` is younger than ``_LEASE_TTL_SEC`` (the owner may
        still be in transition, so the workspace is not consulted), or
      * ``_lease_entry_matches_running`` still confirms it.

    Every lease younger than the TTL is kept regardless of what the workspace
    check would answer, so the check is only run for leases at or past the
    TTL (or with an unusable timestamp).

    Entries whose key is not an integer GPU id, or whose value is not a dict,
    are dropped.  Keys in the result are normalised to ``str(int(key))``.
    The input mapping is not modified.

    NOTE(review): the ``_unlocked`` suffix suggests the caller is expected to
    hold the lease-file lock -- confirm against call sites.
    """
    cleaned: dict[str, dict] = {}
    now = time.time()
    for gpu_key, entry in leases.items():
        try:
            gpu_id = int(gpu_key)
        except (TypeError, ValueError):
            continue
        if not isinstance(entry, dict):
            # Malformed record (e.g. hand-edited or truncated JSON): drop it
            # instead of letting one bad entry abort the whole cleanup.
            continue
        # Cheap timestamp test first; the workspace check only runs when the
        # timestamp alone cannot justify keeping the lease.
        if _lease_within_ttl(entry, now) or _lease_entry_matches_running(gpu_id, entry):
            cleaned[str(gpu_id)] = entry
    return cleaned
| 160 | 174 | ||
| 161 | 175 | ||
@@ -1183,6 +1183,67 @@ def test_script_includes_dispatch_logic(self): | |||
| 1183 | 1183 | assert "DISPATCH" in script | |
| 1184 | 1184 | ||
| 1185 | 1185 | ||
| 1186 | + # ══════════════════════════════════════════════ | ||
| 1187 | + # TTL-based stale lease cleanup | ||
| 1188 | + # ══════════════════════════════════════════════ | ||
| 1189 | + | ||
def test_stale_leases_cleaned_by_ttl(tmp_path, monkeypatch):
    """A two-hour-old lease whose workspace path does not exist is dropped."""
    import time as time_mod
    from sibyl import gpu_scheduler

    # Point the scheduler's lease file at a throwaway location.
    leases_file = tmp_path / "gpu_leases.json"
    monkeypatch.setattr(gpu_scheduler, "_global_gpu_leases_path", lambda: leases_file)

    two_hours_ago = time_mod.time() - 7200
    stale_entry = dict(
        workspace_root="/nonexistent/path",
        task_ids=["old_task"],
        claimed_at=two_hours_ago,
    )

    result = gpu_scheduler._clean_global_gpu_leases_unlocked({"0": stale_entry})

    # Past the TTL and unconfirmed by any workspace, so it must be gone.
    assert "0" not in result
| 1207 | + | ||
| 1208 | + | ||
def test_recent_leases_kept_without_workspace_check(tmp_path, monkeypatch):
    """Leases less than 60s old are kept and the workspace check is never run.

    The workspace check is replaced with a stub that fails the test if it is
    invoked; asserting retention alone would not prove the check was skipped,
    because an unconfirmed lease this young is retained either way.
    """
    import time as time_mod
    from sibyl import gpu_scheduler

    def _fail_if_called(gpu_id, entry):
        raise AssertionError(
            f"workspace check must be skipped for a fresh lease (gpu {gpu_id})"
        )

    monkeypatch.setattr(gpu_scheduler, "_global_gpu_leases_path",
                        lambda: tmp_path / "gpu_leases.json")
    monkeypatch.setattr(gpu_scheduler, "_lease_entry_matches_running",
                        _fail_if_called)

    recent_lease = {
        "0": {
            "workspace_root": "/nonexistent/path",
            "task_ids": ["new_task"],
            "claimed_at": time_mod.time() - 10,  # 10 seconds ago
        }
    }
    cleaned = gpu_scheduler._clean_global_gpu_leases_unlocked(recent_lease)
    assert "0" in cleaned  # Should be kept — very recent
    assert cleaned["0"] == recent_lease["0"]  # entry is passed through unchanged
| 1226 | + | ||
| 1227 | + | ||
def test_mid_age_lease_kept_when_workspace_gone(tmp_path, monkeypatch):
    """A ten-minute-old lease survives cleanup even though its workspace path is missing."""
    import time as time_mod
    from sibyl import gpu_scheduler

    # Point the scheduler's lease file at a throwaway location.
    leases_file = tmp_path / "gpu_leases.json"
    monkeypatch.setattr(gpu_scheduler, "_global_gpu_leases_path", lambda: leases_file)

    ten_minutes_ago = time_mod.time() - 600  # older than 60s, younger than the 1h TTL
    mid_entry = dict(
        workspace_root="/nonexistent/path",
        task_ids=["mid_task"],
        claimed_at=ten_minutes_ago,
    )

    result = gpu_scheduler._clean_global_gpu_leases_unlocked({"0": mid_entry})

    # Still inside the TTL window, so the lease is retained without confirmation.
    assert "0" in result
| 1245 | + | ||
| 1246 | + | ||
| 1186 | 1247 | # ══════════════════════════════════════════════ | |
| 1187 | 1248 | # Failed tasks don't block pipeline | |
| 1188 | 1249 | # ══════════════════════════════════════════════ | |
0 commit comments