Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
feat(learning): harden rebuild recovery semantics
  • Loading branch information
Xinhua Gu committed Mar 13, 2026
commit 012f902def6e35b9c38f51a20cd58aa064832228
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ public final class AceClawDaemon {
private final LearningValidationStore learningValidationStore;
private final LearningValidationRecorder learningValidationRecorder;
private final LearningMaintenanceRunStore learningMaintenanceRunStore;
private final LearningMaintenanceRecoveryStore learningMaintenanceRecoveryStore;
private final MarkdownMemoryStore markdownStore;
private final JobStore cronJobStore;
private final EventBus eventBus;
Expand Down Expand Up @@ -162,6 +163,7 @@ private AceClawDaemon(Path homeDir) {
this.learningValidationStore = new LearningValidationStore();
this.learningValidationRecorder = new LearningValidationRecorder(learningValidationStore);
this.learningMaintenanceRunStore = new LearningMaintenanceRunStore();
this.learningMaintenanceRecoveryStore = new LearningMaintenanceRecoveryStore();

// Markdown memory store (persistent MEMORY.md + topic files)
MarkdownMemoryStore mds = null;
Expand Down Expand Up @@ -671,7 +673,8 @@ private void wireAgentHandler(Path workingDir) {
maintenanceAutoRelease,
draftPipelineLock,
scope.workspaceHash(),
scope.workingDir())
scope.workingDir()),
learningMaintenanceRecoveryStore
);
}
final var runtimeSkillGeneratorForSessionEnd = dynamicSkillGenerator;
Expand Down Expand Up @@ -1823,7 +1826,11 @@ public void start() throws DaemonException {

if (learningMaintenanceScheduler != null) {
try {
Path startupWorkingDir = Path.of(System.getProperty("user.dir")).toAbsolutePath().normalize();
learningMaintenanceScheduler.start();
learningMaintenanceScheduler.registerWorkspace(
WorkspacePaths.workspaceHash(startupWorkingDir),
startupWorkingDir);
shutdownManager.register(new ShutdownManager.ShutdownParticipant() {
@Override public String name() { return "Learning Maintenance Scheduler"; }
@Override public int priority() { return 87; }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import dev.aceclaw.memory.HistoricalSessionSnapshot;

import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
Expand Down Expand Up @@ -52,29 +53,75 @@ public RebuildSummary rebuildWorkspaceIfStale(String workspaceHash) throws Excep
var legacyHistoryIds = historySessionIds.stream()
.filter(sessionId -> !snapshotSessionSet.contains(sessionId))
.collect(Collectors.toSet());
var legacyIndexedIds = indexedSessionIds.stream()
.filter(sessionId -> !snapshotSessionSet.contains(sessionId))
.collect(Collectors.toSet());
var snapshots = snapshotSessionIds.stream()
.map(historyStore::loadSnapshot)
.flatMap(java.util.Optional::stream)
.filter(snapshot -> workspaceHash.equals(snapshot.workspaceHash()))
.toList();
var expectedIndexedSessionIds = snapshots.stream()
.filter(HistoricalIndexRebuilder::producesIndexEntries)
.map(HistoricalSessionSnapshot::sessionId)
.collect(Collectors.toSet());
var expectedCoverage = expectedCoverage(snapshots);
var expectedIndexedSessionIds = expectedCoverage.keySet();
var comparableIndexedSessionIds = indexedSessionIds.stream()
.filter(sessionId -> !legacyHistoryIds.contains(sessionId))
.filter(sessionId -> !legacyHistoryIds.contains(sessionId) && !legacyIndexedIds.contains(sessionId))
.collect(Collectors.toSet());
if (comparableIndexedSessionIds.equals(expectedIndexedSessionIds)) {
if (comparableIndexedSessionIds.equals(expectedIndexedSessionIds)
&& comparableCoverage(historicalLogIndex.sessionCoverage(workspaceHash), legacyHistoryIds, legacyIndexedIds)
.equals(expectedCoverage)) {
return new RebuildSummary(false, snapshots.size(), indexedSessionIds.size(), snapshotSessionSet, indexedSessionIds);
}

historicalLogIndex.replaceSessions(workspaceHash, snapshots);
var actualCoverage = comparableCoverage(
historicalLogIndex.sessionCoverage(workspaceHash),
legacyHistoryIds,
legacyIndexedIds);
if (!actualCoverage.equals(expectedCoverage)) {
throw new IllegalStateException("Historical index rebuild produced inconsistent coverage for workspace "
+ workspaceHash + ": expected=" + expectedCoverage + ", actual=" + actualCoverage);
}
return new RebuildSummary(true, snapshots.size(), indexedSessionIds.size(), snapshotSessionSet, indexedSessionIds);
} finally {
workspaceLock.unlock();
}
}

private static Map<String, HistoricalLogIndex.SessionCoverage> expectedCoverage(List<HistoricalSessionSnapshot> snapshots) {
var coverage = new java.util.LinkedHashMap<String, HistoricalLogIndex.SessionCoverage>();
for (var snapshot : snapshots) {
var expected = new HistoricalLogIndex.SessionCoverage(
snapshot.toolMetrics().size(),
snapshot.errorsEncountered().size(),
patternRowCount(snapshot));
if (expected.toolRows() > 0 || expected.errorRows() > 0 || expected.patternRows() > 0) {
coverage.put(snapshot.sessionId(), expected);
}
}
return Map.copyOf(coverage);
}

private static int patternRowCount(HistoricalSessionSnapshot snapshot) {
int count = 0;
if (snapshot.backtrackingDetected()) {
count++;
}
if (!snapshot.endToEndStrategy().isBlank()) {
count++;
}
return count;
}

private static Map<String, HistoricalLogIndex.SessionCoverage> comparableCoverage(
Map<String, HistoricalLogIndex.SessionCoverage> actualCoverage,
Set<String> legacyHistoryIds,
Set<String> legacyIndexedIds) {
return actualCoverage.entrySet().stream()
.filter(entry -> !legacyHistoryIds.contains(entry.getKey()))
.filter(entry -> !legacyIndexedIds.contains(entry.getKey()))
.collect(Collectors.toUnmodifiableMap(Map.Entry::getKey, Map.Entry::getValue));
}

private static boolean producesIndexEntries(HistoricalSessionSnapshot snapshot) {
return !snapshot.toolMetrics().isEmpty()
|| !snapshot.errorsEncountered().isEmpty()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
package dev.aceclaw.daemon;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.time.Instant;
import java.util.Objects;
import java.util.Optional;

/**
* Persists per-project maintenance recovery state so interrupted runs can be retried safely.
*/
public final class LearningMaintenanceRecoveryStore {

private static final String FILE_NAME = ".aceclaw/metrics/learning-maintenance-state.json";

private final ObjectMapper mapper = new ObjectMapper().registerModule(new JavaTimeModule());

public RecoveryState markStarted(Path projectRoot, String workspaceHash, String trigger) throws IOException {
Objects.requireNonNull(projectRoot, "projectRoot");
Objects.requireNonNull(workspaceHash, "workspaceHash");
Objects.requireNonNull(trigger, "trigger");
if (workspaceHash.isBlank() || trigger.isBlank()) {
throw new IllegalArgumentException("workspaceHash and trigger must not be blank");
}
var previous = load(projectRoot).orElse(null);
int attempt = previous == null ? 1 : previous.attempt() + 1;
Instant now = Instant.now();
var next = new RecoveryState(
workspaceHash,
projectRoot.toAbsolutePath().normalize().toString(),
trigger,
RecoveryStatus.RUNNING,
attempt,
previous == null ? now : previous.firstStartedAt(),
now,
"");
write(projectRoot, next);
return next;
}

public void markFailed(Path projectRoot, String workspaceHash, String trigger, Exception error) throws IOException {
Objects.requireNonNull(error, "error");
var current = load(projectRoot).orElse(null);
Instant now = Instant.now();
var next = new RecoveryState(
workspaceHash,
projectRoot.toAbsolutePath().normalize().toString(),
trigger,
RecoveryStatus.FAILED,
current == null ? 1 : current.attempt(),
current == null ? now : current.firstStartedAt(),
now,
truncate(error.getClass().getSimpleName() + ": " + String.valueOf(error.getMessage()), 300));
write(projectRoot, next);
}

public void clear(Path projectRoot) throws IOException {
Objects.requireNonNull(projectRoot, "projectRoot");
Files.deleteIfExists(stateFile(projectRoot));
}

public Optional<RecoveryState> load(Path projectRoot) throws IOException {
Objects.requireNonNull(projectRoot, "projectRoot");
Path file = stateFile(projectRoot);
if (!Files.isRegularFile(file)) {
return Optional.empty();
}
try {
var json = Files.readString(file);
if (json == null || json.isBlank()) {
return Optional.empty();
}
return Optional.of(mapper.readValue(json, RecoveryState.class));
} catch (IOException | RuntimeException e) {
return Optional.empty();
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}

public boolean needsRecovery(Path projectRoot, String workspaceHash) {
Objects.requireNonNull(projectRoot, "projectRoot");
Objects.requireNonNull(workspaceHash, "workspaceHash");
if (workspaceHash.isBlank()) {
throw new IllegalArgumentException("workspaceHash must not be blank");
}
try {
return load(projectRoot)
.filter(state -> workspaceHash.equals(state.workspaceHash()))
.filter(state -> state.status() == RecoveryStatus.RUNNING || state.status() == RecoveryStatus.FAILED)
.isPresent();
} catch (IOException e) {
return false;
}
}

Path stateFile(Path projectRoot) {
return projectRoot.resolve(FILE_NAME);
}

private void write(Path projectRoot, RecoveryState state) throws IOException {
Path file = stateFile(projectRoot);
Files.createDirectories(file.getParent());
Path tmp = file.resolveSibling(file.getFileName() + ".tmp");
try {
Files.writeString(tmp, mapper.writeValueAsString(state),
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE);
Files.move(tmp, file, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
} catch (IOException e) {
try {
Files.deleteIfExists(tmp);
} catch (IOException ignored) {
// best effort cleanup
}
throw e;
}
}

private static String truncate(String value, int max) {
if (value == null || value.length() <= max) {
return value == null ? "" : value;
}
return value.substring(0, max);
}

public record RecoveryState(
String workspaceHash,
String projectPath,
String trigger,
RecoveryStatus status,
int attempt,
Instant firstStartedAt,
Instant updatedAt,
String lastError
) {
public RecoveryState {
workspaceHash = Objects.requireNonNull(workspaceHash, "workspaceHash");
projectPath = projectPath == null ? "" : projectPath;
trigger = Objects.requireNonNull(trigger, "trigger");
status = Objects.requireNonNull(status, "status");
firstStartedAt = firstStartedAt == null ? Instant.now() : firstStartedAt;
updatedAt = updatedAt == null ? Instant.now() : updatedAt;
lastError = lastError == null ? "" : lastError;
if (workspaceHash.isBlank() || trigger.isBlank()) {
throw new IllegalArgumentException("workspaceHash and trigger must not be blank");
}
if (attempt <= 0) {
throw new IllegalArgumentException("attempt must be positive");
}
}
}

public enum RecoveryStatus {
RUNNING,
FAILED
}
}
Loading
Loading