diff --git a/aceclaw-core/src/main/java/dev/aceclaw/core/planner/ComplexityEstimator.java b/aceclaw-core/src/main/java/dev/aceclaw/core/planner/ComplexityEstimator.java index 527a4f8a..b51d9c6d 100644 --- a/aceclaw-core/src/main/java/dev/aceclaw/core/planner/ComplexityEstimator.java +++ b/aceclaw-core/src/main/java/dev/aceclaw/core/planner/ComplexityEstimator.java @@ -15,12 +15,15 @@ public final class ComplexityEstimator { /** * Default threshold for the no-arg constructor. Mirrors - * {@code AceClawConfig.DEFAULT_PLANNER_THRESHOLD} — keep the two - * in sync. Lowered from 5 to 3 because single-signal compound - * prompts ("refactor X", "extract Y", "do A and then B") are - * already plannable but were never triggering the planner. + * {@code AceClawConfig.DEFAULT_PLANNER_THRESHOLD} — keep the + * two in sync. Settled at 4: too low (3) made every single + * "refactor X" / "extract Y" trigger a planner LLM call even on + * trivial prompts; too high (5) required two explicit signals + * which most prompts didn't hit. At 4, a single +3 signal alone + * stays as plain ReAct, but adding ANY second signal flips on + * planning. 
*/ - private static final int DEFAULT_THRESHOLD = 3; + private static final int DEFAULT_THRESHOLD = 4; // -- Heuristic patterns --------------------------------------------------- diff --git a/aceclaw-core/src/test/java/dev/aceclaw/core/planner/ComplexityEstimatorTest.java b/aceclaw-core/src/test/java/dev/aceclaw/core/planner/ComplexityEstimatorTest.java index a3c3afef..f3cdd68d 100644 --- a/aceclaw-core/src/test/java/dev/aceclaw/core/planner/ComplexityEstimatorTest.java +++ b/aceclaw-core/src/test/java/dev/aceclaw/core/planner/ComplexityEstimatorTest.java @@ -40,15 +40,31 @@ void multipleFiles_detected() { } @Test - void refactoring_highScore() { + void refactoring_singleSignalDoesNotPlan_atDefaultThreshold() { + // The signal IS detected and contributes its 3 points, but at + // the default threshold (4) a single +3 signal alone is not + // enough to invoke the planner — the planner adds a real + // LLM-call cost, and "refactor X" can mean trivially small + // work (REFACTORING regex matches "extract" too, etc.). + // Users who want planning on a borderline single-signal prompt + // can use the /plan slash command (forcePlan) instead. var score = estimator.estimate("Refactor the authentication module"); assertTrue(score.signals().contains("refactoring")); - assertTrue(score.score() >= 3); - // After the threshold drop (5 → 3), a single 'refactoring' - // signal is enough to plan. Pinning so a future revert can't - // silently restore the old "single signal never plans" - // behavior without updating this test. - assertTrue(score.shouldPlan()); + assertEquals(3, score.score()); + assertFalse(score.shouldPlan()); + } + + @Test + void refactoring_plusSecondSignal_plans() { + // Adding ANY second signal pushes a +3 prompt over threshold 4. + // Pins the rule "single +3 signal → no plan; +3 with anything + // else → plan" so future threshold tweaks have to reckon with + // the assertion explicitly. 
+ var withTesting = estimator.estimate("Refactor the auth module and add tests"); + assertTrue(withTesting.signals().contains("refactoring")); + assertTrue(withTesting.signals().contains("testing")); + assertTrue(withTesting.score() >= 4); + assertTrue(withTesting.shouldPlan()); } @Test diff --git a/aceclaw-daemon/src/main/java/dev/aceclaw/daemon/AceClawConfig.java b/aceclaw-daemon/src/main/java/dev/aceclaw/daemon/AceClawConfig.java index ed29babf..920c4c58 100644 --- a/aceclaw-daemon/src/main/java/dev/aceclaw/daemon/AceClawConfig.java +++ b/aceclaw-daemon/src/main/java/dev/aceclaw/daemon/AceClawConfig.java @@ -75,17 +75,27 @@ public final class AceClawConfig { private static final boolean DEFAULT_HEARTBEAT_ENABLED = true; private static final boolean DEFAULT_PLANNER_ENABLED = true; /** - * Default complexity score for triggering the planner. Lowered - * from 5 → 3 so single-signal compound prompts ("refactor X", - * "rename across", "do A and then B") trigger a plan instead of - * being treated as plain ReAct turns. Empirically, threshold=5 - * required two explicit signals which most everyday agentic - * prompts don't hit, so the planner essentially never fired for - * typical work. See {@link ComplexityEstimator} for the score - * table. Users can still override via config to restore older - * behavior. - */ - private static final int DEFAULT_PLANNER_THRESHOLD = 3; + * Default complexity score for triggering the planner. Bumped + * from 5 → 4 (initially landed at 3, dialled back after review). + * + *
<p>Threshold 5 required two explicit signals — most everyday + * agentic prompts hit at most one, so the planner essentially + * never fired. Threshold 3 went too far the other way: every + * single "refactor X" / "extract Y" (REFACTORING regex matches + * "extract" too) triggered a planner LLM call before any actual + * work, even on trivial prompts. + * + *
<p>Threshold 4 is the middle ground: single-signal +3 prompts
+ * ("refactor X" alone) stay as plain ReAct, but adding ANY
+ * second signal (a long description, a second action, multiple
+ * files, testing, …) flips on planning. Users who explicitly
+ * want the planner on a borderline prompt can use
+ * {@code /plan}. See {@link ComplexityEstimator} for the score table.
+ */
+ private static final int DEFAULT_PLANNER_THRESHOLD = 4;
private static final boolean DEFAULT_ADAPTIVE_REPLAN_ENABLED = true;
private static final boolean DEFAULT_CANDIDATE_INJECTION_ENABLED = true;
private static final boolean DEFAULT_CANDIDATE_PROMOTION_ENABLED = true;