Set early stopping tolerance

jinlow · jinlow · commit 8ad136138bed · 2026-04-13T16:21:47.000-05:00
diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py
@@ -339,6 +339,7 @@ def __init__(
         grow_policy: str = "DepthWise",
         evaluation_metric: str | None = None,
         early_stopping_rounds: int | None = None,
+        early_stopping_delta: float = 1e-7,
         initialize_base_score: bool = True,
         terminate_missing_features: Iterable[Any] | None = None,
         missing_node_treatment: str = "None",
@@ -422,6 +423,8 @@ def __init__(
             early_stopping_rounds (int | None, optional): If this is specified, and an `evaluation_dataset` is passed
                 during fit, then an improvement in the `evaluation_metric` must be seen after at least this many
                 iterations of training, otherwise training will be cut short.
+            early_stopping_delta (float, optional): Amount by which the evaluation metric must improve on the
+                best value seen so far for an iteration to count as an improvement for early stopping; the improvement must strictly exceed this value. Defaults to 1e-7. Set to 0.0 to count any strict improvement.
             initialize_base_score (bool, optional): If this is specified, the `base_score` will be calculated at fit time using the `sample_weight` and y data in accordance with the requested `objective_type`. This will result in the passed `base_score` value being overridden.
             terminate_missing_features (set[Any], optional): An optional iterable of features (either strings, or integer values specifying the feature indices if numpy arrays are used for fitting), for which the missing node will always be terminated, even if `allow_missing_splits` is set to true. This value is only valid if `create_missing_branch` is also True.
             missing_node_treatment (str, optional): Method for selecting the `weight` for the missing node, if `create_missing_branch` is set to `True`. Defaults to "None". Valid options are:
@@ -516,6 +519,7 @@ def __init__(
             grow_policy=grow_policy,
             evaluation_metric=evaluation_metric,
             early_stopping_rounds=early_stopping_rounds,
+            early_stopping_delta=early_stopping_delta,
             initialize_base_score=initialize_base_score,
             terminate_missing_features=set(),
             missing_node_treatment=missing_node_treatment,
@@ -556,6 +560,7 @@ def __init__(
         self.other_rate = other_rate
         self.evaluation_metric = evaluation_metric
         self.early_stopping_rounds = early_stopping_rounds
+        self.early_stopping_delta = early_stopping_delta
         self.initialize_base_score = initialize_base_score
         self.terminate_missing_features = terminate_missing_features_
         self.missing_node_treatment = missing_node_treatment
diff --git a/py-forust/src/lib.rs b/py-forust/src/lib.rs
@@ -78,6 +78,7 @@ impl GradientBooster {
         grow_policy,
         evaluation_metric,
         early_stopping_rounds,
+        early_stopping_delta,
         initialize_base_score,
         terminate_missing_features,
         missing_node_treatment,
@@ -111,6 +112,7 @@ impl GradientBooster {
         grow_policy: &str,
         evaluation_metric: Option<&str>,
         early_stopping_rounds: Option<usize>,
+        early_stopping_delta: f64,
         initialize_base_score: bool,
         terminate_missing_features: HashSet<usize>,
         missing_node_treatment: &str,
@@ -157,6 +159,7 @@ impl GradientBooster {
             grow_policy_,
             evaluation_metric_,
             early_stopping_rounds,
+            early_stopping_delta,
             initialize_base_score,
             terminate_missing_features,
             missing_node_treatment_,
@@ -420,6 +423,7 @@ impl GradientBooster {
         dict.set_item("grow_policy", grow_policy_)?;
         dict.set_item("evaluation_metric", evaluation_metric_)?;
         dict.set_item("early_stopping_rounds", self.booster.early_stopping_rounds)?;
+        dict.set_item("early_stopping_delta", self.booster.early_stopping_delta)?;
         dict.set_item("initialize_base_score", self.booster.initialize_base_score)?;
         dict.set_item(
             "terminate_missing_features",
diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs
@@ -159,6 +159,11 @@ pub struct GradientBooster {
     /// to keep training.
     #[serde(default = "default_early_stopping_rounds")]
     pub early_stopping_rounds: Option<usize>,
+    /// Minimum improvement in the evaluation metric required to count as
+    /// an improvement for early stopping purposes. Defaults to 1e-7 to
+    /// match XGBoost's behavior.
+    #[serde(default = "default_early_stopping_delta")]
+    pub early_stopping_delta: f64,
     /// If this is specified, the base_score will be calculated using the sample_weight and y data in accordance with the requested objective_type.
     #[serde(default = "default_initialize_base_score")]
     pub initialize_base_score: bool,
@@ -221,6 +226,9 @@ fn default_evaluation_metric() -> Option<Metric> {
 fn default_early_stopping_rounds() -> Option<usize> {
     None
 }
+fn default_early_stopping_delta() -> f64 {
+    1e-7
+}
 fn default_evaluation_history() -> Option<RowMajorMatrix<f64>> {
     None
 }
@@ -283,6 +291,7 @@ impl Default for GradientBooster {
             GrowPolicy::DepthWise,
             None,
             None,
+            1e-7,
             true,
             HashSet::new(),
             MissingNodeTreatment::AssignToParent,
@@ -334,6 +343,7 @@ impl GradientBooster {
     /// * `sample_method` - Specify the method that records should be sampled when training?
     /// * `evaluation_metric` - Define the evaluation metric to record at each iterations.
     /// * `early_stopping_rounds` - Number of rounds that must
+    /// * `early_stopping_delta` - Minimum improvement required to reset the early stopping counter.
     /// * `initialize_base_score` - If this is specified, the base_score will be calculated using the sample_weight and y data in accordance with the requested objective_type.
     /// * `missing_node_treatment` - specify how missing nodes should be handled during training.
     /// * `log_iterations` - Setting to a value (N) other than zero will result in information being logged about ever N iterations.
@@ -365,6 +375,7 @@ impl GradientBooster {
         grow_policy: GrowPolicy,
         evaluation_metric: Option<Metric>,
         early_stopping_rounds: Option<usize>,
+        early_stopping_delta: f64,
         initialize_base_score: bool,
         terminate_missing_features: HashSet<usize>,
         missing_node_treatment: MissingNodeTreatment,
@@ -398,6 +409,7 @@ impl GradientBooster {
             grow_policy,
             evaluation_metric,
             early_stopping_rounds,
+            early_stopping_delta,
             initialize_base_score,
             terminate_missing_features,
             evaluation_history: None,
@@ -662,7 +674,7 @@ impl GradientBooster {
                                 // Otherwise the best could be farther back.
                                 Some(v) => {
                                     // We have reached a new best value...
-                                    if is_comparison_better(v, m, maximize) {
+                                    if is_comparison_better(v, m, maximize, self.early_stopping_delta) {
                                         self.update_best_iteration(i);
                                         Some(m)
                                     } else {
@@ -1264,6 +1276,13 @@ impl GradientBooster {
         self
     }
 
+    /// Set the minimum improvement delta for early stopping.
+    /// * `early_stopping_delta` - Minimum improvement required.
+    pub fn set_early_stopping_delta(mut self, early_stopping_delta: f64) -> Self {
+        self.early_stopping_delta = early_stopping_delta;
+        self
+    }
+
     /// Set prediction iterations.
     /// * `early_stopping_rounds` - Early stoppings rounds.
     pub fn set_prediction_iteration(mut self, prediction_iteration: Option<usize>) -> Self {
diff --git a/src/metric.rs b/src/metric.rs
@@ -9,7 +9,7 @@ pub type MetricFn = fn(&[f64], &[f64], &[f64]) -> f64;
 /// Compare to metric values, determining if b is better.
 /// If one of them is NaN favor the non NaN value.
 /// If both are NaN, consider the first value to be better.
-pub fn is_comparison_better(value: f64, comparison: f64, maximize: bool) -> bool {
+pub fn is_comparison_better(value: f64, comparison: f64, maximize: bool, delta: f64) -> bool {
     match (value.is_nan(), comparison.is_nan()) {
         // Both nan, comparison is not better,
         // Or comparison is nan, also not better
@@ -19,13 +19,13 @@ pub fn is_comparison_better(value: f64, comparison: f64, maximize: bool) -> bool
         // Perform numerical comparison.
         (false, false) => {
             // If we are maximizing is the comparison
-            // greater, than the current value
+            // greater than the current value by more than delta
             if maximize {
-                value < comparison
+                comparison > value + delta
             // If we are minimizing is the comparison
-            // less than the current value.
+            // less than the current value by more than delta
             } else {
-                value > comparison
+                comparison < value - delta
             }
         }
     }