[XPU] fix all_reduce all-zero accuracy issue under torch.compile (vllm-project#39844)

chaojun-zhang · mergify[bot] · web-flow · commit 993859ceb0f4 · 2026-04-18T02:33:07.000Z
Signed-off-by: Chaojun Zhang <chaojun.zhang@intel.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py
@@ -47,9 +47,10 @@ def __init__(
                 self.all2all_manager = AgRsAll2AllManager(self.cpu_group)
                 logger.info("Using AgRs manager on XPU device.")
 
-    def all_reduce(self, input_) -> torch.Tensor:
-        dist.all_reduce(input_, group=self.device_group)
-        return input_
+    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
+        output = input_.clone() if torch.compiler.is_compiling() else input_
+        dist.all_reduce(output, group=self.device_group)
+        return output
 
     def reduce_scatter(self, input_: torch.Tensor, dim: int = -1):
         world_size = self.world_size