Why does access cudaMallocManaged memory throw exception?

/example
│-- CMakeLists.txt
│-- main.cu

CMakeLists.txt:

cmake_minimum_required(VERSION 3.12)
project(SimpleCudaProject CUDA)
set(CMAKE_CUDA_STANDARD 17)
add_executable(SimpleCudaProject main.cu)

main.cu:

#include <cassert>
#include <cstdio>
#include <cstdlib>
#define CHECK(call)                                                     \
    do {                                                                \
        const cudaError_t error_code = call;                            \
        if (error_code != cudaSuccess) {                                \
            printf("CUDA Error:\n");                                    \
            printf("File: %s\n", __FILE__);                             \
            printf("Line: %d\n", __LINE__);                             \
            printf("Error code: %d\n", error_code);                     \
            printf("Error text: %s\n", cudaGetErrorString(error_code)); \
            assert(0);                                                  \
            exit(1);                                                    \
        }                                                               \
    } while (0)

__global__ void warmup() {}
int main() {
    for (size_t i = 0; i < 3; i++) {
        warmup<<<1, 1>>>();
        float* buffer;
        CHECK(cudaMallocManaged(&buffer, sizeof(float) * 48));
        buffer[0] = 1;
        cudaFree(buffer);
    }
    return 0;
}

This is so confusing. Can anyone help me?
Hardware: RTX 5070 Ti laptop
Software: latest Windows 11 and latest Visual Studio 2022

nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Tue_May_27_02:24:01_Pacific_Daylight_Time_2025
Cuda compilation tools, release 12.9, V12.9.86
Build cuda_12.9.r12.9/compiler.36037853_0

nvidia-smi
Tue Jun 17 14:54:47 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.57                 Driver Version: 576.57         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 5070 …     WDDM |   00000000:01:00.0  On |                  N/A |
| N/A   50C    P8              5W / 140W  |     616MiB / 12227MiB  |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                               |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A            2276    C+G   …iceHub.ThreadedWaitDialog.exe           N/A   |
|    0   N/A  N/A           12412    C+G   C:\Windows\explorer.exe                  N/A   |
|    0   N/A  N/A           29160    C     …d\Debug\SimpleCudaProject.exe           N/A   |
|    0   N/A  N/A           29220    C+G   …munity\Common7\IDE\devenv.exe           N/A   |
+-----------------------------------------------------------------------------------------+

Managed memory on Windows has limitations, which are explained in the CUDA C++ Programming Guide. On Windows (WDDM) devices concurrentManagedAccess is 0, which means the CPU must not touch any managed allocation while a kernel is running on the GPU (even a kernel that never uses that allocation); the host may access managed memory only after the device has been synchronized.
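
A quick way to confirm this on a given machine is to query the concurrentManagedAccess device attribute. A minimal sketch (assuming device 0 is the GPU in question):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int concurrent = 0;
    // Expected to report 0 on Windows/WDDM: the host must not access managed
    // memory while the GPU is (or could still be) executing work.
    cudaDeviceGetAttribute(&concurrent, cudaDevAttrConcurrentManagedAccess, 0);
    printf("concurrentManagedAccess: %d\n", concurrent);
    return 0;
}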

Does it work if you call cudaDeviceSynchronize() before the allocation?
Does it work if you allocate outside of the loop?

This works.

int main() {
    for (size_t i = 0; i < 3; i++) {
        warmup<<<1, 1>>>();
        cudaDeviceSynchronize();
        float* buffer;
        CHECK(cudaMallocManaged(&buffer, sizeof(float) * 48));
        buffer[0] = 1;
        cudaFree(buffer);
    }
    return 0;
}

In my test, I need to allocate inside the loop.
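
For reference, a variant of the same loop (under the same assumption of non-concurrent managed access on WDDM) that also routes the kernel-launch result and the synchronize through the CHECK macro, so a failure is reported where it happens rather than as a later access violation:

int main() {
    for (size_t i = 0; i < 3; i++) {
        warmup<<<1, 1>>>();
        CHECK(cudaGetLastError());       // surface kernel-launch errors immediately
        CHECK(cudaDeviceSynchronize());  // on WDDM the host may touch managed memory only after this
        float* buffer;
        CHECK(cudaMallocManaged(&buffer, sizeof(float) * 48));
        buffer[0] = 1;
        CHECK(cudaFree(buffer));
    }
    return 0;
}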