Skip to content
17 changes: 17 additions & 0 deletions cuda_core/tests/test_tensor_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,23 @@ def test_invalid_data_type(self, dev, skip_if_no_tma):
data_type=42,
)

# HUMAN-REVIEWED
def test_as_tensor_map_host_view_rejected_without_tma(self):
    """Check that ``as_tensor_map`` refuses a host-only view.

    A NumPy array is wrapped in a ``StridedMemoryView`` and then asked for a
    tensor map; the call is expected to raise ``ValueError`` whose message
    mentions "device-accessible". This drives the ``as_tensor_map`` ->
    ``_from_tiled`` path and needs no TMA-capable hardware.
    """
    # Descriptor options are collected up front so the ``raises`` block
    # contains nothing but the call under test.
    descriptor_kwargs = {
        "box_dim": (32, 32),
        "data_type": TensorMapDataType.FLOAT32,
        "element_strides": (1, 1),
        "swizzle": TensorMapSwizzle.SWIZZLE_128B,
        "l2_promotion": TensorMapL2Promotion.L2_128B,
        "oob_fill": TensorMapOOBFill.NAN_REQUEST_ZERO_FMA,
    }
    host_array = np.zeros((64, 64), dtype=np.float32)
    host_view = StridedMemoryView.from_any_interface(host_array, stream_ptr=-1)
    with pytest.raises(ValueError, match="device-accessible"):
        host_view.as_tensor_map(**descriptor_kwargs)


class TestTensorMapDtypeMapping:
"""Test automatic dtype inference from numpy dtypes."""
Expand Down
124 changes: 91 additions & 33 deletions cuda_core/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,37 +1060,95 @@ def test_dlpack_export_non_native_endian_rejected():
bad_view.__dlpack__()


@pytest.mark.parametrize(
"dtype",
[
np.uint8,
np.uint16,
np.uint32,
np.uint64,
np.int8,
np.int16,
np.int32,
np.int64,
np.float16,
np.float32,
np.float64,
np.complex64,
np.complex128,
np.bool_,
],
)
def test_strided_memory_view_dtype_roundtrip_all(dtype):
"""Exercise dtype_dlpack_to_numpy for every NumPy-native DLPack dtype.

bfloat16 (kDLBfloat) is excluded -- NumPy's __dlpack__ doesn't reliably
export ml_dtypes-extended dtypes; cover separately via jax/torch if needed.
# HUMAN-REVIEWED
def test_strided_memory_view_proxy_cai_only_has_dlpack_false():
    """Check the proxy's bookkeeping for a CAI-only producer.

    When the wrapped object exposes just ``__cuda_array_interface__``,
    ``_StridedMemoryViewProxy`` should report ``has_dlpack=False`` (the CAI
    branch of ``check_has_dlpack``) and keep a reference to the very same
    object in ``obj``.
    """
    # Imported locally: the proxy is a private name of the module.
    from cuda.core._memoryview import _StridedMemoryViewProxy

    cai_only_obj = _make_cuda_array_interface_obj(shape=(2,), strides=None)
    wrapper = _StridedMemoryViewProxy(cai_only_obj)

    assert wrapper.has_dlpack is False
    assert wrapper.obj is cai_only_obj


# HUMAN-REVIEWED
def test_view_as_cai_device_pointer_and_stream_ordering(init_cuda):
"""``view_as_cai`` on a real device pointer resolves the device ordinal via
``cuPointerGetAttribute`` and takes the cross-stream branch when the CAI
``stream`` differs from the consumer stream.

This only exercises the code path and checks *device* correctness (ptr,
device_id, shape); it does NOT verify stream-order correctness. Uses a
synthetic CAI object backed by a genuine device allocation, so the
cupy/numba-only device branch is exercised without those optional deps.
"""
src = np.zeros(3, dtype=dtype)
# Probe NumPy first: if it can't export this dtype, skip as env limit.
# Any failure AFTER the probe is OUR consumer regression and must fail.
try:
src.__dlpack__()
except (BufferError, TypeError) as e:
pytest.skip(f"NumPy does not export {np.dtype(dtype)} via DLPack: {e}")
view = StridedMemoryView.from_dlpack(src, stream_ptr=-1)
assert view.dtype == np.dtype(dtype) # .dtype triggers dtype_dlpack_to_numpy
dev = init_cuda
buffer = dev.memory_resource.allocate(64, stream=dev.default_stream)
producer = dev.create_stream()
consumer = dev.create_stream()
obj = _make_cuda_array_interface_obj(
shape=(8,),
strides=None,
typestr="<f4",
data=(int(buffer.handle), False),
)
obj.__cuda_array_interface__["stream"] = int(producer.handle)

view = StridedMemoryView.from_cuda_array_interface(obj, stream_ptr=consumer.handle)

assert view.is_device_accessible is True
assert view.ptr == int(buffer.handle)
assert view.device_id == dev.device_id
assert view.shape == (8,)
dev.default_stream.sync()

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find this test a bit deceptive since it adds coverage without actually checking stream order correctness. (It does check for device correctness of course).
Now, testing stream-order correctness is annoying (you need either large buffers, or to launch many kernels on the first stream and then, e.g., do a copy-to-host after the export and check that all of them finished).
So maybe it's OK, but the bot oversells the test by a lot.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for pointing this out. I've updated the docstring to be honest about what the test does and doesn't check.



# HUMAN-REVIEWED
def test_strided_memory_view_init_cai_path_deprecated(init_cuda):
    """Check the deprecated direct constructor on a CAI-only object.

    ``StridedMemoryView(obj)`` is expected to emit a deprecation warning and
    go through the CAI branch (``view_as_cai``) rather than the DLPack one.
    The resulting view is then checked for accessibility, shape and device.
    """
    cai_obj = _make_cuda_array_interface_obj(
        shape=(4,),
        strides=None,
        typestr="<f4",
        data=(0, False),
    )
    warning_pattern = "CUDA-array-interface-supporting object is deprecated"

    with pytest.deprecated_call(match=warning_pattern):
        legacy_view = StridedMemoryView(cai_obj, stream_ptr=-1)

    assert legacy_view.is_device_accessible is True
    assert legacy_view.shape == (4,)
    assert legacy_view.device_id == init_cuda.device_id


# HUMAN-REVIEWED
def test_dlpack_export_device_accessible_cai_view(init_cuda):
    """Check DLPack export of a CAI-backed view over a device allocation.

    The view is built from a synthetic ``__cuda_array_interface__`` object
    that points at a real device allocation, so it carries no ``dl_tensor``.
    Per the original author's note this drives the ``_smv_get_dl_device``
    branch that calls ``get_buffer``/``classify_dl_device``. The test checks
    that ``__dlpack_device__`` reports a CUDA device with the expected
    ordinal and that ``__dlpack__`` yields a valid ``dltensor`` capsule.
    """
    device = init_cuda
    allocation = device.memory_resource.allocate(64, stream=device.default_stream)
    device_ptr = int(allocation.handle)
    cai_obj = _make_cuda_array_interface_obj(
        shape=(8,),
        strides=None,
        typestr="<f4",
        data=(device_ptr, False),
    )
    cai_view = StridedMemoryView.from_cuda_array_interface(cai_obj, stream_ptr=-1)

    reported_type, reported_id = cai_view.__dlpack_device__()
    assert reported_type == int(DLDeviceType.kDLCUDA)
    assert reported_id == device.device_id

    exported = cai_view.__dlpack__()
    assert _PyCapsule_IsValid(exported, b"dltensor") == 1
    # The capsule is never consumed; dropping the last reference lets its
    # deleter free the managed tensor.
    del exported
    device.default_stream.sync()


# HUMAN-REVIEWED
def test_strided_memory_view_repr_with_none_dtype(init_cuda):
    """Check ``repr`` of a view created with ``dtype=None``.

    The rendered string must begin with ``StridedMemoryView(ptr=`` and show
    the dtype as ``NoneType``. Per the original author's note this is the
    builtins branch of ``get_simple_repr``.
    """
    device = init_cuda
    raw_buffer = device.memory_resource.allocate(16, stream=device.default_stream)
    untyped_view = StridedMemoryView.from_buffer(
        raw_buffer,
        shape=(16,),
        itemsize=1,
        dtype=None,
    )
    assert untyped_view.dtype is None

    rendered = repr(untyped_view)
    assert rendered.startswith("StridedMemoryView(ptr=")
    assert "dtype=NoneType" in rendered
Loading
Loading