vtl/tensor_cuda_d_cuda.v at main · vlang/vtl · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
module vtl

import vtl.storage
import vsl.cuda
import vsl.cuda.compute

// CudaTensor holds tensor data on GPU memory, together with the layout
// metadata (memory format, element count, shape and strides) that describes it.
@[heap]
pub struct CudaTensor[T] {
pub mut:
	// data references the GPU-side storage; it is nil until assigned.
	data    &storage.CudaStorage[T] = unsafe { nil }
	// memory is the layout tag checked by is_row_major() / is_col_major().
	memory  MemoryFormat
	// size is the total number of elements, as reported by numel().
	size    int
	// shape holds the extent of each dimension; its length is the rank.
	shape   []int
	// strides holds one stride per dimension (presumably in elements, matching
	// the CPU Tensor - confirm against the Tensor definition).
	strides []int
}

// cuda creates a CudaTensor from a Tensor by copying its data to the GPU.
// The tensor is first copied into row-major layout; that copy's storage is
// uploaded using `params`, and the copy's layout metadata (memory, size, shape,
// strides) is carried over to the result. Errors from the upload are propagated.
pub fn (t &Tensor[T]) cuda(params storage.CudaParams) !&CudaTensor[T] {
	host := t.copy(.row_major)
	mut gpu := &CudaTensor[T]{
		memory:  host.memory
		size:    host.size
		shape:   host.shape
		strides: host.strides
	}
	// Attach the device storage last; a failed upload returns the error as before.
	gpu.data = host.data.cuda(params)!
	return gpu
}

// cpu creates a Tensor from a CudaTensor by copying the data from GPU to CPU.
// The layout metadata (memory, size, shape, strides) is taken over unchanged.
// Errors from the storage download are propagated.
pub fn (t &CudaTensor[T]) cpu() !&Tensor[T] {
	host_storage := t.data.cpu()!
	host := &Tensor[T]{
		data:    host_storage
		memory:  t.memory
		size:    t.size
		shape:   t.shape
		strides: t.strides
	}
	return host
}

// cuda returns the same CudaTensor (identity function for chaining).
// `params` is accepted so the signature mirrors Tensor.cuda, but it is not
// used here: no copy or transfer takes place.
@[inline]
pub fn (t &CudaTensor[T]) cuda(params storage.CudaParams) !&CudaTensor[T] {
	return t
}

// release releases the GPU memory by delegating to the storage's release().
// A tensor whose `data` is still nil (the struct default) owns no storage, so
// the call is a no-op in that case instead of dereferencing a nil pointer.
pub fn (t &CudaTensor[T]) release() {
	if isnil(t.data) {
		return
	}
	t.data.release()
}

// rank returns the number of dimensions, i.e. the length of `shape`.
pub fn (t &CudaTensor[T]) rank() int {
	return t.shape.len
}

// numel returns the total number of elements, as stored in the `size` field.
pub fn (t &CudaTensor[T]) numel() int {
	return t.size
}

// is_matrix returns true if the tensor is a 2D matrix (its shape has exactly
// two dimensions).
@[inline]
pub fn (t &CudaTensor[T]) is_matrix() bool {
	return t.shape.len == 2
}

// is_square_matrix returns true if the tensor is a square matrix: it has
// exactly two dimensions and both have the same extent.
@[inline]
pub fn (t &CudaTensor[T]) is_square_matrix() bool {
	// Reject anything that is not 2D before indexing into shape.
	if t.shape.len != 2 {
		return false
	}
	return t.shape[0] == t.shape[1]
}

// is_vector returns true if the tensor is a 1D vector (its shape has exactly
// one dimension).
@[inline]
pub fn (t &CudaTensor[T]) is_vector() bool {
	return t.shape.len == 1
}

// is_row_major returns true if the tensor's memory format tag is row-major.
// Only the `memory` field is inspected; strides are not checked here.
@[inline]
pub fn (t &CudaTensor[T]) is_row_major() bool {
	return match t.memory {
		.row_major { true }
		else { false }
	}
}

// is_col_major returns true if the tensor's memory format tag is column-major.
// Only the `memory` field is inspected; strides are not checked here.
@[inline]
pub fn (t &CudaTensor[T]) is_col_major() bool {
	return match t.memory {
		.col_major { true }
		else { false }
	}
}

// is_row_major_contiguous returns true if the tensor is row-major and contiguous.
// The decision is delegated to the module-level is_row_major_contiguous helper,
// which receives the shape, the strides and the rank.
@[inline]
pub fn (t &CudaTensor[T]) is_row_major_contiguous() bool {
	ndims := t.shape.len
	return is_row_major_contiguous(t.shape, t.strides, ndims)
}

// is_col_major_contiguous returns true if the tensor is column-major and contiguous.
// The decision is delegated to the module-level is_col_major_contiguous helper,
// which receives the shape, the strides and the rank.
@[inline]
pub fn (t &CudaTensor[T]) is_col_major_contiguous() bool {
	ndims := t.shape.len
	return is_col_major_contiguous(t.shape, t.strides, ndims)
}

// is_contiguous returns true if the tensor is contiguous in either memory
// layout. The row-major check runs first; the column-major check is only
// evaluated when the row-major one fails.
@[inline]
pub fn (t &CudaTensor[T]) is_contiguous() bool {
	if t.is_row_major_contiguous() {
		return true
	}
	return t.is_col_major_contiguous()
}

// gemm_cuda computes dst = a * b (row-major, no transpose) using cuBLAS.
// Shapes: a is [m, k], b is [k, n], dst is [m, n].
// Requires T = f64 (double precision).
//
// Both operands are read back with `to_array`, converted to column-major,
// multiplied through `compute.gemm_cuda` on a's device, and the row-major
// result is copied into dst's existing GPU buffer.
//
// Returns an error if any tensor is not rank-2, if the shapes do not satisfy
// the contract above, if an intermediate step fails, or if the final copy
// into dst reports a non-zero CUDA status.
pub fn gemm_cuda(dst &CudaTensor[f64], a &CudaTensor[f64], b &CudaTensor[f64]) ! {
	if a.rank() != 2 || b.rank() != 2 || dst.rank() != 2 {
		return error('gemm_cuda: all tensors must be rank-2 matrices')
	}
	m := a.shape[0]
	k := a.shape[1]
	n := b.shape[1]

	// Enforce the documented shapes: a mismatched inner dimension gives a
	// meaningless product, and an undersized dst would be written out of bounds.
	if b.shape[0] != k {
		return error('gemm_cuda: inner dimensions must match, got a ${a.shape} and b ${b.shape}')
	}
	if dst.shape[0] != m || dst.shape[1] != n {
		return error('gemm_cuda: dst must have shape [${m}, ${n}], got ${dst.shape}')
	}

	// Convert row-major to column-major for cuBLAS
	a_col := cuda.row_to_col_major(a.data.to_array()!, m, k)
	b_col := cuda.row_to_col_major(b.data.to_array()!, k, n)

	// Get device and run GEMM
	dev := a.data.device
	result_col := compute.gemm_cuda(dev, a_col, b_col, m, n, k)!

	// Convert result back to row-major and copy to destination
	result_row := cuda.col_to_row_major(result_col, m, n)
	status := unsafe {
		C.cudaMemcpy(dst.data.ptr, result_row.data, int(sizeof(f64)) * m * n, cuda.cuda_memcpy_host_to_device)
	}
	// A failed copy would otherwise leave dst silently stale.
	if status != 0 {
		return error('gemm_cuda: cudaMemcpy failed with status ${status}')
	}
}

// gemv_cuda computes y = A * x (matrix-vector product) using cuBLAS.
// Shapes: a is [m, n], x is [n], y is [m].
// Requires T = f64 (double precision).
//
// The matrix is read back with `to_array` and converted to column-major, the
// product is computed through `compute.gemv_cuda` on a's device, and the
// result is copied into y's existing GPU buffer.
//
// Returns an error if the ranks are wrong, if the shapes do not satisfy the
// contract above, if an intermediate step fails, or if the final copy into y
// reports a non-zero CUDA status.
pub fn gemv_cuda(y &CudaTensor[f64], a &CudaTensor[f64], x &CudaTensor[f64]) ! {
	if a.rank() != 2 || x.rank() != 1 || y.rank() != 1 {
		return error('gemv_cuda: A must be rank-2, x and y must be rank-1')
	}
	m := a.shape[0]
	n := a.shape[1]

	// Enforce the documented shapes: a short x is read past its end by the
	// product, and a short y would be written out of bounds by the copy below.
	if x.shape[0] != n {
		return error('gemv_cuda: x must have length ${n}, got ${x.shape[0]}')
	}
	if y.shape[0] != m {
		return error('gemv_cuda: y must have length ${m}, got ${y.shape[0]}')
	}

	// Convert row-major to column-major for cuBLAS
	a_col := cuda.row_to_col_major(a.data.to_array()!, m, n)

	// Get device and run GEMV
	dev := a.data.device
	result := compute.gemv_cuda(dev, a_col, x.data.to_array()!, m, n)!

	// Copy result to destination
	status := unsafe {
		C.cudaMemcpy(y.data.ptr, result.data, int(sizeof(f64)) * m, cuda.cuda_memcpy_host_to_device)
	}
	// A failed copy would otherwise leave y silently stale.
	if status != 0 {
		return error('gemv_cuda: cudaMemcpy failed with status ${status}')
	}
}

// relu_cuda applies ReLU activation: max(0, x) on GPU.
// T must be f64.
// Returns a new CudaTensor with the result; the receiver is not modified.
//
// The input is read back with `to_array`, processed by `compute.relu_cuda` on
// the receiver's device, and the result is uploaded into a freshly allocated
// GPU buffer. The new tensor reuses the receiver's memory format, size, shape
// and strides.
//
// Returns an error if reading the input or the compute step fails, or if
// cudaMalloc / cudaMemcpy report a non-zero status.
pub fn (t &CudaTensor[f64]) relu_cuda() !&CudaTensor[f64] {
	dev := t.data.device
	input_data := t.data.to_array()!
	result := compute.relu_cuda(dev, input_data)!

	// Create output tensor on GPU
	mut output_storage := &storage.CudaStorage[f64]{
		device: dev
	}
	mut ptr := unsafe { nil }
	sz := int(sizeof(f64)) * result.len
	status := C.cudaMalloc(&ptr, sz)
	if status != 0 {
		return error('relu_cuda: cudaMalloc failed with status ${status}')
	}
	output_storage.ptr = ptr
	output_storage.size = sz
	output_storage.count = result.len

	copy_status := unsafe { C.cudaMemcpy(ptr, result.data, sz, cuda.cuda_memcpy_host_to_device) }
	if copy_status != 0 {
		// Hand the fresh allocation back (release() is what CudaTensor.release
		// uses to free GPU memory) so a failed upload does not leak it, and do
		// not return a tensor backed by uninitialised memory.
		output_storage.release()
		return error('relu_cuda: cudaMemcpy failed with status ${copy_status}')
	}

	return &CudaTensor[f64]{
		data:    output_storage
		memory:  t.memory
		size:    t.size
		shape:   t.shape
		strides: t.strides
	}
}

// sigmoid_cuda applies sigmoid activation: 1 / (1 + exp(-x)) on GPU.
// T must be f64.
// Returns a new CudaTensor with the result; the receiver is not modified.
//
// The input is read back with `to_array`, processed by `compute.sigmoid_cuda`
// on the receiver's device, and the result is uploaded into a freshly
// allocated GPU buffer. The new tensor reuses the receiver's memory format,
// size, shape and strides.
//
// Returns an error if reading the input or the compute step fails, or if
// cudaMalloc / cudaMemcpy report a non-zero status.
pub fn (t &CudaTensor[f64]) sigmoid_cuda() !&CudaTensor[f64] {
	dev := t.data.device
	input_data := t.data.to_array()!
	result := compute.sigmoid_cuda(dev, input_data)!

	// Create output tensor on GPU
	mut output_storage := &storage.CudaStorage[f64]{
		device: dev
	}
	mut ptr := unsafe { nil }
	sz := int(sizeof(f64)) * result.len
	status := C.cudaMalloc(&ptr, sz)
	if status != 0 {
		return error('sigmoid_cuda: cudaMalloc failed with status ${status}')
	}
	output_storage.ptr = ptr
	output_storage.size = sz
	output_storage.count = result.len

	copy_status := unsafe { C.cudaMemcpy(ptr, result.data, sz, cuda.cuda_memcpy_host_to_device) }
	if copy_status != 0 {
		// Hand the fresh allocation back (release() is what CudaTensor.release
		// uses to free GPU memory) so a failed upload does not leak it, and do
		// not return a tensor backed by uninitialised memory.
		output_storage.release()
		return error('sigmoid_cuda: cudaMemcpy failed with status ${copy_status}')
	}

	return &CudaTensor[f64]{
		data:    output_storage
		memory:  t.memory
		size:    t.size
		shape:   t.shape
		strides: t.strides
	}
}