zig-simd - 解答
実装コード
ベクトル加算
const std = @import("std");
/// スカラー版ベクトル加算
pub fn vectorAdd(a: []const f32, b: []const f32, result: []f32) void {
std.debug.assert(a.len == b.len and b.len == result.len);
for (a, b, result) |av, bv, *rv| {
rv.* = av + bv;
}
}
/// SIMD版ベクトル加算
pub fn vectorAddSimd(a: []const f32, b: []const f32, result: []f32) void {
std.debug.assert(a.len == b.len and b.len == result.len);
const vec_len = 8; // AVX: 256bit / 32bit = 8
var i: usize = 0;
// SIMDで処理できる部分
while (i + vec_len <= a.len) : (i += vec_len) {
const va: @Vector(vec_len, f32) = a[i..][0..vec_len].*;
const vb: @Vector(vec_len, f32) = b[i..][0..vec_len].*;
const vr = va + vb;
result[i..][0..vec_len].* = vr;
}
// 残りの要素(スカラー処理)
while (i < a.len) : (i += 1) {
result[i] = a[i] + b[i];
}
}
ドット積
/// スカラー版ドット積
pub fn dotProduct(a: []const f32, b: []const f32) f32 {
std.debug.assert(a.len == b.len);
var sum: f32 = 0.0;
for (a, b) |av, bv| {
sum += av * bv;
}
return sum;
}
/// SIMD版ドット積
pub fn dotProductSimd(a: []const f32, b: []const f32) f32 {
std.debug.assert(a.len == b.len);
const vec_len = 8;
var i: usize = 0;
var sum_vec: @Vector(vec_len, f32) = @splat(0.0);
// SIMDで処理
while (i + vec_len <= a.len) : (i += vec_len) {
const va: @Vector(vec_len, f32) = a[i..][0..vec_len].*;
const vb: @Vector(vec_len, f32) = b[i..][0..vec_len].*;
sum_vec += va * vb;
}
// ベクトルの要素を合計
var sum = @reduce(.Add, sum_vec);
// 残りの要素
while (i < a.len) : (i += 1) {
sum += a[i] * b[i];
}
return sum;
}
行列乗算
pub const Mat4 = struct {
data: [4][4]f32,
pub fn identity() Mat4 {
return .{ .data = .{
.{ 1, 0, 0, 0 },
.{ 0, 1, 0, 0 },
.{ 0, 0, 1, 0 },
.{ 0, 0, 0, 1 },
} };
}
/// スカラー版行列乗算
pub fn multiply(self: Mat4, other: Mat4) Mat4 {
var result: Mat4 = undefined;
for (0..4) |i| {
for (0..4) |j| {
var sum: f32 = 0.0;
for (0..4) |k| {
sum += self.data[i][k] * other.data[k][j];
}
result.data[i][j] = sum;
}
}
return result;
}
/// SIMD版行列乗算
pub fn multiplySimd(self: Mat4, other: Mat4) Mat4 {
var result: Mat4 = undefined;
// 列ベクトルを取得
const col0: @Vector(4, f32) = .{ other.data[0][0], other.data[1][0], other.data[2][0], other.data[3][0] };
const col1: @Vector(4, f32) = .{ other.data[0][1], other.data[1][1], other.data[2][1], other.data[3][1] };
const col2: @Vector(4, f32) = .{ other.data[0][2], other.data[1][2], other.data[2][2], other.data[3][2] };
const col3: @Vector(4, f32) = .{ other.data[0][3], other.data[1][3], other.data[2][3], other.data[3][3] };
inline for (0..4) |i| {
const row: @Vector(4, f32) = self.data[i];
// 各列とのドット積
result.data[i][0] = @reduce(.Add, row * col0);
result.data[i][1] = @reduce(.Add, row * col1);
result.data[i][2] = @reduce(.Add, row * col2);
result.data[i][3] = @reduce(.Add, row * col3);
}
return result;
}
};
ボーナス: 画像グレースケール変換
pub const Pixel = struct {
r: u8,
g: u8,
b: u8,
a: u8 = 255,
};
pub fn toGrayscaleSimd(pixels: []Pixel) void {
const vec_len = 4;
var i: usize = 0;
const r_weight: @Vector(vec_len, f32) = @splat(0.299);
const g_weight: @Vector(vec_len, f32) = @splat(0.587);
const b_weight: @Vector(vec_len, f32) = @splat(0.114);
while (i + vec_len <= pixels.len) : (i += vec_len) {
// ピクセルデータをベクトルに変換
var r_vals: [vec_len]f32 = undefined;
var g_vals: [vec_len]f32 = undefined;
var b_vals: [vec_len]f32 = undefined;
inline for (0..vec_len) |j| {
r_vals[j] = @floatFromInt(pixels[i + j].r);
g_vals[j] = @floatFromInt(pixels[i + j].g);
b_vals[j] = @floatFromInt(pixels[i + j].b);
}
const r_vec: @Vector(vec_len, f32) = r_vals;
const g_vec: @Vector(vec_len, f32) = g_vals;
const b_vec: @Vector(vec_len, f32) = b_vals;
// グレースケール計算
const gray_vec = r_vec * r_weight + g_vec * g_weight + b_vec * b_weight;
// 結果を書き戻し
inline for (0..vec_len) |j| {
const gray: u8 = @intFromFloat(@min(@max(gray_vec[j], 0.0), 255.0));
pixels[i + j] = .{ .r = gray, .g = gray, .b = gray };
}
}
// 残りのピクセル
while (i < pixels.len) : (i += 1) {
const gray = @as(u8, @intFromFloat(
@as(f32, @floatFromInt(pixels[i].r)) * 0.299 +
@as(f32, @floatFromInt(pixels[i].g)) * 0.587 +
@as(f32, @floatFromInt(pixels[i].b)) * 0.114
));
pixels[i] = .{ .r = gray, .g = gray, .b = gray };
}
}
テストコード
const testing = std.testing;
test "vectorAddSimd correctness" {
const a = [_]f32{ 1, 2, 3, 4, 5, 6, 7, 8 };
const b = [_]f32{ 8, 7, 6, 5, 4, 3, 2, 1 };
var result: [8]f32 = undefined;
vectorAddSimd(&a, &b, &result);
for (result) |r| {
try testing.expectApproxEqAbs(r, 9.0, 0.001);
}
}
test "dotProductSimd correctness" {
const a = [_]f32{ 1, 2, 3, 4 };
const b = [_]f32{ 4, 3, 2, 1 };
const result = dotProductSimd(&a, &b);
try testing.expectApproxEqAbs(result, 20.0, 0.001);
}
test "Mat4 multiplySimd correctness" {
const m1 = Mat4.identity();
const m2 = Mat4{ .data = .{
.{ 1, 2, 3, 4 },
.{ 5, 6, 7, 8 },
.{ 9, 10, 11, 12 },
.{ 13, 14, 15, 16 },
} };
const result = m1.multiplySimd(m2);
// 単位行列との乗算は元の行列
for (0..4) |i| {
for (0..4) |j| {
try testing.expectApproxEqAbs(result.data[i][j], m2.data[i][j], 0.001);
}
}
}