zig-simd - 解答

実装コード

ベクトル加算

const std = @import("std");

/// スカラー版ベクトル加算
pub fn vectorAdd(a: []const f32, b: []const f32, result: []f32) void {
    std.debug.assert(a.len == b.len and b.len == result.len);

    for (a, b, result) |av, bv, *rv| {
        rv.* = av + bv;
    }
}

/// SIMD版ベクトル加算
pub fn vectorAddSimd(a: []const f32, b: []const f32, result: []f32) void {
    std.debug.assert(a.len == b.len and b.len == result.len);

    const vec_len = 8; // AVX: 256bit / 32bit = 8
    var i: usize = 0;

    // SIMDで処理できる部分
    while (i + vec_len <= a.len) : (i += vec_len) {
        const va: @Vector(vec_len, f32) = a[i..][0..vec_len].*;
        const vb: @Vector(vec_len, f32) = b[i..][0..vec_len].*;
        const vr = va + vb;
        result[i..][0..vec_len].* = vr;
    }

    // 残りの要素(スカラー処理)
    while (i < a.len) : (i += 1) {
        result[i] = a[i] + b[i];
    }
}

ドット積

/// スカラー版ドット積
pub fn dotProduct(a: []const f32, b: []const f32) f32 {
    std.debug.assert(a.len == b.len);

    var sum: f32 = 0.0;
    for (a, b) |av, bv| {
        sum += av * bv;
    }
    return sum;
}

/// SIMD版ドット積
pub fn dotProductSimd(a: []const f32, b: []const f32) f32 {
    std.debug.assert(a.len == b.len);

    const vec_len = 8;
    var i: usize = 0;
    var sum_vec: @Vector(vec_len, f32) = @splat(0.0);

    // SIMDで処理
    while (i + vec_len <= a.len) : (i += vec_len) {
        const va: @Vector(vec_len, f32) = a[i..][0..vec_len].*;
        const vb: @Vector(vec_len, f32) = b[i..][0..vec_len].*;
        sum_vec += va * vb;
    }

    // ベクトルの要素を合計
    var sum = @reduce(.Add, sum_vec);

    // 残りの要素
    while (i < a.len) : (i += 1) {
        sum += a[i] * b[i];
    }

    return sum;
}

行列乗算

pub const Mat4 = struct {
    data: [4][4]f32,

    pub fn identity() Mat4 {
        return .{ .data = .{
            .{ 1, 0, 0, 0 },
            .{ 0, 1, 0, 0 },
            .{ 0, 0, 1, 0 },
            .{ 0, 0, 0, 1 },
        } };
    }

    /// スカラー版行列乗算
    pub fn multiply(self: Mat4, other: Mat4) Mat4 {
        var result: Mat4 = undefined;

        for (0..4) |i| {
            for (0..4) |j| {
                var sum: f32 = 0.0;
                for (0..4) |k| {
                    sum += self.data[i][k] * other.data[k][j];
                }
                result.data[i][j] = sum;
            }
        }

        return result;
    }

    /// SIMD版行列乗算
    pub fn multiplySimd(self: Mat4, other: Mat4) Mat4 {
        var result: Mat4 = undefined;

        // 列ベクトルを取得
        const col0: @Vector(4, f32) = .{ other.data[0][0], other.data[1][0], other.data[2][0], other.data[3][0] };
        const col1: @Vector(4, f32) = .{ other.data[0][1], other.data[1][1], other.data[2][1], other.data[3][1] };
        const col2: @Vector(4, f32) = .{ other.data[0][2], other.data[1][2], other.data[2][2], other.data[3][2] };
        const col3: @Vector(4, f32) = .{ other.data[0][3], other.data[1][3], other.data[2][3], other.data[3][3] };

        inline for (0..4) |i| {
            const row: @Vector(4, f32) = self.data[i];

            // 各列とのドット積
            result.data[i][0] = @reduce(.Add, row * col0);
            result.data[i][1] = @reduce(.Add, row * col1);
            result.data[i][2] = @reduce(.Add, row * col2);
            result.data[i][3] = @reduce(.Add, row * col3);
        }

        return result;
    }
};

ボーナス: 画像グレースケール変換

pub const Pixel = struct {
    r: u8,
    g: u8,
    b: u8,
    a: u8 = 255,
};

pub fn toGrayscaleSimd(pixels: []Pixel) void {
    const vec_len = 4;
    var i: usize = 0;

    const r_weight: @Vector(vec_len, f32) = @splat(0.299);
    const g_weight: @Vector(vec_len, f32) = @splat(0.587);
    const b_weight: @Vector(vec_len, f32) = @splat(0.114);

    while (i + vec_len <= pixels.len) : (i += vec_len) {
        // ピクセルデータをベクトルに変換
        var r_vals: [vec_len]f32 = undefined;
        var g_vals: [vec_len]f32 = undefined;
        var b_vals: [vec_len]f32 = undefined;

        inline for (0..vec_len) |j| {
            r_vals[j] = @floatFromInt(pixels[i + j].r);
            g_vals[j] = @floatFromInt(pixels[i + j].g);
            b_vals[j] = @floatFromInt(pixels[i + j].b);
        }

        const r_vec: @Vector(vec_len, f32) = r_vals;
        const g_vec: @Vector(vec_len, f32) = g_vals;
        const b_vec: @Vector(vec_len, f32) = b_vals;

        // グレースケール計算
        const gray_vec = r_vec * r_weight + g_vec * g_weight + b_vec * b_weight;

        // 結果を書き戻し
        inline for (0..vec_len) |j| {
            const gray: u8 = @intFromFloat(@min(@max(gray_vec[j], 0.0), 255.0));
            pixels[i + j] = .{ .r = gray, .g = gray, .b = gray };
        }
    }

    // 残りのピクセル
    while (i < pixels.len) : (i += 1) {
        const gray = @as(u8, @intFromFloat(
            @as(f32, @floatFromInt(pixels[i].r)) * 0.299 +
            @as(f32, @floatFromInt(pixels[i].g)) * 0.587 +
            @as(f32, @floatFromInt(pixels[i].b)) * 0.114
        ));
        pixels[i] = .{ .r = gray, .g = gray, .b = gray };
    }
}

テストコード

const testing = std.testing;

test "vectorAddSimd correctness" {
    const a = [_]f32{ 1, 2, 3, 4, 5, 6, 7, 8 };
    const b = [_]f32{ 8, 7, 6, 5, 4, 3, 2, 1 };
    var result: [8]f32 = undefined;

    vectorAddSimd(&a, &b, &result);

    for (result) |r| {
        try testing.expectApproxEqAbs(r, 9.0, 0.001);
    }
}

test "dotProductSimd correctness" {
    const a = [_]f32{ 1, 2, 3, 4 };
    const b = [_]f32{ 4, 3, 2, 1 };

    const result = dotProductSimd(&a, &b);

    try testing.expectApproxEqAbs(result, 20.0, 0.001);
}

test "Mat4 multiplySimd correctness" {
    const m1 = Mat4.identity();
    const m2 = Mat4{ .data = .{
        .{ 1, 2, 3, 4 },
        .{ 5, 6, 7, 8 },
        .{ 9, 10, 11, 12 },
        .{ 13, 14, 15, 16 },
    } };

    const result = m1.multiplySimd(m2);

    // 単位行列との乗算は元の行列
    for (0..4) |i| {
        for (0..4) |j| {
            try testing.expectApproxEqAbs(result.data[i][j], m2.data[i][j], 0.001);
        }
    }
}