第17章: SIMD基礎

学習目標

この章を終えると、以下ができるようになります:

SIMDの概念と利点を理解する
Zigの @Vector 型を使用してベクトル演算を行う
数値計算をSIMDで最適化する
SIMDの制限と注意点を理解する
実践的なSIMD活用例を実装できる

SIMDとは

Single Instruction Multiple Data

SIMD（Single Instruction Multiple Data）は、1つの命令で複数のデータを同時に処理する並列処理技術です。

スカラー処理:               SIMD処理:
a0 + b0 = c0              [a0, a1, a2, a3] + [b0, b1, b2, b3]
a1 + b1 = c1       →         = [c0, c1, c2, c3]
a2 + b2 = c2
a3 + b3 = c3              1命令で4つの加算を実行

SIMDの利点

高速化: 1つの命令で複数の演算を実行
効率: メモリ帯域幅の効率的な利用
省電力: 命令数の削減により消費電力を抑制

実世界での応用

用途              例
────────────────────────────────────
画像処理          ピクセル単位の演算
音声処理          サンプル単位の処理
科学計算          行列演算、シミュレーション
機械学習          ベクトル・行列演算
暗号化            ブロック暗号の並列処理

Zigの@Vector型

基本的な使い方

const std = @import("std");

/// Adds two 4-lane `i32` vectors element-wise and prints both operands
/// followed by their sum.
pub fn basicVectorExample() void {
    const I32x4 = @Vector(4, i32);

    const lhs = I32x4{ 1, 2, 3, 4 };
    const rhs = I32x4{ 5, 6, 7, 8 };

    // `+` on vectors works lane by lane: { 6, 8, 10, 12 }.
    const total = lhs + rhs;

    std.debug.print("a = {any}\n", .{lhs});
    std.debug.print("b = {any}\n", .{rhs});
    std.debug.print("a + b = {any}\n", .{total});
}

/// Prints one sample vector for each of three element types:
/// `i32` (4 lanes), `f32` (4 lanes) and `u8` (8 lanes).
pub fn vectorTypes() void {
    const signed_lanes = @Vector(4, i32){ 1, 2, 3, 4 };
    const float_lanes = @Vector(4, f32){ 1.0, 2.0, 3.0, 4.0 };
    const byte_lanes = @Vector(8, u8){ 1, 2, 3, 4, 5, 6, 7, 8 };

    std.debug.print("i32 vector: {any}\n", .{signed_lanes});
    std.debug.print("f32 vector: {any}\n", .{float_lanes});
    std.debug.print("u8 vector: {any}\n", .{byte_lanes});
}

ベクトルのサイズ

ベクトルサイズは2の累乗が一般的ですが、任意のサイズも指定可能です:

const std = @import("std");

/// Constructs vectors with 2, 4, 8, 3 and 5 lanes and discards each one.
/// The function has no observable effect; it only shows which lengths are
/// accepted by `@Vector`.
pub fn vectorSizes() void {
    // Power-of-two lengths.
    _ = @Vector(2, f32){ 1.0, 2.0 };
    _ = @Vector(4, f32){ 1.0, 2.0, 3.0, 4.0 };
    _ = @Vector(8, f32){ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0 };

    // Lengths that are not a power of two.
    _ = @Vector(3, f32){ 1.0, 2.0, 3.0 };
    _ = @Vector(5, i32){ 1, 2, 3, 4, 5 };
}

注意: ハードウェアがネイティブにサポートしないサイズは、コンパイラがエミュレートします。

ベクトル演算

算術演算

const std = @import("std");

/// Prints the element-wise sum, difference, product and quotient of two
/// fixed 4-lane `f32` vectors.
pub fn arithmeticOperations() void {
    const F32x4 = @Vector(4, f32);

    const lhs = F32x4{ 1.0, 2.0, 3.0, 4.0 };
    const rhs = F32x4{ 5.0, 6.0, 7.0, 8.0 };

    // Every operator below is applied lane by lane.
    const added = lhs + rhs; // { 6.0, 8.0, 10.0, 12.0 }
    const subtracted = lhs - rhs; // { -4.0, -4.0, -4.0, -4.0 }
    const multiplied = lhs * rhs; // { 5.0, 12.0, 21.0, 32.0 }
    const divided = rhs / lhs; // { 5.0, 3.0, 2.33..., 2.0 }

    std.debug.print("a + b = {any}\n", .{added});
    std.debug.print("a - b = {any}\n", .{subtracted});
    std.debug.print("a * b = {any}\n", .{multiplied});
    std.debug.print("b / a = {any}\n", .{divided});
}

/// Multiplies a 4-lane `f32` vector by the scalar 2.0 and prints the result.
/// The scalar is first broadcast to every lane with `@splat`.
pub fn scalarOperations() void {
    const F32x4 = @Vector(4, f32);

    const input = F32x4{ 1.0, 2.0, 3.0, 4.0 };

    // Broadcast 2.0 to all four lanes, then multiply: { 2.0, 4.0, 6.0, 8.0 }.
    const doubled = input * @as(F32x4, @splat(2.0));

    std.debug.print("v * 2 = {any}\n", .{doubled});
}

比較演算

const std = @import("std");

/// Compares two 4-lane `f32` vectors with `<`, `==` and `>` and prints the
/// resulting boolean vectors.
pub fn comparisonOperations() void {
    const F32x4 = @Vector(4, f32);

    const values = F32x4{ 1.0, 2.0, 3.0, 4.0 };
    const threshold: F32x4 = @splat(2.0);

    // Each comparison yields one bool per lane.
    const is_less = values < threshold; // { true, false, false, false }
    const is_equal = values == threshold; // { false, true, false, false }
    const is_greater = values > threshold; // { false, false, true, true }

    std.debug.print("a < b: {any}\n", .{is_less});
    std.debug.print("a == b: {any}\n", .{is_equal});
    std.debug.print("a > b: {any}\n", .{is_greater});
}

ビット演算

const std = @import("std");

/// Prints the lane-wise AND, OR, XOR of two 4-lane `u32` vectors and the
/// bitwise complement of the first one.
pub fn bitwiseOperations() void {
    const U32x4 = @Vector(4, u32);

    const lhs = U32x4{ 0xFF, 0xAA, 0x55, 0x00 };
    const rhs = U32x4{ 0x0F, 0xF0, 0x33, 0xFF };

    // Bitwise operators are applied to each pair of lanes independently.
    std.debug.print("a & b = {any}\n", .{lhs & rhs});
    std.debug.print("a | b = {any}\n", .{lhs | rhs});
    std.debug.print("a ^ b = {any}\n", .{lhs ^ rhs});
    std.debug.print("~a = {any}\n", .{~lhs});
}

ベクトル関数

@splat - スカラーをベクトルに

const std = @import("std");

/// Builds two vectors with `@splat`, which copies one scalar into every
/// lane, and prints them.
pub fn splatExample() void {
    // Four lanes, each 3.14.
    const pi_lanes = @as(@Vector(4, f32), @splat(3.14));

    // Eight lanes, each 42.
    const answer_lanes = @as(@Vector(8, i32), @splat(42));

    std.debug.print("splat(3.14) = {any}\n", .{pi_lanes});
    std.debug.print("splat(42) = {any}\n", .{answer_lanes});
}

@reduce - ベクトルを集約

const std = @import("std");

/// Collapses the vector { 1, 2, 3, 4 } to a single scalar with each of the
/// `.Add`, `.Mul`, `.Min` and `.Max` reductions and prints the four results.
pub fn reduceExample() void {
    const input = @Vector(4, i32){ 1, 2, 3, 4 };

    const total = @reduce(.Add, input); // 1 + 2 + 3 + 4 = 10
    const product = @reduce(.Mul, input); // 1 * 2 * 3 * 4 = 24
    const smallest = @reduce(.Min, input); // 1
    const largest = @reduce(.Max, input); // 4

    std.debug.print("sum = {}\n", .{total});
    std.debug.print("prod = {}\n", .{product});
    std.debug.print("min = {}\n", .{smallest});
    std.debug.print("max = {}\n", .{largest});
}

/// Computes the dot product of two fixed 4-lane `f32` vectors and prints it
/// with two decimal places.
pub fn dotProduct() void {
    const F32x4 = @Vector(4, f32);

    const lhs = F32x4{ 1.0, 2.0, 3.0, 4.0 };
    const rhs = F32x4{ 5.0, 6.0, 7.0, 8.0 };

    // Multiply lane by lane, then add the lanes together:
    // (1*5) + (2*6) + (3*7) + (4*8) = 70.
    const lane_products = lhs * rhs;
    const result = @reduce(.Add, lane_products);

    std.debug.print("dot product = {d:.2}\n", .{result});
}

@shuffle - 要素の並び替え

const std = @import("std");

/// Reverses the lanes of { 1, 2, 3, 4 } with `@shuffle` and prints the
/// vector before and after.
pub fn shuffleExample() void {
    const source = @Vector(4, i32){ 1, 2, 3, 4 };

    // Output lane n takes the source lane named by mask entry n.
    const reverse_mask = [4]i32{ 3, 2, 1, 0 };

    // The second operand is unused by this mask, so it is left undefined.
    const reversed = @shuffle(i32, source, undefined, reverse_mask); // { 4, 3, 2, 1 }

    std.debug.print("original = {any}\n", .{source});
    std.debug.print("shuffled = {any}\n", .{reversed});
}

/// Uses `@shuffle` to extract the first two lanes of a 4-lane vector and to
/// repeat them twice, then prints both results.
pub fn swizzle() void {
    const source = @Vector(4, f32){ 1.0, 2.0, 3.0, 4.0 };

    // A mask may be shorter than the source (extraction) and may repeat
    // indices (duplication).
    const first_two_mask = [2]i32{ 0, 1 };
    const repeated_mask = [4]i32{ 0, 1, 0, 1 };

    const first_two = @shuffle(f32, source, undefined, first_two_mask);
    const repeated = @shuffle(f32, source, undefined, repeated_mask);

    std.debug.print("xy = {any}\n", .{first_two});
    std.debug.print("xyxy = {any}\n", .{repeated});
}

@select - 条件選択

const std = @import("std");

/// Picks lanes from one of two `f32` vectors according to a boolean mask
/// and prints the combined vector.
pub fn selectExample() void {
    const F32x4 = @Vector(4, f32);

    const when_true = F32x4{ 1.0, 2.0, 3.0, 4.0 };
    const when_false = F32x4{ 5.0, 6.0, 7.0, 8.0 };
    const take_first = @Vector(4, bool){ true, false, true, false };

    // A true lane takes from the first vector, a false lane from the second:
    // { 1.0, 6.0, 3.0, 8.0 }.
    const picked = @select(f32, take_first, when_true, when_false);

    std.debug.print("result = {any}\n", .{picked});
}

/// Clamps every lane of { -1.0, 0.5, 1.5, 3.0 } to the range [0.0, 1.0]
/// using two `@select` steps and prints the result.
pub fn clampVector() void {
    const F32x4 = @Vector(4, f32);

    const input = F32x4{ -1.0, 0.5, 1.5, 3.0 };
    const lower: F32x4 = @splat(0.0);
    const upper: F32x4 = @splat(1.0);

    // Step 1: lanes above the upper bound are replaced by the bound.
    const capped = @select(f32, input > upper, upper, input);

    // Step 2: lanes below the lower bound are replaced by the bound,
    // all other lanes keep the capped value: { 0.0, 0.5, 1.0, 1.0 }.
    const clamped = @select(f32, input < lower, lower, capped);

    std.debug.print("clamped = {any}\n", .{clamped});
}

実践例

ベクトル加算

const std = @import("std");

/// Writes `a[i] + b[i]` into `result[i]` for every index, one element at a
/// time.
///
/// All three slices must have the same length; this is checked with
/// `std.debug.assert`.
pub fn vectorAddScalar(a: []const f32, b: []const f32, result: []f32) void {
    std.debug.assert(a.len == b.len and b.len == result.len);

    for (result, 0..) |*out, idx| {
        out.* = a[idx] + b[idx];
    }
}

/// Writes `a[i] + b[i]` into `result[i]` for every index, processing four
/// elements per step with a `@Vector(4, f32)` and finishing any leftover
/// elements one at a time.
///
/// All three slices must have the same length; this is checked with
/// `std.debug.assert`.
pub fn vectorAddSIMD(a: []const f32, b: []const f32, result: []f32) void {
    std.debug.assert(a.len == b.len and b.len == result.len);

    const VecSize = 4;
    const Vec = @Vector(VecSize, f32);

    var i: usize = 0;
    while (i + VecSize <= a.len) : (i += VecSize) {
        // Slicing with a comptime-known length yields a pointer to a
        // fixed-size array, which converts to a vector on load.
        const av: Vec = a[i..][0..VecSize].*;
        const bv: Vec = b[i..][0..VecSize].*;

        // Store by assigning the vector to the array it converts to.
        // `@memcpy` is not used here because its operands must be slices,
        // many-item pointers or pointers to arrays, and a pointer to a
        // vector is none of those.
        result[i..][0..VecSize].* = av + bv;
    }

    // Fewer than VecSize elements remain; handle them individually.
    while (i < a.len) : (i += 1) {
        result[i] = a[i] + b[i];
    }
}

/// Times `vectorAddScalar` against `vectorAddSIMD` on one million `f32`
/// elements and prints both durations in microseconds.
///
/// Returns an error if any of the three buffers cannot be allocated.
pub fn benchmark() !void {
    const size = 1_000_000;
    const allocator = std.heap.page_allocator;

    const a = try allocator.alloc(f32, size);
    defer allocator.free(a);

    const b = try allocator.alloc(f32, size);
    defer allocator.free(b);

    const result = try allocator.alloc(f32, size);
    defer allocator.free(result);

    // Inputs: a[i] = i and b[i] = 2 * i.
    for (a, 0..) |*v, i| v.* = @floatFromInt(i);
    for (b, 0..) |*v, i| v.* = @floatFromInt(i * 2);

    // Write the output buffer once before timing. Otherwise the first timed
    // run is the first code to touch `result`, and any one-time cost of
    // touching freshly allocated memory would be charged to that run only.
    @memset(result, 0.0);

    // Scalar version.
    const start1 = std.time.microTimestamp();
    vectorAddScalar(a, b, result);
    const end1 = std.time.microTimestamp();

    // SIMD version.
    const start2 = std.time.microTimestamp();
    vectorAddSIMD(a, b, result);
    const end2 = std.time.microTimestamp();

    std.debug.print("Scalar: {} μs\n", .{end1 - start1});
    std.debug.print("SIMD: {} μs\n", .{end2 - start2});
}

内積計算

const std = @import("std");

/// Returns the dot product of `a` and `b`, accumulating one product at a
/// time from the first index to the last.
///
/// Both slices must have the same length; this is checked with
/// `std.debug.assert`.
pub fn dotProductScalar(a: []const f32, b: []const f32) f32 {
    std.debug.assert(a.len == b.len);

    var acc: f32 = 0.0;
    var idx: usize = 0;
    while (idx < a.len) : (idx += 1) {
        acc += a[idx] * b[idx];
    }
    return acc;
}

/// Returns the dot product of `a` and `b`.
///
/// Full groups of four elements are multiplied and accumulated in a
/// 4-lane vector; the lanes are then added together and any leftover
/// elements are added on top one at a time.
///
/// Both slices must have the same length; this is checked with
/// `std.debug.assert`.
pub fn dotProductSIMD(a: []const f32, b: []const f32) f32 {
    std.debug.assert(a.len == b.len);

    const lanes = 4;
    const Lanes = @Vector(lanes, f32);

    // Number of elements covered by complete 4-wide groups.
    const vectorized_len = a.len - (a.len % lanes);

    var lane_acc: Lanes = @splat(0.0);
    var offset: usize = 0;
    while (offset < vectorized_len) : (offset += lanes) {
        const lhs: Lanes = a[offset..][0..lanes].*;
        const rhs: Lanes = b[offset..][0..lanes].*;
        lane_acc += lhs * rhs;
    }

    // Collapse the four partial sums into one scalar.
    var total = @reduce(.Add, lane_acc);

    // Tail: elements that did not fill a complete group.
    for (a[offset..], b[offset..]) |x, y| {
        total += x * y;
    }

    return total;
}

画像処理

const std = @import("std");

/// An image backed by a flat byte buffer, with two ways of scaling its
/// brightness: one byte at a time, or sixteen bytes at a time.
pub const Image = struct {
    width: usize,
    height: usize,
    pixels: []u8, // RGBA layout

    /// Multiplies one byte by `factor` and returns the result truncated
    /// toward zero and clamped to 0..255.
    ///
    /// Negative and NaN products map to 0 and products of 255 or more
    /// (including +inf) map to 255, so the float-to-integer conversion
    /// never sees a value outside the range of `u8`.
    fn scaleByte(value: u8, factor: f32) u8 {
        const scaled = @as(f32, @floatFromInt(value)) * factor;
        // Written as a negated `>` so that NaN, for which every ordered
        // comparison is false, takes this branch as well.
        if (!(scaled > 0.0)) return 0;
        if (scaled >= 255.0) return 255;
        return @intFromFloat(scaled);
    }

    /// Scales every byte of `pixels` by `factor`, one byte at a time.
    /// Every byte is scaled, so with the RGBA layout alpha bytes are
    /// scaled along with the colour bytes.
    pub fn brightenScalar(self: *Image, factor: f32) void {
        for (self.pixels) |*pixel| {
            pixel.* = scaleByte(pixel.*, factor);
        }
    }

    /// Scales every byte of `pixels` by `factor`, sixteen bytes per step,
    /// with any leftover bytes handled one at a time. Produces the same
    /// bytes as `brightenScalar`.
    pub fn brightenSIMD(self: *Image, factor: f32) void {
        const VecSize = 16;
        const Vec = @Vector(VecSize, u8);
        const FloatVec = @Vector(VecSize, f32);

        const factor_vec: FloatVec = @splat(factor);
        const zero_vec: FloatVec = @splat(0.0);
        const max_vec: FloatVec = @splat(255.0);

        var i: usize = 0;
        while (i + VecSize <= self.pixels.len) : (i += VecSize) {
            // Pointer to the 16-byte array currently being processed.
            const chunk = self.pixels[i..][0..VecSize];
            const bytes: Vec = chunk.*;

            // u8 -> f32, lane by lane.
            var scaled: FloatVec = undefined;
            inline for (0..VecSize) |j| {
                scaled[j] = @floatFromInt(bytes[j]);
            }

            // Brightness adjustment.
            scaled *= factor_vec;

            // Clamp to [0, 255]. The lower clamp keeps lanes that are
            // greater than zero and replaces everything else, NaN included,
            // with zero.
            const non_negative = @select(f32, scaled > zero_vec, scaled, zero_vec);
            const clamped = @select(f32, non_negative > max_vec, max_vec, non_negative);

            // f32 -> u8, lane by lane; every lane is now within 0..255.
            var out: Vec = undefined;
            inline for (0..VecSize) |j| {
                out[j] = @intFromFloat(clamped[j]);
            }

            // Store by assigning the vector to the array it converts to.
            // `@memcpy` operands must be slices, many-item pointers or
            // pointers to arrays, so a pointer to a vector cannot be used.
            chunk.* = out;
        }

        // Fewer than VecSize bytes remain.
        for (self.pixels[i..]) |*pixel| {
            pixel.* = scaleByte(pixel.*, factor);
        }
    }
};

行列乗算

const std = @import("std");

/// Multiplies two matrices stored as flat slices and writes the product
/// into `result`.
///
/// The indexing used here treats `a` as `rows_a` x `cols_a`, `b` as
/// `cols_a` x `cols_b` and `result` as `rows_a` x `cols_b`, all row-major.
/// Each output element is a dot product accumulated four terms at a time in
/// a 4-lane vector, with leftover terms added one by one.
pub fn matrixMultiplySIMD(
    a: []const f32,
    b: []const f32,
    result: []f32,
    rows_a: usize,
    cols_a: usize,
    cols_b: usize,
) void {
    const lanes = 4;
    const Lanes = @Vector(lanes, f32);

    var row: usize = 0;
    while (row < rows_a) : (row += 1) {
        var col: usize = 0;
        while (col < cols_b) : (col += 1) {
            var lane_acc: Lanes = @splat(0.0);
            var k: usize = 0;

            while (k + lanes <= cols_a) : (k += lanes) {
                // Gather four consecutive entries of row `row` of `a` and
                // the four matching entries of column `col` of `b`. The
                // latter are `cols_b` elements apart, so they are read
                // individually.
                var lhs: [lanes]f32 = undefined;
                var rhs: [lanes]f32 = undefined;
                for (&lhs, &rhs, 0..) |*l, *r, lane| {
                    l.* = a[row * cols_a + k + lane];
                    r.* = b[(k + lane) * cols_b + col];
                }

                lane_acc += @as(Lanes, lhs) * @as(Lanes, rhs);
            }

            // Collapse the partial sums, then add the terms that did not
            // fill a complete group of four.
            var total = @reduce(.Add, lane_acc);
            while (k < cols_a) : (k += 1) {
                total += a[row * cols_a + k] * b[k * cols_b + col];
            }

            result[row * cols_b + col] = total;
        }
    }
}

パフォーマンスの考慮事項

アライメント

const std = @import("std");

/// Allocates 1024 `f32` values with a requested alignment of 16 bytes,
/// prints the address of the allocation and frees it again.
///
/// Returns an error if the allocation fails.
pub fn alignedAllocation() !void {
    const page = std.heap.page_allocator;

    // Request a 16-byte aligned buffer.
    const floats = try page.alignedAlloc(f32, 16, 1024);
    defer page.free(floats);

    std.debug.print("Address: {*}\n", .{floats.ptr});
}

キャッシュの最適化

/// A 4-lane `f32` vector whose field is aligned to 64 bytes, followed by
/// explicit zero-filled padding bytes.
const CacheLinePadded = struct {
    /// Number of filler bytes placed after `data` (16 + 48 = 64).
    const pad_len = 48;

    data: @Vector(4, f32) align(64),
    padding: [pad_len]u8 = [_]u8{0} ** pad_len,
};

SIMDの制限事項

注意点

ハードウェア依存: CPUがネイティブに対応していないベクトル幅や演算は、コンパイラが複数の命令やスカラー処理に展開するため遅くなる
アライメント: 不適切なアライメントはパフォーマンス低下
分岐: 条件分岐が多いとSIMDの効果が薄い
データ依存: 演算間のデータ依存関係に注意

// 悪い例: データ依存関係
// a[i] = a[i-1] + 1; // 前の結果に依存

// 良い例: 独立した演算
// result[i] = a[i] + b[i]; // 各要素が独立

まとめ

この章では、ZigのSIMD機能について学びました:

@Vector型: ベクトル演算の基本
ベクトル関数: @splat, @reduce, @shuffle, @select
実践例: 配列演算、画像処理、行列計算
パフォーマンス: アライメント、キャッシュ最適化
制限事項: ハードウェア依存、分岐の影響

次の章では、Zigの標準ライブラリについて学びます。

参考文献

Zig Language Reference - Vectors: https://ziglang.org/documentation/master/#Vectors
SIMD Programming: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/