July 21, 2025

Golang Cgo Metal (sm3)

周末打了场比赛( 刷到 adwa的blog )，这道题需要爆破 $2^{32}$ bit 并调用一些函数验证，据上面 adwa 博客说 python 300h，go 并发 8min （我自己的 M1 Pro 需要15分钟）我自己对 Go 的极致（大概吧）优化不到五分钟（优化点包括但不限于：防止 gc、手动 make、将 sm3 的库中代码优化（你还能有库牛逼?.jpg）、大小核优化：大核心负责爆破的核心运算，小核心负责任务调度），

赛后与 rec 的队友交流时，对方提到可以利用 CUDA 提速（哥们有东西是真教啊），但我手头没有 NVDIA GPU

最后试了下 Metal (GPU) 编程，只需要不到25s，如果是最新款芯片，调整下核心数量，还可以更快：预计 M4 Pro 只需要 10s，不显示进度的话还能再快一些

想起 “有限空间爆破” ~~~

package main

/*
#cgo CFLAGS: -x objective-c -fobjc-arc
#cgo LDFLAGS: -framework Metal -framework Foundation -framework CoreGraphics

#import <Metal/Metal.h>
#import <Foundation/Foundation.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

// Metal 设备和资源
id<MTLDevice> device;
id<MTLCommandQueue> commandQueue;
id<MTLComputePipelineState> computePipelineState;
id<MTLBuffer> candidateBuffer;
id<MTLBuffer> resultBuffer;
id<MTLBuffer> targetBuffer;
id<MTLBuffer> foundBuffer;

// SM3 Metal shader 源码
const char* sm3MetalSource = R"(
#include <metal_stdlib>
using namespace metal;

// SM3 常量
constant uint32_t SM3_IV[8] = {
    0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600,
    0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e
};

// 循环左移
inline uint32_t rotateLeft(uint32_t x, uint32_t n) {
    return (x << n) | (x >> (32 - n));
}

// SM3 函数
inline uint32_t ff0(uint32_t x, uint32_t y, uint32_t z) { return x ^ y ^ z; }
inline uint32_t ff1(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (x & z) | (y & z); }
inline uint32_t gg0(uint32_t x, uint32_t y, uint32_t z) { return x ^ y ^ z; }
inline uint32_t gg1(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (~x & z); }
inline uint32_t p0(uint32_t x) { return x ^ rotateLeft(x, 9) ^ rotateLeft(x, 17); }
inline uint32_t p1(uint32_t x) { return x ^ rotateLeft(x, 15) ^ rotateLeft(x, 23); }

// 字符映射
inline uchar indexToChar(uint64_t index) {
    const uchar chars[4] = {'a', 'b', 'c', 'd'};
    return chars[index & 3];
}

// SM3 核心计算 - 使用线程本地内存
void sm3_hash_local(thread const uchar* input, thread uchar* output) {
    uint32_t digest[8];
    for (int i = 0; i < 8; i++) {
        digest[i] = SM3_IV[i];
    }

    // 准备消息块
    uint32_t W[68];
    uint32_t W1[64];

    // 填充消息
    uchar padded[64];
    for (int i = 0; i < 32; i++) {
        padded[i] = input[i];
    }
    padded[32] = 0x80;
    for (int i = 33; i < 62; i++) {
        padded[i] = 0;
    }
    padded[62] = 0x01;
    padded[63] = 0x00;

    // 消息扩展
    for (int i = 0; i < 16; i++) {
        W[i] = ((uint32_t)padded[i*4] << 24) |
               ((uint32_t)padded[i*4+1] << 16) |
               ((uint32_t)padded[i*4+2] << 8) |
               ((uint32_t)padded[i*4+3]);
    }

    for (int i = 16; i < 68; i++) {
        W[i] = p1(W[i-16] ^ W[i-9] ^ rotateLeft(W[i-3], 15)) ^
               rotateLeft(W[i-13], 7) ^ W[i-6];
    }

    for (int i = 0; i < 64; i++) {
        W1[i] = W[i] ^ W[i+4];
    }

    // 压缩函数
    uint32_t A = digest[0], B = digest[1], C = digest[2], D = digest[3];
    uint32_t E = digest[4], F = digest[5], G = digest[6], H = digest[7];

    for (int i = 0; i < 16; i++) {
        uint32_t SS1 = rotateLeft(rotateLeft(A, 12) + E + rotateLeft(0x79cc4519, i), 7);
        uint32_t SS2 = SS1 ^ rotateLeft(A, 12);
        uint32_t TT1 = ff0(A, B, C) + D + SS2 + W1[i];
        uint32_t TT2 = gg0(E, F, G) + H + SS1 + W[i];
        D = C;
        C = rotateLeft(B, 9);
        B = A;
        A = TT1;
        H = G;
        G = rotateLeft(F, 19);
        F = E;
        E = p0(TT2);
    }

    for (int i = 16; i < 64; i++) {
        uint32_t SS1 = rotateLeft(rotateLeft(A, 12) + E + rotateLeft(0x7a879d8a, i), 7);
        uint32_t SS2 = SS1 ^ rotateLeft(A, 12);
        uint32_t TT1 = ff1(A, B, C) + D + SS2 + W1[i];
        uint32_t TT2 = gg1(E, F, G) + H + SS1 + W[i];
        D = C;
        C = rotateLeft(B, 9);
        B = A;
        A = TT1;
        H = G;
        G = rotateLeft(F, 19);
        F = E;
        E = p0(TT2);
    }

    // 最终哈希值
    digest[0] ^= A; digest[1] ^= B; digest[2] ^= C; digest[3] ^= D;
    digest[4] ^= E; digest[5] ^= F; digest[6] ^= G; digest[7] ^= H;

    // 输出大端序
    for (int i = 0; i < 8; i++) {
        output[i*4] = (digest[i] >> 24) & 0xff;
        output[i*4+1] = (digest[i] >> 16) & 0xff;
        output[i*4+2] = (digest[i] >> 8) & 0xff;
        output[i*4+3] = digest[i] & 0xff;
    }
}

// GPU 内核函数
kernel void sm3_search(
    device uchar* result [[buffer(0)]],      // 输出结果
    constant uchar* target [[buffer(1)]],    // 目标哈希
    device atomic_int* found [[buffer(2)]],  // 找到标志
    constant uint64_t* baseIndex [[buffer(3)]], // 基础索引
    uint3 gid [[thread_position_in_grid]]    // 线程ID
) {
    // 计算全局索引
    uint64_t globalId = gid.x + gid.y * 1024 + gid.z * 1024 * 1024;
    uint64_t candidateIndex = baseIndex[0] + globalId;

    // 检查是否已找到
    if (atomic_load_explicit(found, memory_order_relaxed) != 0) {
        return;
    }

    // 生成候选值 - 使用线程本地内存
    thread uchar candidate[32];

    // 固定前缀 "adcddbbadcacabad"
    candidate[0] = 'a'; candidate[1] = 'd'; candidate[2] = 'c'; candidate[3] = 'd';
    candidate[4] = 'd'; candidate[5] = 'b'; candidate[6] = 'b'; candidate[7] = 'a';
    candidate[8] = 'd'; candidate[9] = 'c'; candidate[10] = 'a'; candidate[11] = 'c';
    candidate[12] = 'a'; candidate[13] = 'b'; candidate[14] = 'a'; candidate[15] = 'd';

    // 生成后16字节
    uint64_t idx = candidateIndex;
    for (int i = 0; i < 16; i++) {
        candidate[16 + i] = indexToChar(idx);
        idx >>= 2;
    }

    // 计算哈希 - 使用线程本地内存
    thread uchar hash[32];
    sm3_hash_local(candidate, hash);

    // 比较结果
    bool match = true;
    for (int i = 0; i < 32; i++) {
        if (hash[i] != target[i]) {
            match = false;
            break;
        }
    }

    if (match) {
        // 找到了！
        atomic_store_explicit(found, 1, memory_order_relaxed);

        // 保存结果到全局内存
        for (int i = 0; i < 32; i++) {
            result[i] = candidate[i];
        }
    }
}
)";

// 获取GPU信息
typedef struct {
    int coreCount;
    int maxThreadsPerThreadgroup;
    int maxThreadgroupsPerMeshGrid;
    int registryID;
    char name[256];
} GPUInfo;

// 使用system_profiler获取准确的GPU核心数
int getGPUCoresFromSystemProfiler() {
    FILE *fp;
    char buffer[128];
    int cores = 0;

    // 执行system_profiler命令
    fp = popen("system_profiler SPDisplaysDataType | awk '/Total Number of Cores:/{print $5}'", "r");
    if (fp == NULL) {
        printf("Failed to run system_profiler command\n");
        return 0;
    }

    // 读取输出
    if (fgets(buffer, sizeof(buffer), fp) != NULL) {
        cores = atoi(buffer);
        printf("GPU cores detected by system_profiler: %d\n", cores);
    }

    pclose(fp);
    return cores;
}

GPUInfo getGPUInfo() {
    GPUInfo info = {0};

    if (device) {
        // GPU名称
        strncpy(info.name, [[device name] UTF8String], 255);

        // 使用system_profiler获取准确的核心数
        info.coreCount = getGPUCoresFromSystemProfiler();

        // 如果system_profiler失败，尝试其他方法
        if (info.coreCount == 0) {
            // 获取GPU核心数 - M1/M2特定
            if ([device respondsToSelector:@selector(recommendedMaxWorkingSetSize)]) {
                // 从注册表ID推断核心数
                info.registryID = (int)[device registryID];

                // 通过GPU family和特性推断核心数
                if ([device supportsFamily:MTLGPUFamilyApple7]) {
                    // M1系列
                    NSString *name = [device name];
                    if ([name containsString:@"M1 Max"]) {
                        info.coreCount = 32;
                    } else if ([name containsString:@"M1 Pro"]) {
                        info.coreCount = 14; // M1 Pro通常是14或16核
                    } else if ([name containsString:@"M1"]) {
                        info.coreCount = 8;
                    }
                } else if ([device supportsFamily:MTLGPUFamilyApple8]) {
                    // M2系列
                    NSString *name = [device name];
                    if ([name containsString:@"M2 Max"]) {
                        info.coreCount = 38;
                    } else if ([name containsString:@"M2 Pro"]) {
                        info.coreCount = 19;
                    } else if ([name containsString:@"M2"]) {
                        info.coreCount = 10;
                    }
                }
            }

            // 如果仍然无法确定，使用默认值
            if (info.coreCount == 0) {
                info.coreCount = 8; // 保守估计
            }
        }
    }

    return info;
}

// 初始化 Metal
int initMetal(GPUInfo* gpuInfo) {
    @autoreleasepool {
        NSError *error = nil;

        // 获取所有GPU设备
        NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();
        if (devices.count > 0) {
            printf("Found %lu GPU devices:\n", devices.count);
            for (int i = 0; i < devices.count; i++) {
                id<MTLDevice> dev = devices[i];
                printf("  %d: %s\n", i, [[dev name] UTF8String]);
            }
            // 使用第一个设备（通常是最强大的）
            device = devices[0];
        } else {
            // 获取默认GPU设备
            device = MTLCreateSystemDefaultDevice();
        }

        if (!device) {
            printf("Metal is not supported on this device\n");
            return -1;
        }

        // 获取GPU详细信息
        *gpuInfo = getGPUInfo();

        printf("\n=== GPU Information ===\n");
        printf("GPU: %s\n", gpuInfo->name);
        printf("GPU Cores (system_profiler): %d\n", gpuInfo->coreCount);
        printf("Registry ID: %d\n", gpuInfo->registryID);

        // 输出GPU能力
        printf("\nGPU Capabilities:\n");
        printf("  Unified Memory: %s\n", [device hasUnifiedMemory] ? "YES" : "NO");
        printf("  Max Buffer Length: %.2f GB\n", (double)[device maxBufferLength] / (1024*1024*1024));
        printf("  Max Threads Per Threadgroup: %lu x %lu x %lu\n",
               [device maxThreadsPerThreadgroup].width,
               [device maxThreadsPerThreadgroup].height,
               [device maxThreadsPerThreadgroup].depth);

        if ([device respondsToSelector:@selector(recommendedMaxWorkingSetSize)]) {
            printf("  Recommended Max Working Set: %.2f GB\n",
                   (double)[device recommendedMaxWorkingSetSize] / (1024*1024*1024));
        }

        // GPU Family支持
        printf("\nGPU Family Support:\n");
        if ([device supportsFamily:MTLGPUFamilyApple8]) {
            printf("  Apple GPU Family 8 (M2)\n");
        } else if ([device supportsFamily:MTLGPUFamilyApple7]) {
            printf("  Apple GPU Family 7 (M1)\n");
        }

        // 创建命令队列
        commandQueue = [device newCommandQueue];
        if (!commandQueue) {
            printf("Failed to create command queue\n");
            return -1;
        }

        // 编译着色器
        NSString *source = [NSString stringWithUTF8String:sm3MetalSource];
        MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
        options.fastMathEnabled = YES;

        id<MTLLibrary> library = [device newLibraryWithSource:source options:options error:&error];
        if (!library) {
            printf("Failed to compile shader: %s\n", [[error description] UTF8String]);
            return -1;
        }

        // 获取内核函数
        id<MTLFunction> kernelFunction = [library newFunctionWithName:@"sm3_search"];
        if (!kernelFunction) {
            printf("Failed to find kernel function\n");
            return -1;
        }

        // 创建计算管线状态
        computePipelineState = [device newComputePipelineStateWithFunction:kernelFunction error:&error];
        if (!computePipelineState) {
            printf("Failed to create pipeline state: %s\n", [[error description] UTF8String]);
            return -1;
        }

        // 获取最大线程组大小
        gpuInfo->maxThreadsPerThreadgroup = (int)computePipelineState.maxTotalThreadsPerThreadgroup;
        printf("\nPipeline Info:\n");
        printf("  Max Threads Per Threadgroup: %d\n", gpuInfo->maxThreadsPerThreadgroup);
        printf("  Thread Execution Width: %lu\n", computePipelineState.threadExecutionWidth);

        // 创建缓冲区
        resultBuffer = [device newBufferWithLength:32 options:MTLResourceStorageModeShared];
        targetBuffer = [device newBufferWithLength:32 options:MTLResourceStorageModeShared];
        foundBuffer = [device newBufferWithLength:sizeof(int) options:MTLResourceStorageModeShared];
        candidateBuffer = [device newBufferWithLength:sizeof(uint64_t) options:MTLResourceStorageModeShared];

        if (!resultBuffer || !targetBuffer || !foundBuffer || !candidateBuffer) {
            printf("Failed to create buffers\n");
            return -1;
        }

        return 0;
    }
}

// 在GPU上搜索
int searchOnGPU(uint64_t startIndex, uint64_t count, const uint8_t* target, uint8_t* result, int maxThreadsPerThreadgroup) {
    @autoreleasepool {
        // 设置目标哈希
        memcpy([targetBuffer contents], target, 32);

        // 设置基础索引
        *(uint64_t*)[candidateBuffer contents] = startIndex;

        // 重置找到标志
        *(int*)[foundBuffer contents] = 0;

        // 创建命令缓冲区
        id<MTLCommandBuffer> commandBuffer = [commandQueue commandBuffer];
        if (!commandBuffer) {
            printf("Failed to create command buffer\n");
            return -1;
        }

        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
        if (!encoder) {
            printf("Failed to create compute encoder\n");
            return -1;
        }

        [encoder setComputePipelineState:computePipelineState];
        [encoder setBuffer:resultBuffer offset:0 atIndex:0];
        [encoder setBuffer:targetBuffer offset:0 atIndex:1];
        [encoder setBuffer:foundBuffer offset:0 atIndex:2];
        [encoder setBuffer:candidateBuffer offset:0 atIndex:3];

        // 计算线程组大小 - 根据GPU能力动态调整
        NSUInteger threadsPerThreadgroup = MIN(maxThreadsPerThreadgroup, 256);
        if (threadsPerThreadgroup > computePipelineState.maxTotalThreadsPerThreadgroup) {
            threadsPerThreadgroup = computePipelineState.maxTotalThreadsPerThreadgroup;
        }

        NSUInteger threadgroupsPerGrid = (count + threadsPerThreadgroup - 1) / threadsPerThreadgroup;

        // 限制总线程组数
        if (threadgroupsPerGrid > 65536) {
            threadgroupsPerGrid = 65536;
        }

        MTLSize threadsPerThreadgroupSize = MTLSizeMake(threadsPerThreadgroup, 1, 1);
        MTLSize threadgroupsPerGridSize = MTLSizeMake(threadgroupsPerGrid, 1, 1);

        // 分发计算
        [encoder dispatchThreadgroups:threadgroupsPerGridSize
                threadsPerThreadgroup:threadsPerThreadgroupSize];

        [encoder endEncoding];

        // 提交并等待完成
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];

        // 检查结果
        if (*(int*)[foundBuffer contents] != 0) {
            memcpy(result, [resultBuffer contents], 32);
            return 1;
        }

        return 0;
    }
}

// 清理资源
void cleanupMetal() {
    device = nil;
    commandQueue = nil;
    computePipelineState = nil;
    resultBuffer = nil;
    targetBuffer = nil;
    foundBuffer = nil;
    candidateBuffer = nil;
}
*/
import "C"

import (
	"context"
	"encoding/hex"
	"fmt"
	"log"
	"runtime"
	"sync"
	"sync/atomic"
	"time"
	"unsafe"

	"github.com/schollz/progressbar/v3"
)

// GPU 配置（动态获取）
var (
	GPUCores                 int
	MaxThreadsPerThreadgroup int
	GPUBatchSize             int
)

var (
	tarHex   = "aab05fca300811223b3b957bfe33130770fb7a6b55b030a5809c559344f66f79"
	tarBytes []byte
)

var (
	globalProgress atomic.Int64
	foundFlag      atomic.Int32
	foundResult    [32]byte
	resultMutex    sync.Mutex
)

func init() {
	var err error
	tarBytes, err = hex.DecodeString(tarHex)
	if err != nil {
		log.Fatalf("无法解码目标哈希: %v", err)
	}

	// 初始化 Metal 并获取GPU信息
	fmt.Println("初始化 Metal GPU...")
	var gpuInfo C.GPUInfo
	if ret := C.initMetal(&gpuInfo); ret != 0 {
		log.Fatalf("Metal 初始化失败")
	}

	// 设置GPU参数
	GPUCores = int(gpuInfo.coreCount)
	MaxThreadsPerThreadgroup = int(gpuInfo.maxThreadsPerThreadgroup)

	// 计算最优批处理大小
	// 考虑GPU核心数和最大线程数
	GPUBatchSize = GPUCores * MaxThreadsPerThreadgroup * 16 // 16倍过度订阅
	if GPUBatchSize > (1 << 22) {                           // 最大4M
		GPUBatchSize = 1 << 22
	}

	fmt.Printf("\n=== GPU配置 ===\n")
	fmt.Printf("GPU核心数: %d\n", GPUCores)
	fmt.Printf("最大线程组大小: %d\n", MaxThreadsPerThreadgroup)
	fmt.Printf("批处理大小: %d (%.2fM)\n", GPUBatchSize, float64(GPUBatchSize)/(1024*1024))
	fmt.Println("\nMetal GPU 初始化成功！")
}

func main() {
	// 使用所有CPU核心协调GPU任务
	runtime.GOMAXPROCS(runtime.NumCPU())

	totalOperations := int64(256 * (0xffffff + 1))
	bar := progressbar.NewOptions64(totalOperations,
		progressbar.OptionSetDescription(fmt.Sprintf("GPU加速版 (%d核GPU)...", GPUCores)),
		progressbar.OptionShowBytes(false),
		progressbar.OptionSetWidth(30),
		progressbar.OptionShowCount(),
		progressbar.OptionSetTheme(progressbar.Theme{
			Saucer: "=", SaucerHead: ">", SaucerPadding: " ",
			BarStart: "[", BarEnd: "]",
		}),
		progressbar.OptionThrottle(50*time.Millisecond),
	)

	// 创建任务队列
	jobs := make(chan uint64, 256)
	ctx, cancel := context.WithCancel(context.Background())

	wg := &sync.WaitGroup{}

	// 进度更新
	progressDone := make(chan struct{})
	go progressUpdater(bar, progressDone)

	// 启动GPU调度器
	numSchedulers := 4 // 使用4个调度器管理GPU任务
	for i := 0; i < numSchedulers; i++ {
		wg.Add(1)
		go gpuScheduler(i, wg, ctx, jobs)
	}

	timeStart := time.Now()

	// 分发任务
	fmt.Printf("\n正在使用 %d核GPU 进行并行计算...\n", GPUCores)
	fmt.Printf("每批次并行线程数: %d (%.2fM)\n", GPUBatchSize, float64(GPUBatchSize)/(1024*1024))
	fmt.Printf("最大线程组大小: %d\n\n", MaxThreadsPerThreadgroup)

	go func() {
		for j := uint64(0); j <= 0xff; j++ {
			select {
			case jobs <- j:
			case <-ctx.Done():
				return
			}
		}
		close(jobs)
	}()

	wg.Wait()
	cancel()
	close(progressDone)

	// 清理 Metal 资源
	C.cleanupMetal()

	timeEnd := time.Now()
	bar.Finish()

	duration := timeEnd.Sub(timeStart)
	totalHashes := globalProgress.Load()
	hashesPerSecond := float64(totalHashes) / duration.Seconds()

	fmt.Printf("\n=== GPU 性能统计 ===\n")
	fmt.Printf("GPU: %d核\n", GPUCores)
	fmt.Printf("总耗时: %v\n", duration)
	fmt.Printf("总哈希数: %d\n", totalHashes)
	fmt.Printf("哈希速率: %.2f MH/s\n", hashesPerSecond/1000000)
	fmt.Printf("每核心速率: %.2f MH/s\n", hashesPerSecond/1000000/float64(GPUCores))
	fmt.Printf("GPU吞吐量: %.2f GB/s\n", (hashesPerSecond*64)/(1024*1024*1024))

	if foundFlag.Load() != 0 {
		fmt.Printf("\n找到的结果: %s\n", string(foundResult[:]))
	}
}

func progressUpdater(bar *progressbar.ProgressBar, done <-chan struct{}) {
	ticker := time.NewTicker(50 * time.Millisecond)
	defer ticker.Stop()

	var lastProgress int64
	var lastTime time.Time = time.Now()
	var lastHashes int64

	for {
		select {
		case <-ticker.C:
			current := globalProgress.Load()
			if current > lastProgress {
				bar.Add64(current - lastProgress)

				// 计算实时速率
				now := time.Now()
				elapsed := now.Sub(lastTime).Seconds()
				if elapsed > 1.0 {
					rate := float64(current-lastHashes) / elapsed / 1000000
					bar.Describe(fmt.Sprintf("GPU计算中 (%.2f MH/s)...", rate))
					lastTime = now
					lastHashes = current
				}

				lastProgress = current
			}
		case <-done:
			current := globalProgress.Load()
			if current > lastProgress {
				bar.Add64(current - lastProgress)
			}
			return
		}
	}
}

func gpuScheduler(id int, wg *sync.WaitGroup, ctx context.Context, jobs <-chan uint64) {
	defer wg.Done()

	result := make([]byte, 32)

	for j := range jobs {
		if foundFlag.Load() != 0 {
			break
		}

		// 处理一个大任务块
		remaining := uint64(0xffffff + 1)
		offset := uint64(0)

		for remaining > 0 && foundFlag.Load() == 0 {
			// 计算这批的大小
			batchSize := uint64(GPUBatchSize)
			if batchSize > remaining {
				batchSize = remaining
			}

			startIndex := (j << 24) + offset

			// 在GPU上搜索
			ret := C.searchOnGPU(
				C.uint64_t(startIndex),
				C.uint64_t(batchSize),
				(*C.uint8_t)(unsafe.Pointer(&tarBytes[0])),
				(*C.uint8_t)(unsafe.Pointer(&result[0])),
				C.int(MaxThreadsPerThreadgroup),
			)

			if ret == 1 {
				// 找到了！
				foundFlag.Store(1)
				resultMutex.Lock()
				copy(foundResult[:], result)
				resultMutex.Unlock()
				fmt.Printf("\n[GPU Scheduler %d] 找到结果: %s\n", id, string(result))
				break
			}

			// 更新进度
			globalProgress.Add(int64(batchSize))

			offset += batchSize
			remaining -= batchSize

			// 检查上下文
			select {
			case <-ctx.Done():
				return
			default:
			}
		}
	}
}

项目技术文档：基于 Go 和 Apple Metal 的 GPU 加速哈希计算

1. 项目概述

本项目旨在演示如何利用现代计算技术栈，将一个计算密集型任务——SM3 哈希碰撞搜索，从传统的 CPU 计算迁移到 GPU 上进行大规模并行加速。项目核心是使用 Go 语言作为“主机”或“调度器”，负责任务分发、进度管理和与用户交互；同时，利用 Apple 的 Metal 框架和 Metal Shading Language (MSL) 编写高性能的 GPU“内核”，在图形处理器上执行数以百万计的并行哈希计算。

其最终实现了一个性能极高、可充分压榨 Apple Silicon (M1/…) GPU 算力的哈希计算引擎。

2. 核心技术栈

技术领域	具体技术	在项目中的作用
主机语言	Go (Golang)	负责并发调度、任务队列管理、进度同步、与 C/Metal 代码交互。
GPU 计算框架	Apple Metal	提供与 GPU 硬件交互的底层 API，管理设备、命令、内存和着色器。
GPU 着色器语言	Metal Shading Language (MSL)	用于编写在 GPU 上实际执行的 SM3 哈希计算内核函数。
语言互操作性	Cgo	作为 Go 与 C/Objective-C 之间的桥梁，使得 Go 代码能够调用 Metal API。
macOS 框架	Foundation, CoreGraphics	Objective-C 的核心框架，用于 Metal 的对象管理和交互。
并发模型	Goroutines & Channels	Go 语言的并发原语，用于构建高效的 GPU 任务调度器。
原子操作	Go `sync/atomic`, MSL `atomic`	在 Go 和 GPU 端实现无锁的状态同步（如“是否找到结果”的标志位）。

3. 架构设计

本项目的架构是一种典型的 CPU-GPU 异构计算模型。CPU 和 GPU 分工明确，协同工作。

graph TD
    subgraph Go Host (运行在 CPU)
        A[主程序 Main] --&gt;|分发任务| B(任务通道 Jobs Channel);
        B --&gt; C{GPU 调度器 Schedulers};
        C --&gt;|调用 Cgo| D[C/Objective-C 桥接层];
        E[进度更新器] --&gt; F[UI 进度条];
        G[全局原子变量] &lt;--&gt; C;
        G &lt;--&gt; E;
    end

    subgraph C/Objective-C Bridge
        D --&gt;|调用 Metal API| H[Metal 命令队列];
    end

    subgraph Apple GPU
        H --&gt;|提交命令| I[GPU 计算单元];
        I -- 并行执行 --&gt; J(SM3 内核着色器);
        J -- 读写 --&gt; K[GPU 共享内存 Buffers];
        J -- 更新 --&gt; G;
    end

    style A fill:#D5E8D4,stroke:#82B366
    style C fill:#DAE8FC,stroke:#6C8EBF
    style J fill:#F8CECC,stroke:#B85450

Go 主机层 (Host)：
- main 函数负责初始化环境、创建任务通道和启动多个 gpuScheduler 协程（Goroutine）。
- 它将庞大的搜索空间（256times224）切分成 256 个主任务块，并放入 jobs 通道。
- 多个 gpuScheduler 作为调度器，从 jobs 通道中消费任务块。
Cgo 桥接层 (Bridge)：
- gpuScheduler 通过 Cgo 调用 C 语言封装的 searchOnGPU 函数，将计算任务提交给 Metal。
- 这是 Go 世界与 C/Objective-C 世界的边界。
Metal GPU 层 (Device)：
- searchOnGPU 函数通过 Metal API 将数据（如目标哈希、计算的起始索引）拷贝到 GPU 显存。
- 它向 GPU 的命令队列提交一个计算命令，指令 GPU 启动成千上万个线程。
- 每个 GPU 线程独立执行 sm3_search 内核着色器代码，计算一个候选值的哈希。
- 如果某个线程找到了匹配的结果，它会通过一个原子操作更新全局的 found 标志位，并把结果写回 GPU 显存。

4. Go 主机代码详解

4.1. Cgo 桥接与构建指令

代码通过特殊的注释与 C/Objective-C 世界链接。

/*
#cgo CFLAGS: -x objective-c -fobjc-arc
#cgo LDFLAGS: -framework Metal -framework Foundation -framework CoreGraphics
... C/Objective-C 代码 ...
*/
import "C"

#cgo CFLAGS: 告诉 C 编译器，嵌入的代码是 Objective-C，并启用 ARC（自动引用计数）内存管理。
#cgo LDFLAGS: 告诉链接器，需要链接 Metal、Foundation 和 CoreGraphics 这三个 macOS 系统框架。
import "C": 启用 Cgo，使得 Go 代码可以访问 C 命名空间下的函数和类型，如 C.searchOnGPU()。

4.2. GPU 调度器与并发模型

gpuScheduler 是整个并发模型的核心。

func gpuScheduler(id int, wg *sync.WaitGroup, ctx context.Context, jobs <-chan uint64) {
    // ...
    for j := range jobs { // 从通道消费任务
        // ...
        for remaining > 0 && foundFlag.Load() == 0 {
            // ...
            // 将一个大任务块再次切分为更小的批次，提交给GPU
            ret := C.searchOnGPU(...) 
            // ...
        }
    }
}

任务队列: jobs := make(chan uint64, 256) 是一个带缓冲的通道，实现了生产者-消费者模型，解耦了任务分发和执行。
工作池: 启动了 numSchedulers (例如 4 个) gpuScheduler 协程，它们构成一个工作池，并发地处理 GPU 任务提交，确保 GPU 命令队列始终有任务可做。
两级任务切分:
1. CPU 级: 整个搜索空间被切分为 256 个大任务块。
2. GPU 级: 每个大任务块在 gpuScheduler 内部又被切分为大小为 GPUBatchSize 的小批次。这种批处理方式是为了平衡 GPU 的 dispatch 开销和单次计算的粒度。

4.3. 原子操作与无锁同步

为了在不使用互斥锁（Mutex）的情况下高效地同步状态，代码大量使用了原子操作。

globalProgress atomic.Int64: 全局进度计数器，每个 gpuScheduler 完成一个批次后，都会原子地增加这个值。
foundFlag atomic.Int32: 全局的“找到”标志。GPU 内核或任何 scheduler 只要发现结果，就会原子地将其置为 1。所有循环都会检查这个标志，一旦它变为 1，就立即停止工作，实现快速退出。

4.4. Go 语言核心技术点应用

除了宏观的并发模型，本项目还深度依赖了 Go 语言的多个核心特性来实现健壮、高效的主机程序。

并发同步 sync.WaitGroup

WaitGroup 是 Go 中用于等待一组协程完成任务的经典工具。本项目中使用了两个 WaitGroup 实例，各有分工：
1. wg *sync.WaitGroup: 用于确保所有 gpuScheduler 协程在程序退出前都已完全停止。main 函数在启动每个 scheduler 前调用 wg.Add(1)，而每个 scheduler 的 defer wg.Done() 语句则保证其在退出时将计数器减一。最后的 wg.Wait() 会一直阻塞，直到所有 scheduler 都已执行完毕。
2. foundWg *sync.WaitGroup: 这是一个巧妙的应用，用于“等待第一个成功信号”。它的计数器初始为 1 (foundWg.Add(1))。任何一个 scheduler 只要找到了结果，就会调用 foundWg.Done()。main 函数中的 foundWg.Wait() 会因此解除阻塞，并立即执行后续的 cancel()，从而实现了“一人成功，全体收工”的逻辑。
上下文与优雅退出 context.Context

context 包是 Go 语言中用于控制协程生命周期、传递取消信号和请求作用域数据的标准方式。
```
ctx, cancel := context.WithCancel(context.Background())
// ...
// 在找到结果后
cancel()
```
1. context.WithCancel 创建了一个可被手动取消的上下文 ctx 和一个 cancel 函数。
2. ctx 被传递给所有需要控制的协程（如 gpuScheduler）。
3. 协程内部通过 select 语句监听 ctx.Done() 通道：
```
select {
case <-ctx.Done():
   return // 上下文被取消，立即退出
default:
   // 继续正常工作
}
```
4. 当 main 函数调用 cancel() 时，ctx.Done() 通道会关闭，所有正在监听它的协程都会收到信号，从而实现优雅、可控的退出。
协程与闭包 (Goroutines & Closures)

Go 通过 go 关键字可以极其廉价地创建协程。在本项目中，启动 gpuScheduler 和任务分发协程时，都利用了闭包的特性。
```
go gpuScheduler(i, wg, ctx, jobs)
```
当 gpuScheduler 在一个新的协程中被启动时，它不仅仅是函数调用，更是一个闭包。它“捕获”了 main 函数作用域中的变量，如 wg, ctx, jobs 等。这使得在不同协程间共享状态和通信变得非常自然和方便，是 Go 并发编程强大表达力的体现。
安全数据拷贝 copy()

在 gpuScheduler 中，当从 GPU 拿回结果时，使用了 copy 函数：
```
resultMutex.Lock()
copy(foundResult[:], result)
resultMutex.Unlock()
```
这里的 copy 至关重要。它将从 C 函数返回的、可能存在于临时内存中的 result 切片内容，安全地、逐字节地复制到全局变量 foundResult 数组中。与直接赋值 foundResult = result 不同，copy 确保了数据的深度复制，避免了后续因 result 内存被回收而导致 foundResult 指向无效内存（悬垂指针）的风险。同时，整个拷贝操作被 resultMutex 互斥锁保护，确保了并发写操作的线程安全。

5. Metal GPU 代码详解

5.1. Metal 初始化流程 (`initMetal`)

这是 Go 程序启动时调用的 C 函数，负责准备好 GPU 环境。

获取设备: MTLCreateSystemDefaultDevice() 获取系统默认的 GPU 设备。
获取设备信息: 通过 Objective-C 的反射和查询机制 ([device name], supportsFamily:)，尽可能准确地推断出 GPU 的核心数等信息，用于后续的性能调优。
创建命令队列: [device newCommandQueue] 创建一个用于提交计算任务的队列。
编译着色器:
- sm3MetalSource 字符串包含了 MSL 源码。
- [device newLibraryWithSource:options:error:] 将 MSL 源码动态编译成一个 MTLLibrary。
- fastMathEnabled = YES 是一个编译优化选项，允许 GPU 使用可能略微降低精度但速度更快的数学计算。
创建计算管线状态 (PSO): newComputePipelineStateWithFunction: 基于编译好的内核函数 (sm3_search) 创建一个 PSO。PSO 是一个预烘焙好的状态对象，包含了执行内核所需的所有信息，后续可以被高效地复用。
创建缓冲区: newBufferWithLength: 在 GPU 可访问的内存中创建多个缓冲区 (MTLBuffer)，用于在 CPU 和 GPU 之间传递数据。MTLResourceStorageModeShared 表示这块内存由 CPU 和 GPU 共享，在 Apple Silicon 的统一内存架构下效率最高。

5.2. SM3 内核着色器 (`sm3_search`)

这是在 GPU 上成千上万个线程中并行执行的代码，是性能的核心。

kernel void sm3_search(
    device uchar* result [[buffer(0)]],      // 输出缓冲区
    constant uchar* target [[buffer(1)]],    // 只读的目标哈希
    device atomic_int* found [[buffer(2)]],  // 原子标志位
    /* ... */
    uint3 gid [[thread_position_in_grid]]    // 线程的全局唯一ID
) {
    // ...
}

kernel: 声明这是一个计算内核函数。
device / constant: 地址空间修饰符。device 表示数据在全局设备内存中，constant 是一个优化的只读设备内存。
[[buffer(n)]]: 将函数参数与 Go 代码中设置的缓冲区绑定起来。
[[thread_position_in_grid]]: Metal 提供的内置变量，让每个线程知道自己的唯一ID。

内核内的优化

本地化内存 (thread):
```
thread uchar candidate[32];
thread uchar hash[32];
```
将每个线程的候选值和计算出的哈希值存储在 thread 地址空间。这是 GPU 上最快的内存，相当于 CPU 的寄存器，极大地减少了对慢速全局显存的读写，是关键的性能优化。
原子标志位检查:
```
if (atomic_load_explicit(found, memory_order_relaxed) != 0) {
    return;
}
```
在计算前，每个线程都会先检查全局 found 标志。一旦某个线程找到了结果并设置了此标志，其他所有线程会在下一次检查时立刻退出，避免了大量不必要的计算。
内联函数 (inline): rotateLeft, ff0 等 SM3 的辅助函数被声明为 inline，编译器会将其代码直接嵌入调用处，消除了函数调用的开销。

5.3. GPU 任务执行 (`searchOnGPU`)

这个 C 函数负责将一个计算批次提交到 GPU。

数据拷贝: 使用 memcpy 将目标哈希、起始索引等数据从 CPU 内存拷贝到之前创建的共享 MTLBuffer 中。
创建命令编码器: [commandBuffer computeCommandEncoder] 创建一个编码器，用于记录计算指令。
设置管线和缓冲区: [encoder setComputePipelineState:...] 和 [encoder setBuffer:...] 将 PSO 和数据缓冲区与此次计算任务绑定。
分发线程:
```
[encoder dispatchThreadgroups:threadgroupsPerGridSize
        threadsPerThreadgroup:threadsPerThreadgroupSize];
```
这是最核心的指令，告诉 GPU 需要启动多少个线程组（Threadgroup），以及每个线程组包含多少个线程。GPU 会根据这些参数启动海量的线程来执行内核函数。
提交与等待: [commandBuffer commit] 将编码好的命令提交到命令队列，[commandBuffer waitUntilCompleted] 则会阻塞当前 CPU 线程，直到 GPU 完成此次计算。

6. 性能与优化策略总结

策略	实现方式	带来的好处
大规模并行化	将计算从 CPU 迁移到 GPU，使用 Metal 分发数百万线程。	利用 GPU 数百上千个核心的特性，实现数量级上的性能飞跃。
两级任务调度	Go 协程调度大任务块，每个块内再切分为 GPU 批次。	充分利用 CPU 的调度能力，持续向 GPU“喂”送数据，保持 GPU 忙碌。
内存优化	使用 `MTLResourceStorageModeShared` 统一内存；在内核中使用 `thread` 本地内存。	减少了 CPU 与 GPU 之间的数据拷贝开销；最大化了内核计算速度。
批处理机制	将大量计算合并为一次 `dispatch` 调用。	摊薄了单次向 GPU 提交命令的固定开销，提升了吞吐量。
快速退出机制	使用 `atomic` 变量在 Go 和 GPU 端同步“找到”状态。	一旦找到结果，能迅速终止所有不必要的计算，节省时间和能源。
动态配置	程序启动时检测 GPU 信息，动态调整批处理大小等参数。	使程序能更好地适应不同配置的 Mac 设备，实现更优性能。
编译时优化	在编译 MSL 时启用 `fastMathEnabled`。	进一步压榨 GPU 的浮点计算性能。