{
  "metadata": {
    "generated_at": "2026-05-12 10:57:28 MDT",
    "gpu": "NVIDIA RTX PRO 6000 Blackwell Workstation Edition",
    "torch": "2.11.0+cu130",
    "cuda": "13.0",
    "l2_flush_mb": 256,
    "gpu_warmup": 30,
    "gpu_iters": 200,
    "cpu_repeats": 7,
    "gpu_l2_bytes_reported": 134217728
  },
  "rows": [
    {
      "device": "gpu",
      "kernel": "b12x_nvfp4_dense",
      "kind": "gemm",
      "m": 128,
      "n": 4096,
      "k": 5376,
      "flops": 5637144576.0,
      "median_us": 24.399999529123306,
      "min_us": 22.207999601960182,
      "max_us": 28.960000723600388,
      "stdev_us": 1.140464425269814,
      "samples": 200,
      "inner_runs_median": null,
      "tflops": 231.03051986831505,
      "correctness_cosine": 0.9999998807907104,
      "shape_key": "gemm_m128_n4096_k5376"
    },
    {
      "device": "gpu",
      "kernel": "b12x_nvfp4_dense",
      "kind": "gemm",
      "m": 512,
      "n": 4096,
      "k": 5376,
      "flops": 22548578304.0,
      "median_us": 31.10400028526783,
      "min_us": 28.255999088287354,
      "max_us": 35.13599932193756,
      "stdev_us": 1.0377582872739457,
      "samples": 200,
      "inner_runs_median": null,
      "tflops": 724.9414254500235,
      "correctness_cosine": 1.0,
      "shape_key": "gemm_m512_n4096_k5376"
    },
    {
      "device": "gpu",
      "kernel": "b12x_nvfp4_dense",
      "kind": "gemm",
      "m": 2048,
      "n": 4096,
      "k": 5376,
      "flops": 90194313216.0,
      "median_us": 77.85599678754807,
      "min_us": 75.45600086450577,
      "max_us": 82.20800012350082,
      "stdev_us": 0.9932769143580772,
      "samples": 200,
      "inner_runs_median": null,
      "tflops": 1158.4761217831492,
      "correctness_cosine": 1.0,
      "shape_key": "gemm_m2048_n4096_k5376"
    },
    {
      "device": "gpu",
      "kernel": "b12x_nvfp4_dense",
      "kind": "gemm",
      "m": 4096,
      "n": 4096,
      "k": 5376,
      "flops": 180388626432.0,
      "median_us": 147.95199781656265,
      "min_us": 145.79200744628906,
      "max_us": 153.9199948310852,
      "stdev_us": 1.0908065456716094,
      "samples": 200,
      "inner_runs_median": null,
      "tflops": 1219.2375168576884,
      "correctness_cosine": 1.0,
      "shape_key": "gemm_m4096_n4096_k5376"
    },
    {
      "device": "gpu",
      "kernel": "attempt12_nvfp4_gemv",
      "kind": "gemv",
      "m": 7168,
      "n": 1,
      "k": 16384,
      "flops": 234881024.0,
      "median_us": 65.43999910354614,
      "min_us": 62.3680017888546,
      "max_us": 69.95200365781784,
      "stdev_us": 1.470922481580563,
      "samples": 200,
      "inner_runs_median": null,
      "tflops": 3.589257750880256,
      "correctness_cosine": null,
      "shape_key": "gemv_m7168_n1_k16384"
    },
    {
      "device": "gpu",
      "kernel": "attempt12_nvfp4_gemv",
      "kind": "gemv",
      "m": 4096,
      "n": 8,
      "k": 7168,
      "flops": 469762048.0,
      "median_us": 98.59199821949005,
      "min_us": 96.22400254011154,
      "max_us": 104.73600029945374,
      "stdev_us": 1.4261789223895622,
      "samples": 200,
      "inner_runs_median": null,
      "tflops": 4.7647076485273585,
      "correctness_cosine": null,
      "shape_key": "gemv_m4096_n8_k7168"
    },
    {
      "device": "gpu",
      "kernel": "attempt12_nvfp4_gemv",
      "kind": "gemv",
      "m": 7168,
      "n": 4,
      "k": 2048,
      "flops": 117440512.0,
      "median_us": 28.575999662280083,
      "min_us": 26.208000257611275,
      "max_us": 33.08799862861633,
      "stdev_us": 1.4033055642020222,
      "samples": 200,
      "inner_runs_median": null,
      "tflops": 4.109760406913072,
      "correctness_cosine": null,
      "shape_key": "gemv_m7168_n4_k2048"
    },
    {
      "device": "gpu",
      "kernel": "attempt12_nvfp4_gemv",
      "kind": "gemv",
      "m": 4608,
      "n": 2,
      "k": 2432,
      "flops": 44826624.0,
      "median_us": 16.287999227643013,
      "min_us": 14.047999866306782,
      "max_us": 21.12000063061714,
      "stdev_us": 1.2866044737418407,
      "samples": 200,
      "inner_runs_median": null,
      "tflops": 2.752125867241137,
      "correctness_cosine": null,
      "shape_key": "gemv_m4608_n2_k2432"
    },
    {
      "device": "gpu",
      "kernel": "attempt12_nvfp4_gemv",
      "kind": "gemv",
      "m": 7168,
      "n": 2,
      "k": 384,
      "flops": 11010048.0,
      "median_us": 8.415999822318554,
      "min_us": 6.144000217318535,
      "max_us": 14.336000196635723,
      "stdev_us": 0.9398815140942707,
      "samples": 200,
      "inner_runs_median": null,
      "tflops": 1.3082281645018858,
      "correctness_cosine": null,
      "shape_key": "gemv_m7168_n2_k384"
    },
    {
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "kind": "gemm",
      "m": 128,
      "n": 4096,
      "k": 5376,
      "flops": 5637144576.0,
      "median_us": 5348.0,
      "min_us": 5302.0,
      "max_us": 5608.0,
      "stdev_us": 106.10170323124615,
      "samples": 7,
      "inner_runs_median": 1,
      "tflops": 1.0540659267015706,
      "correctness_cosine": null,
      "shape_key": "gemm_m128_n4096_k5376"
    },
    {
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "kind": "gemm",
      "m": 512,
      "n": 4096,
      "k": 5376,
      "flops": 22548578304.0,
      "median_us": 12996.0,
      "min_us": 12978.0,
      "max_us": 13132.0,
      "stdev_us": 63.123838980365,
      "samples": 7,
      "inner_runs_median": 1,
      "tflops": 1.7350398818097876,
      "correctness_cosine": null,
      "shape_key": "gemm_m512_n4096_k5376"
    },
    {
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "kind": "gemm",
      "m": 2048,
      "n": 4096,
      "k": 5376,
      "flops": 90194313216.0,
      "median_us": 43859.0,
      "min_us": 43519.0,
      "max_us": 49233.0,
      "stdev_us": 2042.7731034994936,
      "samples": 7,
      "inner_runs_median": 1,
      "tflops": 2.0564607769442986,
      "correctness_cosine": null,
      "shape_key": "gemm_m2048_n4096_k5376"
    },
    {
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "kind": "gemm",
      "m": 4096,
      "n": 4096,
      "k": 5376,
      "flops": 180388626432.0,
      "median_us": 85177.0,
      "min_us": 85008.0,
      "max_us": 86961.0,
      "stdev_us": 687.6497239562761,
      "samples": 7,
      "inner_runs_median": 1,
      "tflops": 2.1178091084682484,
      "correctness_cosine": null,
      "shape_key": "gemm_m4096_n4096_k5376"
    },
    {
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "kind": "gemv",
      "m": 7168,
      "n": 1,
      "k": 16384,
      "flops": 234881024.0,
      "median_us": 1695.59,
      "min_us": 1671.94,
      "max_us": 1730.65,
      "stdev_us": 21.84507833858806,
      "samples": 7,
      "inner_runs_median": 17,
      "tflops": 0.1385246574938517,
      "correctness_cosine": null,
      "shape_key": "gemv_m7168_n1_k16384"
    },
    {
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "kind": "gemv",
      "m": 4096,
      "n": 8,
      "k": 7168,
      "flops": 469762048.0,
      "median_us": 841.44,
      "min_us": 826.89,
      "max_us": 854.0,
      "stdev_us": 9.198980792405647,
      "samples": 7,
      "inner_runs_median": 9,
      "tflops": 0.5582834759459974,
      "correctness_cosine": null,
      "shape_key": "gemv_m4096_n8_k7168"
    },
    {
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "kind": "gemv",
      "m": 7168,
      "n": 4,
      "k": 2048,
      "flops": 117440512.0,
      "median_us": 337.91,
      "min_us": 267.48,
      "max_us": 375.94,
      "stdev_us": 47.892540029488835,
      "samples": 7,
      "inner_runs_median": 33,
      "tflops": 0.3475496789085851,
      "correctness_cosine": null,
      "shape_key": "gemv_m7168_n4_k2048"
    },
    {
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "kind": "gemv",
      "m": 4608,
      "n": 2,
      "k": 2432,
      "flops": 44826624.0,
      "median_us": 169.31,
      "min_us": 168.48,
      "max_us": 177.61,
      "stdev_us": 3.2990749064086557,
      "samples": 7,
      "inner_runs_median": 85,
      "tflops": 0.26476064024570317,
      "correctness_cosine": null,
      "shape_key": "gemv_m4608_n2_k2432"
    },
    {
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "kind": "gemv",
      "m": 7168,
      "n": 2,
      "k": 384,
      "flops": 11010048.0,
      "median_us": 48.92,
      "min_us": 47.07,
      "max_us": 52.35,
      "stdev_us": 1.9278213708990883,
      "samples": 7,
      "inner_runs_median": 344,
      "tflops": 0.22506230580539657,
      "correctness_cosine": null,
      "shape_key": "gemv_m7168_n2_k384"
    }
  ]
}