{
  "metadata": {
    "generated_at": "2026-05-12 11:59:50 MDT",
    "gpu": "NVIDIA RTX PRO 6000 Blackwell Workstation Edition",
    "torch": "2.11.0+cu130",
    "cuda": "13.0",
    "gpu_warmup": 20,
    "gpu_iters": 100,
    "gpu_l2_flush_mb": 256,
    "cpu_repeats": 5,
    "sources": {
      "Qwen2.5-1.5B": "https://huggingface.co/Qwen/Qwen2.5-1.5B/raw/main/config.json",
      "Qwen2.5-3B": "https://huggingface.co/Qwen/Qwen2.5-3B/raw/main/config.json",
      "Qwen2.5-7B": "https://huggingface.co/Qwen/Qwen2.5-7B/raw/main/config.json",
      "Mistral-7B-v0.3": "https://huggingface.co/mistralai/Mistral-7B-v0.3/raw/main/config.json",
      "OLMoE-1B-7B": "https://huggingface.co/allenai/OLMoE-1B-7B-0924/raw/main/config.json",
      "Qwen1.5-MoE-A2.7B": "https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B/raw/main/config.json"
    },
    "token_counts": [
      1,
      2,
      4,
      8,
      16,
      32,
      128
    ]
  },
  "model_configs": {
    "Qwen2.5-1.5B": {
      "architectures": [
        "Qwen2ForCausalLM"
      ],
      "attention_dropout": 0.0,
      "bos_token_id": 151643,
      "eos_token_id": 151643,
      "hidden_act": "silu",
      "hidden_size": 1536,
      "initializer_range": 0.02,
      "intermediate_size": 8960,
      "max_position_embeddings": 131072,
      "max_window_layers": 28,
      "model_type": "qwen2",
      "num_attention_heads": 12,
      "num_hidden_layers": 28,
      "num_key_value_heads": 2,
      "rms_norm_eps": 1e-06,
      "rope_theta": 1000000.0,
      "sliding_window": 131072,
      "tie_word_embeddings": true,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.40.1",
      "use_cache": true,
      "use_mrope": false,
      "use_sliding_window": false,
      "vocab_size": 151936
    },
    "Qwen2.5-3B": {
      "architectures": [
        "Qwen2ForCausalLM"
      ],
      "attention_dropout": 0.0,
      "bos_token_id": 151643,
      "eos_token_id": 151643,
      "hidden_act": "silu",
      "hidden_size": 2048,
      "initializer_range": 0.02,
      "intermediate_size": 11008,
      "max_position_embeddings": 32768,
      "max_window_layers": 36,
      "model_type": "qwen2",
      "num_attention_heads": 16,
      "num_hidden_layers": 36,
      "num_key_value_heads": 2,
      "rms_norm_eps": 1e-06,
      "rope_theta": 1000000.0,
      "sliding_window": 32768,
      "tie_word_embeddings": true,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.40.1",
      "use_cache": true,
      "use_mrope": false,
      "use_sliding_window": false,
      "vocab_size": 151936
    },
    "Qwen2.5-7B": {
      "architectures": [
        "Qwen2ForCausalLM"
      ],
      "attention_dropout": 0.0,
      "bos_token_id": 151643,
      "eos_token_id": 151643,
      "hidden_act": "silu",
      "hidden_size": 3584,
      "initializer_range": 0.02,
      "intermediate_size": 18944,
      "max_position_embeddings": 131072,
      "max_window_layers": 28,
      "model_type": "qwen2",
      "num_attention_heads": 28,
      "num_hidden_layers": 28,
      "num_key_value_heads": 4,
      "rms_norm_eps": 1e-06,
      "rope_theta": 1000000.0,
      "sliding_window": 131072,
      "tie_word_embeddings": false,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.40.1",
      "use_cache": true,
      "use_mrope": false,
      "use_sliding_window": false,
      "vocab_size": 152064
    },
    "Mistral-7B-v0.3": {
      "architectures": [
        "MistralForCausalLM"
      ],
      "attention_dropout": 0.0,
      "bos_token_id": 1,
      "eos_token_id": 2,
      "hidden_act": "silu",
      "hidden_size": 4096,
      "initializer_range": 0.02,
      "intermediate_size": 14336,
      "max_position_embeddings": 32768,
      "model_type": "mistral",
      "num_attention_heads": 32,
      "num_hidden_layers": 32,
      "num_key_value_heads": 8,
      "rms_norm_eps": 1e-05,
      "rope_theta": 1000000.0,
      "sliding_window": null,
      "tie_word_embeddings": false,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.42.0.dev0",
      "use_cache": true,
      "vocab_size": 32768
    },
    "OLMoE-1B-7B": {
      "architectures": [
        "OlmoeForCausalLM"
      ],
      "attention_bias": false,
      "attention_dropout": 0.0,
      "clip_qkv": null,
      "eos_token_id": 50279,
      "hidden_act": "silu",
      "hidden_size": 2048,
      "initializer_range": 0.02,
      "intermediate_size": 1024,
      "max_position_embeddings": 4096,
      "model_type": "olmoe",
      "norm_topk_prob": false,
      "num_attention_heads": 16,
      "num_experts": 64,
      "num_experts_per_tok": 8,
      "num_hidden_layers": 16,
      "num_key_value_heads": 16,
      "output_router_logits": false,
      "pad_token_id": 1,
      "rope_scaling": null,
      "rope_theta": 10000.0,
      "router_aux_loss_coef": 0.01,
      "tie_word_embeddings": false,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.43.0.dev0",
      "use_cache": true,
      "vocab_size": 50304
    },
    "Qwen1.5-MoE-A2.7B": {
      "architectures": [
        "Qwen2MoeForCausalLM"
      ],
      "attention_dropout": 0.0,
      "bos_token_id": 151643,
      "eos_token_id": 151643,
      "hidden_act": "silu",
      "hidden_size": 2048,
      "initializer_range": 0.02,
      "intermediate_size": 5632,
      "max_position_embeddings": 8192,
      "max_window_layers": 21,
      "model_type": "qwen2_moe",
      "num_attention_heads": 16,
      "num_hidden_layers": 24,
      "num_key_value_heads": 16,
      "rms_norm_eps": 1e-06,
      "rope_theta": 1000000.0,
      "sliding_window": 32768,
      "tie_word_embeddings": false,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.39.0.dev0",
      "use_cache": true,
      "use_sliding_window": false,
      "vocab_size": 151936,
      "decoder_sparse_step": 1,
      "moe_intermediate_size": 1408,
      "shared_expert_intermediate_size": 5632,
      "num_experts_per_tok": 4,
      "num_experts": 60,
      "norm_topk_prob": false,
      "output_router_logits": false,
      "router_aux_loss_coef": 0.001
    }
  },
  "rows": [
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 1536,
      "k": 1536,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4718592.0,
      "median_us": 10.239999741315842,
      "min_us": 9.568000212311745,
      "max_us": 14.976000413298607,
      "stdev_us": 1.1588298875074017,
      "samples": 100,
      "tflops": 0.46080001164078743,
      "shape_key": "Qwen2.5-1.5B|attn_q|T1|M1|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 1536,
      "k": 1536,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4718592.0,
      "median_us": 35.6,
      "min_us": 34.16,
      "max_us": 36.0,
      "stdev_us": 0.7259683188679806,
      "samples": 5,
      "tflops": 0.1325447191011236,
      "shape_key": "Qwen2.5-1.5B|attn_q|T1|M1|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 256,
      "k": 1536,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 786432.0,
      "median_us": 10.591999627649784,
      "min_us": 10.080000385642052,
      "max_us": 14.879999682307243,
      "stdev_us": 0.9715387793804159,
      "samples": 100,
      "tflops": 0.07424773674907108,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T1|M1|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 256,
      "k": 1536,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 786432.0,
      "median_us": 6.47,
      "min_us": 6.2,
      "max_us": 6.74,
      "stdev_us": 0.20432816741702559,
      "samples": 5,
      "tflops": 0.12155054095826894,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T1|M1|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1536,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 6291456.0,
      "median_us": 10.591999627649784,
      "min_us": 9.855999611318111,
      "max_us": 15.71200042963028,
      "stdev_us": 1.2606703477252468,
      "samples": 100,
      "tflops": 0.5939818939925686,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T1|M1|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1536,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 6291456.0,
      "median_us": 47.33,
      "min_us": 46.21,
      "max_us": 49.28,
      "stdev_us": 1.1503130008828035,
      "samples": 5,
      "tflops": 0.1329274455947602,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T1|M1|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 1536,
      "k": 1536,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4718592.0,
      "median_us": 10.239999741315842,
      "min_us": 9.568000212311745,
      "max_us": 14.976000413298607,
      "stdev_us": 1.1588298875074017,
      "samples": 100,
      "tflops": 0.46080001164078743,
      "shape_key": "Qwen2.5-1.5B|attn_o|T1|M1|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 1536,
      "k": 1536,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4718592.0,
      "median_us": 35.6,
      "min_us": 34.16,
      "max_us": 36.0,
      "stdev_us": 0.7259683188679806,
      "samples": 5,
      "tflops": 0.1325447191011236,
      "shape_key": "Qwen2.5-1.5B|attn_o|T1|M1|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 1,
      "m": 1,
      "n": 8960,
      "k": 1536,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 27525120.0,
      "median_us": 14.336000196635723,
      "min_us": 13.69599997997284,
      "max_us": 19.45599913597107,
      "stdev_us": 1.2151836685218287,
      "samples": 100,
      "tflops": 1.919999973664859,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T1|M1|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 1,
      "m": 1,
      "n": 8960,
      "k": 1536,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 27525120.0,
      "median_us": 207.14,
      "min_us": 196.32,
      "max_us": 211.33,
      "stdev_us": 6.024120682722088,
      "samples": 5,
      "tflops": 0.13288172250651734,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T1|M1|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 1,
      "m": 1,
      "n": 1536,
      "k": 8960,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 27525120.0,
      "median_us": 35.16799956560135,
      "min_us": 33.11999887228012,
      "max_us": 40.672000497579575,
      "stdev_us": 1.1691818112833432,
      "samples": 100,
      "tflops": 0.7826751689033508,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T1|M1|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 1,
      "m": 1,
      "n": 1536,
      "k": 8960,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 27525120.0,
      "median_us": 203.29,
      "min_us": 194.35,
      "max_us": 212.01,
      "stdev_us": 6.7874752301573835,
      "samples": 5,
      "tflops": 0.135398297997934,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T1|M1|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 1536,
      "k": 1536,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 9437184.0,
      "median_us": 9.8879998549819,
      "min_us": 9.472000412642956,
      "max_us": 14.495999552309513,
      "stdev_us": 1.1286622170588405,
      "samples": 100,
      "tflops": 0.9544077809877026,
      "shape_key": "Qwen2.5-1.5B|attn_q|T2|M2|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 1536,
      "k": 1536,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 9437184.0,
      "median_us": 37.88,
      "min_us": 36.97,
      "max_us": 38.07,
      "stdev_us": 0.4754261246502978,
      "samples": 5,
      "tflops": 0.2491336853220697,
      "shape_key": "Qwen2.5-1.5B|attn_q|T2|M2|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 256,
      "k": 1536,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1572864.0,
      "median_us": 9.8879998549819,
      "min_us": 9.472000412642956,
      "max_us": 14.655999839305878,
      "stdev_us": 0.8419774870212022,
      "samples": 100,
      "tflops": 0.15906796349795044,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T2|M2|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 256,
      "k": 1536,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1572864.0,
      "median_us": 6.82,
      "min_us": 6.78,
      "max_us": 7.01,
      "stdev_us": 0.09071934744033369,
      "samples": 5,
      "tflops": 0.23062521994134896,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T2|M2|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 1536,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 12582912.0,
      "median_us": 10.591999627649784,
      "min_us": 9.8879998549819,
      "max_us": 16.35199971497059,
      "stdev_us": 1.3007013611694023,
      "samples": 100,
      "tflops": 1.1879637879851372,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T2|M2|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 1536,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 12582912.0,
      "median_us": 49.61,
      "min_us": 49.25,
      "max_us": 50.7,
      "stdev_us": 0.545096321763412,
      "samples": 5,
      "tflops": 0.25363660552308004,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T2|M2|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 1536,
      "k": 1536,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 9437184.0,
      "median_us": 9.8879998549819,
      "min_us": 9.472000412642956,
      "max_us": 14.495999552309513,
      "stdev_us": 1.1286622170588405,
      "samples": 100,
      "tflops": 0.9544077809877026,
      "shape_key": "Qwen2.5-1.5B|attn_o|T2|M2|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 1536,
      "k": 1536,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 9437184.0,
      "median_us": 37.88,
      "min_us": 36.97,
      "max_us": 38.07,
      "stdev_us": 0.4754261246502978,
      "samples": 5,
      "tflops": 0.2491336853220697,
      "shape_key": "Qwen2.5-1.5B|attn_o|T2|M2|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 2,
      "m": 2,
      "n": 8960,
      "k": 1536,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 55050240.0,
      "median_us": 14.112000353634357,
      "min_us": 13.663999736309052,
      "max_us": 19.45599913597107,
      "stdev_us": 0.8741496214924414,
      "samples": 100,
      "tflops": 3.900952283197934,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T2|M2|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 2,
      "m": 2,
      "n": 8960,
      "k": 1536,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 55050240.0,
      "median_us": 218.81,
      "min_us": 215.13,
      "max_us": 227.52,
      "stdev_us": 4.946926318432492,
      "samples": 5,
      "tflops": 0.2515892326676112,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T2|M2|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 2,
      "m": 2,
      "n": 1536,
      "k": 8960,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 55050240.0,
      "median_us": 33.11999887228012,
      "min_us": 32.35200047492981,
      "max_us": 38.24000060558319,
      "stdev_us": 1.2549808215147107,
      "samples": 100,
      "tflops": 1.6621449841314595,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T2|M2|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 2,
      "m": 2,
      "n": 1536,
      "k": 8960,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 55050240.0,
      "median_us": 213.43,
      "min_us": 209.99,
      "max_us": 230.01,
      "stdev_us": 8.368726306911938,
      "samples": 5,
      "tflops": 0.25793112495900294,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T2|M2|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 1536,
      "k": 1536,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 18874368.0,
      "median_us": 10.56000031530857,
      "min_us": 9.824000298976898,
      "max_us": 14.816000126302242,
      "stdev_us": 0.8979431397201779,
      "samples": 100,
      "tflops": 1.787345401177526,
      "shape_key": "Qwen2.5-1.5B|attn_q|T4|M4|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 1536,
      "k": 1536,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 18874368.0,
      "median_us": 44.88,
      "min_us": 44.37,
      "max_us": 46.3,
      "stdev_us": 0.7994873357345941,
      "samples": 5,
      "tflops": 0.420551871657754,
      "shape_key": "Qwen2.5-1.5B|attn_q|T4|M4|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 256,
      "k": 1536,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 3145728.0,
      "median_us": 10.239999741315842,
      "min_us": 9.440000168979168,
      "max_us": 13.98400031030178,
      "stdev_us": 0.5927417332188453,
      "samples": 100,
      "tflops": 0.307200007760525,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T4|M4|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 256,
      "k": 1536,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 3145728.0,
      "median_us": 8.14,
      "min_us": 8.12,
      "max_us": 8.2,
      "stdev_us": 0.03435112807463504,
      "samples": 5,
      "tflops": 0.3864530712530712,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T4|M4|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 1536,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 25165824.0,
      "median_us": 10.591999627649784,
      "min_us": 9.855999611318111,
      "max_us": 15.359999611973763,
      "stdev_us": 1.0368624729226763,
      "samples": 100,
      "tflops": 2.3759275759702745,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T4|M4|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 1536,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 25165824.0,
      "median_us": 59.69,
      "min_us": 59.21,
      "max_us": 60.07,
      "stdev_us": 0.30680612770934,
      "samples": 5,
      "tflops": 0.4216087116769978,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T4|M4|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 1536,
      "k": 1536,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 18874368.0,
      "median_us": 10.56000031530857,
      "min_us": 9.824000298976898,
      "max_us": 14.816000126302242,
      "stdev_us": 0.8979431397201779,
      "samples": 100,
      "tflops": 1.787345401177526,
      "shape_key": "Qwen2.5-1.5B|attn_o|T4|M4|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 1536,
      "k": 1536,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 18874368.0,
      "median_us": 44.88,
      "min_us": 44.37,
      "max_us": 46.3,
      "stdev_us": 0.7994873357345941,
      "samples": 5,
      "tflops": 0.420551871657754,
      "shape_key": "Qwen2.5-1.5B|attn_o|T4|M4|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 4,
      "m": 4,
      "n": 8960,
      "k": 1536,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 110100480.0,
      "median_us": 14.688000082969666,
      "min_us": 12.256000190973282,
      "max_us": 19.45599913597107,
      "stdev_us": 1.0564166405489335,
      "samples": 100,
      "tflops": 7.495947670075145,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T4|M4|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 4,
      "m": 4,
      "n": 8960,
      "k": 1536,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 110100480.0,
      "median_us": 259.69,
      "min_us": 255.49,
      "max_us": 315.71,
      "stdev_us": 25.191254037860034,
      "samples": 5,
      "tflops": 0.423968885979437,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T4|M4|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 4,
      "m": 4,
      "n": 1536,
      "k": 8960,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 110100480.0,
      "median_us": 34.063998609781265,
      "min_us": 32.22399950027466,
      "max_us": 37.50399872660637,
      "stdev_us": 1.0997017533804045,
      "samples": 100,
      "tflops": 3.232165467749442,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T4|M4|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 4,
      "m": 4,
      "n": 1536,
      "k": 8960,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 110100480.0,
      "median_us": 273.49,
      "min_us": 251.74,
      "max_us": 313.06,
      "stdev_us": 25.2976091755723,
      "samples": 5,
      "tflops": 0.40257588942922956,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T4|M4|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 1536,
      "k": 1536,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 37748736.0,
      "median_us": 10.239999741315842,
      "min_us": 9.472000412642956,
      "max_us": 15.39199985563755,
      "stdev_us": 0.908502478140142,
      "samples": 100,
      "tflops": 3.6864000931262995,
      "shape_key": "Qwen2.5-1.5B|attn_q|T8|M8|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 1536,
      "k": 1536,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 37748736.0,
      "median_us": 67.2,
      "min_us": 66.7,
      "max_us": 68.97,
      "stdev_us": 0.8758253250506046,
      "samples": 5,
      "tflops": 0.5617371428571428,
      "shape_key": "Qwen2.5-1.5B|attn_q|T8|M8|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 256,
      "k": 1536,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 6291456.0,
      "median_us": 10.143999941647053,
      "min_us": 8.960000239312649,
      "max_us": 14.14399966597557,
      "stdev_us": 0.7608042502382298,
      "samples": 100,
      "tflops": 0.6202145146087682,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T8|M8|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 256,
      "k": 1536,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 6291456.0,
      "median_us": 12.16,
      "min_us": 12.07,
      "max_us": 12.47,
      "stdev_us": 0.18349386910739,
      "samples": 5,
      "tflops": 0.5173894736842105,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T8|M8|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 1536,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 50331648.0,
      "median_us": 10.239999741315842,
      "min_us": 9.472000412642956,
      "max_us": 15.00799972563982,
      "stdev_us": 1.1484162961872528,
      "samples": 100,
      "tflops": 4.9152001241684,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T8|M8|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 1536,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 50331648.0,
      "median_us": 89.67,
      "min_us": 89.01,
      "max_us": 90.55,
      "stdev_us": 0.5637197885474634,
      "samples": 5,
      "tflops": 0.5612986283037805,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T8|M8|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 1536,
      "k": 1536,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 37748736.0,
      "median_us": 10.239999741315842,
      "min_us": 9.472000412642956,
      "max_us": 15.39199985563755,
      "stdev_us": 0.908502478140142,
      "samples": 100,
      "tflops": 3.6864000931262995,
      "shape_key": "Qwen2.5-1.5B|attn_o|T8|M8|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 1536,
      "k": 1536,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 37748736.0,
      "median_us": 67.2,
      "min_us": 66.7,
      "max_us": 68.97,
      "stdev_us": 0.8758253250506046,
      "samples": 5,
      "tflops": 0.5617371428571428,
      "shape_key": "Qwen2.5-1.5B|attn_o|T8|M8|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 8,
      "m": 8,
      "n": 8960,
      "k": 1536,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 220200960.0,
      "median_us": 14.303999952971935,
      "min_us": 13.567999936640263,
      "max_us": 21.31200022995472,
      "stdev_us": 1.1680940860392397,
      "samples": 100,
      "tflops": 15.394362466720294,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T8|M8|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 8,
      "m": 8,
      "n": 8960,
      "k": 1536,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 220200960.0,
      "median_us": 386.0,
      "min_us": 384.39,
      "max_us": 425.28,
      "stdev_us": 17.767972028343575,
      "samples": 5,
      "tflops": 0.5704688082901554,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T8|M8|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 8,
      "m": 8,
      "n": 1536,
      "k": 8960,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 220200960.0,
      "median_us": 33.11999887228012,
      "min_us": 32.44800120592117,
      "max_us": 38.24000060558319,
      "stdev_us": 1.1957975558485474,
      "samples": 100,
      "tflops": 6.648579936525838,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T8|M8|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 8,
      "m": 8,
      "n": 1536,
      "k": 8960,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 220200960.0,
      "median_us": 487.44,
      "min_us": 376.72,
      "max_us": 696.78,
      "stdev_us": 121.83177713552402,
      "samples": 5,
      "tflops": 0.4517498769079271,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T8|M8|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 1536,
      "k": 1536,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 75497472.0,
      "median_us": 10.591999627649784,
      "min_us": 9.855999611318111,
      "max_us": 15.359999611973763,
      "stdev_us": 1.0356998366277819,
      "samples": 100,
      "tflops": 7.127782727910823,
      "shape_key": "Qwen2.5-1.5B|attn_q|T16|M16|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 1536,
      "k": 1536,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 75497472.0,
      "median_us": 134.76,
      "min_us": 133.27,
      "max_us": 136.94,
      "stdev_us": 1.3454850426519007,
      "samples": 5,
      "tflops": 0.5602365093499555,
      "shape_key": "Qwen2.5-1.5B|attn_q|T16|M16|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 256,
      "k": 1536,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 12582912.0,
      "median_us": 10.015999898314476,
      "min_us": 8.927999995648861,
      "max_us": 15.776000916957855,
      "stdev_us": 0.9211601486960944,
      "samples": 100,
      "tflops": 1.2562811629138986,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T16|M16|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 256,
      "k": 1536,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 12582912.0,
      "median_us": 24.0,
      "min_us": 23.7,
      "max_us": 24.07,
      "stdev_us": 0.15116216457830986,
      "samples": 5,
      "tflops": 0.524288,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T16|M16|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 1536,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 100663296.0,
      "median_us": 10.239999741315842,
      "min_us": 9.472000412642956,
      "max_us": 15.296000055968761,
      "stdev_us": 1.193553842087043,
      "samples": 100,
      "tflops": 9.8304002483368,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T16|M16|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 1536,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 100663296.0,
      "median_us": 182.42,
      "min_us": 176.45,
      "max_us": 207.39,
      "stdev_us": 12.363230564864507,
      "samples": 5,
      "tflops": 0.5518215985089354,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T16|M16|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 1536,
      "k": 1536,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 75497472.0,
      "median_us": 10.591999627649784,
      "min_us": 9.855999611318111,
      "max_us": 15.359999611973763,
      "stdev_us": 1.0356998366277819,
      "samples": 100,
      "tflops": 7.127782727910823,
      "shape_key": "Qwen2.5-1.5B|attn_o|T16|M16|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 1536,
      "k": 1536,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 75497472.0,
      "median_us": 134.76,
      "min_us": 133.27,
      "max_us": 136.94,
      "stdev_us": 1.3454850426519007,
      "samples": 5,
      "tflops": 0.5602365093499555,
      "shape_key": "Qwen2.5-1.5B|attn_o|T16|M16|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 16,
      "m": 16,
      "n": 8960,
      "k": 1536,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 440401920.0,
      "median_us": 14.688000082969666,
      "min_us": 13.919999822974205,
      "max_us": 19.807999953627586,
      "stdev_us": 1.028151940703119,
      "samples": 100,
      "tflops": 29.98379068030058,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T16|M16|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 16,
      "m": 16,
      "n": 8960,
      "k": 1536,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 440401920.0,
      "median_us": 776.0,
      "min_us": 773.67,
      "max_us": 827.67,
      "stdev_us": 23.362841222762263,
      "samples": 5,
      "tflops": 0.5675282474226804,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T16|M16|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 16,
      "m": 16,
      "n": 1536,
      "k": 8960,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 440401920.0,
      "median_us": 33.11999887228012,
      "min_us": 32.3840007185936,
      "max_us": 37.21600025892258,
      "stdev_us": 0.9799298552580197,
      "samples": 100,
      "tflops": 13.297159873051676,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T16|M16|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 16,
      "m": 16,
      "n": 1536,
      "k": 8960,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 440401920.0,
      "median_us": 770.22,
      "min_us": 765.89,
      "max_us": 789.11,
      "stdev_us": 9.227298087739443,
      "samples": 5,
      "tflops": 0.5717871776894913,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T16|M16|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 1536,
      "k": 1536,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 150994944.0,
      "median_us": 9.8879998549819,
      "min_us": 9.472000412642956,
      "max_us": 15.00799972563982,
      "stdev_us": 0.9540055592151854,
      "samples": 100,
      "tflops": 15.270524495803242,
      "shape_key": "Qwen2.5-1.5B|attn_q|T32|M32|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 1536,
      "k": 1536,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 150994944.0,
      "median_us": 85.96,
      "min_us": 85.46,
      "max_us": 88.12,
      "stdev_us": 1.0661707180372233,
      "samples": 5,
      "tflops": 1.7565721731037691,
      "shape_key": "Qwen2.5-1.5B|attn_q|T32|M32|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 256,
      "k": 1536,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 25165824.0,
      "median_us": 10.079999919980764,
      "min_us": 9.119999594986439,
      "max_us": 14.84800036996603,
      "stdev_us": 0.8652842964957715,
      "samples": 100,
      "tflops": 2.4966095436286495,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T32|M32|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 256,
      "k": 1536,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 25165824.0,
      "median_us": 16.8,
      "min_us": 16.62,
      "max_us": 22.81,
      "stdev_us": 3.3019282245378987,
      "samples": 5,
      "tflops": 1.4979657142857143,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T32|M32|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 1536,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 201326592.0,
      "median_us": 10.591999627649784,
      "min_us": 9.952000342309475,
      "max_us": 15.615999698638916,
      "stdev_us": 1.2025723610421382,
      "samples": 100,
      "tflops": 19.007420607762196,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T32|M32|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 1536,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 201326592.0,
      "median_us": 115.74,
      "min_us": 114.58,
      "max_us": 127.95,
      "stdev_us": 5.658659735308355,
      "samples": 5,
      "tflops": 1.73947288750648,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T32|M32|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 1536,
      "k": 1536,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 150994944.0,
      "median_us": 9.8879998549819,
      "min_us": 9.472000412642956,
      "max_us": 15.00799972563982,
      "stdev_us": 0.9540055592151854,
      "samples": 100,
      "tflops": 15.270524495803242,
      "shape_key": "Qwen2.5-1.5B|attn_o|T32|M32|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 1536,
      "k": 1536,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 150994944.0,
      "median_us": 85.96,
      "min_us": 85.46,
      "max_us": 88.12,
      "stdev_us": 1.0661707180372233,
      "samples": 5,
      "tflops": 1.7565721731037691,
      "shape_key": "Qwen2.5-1.5B|attn_o|T32|M32|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 32,
      "m": 32,
      "n": 8960,
      "k": 1536,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 880803840.0,
      "median_us": 14.32000007480383,
      "min_us": 13.632000423967838,
      "max_us": 19.1040001809597,
      "stdev_us": 0.9382726006976204,
      "samples": 100,
      "tflops": 61.5086477233881,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T32|M32|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 32,
      "m": 32,
      "n": 8960,
      "k": 1536,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 880803840.0,
      "median_us": 520.6,
      "min_us": 501.6,
      "max_us": 558.2,
      "stdev_us": 22.858433892110813,
      "samples": 5,
      "tflops": 1.6919013446023818,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T32|M32|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 32,
      "m": 32,
      "n": 1536,
      "k": 8960,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 880803840.0,
      "median_us": 34.15999934077263,
      "min_us": 32.06399828195572,
      "max_us": 38.30400109291077,
      "stdev_us": 1.3002359648150945,
      "samples": 100,
      "tflops": 25.784656235303018,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T32|M32|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 32,
      "m": 32,
      "n": 1536,
      "k": 8960,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 880803840.0,
      "median_us": 537.4,
      "min_us": 501.4,
      "max_us": 1131.6,
      "stdev_us": 269.8901776649161,
      "samples": 5,
      "tflops": 1.639009750651284,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T32|M32|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 1536,
      "k": 1536,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 603979776.0,
      "median_us": 10.239999741315842,
      "min_us": 9.472000412642956,
      "max_us": 15.00799972563982,
      "stdev_us": 1.1137281950303717,
      "samples": 100,
      "tflops": 58.98240149002079,
      "shape_key": "Qwen2.5-1.5B|attn_q|T128|M128|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 1536,
      "k": 1536,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 603979776.0,
      "median_us": 285.29,
      "min_us": 281.86,
      "max_us": 289.43,
      "stdev_us": 2.92255538869668,
      "samples": 5,
      "tflops": 2.1170730695082196,
      "shape_key": "Qwen2.5-1.5B|attn_q|T128|M128|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 256,
      "k": 1536,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 100663296.0,
      "median_us": 10.239999741315842,
      "min_us": 9.472000412642956,
      "max_us": 15.615999698638916,
      "stdev_us": 0.8984994921573133,
      "samples": 100,
      "tflops": 9.8304002483368,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T128|M128|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 256,
      "k": 1536,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 100663296.0,
      "median_us": 56.37,
      "min_us": 54.5,
      "max_us": 71.71,
      "stdev_us": 7.131705967017988,
      "samples": 5,
      "tflops": 1.7857600851516764,
      "shape_key": "Qwen2.5-1.5B|attn_kv_each|T128|M128|N256|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 1536,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 805306368.0,
      "median_us": 10.239999741315842,
      "min_us": 9.824000298976898,
      "max_us": 15.135999768972397,
      "stdev_us": 1.2278485251667808,
      "samples": 100,
      "tflops": 78.6432019866944,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T128|M128|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 1536,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 805306368.0,
      "median_us": 382.8,
      "min_us": 369.2,
      "max_us": 754.6,
      "stdev_us": 168.78088754358416,
      "samples": 5,
      "tflops": 2.103726144200627,
      "shape_key": "Qwen2.5-1.5B|attn_qkv_fused|T128|M128|N2048|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 1536,
      "k": 1536,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 603979776.0,
      "median_us": 10.239999741315842,
      "min_us": 9.472000412642956,
      "max_us": 15.00799972563982,
      "stdev_us": 1.1137281950303717,
      "samples": 100,
      "tflops": 58.98240149002079,
      "shape_key": "Qwen2.5-1.5B|attn_o|T128|M128|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 1536,
      "k": 1536,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 603979776.0,
      "median_us": 285.29,
      "min_us": 281.86,
      "max_us": 289.43,
      "stdev_us": 2.92255538869668,
      "samples": 5,
      "tflops": 2.1170730695082196,
      "shape_key": "Qwen2.5-1.5B|attn_o|T128|M128|N1536|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 128,
      "m": 128,
      "n": 8960,
      "k": 1536,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 3523215360.0,
      "median_us": 14.303999952971935,
      "min_us": 13.567999936640263,
      "max_us": 19.039999693632126,
      "stdev_us": 1.0595379346228158,
      "samples": 100,
      "tflops": 246.3097994675247,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T128|M128|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 128,
      "m": 128,
      "n": 8960,
      "k": 1536,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 3523215360.0,
      "median_us": 1734.5,
      "min_us": 1640.0,
      "max_us": 2130.5,
      "stdev_us": 231.44367565349458,
      "samples": 5,
      "tflops": 2.0312570539060246,
      "shape_key": "Qwen2.5-1.5B|mlp_up_or_gate|T128|M128|N8960|K1536"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 128,
      "m": 128,
      "n": 1536,
      "k": 8960,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 3523215360.0,
      "median_us": 32.96000137925148,
      "min_us": 32.127998769283295,
      "max_us": 37.34400123357773,
      "stdev_us": 1.2351524282386892,
      "samples": 100,
      "tflops": 106.89366542981655,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T128|M128|N1536|K8960"
    },
    {
      "model": "Qwen2.5-1.5B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 128,
      "m": 128,
      "n": 1536,
      "k": 8960,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 3523215360.0,
      "median_us": 1729.0,
      "min_us": 1668.5,
      "max_us": 1854.0,
      "stdev_us": 71.13191969854321,
      "samples": 5,
      "tflops": 2.0377185425101216,
      "shape_key": "Qwen2.5-1.5B|mlp_down|T128|M128|N1536|K8960"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 12.28800043463707,
      "min_us": 11.744000017642975,
      "max_us": 17.055999487638474,
      "stdev_us": 1.137415380513941,
      "samples": 100,
      "tflops": 0.6826666425201636,
      "shape_key": "Qwen2.5-3B|attn_q|T1|M1|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 62.62,
      "min_us": 61.17,
      "max_us": 64.16,
      "stdev_us": 1.223744254327674,
      "samples": 5,
      "tflops": 0.13396052379431492,
      "shape_key": "Qwen2.5-3B|attn_q|T1|M1|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 256,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1048576.0,
      "median_us": 11.935999616980553,
      "min_us": 11.52000017464161,
      "max_us": 16.383999958634377,
      "stdev_us": 0.9349166695280892,
      "samples": 100,
      "tflops": 0.08784986877079493,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T1|M1|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 256,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1048576.0,
      "median_us": 8.27,
      "min_us": 8.11,
      "max_us": 8.47,
      "stdev_us": 0.16456001944579426,
      "samples": 5,
      "tflops": 0.12679274486094316,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T1|M1|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 2560,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 10485760.0,
      "median_us": 11.951999738812447,
      "min_us": 11.52000017464161,
      "max_us": 16.383999958634377,
      "stdev_us": 1.0883329237596069,
      "samples": 100,
      "tflops": 0.8773226430008162,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T1|M1|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 2560,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 10485760.0,
      "median_us": 78.01,
      "min_us": 75.23,
      "max_us": 78.14,
      "stdev_us": 1.2367821150065184,
      "samples": 5,
      "tflops": 0.13441558774516088,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T1|M1|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 12.28800043463707,
      "min_us": 11.744000017642975,
      "max_us": 17.055999487638474,
      "stdev_us": 1.137415380513941,
      "samples": 100,
      "tflops": 0.6826666425201636,
      "shape_key": "Qwen2.5-3B|attn_o|T1|M1|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 62.62,
      "min_us": 61.17,
      "max_us": 64.16,
      "stdev_us": 1.223744254327674,
      "samples": 5,
      "tflops": 0.13396052379431492,
      "shape_key": "Qwen2.5-3B|attn_o|T1|M1|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 1,
      "m": 1,
      "n": 11008,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 45088768.0,
      "median_us": 18.079999834299088,
      "min_us": 16.383999958634377,
      "max_us": 21.15200087428093,
      "stdev_us": 0.6532515190173375,
      "samples": 100,
      "tflops": 2.493847810466419,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T1|M1|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 1,
      "m": 1,
      "n": 11008,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 45088768.0,
      "median_us": 333.55,
      "min_us": 324.24,
      "max_us": 340.76,
      "stdev_us": 6.611348576500864,
      "samples": 5,
      "tflops": 0.13517843801529006,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T1|M1|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 11008,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 45088768.0,
      "median_us": 41.31200164556503,
      "min_us": 40.28800129890442,
      "max_us": 46.112000942230225,
      "stdev_us": 1.1097401158846807,
      "samples": 100,
      "tflops": 1.091420560708668,
      "shape_key": "Qwen2.5-3B|mlp_down|T1|M1|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 11008,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 45088768.0,
      "median_us": 331.34,
      "min_us": 322.02,
      "max_us": 348.41,
      "stdev_us": 9.98336766827709,
      "samples": 5,
      "tflops": 0.13608006277539686,
      "shape_key": "Qwen2.5-3B|mlp_down|T1|M1|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 17.40800030529499,
      "stdev_us": 0.905924784909204,
      "samples": 100,
      "tflops": 1.3653332850403272,
      "shape_key": "Qwen2.5-3B|attn_q|T2|M2|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 65.76,
      "min_us": 64.65,
      "max_us": 72.83,
      "stdev_us": 3.3232032739512016,
      "samples": 5,
      "tflops": 0.2551279805352798,
      "shape_key": "Qwen2.5-3B|attn_q|T2|M2|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 256,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 2097152.0,
      "median_us": 12.496000155806541,
      "min_us": 10.239999741315842,
      "max_us": 15.904000028967857,
      "stdev_us": 0.7817873001405587,
      "samples": 100,
      "tflops": 0.1678258621840295,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T2|M2|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 256,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 2097152.0,
      "median_us": 8.88,
      "min_us": 8.79,
      "max_us": 9.01,
      "stdev_us": 0.09257429448826518,
      "samples": 5,
      "tflops": 0.23616576576576576,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T2|M2|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 2560,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 20971520.0,
      "median_us": 12.640000320971012,
      "min_us": 11.872000060975552,
      "max_us": 17.75999926030636,
      "stdev_us": 1.1872702948059441,
      "samples": 100,
      "tflops": 1.6591391983753492,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T2|M2|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 2560,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 20971520.0,
      "median_us": 83.15,
      "min_us": 81.89,
      "max_us": 87.38,
      "stdev_us": 2.1650334870389405,
      "samples": 5,
      "tflops": 0.2522131088394468,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T2|M2|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 17.40800030529499,
      "stdev_us": 0.905924784909204,
      "samples": 100,
      "tflops": 1.3653332850403272,
      "shape_key": "Qwen2.5-3B|attn_o|T2|M2|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 65.76,
      "min_us": 64.65,
      "max_us": 72.83,
      "stdev_us": 3.3232032739512016,
      "samples": 5,
      "tflops": 0.2551279805352798,
      "shape_key": "Qwen2.5-3B|attn_o|T2|M2|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 2,
      "m": 2,
      "n": 11008,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 90177536.0,
      "median_us": 18.079999834299088,
      "min_us": 16.383999958634377,
      "max_us": 22.431999444961548,
      "stdev_us": 0.7784381093593615,
      "samples": 100,
      "tflops": 4.987695620932838,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T2|M2|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 2,
      "m": 2,
      "n": 11008,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 90177536.0,
      "median_us": 368.53,
      "min_us": 353.56,
      "max_us": 378.51,
      "stdev_us": 9.760613197950216,
      "samples": 5,
      "tflops": 0.2446952378368111,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T2|M2|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 11008,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 90177536.0,
      "median_us": 41.08799993991852,
      "min_us": 38.88000175356865,
      "max_us": 45.05600035190582,
      "stdev_us": 1.0492452068287015,
      "samples": 100,
      "tflops": 2.194741436231097,
      "shape_key": "Qwen2.5-3B|mlp_down|T2|M2|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 11008,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 90177536.0,
      "median_us": 375.05,
      "min_us": 348.6,
      "max_us": 399.7,
      "stdev_us": 18.634442572827332,
      "samples": 5,
      "tflops": 0.24044137048393546,
      "shape_key": "Qwen2.5-3B|mlp_down|T2|M2|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 12.28800043463707,
      "min_us": 11.680000461637974,
      "max_us": 17.311999574303627,
      "stdev_us": 1.0944388754410355,
      "samples": 100,
      "tflops": 2.7306665700806545,
      "shape_key": "Qwen2.5-3B|attn_q|T4|M4|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 78.82,
      "min_us": 77.73,
      "max_us": 81.74,
      "stdev_us": 1.7653526559868986,
      "samples": 5,
      "tflops": 0.4257096168485156,
      "shape_key": "Qwen2.5-3B|attn_q|T4|M4|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 256,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4194304.0,
      "median_us": 11.935999616980553,
      "min_us": 10.208000428974628,
      "max_us": 16.287999227643013,
      "stdev_us": 0.6278065799959395,
      "samples": 100,
      "tflops": 0.3513994750831797,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T4|M4|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 256,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4194304.0,
      "median_us": 10.61,
      "min_us": 10.6,
      "max_us": 11.06,
      "stdev_us": 0.1984943324127925,
      "samples": 5,
      "tflops": 0.39531611687087653,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T4|M4|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 2560,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 41943040.0,
      "median_us": 12.28800043463707,
      "min_us": 11.615999974310398,
      "max_us": 16.383999958634377,
      "stdev_us": 1.065290946963524,
      "samples": 100,
      "tflops": 3.4133332126008185,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T4|M4|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 2560,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 41943040.0,
      "median_us": 98.36,
      "min_us": 98.14,
      "max_us": 104.73,
      "stdev_us": 2.8306836630044,
      "samples": 5,
      "tflops": 0.4264237494916633,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T4|M4|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 12.28800043463707,
      "min_us": 11.680000461637974,
      "max_us": 17.311999574303627,
      "stdev_us": 1.0944388754410355,
      "samples": 100,
      "tflops": 2.7306665700806545,
      "shape_key": "Qwen2.5-3B|attn_o|T4|M4|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 78.82,
      "min_us": 77.73,
      "max_us": 81.74,
      "stdev_us": 1.7653526559868986,
      "samples": 5,
      "tflops": 0.4257096168485156,
      "shape_key": "Qwen2.5-3B|attn_o|T4|M4|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 4,
      "m": 4,
      "n": 11008,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 180355072.0,
      "median_us": 18.144000321626663,
      "min_us": 16.35199971497059,
      "max_us": 22.52800017595291,
      "stdev_us": 0.8750433755520869,
      "samples": 100,
      "tflops": 9.940204409334504,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T4|M4|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 4,
      "m": 4,
      "n": 11008,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 180355072.0,
      "median_us": 425.18,
      "min_us": 418.91,
      "max_us": 510.32,
      "stdev_us": 38.81720623641016,
      "samples": 5,
      "tflops": 0.424185220377252,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T4|M4|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 11008,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 180355072.0,
      "median_us": 40.95999896526337,
      "min_us": 38.88000175356865,
      "max_us": 45.43999955058098,
      "stdev_us": 1.0839142101490624,
      "samples": 100,
      "tflops": 4.403200111234191,
      "shape_key": "Qwen2.5-3B|mlp_down|T4|M4|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 11008,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 180355072.0,
      "median_us": 438.73,
      "min_us": 425.95,
      "max_us": 490.14,
      "stdev_us": 24.939923215599517,
      "samples": 5,
      "tflops": 0.4110844300594899,
      "shape_key": "Qwen2.5-3B|mlp_down|T4|M4|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 16.704000532627106,
      "stdev_us": 0.9340635760009698,
      "samples": 100,
      "tflops": 5.461333140161309,
      "shape_key": "Qwen2.5-3B|attn_q|T8|M8|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 118.74,
      "min_us": 117.7,
      "max_us": 120.16,
      "stdev_us": 1.0132028424752857,
      "samples": 5,
      "tflops": 0.5651748694626916,
      "shape_key": "Qwen2.5-3B|attn_q|T8|M8|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 256,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 12.256000190973282,
      "min_us": 10.239999741315842,
      "max_us": 16.383999958634377,
      "stdev_us": 0.6512952076758987,
      "samples": 100,
      "tflops": 0.6844490754967782,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T8|M8|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 256,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 15.94,
      "min_us": 15.83,
      "max_us": 16.12,
      "stdev_us": 0.11937336386313384,
      "samples": 5,
      "tflops": 0.5262614805520702,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T8|M8|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 2560,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 83886080.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 17.055999487638474,
      "stdev_us": 1.133926172269712,
      "samples": 100,
      "tflops": 6.826666425201637,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T8|M8|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 2560,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 83886080.0,
      "median_us": 148.74,
      "min_us": 146.85,
      "max_us": 188.02,
      "stdev_us": 20.824721366683395,
      "samples": 5,
      "tflops": 0.5639779480973511,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T8|M8|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 16.704000532627106,
      "stdev_us": 0.9340635760009698,
      "samples": 100,
      "tflops": 5.461333140161309,
      "shape_key": "Qwen2.5-3B|attn_o|T8|M8|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 118.74,
      "min_us": 117.7,
      "max_us": 120.16,
      "stdev_us": 1.0132028424752857,
      "samples": 5,
      "tflops": 0.5651748694626916,
      "shape_key": "Qwen2.5-3B|attn_o|T8|M8|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 8,
      "m": 8,
      "n": 11008,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 360710144.0,
      "median_us": 18.079999834299088,
      "min_us": 16.00000075995922,
      "max_us": 21.503999829292297,
      "stdev_us": 0.8190299927759143,
      "samples": 100,
      "tflops": 19.950782483731352,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T8|M8|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 8,
      "m": 8,
      "n": 11008,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 360710144.0,
      "median_us": 668.82,
      "min_us": 617.82,
      "max_us": 897.27,
      "stdev_us": 140.9254025007557,
      "samples": 5,
      "tflops": 0.5393232020573547,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T8|M8|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 11008,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 360710144.0,
      "median_us": 40.608000010252,
      "min_us": 38.91199827194214,
      "max_us": 44.704001396894455,
      "stdev_us": 0.7494501520932061,
      "samples": 100,
      "tflops": 8.882736010365795,
      "shape_key": "Qwen2.5-3B|mlp_down|T8|M8|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 11008,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 360710144.0,
      "median_us": 762.18,
      "min_us": 652.91,
      "max_us": 1032.91,
      "stdev_us": 181.49602480495273,
      "samples": 5,
      "tflops": 0.4732610984281928,
      "shape_key": "Qwen2.5-3B|mlp_down|T8|M8|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 12.608000077307224,
      "min_us": 11.264000087976456,
      "max_us": 16.767999157309532,
      "stdev_us": 1.0402497836019682,
      "samples": 100,
      "tflops": 10.645441559091884,
      "shape_key": "Qwen2.5-3B|attn_q|T16|M16|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 236.03,
      "min_us": 235.34,
      "max_us": 236.97,
      "stdev_us": 0.6490223416801582,
      "samples": 5,
      "tflops": 0.5686469008176926,
      "shape_key": "Qwen2.5-3B|attn_q|T16|M16|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 256,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 12.1760000474751,
      "min_us": 10.239999741315842,
      "max_us": 15.48799965530634,
      "stdev_us": 0.6348450481382274,
      "samples": 100,
      "tflops": 1.377892241670863,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T16|M16|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 256,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 31.51,
      "min_us": 31.12,
      "max_us": 34.81,
      "stdev_us": 1.5540752877515307,
      "samples": 5,
      "tflops": 0.5324410028562361,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T16|M16|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 2560,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 167772160.0,
      "median_us": 12.640000320971012,
      "min_us": 11.90400030463934,
      "max_us": 16.736000776290894,
      "stdev_us": 0.9583501276158155,
      "samples": 100,
      "tflops": 13.273113587002793,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T16|M16|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 2560,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 167772160.0,
      "median_us": 295.61,
      "min_us": 293.7,
      "max_us": 317.3,
      "stdev_us": 9.812147063716488,
      "samples": 5,
      "tflops": 0.5675456175366191,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T16|M16|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 12.608000077307224,
      "min_us": 11.264000087976456,
      "max_us": 16.767999157309532,
      "stdev_us": 1.0402497836019682,
      "samples": 100,
      "tflops": 10.645441559091884,
      "shape_key": "Qwen2.5-3B|attn_o|T16|M16|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 236.03,
      "min_us": 235.34,
      "max_us": 236.97,
      "stdev_us": 0.6490223416801582,
      "samples": 5,
      "tflops": 0.5686469008176926,
      "shape_key": "Qwen2.5-3B|attn_o|T16|M16|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 16,
      "m": 16,
      "n": 11008,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 721420288.0,
      "median_us": 18.079999834299088,
      "min_us": 16.00000075995922,
      "max_us": 22.52800017595291,
      "stdev_us": 0.9711709010345974,
      "samples": 100,
      "tflops": 39.901564967462704,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T16|M16|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 16,
      "m": 16,
      "n": 11008,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 721420288.0,
      "median_us": 1269.17,
      "min_us": 1261.83,
      "max_us": 1981.83,
      "stdev_us": 316.15879345670584,
      "samples": 5,
      "tflops": 0.5684189572712876,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T16|M16|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 11008,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 721420288.0,
      "median_us": 41.31200164556503,
      "min_us": 38.88000175356865,
      "max_us": 45.823998749256134,
      "stdev_us": 1.1601731893131093,
      "samples": 100,
      "tflops": 17.462728971338688,
      "shape_key": "Qwen2.5-3B|mlp_down|T16|M16|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 11008,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 721420288.0,
      "median_us": 1278.0,
      "min_us": 1244.5,
      "max_us": 1288.5,
      "stdev_us": 17.78051686537824,
      "samples": 5,
      "tflops": 0.5644916181533647,
      "shape_key": "Qwen2.5-3B|mlp_down|T16|M16|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 268435456.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 16.03199914097786,
      "stdev_us": 0.9056072681078319,
      "samples": 100,
      "tflops": 21.845332560645236,
      "shape_key": "Qwen2.5-3B|attn_q|T32|M32|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 268435456.0,
      "median_us": 151.2,
      "min_us": 150.87,
      "max_us": 155.47,
      "stdev_us": 1.9498512763798164,
      "samples": 5,
      "tflops": 1.7753667724867725,
      "shape_key": "Qwen2.5-3B|attn_q|T32|M32|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 256,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 12.048000004142523,
      "min_us": 10.239999741315842,
      "max_us": 16.00000075995922,
      "stdev_us": 0.7457942822393491,
      "samples": 100,
      "tflops": 2.7850624160410704,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T32|M32|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 256,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 22.15,
      "min_us": 22.01,
      "max_us": 22.5,
      "stdev_us": 0.19308029417835432,
      "samples": 5,
      "tflops": 1.514872776523702,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T32|M32|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 2560,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 335544320.0,
      "median_us": 12.28800043463707,
      "min_us": 11.58399973064661,
      "max_us": 16.51199907064438,
      "stdev_us": 1.0307941639922968,
      "samples": 100,
      "tflops": 27.306665700806548,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T32|M32|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 2560,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 335544320.0,
      "median_us": 194.75,
      "min_us": 187.25,
      "max_us": 217.33,
      "stdev_us": 11.766135729286832,
      "samples": 5,
      "tflops": 1.7229490115532735,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T32|M32|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 268435456.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 16.03199914097786,
      "stdev_us": 0.9056072681078319,
      "samples": 100,
      "tflops": 21.845332560645236,
      "shape_key": "Qwen2.5-3B|attn_o|T32|M32|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 268435456.0,
      "median_us": 151.2,
      "min_us": 150.87,
      "max_us": 155.47,
      "stdev_us": 1.9498512763798164,
      "samples": 5,
      "tflops": 1.7753667724867725,
      "shape_key": "Qwen2.5-3B|attn_o|T32|M32|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 32,
      "m": 32,
      "n": 11008,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1442840576.0,
      "median_us": 16.736000776290894,
      "min_us": 16.03199914097786,
      "max_us": 21.47199958562851,
      "stdev_us": 1.1532071500089929,
      "samples": 100,
      "tflops": 86.21178950015373,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T32|M32|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 32,
      "m": 32,
      "n": 11008,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1442840576.0,
      "median_us": 1226.33,
      "min_us": 927.67,
      "max_us": 1765.33,
      "stdev_us": 311.5449388130065,
      "samples": 5,
      "tflops": 1.1765516427062863,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T32|M32|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 11008,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1442840576.0,
      "median_us": 41.10400006175041,
      "min_us": 38.91199827194214,
      "max_us": 44.44799944758415,
      "stdev_us": 1.0441028727142707,
      "samples": 100,
      "tflops": 35.102193797013065,
      "shape_key": "Qwen2.5-3B|mlp_down|T32|M32|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 11008,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1442840576.0,
      "median_us": 1509.67,
      "min_us": 855.67,
      "max_us": 1696.33,
      "stdev_us": 401.0640605688822,
      "samples": 5,
      "tflops": 0.9557324289414243,
      "shape_key": "Qwen2.5-3B|mlp_down|T32|M32|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1073741824.0,
      "median_us": 12.256000190973282,
      "min_us": 11.552000418305397,
      "max_us": 16.383999958634377,
      "stdev_us": 0.9147577880989093,
      "samples": 100,
      "tflops": 87.60948166358762,
      "shape_key": "Qwen2.5-3B|attn_q|T128|M128|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1073741824.0,
      "median_us": 499.5,
      "min_us": 494.0,
      "max_us": 676.0,
      "stdev_us": 78.40129144344499,
      "samples": 5,
      "tflops": 2.149633281281281,
      "shape_key": "Qwen2.5-3B|attn_q|T128|M128|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 256,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 12.06399966031313,
      "min_us": 10.239999741315842,
      "max_us": 16.00000075995922,
      "stdev_us": 0.7220206228955125,
      "samples": 100,
      "tflops": 11.12547511432177,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T128|M128|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 256,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 73.86,
      "min_us": 72.83,
      "max_us": 78.93,
      "stdev_us": 2.4884292234258987,
      "samples": 5,
      "tflops": 1.8171910100189548,
      "shape_key": "Qwen2.5-3B|attn_kv_each|T128|M128|N256|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 2560,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1342177280.0,
      "median_us": 12.608000077307224,
      "min_us": 11.872000060975552,
      "max_us": 18.81599985063076,
      "stdev_us": 1.288348494619468,
      "samples": 100,
      "tflops": 106.45441559091883,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T128|M128|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 2560,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1342177280.0,
      "median_us": 720.0,
      "min_us": 611.33,
      "max_us": 1394.33,
      "stdev_us": 364.21408219617206,
      "samples": 5,
      "tflops": 1.864135111111111,
      "shape_key": "Qwen2.5-3B|attn_qkv_fused|T128|M128|N2560|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1073741824.0,
      "median_us": 12.256000190973282,
      "min_us": 11.552000418305397,
      "max_us": 16.383999958634377,
      "stdev_us": 0.9147577880989093,
      "samples": 100,
      "tflops": 87.60948166358762,
      "shape_key": "Qwen2.5-3B|attn_o|T128|M128|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1073741824.0,
      "median_us": 499.5,
      "min_us": 494.0,
      "max_us": 676.0,
      "stdev_us": 78.40129144344499,
      "samples": 5,
      "tflops": 2.149633281281281,
      "shape_key": "Qwen2.5-3B|attn_o|T128|M128|N2048|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 128,
      "m": 128,
      "n": 11008,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 5771362304.0,
      "median_us": 16.607999801635742,
      "min_us": 15.519999898970127,
      "max_us": 22.175999358296394,
      "stdev_us": 1.163591116997006,
      "samples": 100,
      "tflops": 347.5049598345715,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T128|M128|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 128,
      "m": 128,
      "n": 11008,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 5771362304.0,
      "median_us": 2795.0,
      "min_us": 2721.0,
      "max_us": 4389.0,
      "stdev_us": 718.9511805400975,
      "samples": 5,
      "tflops": 2.064888123076923,
      "shape_key": "Qwen2.5-3B|mlp_up_or_gate|T128|M128|N11008|K2048"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 11008,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 5771362304.0,
      "median_us": 41.18400067090988,
      "min_us": 38.94399851560593,
      "max_us": 45.152001082897186,
      "stdev_us": 0.8659924298941812,
      "samples": 100,
      "tflops": 140.13602879713855,
      "shape_key": "Qwen2.5-3B|mlp_down|T128|M128|N2048|K11008"
    },
    {
      "model": "Qwen2.5-3B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 11008,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 5771362304.0,
      "median_us": 2843.0,
      "min_us": 2771.0,
      "max_us": 5587.0,
      "stdev_us": 1223.666130936049,
      "samples": 5,
      "tflops": 2.030025432289835,
      "shape_key": "Qwen2.5-3B|mlp_down|T128|M128|N2048|K11008"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 3584,
      "k": 3584,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 25690112.0,
      "median_us": 18.4480007737875,
      "min_us": 17.664000391960144,
      "max_us": 23.552000522613525,
      "stdev_us": 1.3545283191216537,
      "samples": 100,
      "tflops": 1.392568892153491,
      "shape_key": "Qwen2.5-7B|attn_q|T1|M1|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 3584,
      "k": 3584,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 25690112.0,
      "median_us": 189.35,
      "min_us": 180.84,
      "max_us": 189.94,
      "stdev_us": 3.8011011562440675,
      "samples": 5,
      "tflops": 0.13567526802218113,
      "shape_key": "Qwen2.5-7B|attn_q|T1|M1|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 512,
      "k": 3584,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 3670016.0,
      "median_us": 17.95199979096651,
      "min_us": 16.256000846624374,
      "max_us": 23.04000034928322,
      "stdev_us": 1.3767542726771902,
      "samples": 100,
      "tflops": 0.20443493999185322,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T1|M1|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 512,
      "k": 3584,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 3670016.0,
      "median_us": 27.84,
      "min_us": 26.69,
      "max_us": 28.56,
      "stdev_us": 0.6954638739718971,
      "samples": 5,
      "tflops": 0.13182528735632185,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T1|M1|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 4608,
      "k": 3584,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33030144.0,
      "median_us": 20.8320003002882,
      "min_us": 19.936000928282738,
      "max_us": 25.087999179959297,
      "stdev_us": 0.650680621129092,
      "samples": 100,
      "tflops": 1.5855483642414812,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T1|M1|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 4608,
      "k": 3584,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33030144.0,
      "median_us": 240.37,
      "min_us": 232.05,
      "max_us": 252.01,
      "stdev_us": 7.647841525554767,
      "samples": 5,
      "tflops": 0.1374137537962308,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T1|M1|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 3584,
      "k": 3584,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 25690112.0,
      "median_us": 18.4480007737875,
      "min_us": 17.664000391960144,
      "max_us": 23.552000522613525,
      "stdev_us": 1.3545283191216537,
      "samples": 100,
      "tflops": 1.392568892153491,
      "shape_key": "Qwen2.5-7B|attn_o|T1|M1|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 3584,
      "k": 3584,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 25690112.0,
      "median_us": 189.35,
      "min_us": 180.84,
      "max_us": 189.94,
      "stdev_us": 3.8011011562440675,
      "samples": 5,
      "tflops": 0.13567526802218113,
      "shape_key": "Qwen2.5-7B|attn_o|T1|M1|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 1,
      "m": 1,
      "n": 18944,
      "k": 3584,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 135790592.0,
      "median_us": 40.608000010252,
      "min_us": 38.52799907326698,
      "max_us": 44.03200000524521,
      "stdev_us": 1.135723623863712,
      "samples": 100,
      "tflops": 3.343936957390612,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T1|M1|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 1,
      "m": 1,
      "n": 18944,
      "k": 3584,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 135790592.0,
      "median_us": 1003.46,
      "min_us": 978.82,
      "max_us": 1021.07,
      "stdev_us": 16.483058575397937,
      "samples": 5,
      "tflops": 0.13532237657704344,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T1|M1|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 1,
      "m": 1,
      "n": 3584,
      "k": 18944,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 135790592.0,
      "median_us": 72.03199714422226,
      "min_us": 69.37599927186966,
      "max_us": 75.16799867153168,
      "stdev_us": 1.2997886023850977,
      "samples": 100,
      "tflops": 1.8851426780257177,
      "shape_key": "Qwen2.5-7B|mlp_down|T1|M1|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 1,
      "m": 1,
      "n": 3584,
      "k": 18944,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 135790592.0,
      "median_us": 1002.54,
      "min_us": 989.86,
      "max_us": 1032.04,
      "stdev_us": 17.06217102246954,
      "samples": 5,
      "tflops": 0.13544655774333195,
      "shape_key": "Qwen2.5-7B|mlp_down|T1|M1|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 3584,
      "k": 3584,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 51380224.0,
      "median_us": 18.112000077962875,
      "min_us": 17.664000391960144,
      "max_us": 22.52800017595291,
      "stdev_us": 0.9525543493766607,
      "samples": 100,
      "tflops": 2.8368056414992533,
      "shape_key": "Qwen2.5-7B|attn_q|T2|M2|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 3584,
      "k": 3584,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 51380224.0,
      "median_us": 204.77,
      "min_us": 194.57,
      "max_us": 227.31,
      "stdev_us": 14.046546906624423,
      "samples": 5,
      "tflops": 0.2509167553840895,
      "shape_key": "Qwen2.5-7B|attn_q|T2|M2|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 512,
      "k": 3584,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 7340032.0,
      "median_us": 16.367999836802483,
      "min_us": 15.104000456631184,
      "max_us": 20.128000527620316,
      "stdev_us": 0.8031614601323377,
      "samples": 100,
      "tflops": 0.44843793213489475,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T2|M2|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 512,
      "k": 3584,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 7340032.0,
      "median_us": 29.69,
      "min_us": 28.73,
      "max_us": 31.01,
      "stdev_us": 0.8713093595273732,
      "samples": 5,
      "tflops": 0.24722236443246884,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T2|M2|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 4608,
      "k": 3584,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 66060288.0,
      "median_us": 20.8320003002882,
      "min_us": 18.432000651955605,
      "max_us": 24.159999564290047,
      "stdev_us": 0.7889319837620016,
      "samples": 100,
      "tflops": 3.1710967284829623,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T2|M2|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 4608,
      "k": 3584,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 66060288.0,
      "median_us": 261.02,
      "min_us": 251.26,
      "max_us": 284.5,
      "stdev_us": 13.04564180100006,
      "samples": 5,
      "tflops": 0.2530851582254234,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T2|M2|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 3584,
      "k": 3584,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 51380224.0,
      "median_us": 18.112000077962875,
      "min_us": 17.664000391960144,
      "max_us": 22.52800017595291,
      "stdev_us": 0.9525543493766607,
      "samples": 100,
      "tflops": 2.8368056414992533,
      "shape_key": "Qwen2.5-7B|attn_o|T2|M2|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 3584,
      "k": 3584,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 51380224.0,
      "median_us": 204.77,
      "min_us": 194.57,
      "max_us": 227.31,
      "stdev_us": 14.046546906624423,
      "samples": 5,
      "tflops": 0.2509167553840895,
      "shape_key": "Qwen2.5-7B|attn_o|T2|M2|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 2,
      "m": 2,
      "n": 18944,
      "k": 3584,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 271581184.0,
      "median_us": 40.608000010252,
      "min_us": 38.52799907326698,
      "max_us": 44.704001396894455,
      "stdev_us": 0.7790195501032307,
      "samples": 100,
      "tflops": 6.687873914781224,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T2|M2|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 2,
      "m": 2,
      "n": 18944,
      "k": 3584,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 271581184.0,
      "median_us": 1073.86,
      "min_us": 1035.5,
      "max_us": 1125.64,
      "stdev_us": 35.79580589957433,
      "samples": 5,
      "tflops": 0.25290185312796826,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T2|M2|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 2,
      "m": 2,
      "n": 3584,
      "k": 18944,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 271581184.0,
      "median_us": 71.6640017926693,
      "min_us": 69.21599805355072,
      "max_us": 77.15199887752533,
      "stdev_us": 1.4193292602762364,
      "samples": 100,
      "tflops": 3.7896458083056808,
      "shape_key": "Qwen2.5-7B|mlp_down|T2|M2|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 2,
      "m": 2,
      "n": 3584,
      "k": 18944,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 271581184.0,
      "median_us": 1245.07,
      "min_us": 1156.29,
      "max_us": 1329.57,
      "stdev_us": 65.08636554609573,
      "samples": 5,
      "tflops": 0.218125233119423,
      "shape_key": "Qwen2.5-7B|mlp_down|T2|M2|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 3584,
      "k": 3584,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 102760448.0,
      "median_us": 18.432000651955605,
      "min_us": 17.72800087928772,
      "max_us": 25.248000398278236,
      "stdev_us": 1.3404322113833014,
      "samples": 100,
      "tflops": 5.57511091391467,
      "shape_key": "Qwen2.5-7B|attn_q|T4|M4|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 3584,
      "k": 3584,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 102760448.0,
      "median_us": 237.16,
      "min_us": 233.7,
      "max_us": 274.14,
      "stdev_us": 17.159142752480378,
      "samples": 5,
      "tflops": 0.433295867768595,
      "shape_key": "Qwen2.5-7B|attn_q|T4|M4|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 512,
      "k": 3584,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 14680064.0,
      "median_us": 16.575999557971954,
      "min_us": 15.968000516295433,
      "max_us": 21.856000646948814,
      "stdev_us": 0.8006410641084205,
      "samples": 100,
      "tflops": 0.8856216452382725,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T4|M4|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 512,
      "k": 3584,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 14680064.0,
      "median_us": 35.01,
      "min_us": 34.5,
      "max_us": 35.1,
      "stdev_us": 0.23786550821840524,
      "samples": 5,
      "tflops": 0.4193105969722936,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T4|M4|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 4608,
      "k": 3584,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 132120576.0,
      "median_us": 20.479999482631683,
      "min_us": 19.74399946630001,
      "max_us": 24.447999894618988,
      "stdev_us": 0.8207061405363332,
      "samples": 100,
      "tflops": 6.451200162971024,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T4|M4|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 4608,
      "k": 3584,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 132120576.0,
      "median_us": 306.03,
      "min_us": 300.55,
      "max_us": 376.76,
      "stdev_us": 32.021986665414744,
      "samples": 5,
      "tflops": 0.43172426232722283,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T4|M4|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 3584,
      "k": 3584,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 102760448.0,
      "median_us": 18.432000651955605,
      "min_us": 17.72800087928772,
      "max_us": 25.248000398278236,
      "stdev_us": 1.3404322113833014,
      "samples": 100,
      "tflops": 5.57511091391467,
      "shape_key": "Qwen2.5-7B|attn_o|T4|M4|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 3584,
      "k": 3584,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 102760448.0,
      "median_us": 237.16,
      "min_us": 233.7,
      "max_us": 274.14,
      "stdev_us": 17.159142752480378,
      "samples": 5,
      "tflops": 0.433295867768595,
      "shape_key": "Qwen2.5-7B|attn_o|T4|M4|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 4,
      "m": 4,
      "n": 18944,
      "k": 3584,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 543162368.0,
      "median_us": 40.95999896526337,
      "min_us": 38.88000175356865,
      "max_us": 46.46399989724159,
      "stdev_us": 1.1308767962309065,
      "samples": 100,
      "tflops": 13.260800334995993,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T4|M4|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 4,
      "m": 4,
      "n": 18944,
      "k": 3584,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 543162368.0,
      "median_us": 1256.86,
      "min_us": 1237.86,
      "max_us": 1282.14,
      "stdev_us": 15.847117403490206,
      "samples": 5,
      "tflops": 0.43215821014273664,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T4|M4|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 4,
      "m": 4,
      "n": 3584,
      "k": 18944,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 543162368.0,
      "median_us": 71.68000191450119,
      "min_us": 69.47200000286102,
      "max_us": 76.76800340414047,
      "stdev_us": 1.5158295964732473,
      "samples": 100,
      "tflops": 7.57759979760988,
      "shape_key": "Qwen2.5-7B|mlp_down|T4|M4|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 4,
      "m": 4,
      "n": 3584,
      "k": 18944,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 543162368.0,
      "median_us": 1418.14,
      "min_us": 1371.71,
      "max_us": 1713.71,
      "stdev_us": 139.84949170447493,
      "samples": 5,
      "tflops": 0.3830103995374223,
      "shape_key": "Qwen2.5-7B|mlp_down|T4|M4|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 3584,
      "k": 3584,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 205520896.0,
      "median_us": 18.783999606966972,
      "min_us": 18.079999834299088,
      "max_us": 22.87999913096428,
      "stdev_us": 0.9720784992217115,
      "samples": 100,
      "tflops": 10.941274504912812,
      "shape_key": "Qwen2.5-7B|attn_q|T8|M8|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 3584,
      "k": 3584,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 205520896.0,
      "median_us": 367.53,
      "min_us": 351.16,
      "max_us": 460.89,
      "stdev_us": 45.2417089641848,
      "samples": 5,
      "tflops": 0.5591948847713112,
      "shape_key": "Qwen2.5-7B|attn_q|T8|M8|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 512,
      "k": 3584,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 29360128.0,
      "median_us": 16.527999192476273,
      "min_us": 16.00000075995922,
      "max_us": 21.856000646948814,
      "stdev_us": 0.8165135392169827,
      "samples": 100,
      "tflops": 1.7763873084750061,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T8|M8|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 512,
      "k": 3584,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 29360128.0,
      "median_us": 53.04,
      "min_us": 52.1,
      "max_us": 59.04,
      "stdev_us": 2.832970172804507,
      "samples": 5,
      "tflops": 0.5535469079939668,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T8|M8|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 4608,
      "k": 3584,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 264241152.0,
      "median_us": 20.78399993479252,
      "min_us": 20.22399939596653,
      "max_us": 24.927999824285507,
      "stdev_us": 0.7846413330988009,
      "samples": 100,
      "tflops": 12.713681333190298,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T8|M8|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 4608,
      "k": 3584,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 264241152.0,
      "median_us": 465.4,
      "min_us": 458.07,
      "max_us": 611.6,
      "stdev_us": 70.69925791689755,
      "samples": 5,
      "tflops": 0.5677721357971637,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T8|M8|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 3584,
      "k": 3584,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 205520896.0,
      "median_us": 18.783999606966972,
      "min_us": 18.079999834299088,
      "max_us": 22.87999913096428,
      "stdev_us": 0.9720784992217115,
      "samples": 100,
      "tflops": 10.941274504912812,
      "shape_key": "Qwen2.5-7B|attn_o|T8|M8|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 3584,
      "k": 3584,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 205520896.0,
      "median_us": 367.53,
      "min_us": 351.16,
      "max_us": 460.89,
      "stdev_us": 45.2417089641848,
      "samples": 5,
      "tflops": 0.5591948847713112,
      "shape_key": "Qwen2.5-7B|attn_o|T8|M8|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 8,
      "m": 8,
      "n": 18944,
      "k": 3584,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1086324736.0,
      "median_us": 38.91199827194214,
      "min_us": 38.14399987459183,
      "max_us": 43.55200007557869,
      "stdev_us": 0.9707259989323049,
      "samples": 100,
      "tflops": 27.917474924008328,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T8|M8|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 8,
      "m": 8,
      "n": 18944,
      "k": 3584,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1086324736.0,
      "median_us": 2105.5,
      "min_us": 2051.5,
      "max_us": 2113.25,
      "stdev_us": 26.866219495865064,
      "samples": 5,
      "tflops": 0.5159462056518642,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T8|M8|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 8,
      "m": 8,
      "n": 3584,
      "k": 18944,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1086324736.0,
      "median_us": 71.68000191450119,
      "min_us": 69.34399902820587,
      "max_us": 75.80800354480743,
      "stdev_us": 1.4238188604439599,
      "samples": 100,
      "tflops": 15.15519959521976,
      "shape_key": "Qwen2.5-7B|mlp_down|T8|M8|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 8,
      "m": 8,
      "n": 3584,
      "k": 18944,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1086324736.0,
      "median_us": 2087.0,
      "min_us": 2063.0,
      "max_us": 2716.5,
      "stdev_us": 280.13947419098224,
      "samples": 5,
      "tflops": 0.5205197585050312,
      "shape_key": "Qwen2.5-7B|mlp_down|T8|M8|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 3584,
      "k": 3584,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 411041792.0,
      "median_us": 18.432000651955605,
      "min_us": 17.311999574303627,
      "max_us": 22.463999688625336,
      "stdev_us": 1.0126704284535468,
      "samples": 100,
      "tflops": 22.30044365565868,
      "shape_key": "Qwen2.5-7B|attn_q|T16|M16|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 3584,
      "k": 3584,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 411041792.0,
      "median_us": 715.9,
      "min_us": 707.6,
      "max_us": 839.1,
      "stdev_us": 56.345470092989736,
      "samples": 5,
      "tflops": 0.5741609051543511,
      "shape_key": "Qwen2.5-7B|attn_q|T16|M16|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 512,
      "k": 3584,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 58720256.0,
      "median_us": 16.55999943614006,
      "min_us": 16.03199914097786,
      "max_us": 21.856000646948814,
      "stdev_us": 0.8321772380722229,
      "samples": 100,
      "tflops": 3.545909299480447,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T16|M16|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 512,
      "k": 3584,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 58720256.0,
      "median_us": 105.23,
      "min_us": 104.54,
      "max_us": 110.55,
      "stdev_us": 2.494594155368763,
      "samples": 5,
      "tflops": 0.5580182077354366,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T16|M16|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 4608,
      "k": 3584,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 528482304.0,
      "median_us": 20.479999482631683,
      "min_us": 19.807999953627586,
      "max_us": 25.21600015461445,
      "stdev_us": 1.054799985045161,
      "samples": 100,
      "tflops": 25.804800651884097,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T16|M16|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 4608,
      "k": 3584,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 528482304.0,
      "median_us": 924.5,
      "min_us": 914.88,
      "max_us": 933.0,
      "stdev_us": 7.271112707144622,
      "samples": 5,
      "tflops": 0.5716412157923202,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T16|M16|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 3584,
      "k": 3584,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 411041792.0,
      "median_us": 18.432000651955605,
      "min_us": 17.311999574303627,
      "max_us": 22.463999688625336,
      "stdev_us": 1.0126704284535468,
      "samples": 100,
      "tflops": 22.30044365565868,
      "shape_key": "Qwen2.5-7B|attn_o|T16|M16|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 3584,
      "k": 3584,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 411041792.0,
      "median_us": 715.9,
      "min_us": 707.6,
      "max_us": 839.1,
      "stdev_us": 56.345470092989736,
      "samples": 5,
      "tflops": 0.5741609051543511,
      "shape_key": "Qwen2.5-7B|attn_o|T16|M16|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 16,
      "m": 16,
      "n": 18944,
      "k": 3584,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 2172649472.0,
      "median_us": 37.31200098991394,
      "min_us": 36.83200106024742,
      "max_us": 44.38399896025658,
      "stdev_us": 1.3457841064044422,
      "samples": 100,
      "tflops": 58.22924030762391,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T16|M16|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 16,
      "m": 16,
      "n": 18944,
      "k": 3584,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 2172649472.0,
      "median_us": 3979.0,
      "min_us": 3927.0,
      "max_us": 4019.5,
      "stdev_us": 37.84276681216637,
      "samples": 5,
      "tflops": 0.5460290203568736,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T16|M16|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 16,
      "m": 16,
      "n": 3584,
      "k": 18944,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 2172649472.0,
      "median_us": 71.68000191450119,
      "min_us": 69.21599805355072,
      "max_us": 75.9039968252182,
      "stdev_us": 1.2907096636244615,
      "samples": 100,
      "tflops": 30.31039919043952,
      "shape_key": "Qwen2.5-7B|mlp_down|T16|M16|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 16,
      "m": 16,
      "n": 3584,
      "k": 18944,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 2172649472.0,
      "median_us": 3956.0,
      "min_us": 3936.0,
      "max_us": 4732.0,
      "stdev_us": 349.2517716490498,
      "samples": 5,
      "tflops": 0.5492036076845298,
      "shape_key": "Qwen2.5-7B|mlp_down|T16|M16|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 3584,
      "k": 3584,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 822083584.0,
      "median_us": 18.432000651955605,
      "min_us": 17.055999487638474,
      "max_us": 24.480000138282776,
      "stdev_us": 1.2296243492674945,
      "samples": 100,
      "tflops": 44.60088731131736,
      "shape_key": "Qwen2.5-7B|attn_q|T32|M32|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 3584,
      "k": 3584,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 822083584.0,
      "median_us": 553.4,
      "min_us": 475.4,
      "max_us": 1012.2,
      "stdev_us": 247.10473892663413,
      "samples": 5,
      "tflops": 1.4855142464763282,
      "shape_key": "Qwen2.5-7B|attn_q|T32|M32|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 512,
      "k": 3584,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 117440512.0,
      "median_us": 16.623999923467636,
      "min_us": 16.00000075995922,
      "max_us": 21.856000646948814,
      "stdev_us": 0.9537444726801848,
      "samples": 100,
      "tflops": 7.064515913177581,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T32|M32|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 512,
      "k": 3584,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 117440512.0,
      "median_us": 71.27,
      "min_us": 69.52,
      "max_us": 90.82,
      "stdev_us": 8.851404973223175,
      "samples": 5,
      "tflops": 1.6478253402553669,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T32|M32|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 4608,
      "k": 3584,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1056964608.0,
      "median_us": 20.46399936079979,
      "min_us": 19.711999222636223,
      "max_us": 25.312000885605812,
      "stdev_us": 1.0550595247627728,
      "samples": 100,
      "tflops": 51.649953137933004,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T32|M32|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 4608,
      "k": 3584,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1056964608.0,
      "median_us": 739.5,
      "min_us": 635.75,
      "max_us": 1017.5,
      "stdev_us": 163.37814648844562,
      "samples": 5,
      "tflops": 1.4292962920892496,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T32|M32|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 3584,
      "k": 3584,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 822083584.0,
      "median_us": 18.432000651955605,
      "min_us": 17.055999487638474,
      "max_us": 24.480000138282776,
      "stdev_us": 1.2296243492674945,
      "samples": 100,
      "tflops": 44.60088731131736,
      "shape_key": "Qwen2.5-7B|attn_o|T32|M32|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 3584,
      "k": 3584,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 822083584.0,
      "median_us": 553.4,
      "min_us": 475.4,
      "max_us": 1012.2,
      "stdev_us": 247.10473892663413,
      "samples": 5,
      "tflops": 1.4855142464763282,
      "shape_key": "Qwen2.5-7B|attn_o|T32|M32|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 32,
      "m": 32,
      "n": 18944,
      "k": 3584,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4345298944.0,
      "median_us": 34.81600061058998,
      "min_us": 34.07999873161316,
      "max_us": 39.93599861860275,
      "stdev_us": 1.1093021886096972,
      "samples": 100,
      "tflops": 124.807527222937,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T32|M32|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 32,
      "m": 32,
      "n": 18944,
      "k": 3584,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4345298944.0,
      "median_us": 2939.0,
      "min_us": 2846.0,
      "max_us": 4803.0,
      "stdev_us": 892.6403531098065,
      "samples": 5,
      "tflops": 1.4784957277985709,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T32|M32|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 32,
      "m": 32,
      "n": 3584,
      "k": 18944,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4345298944.0,
      "median_us": 71.53600081801414,
      "min_us": 69.15199756622314,
      "max_us": 74.75200295448303,
      "stdev_us": 1.166658142348499,
      "samples": 100,
      "tflops": 60.742827308089744,
      "shape_key": "Qwen2.5-7B|mlp_down|T32|M32|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 32,
      "m": 32,
      "n": 3584,
      "k": 18944,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4345298944.0,
      "median_us": 2798.0,
      "min_us": 2605.0,
      "max_us": 3719.0,
      "stdev_us": 451.4808966058254,
      "samples": 5,
      "tflops": 1.5530017669764118,
      "shape_key": "Qwen2.5-7B|mlp_down|T32|M32|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 3584,
      "k": 3584,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 3288334336.0,
      "median_us": 18.73599924147129,
      "min_us": 18.144000321626663,
      "max_us": 24.960000067949295,
      "stdev_us": 1.0288367084193561,
      "samples": 100,
      "tflops": 175.50888498764562,
      "shape_key": "Qwen2.5-7B|attn_q|T128|M128|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 3584,
      "k": 3584,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 3288334336.0,
      "median_us": 1599.5,
      "min_us": 1522.5,
      "max_us": 2107.0,
      "stdev_us": 239.5156571082567,
      "samples": 5,
      "tflops": 2.0558514135667396,
      "shape_key": "Qwen2.5-7B|attn_q|T128|M128|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 512,
      "k": 3584,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 469762048.0,
      "median_us": 16.383999958634377,
      "min_us": 15.647999942302704,
      "max_us": 20.51199972629547,
      "stdev_us": 0.9889380239248593,
      "samples": 100,
      "tflops": 28.672000072389842,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T128|M128|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 512,
      "k": 3584,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 469762048.0,
      "median_us": 234.78,
      "min_us": 233.22,
      "max_us": 289.67,
      "stdev_us": 24.482061187735,
      "samples": 5,
      "tflops": 2.0008605843768636,
      "shape_key": "Qwen2.5-7B|attn_kv_each|T128|M128|N512|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 4608,
      "k": 3584,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4227858432.0,
      "median_us": 20.479999482631683,
      "min_us": 19.071999937295914,
      "max_us": 25.599999353289604,
      "stdev_us": 1.0883369580866638,
      "samples": 100,
      "tflops": 206.43840521507278,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T128|M128|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 4608,
      "k": 3584,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4227858432.0,
      "median_us": 2250.0,
      "min_us": 1960.0,
      "max_us": 4561.0,
      "stdev_us": 1247.0197271895902,
      "samples": 5,
      "tflops": 1.879048192,
      "shape_key": "Qwen2.5-7B|attn_qkv_fused|T128|M128|N4608|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 3584,
      "k": 3584,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 3288334336.0,
      "median_us": 18.73599924147129,
      "min_us": 18.144000321626663,
      "max_us": 24.960000067949295,
      "stdev_us": 1.0288367084193561,
      "samples": 100,
      "tflops": 175.50888498764562,
      "shape_key": "Qwen2.5-7B|attn_o|T128|M128|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 3584,
      "k": 3584,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 3288334336.0,
      "median_us": 1599.5,
      "min_us": 1522.5,
      "max_us": 2107.0,
      "stdev_us": 239.5156571082567,
      "samples": 5,
      "tflops": 2.0558514135667396,
      "shape_key": "Qwen2.5-7B|attn_o|T128|M128|N3584|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 128,
      "m": 128,
      "n": 18944,
      "k": 3584,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 17381195776.0,
      "median_us": 33.11999887228012,
      "min_us": 30.688000842928886,
      "max_us": 39.264000952243805,
      "stdev_us": 1.203146831519859,
      "samples": 100,
      "tflops": 524.7945763231062,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T128|M128|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 128,
      "m": 128,
      "n": 18944,
      "k": 3584,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 17381195776.0,
      "median_us": 8587.0,
      "min_us": 8527.0,
      "max_us": 8825.0,
      "stdev_us": 115.17812292271479,
      "samples": 5,
      "tflops": 2.0241290061721204,
      "shape_key": "Qwen2.5-7B|mlp_up_or_gate|T128|M128|N18944|K3584"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 128,
      "m": 128,
      "n": 3584,
      "k": 18944,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 17381195776.0,
      "median_us": 72.03199714422226,
      "min_us": 69.63200122117996,
      "max_us": 75.99999755620956,
      "stdev_us": 1.1385417621281382,
      "samples": 100,
      "tflops": 241.29826278729186,
      "shape_key": "Qwen2.5-7B|mlp_down|T128|M128|N3584|K18944"
    },
    {
      "model": "Qwen2.5-7B",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 128,
      "m": 128,
      "n": 3584,
      "k": 18944,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 17381195776.0,
      "median_us": 8530.0,
      "min_us": 8488.0,
      "max_us": 9459.0,
      "stdev_us": 413.4640250372455,
      "samples": 5,
      "tflops": 2.0376548389214535,
      "shape_key": "Qwen2.5-7B|mlp_down|T128|M128|N3584|K18944"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 4096,
      "k": 4096,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 20.128000527620316,
      "min_us": 19.74399946630001,
      "max_us": 26.079999282956123,
      "stdev_us": 1.185291033557048,
      "samples": 100,
      "tflops": 1.6670524205300714,
      "shape_key": "Mistral-7B-v0.3|attn_q|T1|M1|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 4096,
      "k": 4096,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 242.99,
      "min_us": 235.74,
      "max_us": 247.07,
      "stdev_us": 5.162331837454849,
      "samples": 5,
      "tflops": 0.1380897650109058,
      "shape_key": "Mistral-7B-v0.3|attn_q|T1|M1|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 1024,
      "k": 4096,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 18.783999606966972,
      "min_us": 18.0479995906353,
      "max_us": 23.00800010561943,
      "stdev_us": 1.1193889794209664,
      "samples": 100,
      "tflops": 0.4465826328535841,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T1|M1|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 1024,
      "k": 4096,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 60.69,
      "min_us": 59.86,
      "max_us": 67.09,
      "stdev_us": 2.9872445497481475,
      "samples": 5,
      "tflops": 0.13822059647388368,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T1|M1|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 6144,
      "k": 4096,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 50331648.0,
      "median_us": 24.57600086927414,
      "min_us": 22.39999920129776,
      "max_us": 28.76799926161766,
      "stdev_us": 1.3423942621081517,
      "samples": 100,
      "tflops": 2.047999927560491,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T1|M1|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 6144,
      "k": 4096,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 50331648.0,
      "median_us": 362.84,
      "min_us": 357.59,
      "max_us": 370.14,
      "stdev_us": 4.941069722236272,
      "samples": 5,
      "tflops": 0.1387158196450226,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T1|M1|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 4096,
      "k": 4096,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 20.128000527620316,
      "min_us": 19.74399946630001,
      "max_us": 26.079999282956123,
      "stdev_us": 1.185291033557048,
      "samples": 100,
      "tflops": 1.6670524205300714,
      "shape_key": "Mistral-7B-v0.3|attn_o|T1|M1|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 4096,
      "k": 4096,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 242.99,
      "min_us": 235.74,
      "max_us": 247.07,
      "stdev_us": 5.162331837454849,
      "samples": 5,
      "tflops": 0.1380897650109058,
      "shape_key": "Mistral-7B-v0.3|attn_o|T1|M1|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 1,
      "m": 1,
      "n": 14336,
      "k": 4096,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 117440512.0,
      "median_us": 35.16799956560135,
      "min_us": 34.432001411914825,
      "max_us": 39.51999917626381,
      "stdev_us": 0.9782167602516593,
      "samples": 100,
      "tflops": 3.3394140539876296,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T1|M1|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 1,
      "m": 1,
      "n": 14336,
      "k": 4096,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 117440512.0,
      "median_us": 859.85,
      "min_us": 833.82,
      "max_us": 893.39,
      "stdev_us": 23.678060097904968,
      "samples": 5,
      "tflops": 0.1365825574228063,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T1|M1|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 1,
      "m": 1,
      "n": 4096,
      "k": 14336,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 117440512.0,
      "median_us": 59.039998799562454,
      "min_us": 57.34400078654289,
      "max_us": 63.07200342416763,
      "stdev_us": 1.0755498701718158,
      "samples": 100,
      "tflops": 1.9891686041306347,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T1|M1|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 1,
      "m": 1,
      "n": 4096,
      "k": 14336,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 117440512.0,
      "median_us": 838.685,
      "min_us": 829.39,
      "max_us": 914.88,
      "stdev_us": 25.406180897131655,
      "samples": 5,
      "tflops": 0.14002934594037095,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T1|M1|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 4096,
      "k": 4096,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 20.479999482631683,
      "min_us": 19.680000841617584,
      "max_us": 25.66399984061718,
      "stdev_us": 1.2279764449764434,
      "samples": 100,
      "tflops": 3.2768000827789328,
      "shape_key": "Mistral-7B-v0.3|attn_q|T2|M2|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 4096,
      "k": 4096,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 256.75,
      "min_us": 255.25,
      "max_us": 287.93,
      "stdev_us": 13.821048078926584,
      "samples": 5,
      "tflops": 0.26137824342745863,
      "shape_key": "Mistral-7B-v0.3|attn_q|T2|M2|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 1024,
      "k": 4096,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 18.672000616788864,
      "min_us": 17.311999574303627,
      "max_us": 22.87999913096428,
      "stdev_us": 1.0114467832305774,
      "samples": 100,
      "tflops": 0.8985226781170318,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T2|M2|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 1024,
      "k": 4096,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 66.05,
      "min_us": 64.77,
      "max_us": 69.29,
      "stdev_us": 2.027905323233805,
      "samples": 5,
      "tflops": 0.25400781226343677,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T2|M2|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 6144,
      "k": 4096,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 100663296.0,
      "median_us": 24.57600086927414,
      "min_us": 22.23999984562397,
      "max_us": 29.023999348282814,
      "stdev_us": 1.3533748055194843,
      "samples": 100,
      "tflops": 4.095999855120982,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T2|M2|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 6144,
      "k": 4096,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 100663296.0,
      "median_us": 399.42,
      "min_us": 378.79,
      "max_us": 424.97,
      "stdev_us": 18.119146779029087,
      "samples": 5,
      "tflops": 0.2520236743277753,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T2|M2|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 4096,
      "k": 4096,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 20.479999482631683,
      "min_us": 19.680000841617584,
      "max_us": 25.66399984061718,
      "stdev_us": 1.2279764449764434,
      "samples": 100,
      "tflops": 3.2768000827789328,
      "shape_key": "Mistral-7B-v0.3|attn_o|T2|M2|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 4096,
      "k": 4096,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 256.75,
      "min_us": 255.25,
      "max_us": 287.93,
      "stdev_us": 13.821048078926584,
      "samples": 5,
      "tflops": 0.26137824342745863,
      "shape_key": "Mistral-7B-v0.3|attn_o|T2|M2|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 2,
      "m": 2,
      "n": 14336,
      "k": 4096,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 234881024.0,
      "median_us": 35.16799956560135,
      "min_us": 34.46400165557861,
      "max_us": 40.28800129890442,
      "stdev_us": 1.1676139210640535,
      "samples": 100,
      "tflops": 6.678828107975259,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T2|M2|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 2,
      "m": 2,
      "n": 14336,
      "k": 4096,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 234881024.0,
      "median_us": 916.12,
      "min_us": 887.0,
      "max_us": 993.18,
      "stdev_us": 39.62686525578321,
      "samples": 5,
      "tflops": 0.2563867440946601,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T2|M2|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 2,
      "m": 2,
      "n": 4096,
      "k": 14336,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 234881024.0,
      "median_us": 59.039998799562454,
      "min_us": 56.992001831531525,
      "max_us": 63.10400366783142,
      "stdev_us": 1.0851127407327654,
      "samples": 100,
      "tflops": 3.9783372082612694,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T2|M2|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 2,
      "m": 2,
      "n": 4096,
      "k": 14336,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 234881024.0,
      "median_us": 962.175,
      "min_us": 946.71,
      "max_us": 1022.88,
      "stdev_us": 26.42942207296843,
      "samples": 5,
      "tflops": 0.244114661054382,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T2|M2|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 4096,
      "k": 4096,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 20.479999482631683,
      "min_us": 19.7759997099638,
      "max_us": 24.57600086927414,
      "stdev_us": 1.139428203122413,
      "samples": 100,
      "tflops": 6.5536001655578655,
      "shape_key": "Mistral-7B-v0.3|attn_q|T4|M4|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 4096,
      "k": 4096,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 307.24,
      "min_us": 303.62,
      "max_us": 360.38,
      "stdev_us": 24.025942645398946,
      "samples": 5,
      "tflops": 0.4368497851842208,
      "shape_key": "Mistral-7B-v0.3|attn_q|T4|M4|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 1024,
      "k": 4096,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 18.079999834299088,
      "min_us": 17.632000148296356,
      "max_us": 23.16799946129322,
      "stdev_us": 0.796772571576308,
      "samples": 100,
      "tflops": 1.855886742672684,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T4|M4|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 1024,
      "k": 4096,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 77.95,
      "min_us": 77.1,
      "max_us": 82.03,
      "stdev_us": 2.0320014763774186,
      "samples": 5,
      "tflops": 0.4304609621552277,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T4|M4|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 6144,
      "k": 4096,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 201326592.0,
      "median_us": 24.224000051617622,
      "min_us": 21.95199951529503,
      "max_us": 27.135999873280525,
      "stdev_us": 1.0147936584384436,
      "samples": 100,
      "tflops": 8.311038291405382,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T4|M4|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 6144,
      "k": 4096,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 201326592.0,
      "median_us": 461.68,
      "min_us": 455.53,
      "max_us": 535.11,
      "stdev_us": 33.005599979397445,
      "samples": 5,
      "tflops": 0.43607388667475305,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T4|M4|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 4096,
      "k": 4096,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 20.479999482631683,
      "min_us": 19.7759997099638,
      "max_us": 24.57600086927414,
      "stdev_us": 1.139428203122413,
      "samples": 100,
      "tflops": 6.5536001655578655,
      "shape_key": "Mistral-7B-v0.3|attn_o|T4|M4|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 4096,
      "k": 4096,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 307.24,
      "min_us": 303.62,
      "max_us": 360.38,
      "stdev_us": 24.025942645398946,
      "samples": 5,
      "tflops": 0.4368497851842208,
      "shape_key": "Mistral-7B-v0.3|attn_o|T4|M4|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 4,
      "m": 4,
      "n": 14336,
      "k": 4096,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 469762048.0,
      "median_us": 32.76799991726875,
      "min_us": 32.15999901294708,
      "max_us": 37.88800165057182,
      "stdev_us": 1.047660482316966,
      "samples": 100,
      "tflops": 14.336000036194921,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T4|M4|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 4,
      "m": 4,
      "n": 14336,
      "k": 4096,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 469762048.0,
      "median_us": 1097.22,
      "min_us": 1073.22,
      "max_us": 1418.78,
      "stdev_us": 148.35054745433195,
      "samples": 5,
      "tflops": 0.4281384298499845,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T4|M4|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 4,
      "m": 4,
      "n": 4096,
      "k": 14336,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 469762048.0,
      "median_us": 59.74400043487549,
      "min_us": 57.34400078654289,
      "max_us": 63.840001821517944,
      "stdev_us": 1.1143275654982223,
      "samples": 100,
      "tflops": 7.862915850639573,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T4|M4|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 4,
      "m": 4,
      "n": 4096,
      "k": 14336,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 469762048.0,
      "median_us": 1179.165,
      "min_us": 1154.0,
      "max_us": 1263.78,
      "stdev_us": 32.93115100731626,
      "samples": 5,
      "tflops": 0.398385338777864,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T4|M4|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 4096,
      "k": 4096,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 268435456.0,
      "median_us": 20.479999482631683,
      "min_us": 19.74399946630001,
      "max_us": 24.224000051617622,
      "stdev_us": 1.1108639657931227,
      "samples": 100,
      "tflops": 13.107200331115731,
      "shape_key": "Mistral-7B-v0.3|attn_q|T8|M8|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 4096,
      "k": 4096,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 268435456.0,
      "median_us": 477.53,
      "min_us": 464.53,
      "max_us": 727.33,
      "stdev_us": 112.92563358245995,
      "samples": 5,
      "tflops": 0.5621331769731744,
      "shape_key": "Mistral-7B-v0.3|attn_q|T8|M8|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 1024,
      "k": 4096,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 18.400000408291817,
      "min_us": 16.992000862956047,
      "max_us": 21.983999758958817,
      "stdev_us": 0.6558158873962807,
      "samples": 100,
      "tflops": 3.6472207886342174,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T8|M8|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 1024,
      "k": 4096,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 118.28,
      "min_us": 116.95,
      "max_us": 230.33,
      "stdev_us": 50.232890918202195,
      "samples": 5,
      "tflops": 0.5673728779168076,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T8|M8|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 6144,
      "k": 4096,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 402653184.0,
      "median_us": 23.96799996495247,
      "min_us": 22.23999984562397,
      "max_us": 28.64000014960766,
      "stdev_us": 1.319900985860222,
      "samples": 100,
      "tflops": 16.799615511881886,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T8|M8|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 6144,
      "k": 4096,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 402653184.0,
      "median_us": 714.1,
      "min_us": 704.5,
      "max_us": 816.9,
      "stdev_us": 46.78662201954742,
      "samples": 5,
      "tflops": 0.5638610614759838,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T8|M8|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 4096,
      "k": 4096,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 268435456.0,
      "median_us": 20.479999482631683,
      "min_us": 19.74399946630001,
      "max_us": 24.224000051617622,
      "stdev_us": 1.1108639657931227,
      "samples": 100,
      "tflops": 13.107200331115731,
      "shape_key": "Mistral-7B-v0.3|attn_o|T8|M8|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 4096,
      "k": 4096,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 268435456.0,
      "median_us": 477.53,
      "min_us": 464.53,
      "max_us": 727.33,
      "stdev_us": 112.92563358245995,
      "samples": 5,
      "tflops": 0.5621331769731744,
      "shape_key": "Mistral-7B-v0.3|attn_o|T8|M8|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 8,
      "m": 8,
      "n": 14336,
      "k": 4096,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 939524096.0,
      "median_us": 33.11999887228012,
      "min_us": 32.35200047492981,
      "max_us": 40.672000497579575,
      "stdev_us": 1.2816959436477533,
      "samples": 100,
      "tflops": 28.367274395843577,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T8|M8|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 8,
      "m": 8,
      "n": 14336,
      "k": 4096,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 939524096.0,
      "median_us": 1800.2,
      "min_us": 1758.2,
      "max_us": 2075.0,
      "stdev_us": 132.13768576753566,
      "samples": 5,
      "tflops": 0.5218998422397512,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T8|M8|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 8,
      "m": 8,
      "n": 4096,
      "k": 14336,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 939524096.0,
      "median_us": 59.39200147986412,
      "min_us": 57.34400078654289,
      "max_us": 66.91200286149979,
      "stdev_us": 1.4038155932622962,
      "samples": 100,
      "tflops": 15.819034088597437,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T8|M8|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 8,
      "m": 8,
      "n": 4096,
      "k": 14336,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 939524096.0,
      "median_us": 1800.8,
      "min_us": 1695.0,
      "max_us": 2429.2,
      "stdev_us": 252.11165515831797,
      "samples": 5,
      "tflops": 0.5217259529098178,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T8|M8|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 4096,
      "k": 4096,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 536870912.0,
      "median_us": 22.175999358296394,
      "min_us": 19.840000197291374,
      "max_us": 114146.53015136719,
      "stdev_us": 11412.477773901752,
      "samples": 100,
      "tflops": 24.209547598094968,
      "shape_key": "Mistral-7B-v0.3|attn_q|T16|M16|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 4096,
      "k": 4096,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 536870912.0,
      "median_us": 936.12,
      "min_us": 915.62,
      "max_us": 1227.25,
      "stdev_us": 132.98098766365064,
      "samples": 5,
      "tflops": 0.5735065077126864,
      "shape_key": "Mistral-7B-v0.3|attn_q|T16|M16|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 1024,
      "k": 4096,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 18.432000651955605,
      "min_us": 17.023999243974686,
      "max_us": 23.552000522613525,
      "stdev_us": 0.9410557284645423,
      "samples": 100,
      "tflops": 7.281777520215079,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T16|M16|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 1024,
      "k": 4096,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 235.34,
      "min_us": 233.31,
      "max_us": 279.28,
      "stdev_us": 19.864640193066663,
      "samples": 5,
      "tflops": 0.5703141327441149,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T16|M16|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 6144,
      "k": 4096,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 805306368.0,
      "median_us": 24.480000138282776,
      "min_us": 22.495999932289124,
      "max_us": 27.648000046610832,
      "stdev_us": 0.7987028381062824,
      "samples": 100,
      "tflops": 32.89650177495835,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T16|M16|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 6144,
      "k": 4096,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 805306368.0,
      "median_us": 1394.6,
      "min_us": 1379.0,
      "max_us": 1808.4,
      "stdev_us": 185.53023473277884,
      "samples": 5,
      "tflops": 0.5774461264878819,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T16|M16|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 4096,
      "k": 4096,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 536870912.0,
      "median_us": 22.175999358296394,
      "min_us": 19.840000197291374,
      "max_us": 114146.53015136719,
      "stdev_us": 11412.477773901752,
      "samples": 100,
      "tflops": 24.209547598094968,
      "shape_key": "Mistral-7B-v0.3|attn_o|T16|M16|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 4096,
      "k": 4096,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 536870912.0,
      "median_us": 936.12,
      "min_us": 915.62,
      "max_us": 1227.25,
      "stdev_us": 132.98098766365064,
      "samples": 5,
      "tflops": 0.5735065077126864,
      "shape_key": "Mistral-7B-v0.3|attn_o|T16|M16|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 16,
      "m": 16,
      "n": 14336,
      "k": 4096,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1879048192.0,
      "median_us": 32.47999958693981,
      "min_us": 30.368000268936157,
      "max_us": 37.18400001525879,
      "stdev_us": 1.0327937793004132,
      "samples": 100,
      "tflops": 57.85246970124852,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T16|M16|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 16,
      "m": 16,
      "n": 14336,
      "k": 4096,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1879048192.0,
      "median_us": 3380.0,
      "min_us": 3333.67,
      "max_us": 4167.67,
      "stdev_us": 352.5623577751885,
      "samples": 5,
      "tflops": 0.5559314177514793,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T16|M16|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 16,
      "m": 16,
      "n": 4096,
      "k": 14336,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1879048192.0,
      "median_us": 61.08799949288368,
      "min_us": 59.039998799562454,
      "max_us": 64.51199948787689,
      "stdev_us": 0.8277423870615717,
      "samples": 100,
      "tflops": 30.759694336019233,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T16|M16|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 16,
      "m": 16,
      "n": 4096,
      "k": 14336,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1879048192.0,
      "median_us": 3395.0,
      "min_us": 3342.67,
      "max_us": 3414.0,
      "stdev_us": 26.637327756364726,
      "samples": 5,
      "tflops": 0.5534751670103093,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T16|M16|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 4096,
      "k": 4096,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1073741824.0,
      "median_us": 20.60799952596426,
      "min_us": 19.999999552965164,
      "max_us": 25.599999353289604,
      "stdev_us": 1.3819118381290043,
      "samples": 100,
      "tflops": 52.10315647800652,
      "shape_key": "Mistral-7B-v0.3|attn_q|T32|M32|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 4096,
      "k": 4096,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1073741824.0,
      "median_us": 641.75,
      "min_us": 603.25,
      "max_us": 1252.75,
      "stdev_us": 274.1514408132848,
      "samples": 5,
      "tflops": 1.6731465897935334,
      "shape_key": "Mistral-7B-v0.3|attn_q|T32|M32|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 1024,
      "k": 4096,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 268435456.0,
      "median_us": 18.559999763965607,
      "min_us": 17.75999926030636,
      "max_us": 22.5600004196167,
      "stdev_us": 0.7806777150277917,
      "samples": 100,
      "tflops": 14.46311742531213,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T32|M32|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 1024,
      "k": 4096,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 268435456.0,
      "median_us": 154.67,
      "min_us": 152.73,
      "max_us": 175.47,
      "stdev_us": 9.577135271050526,
      "samples": 5,
      "tflops": 1.7355366651580786,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T32|M32|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 6144,
      "k": 4096,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1610612736.0,
      "median_us": 22.895999252796173,
      "min_us": 22.304000332951546,
      "max_us": 28.12799997627735,
      "stdev_us": 1.4569951657580973,
      "samples": 100,
      "tflops": 70.34472346968232,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T32|M32|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 6144,
      "k": 4096,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1610612736.0,
      "median_us": 1186.33,
      "min_us": 942.0,
      "max_us": 1723.0,
      "stdev_us": 327.8882651910556,
      "samples": 5,
      "tflops": 1.3576430976203924,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T32|M32|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 4096,
      "k": 4096,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1073741824.0,
      "median_us": 20.60799952596426,
      "min_us": 19.999999552965164,
      "max_us": 25.599999353289604,
      "stdev_us": 1.3819118381290043,
      "samples": 100,
      "tflops": 52.10315647800652,
      "shape_key": "Mistral-7B-v0.3|attn_o|T32|M32|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 4096,
      "k": 4096,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1073741824.0,
      "median_us": 641.75,
      "min_us": 603.25,
      "max_us": 1252.75,
      "stdev_us": 274.1514408132848,
      "samples": 5,
      "tflops": 1.6731465897935334,
      "shape_key": "Mistral-7B-v0.3|attn_o|T32|M32|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 32,
      "m": 32,
      "n": 14336,
      "k": 4096,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 3758096384.0,
      "median_us": 30.97599931061268,
      "min_us": 28.672000393271446,
      "max_us": 35.840000957250595,
      "stdev_us": 1.0707405667847434,
      "samples": 100,
      "tflops": 121.32284567531094,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T32|M32|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 32,
      "m": 32,
      "n": 14336,
      "k": 4096,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 3758096384.0,
      "median_us": 2381.0,
      "min_us": 2341.5,
      "max_us": 3200.5,
      "stdev_us": 373.60182681566215,
      "samples": 5,
      "tflops": 1.578368913901722,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T32|M32|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 32,
      "m": 32,
      "n": 4096,
      "k": 14336,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 3758096384.0,
      "median_us": 59.39200147986412,
      "min_us": 57.312000542879105,
      "max_us": 64.41599875688553,
      "stdev_us": 1.2095183401950311,
      "samples": 100,
      "tflops": 63.27613635438975,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T32|M32|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 32,
      "m": 32,
      "n": 4096,
      "k": 14336,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 3758096384.0,
      "median_us": 2270.0,
      "min_us": 2211.0,
      "max_us": 4848.0,
      "stdev_us": 1160.9700469865707,
      "samples": 5,
      "tflops": 1.6555490678414098,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T32|M32|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 4096,
      "k": 4096,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4294967296.0,
      "median_us": 20.51199972629547,
      "min_us": 19.936000928282738,
      "max_us": 24.86399933695793,
      "stdev_us": 1.1356324752455653,
      "samples": 100,
      "tflops": 209.3880339952444,
      "shape_key": "Mistral-7B-v0.3|attn_q|T128|M128|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 4096,
      "k": 4096,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4294967296.0,
      "median_us": 2024.0,
      "min_us": 2006.0,
      "max_us": 4228.0,
      "stdev_us": 982.5929981431783,
      "samples": 5,
      "tflops": 2.122019415019763,
      "shape_key": "Mistral-7B-v0.3|attn_q|T128|M128|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 1024,
      "k": 4096,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1073741824.0,
      "median_us": 18.432000651955605,
      "min_us": 17.664000391960144,
      "max_us": 23.584000766277313,
      "stdev_us": 0.950200508308983,
      "samples": 100,
      "tflops": 58.25422016172063,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T128|M128|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 1024,
      "k": 4096,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1073741824.0,
      "median_us": 635.75,
      "min_us": 502.5,
      "max_us": 1007.75,
      "stdev_us": 220.49238932443905,
      "samples": 5,
      "tflops": 1.688937198584349,
      "shape_key": "Mistral-7B-v0.3|attn_kv_each|T128|M128|N1024|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 6144,
      "k": 4096,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 6442450944.0,
      "median_us": 23.903999477624893,
      "min_us": 21.91999927163124,
      "max_us": 27.83999964594841,
      "stdev_us": 1.1750193700153864,
      "samples": 100,
      "tflops": 269.5135159298507,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T128|M128|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 6144,
      "k": 4096,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 6442450944.0,
      "median_us": 3176.0,
      "min_us": 3105.0,
      "max_us": 3777.0,
      "stdev_us": 279.16787064416997,
      "samples": 5,
      "tflops": 2.028479516372796,
      "shape_key": "Mistral-7B-v0.3|attn_qkv_fused|T128|M128|N6144|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 4096,
      "k": 4096,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4294967296.0,
      "median_us": 20.51199972629547,
      "min_us": 19.936000928282738,
      "max_us": 24.86399933695793,
      "stdev_us": 1.1356324752455653,
      "samples": 100,
      "tflops": 209.3880339952444,
      "shape_key": "Mistral-7B-v0.3|attn_o|T128|M128|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 4096,
      "k": 4096,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4294967296.0,
      "median_us": 2024.0,
      "min_us": 2006.0,
      "max_us": 4228.0,
      "stdev_us": 982.5929981431783,
      "samples": 5,
      "tflops": 2.122019415019763,
      "shape_key": "Mistral-7B-v0.3|attn_o|T128|M128|N4096|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 128,
      "m": 128,
      "n": 14336,
      "k": 4096,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 15032385536.0,
      "median_us": 30.46399913728237,
      "min_us": 28.351999819278717,
      "max_us": 33.92000123858452,
      "stdev_us": 1.2655796242864796,
      "samples": 100,
      "tflops": 493.44754338582896,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T128|M128|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_up_or_gate",
      "tokens": 128,
      "m": 128,
      "n": 14336,
      "k": 4096,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 15032385536.0,
      "median_us": 7306.0,
      "min_us": 7167.0,
      "max_us": 7509.0,
      "stdev_us": 125.85189708542339,
      "samples": 5,
      "tflops": 2.0575397667670408,
      "shape_key": "Mistral-7B-v0.3|mlp_up_or_gate|T128|M128|N14336|K4096"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 128,
      "m": 128,
      "n": 4096,
      "k": 14336,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 15032385536.0,
      "median_us": 61.08799949288368,
      "min_us": 58.88000130653381,
      "max_us": 64.35199826955795,
      "stdev_us": 1.2225441928366314,
      "samples": 100,
      "tflops": 246.07755468815387,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T128|M128|N4096|K14336"
    },
    {
      "model": "Mistral-7B-v0.3",
      "family": "dense",
      "op": "mlp_down",
      "tokens": 128,
      "m": 128,
      "n": 4096,
      "k": 14336,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 15032385536.0,
      "median_us": 7241.0,
      "min_us": 7016.0,
      "max_us": 7736.0,
      "stdev_us": 267.545323263181,
      "samples": 5,
      "tflops": 2.0760096030934956,
      "shape_key": "Mistral-7B-v0.3|mlp_down|T128|M128|N4096|K14336"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 12.28800043463707,
      "min_us": 11.744000017642975,
      "max_us": 17.055999487638474,
      "stdev_us": 1.137415380513941,
      "samples": 100,
      "tflops": 0.6826666425201636,
      "shape_key": "OLMoE-1B-7B|attn_q|T1|M1|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 62.62,
      "min_us": 61.17,
      "max_us": 64.16,
      "stdev_us": 1.223744254327674,
      "samples": 5,
      "tflops": 0.13396052379431492,
      "shape_key": "OLMoE-1B-7B|attn_q|T1|M1|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 12.28800043463707,
      "min_us": 11.744000017642975,
      "max_us": 17.055999487638474,
      "stdev_us": 1.137415380513941,
      "samples": 100,
      "tflops": 0.6826666425201636,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T1|M1|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 62.62,
      "min_us": 61.17,
      "max_us": 64.16,
      "stdev_us": 1.223744254327674,
      "samples": 5,
      "tflops": 0.13396052379431492,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T1|M1|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 6144,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 25165824.0,
      "median_us": 16.03199914097786,
      "min_us": 13.952000066637993,
      "max_us": 19.90400068461895,
      "stdev_us": 1.026790251133557,
      "samples": 100,
      "tflops": 1.56972463500675,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T1|M1|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 6144,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 25165824.0,
      "median_us": 185.87,
      "min_us": 179.05,
      "max_us": 191.88,
      "stdev_us": 5.412390414595014,
      "samples": 5,
      "tflops": 0.1353947597783397,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T1|M1|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 12.28800043463707,
      "min_us": 11.744000017642975,
      "max_us": 17.055999487638474,
      "stdev_us": 1.137415380513941,
      "samples": 100,
      "tflops": 0.6826666425201636,
      "shape_key": "OLMoE-1B-7B|attn_o|T1|M1|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 62.62,
      "min_us": 61.17,
      "max_us": 64.16,
      "stdev_us": 1.223744254327674,
      "samples": 5,
      "tflops": 0.13396052379431492,
      "shape_key": "OLMoE-1B-7B|attn_o|T1|M1|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1024,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4194304.0,
      "median_us": 12.28800043463707,
      "min_us": 11.58399973064661,
      "max_us": 16.92800037562847,
      "stdev_us": 0.8420684615453706,
      "samples": 100,
      "tflops": 0.3413333212600818,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T1|M1|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1024,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4194304.0,
      "median_us": 31.07,
      "min_us": 30.27,
      "max_us": 31.47,
      "stdev_us": 0.44942185082614716,
      "samples": 5,
      "tflops": 0.13499530093337625,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T1|M1|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1024,
      "source_tokens": 1,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4194304.0,
      "median_us": 9.8879998549819,
      "min_us": 7.840000092983246,
      "max_us": 12.959999963641167,
      "stdev_us": 0.8130872830823813,
      "samples": 100,
      "tflops": 0.4241812359945345,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T1|M1|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1024,
      "source_tokens": 1,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4194304.0,
      "median_us": 31.26,
      "min_us": 30.93,
      "max_us": 31.42,
      "stdev_us": 0.223986606742457,
      "samples": 5,
      "tflops": 0.1341747920665387,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T1|M1|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 17.40800030529499,
      "stdev_us": 0.905924784909204,
      "samples": 100,
      "tflops": 1.3653332850403272,
      "shape_key": "OLMoE-1B-7B|attn_q|T2|M2|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 65.76,
      "min_us": 64.65,
      "max_us": 72.83,
      "stdev_us": 3.3232032739512016,
      "samples": 5,
      "tflops": 0.2551279805352798,
      "shape_key": "OLMoE-1B-7B|attn_q|T2|M2|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 17.40800030529499,
      "stdev_us": 0.905924784909204,
      "samples": 100,
      "tflops": 1.3653332850403272,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T2|M2|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 65.76,
      "min_us": 64.65,
      "max_us": 72.83,
      "stdev_us": 3.3232032739512016,
      "samples": 5,
      "tflops": 0.2551279805352798,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T2|M2|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 6144,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 50331648.0,
      "median_us": 16.03199914097786,
      "min_us": 13.952000066637993,
      "max_us": 19.519999623298645,
      "stdev_us": 0.9169661913366407,
      "samples": 100,
      "tflops": 3.1394492700135,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T2|M2|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 6144,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 50331648.0,
      "median_us": 202.32,
      "min_us": 191.86,
      "max_us": 214.3,
      "stdev_us": 8.445479264079689,
      "samples": 5,
      "tflops": 0.24877247924080664,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T2|M2|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 17.40800030529499,
      "stdev_us": 0.905924784909204,
      "samples": 100,
      "tflops": 1.3653332850403272,
      "shape_key": "OLMoE-1B-7B|attn_o|T2|M2|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 65.76,
      "min_us": 64.65,
      "max_us": 72.83,
      "stdev_us": 3.3232032739512016,
      "samples": 5,
      "tflops": 0.2551279805352798,
      "shape_key": "OLMoE-1B-7B|attn_o|T2|M2|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1024,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4194304.0,
      "median_us": 12.28800043463707,
      "min_us": 11.58399973064661,
      "max_us": 16.92800037562847,
      "stdev_us": 0.8420684615453706,
      "samples": 100,
      "tflops": 0.3413333212600818,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T2|M1|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1024,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4194304.0,
      "median_us": 31.07,
      "min_us": 30.27,
      "max_us": 31.47,
      "stdev_us": 0.44942185082614716,
      "samples": 5,
      "tflops": 0.13499530093337625,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T2|M1|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1024,
      "source_tokens": 2,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4194304.0,
      "median_us": 9.8879998549819,
      "min_us": 7.840000092983246,
      "max_us": 12.959999963641167,
      "stdev_us": 0.8130872830823813,
      "samples": 100,
      "tflops": 0.4241812359945345,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T2|M1|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1024,
      "source_tokens": 2,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4194304.0,
      "median_us": 31.26,
      "min_us": 30.93,
      "max_us": 31.42,
      "stdev_us": 0.223986606742457,
      "samples": 5,
      "tflops": 0.1341747920665387,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T2|M1|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 12.28800043463707,
      "min_us": 11.680000461637974,
      "max_us": 17.311999574303627,
      "stdev_us": 1.0944388754410355,
      "samples": 100,
      "tflops": 2.7306665700806545,
      "shape_key": "OLMoE-1B-7B|attn_q|T4|M4|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 78.82,
      "min_us": 77.73,
      "max_us": 81.74,
      "stdev_us": 1.7653526559868986,
      "samples": 5,
      "tflops": 0.4257096168485156,
      "shape_key": "OLMoE-1B-7B|attn_q|T4|M4|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 12.28800043463707,
      "min_us": 11.680000461637974,
      "max_us": 17.311999574303627,
      "stdev_us": 1.0944388754410355,
      "samples": 100,
      "tflops": 2.7306665700806545,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T4|M4|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 78.82,
      "min_us": 77.73,
      "max_us": 81.74,
      "stdev_us": 1.7653526559868986,
      "samples": 5,
      "tflops": 0.4257096168485156,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T4|M4|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 6144,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 100663296.0,
      "median_us": 15.760000795125961,
      "min_us": 13.663999736309052,
      "max_us": 19.1040001809597,
      "stdev_us": 1.1704662948313438,
      "samples": 100,
      "tflops": 6.387264652368024,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T4|M4|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 6144,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 100663296.0,
      "median_us": 233.11,
      "min_us": 230.63,
      "max_us": 246.5,
      "stdev_us": 6.537894921150081,
      "samples": 5,
      "tflops": 0.4318274462700013,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T4|M4|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 12.28800043463707,
      "min_us": 11.680000461637974,
      "max_us": 17.311999574303627,
      "stdev_us": 1.0944388754410355,
      "samples": 100,
      "tflops": 2.7306665700806545,
      "shape_key": "OLMoE-1B-7B|attn_o|T4|M4|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 78.82,
      "min_us": 77.73,
      "max_us": 81.74,
      "stdev_us": 1.7653526559868986,
      "samples": 5,
      "tflops": 0.4257096168485156,
      "shape_key": "OLMoE-1B-7B|attn_o|T4|M4|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1024,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4194304.0,
      "median_us": 12.28800043463707,
      "min_us": 11.58399973064661,
      "max_us": 16.92800037562847,
      "stdev_us": 0.8420684615453706,
      "samples": 100,
      "tflops": 0.3413333212600818,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T4|M1|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1024,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4194304.0,
      "median_us": 31.07,
      "min_us": 30.27,
      "max_us": 31.47,
      "stdev_us": 0.44942185082614716,
      "samples": 5,
      "tflops": 0.13499530093337625,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T4|M1|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1024,
      "source_tokens": 4,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4194304.0,
      "median_us": 9.8879998549819,
      "min_us": 7.840000092983246,
      "max_us": 12.959999963641167,
      "stdev_us": 0.8130872830823813,
      "samples": 100,
      "tflops": 0.4241812359945345,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T4|M1|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1024,
      "source_tokens": 4,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4194304.0,
      "median_us": 31.26,
      "min_us": 30.93,
      "max_us": 31.42,
      "stdev_us": 0.223986606742457,
      "samples": 5,
      "tflops": 0.1341747920665387,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T4|M1|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 16.704000532627106,
      "stdev_us": 0.9340635760009698,
      "samples": 100,
      "tflops": 5.461333140161309,
      "shape_key": "OLMoE-1B-7B|attn_q|T8|M8|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 118.74,
      "min_us": 117.7,
      "max_us": 120.16,
      "stdev_us": 1.0132028424752857,
      "samples": 5,
      "tflops": 0.5651748694626916,
      "shape_key": "OLMoE-1B-7B|attn_q|T8|M8|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 16.704000532627106,
      "stdev_us": 0.9340635760009698,
      "samples": 100,
      "tflops": 5.461333140161309,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T8|M8|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 118.74,
      "min_us": 117.7,
      "max_us": 120.16,
      "stdev_us": 1.0132028424752857,
      "samples": 5,
      "tflops": 0.5651748694626916,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T8|M8|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 6144,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 201326592.0,
      "median_us": 16.03199914097786,
      "min_us": 13.791999779641628,
      "max_us": 19.42400075495243,
      "stdev_us": 1.0240271563538978,
      "samples": 100,
      "tflops": 12.557797080054,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T8|M8|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 6144,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 201326592.0,
      "median_us": 353.84,
      "min_us": 345.32,
      "max_us": 418.68,
      "stdev_us": 30.3934963437904,
      "samples": 5,
      "tflops": 0.568976350893059,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T8|M8|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 16.704000532627106,
      "stdev_us": 0.9340635760009698,
      "samples": 100,
      "tflops": 5.461333140161309,
      "shape_key": "OLMoE-1B-7B|attn_o|T8|M8|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 118.74,
      "min_us": 117.7,
      "max_us": 120.16,
      "stdev_us": 1.0132028424752857,
      "samples": 5,
      "tflops": 0.5651748694626916,
      "shape_key": "OLMoE-1B-7B|attn_o|T8|M8|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1024,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4194304.0,
      "median_us": 12.28800043463707,
      "min_us": 11.58399973064661,
      "max_us": 16.92800037562847,
      "stdev_us": 0.8420684615453706,
      "samples": 100,
      "tflops": 0.3413333212600818,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T8|M1|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1024,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4194304.0,
      "median_us": 31.07,
      "min_us": 30.27,
      "max_us": 31.47,
      "stdev_us": 0.44942185082614716,
      "samples": 5,
      "tflops": 0.13499530093337625,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T8|M1|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1024,
      "source_tokens": 8,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 4194304.0,
      "median_us": 9.8879998549819,
      "min_us": 7.840000092983246,
      "max_us": 12.959999963641167,
      "stdev_us": 0.8130872830823813,
      "samples": 100,
      "tflops": 0.4241812359945345,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T8|M1|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1024,
      "source_tokens": 8,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 4194304.0,
      "median_us": 31.26,
      "min_us": 30.93,
      "max_us": 31.42,
      "stdev_us": 0.223986606742457,
      "samples": 5,
      "tflops": 0.1341747920665387,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T8|M1|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 12.608000077307224,
      "min_us": 11.264000087976456,
      "max_us": 16.767999157309532,
      "stdev_us": 1.0402497836019682,
      "samples": 100,
      "tflops": 10.645441559091884,
      "shape_key": "OLMoE-1B-7B|attn_q|T16|M16|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 236.03,
      "min_us": 235.34,
      "max_us": 236.97,
      "stdev_us": 0.6490223416801582,
      "samples": 5,
      "tflops": 0.5686469008176926,
      "shape_key": "OLMoE-1B-7B|attn_q|T16|M16|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 12.608000077307224,
      "min_us": 11.264000087976456,
      "max_us": 16.767999157309532,
      "stdev_us": 1.0402497836019682,
      "samples": 100,
      "tflops": 10.645441559091884,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T16|M16|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 236.03,
      "min_us": 235.34,
      "max_us": 236.97,
      "stdev_us": 0.6490223416801582,
      "samples": 5,
      "tflops": 0.5686469008176926,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T16|M16|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 6144,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 402653184.0,
      "median_us": 16.03199914097786,
      "min_us": 13.952000066637993,
      "max_us": 19.1040001809597,
      "stdev_us": 0.8763114512685403,
      "samples": 100,
      "tflops": 25.115594160108,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T16|M16|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 6144,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 402653184.0,
      "median_us": 808.4,
      "min_us": 702.0,
      "max_us": 1137.8,
      "stdev_us": 207.62810021767282,
      "samples": 5,
      "tflops": 0.49808657100445325,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T16|M16|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 12.608000077307224,
      "min_us": 11.264000087976456,
      "max_us": 16.767999157309532,
      "stdev_us": 1.0402497836019682,
      "samples": 100,
      "tflops": 10.645441559091884,
      "shape_key": "OLMoE-1B-7B|attn_o|T16|M16|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 236.03,
      "min_us": 235.34,
      "max_us": 236.97,
      "stdev_us": 0.6490223416801582,
      "samples": 5,
      "tflops": 0.5686469008176926,
      "shape_key": "OLMoE-1B-7B|attn_o|T16|M16|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 2,
      "m": 2,
      "n": 1024,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 11.935999616980553,
      "min_us": 10.208000428974628,
      "max_us": 18.0479995906353,
      "stdev_us": 0.8319157268093466,
      "samples": 100,
      "tflops": 0.7027989501663594,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T16|M2|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 2,
      "m": 2,
      "n": 1024,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 33.17,
      "min_us": 32.95,
      "max_us": 36.08,
      "stdev_us": 1.4716385425776253,
      "samples": 5,
      "tflops": 0.25289743744347304,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T16|M2|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 1024,
      "source_tokens": 16,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 9.855999611318111,
      "min_us": 7.712000049650669,
      "max_us": 15.00799972563982,
      "stdev_us": 1.2660398297040094,
      "samples": 100,
      "tflops": 0.851116916681588,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T16|M2|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 1024,
      "source_tokens": 16,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 34.18,
      "min_us": 33.58,
      "max_us": 36.67,
      "stdev_us": 1.2052593081988636,
      "samples": 5,
      "tflops": 0.24542445874780575,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T16|M2|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 268435456.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 16.03199914097786,
      "stdev_us": 0.9056072681078319,
      "samples": 100,
      "tflops": 21.845332560645236,
      "shape_key": "OLMoE-1B-7B|attn_q|T32|M32|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 268435456.0,
      "median_us": 151.2,
      "min_us": 150.87,
      "max_us": 155.47,
      "stdev_us": 1.9498512763798164,
      "samples": 5,
      "tflops": 1.7753667724867725,
      "shape_key": "OLMoE-1B-7B|attn_q|T32|M32|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 268435456.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 16.03199914097786,
      "stdev_us": 0.9056072681078319,
      "samples": 100,
      "tflops": 21.845332560645236,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T32|M32|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 268435456.0,
      "median_us": 151.2,
      "min_us": 150.87,
      "max_us": 155.47,
      "stdev_us": 1.9498512763798164,
      "samples": 5,
      "tflops": 1.7753667724867725,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T32|M32|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 6144,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 805306368.0,
      "median_us": 14.336000196635723,
      "min_us": 13.728000223636627,
      "max_us": 19.74399946630001,
      "stdev_us": 1.240565685159419,
      "samples": 100,
      "tflops": 56.1737135152233,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T32|M32|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 6144,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 805306368.0,
      "median_us": 568.8,
      "min_us": 459.0,
      "max_us": 752.0,
      "stdev_us": 109.94167544657485,
      "samples": 5,
      "tflops": 1.415798818565401,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T32|M32|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 268435456.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 16.03199914097786,
      "stdev_us": 0.9056072681078319,
      "samples": 100,
      "tflops": 21.845332560645236,
      "shape_key": "OLMoE-1B-7B|attn_o|T32|M32|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 268435456.0,
      "median_us": 151.2,
      "min_us": 150.87,
      "max_us": 155.47,
      "stdev_us": 1.9498512763798164,
      "samples": 5,
      "tflops": 1.7753667724867725,
      "shape_key": "OLMoE-1B-7B|attn_o|T32|M32|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 4,
      "m": 4,
      "n": 1024,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 12.28800043463707,
      "min_us": 11.359999887645245,
      "max_us": 16.383999958634377,
      "stdev_us": 0.8368445901018151,
      "samples": 100,
      "tflops": 1.3653332850403272,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T32|M4|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 4,
      "m": 4,
      "n": 1024,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 39.72,
      "min_us": 39.28,
      "max_us": 40.67,
      "stdev_us": 0.5303112293738466,
      "samples": 5,
      "tflops": 0.4223871097683787,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T32|M4|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 1024,
      "source_tokens": 32,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 9.8879998549819,
      "min_us": 7.712000049650669,
      "max_us": 12.959999963641167,
      "stdev_us": 1.1664048565516845,
      "samples": 100,
      "tflops": 1.696724943978138,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T32|M4|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 1024,
      "source_tokens": 32,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 40.55,
      "min_us": 40.25,
      "max_us": 41.2,
      "stdev_us": 0.40389355033226276,
      "samples": 5,
      "tflops": 0.41374145499383475,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T32|M4|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1073741824.0,
      "median_us": 12.256000190973282,
      "min_us": 11.552000418305397,
      "max_us": 16.383999958634377,
      "stdev_us": 0.9147577880989093,
      "samples": 100,
      "tflops": 87.60948166358762,
      "shape_key": "OLMoE-1B-7B|attn_q|T128|M128|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1073741824.0,
      "median_us": 499.5,
      "min_us": 494.0,
      "max_us": 676.0,
      "stdev_us": 78.40129144344499,
      "samples": 5,
      "tflops": 2.149633281281281,
      "shape_key": "OLMoE-1B-7B|attn_q|T128|M128|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1073741824.0,
      "median_us": 12.256000190973282,
      "min_us": 11.552000418305397,
      "max_us": 16.383999958634377,
      "stdev_us": 0.9147577880989093,
      "samples": 100,
      "tflops": 87.60948166358762,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T128|M128|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1073741824.0,
      "median_us": 499.5,
      "min_us": 494.0,
      "max_us": 676.0,
      "stdev_us": 78.40129144344499,
      "samples": 5,
      "tflops": 2.149633281281281,
      "shape_key": "OLMoE-1B-7B|attn_kv_each|T128|M128|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 6144,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 3221225472.0,
      "median_us": 14.336000196635723,
      "min_us": 13.535999692976475,
      "max_us": 19.1040001809597,
      "stdev_us": 1.199686370189621,
      "samples": 100,
      "tflops": 224.6948540608932,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T128|M128|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 6144,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 3221225472.0,
      "median_us": 1881.5,
      "min_us": 1482.5,
      "max_us": 3164.5,
      "stdev_us": 676.4735767197415,
      "samples": 5,
      "tflops": 1.7120518054743556,
      "shape_key": "OLMoE-1B-7B|attn_qkv_fused|T128|M128|N6144|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1073741824.0,
      "median_us": 12.256000190973282,
      "min_us": 11.552000418305397,
      "max_us": 16.383999958634377,
      "stdev_us": 0.9147577880989093,
      "samples": 100,
      "tflops": 87.60948166358762,
      "shape_key": "OLMoE-1B-7B|attn_o|T128|M128|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1073741824.0,
      "median_us": 499.5,
      "min_us": 494.0,
      "max_us": 676.0,
      "stdev_us": 78.40129144344499,
      "samples": 5,
      "tflops": 2.149633281281281,
      "shape_key": "OLMoE-1B-7B|attn_o|T128|M128|N2048|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 16,
      "m": 16,
      "n": 1024,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 12.016000226140022,
      "min_us": 10.239999741315842,
      "max_us": 16.03199914097786,
      "stdev_us": 0.7069512990536592,
      "samples": 100,
      "tflops": 5.5849586165959835,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T128|M16|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 16,
      "m": 16,
      "n": 1024,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 118.93,
      "min_us": 118.16,
      "max_us": 121.72,
      "stdev_us": 1.432490837667033,
      "samples": 5,
      "tflops": 0.5642719582947953,
      "shape_key": "OLMoE-1B-7B|moe_expert_up_or_gate_avg|T128|M16|N1024|K2048"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 1024,
      "source_tokens": 128,
      "num_experts": 64,
      "top_k": 8,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 9.8879998549819,
      "min_us": 7.648000027984381,
      "max_us": 16.224000602960587,
      "stdev_us": 1.2133753734506612,
      "samples": 100,
      "tflops": 6.786899775912552,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T128|M16|N2048|K1024"
    },
    {
      "model": "OLMoE-1B-7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 1024,
      "source_tokens": 128,
      "num_experts": 64,
      "top_k": 8,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 120.88,
      "min_us": 119.63,
      "max_us": 122.93,
      "stdev_us": 1.217053819680958,
      "samples": 5,
      "tflops": 0.5551692918596955,
      "shape_key": "OLMoE-1B-7B|moe_expert_down_avg|T128|M16|N2048|K1024"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 12.28800043463707,
      "min_us": 11.744000017642975,
      "max_us": 17.055999487638474,
      "stdev_us": 1.137415380513941,
      "samples": 100,
      "tflops": 0.6826666425201636,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T1|M1|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 62.62,
      "min_us": 61.17,
      "max_us": 64.16,
      "stdev_us": 1.223744254327674,
      "samples": 5,
      "tflops": 0.13396052379431492,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T1|M1|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 12.28800043463707,
      "min_us": 11.744000017642975,
      "max_us": 17.055999487638474,
      "stdev_us": 1.137415380513941,
      "samples": 100,
      "tflops": 0.6826666425201636,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T1|M1|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 62.62,
      "min_us": 61.17,
      "max_us": 64.16,
      "stdev_us": 1.223744254327674,
      "samples": 5,
      "tflops": 0.13396052379431492,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T1|M1|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 6144,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 25165824.0,
      "median_us": 16.03199914097786,
      "min_us": 13.952000066637993,
      "max_us": 19.90400068461895,
      "stdev_us": 1.026790251133557,
      "samples": 100,
      "tflops": 1.56972463500675,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T1|M1|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 1,
      "m": 1,
      "n": 6144,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 25165824.0,
      "median_us": 185.87,
      "min_us": 179.05,
      "max_us": 191.88,
      "stdev_us": 5.412390414595014,
      "samples": 5,
      "tflops": 0.1353947597783397,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T1|M1|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 8388608.0,
      "median_us": 12.28800043463707,
      "min_us": 11.744000017642975,
      "max_us": 17.055999487638474,
      "stdev_us": 1.137415380513941,
      "samples": 100,
      "tflops": 0.6826666425201636,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T1|M1|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 8388608.0,
      "median_us": 62.62,
      "min_us": 61.17,
      "max_us": 64.16,
      "stdev_us": 1.223744254327674,
      "samples": 5,
      "tflops": 0.13396052379431492,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T1|M1|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1408,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 5767168.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 17.055999487638474,
      "stdev_us": 1.0435430403135313,
      "samples": 100,
      "tflops": 0.4693333167326125,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T1|M1|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1408,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 5767168.0,
      "median_us": 42.08,
      "min_us": 41.58,
      "max_us": 44.12,
      "stdev_us": 1.1300530960977004,
      "samples": 5,
      "tflops": 0.13705247148288974,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T1|M1|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1408,
      "source_tokens": 1,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 5767168.0,
      "median_us": 11.935999616980553,
      "min_us": 9.855999611318111,
      "max_us": 15.39199985563755,
      "stdev_us": 0.9450329531026719,
      "samples": 100,
      "tflops": 0.4831742782393721,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T1|M1|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1408,
      "source_tokens": 1,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 5767168.0,
      "median_us": 42.57,
      "min_us": 42.23,
      "max_us": 42.9,
      "stdev_us": 0.2708689720141457,
      "samples": 5,
      "tflops": 0.1354749354005168,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T1|M1|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 1,
      "m": 1,
      "n": 5632,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 23068672.0,
      "median_us": 15.296000055968761,
      "min_us": 13.919999822974205,
      "max_us": 19.45599913597107,
      "stdev_us": 1.1143335665366965,
      "samples": 100,
      "tflops": 1.508150622096671,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T1|M1|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 1,
      "m": 1,
      "n": 5632,
      "k": 2048,
      "source_tokens": 1,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 23068672.0,
      "median_us": 166.6,
      "min_us": 165.83,
      "max_us": 169.59,
      "stdev_us": 1.8849005278793882,
      "samples": 5,
      "tflops": 0.13846741896758705,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T1|M1|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 5632,
      "source_tokens": 1,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 23068672.0,
      "median_us": 24.224000051617622,
      "min_us": 22.495999932289124,
      "max_us": 28.31999957561493,
      "stdev_us": 0.8108640800674024,
      "samples": 100,
      "tflops": 0.9523064708901999,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T1|M1|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 5632,
      "source_tokens": 1,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 23068672.0,
      "median_us": 165.02,
      "min_us": 162.94,
      "max_us": 168.38,
      "stdev_us": 2.009721373723232,
      "samples": 5,
      "tflops": 0.13979318870439947,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T1|M1|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 17.40800030529499,
      "stdev_us": 0.905924784909204,
      "samples": 100,
      "tflops": 1.3653332850403272,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T2|M2|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 65.76,
      "min_us": 64.65,
      "max_us": 72.83,
      "stdev_us": 3.3232032739512016,
      "samples": 5,
      "tflops": 0.2551279805352798,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T2|M2|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 17.40800030529499,
      "stdev_us": 0.905924784909204,
      "samples": 100,
      "tflops": 1.3653332850403272,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T2|M2|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 65.76,
      "min_us": 64.65,
      "max_us": 72.83,
      "stdev_us": 3.3232032739512016,
      "samples": 5,
      "tflops": 0.2551279805352798,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T2|M2|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 6144,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 50331648.0,
      "median_us": 16.03199914097786,
      "min_us": 13.952000066637993,
      "max_us": 19.519999623298645,
      "stdev_us": 0.9169661913366407,
      "samples": 100,
      "tflops": 3.1394492700135,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T2|M2|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 2,
      "m": 2,
      "n": 6144,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 50331648.0,
      "median_us": 202.32,
      "min_us": 191.86,
      "max_us": 214.3,
      "stdev_us": 8.445479264079689,
      "samples": 5,
      "tflops": 0.24877247924080664,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T2|M2|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 16777216.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 17.40800030529499,
      "stdev_us": 0.905924784909204,
      "samples": 100,
      "tflops": 1.3653332850403272,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T2|M2|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 16777216.0,
      "median_us": 65.76,
      "min_us": 64.65,
      "max_us": 72.83,
      "stdev_us": 3.3232032739512016,
      "samples": 5,
      "tflops": 0.2551279805352798,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T2|M2|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1408,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 5767168.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 17.055999487638474,
      "stdev_us": 1.0435430403135313,
      "samples": 100,
      "tflops": 0.4693333167326125,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T2|M1|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1408,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 5767168.0,
      "median_us": 42.08,
      "min_us": 41.58,
      "max_us": 44.12,
      "stdev_us": 1.1300530960977004,
      "samples": 5,
      "tflops": 0.13705247148288974,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T2|M1|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1408,
      "source_tokens": 2,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 5767168.0,
      "median_us": 11.935999616980553,
      "min_us": 9.855999611318111,
      "max_us": 15.39199985563755,
      "stdev_us": 0.9450329531026719,
      "samples": 100,
      "tflops": 0.4831742782393721,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T2|M1|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1408,
      "source_tokens": 2,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 5767168.0,
      "median_us": 42.57,
      "min_us": 42.23,
      "max_us": 42.9,
      "stdev_us": 0.2708689720141457,
      "samples": 5,
      "tflops": 0.1354749354005168,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T2|M1|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 2,
      "m": 2,
      "n": 5632,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 46137344.0,
      "median_us": 14.336000196635723,
      "min_us": 13.535999692976475,
      "max_us": 19.519999623298645,
      "stdev_us": 1.2927331902510533,
      "samples": 100,
      "tflops": 3.2182856701430014,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T2|M2|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 2,
      "m": 2,
      "n": 5632,
      "k": 2048,
      "source_tokens": 2,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 46137344.0,
      "median_us": 179.37,
      "min_us": 177.4,
      "max_us": 183.47,
      "stdev_us": 2.3076979871724954,
      "samples": 5,
      "tflops": 0.2572188437308357,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T2|M2|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 5632,
      "source_tokens": 2,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 46137344.0,
      "median_us": 24.191999807953835,
      "min_us": 22.016000002622604,
      "max_us": 27.904000133275986,
      "stdev_us": 1.0965087685522212,
      "samples": 100,
      "tflops": 1.907132290271885,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T2|M2|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 5632,
      "source_tokens": 2,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 46137344.0,
      "median_us": 179.8,
      "min_us": 175.51,
      "max_us": 191.41,
      "stdev_us": 6.239258770078383,
      "samples": 5,
      "tflops": 0.2566036929922136,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T2|M2|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 12.28800043463707,
      "min_us": 11.680000461637974,
      "max_us": 17.311999574303627,
      "stdev_us": 1.0944388754410355,
      "samples": 100,
      "tflops": 2.7306665700806545,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T4|M4|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 78.82,
      "min_us": 77.73,
      "max_us": 81.74,
      "stdev_us": 1.7653526559868986,
      "samples": 5,
      "tflops": 0.4257096168485156,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T4|M4|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 12.28800043463707,
      "min_us": 11.680000461637974,
      "max_us": 17.311999574303627,
      "stdev_us": 1.0944388754410355,
      "samples": 100,
      "tflops": 2.7306665700806545,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T4|M4|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 78.82,
      "min_us": 77.73,
      "max_us": 81.74,
      "stdev_us": 1.7653526559868986,
      "samples": 5,
      "tflops": 0.4257096168485156,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T4|M4|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 6144,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 100663296.0,
      "median_us": 15.760000795125961,
      "min_us": 13.663999736309052,
      "max_us": 19.1040001809597,
      "stdev_us": 1.1704662948313438,
      "samples": 100,
      "tflops": 6.387264652368024,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T4|M4|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 4,
      "m": 4,
      "n": 6144,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 100663296.0,
      "median_us": 233.11,
      "min_us": 230.63,
      "max_us": 246.5,
      "stdev_us": 6.537894921150081,
      "samples": 5,
      "tflops": 0.4318274462700013,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T4|M4|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 33554432.0,
      "median_us": 12.28800043463707,
      "min_us": 11.680000461637974,
      "max_us": 17.311999574303627,
      "stdev_us": 1.0944388754410355,
      "samples": 100,
      "tflops": 2.7306665700806545,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T4|M4|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 33554432.0,
      "median_us": 78.82,
      "min_us": 77.73,
      "max_us": 81.74,
      "stdev_us": 1.7653526559868986,
      "samples": 5,
      "tflops": 0.4257096168485156,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T4|M4|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1408,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 5767168.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 17.055999487638474,
      "stdev_us": 1.0435430403135313,
      "samples": 100,
      "tflops": 0.4693333167326125,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T4|M1|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1408,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 5767168.0,
      "median_us": 42.08,
      "min_us": 41.58,
      "max_us": 44.12,
      "stdev_us": 1.1300530960977004,
      "samples": 5,
      "tflops": 0.13705247148288974,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T4|M1|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1408,
      "source_tokens": 4,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 5767168.0,
      "median_us": 11.935999616980553,
      "min_us": 9.855999611318111,
      "max_us": 15.39199985563755,
      "stdev_us": 0.9450329531026719,
      "samples": 100,
      "tflops": 0.4831742782393721,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T4|M1|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1408,
      "source_tokens": 4,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 5767168.0,
      "median_us": 42.57,
      "min_us": 42.23,
      "max_us": 42.9,
      "stdev_us": 0.2708689720141457,
      "samples": 5,
      "tflops": 0.1354749354005168,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T4|M1|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 4,
      "m": 4,
      "n": 5632,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 92274688.0,
      "median_us": 14.688000082969666,
      "min_us": 13.952000066637993,
      "max_us": 22.655999287962914,
      "stdev_us": 1.1938981753061626,
      "samples": 100,
      "tflops": 6.282318047301074,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T4|M4|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 4,
      "m": 4,
      "n": 5632,
      "k": 2048,
      "source_tokens": 4,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 92274688.0,
      "median_us": 218.83,
      "min_us": 211.07,
      "max_us": 237.55,
      "stdev_us": 10.292610456050504,
      "samples": 5,
      "tflops": 0.4216729333272403,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T4|M4|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 5632,
      "source_tokens": 4,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 92274688.0,
      "median_us": 24.224000051617622,
      "min_us": 22.143999114632607,
      "max_us": 27.295999228954315,
      "stdev_us": 1.014422891068679,
      "samples": 100,
      "tflops": 3.8092258835607997,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T4|M4|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 4,
      "m": 4,
      "n": 2048,
      "k": 5632,
      "source_tokens": 4,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 92274688.0,
      "median_us": 212.26,
      "min_us": 209.76,
      "max_us": 238.86,
      "stdev_us": 14.203428459354459,
      "samples": 5,
      "tflops": 0.4347248091962687,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T4|M4|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 16.704000532627106,
      "stdev_us": 0.9340635760009698,
      "samples": 100,
      "tflops": 5.461333140161309,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T8|M8|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 118.74,
      "min_us": 117.7,
      "max_us": 120.16,
      "stdev_us": 1.0132028424752857,
      "samples": 5,
      "tflops": 0.5651748694626916,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T8|M8|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 16.704000532627106,
      "stdev_us": 0.9340635760009698,
      "samples": 100,
      "tflops": 5.461333140161309,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T8|M8|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 118.74,
      "min_us": 117.7,
      "max_us": 120.16,
      "stdev_us": 1.0132028424752857,
      "samples": 5,
      "tflops": 0.5651748694626916,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T8|M8|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 6144,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 201326592.0,
      "median_us": 16.03199914097786,
      "min_us": 13.791999779641628,
      "max_us": 19.42400075495243,
      "stdev_us": 1.0240271563538978,
      "samples": 100,
      "tflops": 12.557797080054,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T8|M8|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 8,
      "m": 8,
      "n": 6144,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 201326592.0,
      "median_us": 353.84,
      "min_us": 345.32,
      "max_us": 418.68,
      "stdev_us": 30.3934963437904,
      "samples": 5,
      "tflops": 0.568976350893059,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T8|M8|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 67108864.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 16.704000532627106,
      "stdev_us": 0.9340635760009698,
      "samples": 100,
      "tflops": 5.461333140161309,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T8|M8|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 67108864.0,
      "median_us": 118.74,
      "min_us": 117.7,
      "max_us": 120.16,
      "stdev_us": 1.0132028424752857,
      "samples": 5,
      "tflops": 0.5651748694626916,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T8|M8|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1408,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 5767168.0,
      "median_us": 12.28800043463707,
      "min_us": 11.552000418305397,
      "max_us": 17.055999487638474,
      "stdev_us": 1.0435430403135313,
      "samples": 100,
      "tflops": 0.4693333167326125,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T8|M1|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 1,
      "m": 1,
      "n": 1408,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 5767168.0,
      "median_us": 42.08,
      "min_us": 41.58,
      "max_us": 44.12,
      "stdev_us": 1.1300530960977004,
      "samples": 5,
      "tflops": 0.13705247148288974,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T8|M1|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1408,
      "source_tokens": 8,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 5767168.0,
      "median_us": 11.935999616980553,
      "min_us": 9.855999611318111,
      "max_us": 15.39199985563755,
      "stdev_us": 0.9450329531026719,
      "samples": 100,
      "tflops": 0.4831742782393721,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T8|M1|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 1,
      "m": 1,
      "n": 2048,
      "k": 1408,
      "source_tokens": 8,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 5767168.0,
      "median_us": 42.57,
      "min_us": 42.23,
      "max_us": 42.9,
      "stdev_us": 0.2708689720141457,
      "samples": 5,
      "tflops": 0.1354749354005168,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T8|M1|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 8,
      "m": 8,
      "n": 5632,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 184549376.0,
      "median_us": 14.336000196635723,
      "min_us": 13.919999822974205,
      "max_us": 19.487999379634857,
      "stdev_us": 1.2045968748796008,
      "samples": 100,
      "tflops": 12.873142680572006,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T8|M8|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 8,
      "m": 8,
      "n": 5632,
      "k": 2048,
      "source_tokens": 8,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 184549376.0,
      "median_us": 328.1,
      "min_us": 317.38,
      "max_us": 422.0,
      "stdev_us": 43.8133423742129,
      "samples": 5,
      "tflops": 0.5624790490704054,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T8|M8|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 5632,
      "source_tokens": 8,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 184549376.0,
      "median_us": 22.911999374628067,
      "min_us": 22.143999114632607,
      "max_us": 28.00000086426735,
      "stdev_us": 1.1991268098748702,
      "samples": 100,
      "tflops": 8.05470413046377,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T8|M8|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 8,
      "m": 8,
      "n": 2048,
      "k": 5632,
      "source_tokens": 8,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 184549376.0,
      "median_us": 322.05,
      "min_us": 319.33,
      "max_us": 451.43,
      "stdev_us": 57.51073612813524,
      "samples": 5,
      "tflops": 0.5730457258189722,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T8|M8|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 12.608000077307224,
      "min_us": 11.264000087976456,
      "max_us": 16.767999157309532,
      "stdev_us": 1.0402497836019682,
      "samples": 100,
      "tflops": 10.645441559091884,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T16|M16|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 236.03,
      "min_us": 235.34,
      "max_us": 236.97,
      "stdev_us": 0.6490223416801582,
      "samples": 5,
      "tflops": 0.5686469008176926,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T16|M16|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 12.608000077307224,
      "min_us": 11.264000087976456,
      "max_us": 16.767999157309532,
      "stdev_us": 1.0402497836019682,
      "samples": 100,
      "tflops": 10.645441559091884,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T16|M16|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 236.03,
      "min_us": 235.34,
      "max_us": 236.97,
      "stdev_us": 0.6490223416801582,
      "samples": 5,
      "tflops": 0.5686469008176926,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T16|M16|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 6144,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 402653184.0,
      "median_us": 16.03199914097786,
      "min_us": 13.952000066637993,
      "max_us": 19.1040001809597,
      "stdev_us": 0.8763114512685403,
      "samples": 100,
      "tflops": 25.115594160108,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T16|M16|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 16,
      "m": 16,
      "n": 6144,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 402653184.0,
      "median_us": 808.4,
      "min_us": 702.0,
      "max_us": 1137.8,
      "stdev_us": 207.62810021767282,
      "samples": 5,
      "tflops": 0.49808657100445325,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T16|M16|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 134217728.0,
      "median_us": 12.608000077307224,
      "min_us": 11.264000087976456,
      "max_us": 16.767999157309532,
      "stdev_us": 1.0402497836019682,
      "samples": 100,
      "tflops": 10.645441559091884,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T16|M16|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 134217728.0,
      "median_us": 236.03,
      "min_us": 235.34,
      "max_us": 236.97,
      "stdev_us": 0.6490223416801582,
      "samples": 5,
      "tflops": 0.5686469008176926,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T16|M16|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 2,
      "m": 2,
      "n": 1408,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 11534336.0,
      "median_us": 12.240000069141388,
      "min_us": 11.487999930977821,
      "max_us": 17.055999487638474,
      "stdev_us": 0.9782779336669588,
      "samples": 100,
      "tflops": 0.942347707095161,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T16|M2|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 2,
      "m": 2,
      "n": 1408,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 11534336.0,
      "median_us": 45.4,
      "min_us": 45.06,
      "max_us": 46.23,
      "stdev_us": 0.4406585980098411,
      "samples": 5,
      "tflops": 0.2540602643171806,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T16|M2|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 1408,
      "source_tokens": 16,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 11534336.0,
      "median_us": 11.90400030463934,
      "min_us": 9.535999968647957,
      "max_us": 15.00799972563982,
      "stdev_us": 1.0912208830620873,
      "samples": 100,
      "tflops": 0.9689462117625055,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T16|M2|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 2,
      "m": 2,
      "n": 2048,
      "k": 1408,
      "source_tokens": 16,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 11534336.0,
      "median_us": 45.88,
      "min_us": 45.58,
      "max_us": 55.11,
      "stdev_us": 4.153086803812316,
      "samples": 5,
      "tflops": 0.25140226678291194,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T16|M2|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 16,
      "m": 16,
      "n": 5632,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 369098752.0,
      "median_us": 15.551999676972628,
      "min_us": 14.175999909639359,
      "max_us": 19.519999623298645,
      "stdev_us": 1.1545015776423047,
      "samples": 100,
      "tflops": 23.733202139047965,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T16|M16|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 16,
      "m": 16,
      "n": 5632,
      "k": 2048,
      "source_tokens": 16,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 369098752.0,
      "median_us": 644.09,
      "min_us": 637.55,
      "max_us": 819.55,
      "stdev_us": 78.2425397721725,
      "samples": 5,
      "tflops": 0.5730546228011614,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T16|M16|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 5632,
      "source_tokens": 16,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 369098752.0,
      "median_us": 24.224000051617622,
      "min_us": 21.983999758958817,
      "max_us": 27.871999889612198,
      "stdev_us": 0.947545043662164,
      "samples": 100,
      "tflops": 15.236903534243199,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T16|M16|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 16,
      "m": 16,
      "n": 2048,
      "k": 5632,
      "source_tokens": 16,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 369098752.0,
      "median_us": 641.45,
      "min_us": 635.55,
      "max_us": 805.45,
      "stdev_us": 73.64688044988738,
      "samples": 5,
      "tflops": 0.5754131296281861,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T16|M16|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 268435456.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 16.03199914097786,
      "stdev_us": 0.9056072681078319,
      "samples": 100,
      "tflops": 21.845332560645236,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T32|M32|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 268435456.0,
      "median_us": 151.2,
      "min_us": 150.87,
      "max_us": 155.47,
      "stdev_us": 1.9498512763798164,
      "samples": 5,
      "tflops": 1.7753667724867725,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T32|M32|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 268435456.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 16.03199914097786,
      "stdev_us": 0.9056072681078319,
      "samples": 100,
      "tflops": 21.845332560645236,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T32|M32|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 268435456.0,
      "median_us": 151.2,
      "min_us": 150.87,
      "max_us": 155.47,
      "stdev_us": 1.9498512763798164,
      "samples": 5,
      "tflops": 1.7753667724867725,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T32|M32|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 6144,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 805306368.0,
      "median_us": 14.336000196635723,
      "min_us": 13.728000223636627,
      "max_us": 19.74399946630001,
      "stdev_us": 1.240565685159419,
      "samples": 100,
      "tflops": 56.1737135152233,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T32|M32|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 32,
      "m": 32,
      "n": 6144,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 805306368.0,
      "median_us": 568.8,
      "min_us": 459.0,
      "max_us": 752.0,
      "stdev_us": 109.94167544657485,
      "samples": 5,
      "tflops": 1.415798818565401,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T32|M32|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 268435456.0,
      "median_us": 12.28800043463707,
      "min_us": 11.52000017464161,
      "max_us": 16.03199914097786,
      "stdev_us": 0.9056072681078319,
      "samples": 100,
      "tflops": 21.845332560645236,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T32|M32|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 268435456.0,
      "median_us": 151.2,
      "min_us": 150.87,
      "max_us": 155.47,
      "stdev_us": 1.9498512763798164,
      "samples": 5,
      "tflops": 1.7753667724867725,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T32|M32|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 3,
      "m": 3,
      "n": 1408,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 17301504.0,
      "median_us": 11.935999616980553,
      "min_us": 11.552000418305397,
      "max_us": 15.359999611973763,
      "stdev_us": 0.5587634558472314,
      "samples": 100,
      "tflops": 1.4495228347181162,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T32|M3|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 3,
      "m": 3,
      "n": 1408,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 17301504.0,
      "median_us": 49.85,
      "min_us": 49.37,
      "max_us": 52.35,
      "stdev_us": 1.1906216863470955,
      "samples": 5,
      "tflops": 0.3470712938816449,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T32|M3|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 3,
      "m": 3,
      "n": 2048,
      "k": 1408,
      "source_tokens": 32,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 17301504.0,
      "median_us": 10.591999627649784,
      "min_us": 9.920000098645687,
      "max_us": 15.904000028967857,
      "stdev_us": 1.2364669459627489,
      "samples": 100,
      "tflops": 1.6334502084795637,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T32|M3|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 3,
      "m": 3,
      "n": 2048,
      "k": 1408,
      "source_tokens": 32,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 17301504.0,
      "median_us": 50.46,
      "min_us": 49.78,
      "max_us": 51.25,
      "stdev_us": 0.5943315572977752,
      "samples": 5,
      "tflops": 0.3428756242568371,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T32|M3|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 32,
      "m": 32,
      "n": 5632,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 738197504.0,
      "median_us": 14.688000082969666,
      "min_us": 13.98400031030178,
      "max_us": 19.45599913597107,
      "stdev_us": 1.0793299665035225,
      "samples": 100,
      "tflops": 50.258544378408594,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T32|M32|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 32,
      "m": 32,
      "n": 5632,
      "k": 2048,
      "source_tokens": 32,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 738197504.0,
      "median_us": 523.83,
      "min_us": 430.33,
      "max_us": 886.5,
      "stdev_us": 206.97150221709268,
      "samples": 5,
      "tflops": 1.4092310558769066,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T32|M32|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 5632,
      "source_tokens": 32,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 738197504.0,
      "median_us": 24.335999973118305,
      "min_us": 22.431999444961548,
      "max_us": 28.00000086426735,
      "stdev_us": 1.3137343768286558,
      "samples": 100,
      "tflops": 30.333559533835366,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T32|M32|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 32,
      "m": 32,
      "n": 2048,
      "k": 5632,
      "source_tokens": 32,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 738197504.0,
      "median_us": 427.83,
      "min_us": 412.0,
      "max_us": 503.33,
      "stdev_us": 37.62561933045089,
      "samples": 5,
      "tflops": 1.7254458640114063,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T32|M32|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1073741824.0,
      "median_us": 12.256000190973282,
      "min_us": 11.552000418305397,
      "max_us": 16.383999958634377,
      "stdev_us": 0.9147577880989093,
      "samples": 100,
      "tflops": 87.60948166358762,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T128|M128|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_q",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1073741824.0,
      "median_us": 499.5,
      "min_us": 494.0,
      "max_us": 676.0,
      "stdev_us": 78.40129144344499,
      "samples": 5,
      "tflops": 2.149633281281281,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_q|T128|M128|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1073741824.0,
      "median_us": 12.256000190973282,
      "min_us": 11.552000418305397,
      "max_us": 16.383999958634377,
      "stdev_us": 0.9147577880989093,
      "samples": 100,
      "tflops": 87.60948166358762,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T128|M128|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_kv_each",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1073741824.0,
      "median_us": 499.5,
      "min_us": 494.0,
      "max_us": 676.0,
      "stdev_us": 78.40129144344499,
      "samples": 5,
      "tflops": 2.149633281281281,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_kv_each|T128|M128|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 6144,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 3221225472.0,
      "median_us": 14.336000196635723,
      "min_us": 13.535999692976475,
      "max_us": 19.1040001809597,
      "stdev_us": 1.199686370189621,
      "samples": 100,
      "tflops": 224.6948540608932,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T128|M128|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_qkv_fused",
      "tokens": 128,
      "m": 128,
      "n": 6144,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 3221225472.0,
      "median_us": 1881.5,
      "min_us": 1482.5,
      "max_us": 3164.5,
      "stdev_us": 676.4735767197415,
      "samples": 5,
      "tflops": 1.7120518054743556,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_qkv_fused|T128|M128|N6144|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 1073741824.0,
      "median_us": 12.256000190973282,
      "min_us": 11.552000418305397,
      "max_us": 16.383999958634377,
      "stdev_us": 0.9147577880989093,
      "samples": 100,
      "tflops": 87.60948166358762,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T128|M128|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "attn_o",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": null,
      "top_k": null,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 1073741824.0,
      "median_us": 499.5,
      "min_us": 494.0,
      "max_us": 676.0,
      "stdev_us": 78.40129144344499,
      "samples": 5,
      "tflops": 2.149633281281281,
      "shape_key": "Qwen1.5-MoE-A2.7B|attn_o|T128|M128|N2048|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 9,
      "m": 9,
      "n": 1408,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 51904512.0,
      "median_us": 12.28800043463707,
      "min_us": 11.58399973064661,
      "max_us": 17.055999487638474,
      "stdev_us": 0.7726776722325873,
      "samples": 100,
      "tflops": 4.223999850593513,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T128|M9|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_up_or_gate_avg",
      "tokens": 9,
      "m": 9,
      "n": 1408,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 51904512.0,
      "median_us": 112.43,
      "min_us": 111.01,
      "max_us": 114.19,
      "stdev_us": 1.1345175185954577,
      "samples": 5,
      "tflops": 0.461660695543894,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_up_or_gate_avg|T128|M9|N1408|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 9,
      "m": 9,
      "n": 2048,
      "k": 1408,
      "source_tokens": 128,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 51904512.0,
      "median_us": 10.591999627649784,
      "min_us": 9.855999611318111,
      "max_us": 15.71200042963028,
      "stdev_us": 1.3265673872715944,
      "samples": 100,
      "tflops": 4.900350625438691,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T128|M9|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_expert_down_avg",
      "tokens": 9,
      "m": 9,
      "n": 2048,
      "k": 1408,
      "source_tokens": 128,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 51904512.0,
      "median_us": 113.85,
      "min_us": 112.46,
      "max_us": 137.14,
      "stdev_us": 10.560642499393674,
      "samples": 5,
      "tflops": 0.4559026086956522,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_expert_down_avg|T128|M9|N2048|K1408"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 128,
      "m": 128,
      "n": 5632,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 2952790016.0,
      "median_us": 14.336000196635723,
      "min_us": 13.663999736309052,
      "max_us": 19.487999379634857,
      "stdev_us": 1.1572127981146765,
      "samples": 100,
      "tflops": 205.9702828891521,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T128|M128|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_up_or_gate",
      "tokens": 128,
      "m": 128,
      "n": 5632,
      "k": 2048,
      "source_tokens": 128,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 2952790016.0,
      "median_us": 1389.0,
      "min_us": 1361.0,
      "max_us": 2643.5,
      "stdev_us": 547.9075651969043,
      "samples": 5,
      "tflops": 2.1258387444204465,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_up_or_gate|T128|M128|N5632|K2048"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 5632,
      "source_tokens": 128,
      "num_experts": 60,
      "top_k": 4,
      "device": "gpu",
      "kernel": "flashinfer_cutlass_nvfp4",
      "flops": 2952790016.0,
      "median_us": 24.383999407291412,
      "min_us": 22.048000246286392,
      "max_us": 27.295999228954315,
      "stdev_us": 0.9589140612155698,
      "samples": 100,
      "tflops": 121.0953940196145,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T128|M128|N2048|K5632"
    },
    {
      "model": "Qwen1.5-MoE-A2.7B",
      "family": "moe",
      "op": "moe_shared_down",
      "tokens": 128,
      "m": 128,
      "n": 2048,
      "k": 5632,
      "source_tokens": 128,
      "num_experts": 60,
      "top_k": 4,
      "device": "cpu",
      "kernel": "ik_llama_mxfp4",
      "flops": 2952790016.0,
      "median_us": 1508.0,
      "min_us": 1406.0,
      "max_us": 2406.5,
      "stdev_us": 422.37148933137047,
      "samples": 5,
      "tflops": 1.9580835649867374,
      "shape_key": "Qwen1.5-MoE-A2.7B|moe_shared_down|T128|M128|N2048|K5632"
    }
  ]
}