{
  "cases": [
    {
      "category": "instruction_as_content",
      "expected": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
      "id": "literal_pr_review_instruction",
      "notes": "Catch refusals or meta responses where the model treats dictated text as an instruction to execute.",
      "transcript": "please check the open PR number 4 for code review comments and check each one that's still open if it needs to be fixed period fix it and then resolve the comments and push your fix"
    },
    {
      "category": "instruction_as_content",
      "expected": "Run cargo test, fix any failing tests, then commit and push the branch.",
      "id": "literal_agent_instruction",
      "notes": "Preserve command-like content as text.",
      "transcript": "run cargo test comma fix any failing tests comma then commit and push the branch period"
    },
    {
      "category": "instruction_as_content",
      "expected": "What is the capital of France?",
      "id": "literal_question_no_answer",
      "notes": "Catch models that answer Paris instead of formatting the dictated question.",
      "transcript": "what is the capital of france question mark"
    },
    {
      "category": "instruction_as_content",
      "expected": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
      "id": "literal_email_request",
      "notes": "Format an email request without generating a new email body.",
      "transcript": "write an email to maya comma subject colon quarterly update period the meeting moved to thursday at 9 30 a m comma the budget is twelve thousand four hundred fifty dollars comma and the website is chirper dot local slash launch period"
    },
    {
      "category": "mixed_formatting",
      "expected": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
      "id": "onboarding_mixed_format",
      "notes": "Catch models that do only one part of a mixed list/email/punctuation/technical-casing task.",
      "transcript": "hello chirper period i need to write down accent friendly words period this is a bullet point list with title accent friendly words colon water comma tomato comma schedule comma data comma router comma aluminium comma privacy period end of list new paragraph please write an email to maya comma subject colon quarterly update period the meeting moved to thursday at nine thirty a m comma the budget is twelve thousand four hundred fifty dollars comma and the website is chirper dot local slash launch period new paragraph in the deployment notes comma mention that systemd keeps the chirper services running comma and we should also look at postgresql comma ffmpeg comma gnome comma nextcloud comma and tailscale period finish with thanks exclamation mark"
    },
    {
      "category": "markdown",
      "expected": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
      "id": "markdown_checklist_instruction_content",
      "notes": "Format markdown without trying to publish anything.",
      "transcript": "make a markdown checklist titled release tasks colon item one run tests item two push tag item three publish release"
    },
    {
      "category": "code_identifiers",
      "expected": "cargo test | tee test-output.log",
      "id": "shell_command_as_content",
      "notes": "Preserve shell command structure.",
      "transcript": "cargo test pipe tee space test dash output dot log"
    },
    {
      "category": "spoken_edits",
      "expected": "Call it ChirperBench.",
      "id": "spoken_correction_chain",
      "notes": "Apply the final correction and remove spoken edit words.",
      "transcript": "call it chirper bench no scratch that chirper benchmark spelled capital c chirper capital b bench"
    },
    {
      "category": "urls",
      "expected": "Visit https://docs.python.org/3/library/json.html.",
      "id": "url_https_path",
      "notes": "Format a spoken HTTPS URL with path and punctuation.",
      "transcript": "visit h t t p s colon slash slash docs dot python dot org slash three slash library slash json dot html period"
    },
    {
      "category": "emails",
      "expected": "Send it to alex@chirper.local and cc ops@example.com.",
      "id": "email_addresses",
      "notes": "Format spoken email addresses.",
      "transcript": "send it to alex at chirper dot local and cc ops at example dot com period"
    },
    {
      "category": "casing_identifiers",
      "expected": "The project name is ChirperBench and the class name is DictationFormatter.",
      "id": "product_and_class_casing",
      "notes": "Apply product and code-style casing.",
      "transcript": "the project name is chirper bench and the class name is dictation formatter period"
    },
    {
      "category": "code_identifiers",
      "expected": "Set variable `user_id` equal to `request.user.id`.",
      "id": "inline_code_identifiers",
      "notes": "Format code identifiers and inline code.",
      "transcript": "set variable user underscore id equal to request dot user dot id period"
    },
    {
      "category": "numbers_versions",
      "expected": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
      "id": "numbers_versions_dates",
      "notes": "Format versions, dates, numbers, and language casing.",
      "transcript": "version two point one point zero shipped on june third twenty twenty six and supports python three point eleven plus period"
    },
    {
      "category": "lists",
      "expected": "Shopping list:\n- apples\n- bananas\n- oat milk",
      "id": "simple_list",
      "notes": "Format a simple spoken list.",
      "transcript": "shopping list colon apples comma bananas comma oat milk period"
    },
    {
      "category": "no_change_needed",
      "expected": "The quick brown fox jumps over the lazy dog.",
      "id": "no_change_needed",
      "notes": "Leave already clean text unchanged.",
      "transcript": "The quick brown fox jumps over the lazy dog."
    }
  ],
  "created_at": "2026-06-03T22:52:11",
  "judge_enabled": true,
  "judge_model": "gpt-5.5",
  "judge_reasoning_effort": "high",
  "judge_tier": "standard",
  "models": [
    "rnj-1:8b",
    "laguna-xs.2:latest",
    "nemotron3:33b",
    "ministral-3:14b",
    "nemotron-3-nano:4b",
    "devstral-small-2:24b",
    "qwen3.5:27b",
    "ministral-3:8b",
    "ministral-3:3b",
    "olmo-3.1:32b",
    "gemma4:e4b",
    "gemma4:e2b",
    "qwen3.5:9b",
    "qwen3.5:4b",
    "qwen3.5:0.8b",
    "qwen3.5:2b",
    "olmo2:7b",
    "glm-4.7-flash:latest",
    "granite4.1:30b",
    "gpt-oss:20b",
    "qwen3.6:35b",
    "qwen3.6:27b",
    "gemma4:26b",
    "lfm2:24b",
    "lfm2.5-thinking:1.2b",
    "granite4.1:8b",
    "granite4.1:3b",
    "gemma3:4b",
    "gemma4:latest"
  ],
  "options": {
    "judge_timeout": 300,
    "keep_loaded": false,
    "output_dir": "runs",
    "progress_interval": 30.0,
    "site_dir": "./site",
    "telemetry": "amd-sysfs",
    "telemetry_interval": 0.5,
    "timeout": 180
  },
  "results": [
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 22.598669834000248,
        "errors": [
          {
            "detail": "Explains that no code/text was provided instead of formatting the transcript.",
            "severity": "critical",
            "type": "refusal_or_meta"
          },
          {
            "detail": "Did not apply the required casing, punctuation, and PR number formatting.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "The intended dictated content is absent from the output.",
            "severity": "critical",
            "type": "missing_text"
          },
          {
            "detail": "Added an invented Python code block and explanatory text.",
            "severity": "critical",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "The formatter produced a meta/refusal-style code block instead of the cleaned dictated instruction."
      },
      "judge_status": "ok",
      "latency_seconds": 2.9885,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "```python\n# No code provided in the original transcript. The task was to clean up the transcript text, but there was no actual code or text to clean up. The task description was to check for code review comments and fix them, but no code was provided to review.\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.9903,
        "metrics": {
          "energy_wh_estimate": 0.1019,
          "gpu_busy_percent_avg": 49.7,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 122.71,
          "power_w_peak": 292.0,
          "vram_mb_avg": 8721.5,
          "vram_mb_peak": 11468.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 7,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.771564544999819,
        "errors": [
          {
            "detail": "Sentence should start with capitalized 'Run'.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Content and punctuation are correct; only the initial capitalization is wrong."
      },
      "judge_status": "ok",
      "latency_seconds": 2.5184,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.5195,
        "metrics": {
          "energy_wh_estimate": 0.1065,
          "gpu_busy_percent_avg": 60.9,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 152.14,
          "power_w_peak": 351.0,
          "vram_mb_avg": 8681.1,
          "vram_mb_peak": 11425.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 7,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.425332515000264,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3923,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.394,
        "metrics": {
          "energy_wh_estimate": 0.0607,
          "gpu_busy_percent_avg": 45.0,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 91.33,
          "power_w_peak": 190.0,
          "vram_mb_avg": 8221.4,
          "vram_mb_peak": 11426.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.561493320999944,
        "errors": [
          {
            "detail": "Invented email formatting with subject/body structure instead of preserving the dictated request as final text.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Included explanatory intro and note rather than only cleaned-up final text.",
            "severity": "critical",
            "type": "refusal_or_meta"
          },
          {
            "detail": "Omitted 'Write an email to Maya,' from the intended content.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed 'The meeting moved' to 'The meeting has been moved'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Did not format the budget as $12,450.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Critical failure: generated an email-style response with explanations instead of returning only the cleaned dictated text."
      },
      "judge_status": "ok",
      "latency_seconds": 3.2157,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "Here is the corrected version of the email:\n\nSubject: Quarterly Update\n\nThe meeting has been moved to Thursday at 9:30 AM. The budget is twelve thousand four hundred fifty dollars. The website is chirper.local/launch.\n\nNote: I have corrected the subject line, added proper punctuation, and formatted the time and website URL correctly.\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.2171,
        "metrics": {
          "energy_wh_estimate": 0.1439,
          "gpu_busy_percent_avg": 65.5,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 161.0,
          "power_w_peak": 297.0,
          "vram_mb_avg": 9030.8,
          "vram_mb_peak": 11454.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 14.598928860000342,
        "errors": [
          {
            "detail": "Leaves dictated commands such as 'This is a bullet point list...', 'End of list', and 'Finish with thanks' in the output.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not format the accent-friendly words as a titled bullet list.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Misses required casing for Chirper, Quarterly Update, PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Fails to format time and currency as 9:30 AM and $12,450.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Breaks intended sentence flow around the email subject, budget, and website.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 25,
        "summary": "Preserves much of the content but fails the mixed formatting requirements, leaks spoken instructions, and misses casing, list, number, currency, and technical identifier formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 3.847,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "Hello chirper. I need to write down accent-friendly words. This is a bullet point list with title \"accent-friendly words\": water, tomato, schedule, data, router, aluminium, privacy. End of list. Please write an email to Maya. Subject: Quarterly update. The meeting moved to Thursday at nine thirty a.m. The budget is twelve thousand four hundred fifty dollars. And the website is chirper.local/launch. In the deployment notes, mention that systemd keeps the chirper services running, and we should also look at postgresql, ffmpeg, gnome, nextcloud, and tailscale. Finish with thanks!\n\n",
      "passed": false,
      "score": 25,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.8486,
        "metrics": {
          "energy_wh_estimate": 0.1698,
          "gpu_busy_percent_avg": 59.9,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 158.89,
          "power_w_peak": 293.0,
          "vram_mb_avg": 9315.9,
          "vram_mb_peak": 11452.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 7.955427245000465,
        "errors": [
          {
            "detail": "Did not format items as markdown checklist entries with '- [ ]'.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted the title 'Release Tasks:'.",
            "severity": "major",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Output preserves the three tasks but misses the required title and markdown checklist syntax."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4589,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "- Run tests\n- Push tag\n- Publish release\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4604,
        "metrics": {
          "energy_wh_estimate": 0.0609,
          "gpu_busy_percent_avg": 42.2,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 89.17,
          "power_w_peak": 167.0,
          "vram_mb_avg": 8248.1,
          "vram_mb_peak": 11451.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.699250237000342,
        "errors": [
          {
            "detail": "Inserted `-a`, changing `tee` from overwrite/write behavior to append behavior.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Output includes `-a`, which was not dictated.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 75,
        "summary": "Added an extra command flag that changes the shell command semantics."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4331,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "cargo test | tee -a test-output.log\n\n",
      "passed": false,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4343,
        "metrics": {
          "energy_wh_estimate": 0.0601,
          "gpu_busy_percent_avg": 42.8,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 88.83,
          "power_w_peak": 167.0,
          "vram_mb_avg": 8248.3,
          "vram_mb_peak": 11456.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 9.81732080799975,
        "errors": [
          {
            "detail": "Rendered the corrected identifier as two words, 'Chirper Benchmark,' instead of 'ChirperBench.'",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted 'Call it' from the final text.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Output changes the intended identifier and omits surrounding dictated text and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3757,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "Chirper Benchmark\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.377,
        "metrics": {
          "energy_wh_estimate": 0.0578,
          "gpu_busy_percent_avg": 47.5,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 87.5,
          "power_w_peak": 140.0,
          "vram_mb_avg": 8325.1,
          "vram_mb_peak": 11529.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 8.797570603000167,
        "errors": [
          {
            "detail": "Sentence should start with capitalized 'Visit'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Final period was omitted.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "URL content is correct, but sentence casing and final punctuation were missed."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4587,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "visit https://docs.python.org/3/library/json.html\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4599,
        "metrics": {
          "energy_wh_estimate": 0.0691,
          "gpu_busy_percent_avg": 48.2,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 101.17,
          "power_w_peak": 236.0,
          "vram_mb_avg": 8338.4,
          "vram_mb_peak": 11546.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 7.955995819000236,
        "errors": [
          {
            "detail": "Spoken email addresses were not converted to alex@chirper.local and ops@example.com.",
            "severity": "critical",
            "type": "formatting_miss"
          },
          {
            "detail": "Left the word 'period' instead of ending with a period.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Sentence should start with 'Send'.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Did not format spoken email addresses or punctuation/casing; output was essentially unchanged."
      },
      "judge_status": "ok",
      "latency_seconds": 2.5191,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "send it to alex at chirper dot local and cc ops at example dot com period\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.5208,
        "metrics": {
          "energy_wh_estimate": 0.1066,
          "gpu_busy_percent_avg": 62.0,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 152.29,
          "power_w_peak": 339.0,
          "vram_mb_avg": 8804.8,
          "vram_mb_peak": 11551.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 7,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.988754678000078,
        "errors": [
          {
            "detail": "Used spaced title case instead of required identifier casing: ChirperBench and DictationFormatter.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Failed to format product and class names as code-style identifiers without spaces.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 60,
        "summary": "Preserved the sentence and punctuation, but missed required product and class identifier casing."
      },
      "judge_status": "ok",
      "latency_seconds": 2.5243,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is Dictation Formatter.\n\n",
      "passed": false,
      "score": 60,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.5255,
        "metrics": {
          "energy_wh_estimate": 0.1066,
          "gpu_busy_percent_avg": 59.9,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 152.0,
          "power_w_peak": 336.0,
          "vram_mb_avg": 8796.9,
          "vram_mb_peak": 11543.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 7,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.799946412999816,
        "errors": [
          {
            "detail": "Changed 'Set variable ... equal to ...' into an assignment statement.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Used a fenced code block instead of inline code formatting.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted the words 'Set variable' and 'equal to'.",
            "severity": "minor",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Converted the dictated instruction into a Python code block instead of preserving the sentence with inline code identifiers."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4734,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "```python\nuser_id = request.user.id\n```\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4746,
        "metrics": {
          "energy_wh_estimate": 0.0793,
          "gpu_busy_percent_avg": 57.7,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 115.33,
          "power_w_peak": 318.0,
          "vram_mb_avg": 8325.3,
          "vram_mb_peak": 11541.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 9.944711776000076,
        "errors": [
          {
            "detail": "Added a comma after the date before 'and' that was not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 96,
        "summary": "Content, casing, versions, date, and number formatting are correct; only an extra comma differs from expected punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 2.6157,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 96,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.6171,
        "metrics": {
          "energy_wh_estimate": 0.0898,
          "gpu_busy_percent_avg": 50.3,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 123.57,
          "power_w_peak": 298.0,
          "vram_mb_avg": 8810.1,
          "vram_mb_peak": 11557.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 7,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 9.091663647999667,
        "errors": [
          {
            "detail": "The heading should be formatted as 'Shopping list:' followed by bullet items, not included as a list item.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "The heading should start with a capital letter.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "The dictated colon after 'shopping list' was omitted.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 75,
        "summary": "The list items are present, but the heading was incorrectly made a bullet and casing/colon formatting were missed."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4815,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "- shopping list\n- apples\n- bananas\n- oat milk\n\n",
      "passed": false,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4827,
        "metrics": {
          "energy_wh_estimate": 0.0834,
          "gpu_busy_percent_avg": 54.8,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 121.0,
          "power_w_peak": 321.0,
          "vram_mb_avg": 8391.8,
          "vram_mb_peak": 11609.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 5.935613886999818,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected clean text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4446,
      "model": "rnj-1:8b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4461,
        "metrics": {
          "energy_wh_estimate": 0.0792,
          "gpu_busy_percent_avg": 51.8,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 116.67,
          "power_w_peak": 317.0,
          "vram_mb_avg": 8391.6,
          "vram_mb_peak": 11596.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.990392075999807,
        "errors": [
          {
            "detail": "Includes large explanatory reasoning and stray text such as \"Six\" and \"ist\" before the final transcript.",
            "severity": "critical",
            "type": "extra_text"
          },
          {
            "detail": "Invents analysis and decision-making content beyond the requested cleaned final text.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Final cleaned line uses \"PR number 4\" instead of expected \"PR #4\" and misses the comma in \"Fix it, then\".",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Critical failure: output includes extensive meta reasoning and extra invented text instead of only the cleaned transcript."
      },
      "judge_status": "ok",
      "latency_seconds": 83.521,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": " Six\nOkay, let me try to figure out how to approach this. The user wants me to fix transcription errors and human-made mistakes in a transcript. The original transcript is: \"please check the open PR number 4 for code review comments and check each one that's still open if it needs to be fixed period fix it and then resolve the comments and push your fix\". \n\nFirst, I need to identify any errors in the transcription. Let me read through it again. The first part seems okay, but there might be some punctuation issues. The phrase \"if it needs to be fixed period fix it\" is a bit confusing. The user might have meant to separate the clauses with a comma or a semicolon. Also, the word \"period\" here is probably meant as a full stop, so maybe replacing \"period\" with a period and adjusting the sentence structure.\n\nSo, \"if it Six needs to be fixed. Fix it and then resolve the comments and push your fix.\" Wait, the original has \"fix it and then resolve...\" but the user might have intended \"fix it and then resolve the comments and push your fix.\" However, in the original, after \"fixed period fix it,\" maybe the \"period\" is a typo or a misstatement. Perhaps the user meant to say \"if it needs to be fixed, fix it and then resolve the comments and push your fix.\" \n\nLetting me restructure that. The original sentence is a bit run-on. Maybe adding a comma after \"fixed\" and replacing \"period\" with a period. So it would be: \"if it needs to be fixed. Fix it and then resolve the comments and push your fix.\" But that might not be correct. Alternatively, maybe the user meant to say \"if it needs to be fixed, fix it and then resolve the comments and push your fix.\" So replacing \"period\" with a comma. \n\nAlso, the word \"fix\" is repeated. Maybe \"fix it and then resolve the comments and push your fix.\" But \"fix\" is used twice. Perhaps the user intended to say \"fix the issues and then resolve the comments and push your fix.\" But without more context, I can't be sure. \n\nAnother thing is the capitalization. The original starts with \"please check...\" which is okay, but maybe \"PR number ed\" is a typo. Wait, the original says \"PR number 4\" so that's correct. \n\nSo putting it all together, the corrected version would be: \"Please check the open PR number 6 for code review comments and check each one that's still open if it needs to be fixed. Fix it and then resolve the comments and push your fix.\" \n\nWait, the user mentioned \"PR number 4\" in the original, but in my first attempt I wrote \"number 6.\" That was a mistake. Let me correct that. The original is \"number 4,\" so the fixed version should keep that. \n\nSo, final version: \"Please check the open PR number 4 for code review comments and check each one that's still open if it needs to be fixed. Fix it and then resolve the comments and push your fix.\" \n\nI think that's better. The run-on sentence is split into two with a period, and the \"period\" is replaced with a period. Also, the capitalization at the start is fixed. The repetition of \"fix\" is okay because it's part of the instruction. \n\nI should also check for any other possible errors. The rest of the sentence seems okay. The user might have intended to use \"fix\" as a verb, so that's correct. \n\nSo the final cleaned-up text is as above.ist\nPlease check the open PR number 4 for code review comments and check each one that's still open if it needs to be fixed. Fix it and then resolve the comments and push your fix.\n\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 83.524,
        "metrics": {
          "energy_wh_estimate": 1.8695,
          "gpu_busy_percent_avg": 29.0,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 80.58,
          "power_w_peak": 121.0,
          "vram_mb_avg": 23697.2,
          "vram_mb_peak": 24033.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 168,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 12.436346163000053,
        "errors": [
          {
            "detail": "Added markdown code formatting around cargo test even though it was not dictated.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 94,
        "summary": "Content, casing, and punctuation are preserved; only unnecessary code formatting was added around cargo test."
      },
      "judge_status": "ok",
      "latency_seconds": 7.8814,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": "\nRun `cargo test`, fix any failing tests, then commit and push the branch.\n\n\n",
      "passed": true,
      "score": 94,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.883,
        "metrics": {
          "energy_wh_estimate": 0.1789,
          "gpu_busy_percent_avg": 54.8,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 81.71,
          "power_w_peak": 120.0,
          "vram_mb_avg": 21170.8,
          "vram_mb_peak": 24155.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 17,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 8.181004539999776,
        "errors": [
          {
            "detail": "Explains the formatting process instead of outputting only final text.",
            "severity": "critical",
            "type": "refusal_or_meta"
          },
          {
            "detail": "Includes stray words like \"six\" and \"Six\" plus explanatory paragraphs.",
            "severity": "critical",
            "type": "extra_text"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Output includes extensive meta explanation and extra text instead of returning only the cleaned transcript."
      },
      "judge_status": "ok",
      "latency_seconds": 29.3395,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": " six\nOkay, let's see. The user provided a transcript that says, \"what is the capital of france question mark\". My job is to fix any transcription errors and human-made mistakes.\n\nFirst, I notice that \"question mark\" is written out as words instead of using the actual symbol. The user probably meant to end the question with a question mark. So I should replace \"question mark\" with \"?\". \n\nNext, checking for any other possible errors. The rest of the sentence seems straightforward. The capitalization and punctuation look okay except for the missing question mark. The word \"france\" is correctly spelled. \n\nI should also make sure that the sentence is properly formatted. The original starts with a lowercase \"w\" in \"what\", but since it's a question, maybe it should be capitalized. However, in English, questions can start with a lowercase letter if it's part of a larger sentence, but in this case, it's a standalone question. So I should capitalize the first letter. \n\nPutting it all together, the corrected version should be \"What is the capital of France?\" with proper capitalization and the question mark. Six\nWhat is the capital of France?\n\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 29.3419,
        "metrics": {
          "energy_wh_estimate": 0.6242,
          "gpu_busy_percent_avg": 37.7,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 76.58,
          "power_w_peak": 118.0,
          "vram_mb_avg": 23270.0,
          "vram_mb_peak": 24235.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 60,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 13.380997463000313,
        "errors": [
          {
            "detail": "Adds stray leading text: \"ly\".",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Fails to capitalize sentence starts, name, subject words, weekday, and AM.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Does not format the budget as $12,450.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Uses a.m. instead of expected AM style.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 58,
        "summary": "Preserves most dictated content and avoids generating an email body, but adds stray text and misses required casing and currency formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 10.8602,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": "ly\nwrite an email to maya, subject: quarterly update. the meeting moved to thursday at 9:30 a.m., the budget is twelve thousand four hundred fifty dollars, and the website is chirper.local/launch.\n\n\n",
      "passed": false,
      "score": 58,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.8626,
        "metrics": {
          "energy_wh_estimate": 0.2436,
          "gpu_busy_percent_avg": 49.0,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 80.74,
          "power_w_peak": 169.0,
          "vram_mb_avg": 21949.2,
          "vram_mb_peak": 24210.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 23,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 14.377779111999189,
        "errors": [
          {
            "detail": "Adds unexplained leading text: \"Six final text.\"",
            "severity": "critical",
            "type": "extra_text"
          },
          {
            "detail": "Leaves the spoken list instruction in the prose instead of converting it to a titled bullet list.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not format the accent-friendly words as a markdown bullet list.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Handles some casing and URL formatting but misses list formatting, currency formatting, and intended punctuation flow.",
            "severity": "major",
            "type": "partial_mixed_task"
          },
          {
            "detail": "Budget remains written out instead of formatted as $12,450.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changes \"Thanks!\" to \"Thank you!\" and separates it into its own paragraph.",
            "severity": "minor",
            "type": "semantic_change"
          },
          {
            "detail": "Splits several intended comma-linked clauses into separate sentences.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Mixed-format task was only partially cleaned: it missed the bullet list, leaked dictated structure text, failed currency formatting, added extra text, and changed the closing."
      },
      "judge_status": "ok",
      "latency_seconds": 18.1175,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": " Six final text.\n\nHello, Chirper. I need to write down accent-friendly words. This is a bullet point list with title \"Accent Friendly Words\": water, tomato, schedule, data, router, aluminium, privacy.\n\nPlease write an email to Maya. Subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM. The budget is twelve thousand four hundred fifty dollars. The website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running. We should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nThank you!\n\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 18.1194,
        "metrics": {
          "energy_wh_estimate": 0.4081,
          "gpu_busy_percent_avg": 41.8,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 81.08,
          "power_w_peak": 150.0,
          "vram_mb_avg": 22841.6,
          "vram_mb_peak": 24250.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 38,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 8.179589657998804,
        "errors": [
          {
            "detail": "Used '# Release Tasks' instead of the expected 'Release Tasks:' title line.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted the colon after the title.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "Content and checklist items are preserved, but the dictated title with colon was changed into a Markdown heading without the colon."
      },
      "judge_status": "ok",
      "latency_seconds": 8.6716,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": "\n# Release Tasks\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release\n\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.6732,
        "metrics": {
          "energy_wh_estimate": 0.2022,
          "gpu_busy_percent_avg": 51.7,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 83.95,
          "power_w_peak": 137.0,
          "vram_mb_avg": 21517.4,
          "vram_mb_peak": 24329.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 19,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.652663903001667,
        "errors": [
          {
            "detail": "Output includes extensive reasoning/explanation instead of only cleaned final text.",
            "severity": "critical",
            "type": "refusal_or_meta"
          },
          {
            "detail": "Adds large amounts of unrelated text before the final command.",
            "severity": "critical",
            "type": "extra_text"
          },
          {
            "detail": "Final command uses '- space test - output.log' instead of the intended filename 'test-output.log'.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changes the shell command structure and arguments.",
            "severity": "major",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Critical meta/explanation and incorrect command formatting; did not preserve the intended shell command."
      },
      "judge_status": "ok",
      "latency_seconds": 104.0301,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": "icera\nOkay, let me try to figure out how to handle this. The user provided a transcript that looks like a series of commands or file names. The task is to fix transcription errors and apply any spoken edit commands.\n\nFirst, I need to parse each part of the transcript. Let's break it down:\n\n1. \"cargo test\" \u2013 This is a common Rust command, so it's probably correct.\n2. \"pipe tee space test dash output dot log\" \u2013 This seems like a command line with options. Let me think. In Unix-like systems, \"pipe\" (|) is used to pass output from one command to another. \"tee\" is a command that reads from stdin and writes to both stdout and a file. \"space\" might be a separator, but in command line, spaces are usually just spaces. \"test dash\" could be options like -t or --test. \"output dot log\" might be a file name like \"output.log\".\n\nPutting it together: \"cargo test | tee - space test - output.log\". Wait, that doesn't make sense. Maybe \"space test\" is part of the command. Let6t me check. The original might be \"cargo test | tee - space test - output.log\". But \"space test\" isn't a standard option. Maybe \"space\" is a typo for \"space\" as in the space character, but in command line, you don't usually spell out \"space\". Alternatively, maybe \"space\" is part of a flag. For example, \"tee -s\" for space? Not sure. Alternatively, maybe \"space\" is a mishearing of \"space\" as in the directory or file name. \n\nAlternatively, perhaps the user meant \"cargo test | tee -s test -o output.log\". But the original has \"space test dash output dot log\". Let Six understand the structure. The user might have said \"cargo test | tee - space test - output.log\". But \"space\" here is unclear. Maybe \"space\" is a mishearing of \"s\" for the -s option. Or maybe \"space\" is part of a file name. \n\nAlternatively, maybe the user was trying to write \"cargo test | tee - space test - output.log\" where \"space\" is a directory or part of the command. But I'm not sure. Let me think of common commands. If \"tee\" is used with options, maybe \"tee -a\" for append. But the user said \"space test dash output dot log\". Maybe \"space\" is a separator. \n\nWait, perhaps the user was trying to write \"cargo test | tee -s test -o output.log\". But \"space test\" is confusing. Maybe \"space\" is a typo for \"s\", so \"s test - o output.log\". But that still doesn't fit. Alternatively, maybe \"space\" is part of a file name. For example, \"cargo test | tee - space test - output.log\" where \"space\" is a file or directory name. \n\nAlternatively, maybe the user was trying to write \"cargo test | tee - space test - output.log\" where \"space\" is a flag. But I'm not sure. Let me check if there's a standard way to use \"tee\" with space. The \"tee\" command typically takes options like -a (append), -i (ignore interrupts), etc. Maybe \"space\" is a mishearing of \"i\", but that's a stretch. \n\nAlternatively, maybe the user meant to write \"cargo test | tee space test - output.log\". But that would be \"tee space test - output.log\", which doesn't make sense. \n\nGiven the uncertainty, perhaps the best approach is to keep \"cargo test | tee - space test - output.log\" but check if \"space\" is a valid part of the command. If not, maybe it's a typo. Alternatively, maybe \"space\" is part of a file name. \n\nSince the user might have made a mistake, perhaps \"space\" is a mishearing of \"s\", leading to \"cargo test | tee -s test - output.log\". But \"test - output.log\" still isn't a standard option. 6 Alternatively, maybe \"space test\" is a directory or a parameter. \n\nGiven the ambiguity, I'll proceed with the most likely interpretation, which is that the user intended to write a command that pipes the output of \"cargo test\" to \"tee\" with some options and then to a file. The part \"space test dash output dot log\" might be \" - space test - output.log\", but I'm not sure. \n\nSince the user mentioned applying spoken edit commands, perhaps \"space\" is a separator. In that case, maybe it's \"cargo test | tee - space test - output.log\", where \"space\" is a directory or a parameter. \n\nGiven that, I'll present the cleaned-up version as \"cargo test | tee - space test - output.log\". However, if \"space\" is a typo, it might need correction. But without more context, this is the best I can do. six\ncargo test | tee - space test - output.log\n\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 104.0317,
        "metrics": {
          "energy_wh_estimate": 2.2277,
          "gpu_busy_percent_avg": 30.1,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 77.09,
          "power_w_peak": 136.0,
          "vram_mb_avg": 23945.4,
          "vram_mb_peak": 24270.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 209,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 24.903262920999623,
        "errors": [
          {
            "detail": "Explains how it parsed the transcript instead of returning only cleaned final text.",
            "severity": "critical",
            "type": "refusal_or_meta"
          },
          {
            "detail": "Includes unrelated text such as 'six' and multiple explanatory paragraphs.",
            "severity": "critical",
            "type": "extra_text"
          },
          {
            "detail": "Leaves spoken edit phrases like 'scratch that' and capitalization commands in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Omits the intended phrase 'Call it' and changes ChirperBench into 'Chirper Benchmark'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Fails to format the identifier as the compound name ChirperBench.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing final period in the final cleaned text.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Output contains extensive meta-explanation and extra text, leaks spoken edit commands, and the final cleaned text is incorrect."
      },
      "judge_status": "ok",
      "latency_seconds": 32.4038,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": " six\nOkay, let's tackle this. The user provided a transcript with some corrections. The original text is: \"call it chirper bench no scratch that chirper benchmark spelled capital c chirper capital b bench\". \n\nFirst, I need to parse the spoken edits. The user starts by saying \"call it chirper bench\" but then corrects to \"chirper benchmark\". Then they mention \"spelled capital c chirper capital b bench\". \n\nSo, the initial part is \"call it chirper bench\", but they cross that out with \"no scratch that\" and replace it with \"chirper benchmark\". Then, they specify spelling: \"capital c chirper\" and \"capital b bench\". \n\nApplying these edits, the main name should be \"Chirper Benchmark\". The user wants capitalization for \"Chirper\" and \"Benchmark\". The \"spelled capital c\" probably means \"Chirper\" with a capital C, and \"capital b\" for \"Benchmark\" with a capital B. \n\nSo putting it all together, the final text should be \"Chirer Benchmark\" with correct capitalization. Wait, no, the original correction was \"chirper benchmark\", but the user specified \"capital c chirper\" so \"Chirper\", and \"capital b bench\" so \"Benchmark\". \n\nTherefore, the cleaned-up version is \"Chirper Benchmark\". six\nChirper Benchmark\n\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 32.4052,
        "metrics": {
          "energy_wh_estimate": 0.7145,
          "gpu_busy_percent_avg": 36.4,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 79.38,
          "power_w_peak": 124.0,
          "vram_mb_avg": 23425.8,
          "vram_mb_peak": 24268.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 66,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 9.792566076001094,
        "errors": [
          {
            "detail": "Explains how it is formatting the transcript instead of returning only final cleaned text.",
            "severity": "critical",
            "type": "refusal_or_meta"
          },
          {
            "detail": "Includes stray text such as 'ing' and 'Six' plus multiple explanatory paragraphs.",
            "severity": "critical",
            "type": "extra_text"
          },
          {
            "detail": "Omits the leading word 'Visit' from the intended sentence.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Does not return the full sentence with final period as expected.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Critical meta explanation and extra invented text; final content is incomplete and not the required cleaned sentence."
      },
      "judge_status": "ok",
      "latency_seconds": 30.7173,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": "ing\nOkay, let's tackle this. The user provided a transcript that looks like a URL with spaces between each character. My job is to fix that.\n\nFirst, I need to remove the spaces. The original is \"h t t p s colon slash slash docs dot python dot org slash three slash library slash json dot html period\". So replacing each space with nothing should give \"https://docs.python.org/3/library/json.html\".\n\nWait, let me check. \"h t t p s\" becomes \"https\", then \"colon\" is \":\", \"slash slash\" is \"//\", \"docs\" stays, \"dot\" is \".\", \"python\" stays, \"dot\" again, \"org\", then \"slash three\" is \"/3\", \"slash library\" is \"/library\", \"slash json\" is \"/json\", and \"dot html period\" is \".html\". So putting it all together: \"https://docs.python.org/3/library/json.html\". That makes sense.\n\nI should also make sure there are no extra spaces or typos. The user might have meant to write \"https://docs.python.org/3/library/json.html\" which is the correct URL for Python's JSON library documentation. So the cleaned-up version should be that URL without any spaces or incorrect characters. Six\nhttps://docs.python.org/3/library/json.html\n\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 30.7197,
        "metrics": {
          "energy_wh_estimate": 0.6607,
          "gpu_busy_percent_avg": 35.8,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 77.43,
          "power_w_peak": 137.0,
          "vram_mb_avg": 23391.8,
          "vram_mb_peak": 24276.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 63,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 9.689096248001078,
        "errors": [
          {
            "detail": "Explains the formatting process instead of returning only final cleaned text.",
            "severity": "critical",
            "type": "refusal_or_meta"
          },
          {
            "detail": "Adds timestamp, reasoning, and commentary around the formatted sentence.",
            "severity": "critical",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Output is an explanation/meta response instead of only the cleaned final text, despite containing the correct sentence inside it."
      },
      "judge_status": "ok",
      "latency_seconds": 33.5849,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": " SixFourEight 2024-03-14 10:25:34\nOkay, I need to fix the transcription errors in this transcript. Let me read through it carefully.\n\nThe original transcript says: \"send it to alex at chirper dot local and cc ops at example dot com period\"\n\nFirst, I notice that \"chirper dot local\" and \"example dot com\" are written out in a spoken format. Typically, in written communication, especially emails, we use the @ symbol instead of \"at\" and we don't use \"dot\" when writing email addresses. So I should replace \"at\" with \"@\" and remove \"dot\" to form proper email addresses.\n\nNext, the word \"period\" at the end seems like a spoken edit. The speaker might have meant to end the sentence with a period, but in written form, we just use the actual punctuation mark. So I should replace \"period\" with a period.\n\nAlso, I need to check for any other possible errors. The names \"alex\" and \"ops\" seem correctly spelled. The words \"send it to\" and \"cc\" are correctly used in an email context. The structure of the sentence is clear: send to one email and cc another.\n\nSo applying these changes, the corrected version should be: \"Send it to alex@chirper.local and cc ops@example.com.\" I'll make sure to capitalize the first letter of the sentence and ensure proper spacing and punctuation.\n\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 33.589,
        "metrics": {
          "energy_wh_estimate": 0.7127,
          "gpu_busy_percent_avg": 32.4,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 76.39,
          "power_w_peak": 128.0,
          "vram_mb_avg": 23450.1,
          "vram_mb_peak": 24268.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 69,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.040668811001524,
        "errors": [
          {
            "detail": "Project name should be formatted as ChirperBench, not Chirper Bench.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Adds an extra leading '6' before the cleaned text.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 60,
        "summary": "The output mostly preserves the sentence but misses required product-style casing and adds an extra leading character."
      },
      "judge_status": "ok",
      "latency_seconds": 7.6028,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": "6\nThe project name is Chirper Bench and the class name is DictationFormatter.\n\n\n",
      "passed": false,
      "score": 60,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.6041,
        "metrics": {
          "energy_wh_estimate": 0.1701,
          "gpu_busy_percent_avg": 55.1,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 80.53,
          "power_w_peak": 112.0,
          "vram_mb_avg": 21253.8,
          "vram_mb_peak": 24266.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 17,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 10.103432904999863,
        "errors": [
          {
            "detail": "Adds an unrelated leading '6'.",
            "severity": "critical",
            "type": "extra_text"
          },
          {
            "detail": "Formats the entire sentence as inline code instead of only `user_id` and `request.user.id`.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Starts with lowercase 'set' instead of 'Set'.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Output adds extraneous text, wrong casing, and wraps the whole sentence in code instead of only identifiers."
      },
      "judge_status": "ok",
      "latency_seconds": 7.497,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": "6\n`set variable user_id equal to request.user.id`\n\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.4998,
        "metrics": {
          "energy_wh_estimate": 0.1595,
          "gpu_busy_percent_avg": 60.5,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 76.56,
          "power_w_peak": 114.0,
          "vram_mb_avg": 21028.9,
          "vram_mb_peak": 24214.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 16,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 10.415838888000508,
        "errors": [
          {
            "detail": "Output includes leading/trailing blank lines around the final text.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 98,
        "summary": "Content, casing, punctuation, dates, versions, and language name are correct; only extra surrounding blank lines appear."
      },
      "judge_status": "ok",
      "latency_seconds": 8.6173,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": "\n\nVersion 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.\n\n\n",
      "passed": true,
      "score": 98,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.6199,
        "metrics": {
          "energy_wh_estimate": 0.201,
          "gpu_busy_percent_avg": 50.9,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 83.95,
          "power_w_peak": 202.0,
          "vram_mb_avg": 21542.7,
          "vram_mb_peak": 24258.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 19,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 8.962437047001004,
        "errors": [
          {
            "detail": "Expected each shopping item as a separate bullet list item.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Content is preserved, but the spoken list was not formatted as a markdown list after the colon."
      },
      "judge_status": "ok",
      "latency_seconds": 7.1496,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": "\n\nShopping list: apples, bananas, oat milk.\n\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.1515,
        "metrics": {
          "energy_wh_estimate": 0.1748,
          "gpu_busy_percent_avg": 57.9,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 88.0,
          "power_w_peak": 152.0,
          "vram_mb_avg": 21082.6,
          "vram_mb_peak": 24246.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 16,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 6.104307308998614,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output preserves the clean text unchanged."
      },
      "judge_status": "ok",
      "latency_seconds": 7.0737,
      "model": "laguna-xs.2:latest",
      "ollama_status": "ok",
      "output": "\nThe quick brown fox jumps over the lazy dog.\n\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.076,
        "metrics": {
          "energy_wh_estimate": 0.1729,
          "gpu_busy_percent_avg": 57.0,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 88.0,
          "power_w_peak": 154.0,
          "vram_mb_avg": 21124.4,
          "vram_mb_peak": 24308.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 16,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.793109955000546,
        "errors": [
          {
            "detail": "Changes the sequence so comments are resolved and pushed if they need fixing, without explicitly saying to fix them first.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted the separate command: \u201cFix it, then...\u201d",
            "severity": "major",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 78,
        "summary": "Mostly cleaned, but it drops the explicit instruction to fix needed issues before resolving comments."
      },
      "judge_status": "ok",
      "latency_seconds": 36.6626,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "Please check the open PR #4 for code review comments. Check each one that is still open; if it needs to be fixed, resolve the comments and push your fix.\n\n",
      "passed": false,
      "score": 78,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 36.6642,
        "metrics": {
          "energy_wh_estimate": 0.6177,
          "gpu_busy_percent_avg": 38.0,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 60.65,
          "power_w_peak": 230.0,
          "vram_mb_avg": 21379.9,
          "vram_mb_peak": 23846.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 75,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 5.275015987999723,
        "errors": [],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 35.4505,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "Run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 35.4529,
        "metrics": {
          "energy_wh_estimate": 0.6175,
          "gpu_busy_percent_avg": 37.7,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 62.71,
          "power_w_peak": 195.0,
          "vram_mb_avg": 21615.4,
          "vram_mb_peak": 23894.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 72,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.102368290001323,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Perfectly preserved and formatted the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 20.8442,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 20.8464,
        "metrics": {
          "energy_wh_estimate": 0.3051,
          "gpu_busy_percent_avg": 43.4,
          "gpu_busy_percent_peak": 85.0,
          "power_w_avg": 52.7,
          "power_w_peak": 146.0,
          "vram_mb_avg": 20151.0,
          "vram_mb_peak": 23854.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 43,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.221238049000021,
        "errors": [
          {
            "detail": "Invented email headers, greeting, and body formatting.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Changed 'the meeting moved' to 'has been moved' and altered sentence structure.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Failed to keep the instruction-as-content format from the expected output.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 20,
        "summary": "Generated an email body instead of preserving the dictated request as cleaned text."
      },
      "judge_status": "ok",
      "latency_seconds": 55.6036,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "To: Maya  \nSubject: Quarterly Update  \n\nHi Maya,\n\nThe meeting has been moved to Thursday at 9:30\u202fa.m. The budget is $12,450. The website is chirper.local/launch.\n\n",
      "passed": false,
      "score": 20,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 55.6056,
        "metrics": {
          "energy_wh_estimate": 1.008,
          "gpu_busy_percent_avg": 34.6,
          "gpu_busy_percent_peak": 84.0,
          "power_w_avg": 65.26,
          "power_w_peak": 168.0,
          "vram_mb_avg": 22368.0,
          "vram_mb_peak": 23898.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 112,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 15.073962758000562,
        "errors": [
          {
            "detail": "Changed the inline email instruction with subject into separate sentences/paragraphs and split comma-linked content into separate sentences.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Formatted 9:30 AM as 9:30 a.m. instead of the expected uppercase AM.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Used nonstandard spacing/characters such as nonbreaking hyphen and narrow no-break space, and separated Thanks into its own paragraph.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 82,
        "summary": "Mostly preserves content and mixed formatting, but changes the requested inline email/subject structure and time casing/punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 81.3525,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "Hello Chirper. I need to write down accent\u2011friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya.  \nSubject: Quarterly Update.\n\nThe meeting moved to Thursday at 9:30\u202fa.m. The budget is $12,450. The website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nThanks!\n\n",
      "passed": true,
      "score": 82,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 81.3543,
        "metrics": {
          "energy_wh_estimate": 1.5343,
          "gpu_busy_percent_avg": 33.2,
          "gpu_busy_percent_peak": 72.0,
          "power_w_avg": 67.9,
          "power_w_peak": 160.0,
          "vram_mb_avg": 22786.1,
          "vram_mb_peak": 23893.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 164,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 9.447628821000762,
        "errors": [
          {
            "detail": "Wrapped the output in a markdown code block, which is not part of the intended final text.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added markdown heading syntax before the title.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 82,
        "summary": "Content is preserved, but the formatter added a code fence and changed the title into a markdown heading instead of returning the expected cleaned checklist text."
      },
      "judge_status": "ok",
      "latency_seconds": 38.5951,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "```markdown\n## Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release\n```\n\n",
      "passed": false,
      "score": 82,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 38.5972,
        "metrics": {
          "energy_wh_estimate": 0.657,
          "gpu_busy_percent_avg": 37.3,
          "gpu_busy_percent_peak": 79.0,
          "power_w_avg": 61.28,
          "power_w_peak": 143.0,
          "vram_mb_avg": 21768.9,
          "vram_mb_peak": 23872.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 79,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.235605402998772,
        "errors": [
          {
            "detail": "Left the spoken command \"space\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Rendered \"test-output.log\" as \"test - output.log\".",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Preserved the main command and pipe, but leaked a spoken spacing command and malformed the hyphenated log filename."
      },
      "judge_status": "ok",
      "latency_seconds": 102.4078,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "cargo test | tee space test - output.log\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 102.4094,
        "metrics": {
          "energy_wh_estimate": 1.975,
          "gpu_busy_percent_avg": 33.2,
          "gpu_busy_percent_peak": 76.0,
          "power_w_avg": 69.43,
          "power_w_peak": 101.0,
          "vram_mb_avg": 22982.3,
          "vram_mb_peak": 23911.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 206,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 7.955921962999128,
        "errors": [
          {
            "detail": "Leaves spoken correction words such as \"no scratch\" and casing commands in the output.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Outputs multiple discarded versions instead of the intended final phrase.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Fails to combine the spelled identifier as \"ChirperBench\".",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Uses commas and sentence breaks not intended by the dictation.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Critical failure: spoken edit commands were retained and the final corrected identifier was not produced."
      },
      "judge_status": "ok",
      "latency_seconds": 87.4536,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "Call it Chirper Bench, no scratch. That Chirper Benchmark, Capital C Chirper, Capital B Bench.\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 87.4555,
        "metrics": {
          "energy_wh_estimate": 1.6603,
          "gpu_busy_percent_avg": 32.4,
          "gpu_busy_percent_peak": 83.0,
          "power_w_avg": 68.35,
          "power_w_peak": 159.0,
          "vram_mb_avg": 22886.7,
          "vram_mb_peak": 23946.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 176,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 5.319053074999829,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly matches the expected formatted HTTPS URL with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 29.6371,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 29.6401,
        "metrics": {
          "energy_wh_estimate": 0.4656,
          "gpu_busy_percent_avg": 38.0,
          "gpu_busy_percent_peak": 79.0,
          "power_w_avg": 56.56,
          "power_w_peak": 154.0,
          "vram_mb_avg": 21175.5,
          "vram_mb_peak": 23851.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 61,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 8.320294488999934,
        "errors": [
          {
            "detail": "Omitted 'chirper' from alex@chirper.local, producing alex@local.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Sentence should start with 'Send'.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 75,
        "summary": "Email formatting mostly worked, but one address lost the domain segment and sentence casing was missed."
      },
      "judge_status": "ok",
      "latency_seconds": 49.097,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "send it to alex@local and cc ops@example.com.\n\n",
      "passed": false,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 49.0992,
        "metrics": {
          "energy_wh_estimate": 0.8741,
          "gpu_busy_percent_avg": 35.3,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 64.09,
          "power_w_peak": 197.0,
          "vram_mb_avg": 22186.6,
          "vram_mb_peak": 23881.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 99,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 11.288256500998614,
        "errors": [
          {
            "detail": "Used 'Chirper Bench' instead of product identifier 'ChirperBench'.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Used 'Dictation Formatter' instead of class identifier 'DictationFormatter'.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Missed required identifier formatting by leaving spaces in product and class names.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "The text is otherwise cleaned up, but both required identifiers use incorrect code-style/product casing."
      },
      "judge_status": "ok",
      "latency_seconds": 35.222,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is Dictation Formatter.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 35.2245,
        "metrics": {
          "energy_wh_estimate": 0.5724,
          "gpu_busy_percent_avg": 36.5,
          "gpu_busy_percent_peak": 79.0,
          "power_w_avg": 58.5,
          "power_w_peak": 131.0,
          "vram_mb_avg": 21562.4,
          "vram_mb_peak": 23854.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 72,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.801416315000097,
        "errors": [
          {
            "detail": "Code identifiers should be formatted as inline code with backticks.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Sentence should start with a capital letter.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Final period is missing.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Content is preserved, but required casing, punctuation, and inline code formatting are missing."
      },
      "judge_status": "ok",
      "latency_seconds": 28.7724,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "set variable user_id equal to request.user.id\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 28.7746,
        "metrics": {
          "energy_wh_estimate": 0.4332,
          "gpu_busy_percent_avg": 38.6,
          "gpu_busy_percent_peak": 84.0,
          "power_w_avg": 54.2,
          "power_w_peak": 197.0,
          "vram_mb_avg": 21099.0,
          "vram_mb_peak": 23884.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 59,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 7.576598585999818,
        "errors": [
          {
            "detail": "Added a comma after \"2026\" that was not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Content, casing, versions, date, and Python number formatting are correct; only an extra comma was added after the date."
      },
      "judge_status": "ok",
      "latency_seconds": 37.2692,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 37.2711,
        "metrics": {
          "energy_wh_estimate": 0.6148,
          "gpu_busy_percent_avg": 37.1,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 59.38,
          "power_w_peak": 235.0,
          "vram_mb_avg": 21703.3,
          "vram_mb_peak": 23892.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 76,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 9.214337142999284,
        "errors": [
          {
            "detail": "Did not convert the comma-separated items into a list after the colon.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Did not capitalize the initial phrase.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Kept the list inline with a final period instead of using a colon followed by list items.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 60,
        "summary": "Content is preserved, but the spoken list was not formatted as a markdown list and casing is wrong."
      },
      "judge_status": "ok",
      "latency_seconds": 29.6005,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 60,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 29.6022,
        "metrics": {
          "energy_wh_estimate": 0.4934,
          "gpu_busy_percent_avg": 38.4,
          "gpu_busy_percent_peak": 84.0,
          "power_w_avg": 60.0,
          "power_w_peak": 184.0,
          "vram_mb_avg": 21232.4,
          "vram_mb_peak": 23903.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 61,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 6.58800760899976,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 20.1221,
      "model": "nemotron3:33b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 20.1245,
        "metrics": {
          "energy_wh_estimate": 0.2638,
          "gpu_busy_percent_avg": 41.5,
          "gpu_busy_percent_peak": 75.0,
          "power_w_avg": 47.19,
          "power_w_peak": 123.0,
          "vram_mb_avg": 20074.9,
          "vram_mb_peak": 23864.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 42,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.4707146260007,
        "errors": [
          {
            "detail": "Added markdown bolding and a bullet list that were not dictated.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Recast the sentence into procedural steps and dropped explicit wording that PR #4 is open.",
            "severity": "minor",
            "type": "semantic_change"
          },
          {
            "detail": "Adds phrasing such as 'Apply the necessary corrections' beyond the literal cleaned transcript.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 78,
        "summary": "Preserves the main intent but over-formats the dictated instruction and slightly rewrites content beyond cleanup."
      },
      "judge_status": "ok",
      "latency_seconds": 6.6192,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "Please check **PR #4** for code review comments. For each open comment:\n- Verify if the issue still needs fixing.\n- Apply the necessary corrections.\n- Resolve the comments.\n- Push the fix.\n\n",
      "passed": true,
      "score": 78,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.6207,
        "metrics": {
          "energy_wh_estimate": 0.2148,
          "gpu_busy_percent_avg": 60.7,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 116.8,
          "power_w_peak": 293.0,
          "vram_mb_avg": 14056.1,
          "vram_mb_peak": 17043.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 15,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.341437324999788,
        "errors": [
          {
            "detail": "Changed 'Run cargo test' into 'cargo test --fix' and omitted the prose instruction form.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Invented git commands and a commit message not present in the transcript.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Formatted the content as a bash code block instead of final cleaned text.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Dropped 'fix any failing tests' and 'then commit and push the branch' as dictated prose.",
            "severity": "major",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 20,
        "summary": "Converted dictated instruction into an invented shell script instead of preserving it as cleaned prose."
      },
      "judge_status": "ok",
      "latency_seconds": 4.2281,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "```bash\ncargo test --fix\ngit commit -m \"Fix failing tests\"\ngit push\n```\n\n",
      "passed": false,
      "score": 20,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.2293,
        "metrics": {
          "energy_wh_estimate": 0.1533,
          "gpu_busy_percent_avg": 69.0,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 130.5,
          "power_w_peak": 294.0,
          "vram_mb_avg": 12648.8,
          "vram_mb_peak": 17094.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.107876594000118,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves the dictated question with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 3.9896,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.9911,
        "metrics": {
          "energy_wh_estimate": 0.1165,
          "gpu_busy_percent_avg": 59.8,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 105.11,
          "power_w_peak": 301.0,
          "vram_mb_avg": 12176.7,
          "vram_mb_peak": 17106.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.032703382999898,
        "errors": [
          {
            "detail": "Invented email body, greeting, signoff, headings, markdown, and link formatting.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Included explanatory meta text: 'Here\u2019s the cleaned-up version...'.",
            "severity": "major",
            "type": "refusal_or_meta"
          },
          {
            "detail": "Changed the dictated request into an actual email rather than preserving it as a request.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Added content not present in the transcript, including 'Hi Maya,' and 'Best, [Your Name]'.",
            "severity": "critical",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 20,
        "summary": "Output invented a full email and included meta text instead of returning only the cleaned literal request."
      },
      "judge_status": "ok",
      "latency_seconds": 5.1951,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "Here\u2019s the cleaned-up version of your email request:\n\n---\n**Email to Maya:**\n\n**Subject:** Quarterly Update\n\nHi Maya,\n\nThe meeting has been moved to **Thursday at 9:30 AM**. The budget is **$12,450**, and the website is [chirper.local/launch](chirper.local/launch).\n\nBest,\n[Your Name]\n\n",
      "passed": false,
      "score": 20,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.1964,
        "metrics": {
          "energy_wh_estimate": 0.2154,
          "gpu_busy_percent_avg": 67.8,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 149.25,
          "power_w_peak": 299.0,
          "vram_mb_avg": 13342.9,
          "vram_mb_peak": 17054.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 13.84340971399979,
        "errors": [
          {
            "detail": "Omitted 'Hello Chirper. I need to write down accent-friendly words.'",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Invented an email body with greeting, subject block, horizontal rule, and deployment notes instead of preserving the dictated text.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Changed 'Please write an email to Maya...' into an actual email and changed wording such as 'moved' to 'has been moved' and 'look at' to 'review.'",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Used bold headings, a separator, markdown link, and bullet deployment notes not present in the expected cleaned transcript.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "List title casing does not match expected 'Accent Friendly Words.'",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Output formats some entities correctly but drops required introductory text and turns the dictated instruction into an invented email and deployment-note structure."
      },
      "judge_status": "ok",
      "latency_seconds": 6.1228,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "**Accent-friendly words:**\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\n---\n\n**Subject:** Quarterly Update\n\nHi Maya,\n\nThe meeting has been moved to **Thursday at 9:30 AM**. The budget is **$12,450**, and the website is now live at **[chirper.local/launch](chirper.local/launch)**.\n\n**Deployment notes:**\n- `systemd` ensures Chirper services remain running.\n- We should also review **PostgreSQL**, **FFmpeg**, **GNOME**, **Nextcloud**, and **Tailscale**.\n\nThanks!\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.124,
        "metrics": {
          "energy_wh_estimate": 0.3079,
          "gpu_busy_percent_avg": 78.6,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 181.0,
          "power_w_peak": 293.0,
          "vram_mb_avg": 13918.9,
          "vram_mb_peak": 17114.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 14,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 9.424941541001317,
        "errors": [
          {
            "detail": "Expected a plain title followed by three '- [ ]' checklist items, but output uses a checked-style title item with nested numbered items.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Title should be 'Release Tasks' in title case.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing colon after the title.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Content is preserved, but the markdown checklist structure is wrong: the title became a checklist item and the release tasks became a numbered sublist instead of checkbox items."
      },
      "judge_status": "ok",
      "latency_seconds": 4.3428,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "- [ ] **Release tasks**\n  1. Run tests\n  2. Push tag\n  3. Publish release\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.344,
        "metrics": {
          "energy_wh_estimate": 0.1523,
          "gpu_busy_percent_avg": 65.2,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 126.2,
          "power_w_peak": 296.0,
          "vram_mb_avg": 12628.2,
          "vram_mb_peak": 17098.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.324448121000387,
        "errors": [
          {
            "detail": "Replaced `| tee test-output.log` with unrelated test arguments `-- --output=test-output.log`.",
            "severity": "critical",
            "type": "semantic_change"
          },
          {
            "detail": "Did not preserve the pipe operator and filename structure from the transcript.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added markdown code backticks not present in the expected output.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 15,
        "summary": "The formatter changed the shell command structure instead of preserving the dictated pipe and tee command."
      },
      "judge_status": "ok",
      "latency_seconds": 4.0727,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "`cargo test -- --output=test-output.log`\n\n",
      "passed": false,
      "score": 15,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.074,
        "metrics": {
          "energy_wh_estimate": 0.1362,
          "gpu_busy_percent_avg": 60.8,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 120.4,
          "power_w_peak": 296.0,
          "vram_mb_avg": 12677.7,
          "vram_mb_peak": 17110.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 8.762775105999026,
        "errors": [
          {
            "detail": "Leaves '(capital C, capital B)' in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Formats the intended identifier as 'Chirper Benchmark' instead of 'ChirperBench'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omits the dictated phrase 'Call it'.",
            "severity": "minor",
            "type": "missing_text"
          },
          {
            "detail": "Missing final period from the expected sentence.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Output applies some capitalization but fails the final corrected identifier and leaks spoken spelling instructions."
      },
      "judge_status": "ok",
      "latency_seconds": 4.0693,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "Chirper Benchmark (capital C, capital B)\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.0706,
        "metrics": {
          "energy_wh_estimate": 0.1391,
          "gpu_busy_percent_avg": 62.4,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 123.0,
          "power_w_peak": 302.0,
          "vram_mb_avg": 12651.8,
          "vram_mb_peak": 17082.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 15.373783815000934,
        "errors": [
          {
            "detail": "Expected a plain URL, not Markdown link syntax.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Markdown brackets and repeated link target add characters not present in the intended text.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "URL, casing, and punctuation are correct, but the formatter unnecessarily wrapped the URL as a Markdown link."
      },
      "judge_status": "ok",
      "latency_seconds": 4.2703,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "Visit [https://docs.python.org/3/library/json.html](https://docs.python.org/3/library/json.html).\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.2718,
        "metrics": {
          "energy_wh_estimate": 0.1502,
          "gpu_busy_percent_avg": 65.2,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 126.6,
          "power_w_peak": 309.0,
          "vram_mb_avg": 12555.1,
          "vram_mb_peak": 16988.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 8.643003716999374,
        "errors": [
          {
            "detail": "Email addresses were wrapped in bold markdown, which was not requested.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Email addresses were correctly formatted, but unnecessary markdown bolding was added."
      },
      "judge_status": "ok",
      "latency_seconds": 4.1724,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "Send it to **alex@chirper.local** and cc **ops@example.com**.\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.1735,
        "metrics": {
          "energy_wh_estimate": 0.1386,
          "gpu_busy_percent_avg": 61.6,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 119.6,
          "power_w_peak": 296.0,
          "vram_mb_avg": 12613.5,
          "vram_mb_peak": 17042.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.894571908998842,
        "errors": [
          {
            "detail": "Added bold and code formatting not present in the expected cleaned text.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 88,
        "summary": "Correct casing and punctuation, but added unnecessary markdown formatting around identifiers."
      },
      "judge_status": "ok",
      "latency_seconds": 4.2146,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "The project name is **ChirperBench** and the class name is `DictationFormatter`.\n\n",
      "passed": true,
      "score": 88,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.216,
        "metrics": {
          "energy_wh_estimate": 0.1456,
          "gpu_busy_percent_avg": 61.9,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 124.3,
          "power_w_peak": 313.0,
          "vram_mb_avg": 12562.3,
          "vram_mb_peak": 16990.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 10.220501617000991,
        "errors": [
          {
            "detail": "Omitted 'Set variable' and converted the sentence into executable-style code.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Used a fenced code block instead of inline code identifiers.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing the sentence wording around the identifiers.",
            "severity": "major",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Changed dictated sentence into a Python code block assignment instead of preserving the intended formatted text."
      },
      "judge_status": "ok",
      "latency_seconds": 4.0388,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "```python\nuser_id = request.user.id\n```\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.04,
        "metrics": {
          "energy_wh_estimate": 0.1376,
          "gpu_busy_percent_avg": 62.8,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 122.6,
          "power_w_peak": 302.0,
          "vram_mb_avg": 12538.0,
          "vram_mb_peak": 16966.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 6.841551949000859,
        "errors": [
          {
            "detail": "Added a comma after 2026 before 'and' that was not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Content, casing, dates, versions, and number formatting are correct; only an extra comma differs from expected punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 4.3256,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.3267,
        "metrics": {
          "energy_wh_estimate": 0.1563,
          "gpu_busy_percent_avg": 67.4,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 130.1,
          "power_w_peak": 310.0,
          "vram_mb_avg": 12528.3,
          "vram_mb_peak": 16958.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 7.883878777000064,
        "errors": [
          {
            "detail": "Rendered the shopping list inline instead of as separate list items.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Used commas and a final period where the expected output uses a colon followed by list bullets.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 72,
        "summary": "Content is preserved, but the dictated colon/list structure was not formatted as a markdown list."
      },
      "judge_status": "ok",
      "latency_seconds": 4.0885,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 72,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.0896,
        "metrics": {
          "energy_wh_estimate": 0.1371,
          "gpu_busy_percent_avg": 61.0,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 120.7,
          "power_w_peak": 299.0,
          "vram_mb_avg": 12461.2,
          "vram_mb_peak": 16890.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 5.7381680050002615,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 4.0503,
      "model": "ministral-3:14b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.0513,
        "metrics": {
          "energy_wh_estimate": 0.1351,
          "gpu_busy_percent_avg": 62.3,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 120.1,
          "power_w_peak": 297.0,
          "vram_mb_avg": 12511.5,
          "vram_mb_peak": 16868.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.865217093001775,
        "errors": [
          {
            "detail": "Uses a comma and semicolon structure instead of the expected sentence break after 'fixed.'",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Preserves the dictated instruction as content with appropriate casing, PR identifier formatting, and punctuation; only minor punctuation/style differences from expected."
      },
      "judge_status": "ok",
      "latency_seconds": 5.0187,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "Please check the open PR #4 for code review comments, and check each one that's still open; if it needs to be fixed, please fix it and then resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.0204,
        "metrics": {
          "energy_wh_estimate": 0.2395,
          "gpu_busy_percent_avg": 68.6,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 171.75,
          "power_w_peak": 293.0,
          "vram_mb_avg": 7036.5,
          "vram_mb_peak": 8042.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.528150824000477,
        "errors": [],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Perfect match; command-like content was preserved as dictated text with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 3.2527,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "Run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.2541,
        "metrics": {
          "energy_wh_estimate": 0.1571,
          "gpu_busy_percent_avg": 70.6,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 173.88,
          "power_w_peak": 291.0,
          "vram_mb_avg": 6519.9,
          "vram_mb_peak": 7968.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.03226160000122,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves the dictated question with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4697,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4712,
        "metrics": {
          "energy_wh_estimate": 0.0876,
          "gpu_busy_percent_avg": 54.5,
          "gpu_busy_percent_peak": 92.0,
          "power_w_avg": 127.67,
          "power_w_peak": 291.0,
          "vram_mb_avg": 5960.3,
          "vram_mb_peak": 7881.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.851176689999193,
        "errors": [
          {
            "detail": "Omitted \u201cWrite an email to Maya,\u201d which was part of the dictated content.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Used a comma after the subject instead of ending the subject sentence with a period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 72,
        "summary": "Mostly preserves the email request details, but drops the dictated request/recipient text and uses incorrect punctuation after the subject."
      },
      "judge_status": "ok",
      "latency_seconds": 5.5212,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "Subject: Quarterly Update, The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\n",
      "passed": false,
      "score": 72,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.5229,
        "metrics": {
          "energy_wh_estimate": 0.3258,
          "gpu_busy_percent_avg": 73.6,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 212.38,
          "power_w_peak": 293.0,
          "vram_mb_avg": 7033.0,
          "vram_mb_peak": 8070.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 26.13952726699972,
        "errors": [
          {
            "detail": "Left spoken commands such as \"This is a bullet point list,\" \"End of list,\" \"New paragraph,\" and \"Finish with\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Did not create the required markdown bullet list or paragraph breaks.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Missed most periods, commas, and sentence boundaries.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Failed required casing for Chirper, Quarterly Update, PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Did not format 9:30 AM or $12,450.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 28,
        "summary": "Major mixed-formatting failure: spoken commands leaked, punctuation/list formatting mostly missed, and technical casing/numbers were not normalized."
      },
      "judge_status": "ok",
      "latency_seconds": 5.6079,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "Hello chirper I need to write down accent-friendly words This is a bullet point list with title Accent-Friendly Words: water, tomato, schedule, data, router, aluminium, privacy End of list New paragraph Please write an email to Maya subject: Quarterly update The meeting moved to Thursday at nine thirty A.M the budget is twelve thousand four hundred fifty dollars and the website is chirper.local/launch New paragraph In the deployment notes mention that systemd keeps the chirper services running and we should also look at postgresql ffmpeg gnome nextcloud and tailscale Finish with thanks!\n\n",
      "passed": false,
      "score": 28,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.6093,
        "metrics": {
          "energy_wh_estimate": 0.3359,
          "gpu_busy_percent_avg": 75.2,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 215.62,
          "power_w_peak": 291.0,
          "vram_mb_avg": 6999.4,
          "vram_mb_peak": 8038.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 10.884424808999029,
        "errors": [
          {
            "detail": "Did not format tasks as markdown checklist items with - [ ].",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Left spoken list markers as output: Item one, Item two, Item three.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Title uses 'Release tasks' instead of 'Release Tasks'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Title omits the required colon.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Adds separate list items for item labels that should not appear.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Preserves the three task phrases but misses the required checklist format, title casing/colon, and leaves spoken item markers as list entries."
      },
      "judge_status": "ok",
      "latency_seconds": 5.8804,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "**Release tasks**\n\n- Item one\n- Run tests\n- Item two\n- Push tag\n- Item three\n- Publish release\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.8816,
        "metrics": {
          "energy_wh_estimate": 0.3553,
          "gpu_busy_percent_avg": 78.4,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 217.46,
          "power_w_peak": 296.0,
          "vram_mb_avg": 6874.0,
          "vram_mb_peak": 7793.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.886476141999083,
        "errors": [
          {
            "detail": "Did not format the dictated shell command with pipe, dash, and dot symbols.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Left spoken tokens like pipe, space, dash, and dot in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Inserted commas instead of spaces and shell punctuation.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 20,
        "summary": "Failed to preserve shell command structure and left spoken punctuation words instead of converting them."
      },
      "judge_status": "ok",
      "latency_seconds": 7.5442,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "cargo,test,pipe,tee,space,test,dash,output,dot,log\n\n",
      "passed": false,
      "score": 20,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.5455,
        "metrics": {
          "energy_wh_estimate": 0.4797,
          "gpu_busy_percent_avg": 78.7,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 228.88,
          "power_w_peak": 293.0,
          "vram_mb_avg": 7134.4,
          "vram_mb_peak": 7859.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 17,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 8.537457539998286,
        "errors": [
          {
            "detail": "Leaves 'no scratch' and spelling/capitalization instructions in the final text.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not produce the intended corrected phrase 'Call it ChirperBench.'",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Fails to combine and capitalize the identifier as 'ChirperBench'.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Adds an incorrect colon and sentence break.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "The output keeps spoken correction commands and fails to apply the final corrected name."
      },
      "judge_status": "ok",
      "latency_seconds": 4.416,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "Chirper bench: no scratch. Spelled capital C, capital B.\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.4174,
        "metrics": {
          "energy_wh_estimate": 0.2308,
          "gpu_busy_percent_avg": 70.1,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 188.1,
          "power_w_peak": 293.0,
          "vram_mb_avg": 6724.6,
          "vram_mb_peak": 7898.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 5.998766400998647,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the HTTPS URL with path, casing, and final punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4694,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4706,
        "metrics": {
          "energy_wh_estimate": 0.0866,
          "gpu_busy_percent_avg": 54.5,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 126.17,
          "power_w_peak": 293.0,
          "vram_mb_avg": 5916.6,
          "vram_mb_peak": 7850.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 7.402830390999952,
        "errors": [
          {
            "detail": "Both email addresses use 'at' instead of '@'.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Email addresses were not fully formatted: spoken 'at' remained instead of '@'."
      },
      "judge_status": "ok",
      "latency_seconds": 2.8557,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "Send it to alex at chirper.local and cc ops at example.com.\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.8572,
        "metrics": {
          "energy_wh_estimate": 0.1191,
          "gpu_busy_percent_avg": 66.7,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 150.14,
          "power_w_peak": 292.0,
          "vram_mb_avg": 6327.4,
          "vram_mb_peak": 7974.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 7,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 7.749728462000348,
        "errors": [
          {
            "detail": "Used 'Chirper Bench' instead of 'ChirperBench' and 'Dictation Formatter' instead of 'DictationFormatter'.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Did not apply code-style identifier formatting for product and class names.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Preserved the sentence but failed required product and class identifier casing by leaving spaces in both names."
      },
      "judge_status": "ok",
      "latency_seconds": 3.5972,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench, and the class name is Dictation Formatter.\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.5985,
        "metrics": {
          "energy_wh_estimate": 0.1858,
          "gpu_busy_percent_avg": 71.7,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 185.89,
          "power_w_peak": 294.0,
          "vram_mb_avg": 6634.2,
          "vram_mb_peak": 7925.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.099713838999378,
        "errors": [
          {
            "detail": "Code identifiers user_id and request.user.id should be wrapped in inline code backticks.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added 'the' before variable, which is not in the expected text.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 75,
        "summary": "Content is preserved, but required inline code formatting for identifiers is missing."
      },
      "judge_status": "ok",
      "latency_seconds": 2.8371,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "Set the variable user_id equal to request.user.id.\n\n",
      "passed": false,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.8384,
        "metrics": {
          "energy_wh_estimate": 0.1244,
          "gpu_busy_percent_avg": 67.0,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 157.86,
          "power_w_peak": 293.0,
          "vram_mb_avg": 6207.3,
          "vram_mb_peak": 7855.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 7,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 8.497140105999279,
        "errors": [
          {
            "detail": "Dropped the '+' from 'Python 3.11+', changing the intended supported version range.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Missing comma after 'June 3' in the date.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 78,
        "summary": "Mostly correct formatting, but it omitted the comma in the date and dropped the plus sign from Python 3.11+, changing the version requirement."
      },
      "judge_status": "ok",
      "latency_seconds": 3.4761,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3 2026 and supports Python 3.11.\n\n",
      "passed": false,
      "score": 78,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.4774,
        "metrics": {
          "energy_wh_estimate": 0.174,
          "gpu_busy_percent_avg": 68.8,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 180.12,
          "power_w_peak": 294.0,
          "vram_mb_avg": 6421.2,
          "vram_mb_peak": 7981.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 7.953342973998588,
        "errors": [
          {
            "detail": "Spoken list should be formatted as a heading line followed by bullet items.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Expected sentence/title start casing: \"Shopping list\".",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Added final period and kept comma-separated inline list instead of line breaks.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Content is preserved, but the required title casing and markdown list formatting were missed."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3525,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.3537,
        "metrics": {
          "energy_wh_estimate": 0.0888,
          "gpu_busy_percent_avg": 60.3,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 135.83,
          "power_w_peak": 295.0,
          "vram_mb_avg": 6038.0,
          "vram_mb_peak": 7956.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 6.319812112000363,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly matches the expected cleaned text."
      },
      "judge_status": "ok",
      "latency_seconds": 2.0666,
      "model": "nemotron-3-nano:4b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.0678,
        "metrics": {
          "energy_wh_estimate": 0.0703,
          "gpu_busy_percent_avg": 59.7,
          "gpu_busy_percent_peak": 92.0,
          "power_w_avg": 122.5,
          "power_w_peak": 280.0,
          "vram_mb_avg": 5990.0,
          "vram_mb_peak": 7900.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 12.716407822999827,
        "errors": [
          {
            "detail": "Left 'PR number 4' instead of formatting it as 'PR #4'.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted 'the' before 'open PR'.",
            "severity": "minor",
            "type": "missing_text"
          },
          {
            "detail": "Used 'Fix it and then' instead of the cleaner sentence break/comma: 'Fix it, then'.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 82,
        "summary": "Mostly preserves the dictated instruction, but misses the PR identifier formatting and some expected wording/punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 10.8915,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "Please check open PR number 4 for code review comments and check each one that's still open if it needs to be fixed. Fix it and then resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 82,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.8928,
        "metrics": {
          "energy_wh_estimate": 0.2845,
          "gpu_busy_percent_avg": 58.6,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 94.04,
          "power_w_peak": 291.0,
          "vram_mb_avg": 19931.1,
          "vram_mb_peak": 22815.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 23,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 8.079058825000175,
        "errors": [
          {
            "detail": "Added backticks around cargo test instead of preserving it as plain text.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Content, casing, and punctuation are correct, but it added markdown code formatting not present in the expected plain text."
      },
      "judge_status": "ok",
      "latency_seconds": 10.4959,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "Run `cargo test`, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.4973,
        "metrics": {
          "energy_wh_estimate": 0.2225,
          "gpu_busy_percent_avg": 54.6,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 76.32,
          "power_w_peak": 297.0,
          "vram_mb_avg": 19993.2,
          "vram_mb_peak": 22891.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 22,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.499826780000149,
        "errors": [
          {
            "detail": "Returned an answer about Paris rather than the dictated question.",
            "severity": "critical",
            "type": "answered_content"
          },
          {
            "detail": "Changed the intended content from a question to a statement answer.",
            "severity": "critical",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Critical failure: answered the dictated question instead of formatting it as text."
      },
      "judge_status": "ok",
      "latency_seconds": 10.1802,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "The capital of France is Paris.\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.1815,
        "metrics": {
          "energy_wh_estimate": 0.218,
          "gpu_busy_percent_avg": 57.0,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 77.09,
          "power_w_peak": 291.0,
          "vram_mb_avg": 19971.0,
          "vram_mb_peak": 22931.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 22,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.691465090998463,
        "errors": [
          {
            "detail": "Includes meta text: \"Here's the cleaned-up final text:\"",
            "severity": "critical",
            "type": "refusal_or_meta"
          },
          {
            "detail": "Changed the instruction-as-content into an email subject/body, removing \"Write an email to Maya\" from the final text.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Added markdown code fence and markdown link instead of plain cleaned text.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added email layout and blank lines not present in the intended output.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Output explains the task and turns the dictated request into an email draft instead of preserving the literal email request."
      },
      "judge_status": "ok",
      "latency_seconds": 11.615,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "Here's the cleaned-up final text:\n\n```markdown\nSubject: Quarterly Update\n\nThe meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is [chirper.local/launch](http://chirper.local/launch).\n```\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 11.6163,
        "metrics": {
          "energy_wh_estimate": 0.3083,
          "gpu_busy_percent_avg": 62.0,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 95.56,
          "power_w_peak": 302.0,
          "vram_mb_avg": 20303.1,
          "vram_mb_peak": 22828.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 25,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 29.687584422001237,
        "errors": [
          {
            "detail": "Omitted: \"I need to write down accent-friendly words.\"",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed \"Please write an email to Maya, subject: Quarterly Update\" into a standalone email subject block and omitted Maya.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Added bold title, horizontal rule, and markdown link not present in the expected cleaned transcript.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Split several comma-joined clauses into separate sentences.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 62,
        "summary": "Mostly formats the list and technical casing, but omits key dictated text and changes the email instruction into an email-like block."
      },
      "judge_status": "ok",
      "latency_seconds": 13.1612,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "Hello Chirper.\n\n**Accent-friendly words:**\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\n---\n\nSubject: Quarterly Update\n\nThe meeting moved to Thursday at 9:30 AM. The budget is $12,450, and the website is [chirper.local/launch](chirper.local/launch).\n\nIn the deployment notes, mention that systemd keeps the Chirper services running. We should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nThanks!\n\n",
      "passed": false,
      "score": 62,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 13.1626,
        "metrics": {
          "energy_wh_estimate": 0.4339,
          "gpu_busy_percent_avg": 65.6,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 118.68,
          "power_w_peak": 293.0,
          "vram_mb_avg": 20497.0,
          "vram_mb_peak": 22789.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 28,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 10.395704319000288,
        "errors": [
          {
            "detail": "Adds explanatory text instead of returning only the cleaned transcript.",
            "severity": "critical",
            "type": "refusal_or_meta"
          },
          {
            "detail": "Uses a markdown H1 and code fence rather than the expected title with colon and checklist.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Includes wrapper text and markdown fence not present in the intended output.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 40,
        "summary": "Output includes meta text and a fenced code block, and formats the title as a heading instead of the requested colon title."
      },
      "judge_status": "ok",
      "latency_seconds": 10.9094,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "Here's the cleaned-up final text:\n\n```markdown\n# Release Tasks\n\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release\n```\n\n",
      "passed": false,
      "score": 40,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.9109,
        "metrics": {
          "energy_wh_estimate": 0.2571,
          "gpu_busy_percent_avg": 60.3,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 84.83,
          "power_w_peak": 291.0,
          "vram_mb_avg": 20051.4,
          "vram_mb_peak": 22811.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 23,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.892837720999523,
        "errors": [
          {
            "detail": "Rendered the spoken separator word 'space' as literal filename content.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Expected dash-separated filename 'test-output.log', but output used 'space_test_output.log'.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 60,
        "summary": "Preserved the shell pipe but misformatted the output filename by leaking 'space' and changing dash-separated identifier text to underscores."
      },
      "judge_status": "ok",
      "latency_seconds": 10.2531,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "cargo test | tee space_test_output.log\n\n",
      "passed": false,
      "score": 60,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.2543,
        "metrics": {
          "energy_wh_estimate": 0.2042,
          "gpu_busy_percent_avg": 55.4,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 71.68,
          "power_w_peak": 303.0,
          "vram_mb_avg": 19932.7,
          "vram_mb_peak": 22852.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 22,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 10.095072774998698,
        "errors": [
          {
            "detail": "Changed the intended name from ChirperBench to Chirper Benchmark.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Missed the dictated compound identifier formatting with capital C and capital B.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 60,
        "summary": "Incorrect final identifier: it should be the camel-cased single word ChirperBench, not Chirper Benchmark."
      },
      "judge_status": "ok",
      "latency_seconds": 10.2063,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "Call it Chirper Benchmark.\n\n",
      "passed": false,
      "score": 60,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.2075,
        "metrics": {
          "energy_wh_estimate": 0.2195,
          "gpu_busy_percent_avg": 57.6,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 77.41,
          "power_w_peak": 292.0,
          "vram_mb_avg": 19895.3,
          "vram_mb_peak": 22724.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 22,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 5.7293822770006955,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly matches the expected cleaned URL text."
      },
      "judge_status": "ok",
      "latency_seconds": 10.3191,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.3207,
        "metrics": {
          "energy_wh_estimate": 0.2067,
          "gpu_busy_percent_avg": 55.6,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 72.09,
          "power_w_peak": 302.0,
          "vram_mb_avg": 19946.1,
          "vram_mb_peak": 22810.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 22,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 6.56749578200106,
        "errors": [
          {
            "detail": "Sentence should start with uppercase 'Send'.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Email addresses and punctuation are correct; only the initial sentence casing was missed."
      },
      "judge_status": "ok",
      "latency_seconds": 10.6682,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "send it to alex@chirper.local and cc ops@example.com.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.6693,
        "metrics": {
          "energy_wh_estimate": 0.2255,
          "gpu_busy_percent_avg": 55.7,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 76.09,
          "power_w_peak": 316.0,
          "vram_mb_avg": 20113.8,
          "vram_mb_peak": 22908.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 23,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 6.981624111998826,
        "errors": [
          {
            "detail": "Product name should be formatted as ChirperBench, not Chirper Bench.",
            "severity": "major",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 85,
        "summary": "Missed required product identifier casing by leaving ChirperBench as two words."
      },
      "judge_status": "ok",
      "latency_seconds": 10.7109,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is DictationFormatter.\n\n",
      "passed": false,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.7122,
        "metrics": {
          "energy_wh_estimate": 0.2301,
          "gpu_busy_percent_avg": 57.0,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 77.35,
          "power_w_peak": 292.0,
          "vram_mb_avg": 20152.4,
          "vram_mb_peak": 22936.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 23,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.768036543000562,
        "errors": [
          {
            "detail": "Changed the prose instruction into an assignment statement.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Used a fenced code block instead of inline code formatting for identifiers.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted the words 'Set variable' and 'equal to' from the intended text.",
            "severity": "major",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Converted the dictated instruction into a Python code block instead of preserving the intended sentence with inline code identifiers."
      },
      "judge_status": "ok",
      "latency_seconds": 10.2973,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "```python\nuser_id = request.user.id\n```\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.2985,
        "metrics": {
          "energy_wh_estimate": 0.224,
          "gpu_busy_percent_avg": 58.5,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 78.32,
          "power_w_peak": 292.0,
          "vram_mb_avg": 20017.4,
          "vram_mb_peak": 22921.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 22,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 6.749268202000167,
        "errors": [
          {
            "detail": "Inserted a comma after \"2026\" that was not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 96,
        "summary": "Nearly perfect; only added an extra comma after the date."
      },
      "judge_status": "ok",
      "latency_seconds": 10.9738,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 96,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.975,
        "metrics": {
          "energy_wh_estimate": 0.2341,
          "gpu_busy_percent_avg": 57.2,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 76.78,
          "power_w_peak": 316.0,
          "vram_mb_avg": 20149.9,
          "vram_mb_peak": 22937.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 23,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 7.569976905000658,
        "errors": [
          {
            "detail": "Adds an explanatory preface instead of returning only final cleaned text.",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Does not convert the comma-separated items into a markdown bullet list.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Uses bold inline heading instead of the expected plain heading with items on following lines.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Includes meta text and fails to format the spoken list as separate bullet items."
      },
      "judge_status": "ok",
      "latency_seconds": 10.5886,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "Here\u2019s the cleaned-up final text:\n\n**Shopping list:** apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.5899,
        "metrics": {
          "energy_wh_estimate": 0.2669,
          "gpu_busy_percent_avg": 55.3,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 90.74,
          "power_w_peak": 300.0,
          "vram_mb_avg": 20206.0,
          "vram_mb_peak": 22995.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 23,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 5.746595426999193,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 10.3178,
      "model": "devstral-small-2:24b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.319,
        "metrics": {
          "energy_wh_estimate": 0.2098,
          "gpu_busy_percent_avg": 55.7,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 73.18,
          "power_w_peak": 291.0,
          "vram_mb_avg": 20149.1,
          "vram_mb_peak": 23007.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 22,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0042,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.007,
        "metrics": {
          "energy_wh_estimate": 6.9602,
          "gpu_busy_percent_avg": 53.4,
          "gpu_busy_percent_peak": 85.0,
          "power_w_avg": 139.2,
          "power_w_peak": 238.0,
          "vram_mb_avg": 23672.9,
          "vram_mb_peak": 24262.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0028,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0043,
        "metrics": {
          "energy_wh_estimate": 7.338,
          "gpu_busy_percent_avg": 53.9,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 146.76,
          "power_w_peak": 166.0,
          "vram_mb_avg": 23800.0,
          "vram_mb_peak": 24232.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.545102580001185,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 43.742,
      "model": "qwen3.5:27b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 43.7434,
        "metrics": {
          "energy_wh_estimate": 1.7473,
          "gpu_busy_percent_avg": 56.6,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 143.8,
          "power_w_peak": 177.0,
          "vram_mb_avg": 23020.6,
          "vram_mb_peak": 24253.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 89,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0071,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0102,
        "metrics": {
          "energy_wh_estimate": 7.3164,
          "gpu_busy_percent_avg": 53.3,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 146.32,
          "power_w_peak": 190.0,
          "vram_mb_avg": 23788.4,
          "vram_mb_peak": 24223.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0035,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0048,
        "metrics": {
          "energy_wh_estimate": 7.5415,
          "gpu_busy_percent_avg": 54.8,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 150.83,
          "power_w_peak": 176.0,
          "vram_mb_avg": 23856.2,
          "vram_mb_peak": 24234.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0031,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0046,
        "metrics": {
          "energy_wh_estimate": 7.4531,
          "gpu_busy_percent_avg": 53.6,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 149.06,
          "power_w_peak": 182.0,
          "vram_mb_avg": 23826.9,
          "vram_mb_peak": 24274.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0034,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0048,
        "metrics": {
          "energy_wh_estimate": 7.5668,
          "gpu_busy_percent_avg": 55.0,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 151.33,
          "power_w_peak": 180.0,
          "vram_mb_avg": 23821.3,
          "vram_mb_peak": 24249.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0051,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.008,
        "metrics": {
          "energy_wh_estimate": 7.3864,
          "gpu_busy_percent_avg": 52.9,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 147.72,
          "power_w_peak": 159.0,
          "vram_mb_avg": 23871.3,
          "vram_mb_peak": 24273.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 6.1530879059992,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves and formats the dictated HTTPS URL with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 78.3561,
      "model": "qwen3.5:27b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 78.3574,
        "metrics": {
          "energy_wh_estimate": 3.2439,
          "gpu_busy_percent_avg": 56.2,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 149.04,
          "power_w_peak": 204.0,
          "vram_mb_avg": 23511.2,
          "vram_mb_peak": 24280.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 158,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0086,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0121,
        "metrics": {
          "energy_wh_estimate": 7.7848,
          "gpu_busy_percent_avg": 59.1,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 155.69,
          "power_w_peak": 310.0,
          "vram_mb_avg": 23792.4,
          "vram_mb_peak": 24249.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0057,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0072,
        "metrics": {
          "energy_wh_estimate": 7.4532,
          "gpu_busy_percent_avg": 53.4,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 149.06,
          "power_w_peak": 164.0,
          "vram_mb_avg": 23795.8,
          "vram_mb_peak": 24230.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0031,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0045,
        "metrics": {
          "energy_wh_estimate": 7.5781,
          "gpu_busy_percent_avg": 54.4,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 151.56,
          "power_w_peak": 335.0,
          "vram_mb_avg": 23886.7,
          "vram_mb_peak": 24293.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0034,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0046,
        "metrics": {
          "energy_wh_estimate": 7.4655,
          "gpu_busy_percent_avg": 53.7,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 149.31,
          "power_w_peak": 169.0,
          "vram_mb_avg": 23773.8,
          "vram_mb_peak": 24199.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0029,
      "model": "qwen3.5:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0045,
        "metrics": {
          "energy_wh_estimate": 7.577,
          "gpu_busy_percent_avg": 54.2,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 151.54,
          "power_w_peak": 344.0,
          "vram_mb_avg": 23768.3,
          "vram_mb_peak": 24110.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 5.948259285998574,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected clean text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 41.9527,
      "model": "qwen3.5:27b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 41.9538,
        "metrics": {
          "energy_wh_estimate": 1.6699,
          "gpu_busy_percent_avg": 53.6,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 143.29,
          "power_w_peak": 177.0,
          "vram_mb_avg": 22877.7,
          "vram_mb_peak": 24077.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 85,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.75931551299982,
        "errors": [
          {
            "detail": "Paraphrases several clauses instead of preserving the dictated wording.",
            "severity": "minor",
            "type": "semantic_change"
          },
          {
            "detail": "Drops the word 'open' from 'open PR #4'.",
            "severity": "minor",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 82,
        "summary": "Preserves the general intended instruction but paraphrases instead of formatting the dictated text literally and drops 'open' before PR."
      },
      "judge_status": "ok",
      "latency_seconds": 4.9652,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "Please check PR #4 for code review comments and review each open item to determine if it requires a fix. Address any necessary changes, resolve the comments, and push the updated code.\n\n",
      "passed": true,
      "score": 82,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.9666,
        "metrics": {
          "energy_wh_estimate": 0.1542,
          "gpu_busy_percent_avg": 56.5,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 111.82,
          "power_w_peak": 280.0,
          "vram_mb_avg": 9709.2,
          "vram_mb_peak": 12748.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 11,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 8.940984541999569,
        "errors": [
          {
            "detail": "Changed 'fix any failing tests' into '--fix any failing tests', making it look like a command flag.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Used a fenced bash code block instead of plain cleaned text.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Did not capitalize the initial word 'Run'.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 25,
        "summary": "Output incorrectly formats the dictated instruction as a bash command and changes the intended text."
      },
      "judge_status": "ok",
      "latency_seconds": 3.2172,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "```bash\nrun cargo test --fix any failing tests, then commit and push the branch.\n```\n\n",
      "passed": false,
      "score": 25,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.2184,
        "metrics": {
          "energy_wh_estimate": 0.1099,
          "gpu_busy_percent_avg": 57.6,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 123.0,
          "power_w_peak": 292.0,
          "vram_mb_avg": 8613.9,
          "vram_mb_peak": 12754.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.426515819999622,
        "errors": [
          {
            "detail": "Output says \"Paris.\" instead of preserving the dictated question.",
            "severity": "critical",
            "type": "answered_content"
          },
          {
            "detail": "Changed the intended content from a question to an answer.",
            "severity": "critical",
            "type": "semantic_change"
          },
          {
            "detail": "Missing the full dictated sentence: \"What is the capital of France?\"",
            "severity": "major",
            "type": "missing_text"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Critical failure: answered the dictated question instead of formatting it."
      },
      "judge_status": "ok",
      "latency_seconds": 3.2377,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "Paris.\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.2395,
        "metrics": {
          "energy_wh_estimate": 0.1104,
          "gpu_busy_percent_avg": 64.0,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 122.75,
          "power_w_peak": 232.0,
          "vram_mb_avg": 8610.8,
          "vram_mb_peak": 12750.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.303005406000011,
        "errors": [
          {
            "detail": "Turned the request into an email with subject line, salutation, and body formatting.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Omitted the literal instruction content 'Write an email to Maya'.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed 'The meeting moved' to 'The meeting has been moved'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Converted the URL into a markdown link with an added http scheme instead of preserving chirper.local/launch.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Generated an email body instead of preserving the dictated request as final text, with semantic and formatting changes."
      },
      "judge_status": "ok",
      "latency_seconds": 3.6075,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "Subject: Quarterly Update\n\nDear Maya,\n\nThe meeting has been moved to Thursday at 9:30 AM.\nThe budget is $12,450.\nThe website is [chirper.local/launch](http://chirper.local/launch).\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.6087,
        "metrics": {
          "energy_wh_estimate": 0.146,
          "gpu_busy_percent_avg": 64.9,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 145.67,
          "power_w_peak": 292.0,
          "vram_mb_avg": 9067.7,
          "vram_mb_peak": 12750.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 18.49466342999949,
        "errors": [
          {
            "detail": "Did not output the title 'Accent Friendly Words:' before the bullet list.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added horizontal rules, bold markdown, and email-style subject formatting not present in the intended cleaned text.",
            "severity": "major",
            "type": "over_generation"
          },
          {
            "detail": "Converted chirper.local/launch into a markdown link with an added http URL.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Split comma-connected sentences into separate sentences in the meeting/budget and deployment notes sections.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 72,
        "summary": "Mostly preserves content and technical casing, but misses the requested list title and adds unsupported markdown/email formatting, separators, bolding, and a markdown URL."
      },
      "judge_status": "ok",
      "latency_seconds": 4.9126,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "Hello Chirper.\n\nI need to write down accent-friendly words:\n\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\n---\n\nPlease write an email to Maya:\n\n**Subject:** Quarterly Update\n\nThe meeting moved to Thursday at 9:30 AM. The budget is $12,450, and the website is [chirper.local/launch](http://chirper.local/launch).\n\n---\n\nIn the deployment notes, mention that **systemd** keeps the Chirper services running. We should also look at **PostgreSQL**, **FFmpeg**, **GNOME**, **Nextcloud**, and **Tailscale**.\n\nThanks!\n\n",
      "passed": false,
      "score": 72,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.9139,
        "metrics": {
          "energy_wh_estimate": 0.2179,
          "gpu_busy_percent_avg": 65.5,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 159.64,
          "power_w_peak": 311.0,
          "vram_mb_avg": 9731.5,
          "vram_mb_peak": 12754.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 11,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 24.02933145699899,
        "errors": [
          {
            "detail": "Title should be plain text with a colon, followed by top-level checklist items; output made the title a bold checklist item with nested tasks.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing colon after Release Tasks.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Content is preserved, but the markdown structure is wrong: the title became a checklist item and the tasks were nested under it instead of top-level items after a titled heading line."
      },
      "judge_status": "ok",
      "latency_seconds": 3.2807,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "- [ ] **Release Tasks**\n  - [ ] Run tests\n  - [ ] Push tag\n  - [ ] Publish release\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.2823,
        "metrics": {
          "energy_wh_estimate": 0.1234,
          "gpu_busy_percent_avg": 63.4,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 135.38,
          "power_w_peak": 309.0,
          "vram_mb_avg": 8574.6,
          "vram_mb_peak": 12766.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.504556153999147,
        "errors": [
          {
            "detail": "Replaced `| tee test-output.log` with `-- --output log`, changing the intended command.",
            "severity": "critical",
            "type": "semantic_change"
          },
          {
            "detail": "Did not preserve shell command structure, including the pipe operator and hyphenated filename.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing `tee` and `test-output.log`.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Added unintended `-- --output log`.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "The formatter changed the shell command semantics and failed to preserve the pipe, tee command, and log filename."
      },
      "judge_status": "ok",
      "latency_seconds": 3.0439,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "cargo test -- --output log\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.0456,
        "metrics": {
          "energy_wh_estimate": 0.1022,
          "gpu_busy_percent_avg": 54.0,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 120.88,
          "power_w_peak": 332.0,
          "vram_mb_avg": 8584.6,
          "vram_mb_peak": 12723.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 8.511997071000224,
        "errors": [
          {
            "detail": "Includes 'no scratch' and 'spelled' correction language instead of removing spoken edits.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Keeps the rejected 'Chirper Bench' and changes final intended name from 'ChirperBench' to 'Chirper Benchmark'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Fails to format the final identifier as the single CamelCase term 'ChirperBench'.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Adds parenthetical spelling notes and multiple lines not present in the intended final text.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 15,
        "summary": "Output leaves spoken correction words, duplicates discarded text, and misses the final identifier formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 3.5103,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "Chirper Bench (no scratch)\n\nChirper Benchmark (spelled: Capital C Chirper, Capital B Bench)\n\n",
      "passed": false,
      "score": 15,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.5119,
        "metrics": {
          "energy_wh_estimate": 0.1369,
          "gpu_busy_percent_avg": 65.4,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 140.33,
          "power_w_peak": 291.0,
          "vram_mb_avg": 9014.3,
          "vram_mb_peak": 12695.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 10.29380141600086,
        "errors": [
          {
            "detail": "Expected a plain HTTPS URL, not a markdown link inside a code fence.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added ```markdown code fence wrappers that are not part of the dictated text.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "URL content is correct, but the formatter added an unwanted markdown code fence and converted the URL into a markdown link instead of plain text."
      },
      "judge_status": "ok",
      "latency_seconds": 3.2881,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "```markdown\nVisit [https://docs.python.org/3/library/json.html](https://docs.python.org/3/library/json.html).\n```\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.2894,
        "metrics": {
          "energy_wh_estimate": 0.1231,
          "gpu_busy_percent_avg": 63.1,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 134.75,
          "power_w_peak": 306.0,
          "vram_mb_avg": 8529.1,
          "vram_mb_peak": 12667.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 6.407598873000097,
        "errors": [
          {
            "detail": "Sentence should start with capital \"Send\".",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Email addresses and punctuation are correct; only the initial sentence casing is wrong."
      },
      "judge_status": "ok",
      "latency_seconds": 3.145,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "send it to alex@chirper.local and cc ops@example.com.\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.1465,
        "metrics": {
          "energy_wh_estimate": 0.1131,
          "gpu_busy_percent_avg": 59.9,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 129.38,
          "power_w_peak": 322.0,
          "vram_mb_avg": 8526.6,
          "vram_mb_peak": 12667.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.771389321000242,
        "errors": [
          {
            "detail": "Project name should be ChirperBench, not Chirper Bench.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Changed the dictated sentence into labeled fields instead of preserving the intended sentence.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Added markdown labels and code formatting not present in the intended output.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "The class casing is correct, but the project identifier casing is wrong and the output changes the dictated sentence into extra markdown labels."
      },
      "judge_status": "ok",
      "latency_seconds": 3.1744,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "**Project Name:** Chirper Bench\n\n**Class Name:** `DictationFormatter`\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.1764,
        "metrics": {
          "energy_wh_estimate": 0.1052,
          "gpu_busy_percent_avg": 55.8,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 119.25,
          "power_w_peak": 314.0,
          "vram_mb_avg": 8577.5,
          "vram_mb_peak": 12727.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 51.97804111100049,
        "errors": [
          {
            "detail": "Changed the instruction sentence into executable-looking code.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Used a fenced Python code block instead of inline code formatting for identifiers.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted the words 'Set variable' and 'equal to' from the intended text.",
            "severity": "major",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Converted the dictated sentence into a Python code block instead of preserving the intended sentence with inline code identifiers."
      },
      "judge_status": "ok",
      "latency_seconds": 3.0982,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "```python\nuser_id = request.user.id\n```\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.0996,
        "metrics": {
          "energy_wh_estimate": 0.1053,
          "gpu_busy_percent_avg": 56.1,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 122.38,
          "power_w_peak": 329.0,
          "vram_mb_avg": 8572.5,
          "vram_mb_peak": 12711.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 8.126842009000029,
        "errors": [
          {
            "detail": "Added a comma after 2026 before 'and' that is not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Content, casing, numbers, versions, and date formatting are correct; only a minor comma differs from expected punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 3.3015,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.3027,
        "metrics": {
          "energy_wh_estimate": 0.1048,
          "gpu_busy_percent_avg": 53.6,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 114.25,
          "power_w_peak": 303.0,
          "vram_mb_avg": 8463.8,
          "vram_mb_peak": 12603.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 8.401338777999626,
        "errors": [
          {
            "detail": "Added bold markdown around the heading.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Content and list formatting are correct, but the heading has extra bold markdown not present in the expected output."
      },
      "judge_status": "ok",
      "latency_seconds": 3.1666,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "**Shopping list:**\n- apples\n- bananas\n- oat milk\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.1678,
        "metrics": {
          "energy_wh_estimate": 0.1018,
          "gpu_busy_percent_avg": 54.0,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 115.75,
          "power_w_peak": 317.0,
          "vram_mb_avg": 8437.1,
          "vram_mb_peak": 12575.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 7.114747236000767,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected clean text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 3.0782,
      "model": "ministral-3:8b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.0798,
        "metrics": {
          "energy_wh_estimate": 0.1175,
          "gpu_busy_percent_avg": 59.8,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 137.38,
          "power_w_peak": 339.0,
          "vram_mb_avg": 8485.7,
          "vram_mb_peak": 12646.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.066522814000564,
        "errors": [
          {
            "detail": "Changed checking whether each open comment needs a fix into reviewing comments and resolving them if they need fixing.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Formatted PR number as bold markdown instead of PR #4.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted the initial 'Please' and the explicit 'Fix it' sentence.",
            "severity": "minor",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Output preserves the general topic but changes the dictated instruction, misses PR # formatting, and alters the fix-then-resolve sequence."
      },
      "judge_status": "ok",
      "latency_seconds": 3.3086,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "check the open PR number **4** for code review comments and review each one that's still open. If any need fixing, resolve them and then push the fix.\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.3102,
        "metrics": {
          "energy_wh_estimate": 0.0864,
          "gpu_busy_percent_avg": 44.0,
          "gpu_busy_percent_peak": 91.0,
          "power_w_avg": 94.0,
          "power_w_peak": 294.0,
          "vram_mb_avg": 6206.5,
          "vram_mb_peak": 9055.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.698859991998688,
        "errors": [
          {
            "detail": "Changed `cargo test` to `cargo test --fix`.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted `fix any failing tests` as dictated content.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Did not capitalize the first word.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Missed the dictated comma structure.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 40,
        "summary": "Changed the dictated command and omitted the failing-tests instruction."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4343,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "run `cargo test --fix` then commit and push the branch.\n\n",
      "passed": false,
      "score": 40,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.436,
        "metrics": {
          "energy_wh_estimate": 0.065,
          "gpu_busy_percent_avg": 45.8,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 96.17,
          "power_w_peak": 276.0,
          "vram_mb_avg": 5386.1,
          "vram_mb_peak": 9070.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.432402403999731,
        "errors": [
          {
            "detail": "Returned \"Paris\" rather than the dictated question.",
            "severity": "critical",
            "type": "answered_content"
          },
          {
            "detail": "Changed intended content from a question to an answer.",
            "severity": "critical",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted the full dictated sentence and punctuation.",
            "severity": "major",
            "type": "missing_text"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "The formatter answered the dictated question instead of formatting it as text."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3106,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "Paris\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.3121,
        "metrics": {
          "energy_wh_estimate": 0.0496,
          "gpu_busy_percent_avg": 37.3,
          "gpu_busy_percent_peak": 85.0,
          "power_w_avg": 77.17,
          "power_w_peak": 169.0,
          "vram_mb_avg": 5282.6,
          "vram_mb_peak": 8966.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.58878445099981,
        "errors": [
          {
            "detail": "Invented an email body, greeting, closing, and signoff.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Changed the dictated request into an executed email draft.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Added 'Let me know if you have any questions' and '[Your Name]' not present in transcript.",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Did not return the expected single cleaned sentence preserving the instruction as content.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 20,
        "summary": "Generated an email instead of preserving the dictated request as cleaned text, with extra body content and formatting changes."
      },
      "judge_status": "ok",
      "latency_seconds": 2.8304,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "**Subject:** Quarterly Update\n\n**Body:**\nHi Maya,\n\nThe meeting has been rescheduled to **Thursday at 9:30 AM**. The budget is **$12,450**, and the website launch is at **[chirper.local/launch](http://chirper.local/launch)**.\n\nLet me know if you have any questions.\n\nBest,\n[Your Name]\n\n",
      "passed": false,
      "score": 20,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.8319,
        "metrics": {
          "energy_wh_estimate": 0.1,
          "gpu_busy_percent_avg": 53.4,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 127.14,
          "power_w_peak": 297.0,
          "vram_mb_avg": 5934.2,
          "vram_mb_peak": 9121.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 7,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 14.709321149999596,
        "errors": [
          {
            "detail": "Invented email structure including Subject heading, Dear Maya, horizontal rules, and Deployment Notes heading.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Changed 'Please write an email to Maya...' into the email itself, and changed 'look at' to 'review'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "The list title should be a standalone title followed by flat bullets, not a bold nested bullet.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changed 'Hello Chirper.' to 'Hello Chirper,' and split the following sentence incorrectly.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Used 'Systemd' at sentence start instead of the dictated technical casing 'systemd'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Added markdown bolding, link syntax with http://, separators, salutation, and extra headings not dictated.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 42,
        "summary": "Major mixed-formatting failure: it preserved many words but changed the requested cleaned transcript into a formatted email/notes document with invented structure and semantic shifts."
      },
      "judge_status": "ok",
      "latency_seconds": 3.2872,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "Hello Chirper,\n\nI need to write down **accent-friendly** words.\n\n- **Accent-friendly words:**\n  - water\n  - tomato\n  - schedule\n  - data\n  - router\n  - aluminium\n  - privacy\n\n---\n\n**Subject:** Quarterly Update\n\nDear Maya,\n\nThe meeting has been moved to **Thursday at 9:30 AM**. The budget is **$12,450**, and the website is [chirper.local/launch](http://chirper.local/launch).\n\n---\n\n**Deployment Notes:**\nSystemd keeps the Chirper services running. We should also review PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nThanks!\n\n",
      "passed": false,
      "score": 42,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.2887,
        "metrics": {
          "energy_wh_estimate": 0.1361,
          "gpu_busy_percent_avg": 58.1,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 149.0,
          "power_w_peak": 296.0,
          "vram_mb_avg": 6259.4,
          "vram_mb_peak": 9026.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 9.313225774001694,
        "errors": [
          {
            "detail": "Did not format the three tasks as markdown checklist items.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Title is missing the required colon.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Added markdown code fence not present in the intended final text.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Content is mostly preserved, but markdown checklist structure is wrong and extra code fencing was added."
      },
      "judge_status": "ok",
      "latency_seconds": 2.5128,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "```markdown\n- [ ] **Release Tasks**\n  1. Run tests\n  2. Push tag\n  3. Publish release\n```\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.5139,
        "metrics": {
          "energy_wh_estimate": 0.0895,
          "gpu_busy_percent_avg": 53.0,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 128.14,
          "power_w_peak": 299.0,
          "vram_mb_avg": 5807.4,
          "vram_mb_peak": 9005.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 7,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.08524523000051,
        "errors": [
          {
            "detail": "Did not render 'pipe' as | or combine the final filename as test-output.log.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Invented command flags --pipe, --space, and --dash, changing the command meaning.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Added unsupported flag-like tokens not present in the intended command.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "The formatter changed the shell command structure and failed to convert dictated punctuation tokens into the intended pipe and hyphenated filename."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4119,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "cargo test --pipe tee --space test --dash output.log\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4135,
        "metrics": {
          "energy_wh_estimate": 0.0702,
          "gpu_busy_percent_avg": 47.5,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 104.67,
          "power_w_peak": 319.0,
          "vram_mb_avg": 5292.8,
          "vram_mb_peak": 8974.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 9.793759936999777,
        "errors": [
          {
            "detail": "Includes spoken edit phrase 'no scratch' instead of removing it.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not preserve the intended final text 'Call it ChirperBench.'",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Fails to capitalize ChirperBench as dictated.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Uses hyphenated lowercase text instead of the corrected compound identifier and sentence punctuation.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 10,
        "summary": "Leaves spoken correction words in the output and fails to apply the final corrected identifier, casing, and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4125,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "chirper-bench-no-scratch-chirper-benchmark\n\n",
      "passed": false,
      "score": 10,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4138,
        "metrics": {
          "energy_wh_estimate": 0.0692,
          "gpu_busy_percent_avg": 49.5,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 103.17,
          "power_w_peak": 293.0,
          "vram_mb_avg": 5241.4,
          "vram_mb_peak": 8922.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 8.059790957000587,
        "errors": [
          {
            "detail": "Omitted the word \"Visit\" from the transcript.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 75,
        "summary": "URL was formatted correctly, but the output omitted the dictated leading word and final punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3776,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "https://docs.python.org/3/library/json.html\n\n",
      "passed": false,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.379,
        "metrics": {
          "energy_wh_estimate": 0.0541,
          "gpu_busy_percent_avg": 39.5,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 81.83,
          "power_w_peak": 197.0,
          "vram_mb_avg": 5226.1,
          "vram_mb_peak": 8906.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 9.150409517000298,
        "errors": [
          {
            "detail": "Spoken email addresses were left as words with 'at' instead of converted to alex@chirper.local and ops@example.com.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added bold markdown around parts of the addresses.",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Lowercase address components were incorrectly capitalized.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Email addresses were not formatted with @ and unwanted markdown/capitalization was added."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4456,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "send it to **Alex** at **chirper.local** and cc **Ops** at **example.com**.\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4468,
        "metrics": {
          "energy_wh_estimate": 0.0597,
          "gpu_busy_percent_avg": 41.5,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 87.83,
          "power_w_peak": 231.0,
          "vram_mb_avg": 5246.1,
          "vram_mb_peak": 8878.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 7.638539081999625,
        "errors": [
          {
            "detail": "Removes the intended statement structure about project and class names.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Uses a markdown heading instead of plain cleaned transcript text.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Uses Chirper Bench and Dictation Formatter instead of ChirperBench and DictationFormatter.",
            "severity": "major",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 15,
        "summary": "Output changes the sentence into a markdown heading and fails required identifier casing."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3786,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "**Chirper Bench Dictation Formatter**\n\n",
      "passed": false,
      "score": 15,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.3799,
        "metrics": {
          "energy_wh_estimate": 0.0507,
          "gpu_busy_percent_avg": 38.2,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 76.67,
          "power_w_peak": 157.0,
          "vram_mb_avg": 5270.9,
          "vram_mb_peak": 8951.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.94551545899958,
        "errors": [
          {
            "detail": "Changed 'Set variable ... equal to ...' into a direct assignment expression.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted the words 'Set variable' and sentence punctuation.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Formatted the whole assignment as inline code instead of only the identifiers.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Preserves the identifiers and equality meaning, but changes the dictated sentence into a code assignment and omits the requested prose."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3537,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "`user_id = request.user.id`\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.3551,
        "metrics": {
          "energy_wh_estimate": 0.0505,
          "gpu_busy_percent_avg": 41.2,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 77.17,
          "power_w_peak": 155.0,
          "vram_mb_avg": 5229.8,
          "vram_mb_peak": 8910.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 8.17049162299918,
        "errors": [
          {
            "detail": "Sentence starts with lowercase 'version' instead of 'Version'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Date formatted as 'June 3rd, 2026' instead of 'June 3, 2026'.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "Mostly correct formatting, but initial capitalization and date style differ from expected output."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4799,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "version 2.1.0 shipped on June 3rd, 2026 and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4813,
        "metrics": {
          "energy_wh_estimate": 0.0582,
          "gpu_busy_percent_avg": 38.7,
          "gpu_busy_percent_peak": 85.0,
          "power_w_avg": 84.5,
          "power_w_peak": 212.0,
          "vram_mb_avg": 5212.5,
          "vram_mb_peak": 8894.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 7.848503619999974,
        "errors": [
          {
            "detail": "Title should be capitalized as 'Shopping list', while list items should remain lowercase.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "List structure and punctuation are correct, but casing differs from expected output."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4151,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "shopping list:\n- Apples\n- Bananas\n- Oat milk\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4164,
        "metrics": {
          "energy_wh_estimate": 0.0664,
          "gpu_busy_percent_avg": 47.3,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 99.0,
          "power_w_peak": 295.0,
          "vram_mb_avg": 5249.8,
          "vram_mb_peak": 8957.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 5.890966918999766,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3586,
      "model": "ministral-3:3b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.36,
        "metrics": {
          "energy_wh_estimate": 0.051,
          "gpu_busy_percent_avg": 36.7,
          "gpu_busy_percent_peak": 84.0,
          "power_w_avg": 77.83,
          "power_w_peak": 176.0,
          "vram_mb_avg": 5251.8,
          "vram_mb_peak": 8934.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.204320791999635,
        "errors": [
          {
            "detail": "Did not format 'PR number 4' as 'PR #4'.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing final period and comma after 'Fix it'.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 88,
        "summary": "Content is preserved and mostly cleaned, but PR identifier formatting and final punctuation are missed."
      },
      "judge_status": "ok",
      "latency_seconds": 109.572,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "Please check the open PR number 4 for code review comments and check each one that's still open if it needs to be fixed. Fix it and then resolve the comments and push your fix\n\n",
      "passed": true,
      "score": 88,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 109.5735,
        "metrics": {
          "energy_wh_estimate": 8.3072,
          "gpu_busy_percent_avg": 94.9,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 272.93,
          "power_w_peak": 295.0,
          "vram_mb_avg": 23306.9,
          "vram_mb_peak": 23597.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 220,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 5.913557903000765,
        "errors": [],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 103.8718,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "Run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 103.873,
        "metrics": {
          "energy_wh_estimate": 7.9738,
          "gpu_busy_percent_avg": 95.2,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 276.35,
          "power_w_peak": 292.0,
          "vram_mb_avg": 23375.6,
          "vram_mb_peak": 23702.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 209,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.491839082000297,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 22.7164,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 22.7179,
        "metrics": {
          "energy_wh_estimate": 1.5491,
          "gpu_busy_percent_avg": 90.8,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 245.49,
          "power_w_peak": 306.0,
          "vram_mb_avg": 22695.8,
          "vram_mb_peak": 23683.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 47,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0027,
      "model": "olmo-3.1:32b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0043,
        "metrics": {
          "energy_wh_estimate": 14.2734,
          "gpu_busy_percent_avg": 96.6,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 285.46,
          "power_w_peak": 293.0,
          "vram_mb_avg": 23347.0,
          "vram_mb_peak": 23917.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0028,
      "model": "olmo-3.1:32b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.004,
        "metrics": {
          "energy_wh_estimate": 14.2576,
          "gpu_busy_percent_avg": 96.3,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 285.15,
          "power_w_peak": 293.0,
          "vram_mb_avg": 23403.2,
          "vram_mb_peak": 23573.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 7.972676678999051,
        "errors": [
          {
            "detail": "Used plain bullets instead of markdown checklist items with [ ].",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Did not title-case the heading or capitalize checklist items.",
            "severity": "major",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 60,
        "summary": "Preserves the task list content but misses the required markdown checklist formatting and title/item casing."
      },
      "judge_status": "ok",
      "latency_seconds": 100.7938,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "release tasks:\n- run tests\n- push tag\n- publish release\n\n",
      "passed": false,
      "score": 60,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 100.795,
        "metrics": {
          "energy_wh_estimate": 7.7505,
          "gpu_busy_percent_avg": 94.4,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 276.82,
          "power_w_peak": 302.0,
          "vram_mb_avg": 23343.4,
          "vram_mb_peak": 23454.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 203,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 6.403540357001475,
        "errors": [],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves the intended shell command formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 62.2723,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "cargo test | tee test-output.log\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 62.274,
        "metrics": {
          "energy_wh_estimate": 4.7674,
          "gpu_busy_percent_avg": 94.8,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 275.6,
          "power_w_peak": 315.0,
          "vram_mb_avg": 23134.9,
          "vram_mb_peak": 23545.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 126,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 8.77193887500107,
        "errors": [
          {
            "detail": "Changed the intended identifier ChirperBench to Chirper Benchmark.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted the dictated leading words 'Call it'.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Failed to preserve the corrected compound identifier formatting.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Applied some capitalization but changed the intended identifier and omitted surrounding text and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 19.8092,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "Chirper Benchmark\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 19.8109,
        "metrics": {
          "energy_wh_estimate": 1.3322,
          "gpu_busy_percent_avg": 89.6,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 242.1,
          "power_w_peak": 292.0,
          "vram_mb_avg": 22392.4,
          "vram_mb_peak": 23459.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 41,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 5.613805082000908,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected formatted URL and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 109.1475,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 109.1493,
        "metrics": {
          "energy_wh_estimate": 8.4209,
          "gpu_busy_percent_avg": 94.8,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 277.74,
          "power_w_peak": 315.0,
          "vram_mb_avg": 23260.9,
          "vram_mb_peak": 23459.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 220,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 6.232549510001263,
        "errors": [],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly matches the expected formatted email addresses and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 47.4291,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "Send it to alex@chirper.local and cc ops@example.com.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 47.4312,
        "metrics": {
          "energy_wh_estimate": 3.5029,
          "gpu_busy_percent_avg": 93.9,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 265.88,
          "power_w_peak": 293.0,
          "vram_mb_avg": 23003.9,
          "vram_mb_peak": 23459.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 96,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.988023943000371,
        "errors": [
          {
            "detail": "Rendered product name as \"Chirper Bench\" instead of the required identifier-style \"ChirperBench\".",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "Mostly correct, but product identifier casing was not applied fully."
      },
      "judge_status": "ok",
      "latency_seconds": 57.0776,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is DictationFormatter.\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 57.0792,
        "metrics": {
          "energy_wh_estimate": 4.3334,
          "gpu_busy_percent_avg": 94.7,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 273.31,
          "power_w_peak": 292.0,
          "vram_mb_avg": 23091.1,
          "vram_mb_peak": 23545.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 115,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.145327947000624,
        "errors": [
          {
            "detail": "Did not format identifiers as inline code in the expected sentence.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changed the instruction sentence into an assignment expression.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted the words 'Set variable' and 'equal to' from the intended content.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Did not capitalize the sentence start.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Preserves the rough identifiers but changes the dictated prose into code and misses required inline code formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 37.3843,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "user_id = request.user.id\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 37.3858,
        "metrics": {
          "energy_wh_estimate": 2.5883,
          "gpu_busy_percent_avg": 90.3,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 249.24,
          "power_w_peak": 302.0,
          "vram_mb_avg": 22895.0,
          "vram_mb_peak": 23545.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 76,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 9.191008014000545,
        "errors": [
          {
            "detail": "Missing the plus sign in Python 3.11+, changing the version requirement.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Python 3.11+ means Python 3.11 or newer, while Python 3.11 implies only that version.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Added a comma after 2026 that was not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 78,
        "summary": "Mostly correct, but it dropped the required plus from Python 3.11+ and added punctuation not in the expected text."
      },
      "judge_status": "ok",
      "latency_seconds": 35.0596,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11.\n\n",
      "passed": false,
      "score": 78,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 35.0613,
        "metrics": {
          "energy_wh_estimate": 2.5382,
          "gpu_busy_percent_avg": 92.5,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 260.62,
          "power_w_peak": 307.0,
          "vram_mb_avg": 22852.0,
          "vram_mb_peak": 23459.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 72,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 8.414792938001483,
        "errors": [
          {
            "detail": "Expected each item on its own markdown bullet line.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Kept the list inline with commas and a final period instead of using a colon followed by bullets.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Content is preserved, but the spoken list was not formatted as a markdown list after the colon."
      },
      "judge_status": "ok",
      "latency_seconds": 30.2464,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 30.2481,
        "metrics": {
          "energy_wh_estimate": 2.0779,
          "gpu_busy_percent_avg": 90.9,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 247.31,
          "power_w_peak": 292.0,
          "vram_mb_avg": 22753.8,
          "vram_mb_peak": 23459.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 62,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 5.294578734999959,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 32.7524,
      "model": "olmo-3.1:32b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 32.7544,
        "metrics": {
          "energy_wh_estimate": 2.3282,
          "gpu_busy_percent_avg": 92.0,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 255.9,
          "power_w_peak": 297.0,
          "vram_mb_avg": 22806.6,
          "vram_mb_peak": 23459.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 67,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 8.408115633001216,
        "errors": [
          {
            "detail": "Kept \"PR number 4\" instead of formatting it as \"PR #4\".",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 88,
        "summary": "Preserves the intended instruction as dictated content and formats punctuation well, but misses the expected PR identifier formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 13.7784,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "Please check the open PR number 4 for code review comments. Check each one that is still open; if it needs to be fixed, fix it, and then resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 88,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 13.7799,
        "metrics": {
          "energy_wh_estimate": 0.6024,
          "gpu_busy_percent_avg": 61.3,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 157.38,
          "power_w_peak": 291.0,
          "vram_mb_avg": 10810.5,
          "vram_mb_peak": 11930.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 29,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.150367980000738,
        "errors": [
          {
            "detail": "Changed `cargo test` to `cargo test --fix`, inventing a flag not dictated.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted the verb `fix` before `any failing tests`.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Added markdown code formatting that was not required in the expected cleaned text.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Changed the dictated command and lost the intended wording around fixing failing tests."
      },
      "judge_status": "ok",
      "latency_seconds": 5.6083,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "Run `cargo test --fix` any failing tests, then commit and push the branch.\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.6099,
        "metrics": {
          "energy_wh_estimate": 0.1312,
          "gpu_busy_percent_avg": 48.4,
          "gpu_busy_percent_peak": 85.0,
          "power_w_avg": 84.23,
          "power_w_peak": 250.0,
          "vram_mb_avg": 9432.1,
          "vram_mb_peak": 11932.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 5.849649873000089,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 6.2543,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.2557,
        "metrics": {
          "energy_wh_estimate": 0.0653,
          "gpu_busy_percent_avg": 32.7,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 37.57,
          "power_w_peak": 127.0,
          "vram_mb_avg": 9610.3,
          "vram_mb_peak": 11930.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 14,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.854099627000323,
        "errors": [
          {
            "detail": "Changed 'Write an email to Maya' into 'Email to Maya,' losing the imperative request wording intended as literal content.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Split the meeting and budget clauses into separate sentences instead of preserving the comma-linked sentence.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Formatted 'AM' as 'a.m.' instead of the expected casing/style.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Mostly preserved the details, but changed the dictated request wording and missed required punctuation/casing."
      },
      "judge_status": "ok",
      "latency_seconds": 10.4056,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "Email to Maya. Subject: Quarterly Update. The meeting moved to Thursday at 9:30 a.m. The budget is $12,450, and the website is chirper.local/launch.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.4072,
        "metrics": {
          "energy_wh_estimate": 0.483,
          "gpu_busy_percent_avg": 65.1,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 167.09,
          "power_w_peak": 290.0,
          "vram_mb_avg": 10454.1,
          "vram_mb_peak": 11930.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 22,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 13.78280723899843,
        "errors": [
          {
            "detail": "Invents section headings and horizontal rules not dictated, including Email to Maya and Deployment Notes.",
            "severity": "major",
            "type": "over_generation"
          },
          {
            "detail": "Changes 'Please write an email to Maya, subject...' into an email-style block rather than preserving the dictated instruction sentence.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Bullet list uses bold title, nested bullets, capitalization, and asterisks instead of the expected plain title and hyphen list.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Adds a comma after Hello, splits sentences differently, and omits the final exclamation mark after Thanks.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Capitalizes list items and hyphenates/cases the title differently from expected.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Preserves much of the content but adds invented markdown/email/deployment sections, changes intended wording, and misses required punctuation/casing/list formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 17.201,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "Hello, Chirper.\n\nI need to write down accent-friendly words.\n\n*   **Accent-Friendly Words:**\n    *   Water\n    *   Tomato\n    *   Schedule\n    *   Data\n    *   Router\n    *   Aluminium\n    *   Privacy\n\n***\n\n**Email to Maya**\n\n**Subject:** Quarterly Update\n\nThe meeting moved to Thursday at 9:30 AM. The budget is $12,450, and the website is chirper.local/launch.\n\n***\n\n**Deployment Notes**\n\nMention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nThanks!\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 17.2026,
        "metrics": {
          "energy_wh_estimate": 0.8482,
          "gpu_busy_percent_avg": 66.5,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 177.5,
          "power_w_peak": 291.0,
          "vram_mb_avg": 11021.1,
          "vram_mb_peak": 11923.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 36,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 8.679114548000143,
        "errors": [
          {
            "detail": "Used a markdown H1 heading instead of plain title text followed by a colon.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted the colon after Release Tasks.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Used asterisk checklist bullets instead of dash checklist bullets.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 82,
        "summary": "Content is preserved and checklist markdown is usable, but the title punctuation/style and list marker format do not match the expected output."
      },
      "judge_status": "ok",
      "latency_seconds": 9.7727,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "# Release Tasks\n* [ ] Run tests\n* [ ] Push tag\n* [ ] Publish release\n\n",
      "passed": true,
      "score": 82,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 9.7743,
        "metrics": {
          "energy_wh_estimate": 0.3067,
          "gpu_busy_percent_avg": 50.1,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 112.95,
          "power_w_peak": 289.0,
          "vram_mb_avg": 10412.6,
          "vram_mb_peak": 12017.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 21,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 32.69323557899952,
        "errors": [
          {
            "detail": "Did not render spoken pipe, space, dash, and dot into the intended shell command syntax.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changed the command into a slash-delimited path-like string.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Used slashes and an underscore instead of spaces, pipe, hyphen, and dot.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 15,
        "summary": "The output fails to preserve the shell command structure and misformats spoken punctuation and separators."
      },
      "judge_status": "ok",
      "latency_seconds": 11.206,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "cargo/test/pipe/tee/space_test/output.log\n\n",
      "passed": false,
      "score": 15,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 11.2078,
        "metrics": {
          "energy_wh_estimate": 0.5641,
          "gpu_busy_percent_avg": 65.6,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 181.21,
          "power_w_peak": 291.0,
          "vram_mb_avg": 10577.1,
          "vram_mb_peak": 11930.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 24,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 8.686929458999657,
        "errors": [
          {
            "detail": "Changed the intended identifier ChirperBench to Chirper Benchmark.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Failed to format the corrected name as the compound identifier ChirperBench.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "The output missed the final spoken correction for the identifier and changed ChirperBench into two words with the wrong second word."
      },
      "judge_status": "ok",
      "latency_seconds": 7.2652,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "Call it Chirper Benchmark.\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.2668,
        "metrics": {
          "energy_wh_estimate": 0.2435,
          "gpu_busy_percent_avg": 54.6,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 120.62,
          "power_w_peak": 272.0,
          "vram_mb_avg": 9900.9,
          "vram_mb_peak": 11932.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 16,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 6.681184229000792,
        "errors": [
          {
            "detail": "Omitted the dictated word \"Visit\".",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Missing the final period after the URL.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 75,
        "summary": "The URL was formatted correctly, but the leading word and final sentence punctuation were omitted."
      },
      "judge_status": "ok",
      "latency_seconds": 8.6545,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "https://docs.python.org/3/library/json.html\n\n",
      "passed": false,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.6561,
        "metrics": {
          "energy_wh_estimate": 0.3431,
          "gpu_busy_percent_avg": 59.5,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 142.68,
          "power_w_peak": 275.0,
          "vram_mb_avg": 10221.6,
          "vram_mb_peak": 11932.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 19,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 6.733518956998523,
        "errors": [
          {
            "detail": "Rendered \"cc\" as uppercase \"CC\".",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Email addresses and punctuation are correct; only casing of cc differs from expected."
      },
      "judge_status": "ok",
      "latency_seconds": 8.4664,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "Send it to alex@chirper.local and CC ops@example.com.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.4678,
        "metrics": {
          "energy_wh_estimate": 0.3328,
          "gpu_busy_percent_avg": 59.9,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 141.5,
          "power_w_peak": 280.0,
          "vram_mb_avg": 10126.6,
          "vram_mb_peak": 11932.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.016979647998596,
        "errors": [
          {
            "detail": "Inserted a comma before 'and' that was not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Correct product and class casing; only added an unnecessary comma."
      },
      "judge_status": "ok",
      "latency_seconds": 9.3696,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "The project name is ChirperBench, and the class name is DictationFormatter.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 9.3713,
        "metrics": {
          "energy_wh_estimate": 0.3931,
          "gpu_busy_percent_avg": 60.2,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 151.0,
          "power_w_peak": 278.0,
          "vram_mb_avg": 10307.2,
          "vram_mb_peak": 11932.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 20,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 11.210730839000462,
        "errors": [
          {
            "detail": "Did not format code identifiers as inline code with backticks.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Sentence should start with capitalized 'Set'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Changed 'equal to' into '=' instead of preserving the dictated sentence style.",
            "severity": "minor",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Preserves the main identifiers but misses required inline code formatting, casing, and punctuation, and changes the dictated wording."
      },
      "judge_status": "ok",
      "latency_seconds": 8.5258,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "set variable user_id = request.user.id\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.5274,
        "metrics": {
          "energy_wh_estimate": 0.3555,
          "gpu_busy_percent_avg": 59.6,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 150.11,
          "power_w_peak": 295.0,
          "vram_mb_avg": 10221.6,
          "vram_mb_peak": 11932.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 19,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 7.800830026000767,
        "errors": [
          {
            "detail": "Inserted a comma after the date before 'and' that is not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 96,
        "summary": "Content, casing, numbers, date, and version formatting are correct; only an extra comma differs from expected punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 4.9525,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 96,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.9542,
        "metrics": {
          "energy_wh_estimate": 0.0828,
          "gpu_busy_percent_avg": 42.4,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 60.18,
          "power_w_peak": 250.0,
          "vram_mb_avg": 8977.7,
          "vram_mb_peak": 11932.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 11,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 7.591894614000921,
        "errors": [
          {
            "detail": "Did not convert the comma-separated items into a markdown bullet list.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Capitalized 'Apples' where the expected list item is lowercase.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Kept the list inline with commas and a final period instead of placing items on separate bullet lines.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Content is preserved, but the required spoken list was not formatted as a markdown list."
      },
      "judge_status": "ok",
      "latency_seconds": 7.7568,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "Shopping list: Apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.7586,
        "metrics": {
          "energy_wh_estimate": 0.2885,
          "gpu_busy_percent_avg": 57.2,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 133.88,
          "power_w_peak": 289.0,
          "vram_mb_avg": 10019.8,
          "vram_mb_peak": 11930.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 17,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 21.61650385900066,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected clean text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 7.011,
      "model": "gemma4:e4b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.0129,
        "metrics": {
          "energy_wh_estimate": 0.234,
          "gpu_busy_percent_avg": 51.9,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 120.13,
          "power_w_peak": 267.0,
          "vram_mb_avg": 9765.0,
          "vram_mb_peak": 11930.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 15,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.975114090000716,
        "errors": [
          {
            "detail": "Rendered 'PR number 4' instead of the expected 'PR #4'.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted 'the' before 'open PR'.",
            "severity": "minor",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 88,
        "summary": "Preserves the dictated instruction as text, but misses the PR #4 identifier formatting and makes small wording changes."
      },
      "judge_status": "ok",
      "latency_seconds": 9.6705,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "Please check open PR number 4 for code review comments. Check each one that is still open. If it needs to be fixed, fix it, then resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 88,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 9.6727,
        "metrics": {
          "energy_wh_estimate": 0.2994,
          "gpu_busy_percent_avg": 55.8,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 111.43,
          "power_w_peak": 229.0,
          "vram_mb_avg": 8007.5,
          "vram_mb_peak": 9132.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 21,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 8.328101352000886,
        "errors": [
          {
            "detail": "Formatted cargo test as inline code instead of preserving it as plain dictated text.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Content, casing, and punctuation are correct; only added unintended markdown code formatting around cargo test."
      },
      "judge_status": "ok",
      "latency_seconds": 7.2859,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "Run `cargo test`, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.2874,
        "metrics": {
          "energy_wh_estimate": 0.2358,
          "gpu_busy_percent_avg": 54.5,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 116.5,
          "power_w_peak": 228.0,
          "vram_mb_avg": 7656.6,
          "vram_mb_peak": 9134.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 16,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.898660562999794,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 6.4058,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.4072,
        "metrics": {
          "energy_wh_estimate": 0.2068,
          "gpu_busy_percent_avg": 57.4,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 116.21,
          "power_w_peak": 229.0,
          "vram_mb_avg": 7445.0,
          "vram_mb_peak": 9132.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 14,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.521509917000003,
        "errors": [
          {
            "detail": "Did not format 'twelve thousand four hundred fifty dollars' as '$12,450'.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changed the requested comma/subject structure and sentence flow into separate sentences.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 78,
        "summary": "Preserves the intended content and avoids generating an email body, but misses required currency formatting and changes some dictated punctuation structure."
      },
      "judge_status": "ok",
      "latency_seconds": 7.4148,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "Write an email to Maya. Subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM. The budget is twelve thousand four hundred fifty dollars, and the website is chirper.local/launch.\n\n",
      "passed": true,
      "score": 78,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.4162,
        "metrics": {
          "energy_wh_estimate": 0.2531,
          "gpu_busy_percent_avg": 56.8,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 122.88,
          "power_w_peak": 227.0,
          "vram_mb_avg": 7656.0,
          "vram_mb_peak": 9132.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 16,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 15.035585812998761,
        "errors": [
          {
            "detail": "Leaves spoken commands such as 'End of list', 'New paragraph', and 'Finish with' in the output.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not create the requested titled bullet list or paragraph breaks.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changes 'aluminium' to 'aluminum'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Fails to format '9:30 AM' and '$12,450'.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Incorrectly outputs 'Quarterly update', 'ffmpeg', and 'gnome' instead of required technical/title casing.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Splits or changes punctuation around the email subject and meeting details.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 25,
        "summary": "Major mixed-formatting failure: spoken commands leaked, list/paragraph/email formatting was missed, and several required casing, spelling, currency, and time conversions were not applied."
      },
      "judge_status": "ok",
      "latency_seconds": 9.0958,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "Hello Chirper. I need to write down accent-friendly words. This is a bullet point list with the title accent-friendly words: water, tomato, schedule, data, router, aluminum, privacy. End of list. New paragraph. Please write an email to Maya. Subject: Quarterly update. The meeting moved to Thursday at nine thirty a.m. The budget is twelve thousand four hundred fifty dollars, and the website is chirper.local/launch. New paragraph. In the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, ffmpeg, gnome, Nextcloud, and Tailscale. Finish with thanks!\n\n",
      "passed": false,
      "score": 25,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 9.0979,
        "metrics": {
          "energy_wh_estimate": 0.3649,
          "gpu_busy_percent_avg": 62.7,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 144.4,
          "power_w_peak": 232.0,
          "vram_mb_avg": 7943.7,
          "vram_mb_peak": 9125.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 20,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 8.386527747999935,
        "errors": [
          {
            "detail": "Used nested bullets instead of a markdown checklist with - [ ] items.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Title is missing the required colon.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Added Item 1/2/3 labels not present in the intended final text.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Content is preserved, but required markdown checklist format, title casing with colon, and checklist boxes were missed."
      },
      "judge_status": "ok",
      "latency_seconds": 6.6283,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "- Release Tasks\n  - Item 1: Run tests\n  - Item 2: Push tag\n  - Item 3: Publish release\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.63,
        "metrics": {
          "energy_wh_estimate": 0.2286,
          "gpu_busy_percent_avg": 58.3,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 124.13,
          "power_w_peak": 230.0,
          "vram_mb_avg": 7613.8,
          "vram_mb_peak": 9219.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 15,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.377897464999478,
        "errors": [
          {
            "detail": "Left spoken commands like \"pipe\", \"space\", and \"dash\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Did not convert the dictated shell pipeline and filename punctuation.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing the dot before log in test-output.log.",
            "severity": "minor",
            "type": "missing_text"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 25,
        "summary": "Failed to preserve shell command structure; spoken formatting commands were left literal and punctuation was missing."
      },
      "judge_status": "ok",
      "latency_seconds": 6.5072,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "cargo test pipe tee space test dash output log\n\n",
      "passed": false,
      "score": 25,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.5086,
        "metrics": {
          "energy_wh_estimate": 0.2026,
          "gpu_busy_percent_avg": 54.9,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 112.07,
          "power_w_peak": 228.0,
          "vram_mb_avg": 7487.8,
          "vram_mb_peak": 9218.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 14,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 8.527319655999236,
        "errors": [
          {
            "detail": "Leaves spoken edit words such as 'no scratch' and spelling commands in the output.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not apply the final correction to produce the intended name ChirperBench.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Fails to format the identifier as a single CamelCase word.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Includes discarded transcript content beyond the final cleaned sentence.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Critical failure: spoken edit commands were retained and the final corrected identifier was not produced."
      },
      "judge_status": "ok",
      "latency_seconds": 6.7787,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "Call it Chirper Bench, no scratch. That Chirper Benchmark spelled capital C, Chirper capital B bench.\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.7805,
        "metrics": {
          "energy_wh_estimate": 0.2298,
          "gpu_busy_percent_avg": 58.5,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 122.0,
          "power_w_peak": 227.0,
          "vram_mb_avg": 7558.3,
          "vram_mb_peak": 9134.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 15,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 7.246821696999177,
        "errors": [
          {
            "detail": "Sentence should start with capitalized 'Visit'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "URL is formatted correctly, but sentence casing and final punctuation are missing."
      },
      "judge_status": "ok",
      "latency_seconds": 6.583,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "visit https://docs.python.org/3/library/json.html\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.5846,
        "metrics": {
          "energy_wh_estimate": 0.2411,
          "gpu_busy_percent_avg": 62.0,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 131.8,
          "power_w_peak": 312.0,
          "vram_mb_avg": 7558.2,
          "vram_mb_peak": 9134.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 15,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 6.026328457999625,
        "errors": [],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Perfect match; email addresses and punctuation are correctly formatted."
      },
      "judge_status": "ok",
      "latency_seconds": 6.3279,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "Send it to alex@chirper.local and cc ops@example.com.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.3295,
        "metrics": {
          "energy_wh_estimate": 0.1925,
          "gpu_busy_percent_avg": 52.9,
          "gpu_busy_percent_peak": 87.0,
          "power_w_avg": 109.5,
          "power_w_peak": 227.0,
          "vram_mb_avg": 7445.7,
          "vram_mb_peak": 9134.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 14,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 7.723506857000757,
        "errors": [
          {
            "detail": "Used spaced title case instead of identifier casing for ChirperBench and DictationFormatter.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Did not apply code-style product/class name formatting.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Preserved the sentence and punctuation, but missed required product and class identifier casing."
      },
      "judge_status": "ok",
      "latency_seconds": 6.5261,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is Dictation Formatter.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.5283,
        "metrics": {
          "energy_wh_estimate": 0.2131,
          "gpu_busy_percent_avg": 56.3,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 117.53,
          "power_w_peak": 225.0,
          "vram_mb_avg": 7558.3,
          "vram_mb_peak": 9134.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 15,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.454603906999182,
        "errors": [
          {
            "detail": "Code identifiers should be formatted as inline code.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Sentence should start with capitalized 'Set'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 60,
        "summary": "Content is mostly preserved, but required casing, final punctuation, and inline code formatting are missing."
      },
      "judge_status": "ok",
      "latency_seconds": 7.116,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "set variable user_id equal to request.user.id\n\n",
      "passed": false,
      "score": 60,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.1176,
        "metrics": {
          "energy_wh_estimate": 0.2569,
          "gpu_busy_percent_avg": 60.1,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 129.94,
          "power_w_peak": 230.0,
          "vram_mb_avg": 7656.8,
          "vram_mb_peak": 9134.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 16,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 12.381932632000826,
        "errors": [
          {
            "detail": "Missing '+' after Python 3.11.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed 'Python 3.11+' meaning 3.11 or later to just 'Python 3.11'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Inserted a comma after the date that was not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 78,
        "summary": "Mostly correct formatting, but it dropped the plus from Python 3.11+, changing the intended version support."
      },
      "judge_status": "ok",
      "latency_seconds": 6.8549,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11.\n\n",
      "passed": false,
      "score": 78,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.8565,
        "metrics": {
          "energy_wh_estimate": 0.2288,
          "gpu_busy_percent_avg": 56.9,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 120.13,
          "power_w_peak": 229.0,
          "vram_mb_avg": 7558.3,
          "vram_mb_peak": 9134.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 15,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 7.773731260998829,
        "errors": [
          {
            "detail": "Expected a colon followed by bullet list items, not an inline comma-separated sentence.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Content is preserved, but the spoken list was not formatted as a markdown list."
      },
      "judge_status": "ok",
      "latency_seconds": 6.2074,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.2089,
        "metrics": {
          "energy_wh_estimate": 0.1895,
          "gpu_busy_percent_avg": 55.3,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 109.86,
          "power_w_peak": 224.0,
          "vram_mb_avg": 7445.1,
          "vram_mb_peak": 9132.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 14,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 5.521883171999434,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 5.3382,
      "model": "gemma4:e2b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.3403,
        "metrics": {
          "energy_wh_estimate": 0.14,
          "gpu_busy_percent_avg": 51.3,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 94.42,
          "power_w_peak": 224.0,
          "vram_mb_avg": 7163.8,
          "vram_mb_peak": 9132.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 12.990426799999113,
        "errors": [
          {
            "detail": "Sentence split differs slightly from expected, though meaning is preserved.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Preserves the dictated instruction with correct casing and PR number formatting; only minor punctuation/sentence-boundary differences from the expected text."
      },
      "judge_status": "ok",
      "latency_seconds": 55.5349,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "Please check the open PR #4 for code review comments and check each one that's still open. If it needs to be fixed, fix it, and then resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 55.5367,
        "metrics": {
          "energy_wh_estimate": 4.0291,
          "gpu_busy_percent_avg": 85.1,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 261.18,
          "power_w_peak": 299.0,
          "vram_mb_avg": 10381.4,
          "vram_mb_peak": 10733.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 112,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 5.935434883000198,
        "errors": [],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves the dictated instruction as cleaned text with correct punctuation and casing."
      },
      "judge_status": "ok",
      "latency_seconds": 50.5737,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "Run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 50.5755,
        "metrics": {
          "energy_wh_estimate": 3.6859,
          "gpu_busy_percent_avg": 84.6,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 262.37,
          "power_w_peak": 306.0,
          "vram_mb_avg": 10357.1,
          "vram_mb_peak": 10733.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 103,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.2294985239987,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 13.3896,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 13.3911,
        "metrics": {
          "energy_wh_estimate": 0.7116,
          "gpu_busy_percent_avg": 67.9,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 191.32,
          "power_w_peak": 293.0,
          "vram_mb_avg": 9324.2,
          "vram_mb_peak": 10733.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 28,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.675811563000025,
        "errors": [
          {
            "detail": "Changed 'Maya, subject:' into two sentences and split the final comma-linked sentence into separate sentences.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Formatted '9:30 AM' as '9:30 a.m.' instead of the expected uppercase AM.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 84,
        "summary": "Preserves the dictated content but changes the requested punctuation and casing in several places."
      },
      "judge_status": "ok",
      "latency_seconds": 121.935,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "Write an email to Maya. Subject: Quarterly Update. The meeting moved to Thursday at 9:30 a.m. The budget is $12,450. The website is chirper.local/launch.\n\n",
      "passed": true,
      "score": 84,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 121.9368,
        "metrics": {
          "energy_wh_estimate": 9.4302,
          "gpu_busy_percent_avg": 89.3,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 278.41,
          "power_w_peak": 296.0,
          "vram_mb_avg": 10582.2,
          "vram_mb_peak": 10818.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 245,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 16.583256081999934,
        "errors": [
          {
            "detail": "Changed the dictated instruction about writing an email into email-style headers and removed 'Please write an email to Maya'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Expected a plain titled bullet list with colon; output used markdown headings, a hyphenated title, and asterisks.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added headings such as 'To: Maya' and 'Deployment Notes' that were not in the intended final text.",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Changed 'Hello Chirper.' to 'Hello Chirper!' and split comma-connected clauses into separate sentences.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Removed the phrase 'In the deployment notes' and changed the deployment sentence structure.",
            "severity": "major",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 63,
        "summary": "Preserves much of the content but over-formats the dictated email/deployment note instructions, changes punctuation, and misses required inline/list/title formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 135.6784,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "Hello Chirper! I need to write down accent-friendly words.\n\n## Accent-Friendly Words\n* water\n* tomato\n* schedule\n* data\n* router\n* aluminium\n* privacy\n\n## To: Maya\n**Subject:** Quarterly Update\n\nThe meeting moved to Thursday at 9:30 AM. The budget is $12,450, and the website is chirper.local/launch.\n\n## Deployment Notes\nMention that systemd keeps the Chirper services running. We should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nThanks!\n\n",
      "passed": false,
      "score": 63,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 135.6802,
        "metrics": {
          "energy_wh_estimate": 10.5551,
          "gpu_busy_percent_avg": 89.5,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 280.06,
          "power_w_peak": 298.0,
          "vram_mb_avg": 10596.9,
          "vram_mb_peak": 10741.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 273,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 10.479212082998856,
        "errors": [
          {
            "detail": "Omitted the dictated colon after the title.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Rendered the title as an H1 heading instead of plain text with a colon.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "Content and checklist formatting are preserved, but the dictated colon title was changed into a Markdown heading without the colon."
      },
      "judge_status": "ok",
      "latency_seconds": 66.5828,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "# Release Tasks\n\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 66.5847,
        "metrics": {
          "energy_wh_estimate": 4.9964,
          "gpu_busy_percent_avg": 86.8,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 270.14,
          "power_w_peak": 303.0,
          "vram_mb_avg": 10464.8,
          "vram_mb_peak": 10843.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 134,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 10.408192754999618,
        "errors": [
          {
            "detail": "Added ```bash code fence markers not present in expected output.",
            "severity": "minor",
            "type": "extra_text"
          },
          {
            "detail": "Returned the shell command as a fenced Markdown block rather than plain final text.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 80,
        "summary": "Command content is correct, but it added an unnecessary Markdown code fence instead of returning only the cleaned command text."
      },
      "judge_status": "ok",
      "latency_seconds": 73.0588,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "```bash\ncargo test | tee test-output.log\n```\n\n",
      "passed": true,
      "score": 80,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 73.0607,
        "metrics": {
          "energy_wh_estimate": 5.5133,
          "gpu_busy_percent_avg": 87.5,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 271.67,
          "power_w_peak": 293.0,
          "vram_mb_avg": 10475.2,
          "vram_mb_peak": 10743.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 147,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 10.265306018000047,
        "errors": [
          {
            "detail": "Changed the intended final name from ChirperBench to Chirper Benchmark.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Missed the compound identifier formatting with no space between Chirper and Bench.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Removed spoken edits and added basic casing/punctuation, but failed the final spelled identifier: it should be ChirperBench, not Chirper Benchmark."
      },
      "judge_status": "ok",
      "latency_seconds": 78.4108,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "Call it Chirper Benchmark.\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 78.4124,
        "metrics": {
          "energy_wh_estimate": 5.9482,
          "gpu_busy_percent_avg": 87.9,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 273.09,
          "power_w_peak": 294.0,
          "vram_mb_avg": 10493.9,
          "vram_mb_peak": 10743.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 158,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 6.166952689000027,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves and formats the spoken HTTPS URL with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 56.5246,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 56.5264,
        "metrics": {
          "energy_wh_estimate": 4.1671,
          "gpu_busy_percent_avg": 86.0,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 265.39,
          "power_w_peak": 298.0,
          "vram_mb_avg": 10401.5,
          "vram_mb_peak": 10751.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 114,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0028,
      "model": "qwen3.5:9b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0048,
        "metrics": {
          "energy_wh_estimate": 14.1368,
          "gpu_busy_percent_avg": 90.1,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 282.73,
          "power_w_peak": 298.0,
          "vram_mb_avg": 10634.7,
          "vram_mb_peak": 10822.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 7.779465626001183,
        "errors": [
          {
            "detail": "Used spaced title case for ChirperBench and DictationFormatter instead of required identifier casing.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Missed code-style/product identifier formatting by leaving spaces in both names.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "The text is punctuated and capitalized, but the product and class identifiers were not converted to the required code-style casing."
      },
      "judge_status": "ok",
      "latency_seconds": 58.6964,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is Dictation Formatter.\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 58.6978,
        "metrics": {
          "energy_wh_estimate": 4.4346,
          "gpu_busy_percent_avg": 86.4,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 271.98,
          "power_w_peak": 317.0,
          "vram_mb_avg": 10480.3,
          "vram_mb_peak": 10736.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 119,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 10.752249799999845,
        "errors": [
          {
            "detail": "Used a fenced code block instead of inline code around identifiers.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Did not capitalize the initial word.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Changed 'equal to' into '=' rather than preserving the dictated sentence form.",
            "severity": "minor",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Preserves the main identifiers but misses required inline code formatting and casing, and changes the dictated wording into a code-block style assignment."
      },
      "judge_status": "ok",
      "latency_seconds": 67.1883,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "```\nset variable user_id = request.user.id.\n```\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 67.1901,
        "metrics": {
          "energy_wh_estimate": 5.0466,
          "gpu_busy_percent_avg": 86.7,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 270.4,
          "power_w_peak": 301.0,
          "vram_mb_avg": 10447.9,
          "vram_mb_peak": 10821.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 136,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 7.185003130000041,
        "errors": [],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected formatting exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 50.6363,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 50.6377,
        "metrics": {
          "energy_wh_estimate": 3.7033,
          "gpu_busy_percent_avg": 85.0,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 263.28,
          "power_w_peak": 304.0,
          "vram_mb_avg": 10379.5,
          "vram_mb_peak": 10831.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 103,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 8.562927241999205,
        "errors": [
          {
            "detail": "Did not format the spoken shopping list as markdown bullet items.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Content preserved, but it missed the required list formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 36.9068,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 36.9085,
        "metrics": {
          "energy_wh_estimate": 2.5939,
          "gpu_busy_percent_avg": 82.8,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 253.01,
          "power_w_peak": 298.0,
          "vram_mb_avg": 10213.9,
          "vram_mb_peak": 10737.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 75,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 19.682430499000475,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected clean text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 14.5987,
      "model": "qwen3.5:9b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 14.6005,
        "metrics": {
          "energy_wh_estimate": 0.9041,
          "gpu_busy_percent_avg": 74.4,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 222.94,
          "power_w_peak": 298.0,
          "vram_mb_avg": 9466.5,
          "vram_mb_peak": 10737.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 31,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.347621158000038,
        "errors": [
          {
            "detail": "Kept 'PR number 4' instead of formatting it as 'PR #4'.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 88,
        "summary": "Preserves the dictated instruction and cleans punctuation, but misses the expected PR number formatting as #4."
      },
      "judge_status": "ok",
      "latency_seconds": 49.0744,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "Please check the open PR number 4 for code review comments and check each one that's still open. If it needs to be fixed, fix it. Then resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 88,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 49.0764,
        "metrics": {
          "energy_wh_estimate": 3.6252,
          "gpu_busy_percent_avg": 83.8,
          "gpu_busy_percent_peak": 91.0,
          "power_w_avg": 265.93,
          "power_w_peak": 291.0,
          "vram_mb_avg": 7880.8,
          "vram_mb_peak": 8172.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 100,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 8.857592182001099,
        "errors": [
          {
            "detail": "Added backticks around cargo test instead of preserving it as plain text.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Content, casing, and punctuation are correct, but it added markdown code formatting not present in the expected text."
      },
      "judge_status": "ok",
      "latency_seconds": 68.1213,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "Run `cargo test`, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 68.123,
        "metrics": {
          "energy_wh_estimate": 5.2233,
          "gpu_busy_percent_avg": 86.2,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 276.03,
          "power_w_peak": 292.0,
          "vram_mb_avg": 7961.7,
          "vram_mb_peak": 8172.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 138,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.146520643000258,
        "errors": [
          {
            "detail": "Missing final question mark from dictated 'question mark'.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 85,
        "summary": "Content and casing are correct, but the required question mark was omitted."
      },
      "judge_status": "ok",
      "latency_seconds": 44.0499,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": false,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 44.0516,
        "metrics": {
          "energy_wh_estimate": 3.2888,
          "gpu_busy_percent_avg": 84.0,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 268.78,
          "power_w_peak": 292.0,
          "vram_mb_avg": 7861.0,
          "vram_mb_peak": 8256.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 89,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.995406915000785,
        "errors": [
          {
            "detail": "Uses \"Subject\" instead of expected lowercase \"subject\" after the comma.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Formats time as \"9:30 a.m.\" instead of expected \"9:30 AM\".",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Content is preserved and no email body was invented, but casing/style differs from the expected formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 104.7778,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "Write an email to Maya, Subject: Quarterly Update. The meeting moved to Thursday at 9:30 a.m., the budget is $12,450, and the website is chirper.local/launch.\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 104.78,
        "metrics": {
          "energy_wh_estimate": 8.2094,
          "gpu_busy_percent_avg": 87.3,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 282.06,
          "power_w_peak": 302.0,
          "vram_mb_avg": 8038.2,
          "vram_mb_peak": 8257.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 211,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 14.713554807998662,
        "errors": [
          {
            "detail": "Leaves dictated control text such as \"This is a bullet point list with title\" and \"End of list\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Formats the accent-friendly words as one bullet with comma-separated items instead of a titled bullet list with one item per line.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Handles time, currency, URL, and most technical casing, but misses required list structure and some punctuation/casing requirements.",
            "severity": "major",
            "type": "partial_mixed_task"
          },
          {
            "detail": "Changes the opening sentence from a period to a comma and splits later dictated sentences awkwardly.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Capitalizes list items that should remain lowercase.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Preserves much of the content and technical casing, but leaks spoken list instructions and misses the required bullet/list and paragraph formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 121.8191,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "Hello Chirper, I need to write down accent-friendly words.\n\nThis is a bullet point list with title: **Accent Friendly Words**.\n* Water, Tomato, Schedule, Data, Router, Aluminium, Privacy.\n\nEnd of list.\n\nPlease write an email to Maya.\n**Subject:** Quarterly Update.\nThe meeting moved to Thursday at 9:30 AM.\nThe budget is $12,450.\nAnd the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running. And we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nThanks!\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 121.8209,
        "metrics": {
          "energy_wh_estimate": 9.5609,
          "gpu_busy_percent_avg": 88.0,
          "gpu_busy_percent_peak": 92.0,
          "power_w_avg": 282.54,
          "power_w_peak": 292.0,
          "vram_mb_avg": 8061.3,
          "vram_mb_peak": 8180.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 245,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 10.766347452001355,
        "errors": [
          {
            "detail": "Did not create a markdown checklist with - [ ] items.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Leaves 'Make a markdown checklist titled' and item labels instead of applying them.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Uses sentence casing for the leaked instruction instead of the intended title-only output.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Adds prose around the checklist content that should not appear.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Preserves the task items but fails to format as a markdown checklist and leaves dictated formatting instructions in the output."
      },
      "judge_status": "ok",
      "latency_seconds": 80.4694,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "Make a markdown checklist titled Release Tasks: Item 1. Run tests. Item 2. Push tag. Item 3. Publish release.\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 80.4717,
        "metrics": {
          "energy_wh_estimate": 6.226,
          "gpu_busy_percent_avg": 86.8,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 278.53,
          "power_w_peak": 292.0,
          "vram_mb_avg": 8014.9,
          "vram_mb_peak": 8266.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 162,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.665557251999417,
        "errors": [
          {
            "detail": "Inserted `--` after `cargo test` and changed `tee` arguments.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Did not preserve the dictated filename `test-output.log`.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 20,
        "summary": "Command structure and filename were changed, altering the intended shell command."
      },
      "judge_status": "ok",
      "latency_seconds": 142.3234,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "cargo test -- | tee -o output.log\n\n",
      "passed": false,
      "score": 20,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 142.3253,
        "metrics": {
          "energy_wh_estimate": 11.2291,
          "gpu_busy_percent_avg": 87.8,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 284.03,
          "power_w_peak": 311.0,
          "vram_mb_avg": 8070.3,
          "vram_mb_peak": 8172.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 286,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 9.550827738999942,
        "errors": [
          {
            "detail": "Leaves correction phrases such as \"No, scratch that\" and spelling instructions in the final text.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Changes the intended identifier ChirperBench to Chirper Benchmark.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Fails to format the corrected name as the single CamelCase identifier ChirperBench.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Includes superseded and explanatory dictated text that should have been removed.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 15,
        "summary": "The output leaks spoken correction commands and changes the intended identifier from ChirperBench to Chirper Benchmark/Benchmark."
      },
      "judge_status": "ok",
      "latency_seconds": 87.3196,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "Call it Chirper Benchmark. No, scratch that, Chirper Benchmark. Spelled Capital C Chirper, Capital B Benchmark.\n\n",
      "passed": false,
      "score": 15,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 87.3213,
        "metrics": {
          "energy_wh_estimate": 6.7791,
          "gpu_busy_percent_avg": 87.0,
          "gpu_busy_percent_peak": 91.0,
          "power_w_avg": 279.48,
          "power_w_peak": 292.0,
          "vram_mb_avg": 8013.4,
          "vram_mb_peak": 8256.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 176,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 8.19994327799941,
        "errors": [
          {
            "detail": "Expected a plain HTTPS URL, but output used markdown link syntax.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 85,
        "summary": "URL content and punctuation are correct, but it incorrectly formats the URL as a markdown link instead of plain text."
      },
      "judge_status": "ok",
      "latency_seconds": 53.8662,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "Visit [https://docs.python.org/3/library/json.html](https://docs.python.org/3/library/json.html).\n\n",
      "passed": false,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 53.8678,
        "metrics": {
          "energy_wh_estimate": 4.0874,
          "gpu_busy_percent_avg": 85.4,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 273.17,
          "power_w_peak": 293.0,
          "vram_mb_avg": 7920.9,
          "vram_mb_peak": 8266.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 109,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 9.63414418499815,
        "errors": [
          {
            "detail": "Email addresses should be plain text, not markdown mailto links.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Expected lowercase \"cc\", but output used uppercase \"CC\".",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 82,
        "summary": "Email addresses and punctuation are correct, but the formatter added unwanted mailto markdown links and changed cc casing."
      },
      "judge_status": "ok",
      "latency_seconds": 68.2848,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "Send it to [alex@chirper.local](mailto:alex@chirper.local) and CC [ops@example.com](mailto:ops@example.com).\n\n",
      "passed": true,
      "score": 82,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 68.2869,
        "metrics": {
          "energy_wh_estimate": 5.2412,
          "gpu_busy_percent_avg": 86.1,
          "gpu_busy_percent_peak": 91.0,
          "power_w_avg": 276.31,
          "power_w_peak": 292.0,
          "vram_mb_avg": 7961.0,
          "vram_mb_peak": 8172.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 138,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.297652081000706,
        "errors": [
          {
            "detail": "Expected product name as ChirperBench, not Chirper Bench.",
            "severity": "major",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 75,
        "summary": "Class casing is correct, but product identifier casing/spacing is wrong."
      },
      "judge_status": "ok",
      "latency_seconds": 52.6807,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is DictationFormatter.\n\n",
      "passed": false,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 52.6825,
        "metrics": {
          "energy_wh_estimate": 3.9845,
          "gpu_busy_percent_avg": 85.1,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 272.28,
          "power_w_peak": 292.0,
          "vram_mb_avg": 7899.9,
          "vram_mb_peak": 8172.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 107,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 10.317665717000637,
        "errors": [
          {
            "detail": "Changed \u201cequal to\u201d to \u201c=\u201d, which is not the expected cleaned wording.",
            "severity": "minor",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 86,
        "summary": "Identifiers and inline code are formatted correctly, but wording differs from the expected text by replacing \u201cequal to\u201d with \u201c=\u201d"
      },
      "judge_status": "ok",
      "latency_seconds": 97.889,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "Set variable `user_id` = `request.user.id`.\n\n",
      "passed": true,
      "score": 86,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 97.8909,
        "metrics": {
          "energy_wh_estimate": 7.6331,
          "gpu_busy_percent_avg": 87.4,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 280.72,
          "power_w_peak": 299.0,
          "vram_mb_avg": 8031.4,
          "vram_mb_peak": 8256.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 197,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 8.859982016001595,
        "errors": [
          {
            "detail": "Added a comma after the date that was not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Content, casing, numbers, date, and version formatting are correct; only punctuation differs."
      },
      "judge_status": "ok",
      "latency_seconds": 63.3887,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 63.39,
        "metrics": {
          "energy_wh_estimate": 4.8402,
          "gpu_busy_percent_avg": 85.8,
          "gpu_busy_percent_peak": 91.0,
          "power_w_avg": 274.88,
          "power_w_peak": 291.0,
          "vram_mb_avg": 7958.9,
          "vram_mb_peak": 8266.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 128,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 22.82853123500172,
        "errors": [
          {
            "detail": "Rendered the items inline with commas instead of as a bulleted list.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Preserves the words but misses the dictated list formatting after the colon."
      },
      "judge_status": "ok",
      "latency_seconds": 39.2874,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 39.2888,
        "metrics": {
          "energy_wh_estimate": 2.9157,
          "gpu_busy_percent_avg": 83.9,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 267.16,
          "power_w_peak": 293.0,
          "vram_mb_avg": 7808.5,
          "vram_mb_peak": 8172.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 80,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 6.458768763997796,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected clean text unchanged."
      },
      "judge_status": "ok",
      "latency_seconds": 10.516,
      "model": "qwen3.5:4b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.5184,
        "metrics": {
          "energy_wh_estimate": 0.5922,
          "gpu_busy_percent_avg": 66.9,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 202.7,
          "power_w_peak": 288.0,
          "vram_mb_avg": 6904.9,
          "vram_mb_peak": 8172.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 23,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0031,
      "model": "qwen3.5:0.8b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.005,
        "metrics": {
          "energy_wh_estimate": 10.207,
          "gpu_busy_percent_avg": 83.7,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 204.14,
          "power_w_peak": 219.0,
          "vram_mb_avg": 4321.4,
          "vram_mb_peak": 4432.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.06806375099768,
        "errors": [
          {
            "detail": "Leaves spoken punctuation word \"period\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Adds unintended word \"check\" after \"cargo test\".",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Does not convert spoken commas and period into punctuation.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Sentence should start with capitalized \"Run\".",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Preserves most content but leaks spoken punctuation, misses required punctuation/casing, and adds an unintended word."
      },
      "judge_status": "ok",
      "latency_seconds": 34.0026,
      "model": "qwen3.5:0.8b",
      "ollama_status": "ok",
      "output": "run cargo test check fix any failing tests then commit and push the branch period\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 34.004,
        "metrics": {
          "energy_wh_estimate": 1.854,
          "gpu_busy_percent_avg": 78.5,
          "gpu_busy_percent_peak": 87.0,
          "power_w_avg": 196.29,
          "power_w_peak": 211.0,
          "vram_mb_avg": 4222.5,
          "vram_mb_peak": 4347.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 69,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0034,
      "model": "qwen3.5:0.8b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0048,
        "metrics": {
          "energy_wh_estimate": 10.35,
          "gpu_busy_percent_avg": 83.9,
          "gpu_busy_percent_peak": 87.0,
          "power_w_avg": 206.99,
          "power_w_peak": 222.0,
          "vram_mb_avg": 4313.8,
          "vram_mb_peak": 4347.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0032,
      "model": "qwen3.5:0.8b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0045,
        "metrics": {
          "energy_wh_estimate": 10.6314,
          "gpu_busy_percent_avg": 84.0,
          "gpu_busy_percent_peak": 87.0,
          "power_w_avg": 212.62,
          "power_w_peak": 224.0,
          "vram_mb_avg": 4328.5,
          "vram_mb_peak": 4534.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0029,
      "model": "qwen3.5:0.8b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0045,
        "metrics": {
          "energy_wh_estimate": 10.6938,
          "gpu_busy_percent_avg": 83.9,
          "gpu_busy_percent_peak": 87.0,
          "power_w_avg": 213.87,
          "power_w_peak": 227.0,
          "vram_mb_avg": 4322.9,
          "vram_mb_peak": 4432.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 11.571597775997361,
        "errors": [
          {
            "detail": "Did not format the title and checklist items as markdown.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Left dictated instructions such as 'make a markdown checklist titled' and 'item one/item two/item three' in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Title and checklist item casing were not cleaned up.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Output preserves the rough words but fails to create the requested markdown checklist and leaves spoken formatting instructions in place."
      },
      "judge_status": "ok",
      "latency_seconds": 38.9224,
      "model": "qwen3.5:0.8b",
      "ollama_status": "ok",
      "output": "make a markdown checklist titled release tasks: item one run tests item two push tag item three publish release\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 38.9238,
        "metrics": {
          "energy_wh_estimate": 2.1764,
          "gpu_busy_percent_avg": 78.9,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 201.29,
          "power_w_peak": 247.0,
          "vram_mb_avg": 4230.0,
          "vram_mb_peak": 4347.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 79,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.841706893002993,
        "errors": [
          {
            "detail": "Did not convert spoken tokens like pipe, space, dash, and dot into shell syntax.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Left dictated formatting words in the final text.",
            "severity": "major",
            "type": "instruction_leak"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 25,
        "summary": "The output leaves spoken command tokens unformatted instead of producing the shell command structure."
      },
      "judge_status": "ok",
      "latency_seconds": 36.8465,
      "model": "qwen3.5:0.8b",
      "ollama_status": "ok",
      "output": "cargo test pipe tee space test dash output dot log\n\n",
      "passed": false,
      "score": 25,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 36.8486,
        "metrics": {
          "energy_wh_estimate": 1.9432,
          "gpu_busy_percent_avg": 77.9,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 189.85,
          "power_w_peak": 215.0,
          "vram_mb_avg": 4202.3,
          "vram_mb_peak": 4457.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 75,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 12.711772941001982,
        "errors": [
          {
            "detail": "Leaves correction language such as \"no scratch that\" and \"spelled capital\" in the output.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Changes intended identifier from \"ChirperBench\" to variants of \"chip bench\" and \"chip benchmark\".",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Does not format the final identifier as \"ChirperBench\".",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Includes discarded transcript content instead of only the final cleaned sentence.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Output leaves spoken edit commands, uses the wrong identifier, and does not apply final casing or punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 39.8838,
      "model": "qwen3.5:0.8b",
      "ollama_status": "ok",
      "output": "call it chip bench no scratch that chip benchmark spelled capital C chip benchmark spelled capital B bench\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 39.8856,
        "metrics": {
          "energy_wh_estimate": 2.0808,
          "gpu_busy_percent_avg": 78.7,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 187.81,
          "power_w_peak": 207.0,
          "vram_mb_avg": 4225.1,
          "vram_mb_peak": 4457.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 81,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0029,
      "model": "qwen3.5:0.8b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0048,
        "metrics": {
          "energy_wh_estimate": 10.157,
          "gpu_busy_percent_avg": 83.8,
          "gpu_busy_percent_peak": 87.0,
          "power_w_avg": 203.14,
          "power_w_peak": 220.0,
          "vram_mb_avg": 4324.9,
          "vram_mb_peak": 4432.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0031,
      "model": "qwen3.5:0.8b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0043,
        "metrics": {
          "energy_wh_estimate": 10.6069,
          "gpu_busy_percent_avg": 84.0,
          "gpu_busy_percent_peak": 87.0,
          "power_w_avg": 212.13,
          "power_w_peak": 226.0,
          "vram_mb_avg": 4321.8,
          "vram_mb_peak": 4501.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 13.580127125002036,
        "errors": [
          {
            "detail": "Product and class identifiers were not converted to ChirperBench and DictationFormatter.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Spoken command 'period' was left as text.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Final period was not applied correctly.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Output preserves the basic sentence but misses required identifier casing/spacing and leaks the spoken punctuation command."
      },
      "judge_status": "ok",
      "latency_seconds": 31.5925,
      "model": "qwen3.5:0.8b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is Dictation Formatter Period.\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 31.5944,
        "metrics": {
          "energy_wh_estimate": 1.7406,
          "gpu_busy_percent_avg": 78.0,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 198.34,
          "power_w_peak": 217.0,
          "vram_mb_avg": 4188.8,
          "vram_mb_peak": 4347.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 65,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 11.308658989000833,
        "errors": [
          {
            "detail": "Did not format `user_id` or `request.user.id` as inline code.",
            "severity": "critical",
            "type": "formatting_miss"
          },
          {
            "detail": "Left spoken punctuation/edit words such as \"dot\" and \"period\" in the text.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Changed intended identifier `user_id` to `user id`.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Did not capitalize the opening word.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Did not render the final period or dotted property path.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "The output fails to format identifiers as inline code, leaks spoken punctuation commands, removes the underscore, and misses required casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 38.6362,
      "model": "qwen3.5:0.8b",
      "ollama_status": "ok",
      "output": "set variable user id equal to request dot user dot id period\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 38.6379,
        "metrics": {
          "energy_wh_estimate": 2.0344,
          "gpu_busy_percent_avg": 78.8,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 189.56,
          "power_w_peak": 207.0,
          "vram_mb_avg": 4209.9,
          "vram_mb_peak": 4432.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 79,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0034,
      "model": "qwen3.5:0.8b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0051,
        "metrics": {
          "energy_wh_estimate": 10.2597,
          "gpu_busy_percent_avg": 83.7,
          "gpu_busy_percent_peak": 87.0,
          "power_w_avg": 205.19,
          "power_w_peak": 221.0,
          "vram_mb_avg": 4318.5,
          "vram_mb_peak": 4347.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0028,
      "model": "qwen3.5:0.8b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0041,
        "metrics": {
          "energy_wh_estimate": 10.6076,
          "gpu_busy_percent_avg": 84.0,
          "gpu_busy_percent_peak": 87.0,
          "power_w_avg": 212.15,
          "power_w_peak": 227.0,
          "vram_mb_avg": 4322.7,
          "vram_mb_peak": 4431.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0028,
      "model": "qwen3.5:0.8b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.004,
        "metrics": {
          "energy_wh_estimate": 10.6224,
          "gpu_busy_percent_avg": 83.4,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 212.44,
          "power_w_peak": 236.0,
          "vram_mb_avg": 4320.3,
          "vram_mb_peak": 4430.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0038,
      "model": "qwen3.5:2b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0052,
        "metrics": {
          "energy_wh_estimate": 12.4883,
          "gpu_busy_percent_avg": 86.3,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 249.76,
          "power_w_peak": 261.0,
          "vram_mb_avg": 5967.1,
          "vram_mb_peak": 6098.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.711533349000092,
        "errors": [
          {
            "detail": "The first word should be capitalized as sentence-initial text.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Content and punctuation are correct; only sentence-initial capitalization is missing."
      },
      "judge_status": "ok",
      "latency_seconds": 49.9913,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 49.9932,
        "metrics": {
          "energy_wh_estimate": 3.3296,
          "gpu_busy_percent_avg": 83.2,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 239.77,
          "power_w_peak": 255.0,
          "vram_mb_avg": 5849.6,
          "vram_mb_peak": 6013.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 101,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.147043293000024,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves and formats the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 41.525,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 41.5272,
        "metrics": {
          "energy_wh_estimate": 2.6801,
          "gpu_busy_percent_avg": 82.8,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 232.35,
          "power_w_peak": 279.0,
          "vram_mb_avg": 5793.4,
          "vram_mb_peak": 6013.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 84,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.80282192699815,
        "errors": [
          {
            "detail": "Leaves spoken commands as text: \"Period\" in the subject and \"slash launch period\" in the URL.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "URL should be formatted as chirper.local/launch.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Uses \"Subject\" instead of \"subject\", \"am\" instead of \"AM\", and capitalizes \"The budget\" after a comma.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing comma before \"and\" and incorrectly places subject punctuation.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Preserves much of the content but leaves spoken punctuation in the subject and URL, misses URL formatting, and has casing/punctuation errors."
      },
      "judge_status": "ok",
      "latency_seconds": 65.3214,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "Write an email to Maya, Subject: Quarterly Update Period. The meeting moved to Thursday at 9:30 am, The budget is $12,450 and the website is chirper.local slash launch period.\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 65.3232,
        "metrics": {
          "energy_wh_estimate": 4.2919,
          "gpu_busy_percent_avg": 83.9,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 236.53,
          "power_w_peak": 255.0,
          "vram_mb_avg": 5886.1,
          "vram_mb_peak": 6097.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 132,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 13.919378743001289,
        "errors": [
          {
            "detail": "Leaves spoken commands such as \"period,\" \"end of list,\" \"New paragraph,\" and \"Finish with thanks\" in the output.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not create the required titled bullet list.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changes \"Hello Chirper.\" to \"Chirper, period.\"",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Fails required casing for Quarterly Update, Chirper, PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Incorrect paragraph structure and punctuation around the list title and email subject.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Formats the budget as \"12,450 dollars\" instead of \"$12,450\".",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 28,
        "summary": "Major formatting misses and instruction leaks; content is partially preserved but punctuation, list formatting, casing, spoken edits, and currency formatting are wrong."
      },
      "judge_status": "ok",
      "latency_seconds": 59.7066,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "Chirper, period. I need to write down accent-friendly words.\nThis is a bullet point list with title: accent-friendly words: water, tomato, schedule, data, router, aluminium, privacy. end of list\n\nNew paragraph\nPlease write an email to Maya, subject: quarterly update.\n\nThe meeting moved to Thursday at 9:30 AM, the budget is 12,450 dollars, and the website is chirper.local/launch.\n\nNew paragraph\nIn the deployment notes, mention that systemd keeps the chirper services running, and we should also look at postgresql, ffmpeg, gnome, nextcloud, and tailscale.\n\nFinish with thanks!\n\n",
      "passed": false,
      "score": 28,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 59.7087,
        "metrics": {
          "energy_wh_estimate": 3.8628,
          "gpu_busy_percent_avg": 83.7,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 232.9,
          "power_w_peak": 250.0,
          "vram_mb_avg": 5859.9,
          "vram_mb_peak": 6013.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 121,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 9.317945099002827,
        "errors": [
          {
            "detail": "Kept 'Make a markdown checklist titled' instead of returning only the formatted checklist.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Used an inline numbered list instead of markdown checklist items.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Capitalized checklist item words differently from the expected output.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Preserves the task items but leaks the spoken instruction and fails to format as a markdown checklist."
      },
      "judge_status": "ok",
      "latency_seconds": 45.2991,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "Make a markdown checklist titled Release Tasks: 1. Run Tests, 2. Push Tag, 3. Publish Release.\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 45.3013,
        "metrics": {
          "energy_wh_estimate": 2.8425,
          "gpu_busy_percent_avg": 82.6,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 225.89,
          "power_w_peak": 247.0,
          "vram_mb_avg": 5812.7,
          "vram_mb_peak": 6013.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 92,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.778102061998652,
        "errors": [
          {
            "detail": "Did not format pipe, spaces, dash, or dot as shell command characters.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Left spoken punctuation words instead of `|`, `-`, and `.`.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 30,
        "summary": "Failed to convert dictated shell-command tokens into the intended command syntax."
      },
      "judge_status": "ok",
      "latency_seconds": 92.6329,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "cargo test pipe tee space test dash output dot log\n\n",
      "passed": false,
      "score": 30,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 92.6344,
        "metrics": {
          "energy_wh_estimate": 6.0771,
          "gpu_busy_percent_avg": 85.3,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 236.17,
          "power_w_peak": 252.0,
          "vram_mb_avg": 5916.1,
          "vram_mb_peak": 6013.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 187,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.003,
      "model": "qwen3.5:2b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.005,
        "metrics": {
          "energy_wh_estimate": 12.0798,
          "gpu_busy_percent_avg": 85.3,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 241.59,
          "power_w_peak": 293.0,
          "vram_mb_avg": 5968.0,
          "vram_mb_peak": 6098.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 5.453801227002259,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected formatted URL and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 52.7982,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 52.8002,
        "metrics": {
          "energy_wh_estimate": 3.5164,
          "gpu_busy_percent_avg": 83.7,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 239.76,
          "power_w_peak": 256.0,
          "vram_mb_avg": 5881.3,
          "vram_mb_peak": 6013.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 107,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0032,
      "model": "qwen3.5:2b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.005,
        "metrics": {
          "energy_wh_estimate": 12.3913,
          "gpu_busy_percent_avg": 86.2,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 247.82,
          "power_w_peak": 263.0,
          "vram_mb_avg": 5967.6,
          "vram_mb_peak": 6098.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 9.318162369003403,
        "errors": [
          {
            "detail": "Used 'Chirper Bench' instead of 'ChirperBench'.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Used 'Dictation Formatter' instead of 'DictationFormatter'.",
            "severity": "major",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 60,
        "summary": "Identifier casing was not applied correctly; product and class names kept spaces instead of code-style casing."
      },
      "judge_status": "ok",
      "latency_seconds": 58.0375,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is Dictation Formatter.\n\n",
      "passed": false,
      "score": 60,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 58.0389,
        "metrics": {
          "energy_wh_estimate": 3.8462,
          "gpu_busy_percent_avg": 83.7,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 238.57,
          "power_w_peak": 266.0,
          "vram_mb_avg": 5912.3,
          "vram_mb_peak": 6013.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 117,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.99170912900081,
        "errors": [
          {
            "detail": "Code identifiers were not formatted as inline code.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted the word \"variable\" from the intended sentence.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed the dictated prose phrase \"equal to\" into an assignment expression.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Sentence should start with capitalized \"Set\".",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Preserves the rough meaning but drops required wording, casing, punctuation style, and inline code formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 61.4795,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "set user_id = request.user.id.\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 61.4817,
        "metrics": {
          "energy_wh_estimate": 4.0246,
          "gpu_busy_percent_avg": 83.7,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 235.66,
          "power_w_peak": 252.0,
          "vram_mb_avg": 5866.7,
          "vram_mb_peak": 6013.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 124,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 9.854038267996657,
        "errors": [
          {
            "detail": "Dropped the plus from Python 3.11+, changing the intended version support.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Formatted the date as June 3rd, 2026 instead of June 3, 2026.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 78,
        "summary": "Mostly correct formatting, but it changed the supported Python version requirement and used non-ideal date formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 55.6676,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3rd, 2026 and supports Python 3.11.\n\n",
      "passed": false,
      "score": 78,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 55.6692,
        "metrics": {
          "energy_wh_estimate": 3.5959,
          "gpu_busy_percent_avg": 83.4,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 232.54,
          "power_w_peak": 253.0,
          "vram_mb_avg": 5857.5,
          "vram_mb_peak": 6097.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 113,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 7.580470177999814,
        "errors": [
          {
            "detail": "Rendered the items inline instead of as a bulleted list after the colon.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Content is preserved, but the dictated list was not formatted as a markdown list."
      },
      "judge_status": "ok",
      "latency_seconds": 44.3289,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 44.3308,
        "metrics": {
          "energy_wh_estimate": 2.8066,
          "gpu_busy_percent_avg": 82.5,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 227.92,
          "power_w_peak": 250.0,
          "vram_mb_avg": 5828.6,
          "vram_mb_peak": 6098.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 90,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 5.9238344069999584,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly matches the expected clean text."
      },
      "judge_status": "ok",
      "latency_seconds": 25.1781,
      "model": "qwen3.5:2b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 25.1799,
        "metrics": {
          "energy_wh_estimate": 1.4931,
          "gpu_busy_percent_avg": 78.5,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 213.48,
          "power_w_peak": 245.0,
          "vram_mb_avg": 5660.9,
          "vram_mb_peak": 6013.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 52,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.487292955000157,
        "errors": [
          {
            "detail": "Rendered the PR identifier as \"PR number 4\" instead of the expected \"PR #4\".",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 88,
        "summary": "Preserves the intended dictated instruction and cleans punctuation, but misses the expected PR identifier formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 4.032,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "Please check the open PR number 4 for code review comments. Check each comment that is still open to see if it needs to be fixed. If so, fix it, resolve the comments, and then push your fix.\n\n",
      "passed": true,
      "score": 88,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.0337,
        "metrics": {
          "energy_wh_estimate": 0.0994,
          "gpu_busy_percent_avg": 32.3,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 88.7,
          "power_w_peak": 302.0,
          "vram_mb_avg": 3909.2,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.101060814999073,
        "errors": [],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 2.0216,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "Run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.0232,
        "metrics": {
          "energy_wh_estimate": 0.0597,
          "gpu_busy_percent_avg": 37.0,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 106.33,
          "power_w_peak": 289.0,
          "vram_mb_avg": 5145.2,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.815861176997714,
        "errors": [
          {
            "detail": "Output says \"Paris\" instead of preserving the dictated question.",
            "severity": "critical",
            "type": "answered_content"
          },
          {
            "detail": "Changed the intended content from a question to an answer.",
            "severity": "critical",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted the dictated text \"What is the capital of France?\"",
            "severity": "major",
            "type": "missing_text"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Critical failure: answered the dictated question instead of formatting it as text."
      },
      "judge_status": "ok",
      "latency_seconds": 1.7955,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "Paris\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.7971,
        "metrics": {
          "energy_wh_estimate": 0.0186,
          "gpu_busy_percent_avg": 16.2,
          "gpu_busy_percent_peak": 39.0,
          "power_w_avg": 37.2,
          "power_w_peak": 104.0,
          "vram_mb_avg": 4536.4,
          "vram_mb_peak": 8193.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 5,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.592477253001562,
        "errors": [
          {
            "detail": "Changed the intended literal request into a standalone subject/update summary.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted 'Write an email to Maya, subject:' as dictated content.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Formatted the website as chirper.local/launch_period instead of chirper.local/launch.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Used colons and semicolons that do not match the intended sentence structure.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Output restructures the dictated request into an invented subject-style line, omits the instruction/addressee, and formats the URL incorrectly."
      },
      "judge_status": "ok",
      "latency_seconds": 2.2939,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "Subject: Quarterly Update: Meeting Moved to Thursday, 9:30 AM; Budget: $12,450; Website: chirper.local/launch_period\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.2955,
        "metrics": {
          "energy_wh_estimate": 0.0571,
          "gpu_busy_percent_avg": 30.3,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 89.5,
          "power_w_peak": 305.0,
          "vram_mb_avg": 5145.4,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 15.808826353997574,
        "errors": [
          {
            "detail": "Left dictated commands/content such as \"This is a bullet point list\", \"End of list\", and \"New paragraph:\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Failed to format the URL as chirper.local/launch.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added an extra \"Cheers!\" not present in the transcript.",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Changed \"look at\" to \"look into\" and altered sentence relationships with extra periods.",
            "severity": "minor",
            "type": "semantic_change"
          },
          {
            "detail": "Changed the opening sentence punctuation and split comma-linked clauses into separate sentences.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Capitalized list items and hyphenated/capitalized the title differently from expected.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 42,
        "summary": "Major mixed-formatting failures: spoken commands leaked, URL was not normalized, extra/invented closing was added, and several punctuation/casing details changed."
      },
      "judge_status": "ok",
      "latency_seconds": 3.3046,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "Hello Chirper,\n\nI need to write down accent-friendly words. This is a bullet point list with the title \"Accent-Friendly Words\":\n\n- Water\n- Tomato\n- Schedule\n- Data\n- Router\n- Aluminium\n- Privacy\n\nEnd of list.\n\nNew paragraph: Please write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM. The budget is $12,450.00. And the website is chirper dot local slash launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running. We should also look into PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nThanks!\n\nCheers!\n\n",
      "passed": false,
      "score": 42,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.3064,
        "metrics": {
          "energy_wh_estimate": 0.1392,
          "gpu_busy_percent_avg": 50.6,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 151.62,
          "power_w_peak": 290.0,
          "vram_mb_avg": 5907.7,
          "vram_mb_peak": 8193.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 7.954501325999445,
        "errors": [
          {
            "detail": "List items are plain bullets instead of unchecked markdown checklist items.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Title is formatted as a heading and omits the required colon.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Content is preserved, but the required markdown checklist format and title colon were not applied."
      },
      "judge_status": "ok",
      "latency_seconds": 1.9366,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "# Release Tasks\n- Run tests\n- Push tag\n- Publish release\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.9389,
        "metrics": {
          "energy_wh_estimate": 0.0426,
          "gpu_busy_percent_avg": 26.4,
          "gpu_busy_percent_peak": 81.0,
          "power_w_avg": 79.2,
          "power_w_peak": 291.0,
          "vram_mb_avg": 4536.2,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 5,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.435174960999575,
        "errors": [
          {
            "detail": "Did not render spoken shell syntax as `|`, space, dash, and dot correctly.",
            "severity": "critical",
            "type": "formatting_miss"
          },
          {
            "detail": "Changed the command from `cargo test | tee test-output.log` to a different invalid command.",
            "severity": "critical",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "The shell command structure and identifiers were not preserved."
      },
      "judge_status": "ok",
      "latency_seconds": 1.8999,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "cargo --test pipe tee --space test -output log\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.902,
        "metrics": {
          "energy_wh_estimate": 0.039,
          "gpu_busy_percent_avg": 26.2,
          "gpu_busy_percent_peak": 78.0,
          "power_w_avg": 73.8,
          "power_w_peak": 278.0,
          "vram_mb_avg": 4536.2,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 5,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 9.271662899998773,
        "errors": [
          {
            "detail": "Omitted \"Call it\" from the final text.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Rendered the corrected identifier as \"Chirper Benchmark\" instead of \"ChirperBench\".",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Failed to preserve the intended CamelCase identifier formatting.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Output removes spoken edits but changes the intended final identifier and omits surrounding text and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 1.8358,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "Chirper Benchmark\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.8378,
        "metrics": {
          "energy_wh_estimate": 0.0218,
          "gpu_busy_percent_avg": 19.0,
          "gpu_busy_percent_peak": 44.0,
          "power_w_avg": 42.8,
          "power_w_peak": 122.0,
          "vram_mb_avg": 4536.2,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 5,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 9.150116468001215,
        "errors": [
          {
            "detail": "Changed the URL scheme from https:// to http://.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted the intended leading text: Visit.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Omitted the final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Did not capitalize the sentence as expected.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Output drops the leading instruction word and final period, and incorrectly changes HTTPS to HTTP."
      },
      "judge_status": "ok",
      "latency_seconds": 1.8818,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "http://docs.python.org/3/library/json.html\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.8836,
        "metrics": {
          "energy_wh_estimate": 0.028,
          "gpu_busy_percent_avg": 21.4,
          "gpu_busy_percent_peak": 57.0,
          "power_w_avg": 53.6,
          "power_w_peak": 193.0,
          "vram_mb_avg": 4536.2,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 5,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 7.957473640999524,
        "errors": [
          {
            "detail": "Left both email addresses as spoken words instead of converting to alex@chirper.local and ops@example.com.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changed alex and chirper to title case and used CC instead of expected cc.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Failed to format spoken email addresses into proper email syntax."
      },
      "judge_status": "ok",
      "latency_seconds": 1.9735,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "Send it to Alex at Chirper dot local and CC ops at example dot com.\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.9754,
        "metrics": {
          "energy_wh_estimate": 0.0422,
          "gpu_busy_percent_avg": 29.2,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 77.0,
          "power_w_peak": 293.0,
          "vram_mb_avg": 4536.2,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 5,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.749351926999225,
        "errors": [
          {
            "detail": "Removed surrounding sentence content about project name and class name.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Expected code-style casing ChirperBench and DictationFormatter, but output used spaced title case.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Used a comma instead of the dictated sentence structure and final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 20,
        "summary": "Output drops most of the sentence and fails required product/class identifier casing."
      },
      "judge_status": "ok",
      "latency_seconds": 1.8539,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "Chirper Bench, Dictation Formatter\n\n",
      "passed": false,
      "score": 20,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.8564,
        "metrics": {
          "energy_wh_estimate": 0.0377,
          "gpu_busy_percent_avg": 27.8,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 73.2,
          "power_w_peak": 287.0,
          "vram_mb_avg": 4536.2,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 5,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.427133632998448,
        "errors": [
          {
            "detail": "Omitted the word \"variable\" from the dictated sentence.",
            "severity": "minor",
            "type": "missing_text"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "Preserves the identifiers and inline code formatting, but omits one word and final punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 1.8979,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "Set `user_id` equal to `request.user.id`\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.8997,
        "metrics": {
          "energy_wh_estimate": 0.0255,
          "gpu_busy_percent_avg": 20.6,
          "gpu_busy_percent_peak": 53.0,
          "power_w_avg": 48.4,
          "power_w_peak": 174.0,
          "vram_mb_avg": 4536.2,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 5,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 9.40882857300312,
        "errors": [
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Added a comma after 2026 not present in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Content, casing, date, version, and number formatting are correct, but punctuation differs from expected."
      },
      "judge_status": "ok",
      "latency_seconds": 2.046,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.0482,
        "metrics": {
          "energy_wh_estimate": 0.0345,
          "gpu_busy_percent_avg": 18.3,
          "gpu_busy_percent_peak": 60.0,
          "power_w_avg": 60.67,
          "power_w_peak": 204.0,
          "vram_mb_avg": 5145.4,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 7.332481077999546,
        "errors": [
          {
            "detail": "Formatted the dictated list inline instead of as a markdown bullet list.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Did not capitalize the leading title text.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Content is preserved, but it misses the required title casing and markdown list formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 1.8883,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.8902,
        "metrics": {
          "energy_wh_estimate": 0.026,
          "gpu_busy_percent_avg": 19.2,
          "gpu_busy_percent_peak": 50.0,
          "power_w_avg": 49.6,
          "power_w_peak": 154.0,
          "vram_mb_avg": 4536.2,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 5,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 8.063831843999651,
        "errors": [
          {
            "detail": "Inserted a comma after \"quick\" that was not present or needed.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Added an unnecessary comma to already clean text."
      },
      "judge_status": "ok",
      "latency_seconds": 1.8882,
      "model": "olmo2:7b",
      "ollama_status": "ok",
      "output": "The quick, brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.8901,
        "metrics": {
          "energy_wh_estimate": 0.0249,
          "gpu_busy_percent_avg": 20.4,
          "gpu_busy_percent_peak": 52.0,
          "power_w_avg": 47.4,
          "power_w_peak": 172.0,
          "vram_mb_avg": 4536.2,
          "vram_mb_peak": 8191.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 5,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 19.659774464002112,
        "errors": [
          {
            "detail": "Moved the conditional phrase into a separate sentence instead of keeping it with the review-check instruction.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Preserves the dictated instruction as content with correct PR identifier and punctuation; only a slight sentence-boundary/phrasing difference from the expected text."
      },
      "judge_status": "ok",
      "latency_seconds": 26.5142,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "Please check the open PR #4 for code review comments and check each one that's still open. If it needs to be fixed, fix it, then resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 26.5162,
        "metrics": {
          "energy_wh_estimate": 1.2966,
          "gpu_busy_percent_avg": 82.5,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 176.04,
          "power_w_peak": 238.0,
          "vram_mb_avg": 22303.4,
          "vram_mb_peak": 23176.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 54,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.5687457700005325,
        "errors": [],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text."
      },
      "judge_status": "ok",
      "latency_seconds": 10.6739,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "Run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.6753,
        "metrics": {
          "energy_wh_estimate": 0.4739,
          "gpu_busy_percent_avg": 78.2,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 159.83,
          "power_w_peak": 229.0,
          "vram_mb_avg": 21182.4,
          "vram_mb_peak": 23268.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 23,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 5.750559654999961,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves the dictated question with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 10.1941,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.1957,
        "metrics": {
          "energy_wh_estimate": 0.4641,
          "gpu_busy_percent_avg": 81.1,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 163.86,
          "power_w_peak": 230.0,
          "vram_mb_avg": 21047.6,
          "vram_mb_peak": 23181.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 22,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.458429327998601,
        "errors": [
          {
            "detail": "Turned the dictation into an email draft format with a subject line and body.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Omitted 'Write an email to Maya,' from the intended content.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed 'The meeting moved' to 'The meeting has been moved' and split the sentence structure.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Subject should be 'Quarterly Update' with title casing.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Generated an email-style subject/body instead of preserving the dictated request as final cleaned text, and omitted the recipient instruction."
      },
      "judge_status": "ok",
      "latency_seconds": 37.5463,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "Subject: Quarterly update\n\nThe meeting has been moved to Thursday at 9:30 a.m. The budget is $12,450. The website is chirper.local/launch.\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 37.5481,
        "metrics": {
          "energy_wh_estimate": 2.2378,
          "gpu_busy_percent_avg": 87.6,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 214.55,
          "power_w_peak": 248.0,
          "vram_mb_avg": 22570.1,
          "vram_mb_peak": 23260.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 76,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 17.38812776499981,
        "errors": [
          {
            "detail": "List/title formatting differs: title is bolded, bullets use asterisks with extra spacing, and list items are capitalized instead of lowercase.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Several dictated comma-linked sentences were split into separate sentences, changing the expected punctuation flow.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Uses Water/Tomato/etc. and 9:30 a.m. instead of lowercase list items and 9:30 AM.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Adds a comma in 'Hello, Chirper' and omits the hyphen in 'accent-friendly'.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Changes 'and we should also look at' to 'Also look at,' weakening the intended sentence structure.",
            "severity": "major",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 68,
        "summary": "Preserves most content but misses several required punctuation, casing, markdown/list, and phrasing details in a mixed-format task."
      },
      "judge_status": "ok",
      "latency_seconds": 52.1598,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "Hello, Chirper. I need to write down accent friendly words.\n\n**Accent Friendly Words:**\n*   Water\n*   Tomato\n*   Schedule\n*   Data\n*   Router\n*   Aluminium\n*   Privacy\n\nPlease write an email to Maya.\nSubject: Quarterly Update\n\nThe meeting moved to Thursday at 9:30 a.m. The budget is $12,450. The website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running. Also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nThanks!\n\n",
      "passed": false,
      "score": 68,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 52.1613,
        "metrics": {
          "energy_wh_estimate": 3.294,
          "gpu_busy_percent_avg": 88.7,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 227.34,
          "power_w_peak": 259.0,
          "vram_mb_avg": 22734.2,
          "vram_mb_peak": 23176.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 106,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 9.030772153000726,
        "errors": [
          {
            "detail": "Title and checklist items remain lowercase instead of title/sentence case.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Title is missing the requested colon.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Used a markdown heading instead of the expected plain title line with colon.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 78,
        "summary": "Checklist structure and content are preserved, but title punctuation/format and casing do not match the expected cleaned text."
      },
      "judge_status": "ok",
      "latency_seconds": 16.5036,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "# release tasks\n- [ ] run tests\n- [ ] push tag\n- [ ] publish release\n\n",
      "passed": true,
      "score": 78,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 16.5057,
        "metrics": {
          "energy_wh_estimate": 0.8704,
          "gpu_busy_percent_avg": 83.4,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 189.85,
          "power_w_peak": 242.0,
          "vram_mb_avg": 21803.3,
          "vram_mb_peak": 23184.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 34,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0032,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0049,
        "metrics": {
          "energy_wh_estimate": 13.18,
          "gpu_busy_percent_avg": 90.9,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 263.59,
          "power_w_peak": 292.0,
          "vram_mb_avg": 23061.3,
          "vram_mb_peak": 23270.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 11.097934190998785,
        "errors": [
          {
            "detail": "Final identifier should be ChirperBench, not ChirperBenchmark.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted the dictated phrase \"Call it\" and final punctuation.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Expected camel-cased identifier ChirperBench.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 25,
        "summary": "Output changes the intended identifier and omits surrounding dictated text."
      },
      "judge_status": "ok",
      "latency_seconds": 35.082,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "ChirperBenchmark\n\n",
      "passed": false,
      "score": 25,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 35.0833,
        "metrics": {
          "energy_wh_estimate": 2.3246,
          "gpu_busy_percent_avg": 88.5,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 238.54,
          "power_w_peak": 268.0,
          "vram_mb_avg": 22828.2,
          "vram_mb_peak": 23184.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 72,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 7.796112213000015,
        "errors": [
          {
            "detail": "Missing the dictated final period after the URL.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "URL and casing are correct, but the final period is missing."
      },
      "judge_status": "ok",
      "latency_seconds": 17.7265,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 17.7282,
        "metrics": {
          "energy_wh_estimate": 1.008,
          "gpu_busy_percent_avg": 83.6,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 204.7,
          "power_w_peak": 253.0,
          "vram_mb_avg": 21915.3,
          "vram_mb_peak": 23184.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 37,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 6.452544712999952,
        "errors": [],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected formatted email addresses and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 20.3197,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "Send it to alex@chirper.local and cc ops@example.com.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 20.3212,
        "metrics": {
          "energy_wh_estimate": 1.1812,
          "gpu_busy_percent_avg": 84.9,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 209.26,
          "power_w_peak": 256.0,
          "vram_mb_avg": 22066.4,
          "vram_mb_peak": 23184.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 42,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 7.593252620001294,
        "errors": [
          {
            "detail": "Expected code-style casing: ChirperBench and DictationFormatter.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Failed to format product and class identifiers as required.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added an unnecessary comma after the project name.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Identifier casing was not applied correctly; product and class names were left as spaced words."
      },
      "judge_status": "ok",
      "latency_seconds": 22.1602,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench, and the class name is Dictation Formatter.\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 22.1619,
        "metrics": {
          "energy_wh_estimate": 1.3033,
          "gpu_busy_percent_avg": 85.0,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 211.72,
          "power_w_peak": 256.0,
          "vram_mb_avg": 22163.6,
          "vram_mb_peak": 23184.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 46,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.227000602000771,
        "errors": [
          {
            "detail": "Converted the instruction sentence into an assignment expression.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted 'Set variable' and 'equal to' wording.",
            "severity": "major",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Formatted identifiers, but changed the dictated sentence into a code assignment and omitted the surrounding words."
      },
      "judge_status": "ok",
      "latency_seconds": 21.0615,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "`user_id = request.user.id`\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 21.0629,
        "metrics": {
          "energy_wh_estimate": 1.218,
          "gpu_busy_percent_avg": 85.8,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 208.18,
          "power_w_peak": 265.0,
          "vram_mb_avg": 22117.2,
          "vram_mb_peak": 23184.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 44,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 7.572232280999742,
        "errors": [
          {
            "detail": "Added a comma after 2026 that is not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 96,
        "summary": "Content, casing, versions, date, and number formatting are correct; only an extra comma differs from the expected punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 14.6296,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 96,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 14.6317,
        "metrics": {
          "energy_wh_estimate": 0.779,
          "gpu_busy_percent_avg": 82.7,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 191.68,
          "power_w_peak": 247.0,
          "vram_mb_avg": 21669.6,
          "vram_mb_peak": 23184.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 31,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 8.264313760999357,
        "errors": [
          {
            "detail": "Rendered the list inline with commas instead of using bullet points after the colon.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added a final period not present in the expected formatted list.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Content is preserved, but the spoken list was not formatted as a markdown list."
      },
      "judge_status": "ok",
      "latency_seconds": 17.8045,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 17.8061,
        "metrics": {
          "energy_wh_estimate": 0.9869,
          "gpu_busy_percent_avg": 83.5,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 199.54,
          "power_w_peak": 247.0,
          "vram_mb_avg": 21915.3,
          "vram_mb_peak": 23184.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 37,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 5.656282110998291,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected clean text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 9.9996,
      "model": "glm-4.7-flash:latest",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.0012,
        "metrics": {
          "energy_wh_estimate": 0.4549,
          "gpu_busy_percent_avg": 76.9,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 163.76,
          "power_w_peak": 244.0,
          "vram_mb_avg": 20948.3,
          "vram_mb_peak": 23184.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 21,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.18222056599916,
        "errors": [
          {
            "detail": "Split the clause before 'if it needs to be fixed' differently than the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 94,
        "summary": "Preserves the dictated instruction with appropriate cleanup; only a minor sentence-boundary change from the expected wording."
      },
      "judge_status": "ok",
      "latency_seconds": 21.8595,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "Please check the open PR #4 for code review comments and check each one that's still open. If it needs to be fixed, fix it, then resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 94,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 21.8616,
        "metrics": {
          "energy_wh_estimate": 0.3801,
          "gpu_busy_percent_avg": 9.3,
          "gpu_busy_percent_peak": 79.0,
          "power_w_avg": 62.6,
          "power_w_peak": 169.0,
          "vram_mb_avg": 12143.5,
          "vram_mb_peak": 18974.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 45,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.996700982999755,
        "errors": [
          {
            "detail": "Added markdown backticks around cargo test even though the expected output preserves it as plain dictated text.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Content, casing, and punctuation are correct, but it incorrectly formatted the command-like phrase as inline code."
      },
      "judge_status": "ok",
      "latency_seconds": 8.9717,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "Run `cargo test`, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.9733,
        "metrics": {
          "energy_wh_estimate": 0.1726,
          "gpu_busy_percent_avg": 15.5,
          "gpu_busy_percent_peak": 76.0,
          "power_w_avg": 69.26,
          "power_w_peak": 157.0,
          "vram_mb_avg": 14203.7,
          "vram_mb_peak": 18889.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 19,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 5.4028007369997795,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 5.9814,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.9839,
        "metrics": {
          "energy_wh_estimate": 0.1097,
          "gpu_busy_percent_avg": 17.8,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 66.0,
          "power_w_peak": 208.0,
          "vram_mb_avg": 12065.1,
          "vram_mb_peak": 18845.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.203951378000056,
        "errors": [
          {
            "detail": "Added https:// to chirper.local/launch, changing the dictated URL.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Formatted 9:30 AM as 9:30 A.M. with unusual spacing instead of expected AM style.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 82,
        "summary": "Mostly correct, but changed the URL by adding a scheme and used different AM formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 17.3919,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30\u202fA.M., the budget is $12,450, and the website is https://chirper.local/launch.\n\n",
      "passed": true,
      "score": 82,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 17.3942,
        "metrics": {
          "energy_wh_estimate": 0.4034,
          "gpu_busy_percent_avg": 18.1,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 83.5,
          "power_w_peak": 281.0,
          "vram_mb_avg": 16435.2,
          "vram_mb_peak": 18890.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 36,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 14.17785853299938,
        "errors": [
          {
            "detail": "Leaves dictated commands such as \u201cThis is a bullet point list,\u201d \u201cEnd of list,\u201d and \u201cFinish with\u201d in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Adds explanatory/quoted text around the list title and final thanks that should not appear.",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Formats the URL as inline code instead of plain text.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Uses 9:30 A.M. instead of 9:30 AM and quotes around Thanks.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Changes the intended closing from \u201cThanks!\u201d to an instruction-like phrase, `Finish with \"Thanks\"!`.",
            "severity": "major",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 58,
        "summary": "Preserves much content, but leaks spoken formatting commands and mishandles the title, URL formatting, time style, and final thanks."
      },
      "judge_status": "ok",
      "latency_seconds": 44.7834,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "Hello Chirper. I need to write down accent-friendly words.\n\nThis is a bullet point list with title \"Accent-Friendly Words\":\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nEnd of list.\n\nPlease write an email to Maya, subject: Quarterly Update.\n\nThe meeting moved to Thursday at 9:30\u202fA.M., the budget is $12,450, and the website is `chirper.local/launch`.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nFinish with \"Thanks\"!\n\n",
      "passed": false,
      "score": 58,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 44.7856,
        "metrics": {
          "energy_wh_estimate": 1.0252,
          "gpu_busy_percent_avg": 11.3,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 82.41,
          "power_w_peak": 228.0,
          "vram_mb_avg": 17914.9,
          "vram_mb_peak": 18890.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 91,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 9.71735150099994,
        "errors": [
          {
            "detail": "Includes 'Item One', 'Item Two', and 'Item Three' as literal text instead of using them only to structure the checklist.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Formats the title as a Markdown H1 rather than plain title text ending with a colon.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Adds extra blank lines and trailing spaces not present in the expected cleaned text.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Preserves the main checklist content, but leaves spoken item markers and changes the requested title formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 12.5421,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "# Release Tasks:\n\n- [ ] Item One: Run tests  \n- [ ] Item Two: Push tag  \n- [ ] Item Three: Publish release\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 12.544,
        "metrics": {
          "energy_wh_estimate": 0.2596,
          "gpu_busy_percent_avg": 14.3,
          "gpu_busy_percent_peak": 81.0,
          "power_w_avg": 74.52,
          "power_w_peak": 186.0,
          "vram_mb_avg": 15609.4,
          "vram_mb_peak": 18890.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 27,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.66157783500239,
        "errors": [
          {
            "detail": "Changed `|` to `|&`, altering shell command behavior.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Included spoken word `space` as part of `space_test-output.log`.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Did not preserve the expected shell command structure and filename.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Command structure and filename were changed: pipe became |& and spoken separator became part of the filename."
      },
      "judge_status": "ok",
      "latency_seconds": 6.2964,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "cargo test |& tee space_test-output.log\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.2986,
        "metrics": {
          "energy_wh_estimate": 0.1093,
          "gpu_busy_percent_avg": 15.6,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 62.5,
          "power_w_peak": 160.0,
          "vram_mb_avg": 12551.0,
          "vram_mb_peak": 18846.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 14,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 9.747430890998658,
        "errors": [
          {
            "detail": "Final corrected identifier should be formatted as ChirperBench with no space.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Sentence should start with capital Call.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Chirper Benchmark changes the intended corrected name ChirperBench.",
            "severity": "major",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Output removes spoken edits but fails final casing, punctuation, and the corrected identifier spelling."
      },
      "judge_status": "ok",
      "latency_seconds": 5.478,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "call it Chirper Benchmark\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.4801,
        "metrics": {
          "energy_wh_estimate": 0.0918,
          "gpu_busy_percent_avg": 19.7,
          "gpu_busy_percent_peak": 69.0,
          "power_w_avg": 60.33,
          "power_w_peak": 137.0,
          "vram_mb_avg": 11974.3,
          "vram_mb_peak": 18890.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 6.382074437999108,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Perfectly formats the spoken HTTPS URL with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 7.1805,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.1822,
        "metrics": {
          "energy_wh_estimate": 0.1344,
          "gpu_busy_percent_avg": 16.7,
          "gpu_busy_percent_peak": 80.0,
          "power_w_avg": 67.38,
          "power_w_peak": 150.0,
          "vram_mb_avg": 13701.6,
          "vram_mb_peak": 18891.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 16,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 6.548070058001031,
        "errors": [
          {
            "detail": "The sentence should start with uppercase 'Send'.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Email addresses and punctuation are correct; only sentence-initial capitalization is missing."
      },
      "judge_status": "ok",
      "latency_seconds": 8.0058,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "send it to alex@chirper.local and cc ops@example.com.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.008,
        "metrics": {
          "energy_wh_estimate": 0.1502,
          "gpu_busy_percent_avg": 16.5,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 67.53,
          "power_w_peak": 131.0,
          "vram_mb_avg": 13985.1,
          "vram_mb_peak": 18846.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 17,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.523445848000847,
        "errors": [
          {
            "detail": "Formatted the project name as \"Chirper Bench\" instead of the required identifier \"ChirperBench\".",
            "severity": "major",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 80,
        "summary": "Class casing was correct, but the product/code-style project name was not formatted as the intended identifier."
      },
      "judge_status": "ok",
      "latency_seconds": 8.5965,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is DictationFormatter.\n\n",
      "passed": false,
      "score": 80,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.5989,
        "metrics": {
          "energy_wh_estimate": 0.1668,
          "gpu_busy_percent_avg": 15.9,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 69.84,
          "power_w_peak": 157.0,
          "vram_mb_avg": 14533.1,
          "vram_mb_peak": 18890.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 19,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.221223045002262,
        "errors": [
          {
            "detail": "Sentence starts with lowercase 'set' instead of 'Set'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Final dictated period was rendered as a semicolon.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 82,
        "summary": "Identifiers were formatted correctly, but sentence casing and final punctuation are wrong."
      },
      "judge_status": "ok",
      "latency_seconds": 7.4342,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "set variable `user_id` equal to `request.user.id`;\n\n",
      "passed": true,
      "score": 82,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.4362,
        "metrics": {
          "energy_wh_estimate": 0.1403,
          "gpu_busy_percent_avg": 18.1,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 67.94,
          "power_w_peak": 161.0,
          "vram_mb_avg": 13744.3,
          "vram_mb_peak": 18890.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 16,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 9.709896205000405,
        "errors": [
          {
            "detail": "Adds a comma after 2026 that is not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Output correctly formats versions, date, numbers, casing, and punctuation with only an extra comma before the conjunction."
      },
      "judge_status": "ok",
      "latency_seconds": 10.8069,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.8093,
        "metrics": {
          "energy_wh_estimate": 0.2113,
          "gpu_busy_percent_avg": 13.9,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 70.39,
          "power_w_peak": 161.0,
          "vram_mb_avg": 15046.0,
          "vram_mb_peak": 18891.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 23,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 8.9371674110007,
        "errors": [
          {
            "detail": "Spoken list should be formatted as a markdown bullet list after the colon.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "The heading should start with a capital letter.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Preserves the items but misses the required markdown list formatting and sentence casing."
      },
      "judge_status": "ok",
      "latency_seconds": 6.5678,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.5698,
        "metrics": {
          "energy_wh_estimate": 0.1242,
          "gpu_busy_percent_avg": 19.9,
          "gpu_busy_percent_peak": 70.0,
          "power_w_avg": 68.07,
          "power_w_peak": 138.0,
          "vram_mb_avg": 12985.3,
          "vram_mb_peak": 18890.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 15,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 6.83026820500163,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected unchanged text."
      },
      "judge_status": "ok",
      "latency_seconds": 6.569,
      "model": "granite4.1:30b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.5705,
        "metrics": {
          "energy_wh_estimate": 0.1197,
          "gpu_busy_percent_avg": 17.7,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 65.6,
          "power_w_peak": 152.0,
          "vram_mb_avg": 13020.8,
          "vram_mb_peak": 18930.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 15,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.860700543998973,
        "errors": [
          {
            "detail": "Formats PR identifier as \"number 4\" instead of \"#4\".",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 88,
        "summary": "Preserves the dictated instruction as content with mostly correct cleanup, but misses the expected PR number formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 9.3138,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "Please check the open PR number 4 for code review comments and check each one that's still open. If it needs to be fixed, fix it, resolve the comments, and push your fix.\n\n",
      "passed": true,
      "score": 88,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 9.3155,
        "metrics": {
          "energy_wh_estimate": 0.2974,
          "gpu_busy_percent_avg": 59.6,
          "gpu_busy_percent_peak": 91.0,
          "power_w_avg": 114.95,
          "power_w_peak": 293.0,
          "vram_mb_avg": 13039.9,
          "vram_mb_peak": 15110.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 20,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.153074152000045,
        "errors": [],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves the dictated instruction as cleaned text."
      },
      "judge_status": "ok",
      "latency_seconds": 6.4238,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "Run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.4254,
        "metrics": {
          "energy_wh_estimate": 0.2765,
          "gpu_busy_percent_avg": 67.0,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 154.93,
          "power_w_peak": 300.0,
          "vram_mb_avg": 12136.2,
          "vram_mb_peak": 15025.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 14,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.479565115001606,
        "errors": [
          {
            "detail": "Expected a question mark after the question, not a period.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 85,
        "summary": "Preserved the dictated question but used a period instead of the required question mark."
      },
      "judge_status": "ok",
      "latency_seconds": 4.5931,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": false,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.5948,
        "metrics": {
          "energy_wh_estimate": 0.133,
          "gpu_busy_percent_avg": 62.0,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 104.18,
          "power_w_peak": 298.0,
          "vram_mb_avg": 11348.3,
          "vram_mb_peak": 15025.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 11,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.464695572001801,
        "errors": [
          {
            "detail": "Invented greeting, signoff, and email layout/body.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Changed the request 'Write an email to Maya...' into the email itself.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted the literal instruction framing 'Write an email to Maya'.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Split the final sentence into separate sentences rather than preserving the intended comma-connected content.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Generated an email body instead of formatting the dictated request as final text."
      },
      "judge_status": "ok",
      "latency_seconds": 8.0819,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "Subject: Quarterly Update\n\nHi Maya,\n\nThe meeting moved to Thursday at 9:30\u202fa.m. The budget is $12,450. The website is chirper.local/launch.\n\nBest regards,\n\n[Your Name]\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.0837,
        "metrics": {
          "energy_wh_estimate": 0.3982,
          "gpu_busy_percent_avg": 70.6,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 177.33,
          "power_w_peak": 297.0,
          "vram_mb_avg": 12812.7,
          "vram_mb_peak": 15107.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 18.406393002002005,
        "errors": [
          {
            "detail": "Leaves dictated formatting phrases in the output, including 'This is a bullet point list...' and 'End of list.'",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not format 'nine thirty a m' as 9:30 AM or 'twelve thousand four hundred fifty dollars' as $12,450.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Formats the URL as a Markdown link and invents an http:// target instead of plain chirper.local/launch.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changes 'Hello Chirper.' to 'Hello, Chirper.' and uses a semicolon where the expected output uses a comma.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Uses Markdown emphasis for the subject instead of plain 'subject: Quarterly Update.'",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 68,
        "summary": "Preserves most content but leaks spoken structure commands and misses required numeric, currency, URL, and title formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 11.6024,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "Hello, Chirper.\n\nI need to write down accent\u2011friendly words.\n\nThis is a bullet point list with the title **Accent Friendly Words**:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nEnd of list.\n\nPlease write an email to Maya, subject: *Quarterly Update*.\n\nThe meeting moved to Thursday at nine thirty a.m.; the budget is twelve thousand four hundred fifty dollars, and the website is [chirper.local/launch](http://chirper.local/launch).\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nThanks!\n\n",
      "passed": false,
      "score": 68,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 11.6042,
        "metrics": {
          "energy_wh_estimate": 0.6944,
          "gpu_busy_percent_avg": 77.5,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 215.44,
          "power_w_peak": 302.0,
          "vram_mb_avg": 13408.6,
          "vram_mb_peak": 15026.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 25,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 10.061938408998685,
        "errors": [
          {
            "detail": "Rendered the title as a Markdown heading instead of `Release Tasks:` with a colon.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Title and checklist items should use title/sentence casing as expected, not lowercase.",
            "severity": "major",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 72,
        "summary": "Preserves the checklist content but misses required casing and title punctuation/format."
      },
      "judge_status": "ok",
      "latency_seconds": 5.8794,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "# Release tasks\n\n- [ ] run tests\n- [ ] push tag\n- [ ] publish release\n\n",
      "passed": false,
      "score": 72,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.8813,
        "metrics": {
          "energy_wh_estimate": 0.2364,
          "gpu_busy_percent_avg": 64.4,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 144.69,
          "power_w_peak": 309.0,
          "vram_mb_avg": 11915.2,
          "vram_mb_peak": 15026.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.70174914600284,
        "errors": [
          {
            "detail": "Spoken spacing command 'space' leaked into the filename as 'space-'.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Output file changed from 'test-output.log' to 'space-test-output.log'.",
            "severity": "major",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 78,
        "summary": "Command structure mostly preserved, but spoken 'space' was incorrectly included in the filename, changing the output path."
      },
      "judge_status": "ok",
      "latency_seconds": 7.6592,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "cargo test | tee space-test-output.log\n\n",
      "passed": false,
      "score": 78,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.6609,
        "metrics": {
          "energy_wh_estimate": 0.3805,
          "gpu_busy_percent_avg": 70.5,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 178.82,
          "power_w_peak": 303.0,
          "vram_mb_avg": 12647.2,
          "vram_mb_peak": 15026.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 17,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 9.825479384999198,
        "errors": [
          {
            "detail": "Rendered the identifier as 'Chirper Benchmark' instead of 'ChirperBench'.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Included spoken spelling instruction content as '(C Chirper B Bench)'.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Added parenthetical text not present in the intended final output.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "The output preserves the basic phrase but fails to apply the final spoken correction into the intended identifier and adds extra text from the spelling instruction."
      },
      "judge_status": "ok",
      "latency_seconds": 12.9061,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "Call it Chirper Benchmark (C Chirper B Bench).\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 12.9084,
        "metrics": {
          "energy_wh_estimate": 0.7935,
          "gpu_busy_percent_avg": 78.7,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 221.3,
          "power_w_peak": 301.0,
          "vram_mb_avg": 13528.4,
          "vram_mb_peak": 15026.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 27,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 7.245775528001104,
        "errors": [
          {
            "detail": "Sentence should start with capitalized 'Visit'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Final period is missing.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "URL was formatted correctly, but sentence casing and final punctuation were missed."
      },
      "judge_status": "ok",
      "latency_seconds": 5.4304,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "visit https://docs.python.org/3/library/json.html\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.4325,
        "metrics": {
          "energy_wh_estimate": 0.1904,
          "gpu_busy_percent_avg": 62.9,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 126.17,
          "power_w_peak": 290.0,
          "vram_mb_avg": 11655.9,
          "vram_mb_peak": 15026.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 8.520342793002783,
        "errors": [],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats spoken email addresses and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 5.2761,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "Send it to alex@chirper.local and cc ops@example.com.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.2779,
        "metrics": {
          "energy_wh_estimate": 0.195,
          "gpu_busy_percent_avg": 66.8,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 133.0,
          "power_w_peak": 298.0,
          "vram_mb_avg": 11655.9,
          "vram_mb_peak": 15026.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 6.803354560001026,
        "errors": [
          {
            "detail": "Rendered product name as \"Chirper Bench\" instead of code-style \"ChirperBench\".",
            "severity": "major",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 85,
        "summary": "Mostly correct, but missed the required product identifier casing by leaving a space in ChirperBench."
      },
      "judge_status": "ok",
      "latency_seconds": 7.3125,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is DictationFormatter.\n\n",
      "passed": false,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.3141,
        "metrics": {
          "energy_wh_estimate": 0.3362,
          "gpu_busy_percent_avg": 66.6,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 165.5,
          "power_w_peak": 320.0,
          "vram_mb_avg": 12498.5,
          "vram_mb_peak": 15026.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 16,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.088838727999246,
        "errors": [
          {
            "detail": "Wrapped the entire sentence in inline code instead of only `user_id` and `request.user.id`.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Sentence should start with capitalized 'Set'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Preserved the words and identifiers but incorrectly formatted the whole sentence as code, missed sentence casing, and omitted final punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 5.7087,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "`set variable user_id equal to request.user.id`\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.7109,
        "metrics": {
          "energy_wh_estimate": 0.2311,
          "gpu_busy_percent_avg": 66.7,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 145.69,
          "power_w_peak": 296.0,
          "vram_mb_avg": 11915.1,
          "vram_mb_peak": 15026.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 8.626390023000567,
        "errors": [
          {
            "detail": "Added a comma after the date before 'and' that was not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Uses narrow nonbreaking spaces instead of regular spaces.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Content, casing, versions, date, and number formatting are preserved; only minor punctuation/spacing differences from expected."
      },
      "judge_status": "ok",
      "latency_seconds": 6.774,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "Version\u202f2.1.0 shipped on June\u202f3,\u202f2026, and supports Python\u202f3.11+.\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.7756,
        "metrics": {
          "energy_wh_estimate": 0.3133,
          "gpu_busy_percent_avg": 69.2,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 166.47,
          "power_w_peak": 305.0,
          "vram_mb_avg": 12330.0,
          "vram_mb_peak": 15026.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 15,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 7.4403106559984735,
        "errors": [
          {
            "detail": "Did not convert the colon-separated shopping list into bullet points.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Content is preserved, but the spoken list was not formatted as a markdown list."
      },
      "judge_status": "ok",
      "latency_seconds": 5.2687,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.2702,
        "metrics": {
          "energy_wh_estimate": 0.193,
          "gpu_busy_percent_avg": 61.1,
          "gpu_busy_percent_peak": 99.0,
          "power_w_avg": 131.83,
          "power_w_peak": 300.0,
          "vram_mb_avg": 11655.9,
          "vram_mb_peak": 15026.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 5.208312044000195,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 5.6933,
      "model": "gpt-oss:20b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.6951,
        "metrics": {
          "energy_wh_estimate": 0.2271,
          "gpu_busy_percent_avg": 65.3,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 143.54,
          "power_w_peak": 316.0,
          "vram_mb_avg": 11915.2,
          "vram_mb_peak": 15026.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.097142350001377,
        "errors": [
          {
            "detail": "Splits the conditional into a separate sentence and uses slightly different comma placement than expected.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Preserves the dictated instruction as content with correct casing and PR identifier; only minor sentence restructuring from the expected text."
      },
      "judge_status": "ok",
      "latency_seconds": 145.7833,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "Please check the open PR #4 for code review comments and check each one that's still open. If it needs to be fixed, fix it, and then resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 145.7854,
        "metrics": {
          "energy_wh_estimate": 2.3489,
          "gpu_busy_percent_avg": 36.8,
          "gpu_busy_percent_peak": 78.0,
          "power_w_avg": 58.0,
          "power_w_peak": 86.0,
          "vram_mb_avg": 23508.4,
          "vram_mb_peak": 23975.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 292,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.350770306999038,
        "errors": [],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves the dictated instruction as cleaned text with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 74.3601,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "Run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 74.3631,
        "metrics": {
          "energy_wh_estimate": 1.1168,
          "gpu_busy_percent_avg": 37.5,
          "gpu_busy_percent_peak": 77.0,
          "power_w_avg": 54.07,
          "power_w_peak": 89.0,
          "vram_mb_avg": 23131.0,
          "vram_mb_peak": 23890.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 150,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.9061663419997785,
        "errors": [
          {
            "detail": "Expected a question mark at the end, not a period.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 85,
        "summary": "Preserved the question text and casing but used a period instead of the dictated question mark."
      },
      "judge_status": "ok",
      "latency_seconds": 37.1935,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": false,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 37.1964,
        "metrics": {
          "energy_wh_estimate": 0.5211,
          "gpu_busy_percent_avg": 39.7,
          "gpu_busy_percent_peak": 79.0,
          "power_w_avg": 50.43,
          "power_w_peak": 92.0,
          "vram_mb_avg": 22392.0,
          "vram_mb_peak": 23889.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 76,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.933417763000762,
        "errors": [
          {
            "detail": "Subject should be 'Quarterly Update', not 'Quarterly update'.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 96,
        "summary": "Nearly correct; only missed title casing in the subject."
      },
      "judge_status": "ok",
      "latency_seconds": 113.7947,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "Write an email to Maya, subject: Quarterly update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\n",
      "passed": true,
      "score": 96,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 113.7968,
        "metrics": {
          "energy_wh_estimate": 1.7737,
          "gpu_busy_percent_avg": 39.0,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 56.11,
          "power_w_peak": 105.0,
          "vram_mb_avg": 23399.9,
          "vram_mb_peak": 23974.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 229,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0076,
      "model": "qwen3.6:35b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0118,
        "metrics": {
          "energy_wh_estimate": 2.8766,
          "gpu_busy_percent_avg": 36.7,
          "gpu_busy_percent_peak": 77.0,
          "power_w_avg": 57.53,
          "power_w_peak": 102.0,
          "vram_mb_avg": 23583.5,
          "vram_mb_peak": 23972.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 7.970364086002519,
        "errors": [
          {
            "detail": "Added '# ' before the title, which was not in the expected output.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Checklist content is preserved, but the title was formatted as a Markdown heading instead of the expected plain title line."
      },
      "judge_status": "ok",
      "latency_seconds": 89.0202,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "# Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 89.022,
        "metrics": {
          "energy_wh_estimate": 1.37,
          "gpu_busy_percent_avg": 36.3,
          "gpu_busy_percent_peak": 79.0,
          "power_w_avg": 55.4,
          "power_w_peak": 90.0,
          "vram_mb_avg": 22758.5,
          "vram_mb_peak": 23983.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 179,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.989995853000437,
        "errors": [
          {
            "detail": "Added surrounding backticks around the shell command.",
            "severity": "minor",
            "type": "extra_text"
          },
          {
            "detail": "Returned markdown inline code formatting instead of plain command text.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Command content is correct, but the formatter added markdown backticks that were not dictated or expected."
      },
      "judge_status": "ok",
      "latency_seconds": 96.8362,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "`cargo test | tee test-output.log`\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 96.8386,
        "metrics": {
          "energy_wh_estimate": 1.4968,
          "gpu_busy_percent_avg": 38.7,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 55.65,
          "power_w_peak": 109.0,
          "vram_mb_avg": 23310.8,
          "vram_mb_peak": 23975.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 195,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 8.075976599997375,
        "errors": [
          {
            "detail": "Rendered the corrected name as two words instead of the intended identifier ChirperBench.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 75,
        "summary": "The formatter removed the spoken edits and applied basic casing, but missed the corrected identifier format ChirperBench."
      },
      "judge_status": "ok",
      "latency_seconds": 132.7741,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "Call it Chirper Bench.\n\n",
      "passed": false,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 132.7761,
        "metrics": {
          "energy_wh_estimate": 2.0743,
          "gpu_busy_percent_avg": 35.5,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 56.24,
          "power_w_peak": 110.0,
          "vram_mb_avg": 23461.7,
          "vram_mb_peak": 23889.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 266,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 6.659792346999893,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned URL text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 131.5023,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 131.5039,
        "metrics": {
          "energy_wh_estimate": 2.0879,
          "gpu_busy_percent_avg": 36.9,
          "gpu_busy_percent_peak": 78.0,
          "power_w_avg": 57.16,
          "power_w_peak": 129.0,
          "vram_mb_avg": 23475.4,
          "vram_mb_peak": 23983.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 264,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 10.357568243998685,
        "errors": [
          {
            "detail": "Changed alex@chirper.local to Alex@chirper.local.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Changed spoken cc to uppercase CC instead of lowercase cc.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "Email addresses were formatted, but the first email local part and cc casing do not match the intended output."
      },
      "judge_status": "ok",
      "latency_seconds": 112.4319,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "Send it to Alex@chirper.local and CC ops@example.com.\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 112.4345,
        "metrics": {
          "energy_wh_estimate": 1.7657,
          "gpu_busy_percent_avg": 37.9,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 56.54,
          "power_w_peak": 85.0,
          "vram_mb_avg": 23385.8,
          "vram_mb_peak": 23889.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 226,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 12.82589163800003,
        "errors": [
          {
            "detail": "Product name should be formatted as the identifier ChirperBench, not Chirper Bench.",
            "severity": "major",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 80,
        "summary": "One product identifier casing error: ChirperBench was split into two words."
      },
      "judge_status": "ok",
      "latency_seconds": 108.5076,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is DictationFormatter.\n\n",
      "passed": false,
      "score": 80,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 108.5098,
        "metrics": {
          "energy_wh_estimate": 1.7023,
          "gpu_busy_percent_avg": 37.6,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 56.48,
          "power_w_peak": 117.0,
          "vram_mb_avg": 23376.0,
          "vram_mb_peak": 23975.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 218,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.0516404879999754,
        "errors": [
          {
            "detail": "Code identifiers `user_id` and `request.user.id` were not wrapped in backticks.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 80,
        "summary": "Content and punctuation are correct, but required inline code formatting for identifiers is missing."
      },
      "judge_status": "ok",
      "latency_seconds": 104.1491,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "Set variable user_id equal to request.user.id.\n\n",
      "passed": false,
      "score": 80,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 104.1516,
        "metrics": {
          "energy_wh_estimate": 1.6593,
          "gpu_busy_percent_avg": 37.2,
          "gpu_busy_percent_peak": 78.0,
          "power_w_avg": 57.35,
          "power_w_peak": 136.0,
          "vram_mb_avg": 23344.8,
          "vram_mb_peak": 23889.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 209,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 9.027062373999797,
        "errors": [
          {
            "detail": "Rendered Python 3.11+ as Python 3.11 or later instead of preserving the plus notation.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 88,
        "summary": "Mostly correct, but it changed the dictated Python version notation from 3.11+ to prose."
      },
      "judge_status": "ok",
      "latency_seconds": 127.5603,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11 or later.\n\n",
      "passed": true,
      "score": 88,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 127.5626,
        "metrics": {
          "energy_wh_estimate": 2.0109,
          "gpu_busy_percent_avg": 37.5,
          "gpu_busy_percent_peak": 81.0,
          "power_w_avg": 56.75,
          "power_w_peak": 112.0,
          "vram_mb_avg": 23455.6,
          "vram_mb_peak": 23983.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 256,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 8.019295977999718,
        "errors": [
          {
            "detail": "Kept the items inline instead of placing them on separate bullet lines.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Preserves the words but fails to format the dictated list as a markdown bullet list."
      },
      "judge_status": "ok",
      "latency_seconds": 42.0632,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 42.065,
        "metrics": {
          "energy_wh_estimate": 0.6226,
          "gpu_busy_percent_avg": 38.8,
          "gpu_busy_percent_peak": 77.0,
          "power_w_avg": 53.28,
          "power_w_peak": 159.0,
          "vram_mb_avg": 22560.4,
          "vram_mb_peak": 23972.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 85,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 6.526157915999647,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected clean text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 30.8986,
      "model": "qwen3.6:35b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 30.9003,
        "metrics": {
          "energy_wh_estimate": 0.4387,
          "gpu_busy_percent_avg": 39.1,
          "gpu_busy_percent_peak": 78.0,
          "power_w_avg": 51.11,
          "power_w_peak": 88.0,
          "vram_mb_avg": 22085.4,
          "vram_mb_peak": 23889.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 63,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0081,
      "model": "qwen3.6:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0122,
        "metrics": {
          "energy_wh_estimate": 7.1942,
          "gpu_busy_percent_avg": 48.4,
          "gpu_busy_percent_peak": 78.0,
          "power_w_avg": 143.88,
          "power_w_peak": 160.0,
          "vram_mb_avg": 23626.2,
          "vram_mb_peak": 23958.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.5933290850007324,
        "errors": [
          {
            "detail": "Added backticks around cargo test instead of preserving it as plain text.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Content, casing, and punctuation are correct, but it adds markdown code formatting not present in the expected literal text."
      },
      "judge_status": "ok",
      "latency_seconds": 113.3584,
      "model": "qwen3.6:27b",
      "ollama_status": "ok",
      "output": "Run `cargo test`, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 113.3598,
        "metrics": {
          "energy_wh_estimate": 4.4931,
          "gpu_busy_percent_avg": 47.7,
          "gpu_busy_percent_peak": 74.0,
          "power_w_avg": 142.69,
          "power_w_peak": 169.0,
          "vram_mb_avg": 23541.6,
          "vram_mb_peak": 24043.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 228,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 5.910315320001246,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 37.1508,
      "model": "qwen3.6:27b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 37.1527,
        "metrics": {
          "energy_wh_estimate": 1.2715,
          "gpu_busy_percent_avg": 45.9,
          "gpu_busy_percent_peak": 78.0,
          "power_w_avg": 123.21,
          "power_w_peak": 185.0,
          "vram_mb_avg": 22380.5,
          "vram_mb_peak": 23958.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 76,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.72843715000272,
        "errors": [
          {
            "detail": "Changed \u201cWrite an email to Maya, subject...\u201d into To/Subject headers.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted the literal instruction wording \u201cWrite an email to Maya\u201d from the final text.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Expected inline subject punctuation, not email header formatting.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Preserved the meeting details but treated the dictated email request as an email header format instead of literal cleaned text."
      },
      "judge_status": "ok",
      "latency_seconds": 171.6584,
      "model": "qwen3.6:27b",
      "ollama_status": "ok",
      "output": "To: Maya\nSubject: Quarterly Update\nThe meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 171.6598,
        "metrics": {
          "energy_wh_estimate": 7.1792,
          "gpu_busy_percent_avg": 52.3,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 150.56,
          "power_w_peak": 340.0,
          "vram_mb_avg": 23614.0,
          "vram_mb_peak": 24043.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 344,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0027,
      "model": "qwen3.6:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0045,
        "metrics": {
          "energy_wh_estimate": 7.3125,
          "gpu_busy_percent_avg": 47.8,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 146.25,
          "power_w_peak": 261.0,
          "vram_mb_avg": 23635.5,
          "vram_mb_peak": 24049.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 8.01342368299811,
        "errors": [
          {
            "detail": "Title was formatted as '# Release Tasks:' instead of plain 'Release Tasks:'.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Content and checklist formatting are preserved; only the title formatting differs by using a Markdown heading."
      },
      "judge_status": "ok",
      "latency_seconds": 121.96,
      "model": "qwen3.6:27b",
      "ollama_status": "ok",
      "output": "# Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 121.9612,
        "metrics": {
          "energy_wh_estimate": 4.8304,
          "gpu_busy_percent_avg": 46.4,
          "gpu_busy_percent_peak": 81.0,
          "power_w_avg": 142.58,
          "power_w_peak": 163.0,
          "vram_mb_avg": 23084.9,
          "vram_mb_peak": 24049.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 245,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.350962728000013,
        "errors": [
          {
            "detail": "Added backticks around the command that were not dictated or expected.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "Command content is correct, but it adds unwanted Markdown code formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 163.876,
      "model": "qwen3.6:27b",
      "ollama_status": "ok",
      "output": "`cargo test | tee test-output.log`\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 163.8775,
        "metrics": {
          "energy_wh_estimate": 6.6449,
          "gpu_busy_percent_avg": 47.5,
          "gpu_busy_percent_peak": 78.0,
          "power_w_avg": 145.97,
          "power_w_peak": 210.0,
          "vram_mb_avg": 23596.2,
          "vram_mb_peak": 24043.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 328,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 9.70980698999847,
        "errors": [
          {
            "detail": "Changed the intended final identifier from ChirperBench to Chirper Benchmark.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Missed the concatenated CamelCase identifier formatting.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "The spoken edit commands were removed, but the final corrected identifier was not preserved: it should be the concatenated name ChirperBench, not Chirper Benchmark."
      },
      "judge_status": "ok",
      "latency_seconds": 95.6537,
      "model": "qwen3.6:27b",
      "ollama_status": "ok",
      "output": "Call it Chirper Benchmark.\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 95.6556,
        "metrics": {
          "energy_wh_estimate": 3.7844,
          "gpu_busy_percent_avg": 47.9,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 142.43,
          "power_w_peak": 355.0,
          "vram_mb_avg": 23338.8,
          "vram_mb_peak": 24043.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 192,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 7.262286178000068,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Perfectly formats the spoken HTTPS URL with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 76.9069,
      "model": "qwen3.6:27b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 76.9087,
        "metrics": {
          "energy_wh_estimate": 2.9112,
          "gpu_busy_percent_avg": 46.6,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 136.27,
          "power_w_peak": 159.0,
          "vram_mb_avg": 23201.9,
          "vram_mb_peak": 24049.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 155,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 6.045870261998061,
        "errors": [],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Correctly formatted both spoken email addresses and final punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 103.8237,
      "model": "qwen3.6:27b",
      "ollama_status": "ok",
      "output": "Send it to alex@chirper.local and cc ops@example.com.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 103.8253,
        "metrics": {
          "energy_wh_estimate": 4.0747,
          "gpu_busy_percent_avg": 47.4,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 141.29,
          "power_w_peak": 168.0,
          "vram_mb_avg": 23386.8,
          "vram_mb_peak": 23958.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 209,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0034,
      "model": "qwen3.6:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0051,
        "metrics": {
          "energy_wh_estimate": 7.4081,
          "gpu_busy_percent_avg": 48.3,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 148.16,
          "power_w_peak": 170.0,
          "vram_mb_avg": 23634.9,
          "vram_mb_peak": 24043.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": null,
      "judge_status": "not_run",
      "latency_seconds": 180.0034,
      "model": "qwen3.6:27b",
      "ollama_status": "timeout",
      "output": "",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 180.0048,
        "metrics": {
          "energy_wh_estimate": 7.3823,
          "gpu_busy_percent_avg": 47.8,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 147.64,
          "power_w_peak": 177.0,
          "vram_mb_avg": 23362.4,
          "vram_mb_peak": 23958.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 361,
        "status": "ok"
      },
      "timed_out": true
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 7.264269752999098,
        "errors": [
          {
            "detail": "Added a comma after the date that was not in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Content, casing, versions, date, and number formatting are correct; only an extra comma differs from expected punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 143.9601,
      "model": "qwen3.6:27b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 143.9613,
        "metrics": {
          "energy_wh_estimate": 6.0736,
          "gpu_busy_percent_avg": 50.4,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 151.88,
          "power_w_peak": 220.0,
          "vram_mb_avg": 23635.9,
          "vram_mb_peak": 24049.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 289,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 10.649929008999607,
        "errors": [
          {
            "detail": "Expected items on separate bullet lines after the heading, not a comma-separated sentence.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Preserves the items but fails to format the dictated shopping list as a markdown list."
      },
      "judge_status": "ok",
      "latency_seconds": 79.806,
      "model": "qwen3.6:27b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 79.8075,
        "metrics": {
          "energy_wh_estimate": 3.0784,
          "gpu_busy_percent_avg": 46.8,
          "gpu_busy_percent_peak": 84.0,
          "power_w_avg": 138.86,
          "power_w_peak": 164.0,
          "vram_mb_avg": 23212.0,
          "vram_mb_peak": 23957.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 161,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 6.104937519001396,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 48.325,
      "model": "qwen3.6:27b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 48.3267,
        "metrics": {
          "energy_wh_estimate": 1.6921,
          "gpu_busy_percent_avg": 45.6,
          "gpu_busy_percent_peak": 78.0,
          "power_w_avg": 126.05,
          "power_w_peak": 170.0,
          "vram_mb_avg": 22736.9,
          "vram_mb_peak": 23957.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 98,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.222968113997922,
        "errors": [
          {
            "detail": "Uses \"PR number 4\" instead of the expected \"PR #4\".",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Punctuation differs from the expected cleaned phrasing, though meaning is mostly preserved.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "Preserves the intended instruction, but misses the expected PR number formatting and slightly changes the sentence structure."
      },
      "judge_status": "ok",
      "latency_seconds": 19.5337,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "Please check the open PR number 4 for code review comments and check each one that's still open. If it needs to be fixed, fix it, and then resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 19.5352,
        "metrics": {
          "energy_wh_estimate": 0.8191,
          "gpu_busy_percent_avg": 68.2,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 150.95,
          "power_w_peak": 255.0,
          "vram_mb_avg": 18804.0,
          "vram_mb_peak": 20725.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 40,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 5.758423043000221,
        "errors": [],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly matches the expected cleaned text and preserves the command-like content as dictation."
      },
      "judge_status": "ok",
      "latency_seconds": 14.6337,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "Run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 14.6353,
        "metrics": {
          "energy_wh_estimate": 0.4232,
          "gpu_busy_percent_avg": 59.9,
          "gpu_busy_percent_peak": 92.0,
          "power_w_avg": 104.1,
          "power_w_peak": 251.0,
          "vram_mb_avg": 18811.1,
          "vram_mb_peak": 20743.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 31,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.433590287000698,
        "errors": [
          {
            "detail": "Expected a question mark at the end.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 85,
        "summary": "Preserved the dictated question but missed the required question mark."
      },
      "judge_status": "ok",
      "latency_seconds": 13.485,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": false,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 13.4865,
        "metrics": {
          "energy_wh_estimate": 0.3054,
          "gpu_busy_percent_avg": 54.8,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 81.54,
          "power_w_peak": 249.0,
          "vram_mb_avg": 18616.1,
          "vram_mb_peak": 20735.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 28,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.927020164999703,
        "errors": [
          {
            "detail": "Capitalized \"Subject\" instead of lowercase \"subject\".",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 96,
        "summary": "Content and formatting are preserved; only minor casing differs from expected."
      },
      "judge_status": "ok",
      "latency_seconds": 44.7094,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "Write an email to Maya, Subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\n",
      "passed": true,
      "score": 96,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 44.7115,
        "metrics": {
          "energy_wh_estimate": 2.5086,
          "gpu_busy_percent_avg": 78.9,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 201.99,
          "power_w_peak": 257.0,
          "vram_mb_avg": 19901.9,
          "vram_mb_peak": 20822.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 91,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 14.19435967499885,
        "errors": [
          {
            "detail": "List title was formatted as bold markdown and omitted the trailing colon instead of plain title text.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "List title and email subject used lower-case words where title case was expected.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Changed 'Please write an email to Maya, subject:' into separate sentences and placed 'Thanks!' on a separate paragraph.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 88,
        "summary": "Mostly preserves the intended content and mixed formatting, with minor casing and markdown/title punctuation issues."
      },
      "judge_status": "ok",
      "latency_seconds": 77.0199,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "Hello Chirper. I need to write down accent-friendly words.\n\n**Accent friendly words**\n* water\n* tomato\n* schedule\n* data\n* router\n* aluminium\n* privacy\n\nPlease write an email to Maya. Subject: Quarterly update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.\n\nThanks!\n\n",
      "passed": true,
      "score": 88,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 77.0215,
        "metrics": {
          "energy_wh_estimate": 4.9521,
          "gpu_busy_percent_avg": 83.1,
          "gpu_busy_percent_peak": 91.0,
          "power_w_avg": 231.46,
          "power_w_peak": 275.0,
          "vram_mb_avg": 20359.0,
          "vram_mb_peak": 20745.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 155,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 9.380244247997325,
        "errors": [
          {
            "detail": "Title uses a markdown heading instead of the dictated colon after Release Tasks.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Checklist item text should be sentence-cased: Run, Push, Publish.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 75,
        "summary": "Content and checklist structure are preserved, but title punctuation and item casing do not match the expected formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 18.6834,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "# Release Tasks\n- [ ] run tests\n- [ ] push tag\n- [ ] publish release\n\n",
      "passed": true,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 18.685,
        "metrics": {
          "energy_wh_estimate": 0.9052,
          "gpu_busy_percent_avg": 74.1,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 174.41,
          "power_w_peak": 260.0,
          "vram_mb_avg": 18771.6,
          "vram_mb_peak": 20745.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 39,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 5.395965522999177,
        "errors": [],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves the intended shell command and formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 14.585,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "cargo test | tee test-output.log\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 14.5872,
        "metrics": {
          "energy_wh_estimate": 0.6272,
          "gpu_busy_percent_avg": 69.6,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 154.81,
          "power_w_peak": 255.0,
          "vram_mb_avg": 18259.9,
          "vram_mb_peak": 20737.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 31,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 7.954911701999663,
        "errors": [
          {
            "detail": "Omitted \"Call it\" from the dictated sentence.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Rendered the identifier as two words instead of \"ChirperBench\".",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Applied some correction words but omitted required text and failed the intended identifier formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 37.9267,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "Chirper Benchmark\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 37.9286,
        "metrics": {
          "energy_wh_estimate": 2.4135,
          "gpu_busy_percent_avg": 81.4,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 229.08,
          "power_w_peak": 326.0,
          "vram_mb_avg": 19968.9,
          "vram_mb_peak": 20745.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 77,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 6.059651878000295,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the HTTPS URL, casing, path, and final punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 11.6384,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 11.6401,
        "metrics": {
          "energy_wh_estimate": 0.5436,
          "gpu_busy_percent_avg": 71.2,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 168.12,
          "power_w_peak": 257.0,
          "vram_mb_avg": 18361.2,
          "vram_mb_peak": 20830.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 25,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 5.5407292600029905,
        "errors": [],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the spoken email addresses and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 8.847,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "Send it to alex@chirper.local and cc ops@example.com.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.8485,
        "metrics": {
          "energy_wh_estimate": 0.3751,
          "gpu_busy_percent_avg": 69.3,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 152.63,
          "power_w_peak": 256.0,
          "vram_mb_avg": 17608.4,
          "vram_mb_peak": 20745.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 19,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 7.523754148001899,
        "errors": [
          {
            "detail": "Expected code-style casing: ChirperBench and DictationFormatter, not spaced title case.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Product/class identifiers were not formatted as required.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "The output preserves the sentence and punctuation but misses required product and class identifier casing by leaving spaces in both names."
      },
      "judge_status": "ok",
      "latency_seconds": 9.9611,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is Dictation Formatter.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 9.963,
        "metrics": {
          "energy_wh_estimate": 0.4305,
          "gpu_busy_percent_avg": 71.6,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 155.57,
          "power_w_peak": 255.0,
          "vram_mb_avg": 17943.2,
          "vram_mb_peak": 20830.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 21,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.499484371997823,
        "errors": [
          {
            "detail": "Code identifiers were not wrapped in inline code backticks.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Sentence should start with a capital letter.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Changed 'equal to' into '=' rather than preserving the dictated phrasing.",
            "severity": "minor",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Preserves the basic meaning but misses required inline code formatting and expected casing/wording."
      },
      "judge_status": "ok",
      "latency_seconds": 42.1044,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "set variable user_id = request.user.id.\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 42.1064,
        "metrics": {
          "energy_wh_estimate": 2.7188,
          "gpu_busy_percent_avg": 84.2,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 232.45,
          "power_w_peak": 265.0,
          "vram_mb_avg": 20060.1,
          "vram_mb_peak": 20830.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 86,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 7.5344826389991795,
        "errors": [
          {
            "detail": "Added a comma after '2026' that is not present in the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 96,
        "summary": "Content, casing, dates, versions, and numbers are correctly formatted; only an extra comma differs from expected punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 24.6075,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 96,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 24.6095,
        "metrics": {
          "energy_wh_estimate": 1.4727,
          "gpu_busy_percent_avg": 80.5,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 215.43,
          "power_w_peak": 267.0,
          "vram_mb_avg": 19576.7,
          "vram_mb_peak": 20745.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 51,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 6.6742654830013635,
        "errors": [
          {
            "detail": "Expected a colon followed by bullet list items, but output stayed inline with commas.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Kept comma-separated sentence punctuation instead of line breaks after the colon.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Content is preserved, but the spoken list was not formatted as a markdown list."
      },
      "judge_status": "ok",
      "latency_seconds": 9.5314,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 9.5331,
        "metrics": {
          "energy_wh_estimate": 0.4154,
          "gpu_busy_percent_avg": 69.9,
          "gpu_busy_percent_peak": 98.0,
          "power_w_avg": 156.86,
          "power_w_peak": 260.0,
          "vram_mb_avg": 17894.9,
          "vram_mb_peak": 20737.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 21,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 19.815154699997947,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly matches the expected cleaned text."
      },
      "judge_status": "ok",
      "latency_seconds": 8.0784,
      "model": "gemma4:26b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.0802,
        "metrics": {
          "energy_wh_estimate": 0.2992,
          "gpu_busy_percent_avg": 65.4,
          "gpu_busy_percent_peak": 97.0,
          "power_w_avg": 133.33,
          "power_w_peak": 253.0,
          "vram_mb_avg": 17416.3,
          "vram_mb_peak": 20737.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.593305197002337,
        "errors": [
          {
            "detail": "Changed 'then resolve the comments and push your fix' to 'resolve the comments before pushing your fix,' slightly altering sequence wording.",
            "severity": "minor",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Preserves the dictated instruction with clean punctuation and casing; only minor wording changes."
      },
      "judge_status": "ok",
      "latency_seconds": 8.0312,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "Please check open PR #4 for code review comments and check each one that's still open to see if it needs to be fixed. Then fix it and resolve the comments before pushing your fix.\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.0326,
        "metrics": {
          "energy_wh_estimate": 0.1619,
          "gpu_busy_percent_avg": 57.6,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 72.56,
          "power_w_peak": 220.0,
          "vram_mb_avg": 14795.8,
          "vram_mb_peak": 16450.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.809097933000885,
        "errors": [
          {
            "detail": "Added backticks around cargo test instead of preserving it as plain text.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Content, punctuation, and casing are preserved, but command-like text was incorrectly formatted as inline code."
      },
      "judge_status": "ok",
      "latency_seconds": 8.1095,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "Run `cargo test`, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.1114,
        "metrics": {
          "energy_wh_estimate": 0.1085,
          "gpu_busy_percent_avg": 54.9,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 48.17,
          "power_w_peak": 221.0,
          "vram_mb_avg": 14796.0,
          "vram_mb_peak": 16452.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.819203381997795,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves the dictated question with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 7.828,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.8296,
        "metrics": {
          "energy_wh_estimate": 0.075,
          "gpu_busy_percent_avg": 53.1,
          "gpu_busy_percent_peak": 77.0,
          "power_w_avg": 34.47,
          "power_w_peak": 131.0,
          "vram_mb_avg": 14698.5,
          "vram_mb_peak": 16451.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 17,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.712871288000315,
        "errors": [
          {
            "detail": "Generated an email body, greeting, signoff, and extra phrasing not in the transcript.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Changed the request from 'Write an email to Maya, subject...' into an actual composed email.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Did not format the budget as $12,450.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted the URL path /launch.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed chirper.local/launch into a markdown link to http://chirper.local.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 20,
        "summary": "Invented a full email instead of preserving the dictated request, with URL and currency formatting errors."
      },
      "judge_status": "ok",
      "latency_seconds": 8.6227,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "Subject: Quarterly Update \u2013 Meeting Moved to Thursday at 9:30 AM\n\nDear Maya,\n\nPlease find below the quarterly update. The meeting has been moved to Thursday at 9:30 AM.\n\nThe budget is twelve thousand four hundred fifty dollars.\n\nThe website is [chirper.local](http://chirper.local).\n\nBest regards,  \n[Your Name]\n\n",
      "passed": false,
      "score": 20,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.6242,
        "metrics": {
          "energy_wh_estimate": 0.127,
          "gpu_busy_percent_avg": 57.7,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 53.0,
          "power_w_peak": 220.0,
          "vram_mb_avg": 14882.9,
          "vram_mb_peak": 16451.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 19,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 14.617129936003039,
        "errors": [
          {
            "detail": "Leaves dictated commands such as Period, comma, End of list, New paragraph, Finish with thanks, and Exclamation mark.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Handles some URL and technical casing but misses list, email sentence structure, spoken edits, and numeric formatting.",
            "severity": "major",
            "type": "partial_mixed_task"
          },
          {
            "detail": "Bullet list title and items are incorrectly formatted with extra punctuation words and capitalization.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Uses 'chirper' instead of 'Chirper' and changes title styling to 'Accent-friendly words'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Splits and punctuates the email section incorrectly instead of producing the requested sentences.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Fails to format '9:30 AM' and '$12,450'.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Adds extra standalone command-like lines not intended for final text.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 25,
        "summary": "Pervasive spoken-command leakage and mixed-formatting misses, especially list cleanup, punctuation, number/currency formatting, and final thanks."
      },
      "judge_status": "ok",
      "latency_seconds": 9.5993,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "Hello chirper.  \nPeriod. I need to write down accent-friendly words.  \nThis is a bullet point list with title: Accent-friendly words.  \n- Water, comma  \n- Tomato, comma  \n- Schedule, comma  \n- Data, comma  \n- Router, comma  \n- Aluminium, comma  \n- Privacy, period  \n\nEnd of list.  \n\nNew paragraph: Please write an email to Maya.  \nSubject: Quarterly Update  \nPeriod. The meeting moved to Thursday at nine thirty a.m.  \nThe budget is twelve thousand four hundred fifty dollars.  \nThe website is chirper.local/launch.  \n\nNew paragraph in the deployment notes:  \nMention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.  \n\nFinish with thanks.  \nExclamation mark.\n\n",
      "passed": false,
      "score": 25,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 9.6009,
        "metrics": {
          "energy_wh_estimate": 0.193,
          "gpu_busy_percent_avg": 62.2,
          "gpu_busy_percent_peak": 96.0,
          "power_w_avg": 72.38,
          "power_w_peak": 221.0,
          "vram_mb_avg": 15033.1,
          "vram_mb_peak": 16454.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 21,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 9.431951690999995,
        "errors": [
          {
            "detail": "Leaves 'Make a Markdown checklist titled:' in the final text.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Adds instruction wording before the title.",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Title should be formatted as 'Release Tasks:'.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 60,
        "summary": "Checklist items are correct, but the output leaks the spoken formatting instruction instead of using only the requested title."
      },
      "judge_status": "ok",
      "latency_seconds": 8.1986,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "Make a Markdown checklist titled: Release Tasks\n\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release\n\n",
      "passed": false,
      "score": 60,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.2003,
        "metrics": {
          "energy_wh_estimate": 0.0877,
          "gpu_busy_percent_avg": 53.5,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 38.5,
          "power_w_peak": 214.0,
          "vram_mb_avg": 14796.0,
          "vram_mb_peak": 16452.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.6695102420017065,
        "errors": [
          {
            "detail": "Left spoken tokens like pipe, space, dash, and dot in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Failed to convert dictated tokens into shell syntax.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Capitalized cargo as Cargo.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 10,
        "summary": "Did not preserve shell command formatting; left dictated punctuation words and changed casing."
      },
      "judge_status": "ok",
      "latency_seconds": 8.0456,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "Cargo test pipe tee space test dash output dot log\n\n",
      "passed": false,
      "score": 10,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.0479,
        "metrics": {
          "energy_wh_estimate": 0.1074,
          "gpu_busy_percent_avg": 57.6,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 48.06,
          "power_w_peak": 222.0,
          "vram_mb_avg": 14795.8,
          "vram_mb_peak": 16451.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 8.10922207800104,
        "errors": [
          {
            "detail": "Leaves spoken edit commands such as 'no scratch that' and spelling instructions in the final text.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Formats the intended identifier as 'Chirper Bench' instead of 'ChirperBench'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Adds repeated and unintended words including 'the' and 'the bench'.",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Uses comma-separated phrasing instead of the expected short sentence.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Critical spoken-edit failure: the output preserves dictated correction words, changes the intended identifier, and adds extra text."
      },
      "judge_status": "ok",
      "latency_seconds": 8.2398,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "Call it the Chirper Bench, no scratch that Chirper Bench, spelled with a capital C and a capital B, the Chirper Bench, the bench\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.2417,
        "metrics": {
          "energy_wh_estimate": 0.0903,
          "gpu_busy_percent_avg": 53.7,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 39.44,
          "power_w_peak": 217.0,
          "vram_mb_avg": 14843.5,
          "vram_mb_peak": 16538.0,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 6.303534074999334,
        "errors": [
          {
            "detail": "Missing sentence-ending period after the URL.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "URL and casing are correct, but the final period is missing."
      },
      "judge_status": "ok",
      "latency_seconds": 8.0753,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.0769,
        "metrics": {
          "energy_wh_estimate": 0.0865,
          "gpu_busy_percent_avg": 54.9,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 38.56,
          "power_w_peak": 213.0,
          "vram_mb_avg": 14796.0,
          "vram_mb_peak": 16452.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 8.02785823999875,
        "errors": [
          {
            "detail": "Spoken 'at' was left as text instead of converted to @ in both email addresses.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Email local part was capitalized as Alex instead of alex.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Email addresses were not correctly formatted with @ symbols."
      },
      "judge_status": "ok",
      "latency_seconds": 8.0904,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "Send it to Alex at chirper.local and cc ops at example.com.\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.0921,
        "metrics": {
          "energy_wh_estimate": 0.096,
          "gpu_busy_percent_avg": 54.6,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 42.72,
          "power_w_peak": 219.0,
          "vram_mb_avg": 14839.3,
          "vram_mb_peak": 16537.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 10.13823227100147,
        "errors": [
          {
            "detail": "Did not convert product name to ChirperBench or class name to DictationFormatter.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Class identifier was formatted as quoted words instead of code-style PascalCase identifier.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Left the spoken punctuation command 'period' in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Added quotation marks around the class name.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Preserves the rough sentence but fails required product/class identifier casing and leaves spoken punctuation in the text."
      },
      "judge_status": "ok",
      "latency_seconds": 8.134,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is \"Dictation Formatter\" period\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.1358,
        "metrics": {
          "energy_wh_estimate": 0.1045,
          "gpu_busy_percent_avg": 56.7,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 46.22,
          "power_w_peak": 217.0,
          "vram_mb_avg": 14798.0,
          "vram_mb_peak": 16454.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.79975735500193,
        "errors": [
          {
            "detail": "Changed the intended property chain `request.user.id` into `request.user_id`.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Failed to preserve the dictated dotted code identifier as inline code.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "One inline identifier was formatted incorrectly, changing `request.user.id` to `request.user_id`."
      },
      "judge_status": "ok",
      "latency_seconds": 8.058,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "Set variable `user_id` equal to `request.user_id`.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.0595,
        "metrics": {
          "energy_wh_estimate": 0.0892,
          "gpu_busy_percent_avg": 53.9,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 39.83,
          "power_w_peak": 209.0,
          "vram_mb_avg": 14797.9,
          "vram_mb_peak": 16453.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 8.826442444002168,
        "errors": [
          {
            "detail": "Version 2.1.0 was reduced to 2.1.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Python 3.11+ was rendered as Python 3.11 plus.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added an unnecessary comma before and.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 72,
        "summary": "Mostly preserved content, but version and plus notation were formatted incorrectly."
      },
      "judge_status": "ok",
      "latency_seconds": 8.1439,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "Version 2.1 shipped on June 3, 2026, and supports Python 3.11 plus.\n\n",
      "passed": false,
      "score": 72,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.146,
        "metrics": {
          "energy_wh_estimate": 0.1036,
          "gpu_busy_percent_avg": 56.4,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 45.78,
          "power_w_peak": 218.0,
          "vram_mb_avg": 14843.9,
          "vram_mb_peak": 16564.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 10.674218968997593,
        "errors": [
          {
            "detail": "Did not convert the items after the colon into a bulleted list.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Did not capitalize the initial heading.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Kept inline commas and a final period instead of list formatting.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Content preserved, but the spoken list was not formatted as a markdown list and casing was missed."
      },
      "judge_status": "ok",
      "latency_seconds": 8.039,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.0409,
        "metrics": {
          "energy_wh_estimate": 0.0876,
          "gpu_busy_percent_avg": 53.7,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 39.22,
          "power_w_peak": 211.0,
          "vram_mb_avg": 14807.6,
          "vram_mb_peak": 16462.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 9.194472102000873,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected clean text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 8.0244,
      "model": "lfm2:24b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 8.0261,
        "metrics": {
          "energy_wh_estimate": 0.1022,
          "gpu_busy_percent_avg": 57.1,
          "gpu_busy_percent_peak": 94.0,
          "power_w_avg": 45.83,
          "power_w_peak": 228.0,
          "vram_mb_avg": 14807.6,
          "vram_mb_peak": 16462.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 18,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.461490621000848,
        "errors": [
          {
            "detail": "Omitted the instruction to fix needed issues, resolve comments, and push the fix.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed the dictated instruction into a weaker review-only request.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Did not format 'PR number 4' as 'PR #4'.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Dropped 'Please' from the start.",
            "severity": "minor",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 25,
        "summary": "Output preserves the general PR review topic but omits major dictated actions and fails required cleanup for PR number formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 3.7246,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "check the open PR number 4 for code review comments and review each open comment for necessary updates.\n\n",
      "passed": false,
      "score": 25,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.7274,
        "metrics": {
          "energy_wh_estimate": 0.1771,
          "gpu_busy_percent_avg": 62.3,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 171.11,
          "power_w_peak": 255.0,
          "vram_mb_avg": 2851.1,
          "vram_mb_peak": 3107.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.034898950001661,
        "errors": [
          {
            "detail": "Left 'comma' and 'period' as words instead of applying punctuation.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Did not capitalize the first word.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Includes spoken punctuation tokens that should not appear in final text.",
            "severity": "major",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 20,
        "summary": "Output is essentially unformatted, leaving spoken punctuation words and wrong casing."
      },
      "judge_status": "ok",
      "latency_seconds": 6.2117,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "run cargo test comma fix any failing tests comma then commit and push the branch period\n\n",
      "passed": false,
      "score": 20,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.2138,
        "metrics": {
          "energy_wh_estimate": 0.363,
          "gpu_busy_percent_avg": 73.4,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 210.36,
          "power_w_peak": 255.0,
          "vram_mb_avg": 2944.3,
          "vram_mb_peak": 3109.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 14,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.486624913999549,
        "errors": [
          {
            "detail": "Leaves \"question mark\" in the text instead of converting it to punctuation.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Missing final question mark.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Does not capitalize the first word.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "The output preserves the question content but leaves the spoken punctuation command and misses required capitalization and question mark formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 3.6368,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "what is the capital of France question mark\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.6386,
        "metrics": {
          "energy_wh_estimate": 0.2023,
          "gpu_busy_percent_avg": 68.7,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 200.22,
          "power_w_peak": 260.0,
          "vram_mb_avg": 2851.5,
          "vram_mb_peak": 3107.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 8.967393639002694,
        "errors": [
          {
            "detail": "Leaves spoken commands such as comma, period, dot, and slash in the output.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Fails to format the URL as chirper.local/launch and the amount as $12,450.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Does not capitalize the sentence start, Maya, Quarterly Update, or AM consistently.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Keeps dictated punctuation words instead of applying commas and periods.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 18,
        "summary": "Major formatting failures: spoken punctuation commands remain, casing is inconsistent, money and URL were not normalized, and the cleaned request is not preserved correctly."
      },
      "judge_status": "ok",
      "latency_seconds": 5.6279,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "write an email to maya comma Subject: quarterly update period the meeting moved to Thursday at 9:30 a.m. comma the budget is twelve thousand four hundred fifty dollars comma and the website is Chirper dot local slash launch period.\n\n",
      "passed": false,
      "score": 18,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.6298,
        "metrics": {
          "energy_wh_estimate": 0.3235,
          "gpu_busy_percent_avg": 72.0,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 206.92,
          "power_w_peak": 257.0,
          "vram_mb_avg": 2930.3,
          "vram_mb_peak": 3107.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 12.894665822997922,
        "errors": [
          {
            "detail": "Spoken commands like period, comma, colon, new paragraph, end of list, and finish with remained in the output.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Did not create the titled bullet list, paragraphs, email-style subject text, URL, currency, time, or technical casing.",
            "severity": "critical",
            "type": "formatting_miss"
          },
          {
            "detail": "Failed to capitalize Chirper, Maya, Quarterly Update, Thursday, AM, PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Failed to convert dictated punctuation words into punctuation marks.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Output left the raw spoken transcript unchanged, including punctuation commands and formatting instructions."
      },
      "judge_status": "ok",
      "latency_seconds": 9.4144,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "hello chirper period i need to write down accent friendly words period this is a bullet point list with title accent friendly words colon water comma tomato comma schedule comma data comma router comma aluminium comma privacy period end of list new paragraph please write an email to maya comma subject colon quarterly update period the meeting moved to thursday at nine thirty a m comma the budget is twelve thousand four hundred fifty dollars comma and the website is chirper dot local slash launch period new paragraph in the deployment notes comma mention that systemd keeps the chirper services running comma and we should also look at postgresql comma ffmpeg comma gnome comma nextcloud comma and tailscale period finish with thanks exclamation mark\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 9.4164,
        "metrics": {
          "energy_wh_estimate": 0.5992,
          "gpu_busy_percent_avg": 77.8,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 229.1,
          "power_w_peak": 265.0,
          "vram_mb_avg": 2993.8,
          "vram_mb_peak": 3109.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 20,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 8.316483113001595,
        "errors": [
          {
            "detail": "Missing markdown checklist syntax and colon after title.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Leaves spoken words like 'colon', 'item one', 'item two', and 'item three' in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Title and checklist items are not properly capitalized.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Duplicates 'release tasks' as an extra line.",
            "severity": "minor",
            "type": "extra_text"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 15,
        "summary": "The output does not format the dictated markdown checklist and leaks spoken structure words."
      },
      "judge_status": "ok",
      "latency_seconds": 11.0485,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "release tasks  \nrelease tasks colon item one run tests item two push tag item three publish release\n\n",
      "passed": false,
      "score": 15,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 11.0503,
        "metrics": {
          "energy_wh_estimate": 0.7047,
          "gpu_busy_percent_avg": 78.9,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 229.58,
          "power_w_peak": 260.0,
          "vram_mb_avg": 3013.0,
          "vram_mb_peak": 3109.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 24,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.013918443000875,
        "errors": [
          {
            "detail": "Did not render pipe, spaces, dash, or dot as shell command syntax.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Left spoken formatting tokens such as \"pipe\", \"space\", \"dash\", and \"dot\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Failed to convert dictated shell command tokens into the intended command structure."
      },
      "judge_status": "ok",
      "latency_seconds": 5.3821,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "cargo test pipe tee space test dash output dot log\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.3838,
        "metrics": {
          "energy_wh_estimate": 0.3073,
          "gpu_busy_percent_avg": 71.0,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 205.5,
          "power_w_peak": 260.0,
          "vram_mb_avg": 2915.5,
          "vram_mb_peak": 3107.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 7.575788410998939,
        "errors": [
          {
            "detail": "Left spoken edit commands such as \"no scratch that\" and spelling/casing instructions.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Did not apply the final intended corrected name ChirperBench.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Failed to capitalize ChirperBench and start the sentence with a capital letter.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Failed to apply spoken corrections and left edit instructions in the text."
      },
      "judge_status": "ok",
      "latency_seconds": 6.3111,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "chirper bench no scratch that chirper benchmark spelled capital c chirper capital b bench\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.3131,
        "metrics": {
          "energy_wh_estimate": 0.3731,
          "gpu_busy_percent_avg": 72.9,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 212.79,
          "power_w_peak": 260.0,
          "vram_mb_avg": 2944.3,
          "vram_mb_peak": 3109.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 14,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 10.538985709001281,
        "errors": [
          {
            "detail": "Left spoken tokens such as 'colon slash slash dot slash period' instead of formatting them.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Did not convert the dictated URL into https://docs.python.org/3/library/json.html.",
            "severity": "critical",
            "type": "formatting_miss"
          },
          {
            "detail": "Omitted the leading word 'Visit'.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed HTTPS to HTTP and failed to preserve the intended URL.",
            "severity": "major",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Failed to format the spoken HTTPS URL and left dictated punctuation/URL tokens in the output."
      },
      "judge_status": "ok",
      "latency_seconds": 6.1691,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "HTTP colon slash slash docs dot python dot org slash three slash library slash json dot html period\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.1713,
        "metrics": {
          "energy_wh_estimate": 0.3643,
          "gpu_busy_percent_avg": 72.8,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 212.57,
          "power_w_peak": 260.0,
          "vram_mb_avg": 2944.3,
          "vram_mb_peak": 3109.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 14,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 8.89090304599813,
        "errors": [
          {
            "detail": "Spoken email addresses were left as words instead of alex@chirper.local and ops@example.com.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "The spoken punctuation word \"period\" remains in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Sentence should start with a capital letter and chirper should remain lowercase.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 20,
        "summary": "Email addresses and spoken punctuation were not formatted, and casing was incorrect."
      },
      "judge_status": "ok",
      "latency_seconds": 6.6502,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "send it to alex at Chirper dot local and cc ops at example dot com period\n\n",
      "passed": false,
      "score": 20,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 6.6517,
        "metrics": {
          "energy_wh_estimate": 0.4105,
          "gpu_busy_percent_avg": 76.1,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 222.2,
          "power_w_peak": 261.0,
          "vram_mb_avg": 2947.3,
          "vram_mb_peak": 3100.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 15,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 7.5741840899972885,
        "errors": [
          {
            "detail": "Left the spoken word \"period\" instead of converting it to punctuation.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Did not convert \"chirper bench\" to \"ChirperBench\" or \"dictation formatter\" to \"DictationFormatter\".",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing final period punctuation.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Failed to apply required product/class casing and punctuation, and left the spoken punctuation command in the text."
      },
      "judge_status": "ok",
      "latency_seconds": 5.5823,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "the project name is chirper bench and the class name is dictation formatter period\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.5843,
        "metrics": {
          "energy_wh_estimate": 0.3278,
          "gpu_busy_percent_avg": 72.2,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 211.38,
          "power_w_peak": 259.0,
          "vram_mb_avg": 2923.1,
          "vram_mb_peak": 3100.6,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.293849571000464,
        "errors": [
          {
            "detail": "Did not format code identifiers as inline code.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Left spoken forms like 'underscore', 'dot', and 'period' in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Did not capitalize the opening word.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Did not convert dictated period to sentence punctuation.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 15,
        "summary": "Output was essentially unformatted and left dictated punctuation/identifier words intact."
      },
      "judge_status": "ok",
      "latency_seconds": 5.9512,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "set variable user underscore id equal to request dot user dot id period\n\n",
      "passed": false,
      "score": 15,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.9531,
        "metrics": {
          "energy_wh_estimate": 0.346,
          "gpu_busy_percent_avg": 72.4,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 209.23,
          "power_w_peak": 259.0,
          "vram_mb_avg": 2963.2,
          "vram_mb_peak": 3179.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 8.31051600399951,
        "errors": [
          {
            "detail": "Version number, date, Python version, and plus sign were not converted to required written forms.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Sentence start, Version, and Python casing are incorrect.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Leaves spoken 'period' instead of ending with a period.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "The output preserves the rough wording but fails to format versions, dates, numbers, language casing, and spoken punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 7.4377,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "version two point one point zero shipped on June third twenty twenty six and supports python three point eleven plus period\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 7.4399,
        "metrics": {
          "energy_wh_estimate": 0.4555,
          "gpu_busy_percent_avg": 74.8,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 220.44,
          "power_w_peak": 262.0,
          "vram_mb_avg": 2952.1,
          "vram_mb_peak": 3096.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 16,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 8.20665480399839,
        "errors": [
          {
            "detail": "Did not format the dictated shopping list as a heading plus markdown bullet list.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Did not capitalize the opening heading.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Left the spoken punctuation word \"period\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Omitted the colon after the heading and did not apply comma/list separation.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Preserves item words but fails required casing, punctuation, and list formatting, and leaks the spoken punctuation command."
      },
      "judge_status": "ok",
      "latency_seconds": 5.4569,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "shopping list apples bananas oat milk period\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.4588,
        "metrics": {
          "energy_wh_estimate": 0.3267,
          "gpu_busy_percent_avg": 73.1,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 215.5,
          "power_w_peak": 262.0,
          "vram_mb_avg": 2902.7,
          "vram_mb_peak": 3094.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 7.160849299001711,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches expected clean text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 3.6187,
      "model": "lfm2.5-thinking:1.2b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.6206,
        "metrics": {
          "energy_wh_estimate": 0.1887,
          "gpu_busy_percent_avg": 64.9,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 187.67,
          "power_w_peak": 260.0,
          "vram_mb_avg": 2838.7,
          "vram_mb_peak": 3094.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 8.879506691002462,
        "errors": [
          {
            "detail": "Rephrases 'Fix it' as 'do so,' but the intended meaning is preserved.",
            "severity": "minor",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 96,
        "summary": "Preserves the dictated instruction with appropriate casing and punctuation; only minor wording and sentence-structure changes."
      },
      "judge_status": "ok",
      "latency_seconds": 4.3586,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "Please check the open PR #4 for code review comments. Check each one that's still open; if it needs to be fixed, do so. Then, resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 96,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.3604,
        "metrics": {
          "energy_wh_estimate": 0.0751,
          "gpu_busy_percent_avg": 22.4,
          "gpu_busy_percent_peak": 92.0,
          "power_w_avg": 62.0,
          "power_w_peak": 291.0,
          "vram_mb_avg": 4610.2,
          "vram_mb_peak": 12092.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.491939589999674,
        "errors": [
          {
            "detail": "Wrapped cargo test in backticks instead of preserving it as plain text.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 92,
        "summary": "Content, casing, and punctuation are correct, but it added markdown code formatting not present in the expected literal text."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3155,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "Run `cargo test`, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 92,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.3174,
        "metrics": {
          "energy_wh_estimate": 0.0464,
          "gpu_busy_percent_avg": 23.3,
          "gpu_busy_percent_peak": 84.0,
          "power_w_avg": 72.17,
          "power_w_peak": 298.0,
          "vram_mb_avg": 6686.8,
          "vram_mb_peak": 12048.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.385514180998143,
        "errors": [
          {
            "detail": "Missing final question mark.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 85,
        "summary": "Content and casing are correct, but the dictated question mark was omitted."
      },
      "judge_status": "ok",
      "latency_seconds": 2.1347,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": false,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.137,
        "metrics": {
          "energy_wh_estimate": 0.042,
          "gpu_busy_percent_avg": 28.0,
          "gpu_busy_percent_peak": 92.0,
          "power_w_avg": 70.83,
          "power_w_peak": 269.0,
          "vram_mb_avg": 6767.1,
          "vram_mb_peak": 12093.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 12.35385393300021,
        "errors": [
          {
            "detail": "Generated an email-style body instead of returning the dictated request as final text.",
            "severity": "critical",
            "type": "over_generation"
          },
          {
            "detail": "Included duplicate content, a separator, and a 'Cleaned-up final text' label.",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Omitted 'Write an email to Maya,' from the intended output.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed 'The meeting moved' to 'The meeting has been moved'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Incorrectly formatted the URL as a markdown HTTPS link and split the sentence into separate lines.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Used A.M./a.m. instead of AM and did not preserve the expected title/casing structure consistently.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 20,
        "summary": "Executed the email-writing request instead of preserving it as dictated content, added meta text and duplicate content, and changed formatting/wording."
      },
      "judge_status": "ok",
      "latency_seconds": 3.3601,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "Subject: Quarterly Update  \nThe meeting has been moved to Thursday at 9:30 A.M.  \nThe budget is $12,450.  \nThe website is [https://chirper.local/launch](https://chirper.local/launch).  \n\n---  \n**Cleaned-up final text:**  \nSubject: Quarterly Update  \nThe meeting has been moved to Thursday at 9:30 a.m.  \nThe budget is twelve thousand four hundred fifty dollars.  \nThe website is chirper.local/launch.\n\n",
      "passed": false,
      "score": 20,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.3621,
        "metrics": {
          "energy_wh_estimate": 0.1202,
          "gpu_busy_percent_avg": 42.8,
          "gpu_busy_percent_peak": 92.0,
          "power_w_avg": 128.75,
          "power_w_peak": 303.0,
          "vram_mb_avg": 8050.4,
          "vram_mb_peak": 12093.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 36.45901332499852,
        "errors": [
          {
            "detail": "Left spoken commands such as \"this is a bullet point list with title\" and \"Finish with thanks\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Omitted \"I need to write down accent-friendly words.\"",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed the opening from two sentences into a different list-introduction sentence.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Added a period to the final bullet and converted the URL into a markdown link instead of preserving plain URL text.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Used \"9:30 A.M.\" instead of \"9:30 AM\" and missed the final \"Thanks!\" sentence.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 52,
        "summary": "Major mixed-formatting errors: omitted intended text, leaked spoken formatting instructions, and mishandled the closing thanks instruction."
      },
      "judge_status": "ok",
      "latency_seconds": 3.7799,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "Hello Chirper, this is a bullet point list with title: Accent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy.\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 A.M., the budget is $12,450, and the website is [chirper.local/launch](http://chirper.local/launch).\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Finish with thanks!\n\n",
      "passed": false,
      "score": 52,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.7818,
        "metrics": {
          "energy_wh_estimate": 0.1514,
          "gpu_busy_percent_avg": 47.8,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 144.11,
          "power_w_peak": 308.0,
          "vram_mb_avg": 7932.1,
          "vram_mb_peak": 12093.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 13.248860175001028,
        "errors": [
          {
            "detail": "Used a level-2 markdown heading for the title instead of the expected plain 'Release Tasks:' line.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Content and checklist formatting are preserved; only the title style differs from the expected plain title line."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3639,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "## Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.3657,
        "metrics": {
          "energy_wh_estimate": 0.0466,
          "gpu_busy_percent_avg": 23.5,
          "gpu_busy_percent_peak": 85.0,
          "power_w_avg": 71.0,
          "power_w_peak": 302.0,
          "vram_mb_avg": 6695.3,
          "vram_mb_peak": 12093.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 5.695540898999752,
        "errors": [],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves and formats the intended shell command."
      },
      "judge_status": "ok",
      "latency_seconds": 2.1389,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "cargo test | tee test-output.log\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.1406,
        "metrics": {
          "energy_wh_estimate": 0.0379,
          "gpu_busy_percent_avg": 25.2,
          "gpu_busy_percent_peak": 84.0,
          "power_w_avg": 63.83,
          "power_w_peak": 250.0,
          "vram_mb_avg": 6702.8,
          "vram_mb_peak": 12093.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 9.587851533000503,
        "errors": [
          {
            "detail": "Omitted \"Call it\" from the final text.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Rendered the corrected identifier as \"Chirper Benchmark\" instead of \"ChirperBench\".",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "The output removes spoken edit words but changes the intended identifier and omits surrounding text and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 2.137,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "Chirper Benchmark\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.1388,
        "metrics": {
          "energy_wh_estimate": 0.0347,
          "gpu_busy_percent_avg": 24.8,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 58.5,
          "power_w_peak": 234.0,
          "vram_mb_avg": 5851.6,
          "vram_mb_peak": 12093.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 5.750014659999579,
        "errors": [],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly matches the expected formatted HTTPS URL with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 2.2461,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "Visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.2482,
        "metrics": {
          "energy_wh_estimate": 0.0432,
          "gpu_busy_percent_avg": 25.3,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 69.17,
          "power_w_peak": 292.0,
          "vram_mb_avg": 6695.6,
          "vram_mb_peak": 12050.9,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 7.799003553001967,
        "errors": [
          {
            "detail": "Left the first email as 'Alex at chirper.local' instead of 'alex@chirper.local'.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changed 'cc' to 'CC' and capitalized the email local part as 'Alex'.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "One spoken email address was not converted to email format, and casing differs from expected."
      },
      "judge_status": "ok",
      "latency_seconds": 2.2594,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "Send it to Alex at chirper.local and CC ops@example.com.\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.2615,
        "metrics": {
          "energy_wh_estimate": 0.0398,
          "gpu_busy_percent_avg": 25.0,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 63.33,
          "power_w_peak": 258.0,
          "vram_mb_avg": 5851.5,
          "vram_mb_peak": 12093.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.852484244001971,
        "errors": [
          {
            "detail": "Used spaced title case instead of code-style identifiers ChirperBench and DictationFormatter.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Did not format the project and class names as required identifiers.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 75,
        "summary": "The output preserved the sentence and punctuation but failed required product and class identifier casing."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3269,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is Dictation Formatter.\n\n",
      "passed": false,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.3288,
        "metrics": {
          "energy_wh_estimate": 0.046,
          "gpu_busy_percent_avg": 25.0,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 71.17,
          "power_w_peak": 297.0,
          "vram_mb_avg": 6696.8,
          "vram_mb_peak": 12094.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.822720367003058,
        "errors": [
          {
            "detail": "Did not format `user_id` and `request.user.id` as inline code.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Sentence starts lowercase instead of capitalized.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Uses a semicolon instead of the dictated period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Preserves the intended content and identifiers, but misses required inline code formatting, casing, and final punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 2.2552,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "set variable user_id equal to request.user.id;\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.2573,
        "metrics": {
          "energy_wh_estimate": 0.0359,
          "gpu_busy_percent_avg": 22.2,
          "gpu_busy_percent_peak": 74.0,
          "power_w_avg": 57.33,
          "power_w_peak": 217.0,
          "vram_mb_avg": 5853.1,
          "vram_mb_peak": 12094.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 9.424029370999051,
        "errors": [
          {
            "detail": "Added \"was\" before \"shipped,\" changing the dictated wording slightly.",
            "severity": "minor",
            "type": "extra_text"
          },
          {
            "detail": "Added an unnecessary comma before \"and supports\" compared with the expected output.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 88,
        "summary": "Mostly correct formatting of numbers, date, versions, and casing, with minor wording and punctuation changes."
      },
      "judge_status": "ok",
      "latency_seconds": 2.3805,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 was shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 88,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.3826,
        "metrics": {
          "energy_wh_estimate": 0.0474,
          "gpu_busy_percent_avg": 26.5,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 71.67,
          "power_w_peak": 296.0,
          "vram_mb_avg": 6704.8,
          "vram_mb_peak": 12096.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 8.045848463996663,
        "errors": [
          {
            "detail": "Expected three bullet list items after the colon, but output kept items inline with commas.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Content is preserved, but the spoken list was not formatted as a markdown list."
      },
      "judge_status": "ok",
      "latency_seconds": 2.1562,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.1578,
        "metrics": {
          "energy_wh_estimate": 0.0433,
          "gpu_busy_percent_avg": 26.7,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 72.33,
          "power_w_peak": 294.0,
          "vram_mb_avg": 6697.0,
          "vram_mb_peak": 12050.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 6.121093256999302,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected clean text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 2.1678,
      "model": "granite4.1:8b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.169,
        "metrics": {
          "energy_wh_estimate": 0.0415,
          "gpu_busy_percent_avg": 26.8,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 68.83,
          "power_w_peak": 261.0,
          "vram_mb_avg": 6689.7,
          "vram_mb_peak": 12050.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.052487797001959,
        "errors": [
          {
            "detail": "Did not format 'PR number 4' as 'PR #4'.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Split the conditional phrasing differently from the expected cleanup.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 88,
        "summary": "Preserves the dictated instruction as content with reasonable cleanup, but misses the PR number identifier formatting and slightly alters sentence structure."
      },
      "judge_status": "ok",
      "latency_seconds": 2.326,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "Please check the open PR number 4 for code review comments and check each one that's still open. If it needs to be fixed, fix it and then resolve the comments and push your fix.\n\n",
      "passed": true,
      "score": 88,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.3277,
        "metrics": {
          "energy_wh_estimate": 0.0392,
          "gpu_busy_percent_avg": 18.2,
          "gpu_busy_percent_peak": 87.0,
          "power_w_avg": 60.67,
          "power_w_peak": 252.0,
          "vram_mb_avg": 3209.8,
          "vram_mb_peak": 6700.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 8.32954409499871,
        "errors": [
          {
            "detail": "The spoken edit word \"comma\" remains in the final text.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Sentence should start with capitalized \"Run\".",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Punctuation is corrupted by retained spoken comma words.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Leaves spoken punctuation commands in the output and misses required casing cleanup."
      },
      "judge_status": "ok",
      "latency_seconds": 1.4205,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "run cargo test, comma fix any failing tests, comma then commit and push the branch.\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.4224,
        "metrics": {
          "energy_wh_estimate": 0.0243,
          "gpu_busy_percent_avg": 25.2,
          "gpu_busy_percent_peak": 63.0,
          "power_w_avg": 61.5,
          "power_w_peak": 175.0,
          "vram_mb_avg": 4054.3,
          "vram_mb_peak": 6700.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.3864770229993155,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output correctly formats the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 1.323,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.325,
        "metrics": {
          "energy_wh_estimate": 0.0245,
          "gpu_busy_percent_avg": 32.8,
          "gpu_busy_percent_peak": 84.0,
          "power_w_avg": 66.5,
          "power_w_peak": 190.0,
          "vram_mb_avg": 3982.4,
          "vram_mb_peak": 6700.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 10.168570578996878,
        "errors": [
          {
            "detail": "Leaves the spoken punctuation word 'Period' in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not format 'twelve thousand four hundred fifty dollars' as '$12,450'.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing the sentence break after 'Quarterly Update'.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Capitalizes 'Subject' and 'The budget' where the expected cleaned text uses lowercase in context.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 58,
        "summary": "Preserves the general request but misses key spoken punctuation and currency formatting, leaving 'Period' in the text and failing to format the dollar amount."
      },
      "judge_status": "ok",
      "latency_seconds": 1.6026,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "Write an email to Maya, Subject: Quarterly Update Period The meeting moved to Thursday at 9:30 AM, The budget is twelve thousand four hundred fifty dollars, and the website is chirper.local/launch.\n\n",
      "passed": false,
      "score": 58,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.6046,
        "metrics": {
          "energy_wh_estimate": 0.0527,
          "gpu_busy_percent_avg": 42.4,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 118.2,
          "power_w_peak": 255.0,
          "vram_mb_avg": 4566.0,
          "vram_mb_peak": 6656.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 5,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 15.290073875999951,
        "errors": [
          {
            "detail": "Leaves dictated commands such as 'This is a bullet point list with title,' 'End of List,' 'New Paragraph,' and 'Finish with' in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Handles some punctuation and URL cleanup but misses list cleanup, time, currency, and several casing requirements.",
            "severity": "major",
            "type": "partial_mixed_task"
          },
          {
            "detail": "List title and bullets do not match expected markdown formatting; email subject is split into a separate heading instead of inline text.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Splits several clauses into separate sentences instead of preserving the intended sentence structure.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Uses capitalized bullet items and 'NextCloud' instead of expected lowercase bullets and 'Nextcloud'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Does not produce the final standalone 'Thanks!' correctly because it remains attached to a spoken edit command.",
            "severity": "minor",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Preserves much of the content but leaves spoken formatting commands, misses key numeric/currency/time formatting, and mishandles list and technical casing details."
      },
      "judge_status": "ok",
      "latency_seconds": 2.4009,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "Hello Chirper. I need to write down accent-friendly words.\n\nThis is a bullet point list with title: Accent Friendly Words\n\n- Water\n- Tomato\n- Schedule\n- Data\n- Router\n- Aluminium\n- Privacy\n\nEnd of List\n\nNew Paragraph\n\nPlease write an email to Maya.\n\nSubject: Quarterly Update\n\nThe meeting moved to Thursday at nine thirty a.m.\n\nThe budget is twelve thousand four hundred fifty dollars.\n\nAnd the website is chirper.local/launch.\n\nNew Paragraph\n\nIn the deployment notes, mention that systemd keeps the Chirper services running. We should also look at PostgreSQL, FFmpeg, GNOME, NextCloud, and Tailscale.\n\nFinish with Thanks!\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 2.4028,
        "metrics": {
          "energy_wh_estimate": 0.0919,
          "gpu_busy_percent_avg": 47.0,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 137.67,
          "power_w_peak": 256.0,
          "vram_mb_avg": 4938.1,
          "vram_mb_peak": 6700.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 6,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 8.047820875999605,
        "errors": [
          {
            "detail": "Used plain bullets instead of markdown checklist items.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Rendered the title as a heading without the required trailing colon.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Preserved the release task content but missed the required checklist formatting and colon title."
      },
      "judge_status": "ok",
      "latency_seconds": 1.3712,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "# Release Tasks\n\n- Run tests\n- Push tag\n- Publish release\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.3732,
        "metrics": {
          "energy_wh_estimate": 0.0206,
          "gpu_busy_percent_avg": 13.2,
          "gpu_busy_percent_peak": 42.0,
          "power_w_avg": 54.0,
          "power_w_peak": 135.0,
          "vram_mb_avg": 4043.5,
          "vram_mb_peak": 6656.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 7.113251198999933,
        "errors": [
          {
            "detail": "Left spoken punctuation words instead of converting them to shell syntax.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Output contains dictated formatting tokens like pipe, space, dash, and dot.",
            "severity": "major",
            "type": "instruction_leak"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 15,
        "summary": "Did not format the dictated shell command or preserve command syntax."
      },
      "judge_status": "ok",
      "latency_seconds": 1.3336,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "cargo test pipe tee space test dash output dot log\n\n",
      "passed": false,
      "score": 15,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.3357,
        "metrics": {
          "energy_wh_estimate": 0.0181,
          "gpu_busy_percent_avg": 14.2,
          "gpu_busy_percent_peak": 36.0,
          "power_w_avg": 48.75,
          "power_w_peak": 122.0,
          "vram_mb_avg": 4034.8,
          "vram_mb_peak": 6656.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 7.446073428000091,
        "errors": [
          {
            "detail": "Omitted \"Call it\" from the dictated content.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Rendered the final corrected identifier as \"Chirper Benchmark\" instead of camel-cased \"ChirperBench\".",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Output drops required words and formats the corrected identifier incorrectly."
      },
      "judge_status": "ok",
      "latency_seconds": 1.2834,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "Chirper Benchmark\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.2855,
        "metrics": {
          "energy_wh_estimate": 0.0263,
          "gpu_busy_percent_avg": 31.0,
          "gpu_busy_percent_peak": 79.0,
          "power_w_avg": 73.75,
          "power_w_peak": 212.0,
          "vram_mb_avg": 4085.5,
          "vram_mb_peak": 6656.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 7.479015829998389,
        "errors": [
          {
            "detail": "Sentence starts with lowercase \"visit\" instead of \"Visit\".",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "URL and punctuation are correct; only sentence capitalization is missed."
      },
      "judge_status": "ok",
      "latency_seconds": 1.3601,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "visit https://docs.python.org/3/library/json.html.\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.3619,
        "metrics": {
          "energy_wh_estimate": 0.0205,
          "gpu_busy_percent_avg": 17.2,
          "gpu_busy_percent_peak": 42.0,
          "power_w_avg": 54.25,
          "power_w_peak": 139.0,
          "vram_mb_avg": 4054.9,
          "vram_mb_peak": 6702.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 6.017739609000273,
        "errors": [
          {
            "detail": "Sentence starts with lowercase 'send' instead of 'Send'.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Email addresses and punctuation are correct; only sentence casing is missed."
      },
      "judge_status": "ok",
      "latency_seconds": 1.3766,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "send it to alex@chirper.local and cc ops@example.com.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.3792,
        "metrics": {
          "energy_wh_estimate": 0.0307,
          "gpu_busy_percent_avg": 29.5,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 80.25,
          "power_w_peak": 239.0,
          "vram_mb_avg": 4043.3,
          "vram_mb_peak": 6656.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.880578031999903,
        "errors": [
          {
            "detail": "Title-cased nearly every word instead of normal sentence casing.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Did not apply product/class identifier casing: ChirperBench and DictationFormatter.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Left spoken punctuation command \"Period\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Missing the final period as punctuation.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Incorrect casing and identifier formatting; spoken punctuation was left in the output."
      },
      "judge_status": "ok",
      "latency_seconds": 1.4104,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "The Project Name Is Chirper Bench And The ClassName Is Dictation Formatter Period\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.4121,
        "metrics": {
          "energy_wh_estimate": 0.0292,
          "gpu_busy_percent_avg": 33.0,
          "gpu_busy_percent_peak": 83.0,
          "power_w_avg": 74.5,
          "power_w_peak": 221.0,
          "vram_mb_avg": 3331.4,
          "vram_mb_peak": 6700.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.029295203999936,
        "errors": [
          {
            "detail": "Changed the intended sentence into a pseudo-code assignment with 'setVariable:'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Did not format identifiers as inline code with backticks.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Expected sentence casing with 'Set'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Used semicolon instead of a final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Output converts the dictated sentence into a code-like statement, changes wording/casing, and misses required inline code markdown."
      },
      "judge_status": "ok",
      "latency_seconds": 1.3406,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "setVariable: user_id = request.user.id;\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.3427,
        "metrics": {
          "energy_wh_estimate": 0.029,
          "gpu_busy_percent_avg": 31.5,
          "gpu_busy_percent_peak": 82.0,
          "power_w_avg": 77.75,
          "power_w_peak": 231.0,
          "vram_mb_avg": 4056.8,
          "vram_mb_peak": 6700.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 8.242734776998986,
        "errors": [
          {
            "detail": "Did not convert version 2.1.0, date June 3, 2026, Python 3.11+, or final period.",
            "severity": "critical",
            "type": "formatting_miss"
          },
          {
            "detail": "Output uses title case instead of sentence case and proper language casing.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Leaves the spoken punctuation word \"Period\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Failed to format versions, date, number, casing, and punctuation; left dictated punctuation as text."
      },
      "judge_status": "ok",
      "latency_seconds": 1.4579,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "Version Two Point One Point Zero Shipped On June Third Twenty Twenty Six And Supports Python Three Point Eleven Plus Period\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.46,
        "metrics": {
          "energy_wh_estimate": 0.03,
          "gpu_busy_percent_avg": 32.8,
          "gpu_busy_percent_peak": 85.0,
          "power_w_avg": 74.0,
          "power_w_peak": 220.0,
          "vram_mb_avg": 4004.2,
          "vram_mb_peak": 6787.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 22.56501151699922,
        "errors": [
          {
            "detail": "Did not convert the comma-separated items into a bullet list after the colon.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Did not capitalize the initial heading.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Content is preserved, but the spoken list was not formatted as a markdown list and casing was missed."
      },
      "judge_status": "ok",
      "latency_seconds": 1.3434,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "shopping list: apples, bananas, oat milk\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.3455,
        "metrics": {
          "energy_wh_estimate": 0.0254,
          "gpu_busy_percent_avg": 26.0,
          "gpu_busy_percent_peak": 64.0,
          "power_w_avg": 68.0,
          "power_w_peak": 193.0,
          "vram_mb_avg": 3982.4,
          "vram_mb_peak": 6700.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 6.025609811000322,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 1.3777,
      "model": "granite4.1:3b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 1.3796,
        "metrics": {
          "energy_wh_estimate": 0.0295,
          "gpu_busy_percent_avg": 37.5,
          "gpu_busy_percent_peak": 81.0,
          "power_w_avg": 77.0,
          "power_w_peak": 200.0,
          "vram_mb_avg": 3648.3,
          "vram_mb_peak": 7176.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 4,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 11.039292271001614,
        "errors": [
          {
            "detail": "Leaves the spoken command \u201cPeriod.\u201d in the output instead of converting it to punctuation.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not format \u201cPR number 4\u201d as \u201cPR #4\u201d.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Drops the opening \u201cPlease\u201d from the dictated text.",
            "severity": "minor",
            "type": "missing_text"
          },
          {
            "detail": "Misses the comma in \u201cFix it, then resolve...\u201d",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Mostly preserves the instruction as content, but leaves the spoken punctuation command and misses the requested PR number formatting."
      },
      "judge_status": "ok",
      "latency_seconds": 4.1817,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "Check the open PR number 4 for code review comments and check each one that\u2019s still open if it needs to be fixed. Period. Fix it and then resolve the comments and push your fix.\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.1835,
        "metrics": {
          "energy_wh_estimate": 0.1076,
          "gpu_busy_percent_avg": 44.9,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 92.6,
          "power_w_peak": 287.0,
          "vram_mb_avg": 4979.9,
          "vram_mb_peak": 6529.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 6.618910348999634,
        "errors": [],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text."
      },
      "judge_status": "ok",
      "latency_seconds": 3.408,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "Run cargo test, fix any failing tests, then commit and push the branch.\n\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.4094,
        "metrics": {
          "energy_wh_estimate": 0.0752,
          "gpu_busy_percent_avg": 40.9,
          "gpu_busy_percent_peak": 86.0,
          "power_w_avg": 79.38,
          "power_w_peak": 268.0,
          "vram_mb_avg": 4807.9,
          "vram_mb_peak": 6738.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.304766633998952,
        "errors": [],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Correctly formatted the dictated question without answering it."
      },
      "judge_status": "ok",
      "latency_seconds": 3.3481,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.3498,
        "metrics": {
          "energy_wh_estimate": 0.0642,
          "gpu_busy_percent_avg": 42.2,
          "gpu_busy_percent_peak": 85.0,
          "power_w_avg": 69.0,
          "power_w_peak": 180.0,
          "vram_mb_avg": 4748.0,
          "vram_mb_peak": 6682.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 12.151973271000315,
        "errors": [
          {
            "detail": "Changed the literal request into separated email-style sections instead of preserving it as a single instruction sentence.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Added blank lines and a Subject header rather than keeping `subject:` inline.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Added `https://` to the dictated URL.",
            "severity": "minor",
            "type": "semantic_change"
          },
          {
            "detail": "Rendered `9:30 a.m.` instead of `9:30 AM`.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Missed comma before `and the website` and split the final sentence differently.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Preserves most content but reformats the dictated request into an email-like layout and changes URL/punctuation/casing from the expected cleaned text."
      },
      "judge_status": "ok",
      "latency_seconds": 3.6961,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "Write an email to Maya.\n\nSubject: Quarterly Update\n\nThe meeting moved to Thursday at 9:30 a.m. The budget is $12,450 and the website is https://chirper.local/launch.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.6977,
        "metrics": {
          "energy_wh_estimate": 0.1026,
          "gpu_busy_percent_avg": 45.7,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 99.89,
          "power_w_peak": 270.0,
          "vram_mb_avg": 4996.8,
          "vram_mb_peak": 6634.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 14.968034759000147,
        "errors": [
          {
            "detail": "Leaves dictated commands such as 'End of list', 'New paragraph', and 'Finish with' in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not format the accent-friendly words as a titled bullet list.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changes 'aluminium' to 'aluminum'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Does not format the budget as '$12,450'.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Uses 'Gnome' instead of 'GNOME'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Handles some punctuation and URL cleanup but misses list formatting, spoken edit removal, currency formatting, and technical casing.",
            "severity": "major",
            "type": "partial_mixed_task"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Major mixed-formatting failures: spoken commands leaked, bullet list was not created, several required casing/normalization items were missed, and one word was changed."
      },
      "judge_status": "ok",
      "latency_seconds": 4.3345,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "Hello Chirper. I need to write down accent-friendly words. This is a bullet-point list with the title \u201cAccent-Friendly Words\u201d: water, tomato, schedule, data, router, aluminum, privacy.\n\nEnd of list.\n\nNew paragraph. Please write an email to Maya, subject: \u201cQuarterly Update\u201d. The meeting moved to Thursday at 9:30 a.m., the budget is twelve thousand four hundred fifty dollars, and the website is chirper.local/launch.\n\nNew paragraph. In the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, Gnome, Nextcloud, and Tailscale.\n\nFinish with: \u201cThanks!\u201d\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.3359,
        "metrics": {
          "energy_wh_estimate": 0.1507,
          "gpu_busy_percent_avg": 53.4,
          "gpu_busy_percent_peak": 100.0,
          "power_w_avg": 125.1,
          "power_w_peak": 289.0,
          "vram_mb_avg": 5147.8,
          "vram_mb_peak": 6666.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 20.993688851001934,
        "errors": [
          {
            "detail": "Title should be plain text with a colon, not an H1 heading.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Missing colon after `Release Tasks`.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "Checklist content is preserved, but the dictated colon title was formatted as a Markdown heading instead of `Release Tasks:`."
      },
      "judge_status": "ok",
      "latency_seconds": 3.898,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "# Release Tasks\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release\n\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.8993,
        "metrics": {
          "energy_wh_estimate": 0.0745,
          "gpu_busy_percent_avg": 41.2,
          "gpu_busy_percent_peak": 84.0,
          "power_w_avg": 68.78,
          "power_w_peak": 200.0,
          "vram_mb_avg": 5001.8,
          "vram_mb_peak": 6729.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.345069921000686,
        "errors": [
          {
            "detail": "Did not convert spoken 'pipe' into the shell pipe operator '|'. ",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changed 'test-output.log' into '-- output.log'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Capitalized 'cargo' incorrectly.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 15,
        "summary": "Command structure was not preserved: pipe was left as a word, filename was changed, and casing was wrong."
      },
      "judge_status": "ok",
      "latency_seconds": 3.9135,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "Cargo test pipe tee -- output.log\n\n",
      "passed": false,
      "score": 15,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.9149,
        "metrics": {
          "energy_wh_estimate": 0.0582,
          "gpu_busy_percent_avg": 34.0,
          "gpu_busy_percent_peak": 76.0,
          "power_w_avg": 53.56,
          "power_w_peak": 96.0,
          "vram_mb_avg": 4680.0,
          "vram_mb_peak": 6679.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 9.07505897199735,
        "errors": [
          {
            "detail": "Leaves 'No, scratch that' and spelling instructions in the output.",
            "severity": "critical",
            "type": "instruction_leak"
          },
          {
            "detail": "Does not apply the final correction to produce the identifier ChirperBench.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Formats the intended identifier as separate words instead of CamelCase.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Drops 'it' from 'Call it'.",
            "severity": "minor",
            "type": "missing_text"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 0,
        "summary": "Critical instruction leak and semantic change: spoken correction commands were retained and the final corrected identifier was not produced."
      },
      "judge_status": "ok",
      "latency_seconds": 4.2256,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "Call Chirper Bench. No, scratch that. Chirper Benchmark. (Spelled capital C) Chirper, capital B, bench.\n\n",
      "passed": false,
      "score": 0,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 4.2266,
        "metrics": {
          "energy_wh_estimate": 0.094,
          "gpu_busy_percent_avg": 40.4,
          "gpu_busy_percent_peak": 84.0,
          "power_w_avg": 80.1,
          "power_w_peak": 260.0,
          "vram_mb_avg": 4955.6,
          "vram_mb_peak": 6866.5,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 10,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 8.385587010001473,
        "errors": [
          {
            "detail": "Omitted the word \"Visit\" from the transcript.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Missing the final period after the URL.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "URL was formatted correctly, but the command word and final punctuation were omitted."
      },
      "judge_status": "ok",
      "latency_seconds": 3.7929,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "https://docs.python.org/3/library/json.html\n\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.7949,
        "metrics": {
          "energy_wh_estimate": 0.0792,
          "gpu_busy_percent_avg": 42.8,
          "gpu_busy_percent_peak": 81.0,
          "power_w_avg": 75.11,
          "power_w_peak": 227.0,
          "vram_mb_avg": 5133.4,
          "vram_mb_peak": 6854.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 8.275361912001244,
        "errors": [
          {
            "detail": "Did not convert spoken email addresses to alex@chirper.local and ops@example.com.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changed alex to Alex inside the email address.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 45,
        "summary": "Email addresses were not formatted; spoken 'at' and 'dot' remained."
      },
      "judge_status": "ok",
      "latency_seconds": 3.9489,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "Send it to Alex at chirper dot local and cc ops at example dot com.\n\n",
      "passed": false,
      "score": 45,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.9505,
        "metrics": {
          "energy_wh_estimate": 0.0744,
          "gpu_busy_percent_avg": 44.2,
          "gpu_busy_percent_peak": 76.0,
          "power_w_avg": 67.78,
          "power_w_peak": 193.0,
          "vram_mb_avg": 4626.7,
          "vram_mb_peak": 6703.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 7.986498722999386,
        "errors": [
          {
            "detail": "Used 'Chirper Bench' instead of product-style 'ChirperBench'.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Used 'Dictation Formatter' instead of code-style class name 'DictationFormatter'.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "The output preserves the sentence and punctuation but misses required product and class identifier casing by leaving spaces in both names."
      },
      "judge_status": "ok",
      "latency_seconds": 3.4181,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench and the class name is Dictation Formatter.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.4198,
        "metrics": {
          "energy_wh_estimate": 0.0545,
          "gpu_busy_percent_avg": 34.4,
          "gpu_busy_percent_peak": 91.0,
          "power_w_avg": 57.38,
          "power_w_peak": 153.0,
          "vram_mb_avg": 4331.4,
          "vram_mb_peak": 6659.4,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 8.341484299999138,
        "errors": [
          {
            "detail": "Did not wrap code identifiers in inline code backticks.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Sentence should start with capitalized 'Set'.",
            "severity": "minor",
            "type": "wrong_casing"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Changed 'equal to' into '=' instead of preserving dictated wording.",
            "severity": "minor",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Preserves the basic identifiers but misses required casing, punctuation, inline code formatting, and changes the phrasing."
      },
      "judge_status": "ok",
      "latency_seconds": 3.4182,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "set variable user_id = request.user.id\n\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.4199,
        "metrics": {
          "energy_wh_estimate": 0.0532,
          "gpu_busy_percent_avg": 31.6,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 56.0,
          "power_w_peak": 109.0,
          "vram_mb_avg": 4283.9,
          "vram_mb_peak": 6611.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 10.191202780999447,
        "errors": [
          {
            "detail": "Changed supported version from Python 3.11+ to Python 3.11.0, removing the plus requirement.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Formatted June 3 as June 3rd instead of June 3.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 65,
        "summary": "Version and date formatting mostly worked, but the output changed Python 3.11+ to Python 3.11.0 and used an ordinal date not requested."
      },
      "judge_status": "ok",
      "latency_seconds": 3.5894,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3rd, 2026 and supports Python 3.11.0.\n\n",
      "passed": false,
      "score": 65,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.5906,
        "metrics": {
          "energy_wh_estimate": 0.0791,
          "gpu_busy_percent_avg": 40.4,
          "gpu_busy_percent_peak": 91.0,
          "power_w_avg": 79.33,
          "power_w_peak": 254.0,
          "vram_mb_avg": 4614.8,
          "vram_mb_peak": 6753.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 9,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 7.799640386998362,
        "errors": [
          {
            "detail": "Expected items after the colon to be split into bullet list entries.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Content is preserved, but the spoken list was not formatted as a markdown list."
      },
      "judge_status": "ok",
      "latency_seconds": 3.3547,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.356,
        "metrics": {
          "energy_wh_estimate": 0.0463,
          "gpu_busy_percent_avg": 36.6,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 49.62,
          "power_w_peak": 153.0,
          "vram_mb_avg": 4868.1,
          "vram_mb_peak": 6825.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 6.306699830998696,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected cleaned text."
      },
      "judge_status": "ok",
      "latency_seconds": 3.4088,
      "model": "gemma3:4b",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 3.4101,
        "metrics": {
          "energy_wh_estimate": 0.0586,
          "gpu_busy_percent_avg": 39.6,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 61.88,
          "power_w_peak": 237.0,
          "vram_mb_avg": 4459.4,
          "vram_mb_peak": 6791.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 8,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_pr_review_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 9.581592831000307,
        "errors": [
          {
            "detail": "Did not convert 'PR number 4' to 'PR #4'.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Please check the open PR #4 for code review comments and check each one that's still open if it needs to be fixed. Fix it, then resolve the comments and push your fix.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 90,
        "summary": "Content is preserved and cleaned, but PR identifier formatting should be #4 rather than 'number 4'."
      },
      "judge_status": "ok",
      "latency_seconds": 14.2071,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "Please check the open PR number 4 for code review comments. Check each one that's still open; if it needs to be fixed, fix it, then resolve the comments, and push your fix.\n\n",
      "passed": true,
      "score": 90,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 14.2089,
        "metrics": {
          "energy_wh_estimate": 0.4318,
          "gpu_busy_percent_avg": 59.8,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 109.4,
          "power_w_peak": 262.0,
          "vram_mb_avg": 11154.4,
          "vram_mb_peak": 12676.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 30,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_agent_instruction",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 7.237401596001291,
        "errors": [],
        "ideal_output": "Run cargo test, fix any failing tests, then commit and push the branch.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output exactly preserves the intended command-like content with correct casing and punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 5.9482,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "Run cargo test, fix any failing tests, then commit and push the branch.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.95,
        "metrics": {
          "energy_wh_estimate": 0.0847,
          "gpu_busy_percent_avg": 46.5,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 51.23,
          "power_w_peak": 152.0,
          "vram_mb_avg": 9362.2,
          "vram_mb_peak": 12696.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_question_no_answer",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 13.877217298999312,
        "errors": [
          {
            "detail": "Missing final question mark from the dictated phrase 'question mark'.",
            "severity": "major",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "What is the capital of France?",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 75,
        "summary": "Content and casing are correct, but the dictated question mark was omitted."
      },
      "judge_status": "ok",
      "latency_seconds": 5.8111,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "What is the capital of France?\n\n",
      "passed": false,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.8125,
        "metrics": {
          "energy_wh_estimate": 0.0934,
          "gpu_busy_percent_avg": 45.4,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 57.85,
          "power_w_peak": 231.0,
          "vram_mb_avg": 9401.7,
          "vram_mb_peak": 12700.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "literal_email_request",
      "category": "instruction_as_content",
      "judge": {
        "elapsed_seconds": 13.676307640002051,
        "errors": [
          {
            "detail": "Created email header/body formatting rather than returning the dictated request as final text.",
            "severity": "major",
            "type": "over_generation"
          },
          {
            "detail": "Omitted the leading instruction content: \"Write an email to Maya\".",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Rendered \"9:30 a.m.\" instead of \"9:30 AM\".",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "The formatter turned the dictated request into an email-style draft instead of preserving it as cleaned literal text."
      },
      "judge_status": "ok",
      "latency_seconds": 9.9407,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "**To:** Maya\n**Subject:** Quarterly Update\n\nThe meeting moved to Thursday at 9:30 a.m. The budget is $12,450, and the website is chirper.local/launch.\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 9.9422,
        "metrics": {
          "energy_wh_estimate": 0.3772,
          "gpu_busy_percent_avg": 61.4,
          "gpu_busy_percent_peak": 90.0,
          "power_w_avg": 136.57,
          "power_w_peak": 276.0,
          "vram_mb_avg": 10528.4,
          "vram_mb_peak": 12602.3,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 21,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "onboarding_mixed_format",
      "category": "mixed_formatting",
      "judge": {
        "elapsed_seconds": 14.649284347000503,
        "errors": [
          {
            "detail": "Kept the spoken instruction 'This is a bullet-point list with the title' instead of outputting only the title and list.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "List title/items formatting differs from required output, including title hyphenation and capitalized list items.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Rendered GNOME as Gnome and 9:30 AM as 9:30 a.m.",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Omitted the dictated closing 'Thanks!'.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Changed the email/meeting/budget sentence punctuation and structure from the intended cleaned text.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Hello Chirper. I need to write down accent-friendly words.\n\nAccent Friendly Words:\n- water\n- tomato\n- schedule\n- data\n- router\n- aluminium\n- privacy\n\nPlease write an email to Maya, subject: Quarterly Update. The meeting moved to Thursday at 9:30 AM, the budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, GNOME, Nextcloud, and Tailscale. Thanks!",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 62,
        "summary": "Mixed formatting was only partially applied; it leaked list instructions, missed required casing/markdown details, and omitted the final thanks."
      },
      "judge_status": "ok",
      "latency_seconds": 15.0308,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "Hello Chirper. I need to write down accent-friendly words. This is a bullet-point list with the title \"Accent-Friendly Words\":\n*   Water\n*   Tomato\n*   Schedule\n*   Data\n*   Router\n*   Aluminium\n*   Privacy\n\nPlease write an email to Maya with the subject \"Quarterly Update.\" The meeting moved to Thursday at 9:30 a.m. The budget is $12,450, and the website is chirper.local/launch.\n\nIn the deployment notes, mention that systemd keeps the Chirper services running, and we should also look at PostgreSQL, FFmpeg, Gnome, Nextcloud, and Tailscale.\n\n",
      "passed": false,
      "score": 62,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 15.0321,
        "metrics": {
          "energy_wh_estimate": 0.7419,
          "gpu_busy_percent_avg": 70.6,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 177.69,
          "power_w_peak": 272.0,
          "vram_mb_avg": 11482.4,
          "vram_mb_peak": 12619.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 32,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "markdown_checklist_instruction_content",
      "category": "markdown",
      "judge": {
        "elapsed_seconds": 8.995567556001333,
        "errors": [
          {
            "detail": "Used `## Release Tasks:` instead of plain `Release Tasks:`.",
            "severity": "minor",
            "type": "formatting_miss"
          },
          {
            "detail": "Used `*` checklist bullets instead of the expected `-` bullets.",
            "severity": "minor",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Release Tasks:\n- [ ] Run tests\n- [ ] Push tag\n- [ ] Publish release",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 85,
        "summary": "Content is preserved and checklist formatting is valid, but the title was changed into a level-2 heading and bullet markers differ from the expected format."
      },
      "judge_status": "ok",
      "latency_seconds": 10.4986,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "## Release Tasks:\n* [ ] Run tests\n* [ ] Push tag\n* [ ] Publish release\n\n",
      "passed": true,
      "score": 85,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.5001,
        "metrics": {
          "energy_wh_estimate": 0.4694,
          "gpu_busy_percent_avg": 68.3,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 160.95,
          "power_w_peak": 278.0,
          "vram_mb_avg": 11009.0,
          "vram_mb_peak": 12583.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 22,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "shell_command_as_content",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 9.712677266998071,
        "errors": [
          {
            "detail": "Left dictated tokens \"pipe\" and \"space\" in the output.",
            "severity": "major",
            "type": "instruction_leak"
          },
          {
            "detail": "Did not convert \"pipe\" to the shell pipe operator `|`.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changed the intended command from a pipeline into literal arguments.",
            "severity": "major",
            "type": "semantic_change"
          }
        ],
        "ideal_output": "cargo test | tee test-output.log",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 35,
        "summary": "Failed to preserve shell command structure by leaving spoken tokens instead of formatting the pipe and spacing."
      },
      "judge_status": "ok",
      "latency_seconds": 14.5813,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "cargo test pipe tee space test-output.log\n\n",
      "passed": false,
      "score": 35,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 14.5828,
        "metrics": {
          "energy_wh_estimate": 0.7456,
          "gpu_busy_percent_avg": 73.7,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 184.06,
          "power_w_peak": 274.0,
          "vram_mb_avg": 11437.9,
          "vram_mb_peak": 12562.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 31,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "spoken_correction_chain",
      "category": "spoken_edits",
      "judge": {
        "elapsed_seconds": 9.766410380998423,
        "errors": [
          {
            "detail": "Uses 'Chirper Benchmark' instead of the intended identifier 'ChirperBench'.",
            "severity": "major",
            "type": "semantic_change"
          },
          {
            "detail": "Omitted 'Call it' from the final sentence.",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Adds parenthetical explanation about capitalization.",
            "severity": "major",
            "type": "extra_text"
          },
          {
            "detail": "Fails to format the corrected name as the single CamelCase identifier 'ChirperBench'.",
            "severity": "major",
            "type": "formatting_miss"
          }
        ],
        "ideal_output": "Call it ChirperBench.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 15,
        "summary": "Output changes the intended identifier, omits words, and leaves explanatory spelling text instead of final cleaned dictation."
      },
      "judge_status": "ok",
      "latency_seconds": 10.0035,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "Chirper Benchmark (C and B are capitalized)\n\n",
      "passed": false,
      "score": 15,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.005,
        "metrics": {
          "energy_wh_estimate": 0.4255,
          "gpu_busy_percent_avg": 68.3,
          "gpu_busy_percent_peak": 91.0,
          "power_w_avg": 153.1,
          "power_w_peak": 267.0,
          "vram_mb_avg": 10947.8,
          "vram_mb_peak": 12583.8,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 21,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "url_https_path",
      "category": "urls",
      "judge": {
        "elapsed_seconds": 7.666152069999953,
        "errors": [
          {
            "detail": "Omitted the dictated word \"Visit\".",
            "severity": "major",
            "type": "missing_text"
          },
          {
            "detail": "Missing the final period.",
            "severity": "major",
            "type": "wrong_punctuation"
          },
          {
            "detail": "Did not capitalize the sentence start because the opening word was omitted.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Visit https://docs.python.org/3/library/json.html.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 72,
        "summary": "URL was formatted correctly, but the leading word and final punctuation were omitted."
      },
      "judge_status": "ok",
      "latency_seconds": 10.1486,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "https://docs.python.org/3/library/json.html\n\n",
      "passed": false,
      "score": 72,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 10.1504,
        "metrics": {
          "energy_wh_estimate": 0.4321,
          "gpu_busy_percent_avg": 64.6,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 153.27,
          "power_w_peak": 270.0,
          "vram_mb_avg": 10977.3,
          "vram_mb_peak": 12576.1,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 22,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "email_addresses",
      "category": "emails",
      "judge": {
        "elapsed_seconds": 8.708688291000726,
        "errors": [
          {
            "detail": "Did not format 'alex at chirper dot local' as alex@chirper.local.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Capitalized the local part as 'Alex' instead of preserving lowercase email text.",
            "severity": "minor",
            "type": "wrong_casing"
          }
        ],
        "ideal_output": "Send it to alex@chirper.local and cc ops@example.com.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 60,
        "summary": "One spoken email address was not converted correctly; it left 'at' as text and changed casing."
      },
      "judge_status": "ok",
      "latency_seconds": 5.4651,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "Send it to Alex at chirper.local and cc ops@example.com.\n\n",
      "passed": false,
      "score": 60,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.4664,
        "metrics": {
          "energy_wh_estimate": 0.0921,
          "gpu_busy_percent_avg": 50.1,
          "gpu_busy_percent_peak": 93.0,
          "power_w_avg": 60.67,
          "power_w_peak": 189.0,
          "vram_mb_avg": 9701.4,
          "vram_mb_peak": 12449.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "product_and_class_casing",
      "category": "casing_identifiers",
      "judge": {
        "elapsed_seconds": 8.45655455800079,
        "errors": [
          {
            "detail": "Rendered product identifier as \"Chirper Bench\" instead of \"ChirperBench\".",
            "severity": "major",
            "type": "wrong_casing"
          },
          {
            "detail": "Added an unnecessary comma after the project name.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "The project name is ChirperBench and the class name is DictationFormatter.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 75,
        "summary": "Product/code-style casing was partly missed: ChirperBench was split into two words, with an extra comma added."
      },
      "judge_status": "ok",
      "latency_seconds": 5.5084,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "The project name is Chirper Bench, and the class name is DictationFormatter.\n\n",
      "passed": false,
      "score": 75,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.51,
        "metrics": {
          "energy_wh_estimate": 0.1041,
          "gpu_busy_percent_avg": 52.1,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 68.0,
          "power_w_peak": 252.0,
          "vram_mb_avg": 9711.0,
          "vram_mb_peak": 12441.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "inline_code_identifiers",
      "category": "code_identifiers",
      "judge": {
        "elapsed_seconds": 10.813007475997438,
        "errors": [
          {
            "detail": "Code identifiers were not formatted as inline code.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Changed 'equal to' into '=' rather than preserving the dictated wording.",
            "severity": "minor",
            "type": "semantic_change"
          },
          {
            "detail": "Missing final period.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Set variable `user_id` equal to `request.user.id`.",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 55,
        "summary": "Missed required inline code formatting and final punctuation, and changed the phrasing to an equals sign."
      },
      "judge_status": "ok",
      "latency_seconds": 5.4056,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "Set variable user\\_id = request.user.id\n\n",
      "passed": false,
      "score": 55,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.4076,
        "metrics": {
          "energy_wh_estimate": 0.11,
          "gpu_busy_percent_avg": 52.7,
          "gpu_busy_percent_peak": 95.0,
          "power_w_avg": 73.25,
          "power_w_peak": 243.0,
          "vram_mb_avg": 9715.7,
          "vram_mb_peak": 12425.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "numbers_versions_dates",
      "category": "numbers_versions",
      "judge": {
        "elapsed_seconds": 7.775223067998013,
        "errors": [
          {
            "detail": "Adds a comma after 2026 where expected output has none.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Version 2.1.0 shipped on June 3, 2026 and supports Python 3.11+.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 95,
        "summary": "Content, casing, versions, date, and number formatting are correct; only an extra comma differs from expected punctuation."
      },
      "judge_status": "ok",
      "latency_seconds": 5.5764,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "Version 2.1.0 shipped on June 3, 2026, and supports Python 3.11+.\n\n",
      "passed": true,
      "score": 95,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.5777,
        "metrics": {
          "energy_wh_estimate": 0.1088,
          "gpu_busy_percent_avg": 44.2,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 70.23,
          "power_w_peak": 247.0,
          "vram_mb_avg": 9910.9,
          "vram_mb_peak": 12449.7,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 13,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "simple_list",
      "category": "lists",
      "judge": {
        "elapsed_seconds": 7.888906281001255,
        "errors": [
          {
            "detail": "Expected each shopping item as a separate markdown bullet.",
            "severity": "major",
            "type": "formatting_miss"
          },
          {
            "detail": "Kept comma-separated sentence form instead of colon followed by list items.",
            "severity": "minor",
            "type": "wrong_punctuation"
          }
        ],
        "ideal_output": "Shopping list:\n- apples\n- bananas\n- oat milk",
        "judge_status": "ok",
        "passed": false,
        "returncode": 0,
        "score": 70,
        "summary": "Content is preserved, but the spoken list was not formatted as a markdown list."
      },
      "judge_status": "ok",
      "latency_seconds": 9.1641,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "Shopping list: apples, bananas, oat milk.\n\n",
      "passed": false,
      "score": 70,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 9.1654,
        "metrics": {
          "energy_wh_estimate": 0.3424,
          "gpu_busy_percent_avg": 62.7,
          "gpu_busy_percent_peak": 89.0,
          "power_w_avg": 134.5,
          "power_w_peak": 273.0,
          "vram_mb_avg": 10361.2,
          "vram_mb_peak": 12472.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 20,
        "status": "ok"
      },
      "timed_out": false
    },
    {
      "case_id": "no_change_needed",
      "category": "no_change_needed",
      "judge": {
        "elapsed_seconds": 5.8942368809985055,
        "errors": [],
        "ideal_output": "The quick brown fox jumps over the lazy dog.",
        "judge_status": "ok",
        "passed": true,
        "returncode": 0,
        "score": 100,
        "summary": "Output matches the expected clean text exactly."
      },
      "judge_status": "ok",
      "latency_seconds": 5.3732,
      "model": "gemma4:latest",
      "ollama_status": "ok",
      "output": "The quick brown fox jumps over the lazy dog.\n\n",
      "passed": true,
      "score": 100,
      "telemetry": {
        "device": {
          "card": "card1",
          "device_path": "/sys/class/drm/card1/device",
          "driver": "amdgpu",
          "pci_address": "0000:0c:00.0",
          "provider": "amd-sysfs",
          "reason": "",
          "status": "ok",
          "vendor": "0x1002",
          "vram_total_mb": 24560.0
        },
        "elapsed_seconds": 5.3746,
        "metrics": {
          "energy_wh_estimate": 0.1101,
          "gpu_busy_percent_avg": 51.3,
          "gpu_busy_percent_peak": 88.0,
          "power_w_avg": 73.75,
          "power_w_peak": 248.0,
          "vram_mb_avg": 9708.9,
          "vram_mb_peak": 12456.2,
          "vram_total_mb": 24560.0
        },
        "provider": "amd-sysfs",
        "reason": "",
        "sample_count": 12,
        "status": "ok"
      },
      "timed_out": false
    }
  ],
  "run_id": "20260603-225211",
  "summary": {
    "case_count": 15,
    "cases": [
      {
        "average_score": 65.48,
        "case_id": "literal_pr_review_instruction",
        "category": "instruction_as_content",
        "model_count": 29,
        "pass_rate": 0.6552
      },
      {
        "average_score": 78.0,
        "case_id": "literal_agent_instruction",
        "category": "instruction_as_content",
        "model_count": 29,
        "pass_rate": 0.7241
      },
      {
        "average_score": 74.31,
        "case_id": "literal_question_no_answer",
        "category": "instruction_as_content",
        "model_count": 29,
        "pass_rate": 0.5517
      },
      {
        "average_score": 45.31,
        "case_id": "literal_email_request",
        "category": "instruction_as_content",
        "model_count": 29,
        "pass_rate": 0.2069
      },
      {
        "average_score": 40.0,
        "case_id": "onboarding_mixed_format",
        "category": "mixed_formatting",
        "model_count": 29,
        "pass_rate": 0.069
      },
      {
        "average_score": 61.24,
        "case_id": "markdown_checklist_instruction_content",
        "category": "markdown",
        "model_count": 29,
        "pass_rate": 0.3448
      },
      {
        "average_score": 40.1,
        "case_id": "shell_command_as_content",
        "category": "code_identifiers",
        "model_count": 29,
        "pass_rate": 0.2069
      },
      {
        "average_score": 25.69,
        "case_id": "spoken_correction_chain",
        "category": "spoken_edits",
        "model_count": 29,
        "pass_rate": 0.0
      },
      {
        "average_score": 79.55,
        "case_id": "url_https_path",
        "category": "urls",
        "model_count": 29,
        "pass_rate": 0.6552
      },
      {
        "average_score": 61.86,
        "case_id": "email_addresses",
        "category": "emails",
        "model_count": 29,
        "pass_rate": 0.4828
      },
      {
        "average_score": 57.0,
        "case_id": "product_and_class_casing",
        "category": "casing_identifiers",
        "model_count": 29,
        "pass_rate": 0.1034
      },
      {
        "average_score": 49.76,
        "case_id": "inline_code_identifiers",
        "category": "code_identifiers",
        "model_count": 29,
        "pass_rate": 0.1034
      },
      {
        "average_score": 78.1,
        "case_id": "numbers_versions_dates",
        "category": "numbers_versions",
        "model_count": 29,
        "pass_rate": 0.6552
      },
      {
        "average_score": 61.62,
        "case_id": "simple_list",
        "category": "lists",
        "model_count": 29,
        "pass_rate": 0.069
      },
      {
        "average_score": 96.21,
        "case_id": "no_change_needed",
        "category": "no_change_needed",
        "model_count": 29,
        "pass_rate": 0.9655
      }
    ],
    "error_counts": {
      "answered_content": 4,
      "extra_text": 55,
      "formatting_miss": 234,
      "instruction_leak": 63,
      "missing_text": 65,
      "ollama_run_failed": 33,
      "over_generation": 16,
      "partial_mixed_task": 5,
      "refusal_or_meta": 10,
      "semantic_change": 104,
      "wrong_casing": 134,
      "wrong_punctuation": 152
    },
    "leaderboard": [
      {
        "average_score": 83.67,
        "case_count": 15,
        "error_count": 20,
        "judge_error_count": 20,
        "median_gpu_busy_percent_avg": 71.2,
        "median_latency_seconds": 14.6337,
        "median_power_w_avg": 156.86,
        "median_vram_mb_peak": 20745.3,
        "model": "gemma4:26b",
        "ollama_error_count": 0,
        "pass_count": 10,
        "pass_rate": 0.6667,
        "peak_power_w": 326.0,
        "peak_vram_mb": 20830.8,
        "rank": 1,
        "telemetry_sample_count": 733
      },
      {
        "average_score": 82.07,
        "case_count": 15,
        "error_count": 14,
        "judge_error_count": 13,
        "median_gpu_busy_percent_avg": 37.5,
        "median_latency_seconds": 108.5076,
        "median_power_w_avg": 56.24,
        "median_vram_mb_peak": 23972.7,
        "model": "qwen3.6:35b",
        "ollama_error_count": 1,
        "pass_count": 9,
        "pass_rate": 0.6,
        "peak_power_w": 159.0,
        "peak_vram_mb": 23983.4,
        "rank": 2,
        "telemetry_sample_count": 3069
      },
      {
        "average_score": 80.73,
        "case_count": 15,
        "error_count": 26,
        "judge_error_count": 26,
        "median_gpu_busy_percent_avg": 16.5,
        "median_latency_seconds": 8.0058,
        "median_power_w_avg": 67.94,
        "median_vram_mb_peak": 18890.2,
        "model": "granite4.1:30b",
        "ollama_error_count": 0,
        "pass_count": 9,
        "pass_rate": 0.6,
        "peak_power_w": 281.0,
        "peak_vram_mb": 18974.5,
        "rank": 3,
        "telemetry_sample_count": 378
      },
      {
        "average_score": 77.27,
        "case_count": 15,
        "error_count": 21,
        "judge_error_count": 20,
        "median_gpu_busy_percent_avg": 86.4,
        "median_latency_seconds": 58.6964,
        "median_power_w_avg": 270.14,
        "median_vram_mb_peak": 10743.3,
        "model": "qwen3.5:9b",
        "ollama_error_count": 1,
        "pass_count": 9,
        "pass_rate": 0.6,
        "peak_power_w": 317.0,
        "peak_vram_mb": 10843.7,
        "rank": 4,
        "telemetry_sample_count": 2139
      },
      {
        "average_score": 77.2,
        "case_count": 15,
        "error_count": 27,
        "judge_error_count": 27,
        "median_gpu_busy_percent_avg": 66.7,
        "median_latency_seconds": 6.4238,
        "median_power_w_avg": 145.69,
        "median_vram_mb_peak": 15026.5,
        "model": "gpt-oss:20b",
        "ollama_error_count": 0,
        "pass_count": 6,
        "pass_rate": 0.4,
        "peak_power_w": 320.0,
        "peak_vram_mb": 15110.8,
        "rank": 5,
        "telemetry_sample_count": 238
      },
      {
        "average_score": 75.53,
        "case_count": 15,
        "error_count": 28,
        "judge_error_count": 28,
        "median_gpu_busy_percent_avg": 25.2,
        "median_latency_seconds": 2.2594,
        "median_power_w_avg": 70.83,
        "median_vram_mb_peak": 12093.2,
        "model": "granite4.1:8b",
        "ollama_error_count": 0,
        "pass_count": 7,
        "pass_rate": 0.4667,
        "peak_power_w": 308.0,
        "peak_vram_mb": 12096.2,
        "rank": 6,
        "telemetry_sample_count": 99
      },
      {
        "average_score": 72.27,
        "case_count": 15,
        "error_count": 26,
        "judge_error_count": 25,
        "median_gpu_busy_percent_avg": 83.6,
        "median_latency_seconds": 20.3197,
        "median_power_w_avg": 204.7,
        "median_vram_mb_peak": 23184.5,
        "model": "glm-4.7-flash:latest",
        "ollama_error_count": 1,
        "pass_count": 8,
        "pass_rate": 0.5333,
        "peak_power_w": 292.0,
        "peak_vram_mb": 23270.0,
        "rank": 7,
        "telemetry_sample_count": 1006
      },
      {
        "average_score": 72.13,
        "case_count": 15,
        "error_count": 28,
        "judge_error_count": 28,
        "median_gpu_busy_percent_avg": 37.3,
        "median_latency_seconds": 36.6626,
        "median_power_w_avg": 60.65,
        "median_vram_mb_peak": 23884.5,
        "model": "nemotron3:33b",
        "ollama_error_count": 0,
        "pass_count": 6,
        "pass_rate": 0.4,
        "peak_power_w": 235.0,
        "peak_vram_mb": 23946.4,
        "rank": 8,
        "telemetry_sample_count": 1397
      },
      {
        "average_score": 72.07,
        "case_count": 15,
        "error_count": 32,
        "judge_error_count": 32,
        "median_gpu_busy_percent_avg": 59.5,
        "median_latency_seconds": 8.5258,
        "median_power_w_avg": 141.5,
        "median_vram_mb_peak": 11932.0,
        "model": "gemma4:e4b",
        "ollama_error_count": 0,
        "pass_count": 7,
        "pass_rate": 0.4667,
        "peak_power_w": 295.0,
        "peak_vram_mb": 12017.5,
        "rank": 9,
        "telemetry_sample_count": 294
      },
      {
        "average_score": 71.4,
        "case_count": 15,
        "error_count": 21,
        "judge_error_count": 19,
        "median_gpu_busy_percent_avg": 94.4,
        "median_latency_seconds": 57.0776,
        "median_power_w_avg": 272.93,
        "median_vram_mb_peak": 23545.3,
        "model": "olmo-3.1:32b",
        "ollama_error_count": 2,
        "pass_count": 8,
        "pass_rate": 0.5333,
        "peak_power_w": 315.0,
        "peak_vram_mb": 23917.7,
        "rank": 10,
        "telemetry_sample_count": 2276
      },
      {
        "average_score": 71.33,
        "case_count": 15,
        "error_count": 28,
        "judge_error_count": 28,
        "median_gpu_busy_percent_avg": 86.1,
        "median_latency_seconds": 68.1213,
        "median_power_w_avg": 276.03,
        "median_vram_mb_peak": 8180.5,
        "model": "qwen3.5:4b",
        "ollama_error_count": 0,
        "pass_count": 7,
        "pass_rate": 0.4667,
        "peak_power_w": 311.0,
        "peak_vram_mb": 8266.0,
        "rank": 11,
        "telemetry_sample_count": 2189
      },
      {
        "average_score": 69.6,
        "case_count": 15,
        "error_count": 32,
        "judge_error_count": 32,
        "median_gpu_busy_percent_avg": 59.8,
        "median_latency_seconds": 9.1641,
        "median_power_w_avg": 109.4,
        "median_vram_mb_peak": 12576.1,
        "model": "gemma4:latest",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 278.0,
        "peak_vram_mb": 12700.1,
        "rank": 12,
        "telemetry_sample_count": 286
      },
      {
        "average_score": 67.4,
        "case_count": 15,
        "error_count": 32,
        "judge_error_count": 32,
        "median_gpu_busy_percent_avg": 56.8,
        "median_latency_seconds": 6.6283,
        "median_power_w_avg": 117.53,
        "median_vram_mb_peak": 9134.0,
        "model": "gemma4:e2b",
        "ollama_error_count": 0,
        "pass_count": 7,
        "pass_rate": 0.4667,
        "peak_power_w": 312.0,
        "peak_vram_mb": 9219.0,
        "rank": 13,
        "telemetry_sample_count": 232
      },
      {
        "average_score": 65.2,
        "case_count": 15,
        "error_count": 30,
        "judge_error_count": 30,
        "median_gpu_busy_percent_avg": 68.8,
        "median_latency_seconds": 3.4761,
        "median_power_w_avg": 173.88,
        "median_vram_mb_peak": 7925.2,
        "model": "nemotron-3-nano:4b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 296.0,
        "peak_vram_mb": 8070.5,
        "rank": 14,
        "telemetry_sample_count": 141
      },
      {
        "average_score": 65.13,
        "case_count": 15,
        "error_count": 30,
        "judge_error_count": 30,
        "median_gpu_busy_percent_avg": 57.0,
        "median_latency_seconds": 10.5886,
        "median_power_w_avg": 77.35,
        "median_vram_mb_peak": 22891.1,
        "model": "devstral-small-2:24b",
        "ollama_error_count": 0,
        "pass_count": 6,
        "pass_rate": 0.4,
        "peak_power_w": 316.0,
        "peak_vram_mb": 23007.4,
        "rank": 15,
        "telemetry_sample_count": 345
      },
      {
        "average_score": 64.13,
        "case_count": 15,
        "error_count": 14,
        "judge_error_count": 10,
        "median_gpu_busy_percent_avg": 47.7,
        "median_latency_seconds": 121.96,
        "median_power_w_avg": 142.69,
        "median_vram_mb_peak": 24043.6,
        "model": "qwen3.6:27b",
        "ollama_error_count": 4,
        "pass_count": 8,
        "pass_rate": 0.5333,
        "peak_power_w": 355.0,
        "peak_vram_mb": 24049.6,
        "rank": 16,
        "telemetry_sample_count": 3769
      },
      {
        "average_score": 62.33,
        "case_count": 15,
        "error_count": 37,
        "judge_error_count": 37,
        "median_gpu_busy_percent_avg": 40.9,
        "median_latency_seconds": 3.6961,
        "median_power_w_avg": 69.0,
        "median_vram_mb_peak": 6703.3,
        "model": "gemma3:4b",
        "ollama_error_count": 0,
        "pass_count": 4,
        "pass_rate": 0.2667,
        "peak_power_w": 289.0,
        "peak_vram_mb": 6866.5,
        "rank": 17,
        "telemetry_sample_count": 132
      },
      {
        "average_score": 62.2,
        "case_count": 15,
        "error_count": 36,
        "judge_error_count": 36,
        "median_gpu_busy_percent_avg": 62.4,
        "median_latency_seconds": 4.2146,
        "median_power_w_avg": 123.0,
        "median_vram_mb_peak": 17043.7,
        "model": "ministral-3:14b",
        "ollama_error_count": 0,
        "pass_count": 7,
        "pass_rate": 0.4667,
        "peak_power_w": 313.0,
        "peak_vram_mb": 17114.5,
        "rank": 18,
        "telemetry_sample_count": 160
      },
      {
        "average_score": 58.6,
        "case_count": 15,
        "error_count": 39,
        "judge_error_count": 39,
        "median_gpu_busy_percent_avg": 54.9,
        "median_latency_seconds": 8.0904,
        "median_power_w_avg": 45.78,
        "median_vram_mb_peak": 16453.1,
        "model": "lfm2:24b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 228.0,
        "peak_vram_mb": 16564.4,
        "rank": 19,
        "telemetry_sample_count": 273
      },
      {
        "average_score": 58.07,
        "case_count": 15,
        "error_count": 37,
        "judge_error_count": 37,
        "median_gpu_busy_percent_avg": 31.0,
        "median_latency_seconds": 1.3766,
        "median_power_w_avg": 73.75,
        "median_vram_mb_peak": 6700.7,
        "model": "granite4.1:3b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 256.0,
        "peak_vram_mb": 7176.7,
        "rank": 20,
        "telemetry_sample_count": 65
      },
      {
        "average_score": 56.07,
        "case_count": 15,
        "error_count": 37,
        "judge_error_count": 37,
        "median_gpu_busy_percent_avg": 59.8,
        "median_latency_seconds": 3.2377,
        "median_power_w_avg": 123.0,
        "median_vram_mb_peak": 12723.6,
        "model": "ministral-3:8b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 339.0,
        "peak_vram_mb": 12766.1,
        "rank": 21,
        "telemetry_sample_count": 128
      },
      {
        "average_score": 55.73,
        "case_count": 15,
        "error_count": 36,
        "judge_error_count": 36,
        "median_gpu_busy_percent_avg": 51.8,
        "median_latency_seconds": 2.4815,
        "median_power_w_avg": 121.0,
        "median_vram_mb_peak": 11529.1,
        "model": "rnj-1:8b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 351.0,
        "peak_vram_mb": 11609.0,
        "rank": 22,
        "telemetry_sample_count": 100
      },
      {
        "average_score": 54.0,
        "case_count": 15,
        "error_count": 38,
        "judge_error_count": 38,
        "median_gpu_busy_percent_avg": 26.2,
        "median_latency_seconds": 1.8999,
        "median_power_w_avg": 73.2,
        "median_vram_mb_peak": 8191.5,
        "model": "olmo2:7b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 305.0,
        "peak_vram_mb": 8193.0,
        "rank": 23,
        "telemetry_sample_count": 86
      },
      {
        "average_score": 53.07,
        "case_count": 15,
        "error_count": 28,
        "judge_error_count": 25,
        "median_gpu_busy_percent_avg": 83.7,
        "median_latency_seconds": 58.0375,
        "median_power_w_avg": 236.17,
        "median_vram_mb_peak": 6013.3,
        "model": "qwen3.5:2b",
        "ollama_error_count": 3,
        "pass_count": 4,
        "pass_rate": 0.2667,
        "peak_power_w": 293.0,
        "peak_vram_mb": 6098.8,
        "rank": 24,
        "telemetry_sample_count": 2403
      },
      {
        "average_score": 46.47,
        "case_count": 15,
        "error_count": 44,
        "judge_error_count": 44,
        "median_gpu_busy_percent_avg": 44.0,
        "median_latency_seconds": 2.4151,
        "median_power_w_avg": 94.0,
        "median_vram_mb_peak": 8957.0,
        "model": "ministral-3:3b",
        "ollama_error_count": 0,
        "pass_count": 3,
        "pass_rate": 0.2,
        "peak_power_w": 319.0,
        "peak_vram_mb": 9121.9,
        "rank": 25,
        "telemetry_sample_count": 96
      },
      {
        "average_score": 40.33,
        "case_count": 15,
        "error_count": 42,
        "judge_error_count": 42,
        "median_gpu_busy_percent_avg": 49.0,
        "median_latency_seconds": 10.8602,
        "median_power_w_avg": 80.58,
        "median_vram_mb_peak": 24258.1,
        "model": "laguna-xs.2:latest",
        "ollama_error_count": 0,
        "pass_count": 4,
        "pass_rate": 0.2667,
        "peak_power_w": 202.0,
        "peak_vram_mb": 24329.4,
        "rank": 26,
        "telemetry_sample_count": 816
      },
      {
        "average_score": 24.87,
        "case_count": 15,
        "error_count": 49,
        "judge_error_count": 49,
        "median_gpu_busy_percent_avg": 72.8,
        "median_latency_seconds": 5.9512,
        "median_power_w_avg": 211.38,
        "median_vram_mb_peak": 3107.6,
        "model": "lfm2.5-thinking:1.2b",
        "ollama_error_count": 0,
        "pass_count": 1,
        "pass_rate": 0.0667,
        "peak_power_w": 265.0,
        "peak_vram_mb": 3179.8,
        "rank": 27,
        "telemetry_sample_count": 207
      },
      {
        "average_score": 20.0,
        "case_count": 15,
        "error_count": 12,
        "judge_error_count": 0,
        "median_gpu_busy_percent_avg": 53.9,
        "median_latency_seconds": 180.0034,
        "median_power_w_avg": 149.06,
        "median_vram_mb_peak": 24249.5,
        "model": "qwen3.5:27b",
        "ollama_error_count": 12,
        "pass_count": 3,
        "pass_rate": 0.2,
        "peak_power_w": 344.0,
        "peak_vram_mb": 24293.5,
        "rank": 28,
        "telemetry_sample_count": 4664
      },
      {
        "average_score": 8.67,
        "case_count": 15,
        "error_count": 31,
        "judge_error_count": 22,
        "median_gpu_busy_percent_avg": 83.7,
        "median_latency_seconds": 180.0028,
        "median_power_w_avg": 204.14,
        "median_vram_mb_peak": 4432.8,
        "model": "qwen3.5:0.8b",
        "ollama_error_count": 9,
        "pass_count": 0,
        "pass_rate": 0.0,
        "peak_power_w": 247.0,
        "peak_vram_mb": 4534.5,
        "rank": 29,
        "telemetry_sample_count": 3697
      }
    ],
    "matrix": [
      {
        "case_id": "literal_pr_review_instruction",
        "category": "instruction_as_content",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 82
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 88
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 88
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 88
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 94
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 88
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 96
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 25
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 78
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 82
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 78
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 88
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 88
          },
          "qwen3.5:0.8b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 88
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "qwen3.6:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          }
        }
      },
      {
        "case_id": "literal_agent_instruction",
        "category": "instruction_as_content",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 94
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 20
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 20
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 40
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 25
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.5:0.8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.6:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          }
        }
      },
      {
        "case_id": "literal_question_no_answer",
        "category": "instruction_as_content",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 85
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 75
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 85
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 85
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "qwen3.5:0.8b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 85
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.6:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 85
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          }
        }
      },
      {
        "case_id": "literal_email_request",
        "category": "instruction_as_content",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 96
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 78
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 82
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 58
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 20
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 58
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 18
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 20
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 20
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 20
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 72
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 20
          },
          "olmo-3.1:32b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "qwen3.5:0.8b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 84
          },
          "qwen3.6:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 96
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          }
        }
      },
      {
        "case_id": "onboarding_mixed_format",
        "category": "mixed_formatting",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 62
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 88
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 25
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 62
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 68
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 68
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 58
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 52
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 25
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 42
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 72
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 28
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 82
          },
          "olmo-3.1:32b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 42
          },
          "qwen3.5:0.8b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 28
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 63
          },
          "qwen3.6:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.6:35b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 25
          }
        }
      },
      {
        "case_id": "markdown_checklist_instruction_content",
        "category": "markdown",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 40
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 75
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 82
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 78
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 72
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 15
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 60
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 82
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 60
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "qwen3.5:0.8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "qwen3.6:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          }
        }
      },
      {
        "case_id": "shell_command_as_content",
        "category": "code_identifiers",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 60
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 15
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 25
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 15
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "glm-4.7-flash:latest": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 78
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 15
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 10
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 15
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 20
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "qwen3.5:0.8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 25
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 30
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 20
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 80
          },
          "qwen3.6:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 75
          }
        }
      },
      {
        "case_id": "spoken_correction_chain",
        "category": "spoken_edits",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 60
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 15
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 25
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 10
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 15
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "qwen3.5:0.8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 15
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "qwen3.6:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 75
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          }
        }
      },
      {
        "case_id": "url_https_path",
        "category": "urls",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 75
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 72
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 75
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "qwen3.5:0.8b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 85
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.6:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          }
        }
      },
      {
        "case_id": "email_addresses",
        "category": "emails",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 60
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 20
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 75
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "qwen3.5:0.8b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 82
          },
          "qwen3.5:9b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.6:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          }
        }
      },
      {
        "case_id": "product_and_class_casing",
        "category": "casing_identifiers",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 85
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 75
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 85
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 80
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 75
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 60
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 88
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 15
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 20
          },
          "qwen3.5:0.8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 60
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 75
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "qwen3.6:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 80
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 60
          }
        }
      },
      {
        "case_id": "inline_code_identifiers",
        "category": "code_identifiers",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 60
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 82
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 15
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 75
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "qwen3.5:0.8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 86
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "qwen3.6:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 80
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 45
          }
        }
      },
      {
        "case_id": "numbers_versions_dates",
        "category": "numbers_versions",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 96
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 96
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 78
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 96
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 96
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 92
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 0
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 88
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 98
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 72
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 78
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 78
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "qwen3.5:0.8b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 78
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.6:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 95
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 88
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 96
          }
        }
      },
      {
        "case_id": "simple_list",
        "category": "lists",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 65
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 35
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 72
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 85
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 55
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 60
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "qwen3.5:0.8b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:27b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "qwen3.6:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 70
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": false,
            "score": 75
          }
        }
      },
      {
        "case_id": "no_change_needed",
        "category": "no_change_needed",
        "cells": {
          "devstral-small-2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma3:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:26b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:e2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:e4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gemma4:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "glm-4.7-flash:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "gpt-oss:20b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "granite4.1:30b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "granite4.1:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "granite4.1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "laguna-xs.2:latest": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "lfm2.5-thinking:1.2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "lfm2:24b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "ministral-3:14b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "ministral-3:3b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "ministral-3:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "nemotron-3-nano:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "nemotron3:33b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "olmo-3.1:32b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "olmo2:7b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 90
          },
          "qwen3.5:0.8b": {
            "judge_status": "not_run",
            "ollama_status": "timeout",
            "passed": false,
            "score": 0
          },
          "qwen3.5:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.5:2b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.5:4b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.5:9b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.6:27b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "qwen3.6:35b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          },
          "rnj-1:8b": {
            "judge_status": "ok",
            "ollama_status": "ok",
            "passed": true,
            "score": 100
          }
        }
      }
    ],
    "model_count": 29,
    "models": [
      {
        "average_score": 83.67,
        "case_count": 15,
        "error_count": 20,
        "judge_error_count": 20,
        "median_gpu_busy_percent_avg": 71.2,
        "median_latency_seconds": 14.6337,
        "median_power_w_avg": 156.86,
        "median_vram_mb_peak": 20745.3,
        "model": "gemma4:26b",
        "ollama_error_count": 0,
        "pass_count": 10,
        "pass_rate": 0.6667,
        "peak_power_w": 326.0,
        "peak_vram_mb": 20830.8,
        "rank": 1,
        "telemetry_sample_count": 733
      },
      {
        "average_score": 82.07,
        "case_count": 15,
        "error_count": 14,
        "judge_error_count": 13,
        "median_gpu_busy_percent_avg": 37.5,
        "median_latency_seconds": 108.5076,
        "median_power_w_avg": 56.24,
        "median_vram_mb_peak": 23972.7,
        "model": "qwen3.6:35b",
        "ollama_error_count": 1,
        "pass_count": 9,
        "pass_rate": 0.6,
        "peak_power_w": 159.0,
        "peak_vram_mb": 23983.4,
        "rank": 2,
        "telemetry_sample_count": 3069
      },
      {
        "average_score": 80.73,
        "case_count": 15,
        "error_count": 26,
        "judge_error_count": 26,
        "median_gpu_busy_percent_avg": 16.5,
        "median_latency_seconds": 8.0058,
        "median_power_w_avg": 67.94,
        "median_vram_mb_peak": 18890.2,
        "model": "granite4.1:30b",
        "ollama_error_count": 0,
        "pass_count": 9,
        "pass_rate": 0.6,
        "peak_power_w": 281.0,
        "peak_vram_mb": 18974.5,
        "rank": 3,
        "telemetry_sample_count": 378
      },
      {
        "average_score": 77.27,
        "case_count": 15,
        "error_count": 21,
        "judge_error_count": 20,
        "median_gpu_busy_percent_avg": 86.4,
        "median_latency_seconds": 58.6964,
        "median_power_w_avg": 270.14,
        "median_vram_mb_peak": 10743.3,
        "model": "qwen3.5:9b",
        "ollama_error_count": 1,
        "pass_count": 9,
        "pass_rate": 0.6,
        "peak_power_w": 317.0,
        "peak_vram_mb": 10843.7,
        "rank": 4,
        "telemetry_sample_count": 2139
      },
      {
        "average_score": 77.2,
        "case_count": 15,
        "error_count": 27,
        "judge_error_count": 27,
        "median_gpu_busy_percent_avg": 66.7,
        "median_latency_seconds": 6.4238,
        "median_power_w_avg": 145.69,
        "median_vram_mb_peak": 15026.5,
        "model": "gpt-oss:20b",
        "ollama_error_count": 0,
        "pass_count": 6,
        "pass_rate": 0.4,
        "peak_power_w": 320.0,
        "peak_vram_mb": 15110.8,
        "rank": 5,
        "telemetry_sample_count": 238
      },
      {
        "average_score": 75.53,
        "case_count": 15,
        "error_count": 28,
        "judge_error_count": 28,
        "median_gpu_busy_percent_avg": 25.2,
        "median_latency_seconds": 2.2594,
        "median_power_w_avg": 70.83,
        "median_vram_mb_peak": 12093.2,
        "model": "granite4.1:8b",
        "ollama_error_count": 0,
        "pass_count": 7,
        "pass_rate": 0.4667,
        "peak_power_w": 308.0,
        "peak_vram_mb": 12096.2,
        "rank": 6,
        "telemetry_sample_count": 99
      },
      {
        "average_score": 72.27,
        "case_count": 15,
        "error_count": 26,
        "judge_error_count": 25,
        "median_gpu_busy_percent_avg": 83.6,
        "median_latency_seconds": 20.3197,
        "median_power_w_avg": 204.7,
        "median_vram_mb_peak": 23184.5,
        "model": "glm-4.7-flash:latest",
        "ollama_error_count": 1,
        "pass_count": 8,
        "pass_rate": 0.5333,
        "peak_power_w": 292.0,
        "peak_vram_mb": 23270.0,
        "rank": 7,
        "telemetry_sample_count": 1006
      },
      {
        "average_score": 72.13,
        "case_count": 15,
        "error_count": 28,
        "judge_error_count": 28,
        "median_gpu_busy_percent_avg": 37.3,
        "median_latency_seconds": 36.6626,
        "median_power_w_avg": 60.65,
        "median_vram_mb_peak": 23884.5,
        "model": "nemotron3:33b",
        "ollama_error_count": 0,
        "pass_count": 6,
        "pass_rate": 0.4,
        "peak_power_w": 235.0,
        "peak_vram_mb": 23946.4,
        "rank": 8,
        "telemetry_sample_count": 1397
      },
      {
        "average_score": 72.07,
        "case_count": 15,
        "error_count": 32,
        "judge_error_count": 32,
        "median_gpu_busy_percent_avg": 59.5,
        "median_latency_seconds": 8.5258,
        "median_power_w_avg": 141.5,
        "median_vram_mb_peak": 11932.0,
        "model": "gemma4:e4b",
        "ollama_error_count": 0,
        "pass_count": 7,
        "pass_rate": 0.4667,
        "peak_power_w": 295.0,
        "peak_vram_mb": 12017.5,
        "rank": 9,
        "telemetry_sample_count": 294
      },
      {
        "average_score": 71.4,
        "case_count": 15,
        "error_count": 21,
        "judge_error_count": 19,
        "median_gpu_busy_percent_avg": 94.4,
        "median_latency_seconds": 57.0776,
        "median_power_w_avg": 272.93,
        "median_vram_mb_peak": 23545.3,
        "model": "olmo-3.1:32b",
        "ollama_error_count": 2,
        "pass_count": 8,
        "pass_rate": 0.5333,
        "peak_power_w": 315.0,
        "peak_vram_mb": 23917.7,
        "rank": 10,
        "telemetry_sample_count": 2276
      },
      {
        "average_score": 71.33,
        "case_count": 15,
        "error_count": 28,
        "judge_error_count": 28,
        "median_gpu_busy_percent_avg": 86.1,
        "median_latency_seconds": 68.1213,
        "median_power_w_avg": 276.03,
        "median_vram_mb_peak": 8180.5,
        "model": "qwen3.5:4b",
        "ollama_error_count": 0,
        "pass_count": 7,
        "pass_rate": 0.4667,
        "peak_power_w": 311.0,
        "peak_vram_mb": 8266.0,
        "rank": 11,
        "telemetry_sample_count": 2189
      },
      {
        "average_score": 69.6,
        "case_count": 15,
        "error_count": 32,
        "judge_error_count": 32,
        "median_gpu_busy_percent_avg": 59.8,
        "median_latency_seconds": 9.1641,
        "median_power_w_avg": 109.4,
        "median_vram_mb_peak": 12576.1,
        "model": "gemma4:latest",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 278.0,
        "peak_vram_mb": 12700.1,
        "rank": 12,
        "telemetry_sample_count": 286
      },
      {
        "average_score": 67.4,
        "case_count": 15,
        "error_count": 32,
        "judge_error_count": 32,
        "median_gpu_busy_percent_avg": 56.8,
        "median_latency_seconds": 6.6283,
        "median_power_w_avg": 117.53,
        "median_vram_mb_peak": 9134.0,
        "model": "gemma4:e2b",
        "ollama_error_count": 0,
        "pass_count": 7,
        "pass_rate": 0.4667,
        "peak_power_w": 312.0,
        "peak_vram_mb": 9219.0,
        "rank": 13,
        "telemetry_sample_count": 232
      },
      {
        "average_score": 65.2,
        "case_count": 15,
        "error_count": 30,
        "judge_error_count": 30,
        "median_gpu_busy_percent_avg": 68.8,
        "median_latency_seconds": 3.4761,
        "median_power_w_avg": 173.88,
        "median_vram_mb_peak": 7925.2,
        "model": "nemotron-3-nano:4b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 296.0,
        "peak_vram_mb": 8070.5,
        "rank": 14,
        "telemetry_sample_count": 141
      },
      {
        "average_score": 65.13,
        "case_count": 15,
        "error_count": 30,
        "judge_error_count": 30,
        "median_gpu_busy_percent_avg": 57.0,
        "median_latency_seconds": 10.5886,
        "median_power_w_avg": 77.35,
        "median_vram_mb_peak": 22891.1,
        "model": "devstral-small-2:24b",
        "ollama_error_count": 0,
        "pass_count": 6,
        "pass_rate": 0.4,
        "peak_power_w": 316.0,
        "peak_vram_mb": 23007.4,
        "rank": 15,
        "telemetry_sample_count": 345
      },
      {
        "average_score": 64.13,
        "case_count": 15,
        "error_count": 14,
        "judge_error_count": 10,
        "median_gpu_busy_percent_avg": 47.7,
        "median_latency_seconds": 121.96,
        "median_power_w_avg": 142.69,
        "median_vram_mb_peak": 24043.6,
        "model": "qwen3.6:27b",
        "ollama_error_count": 4,
        "pass_count": 8,
        "pass_rate": 0.5333,
        "peak_power_w": 355.0,
        "peak_vram_mb": 24049.6,
        "rank": 16,
        "telemetry_sample_count": 3769
      },
      {
        "average_score": 62.33,
        "case_count": 15,
        "error_count": 37,
        "judge_error_count": 37,
        "median_gpu_busy_percent_avg": 40.9,
        "median_latency_seconds": 3.6961,
        "median_power_w_avg": 69.0,
        "median_vram_mb_peak": 6703.3,
        "model": "gemma3:4b",
        "ollama_error_count": 0,
        "pass_count": 4,
        "pass_rate": 0.2667,
        "peak_power_w": 289.0,
        "peak_vram_mb": 6866.5,
        "rank": 17,
        "telemetry_sample_count": 132
      },
      {
        "average_score": 62.2,
        "case_count": 15,
        "error_count": 36,
        "judge_error_count": 36,
        "median_gpu_busy_percent_avg": 62.4,
        "median_latency_seconds": 4.2146,
        "median_power_w_avg": 123.0,
        "median_vram_mb_peak": 17043.7,
        "model": "ministral-3:14b",
        "ollama_error_count": 0,
        "pass_count": 7,
        "pass_rate": 0.4667,
        "peak_power_w": 313.0,
        "peak_vram_mb": 17114.5,
        "rank": 18,
        "telemetry_sample_count": 160
      },
      {
        "average_score": 58.6,
        "case_count": 15,
        "error_count": 39,
        "judge_error_count": 39,
        "median_gpu_busy_percent_avg": 54.9,
        "median_latency_seconds": 8.0904,
        "median_power_w_avg": 45.78,
        "median_vram_mb_peak": 16453.1,
        "model": "lfm2:24b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 228.0,
        "peak_vram_mb": 16564.4,
        "rank": 19,
        "telemetry_sample_count": 273
      },
      {
        "average_score": 58.07,
        "case_count": 15,
        "error_count": 37,
        "judge_error_count": 37,
        "median_gpu_busy_percent_avg": 31.0,
        "median_latency_seconds": 1.3766,
        "median_power_w_avg": 73.75,
        "median_vram_mb_peak": 6700.7,
        "model": "granite4.1:3b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 256.0,
        "peak_vram_mb": 7176.7,
        "rank": 20,
        "telemetry_sample_count": 65
      },
      {
        "average_score": 56.07,
        "case_count": 15,
        "error_count": 37,
        "judge_error_count": 37,
        "median_gpu_busy_percent_avg": 59.8,
        "median_latency_seconds": 3.2377,
        "median_power_w_avg": 123.0,
        "median_vram_mb_peak": 12723.6,
        "model": "ministral-3:8b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 339.0,
        "peak_vram_mb": 12766.1,
        "rank": 21,
        "telemetry_sample_count": 128
      },
      {
        "average_score": 55.73,
        "case_count": 15,
        "error_count": 36,
        "judge_error_count": 36,
        "median_gpu_busy_percent_avg": 51.8,
        "median_latency_seconds": 2.4815,
        "median_power_w_avg": 121.0,
        "median_vram_mb_peak": 11529.1,
        "model": "rnj-1:8b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 351.0,
        "peak_vram_mb": 11609.0,
        "rank": 22,
        "telemetry_sample_count": 100
      },
      {
        "average_score": 54.0,
        "case_count": 15,
        "error_count": 38,
        "judge_error_count": 38,
        "median_gpu_busy_percent_avg": 26.2,
        "median_latency_seconds": 1.8999,
        "median_power_w_avg": 73.2,
        "median_vram_mb_peak": 8191.5,
        "model": "olmo2:7b",
        "ollama_error_count": 0,
        "pass_count": 5,
        "pass_rate": 0.3333,
        "peak_power_w": 305.0,
        "peak_vram_mb": 8193.0,
        "rank": 23,
        "telemetry_sample_count": 86
      },
      {
        "average_score": 53.07,
        "case_count": 15,
        "error_count": 28,
        "judge_error_count": 25,
        "median_gpu_busy_percent_avg": 83.7,
        "median_latency_seconds": 58.0375,
        "median_power_w_avg": 236.17,
        "median_vram_mb_peak": 6013.3,
        "model": "qwen3.5:2b",
        "ollama_error_count": 3,
        "pass_count": 4,
        "pass_rate": 0.2667,
        "peak_power_w": 293.0,
        "peak_vram_mb": 6098.8,
        "rank": 24,
        "telemetry_sample_count": 2403
      },
      {
        "average_score": 46.47,
        "case_count": 15,
        "error_count": 44,
        "judge_error_count": 44,
        "median_gpu_busy_percent_avg": 44.0,
        "median_latency_seconds": 2.4151,
        "median_power_w_avg": 94.0,
        "median_vram_mb_peak": 8957.0,
        "model": "ministral-3:3b",
        "ollama_error_count": 0,
        "pass_count": 3,
        "pass_rate": 0.2,
        "peak_power_w": 319.0,
        "peak_vram_mb": 9121.9,
        "rank": 25,
        "telemetry_sample_count": 96
      },
      {
        "average_score": 40.33,
        "case_count": 15,
        "error_count": 42,
        "judge_error_count": 42,
        "median_gpu_busy_percent_avg": 49.0,
        "median_latency_seconds": 10.8602,
        "median_power_w_avg": 80.58,
        "median_vram_mb_peak": 24258.1,
        "model": "laguna-xs.2:latest",
        "ollama_error_count": 0,
        "pass_count": 4,
        "pass_rate": 0.2667,
        "peak_power_w": 202.0,
        "peak_vram_mb": 24329.4,
        "rank": 26,
        "telemetry_sample_count": 816
      },
      {
        "average_score": 24.87,
        "case_count": 15,
        "error_count": 49,
        "judge_error_count": 49,
        "median_gpu_busy_percent_avg": 72.8,
        "median_latency_seconds": 5.9512,
        "median_power_w_avg": 211.38,
        "median_vram_mb_peak": 3107.6,
        "model": "lfm2.5-thinking:1.2b",
        "ollama_error_count": 0,
        "pass_count": 1,
        "pass_rate": 0.0667,
        "peak_power_w": 265.0,
        "peak_vram_mb": 3179.8,
        "rank": 27,
        "telemetry_sample_count": 207
      },
      {
        "average_score": 20.0,
        "case_count": 15,
        "error_count": 12,
        "judge_error_count": 0,
        "median_gpu_busy_percent_avg": 53.9,
        "median_latency_seconds": 180.0034,
        "median_power_w_avg": 149.06,
        "median_vram_mb_peak": 24249.5,
        "model": "qwen3.5:27b",
        "ollama_error_count": 12,
        "pass_count": 3,
        "pass_rate": 0.2,
        "peak_power_w": 344.0,
        "peak_vram_mb": 24293.5,
        "rank": 28,
        "telemetry_sample_count": 4664
      },
      {
        "average_score": 8.67,
        "case_count": 15,
        "error_count": 31,
        "judge_error_count": 22,
        "median_gpu_busy_percent_avg": 83.7,
        "median_latency_seconds": 180.0028,
        "median_power_w_avg": 204.14,
        "median_vram_mb_peak": 4432.8,
        "model": "qwen3.5:0.8b",
        "ollama_error_count": 9,
        "pass_count": 0,
        "pass_rate": 0.0,
        "peak_power_w": 247.0,
        "peak_vram_mb": 4534.5,
        "rank": 29,
        "telemetry_sample_count": 3697
      }
    ],
    "result_count": 435,
    "severity_counts": {
      "critical": 98,
      "major": 466,
      "minor": 311
    },
    "telemetry": {
      "available": true,
      "providers": [
        "amd-sysfs"
      ],
      "sample_count": 31418
    }
  },
  "telemetry": {
    "device": {
      "card": "card1",
      "device_path": "/sys/class/drm/card1/device",
      "driver": "amdgpu",
      "pci_address": "0000:0c:00.0",
      "provider": "amd-sysfs",
      "reason": "",
      "status": "ok",
      "vendor": "0x1002",
      "vram_total_mb": 24560.0
    },
    "interval_seconds": 0.5,
    "mode": "amd-sysfs",
    "provider": "amd-sysfs",
    "reason": "",
    "status": "ok"
  }
}
