Experiments via SDK

The SDK experiment workflow lets you run your full application against a dataset and collect scores programmatically. This is the most flexible option — you control the task function, the evaluation logic, and the scoring.

Workflow Overview

  1. Define your task — the function that takes a dataset item input and returns an output
  2. Run the experiment — XeroML iterates over all dataset items, calls your task, and creates a trace per item
  3. Score the outputs — attach scores to each trace (via evaluator or custom scoring logic)
  4. Review results — compare scores across experiment runs in the dashboard

Basic Example

```python
from xeroml import get_client, observe

xeroml = get_client()

@observe()
def my_task(input: dict) -> str:
    """Your application logic goes here."""
    question = input["question"]
    return call_llm(question)

def run_experiment():
    dataset = xeroml.get_dataset("evaluation/qa-dataset")
    experiment = xeroml.create_experiment(
        name="prompt-v2-baseline",
        dataset_name="evaluation/qa-dataset",
    )
    for item in dataset.items:
        with experiment.run_item(item) as run:
            output = my_task(item.input)
            run.set_output(output)
            # Score the output
            score = evaluate_output(item.input, output, item.expected_output)
            run.score(name="accuracy", value=score)
    print(f"Experiment complete: {experiment.url}")
    return experiment

run_experiment()
xeroml.flush()
```

With LLM-as-a-Judge Scoring

Instead of writing your own scoring function, use XeroML’s built-in LLM-as-a-Judge evaluators:

```python
experiment = xeroml.create_experiment(
    name="prompt-v2-with-judge",
    dataset_name="evaluation/qa-dataset",
    evaluators=["helpfulness", "accuracy"],  # Pre-configured evaluators
)

for item in dataset.items:
    with experiment.run_item(item) as run:
        output = my_task(item.input)
        run.set_output(output)
        # Evaluators run automatically after each item
```

Using Local Datasets

You don’t have to use XeroML datasets. You can run experiments against any local data and just push results to XeroML:

```python
local_test_cases = [
    {"input": {"q": "What is XeroML?"}, "expected": "An LLM observability platform."},
    # ...
]

experiment = xeroml.create_experiment(name="local-dataset-run")

for case in local_test_cases:
    with experiment.run_item_raw(input=case["input"]) as run:
        output = my_task(case["input"])
        run.set_output(output)
        run.score("accuracy", exact_match(output, case["expected"]))
```
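Like `evaluate_output`, the `exact_match` helper here is your own code rather than an SDK function. A minimal sketch that normalizes case and whitespace before comparing:

```python
def exact_match(output: str, expected: str) -> float:
    """Return 1.0 on a case- and whitespace-insensitive match, else 0.0."""
    def normalize(s: str) -> str:
        return " ".join(s.lower().split())
    return 1.0 if normalize(output) == normalize(expected) else 0.0
```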

Comparing Experiment Runs

After running multiple experiments on the same dataset, open the dataset in the XeroML UI and click Compare Runs. XeroML shows a side-by-side comparison of scores per item, making it easy to identify exactly which test cases regressed or improved between runs.
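If you prefer to diff runs in code rather than in the UI, the same per-item comparison can be done on exported scores. This sketch assumes you have already pulled each run's accuracy scores into a plain dict keyed by item ID; the export step itself is omitted:

```python
def diff_runs(baseline: dict[str, float],
              candidate: dict[str, float],
              tolerance: float = 1e-9) -> dict[str, list[str]]:
    """Classify item IDs present in both runs as regressed, improved, or unchanged."""
    result: dict[str, list[str]] = {"regressed": [], "improved": [], "unchanged": []}
    for item_id in sorted(baseline.keys() & candidate.keys()):
        delta = candidate[item_id] - baseline[item_id]
        if delta < -tolerance:
            result["regressed"].append(item_id)
        elif delta > tolerance:
            result["improved"].append(item_id)
        else:
            result["unchanged"].append(item_id)
    return result
```

The `regressed` list is the set of test cases to inspect first when a new prompt version scores lower overall.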

CI/CD Integration

Run experiments in your CI pipeline to gate deployments on evaluation quality:

```python
import sys

experiment = run_experiment()
results = experiment.get_results()

avg_accuracy = sum(r.score("accuracy") for r in results) / len(results)

if avg_accuracy < 0.85:
    print(f"Experiment failed: accuracy {avg_accuracy:.2f} < 0.85 threshold")
    sys.exit(1)

print(f"Experiment passed: accuracy {avg_accuracy:.2f}")
```
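To gate on more than one metric, the aggregation step generalizes naturally. This pure-Python helper is a sketch (metric names and thresholds are yours to choose) that operates on a list of per-item score dicts and reports every metric whose average falls below its threshold:

```python
def gate(item_scores: list[dict[str, float]],
         thresholds: dict[str, float]) -> list[str]:
    """Return the names of metrics whose average score is below the threshold."""
    failures = []
    for metric, minimum in thresholds.items():
        values = [scores[metric] for scores in item_scores if metric in scores]
        average = sum(values) / len(values) if values else 0.0
        if average < minimum:
            failures.append(metric)
    return failures
```

An empty return value means the run passed; otherwise, fail the pipeline and print the offending metrics.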