RL Environments
Use Akira for reinforcement learning evaluation at scale.
The Challenge
Training RL agents requires:
- Isolated evaluation environments
- Parallel episode execution
- Consistent, reproducible runs
- Safe execution of learned policies
Akira provides all of this out of the box.
Architecture
┌──────────────────────────────────────────────────────────┐
│ RL Training System │
│ │
│ ┌─────────────┐ │
│ │ Policy │ │
│ │ Network │ │
│ └──────┬──────┘ │
│ │ │
│ ▼ │
│ ┌──────────────────────────────────────────────────┐ │
│ │ Episode Manager │ │
│ └──────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌──────────────────────────────────────────────────┐ │
│ │ Akira Sandbox Pool │ │
│ │ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ │
│ │ │ Env 1 │ │ Env 2 │ │ Env 3 │ │ Env N │ │ │
│ │ │ │ │ │ │ │ │ │ │ │
│ │ └────────┘ └────────┘ └────────┘ └────────┘ │ │
│ └──────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌──────────────────────────────────────────────────┐ │
│ │ Experience Buffer │ │
│ │ (states, actions, rewards, next_states) │ │
│ └──────────────────────────────────────────────────┘ │
└──────────────────────────────────────────────────────────┘

Example: Parallel Episode Evaluation
Run multiple RL episodes concurrently:
import SandboxSDK from '@akiralabs/sandbox-sdk';
const client = new SandboxSDK({ apiKey: process.env.AKIRA_API_KEY });
interface Episode {
states: number[][];
actions: number[];
rewards: number[];
totalReward: number;
}
/**
 * Roll out `numEpisodes` episodes concurrently, one sandbox per episode.
 *
 * @param policyWeights - flat policy parameters, serialized verbatim into the sandbox
 * @param numEpisodes - number of sandboxes / parallel episodes
 * @param maxSteps - per-episode step cap passed to the runner script
 * @returns parsed episode records, in sandbox order
 */
async function runEpisodes(
  policyWeights: Float32Array,
  numEpisodes: number,
  maxSteps: number
): Promise<Episode[]> {
  // Serialize only this view's own bytes. `Buffer.from(policyWeights.buffer)`
  // alone would copy the ENTIRE underlying ArrayBuffer, which is wrong when
  // the Float32Array is a view with a non-zero byteOffset or shorter length.
  const weights = Buffer.from(
    policyWeights.buffer,
    policyWeights.byteOffset,
    policyWeights.byteLength
  );

  // Create one isolated sandbox per episode.
  // NOTE(review): if some creations fail, Promise.all rejects and the
  // already-created sandboxes leak — fine for an example; production code
  // should use Promise.allSettled and reap the successes.
  const sandboxes = await Promise.all(
    Array.from({ length: numEpisodes }, () =>
      client.sandboxes.create({
        image: 'akiralabs/akira-default-sandbox',
        resources: { cpu: 1, memory: 1024 },
      })
    )
  );

  try {
    // Run all episodes in parallel; each sandbox gets the same weights.
    const episodes = await Promise.all(
      sandboxes.map(async (sandbox) => {
        await client.sandboxes.upload(sandbox.id, {
          path: '/app/weights.bin',
          content: weights,
        });
        const result = await client.sandboxes.execute(sandbox.id, {
          command: `python /app/run_episode.py \
--weights /app/weights.bin \
--max-steps ${maxSteps}`,
          timeout: 120,
        });
        // NOTE(review): stdout is trusted to be valid Episode JSON —
        // validate with a schema in production.
        return JSON.parse(result.stdout) as Episode;
      })
    );
    return episodes;
  } finally {
    // Always reap the sandboxes, even if an episode failed.
    await Promise.all(sandboxes.map((s) => client.sandboxes.delete(s.id)));
  }
}

Example: Code Generation Environment
An RL environment for training code-generating agents:
interface CodeEnvState {
currentCode: string;
testResults: string;
remainingSteps: number;
}
interface CodeEnvAction {
type: 'edit' | 'run' | 'submit';
payload: string;
}
/**
 * Episodic RL environment: the agent edits, runs, and finally submits a
 * Python solution inside an isolated Akira sandbox.
 *
 * Rewards:
 *  - edit:   -0.01 (small cost to discourage thrashing)
 *  - run:    fraction of test cases passed
 *  - submit: +10 if all tests pass, otherwise -1; always ends the episode
 */
class CodeGenerationEnv {
  /** Maximum agent actions per episode. */
  private static readonly MAX_EPISODE_STEPS = 50;

  private sandboxId: string | null = null;
  // Decremented on every step(); reported via CodeEnvState.remainingSteps.
  private stepsRemaining = CodeGenerationEnv.MAX_EPISODE_STEPS;
  private targetFunction: string;
  private testCases: TestCase[];

  constructor(targetFunction: string, testCases: TestCase[]) {
    this.targetFunction = targetFunction;
    this.testCases = testCases;
  }

  /** Tear down any previous sandbox and start a fresh episode. */
  async reset(): Promise<CodeEnvState> {
    if (this.sandboxId) {
      const stale = this.sandboxId;
      // Clear the id before the delete/create round-trip so a failure
      // never leaves us pointing at a dead sandbox.
      this.sandboxId = null;
      await client.sandboxes.delete(stale);
    }

    // Create a fresh, isolated environment.
    const sandbox = await client.sandboxes.create({
      image: 'akiralabs/akira-default-sandbox',
    });
    this.sandboxId = sandbox.id;
    this.stepsRemaining = CodeGenerationEnv.MAX_EPISODE_STEPS;

    // Upload the test harness.
    await client.sandboxes.upload(this.sandboxId, {
      path: '/app/test_harness.py',
      content: Buffer.from(this.generateTestHarness()),
    });
    // Initialize an empty stub solution for the target function.
    await client.sandboxes.upload(this.sandboxId, {
      path: '/app/solution.py',
      content: Buffer.from(`def ${this.targetFunction}():\n pass`),
    });

    return this.getState();
  }

  /**
   * Apply one agent action and return the transition.
   * @throws Error if called before reset().
   */
  async step(action: CodeEnvAction): Promise<{
    state: CodeEnvState;
    reward: number;
    done: boolean;
  }> {
    if (!this.sandboxId) {
      throw new Error('step() called before reset()');
    }
    this.stepsRemaining = Math.max(0, this.stepsRemaining - 1);

    let reward = 0;
    let done = false;

    // Braces around each case scope the per-case const declarations
    // (avoids lexical declarations leaking across the whole switch).
    switch (action.type) {
      case 'edit': {
        // Replace the solution file with the agent's new code.
        await client.sandboxes.upload(this.sandboxId, {
          path: '/app/solution.py',
          content: Buffer.from(action.payload),
        });
        reward = -0.01; // Small penalty for each edit
        break;
      }
      case 'run': {
        // Execute the harness; reward is the fraction of tests passed.
        const result = await client.sandboxes.execute(this.sandboxId, {
          command: 'python /app/test_harness.py',
          timeout: 10,
        });
        const passed = this.parseTestResults(result.stdout);
        reward = passed / this.testCases.length;
        break;
      }
      case 'submit': {
        // Final grading: all-or-nothing, and the episode ends.
        const finalResult = await client.sandboxes.execute(this.sandboxId, {
          command: 'python /app/test_harness.py',
          timeout: 10,
        });
        const finalPassed = this.parseTestResults(finalResult.stdout);
        reward = finalPassed === this.testCases.length ? 10 : -1;
        done = true;
        break;
      }
    }

    return { state: await this.getState(), reward, done };
  }

  /** Snapshot the current solution code as the observation. */
  private async getState(): Promise<CodeEnvState> {
    if (!this.sandboxId) {
      throw new Error('getState() called before reset()');
    }
    const code = await client.sandboxes.execute(this.sandboxId, {
      command: 'cat /app/solution.py',
    });
    return {
      currentCode: code.stdout,
      testResults: '', // NOTE(review): harness output is not surfaced here yet
      remainingSteps: this.stepsRemaining,
    };
  }

  /** Release the sandbox backing this environment. */
  async close() {
    if (this.sandboxId) {
      await client.sandboxes.delete(this.sandboxId);
      this.sandboxId = null;
    }
  }
}

Example: Multi-Agent Environment
Train agents that interact in a shared environment:
/**
 * Run one shared-environment episode with one sandbox per agent.
 *
 * Per step: gather every agent's observation, query each agent's policy,
 * execute all actions in parallel, and record the transition. Ends after
 * MAX_STEPS or once every agent reports done.
 *
 * @param agents - the policies to act in the environment, one sandbox each
 * @returns the recorded trajectory
 */
async function runMultiAgentEpisode(
  agents: Agent[]
): Promise<MultiAgentEpisode> {
  // One isolated sandbox per agent.
  const sandboxes = await Promise.all(
    agents.map(() =>
      client.sandboxes.create({
        image: 'akiralabs/akira-default-sandbox',
      })
    )
  );

  const trajectory: Step[] = [];
  try {
    for (let step = 0; step < MAX_STEPS; step++) {
      // Collect observations for all agents in parallel.
      const observations = await Promise.all(
        sandboxes.map(async (sandbox) => {
          const obs = await client.sandboxes.execute(sandbox.id, {
            command: 'python /app/get_observation.py',
          });
          return JSON.parse(obs.stdout);
        })
      );

      // Query each agent's policy for its next action.
      const actions = agents.map((agent, i) =>
        agent.getAction(observations[i])
      );

      // Execute all actions in parallel.
      const results = await Promise.all(
        sandboxes.map(async (sandbox, i) => {
          // Escape single quotes so an action payload containing ' cannot
          // break out of the quoted shell argument (command injection).
          const payload = JSON.stringify(actions[i]).replace(/'/g, `'\\''`);
          const result = await client.sandboxes.execute(sandbox.id, {
            command: `python /app/step.py --action '${payload}'`,
          });
          return JSON.parse(result.stdout);
        })
      );

      trajectory.push({
        observations,
        actions,
        rewards: results.map((r) => r.reward),
        nextObservations: results.map((r) => r.next_obs),
      });

      // Stop early once every agent is finished.
      if (results.every((r) => r.done)) break;
    }
    return { trajectory };
  } finally {
    // Always reap the sandboxes, even on error.
    await Promise.all(sandboxes.map((s) => client.sandboxes.delete(s.id)));
  }
}

Key Benefits
Isolation
Each episode runs in a completely isolated VM:
- No state leakage between episodes
- Deterministic resets
- Safe execution of learned policies
Parallelization
Scale horizontally for faster training:
- Run hundreds of episodes simultaneously
- Linear speedup for sample collection
- Efficient use of compute
Reproducibility
Consistent environments from snapshots:
// Provision a single template environment to configure once.
const templateEnv = await client.sandboxes.create({
  image: 'akiralabs/akira-default-sandbox',
});

// Install the RL dependencies into the template.
await client.sandboxes.execute(templateEnv.id, {
  command: 'pip install gymnasium numpy',
});

// Freeze the configured template so every run starts from identical state.
const baseSnapshot = await client.sandboxes.snapshot(templateEnv.id, {
  name: 'rl-env-base',
});

// Fan out: restore one hundred byte-identical environments from the snapshot.
const envs = await Promise.all(
  Array.from({ length: 100 }, () => client.snapshots.restore(baseSnapshot.id))
);

Best Practices
Use Snapshots for Setup
Snapshot a configured environment (with dependencies installed, data loaded) and restore it for each episode to avoid repeated setup time.
- Pool sandboxes - Reuse environments when possible
- Use snapshots - Share configured environments across episodes
- Snapshot base environments - Ensure reproducibility
- Set appropriate timeouts - Prevent stuck episodes
- Batch episode creation - Reduce API overhead
Performance Tips
Warm Sandbox Pool
Pre-create sandboxes to avoid creation latency:
// Create pool before training starts
const pool = await Promise.all(
Array(POOL_SIZE).fill(null).map(() =>
client.sandboxes.create({ image: 'akiralabs/akira-default-sandbox' })
)
);
// Use and recycle during trainingSnapshot Base Environments
Set up once, restore many times:
// Provision the environment that will serve as the golden image.
const goldenSandbox = await client.sandboxes.create({
  image: 'akiralabs/akira-default-sandbox',
});

// One-time setup: install dependencies, then load the training data.
await client.sandboxes.execute(goldenSandbox.id, {
  command: 'pip install gymnasium numpy torch',
});
await uploadTrainingData(goldenSandbox.id);

// Capture the fully-configured state as a reusable snapshot.
const snapshot = await client.sandboxes.snapshot(goldenSandbox.id, {
  name: 'rl-env-with-data',
});

// Each worker restores its own identical copy from the snapshot.
const workers = await Promise.all(
  Array.from({ length: NUM_WORKERS }, () => client.snapshots.restore(snapshot.id))
);

Next Steps
- Learn about parallel processing patterns
- Explore autonomous agents
- Review sandbox configuration