Examples¶
This section provides practical examples of using LLM Sandbox for executing LLM-generated code in real-world AI agent scenarios. These examples focus on the most common use cases where LLMs generate code that needs to be executed safely.
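All of the examples share one primitive: open a SandboxSession, run code inside it, and read the captured result. As a minimal sketch (assuming a working container backend such as Docker is available), this is the pattern every integration below wraps:

from llm_sandbox import SandboxSession

# A stand-in for model output; any untrusted snippet works here.
untrusted_code = "print(sum(i * i for i in range(10)))"

with SandboxSession(lang="python") as session:
    result = session.run(untrusted_code)
    print(result.exit_code)  # 0 on success
    print(result.stdout)     # captured standard output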
LLM Framework Integrations¶
LangChain Integration¶
# ruff: noqa: E501
# Reference: https://python.langchain.com/docs/how_to/custom_tools/
import logging

from langchain import hub
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI

from llm_sandbox import SandboxSession

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


@tool
def run_code(lang: str, code: str, libraries: list | None = None) -> str:
    """Run code in a sandboxed environment.

    :param lang: The language of the code, must be one of ['python', 'java', 'javascript', 'cpp', 'go', 'ruby'].
    :param code: The code to run.
    :param libraries: Optional list of libraries to install before running the code.
    :return: The output of the code.
    """
    with SandboxSession(lang=lang, verbose=False) as session:
        return session.run(code, libraries).stdout


if __name__ == "__main__":
    llm = ChatOpenAI(model="gpt-4.1-nano", temperature=0)
    prompt = hub.pull("hwchase17/openai-functions-agent")
    tools = [run_code]
    agent = create_tool_calling_agent(llm, tools, prompt)  # type: ignore[arg-type]
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)  # type: ignore[arg-type]

    output = agent_executor.invoke({
        "input": "Write python code to calculate Pi number by Monte Carlo method then run it."
    })
    logger.info("Agent: %s", output)

    output = agent_executor.invoke({"input": "Write python code to calculate the factorial of a number then run it."})
    logger.info("Agent: %s", output)

    output = agent_executor.invoke({"input": "Write python code to calculate the Fibonacci sequence then run it."})
    logger.info("Agent: %s", output)

    output = agent_executor.invoke({"input": "Calculate the sum of the first 10000 numbers."})
    logger.info("Agent: %s", output)
LangGraph Integration¶
import logging

from langchain_core.tools import tool
from langgraph.prebuilt import create_react_agent

from llm_sandbox import SandboxSession

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


@tool
def run_code(lang: str, code: str, libraries: list | None = None) -> str:
    """Run code in a sandboxed environment.

    :param lang: The language of the code, must be one of ['python', 'java', 'javascript', 'cpp', 'go', 'ruby'].
    :param code: The code to run.
    :param libraries: Optional list of libraries to install before running the code.
    :return: The output of the code.
    """
    with SandboxSession(lang=lang, verbose=False) as session:
        return session.run(code, libraries).stdout


if __name__ == "__main__":
    agent = create_react_agent(model="openai:gpt-4.1-nano", tools=[run_code])

    logger.info(
        "Agent: %s",
        agent.invoke({
            "messages": [
                {
                    "role": "user",
                    "content": "Write python code to calculate Pi number by Monte Carlo method then run it.",
                }
            ]
        }),
    )
    logger.info(
        "Agent: %s",
        agent.invoke({
            "messages": [
                {"role": "user", "content": "Write python code to calculate the factorial of a number then run it."}
            ]
        }),
    )
    logger.info(
        "Agent: %s",
        agent.invoke({
            "messages": [
                {"role": "user", "content": "Write python code to calculate the Fibonacci sequence then run it."}
            ]
        }),
    )
    logger.info(
        "Agent: %s",
        agent.invoke({"messages": [{"role": "user", "content": "Calculate the sum of the first 10000 numbers."}]}),
    )
LlamaIndex Integration¶
# ruff: noqa: E501
# Reference: https://docs.llamaindex.ai/en/stable/module_guides/deploying/agents/tools/
import logging

import nest_asyncio
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.tools import FunctionTool
from llama_index.llms.openai import OpenAI

from llm_sandbox import SandboxSession

nest_asyncio.apply()

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def run_code(lang: str, code: str, libraries: list | None = None) -> str:
    """Run code in a sandboxed environment.

    :param lang: The language of the code, must be one of ['python', 'java', 'javascript', 'cpp', 'go', 'ruby'].
    :param code: The code to run.
    :param libraries: Optional list of libraries to install before running the code.
    :return: The output of the code.
    """
    with SandboxSession(lang=lang, verbose=False) as session:
        return session.run(code, libraries).stdout


if __name__ == "__main__":
    llm = OpenAI(model="gpt-4.1-nano", temperature=0)
    code_execution_tool = FunctionTool.from_defaults(fn=run_code)
    agent_worker = FunctionCallingAgentWorker.from_tools(
        [code_execution_tool],
        llm=llm,
        verbose=True,
        allow_parallel_tool_calls=False,
    )
    agent = agent_worker.as_agent()

    response = agent.chat("Write python code to calculate Pi number by Monte Carlo method then run it.")
    logger.info(response)

    response = agent.chat("Write python code to calculate the factorial of a number then run it.")
    logger.info(response)

    response = agent.chat("Write python code to calculate the Fibonacci sequence then run it.")
    logger.info(response)

    response = agent.chat("Calculate the sum of the first 10000 numbers.")
    logger.info(response)
Code Generation Patterns¶
1. Self-Correcting Code Generator¶
from llm_sandbox import SandboxSession
import openai


class SelfCorrectingCodeGenerator:
    """Generate and iteratively improve code using LLM feedback."""

    def __init__(self, api_key: str):
        self.client = openai.OpenAI(api_key=api_key)
        self.max_iterations = 3

    def generate_and_test_code(self, task: str, test_cases: list) -> dict:
        """Generate code and iteratively improve it based on test results."""
        iteration = 0
        current_code = None
        last_error = None

        while iteration < self.max_iterations:
            iteration += 1

            # Generate code on the first pass; repair it on later passes
            if current_code is None:
                prompt = f"Write Python code to: {task}\n\nInclude proper error handling and documentation."
            else:
                prompt = f"""
The previous code failed. Here's what happened:

Code: {current_code}
Error: {last_error}

Fix the issues and improve the code to: {task}
"""

            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are an expert Python developer. Write clean, efficient, well-tested code."},
                    {"role": "user", "content": prompt},
                ],
            )
            current_code = response.choices[0].message.content

            # Test the generated code
            with SandboxSession(lang="python") as session:
                test_result = session.run(current_code)

                if test_result.exit_code == 0:
                    # Run test cases. The generated code is prepended to each test so
                    # the test does not rely on state persisting between runs, and the
                    # doubled braces survive the outer f-string so the inner
                    # expressions are evaluated inside the sandbox instead.
                    all_passed = True
                    test_outputs = []
                    for test_case in test_cases:
                        test_code = f"""
{current_code}

# Test case: {test_case['description']}
try:
    result = {test_case['code']}
    expected = {test_case['expected']}
    passed = result == expected
    print(f"Test '{test_case['description']}': {{'PASS' if passed else 'FAIL'}}")
    if not passed:
        print(f"  Expected: {{expected}}, Got: {{result}}")
except Exception as e:
    print(f"Test '{test_case['description']}': ERROR - {{e}}")
    passed = False
"""
                        test_output = session.run(test_code)
                        test_outputs.append(test_output.stdout)
                        if "FAIL" in test_output.stdout or "ERROR" in test_output.stdout:
                            all_passed = False

                    if all_passed:
                        return {
                            "success": True,
                            "code": current_code,
                            "iterations": iteration,
                            "test_results": test_outputs,
                        }
                    last_error = "Some test cases failed: " + "\n".join(test_outputs)
                else:
                    last_error = test_result.stderr

        return {
            "success": False,
            "code": current_code,
            "iterations": iteration,
            "final_error": last_error,
        }


# Example usage
generator = SelfCorrectingCodeGenerator("your-api-key")

test_cases = [
    {
        "description": "Basic sorting",
        "code": "sort_list([3, 1, 4, 1, 5])",
        "expected": [1, 1, 3, 4, 5],
    },
    {
        "description": "Empty list",
        "code": "sort_list([])",
        "expected": [],
    },
    {
        "description": "Single element",
        "code": "sort_list([42])",
        "expected": [42],
    },
]

result = generator.generate_and_test_code(
    "Create a function called 'sort_list' that sorts a list of numbers in ascending order",
    test_cases,
)

print(f"Success: {result['success']}")
print(f"Iterations: {result['iterations']}")
if result['success']:
    print("Generated code:", result['code'])
2. Multi-Language Code Translator¶
from llm_sandbox import SandboxSession
import openai


class CodeTranslator:
    """Translate code between different programming languages."""

    def __init__(self, api_key: str):
        self.client = openai.OpenAI(api_key=api_key)
        self.supported_languages = ["python", "javascript", "java", "cpp", "go"]

    def translate_code(self, source_code: str, source_lang: str, target_lang: str) -> dict:
        """Translate code from one language to another and test it."""
        translation_prompt = f"""
Translate this {source_lang} code to {target_lang}:

{source_code}

Requirements:
1. Maintain the same functionality
2. Use idiomatic {target_lang} patterns
3. Include proper error handling
4. Add comments explaining the translation choices
5. Ensure the code is runnable and follows best practices
"""

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": f"You are an expert in both {source_lang} and {target_lang}. Provide accurate, idiomatic translations."},
                {"role": "user", "content": translation_prompt},
            ],
        )
        translated_code = response.choices[0].message.content

        # Test both the original and the translated code
        original_result = self._test_code(source_code, source_lang)
        translated_result = self._test_code(translated_code, target_lang)

        return {
            "source_language": source_lang,
            "target_language": target_lang,
            "original_code": source_code,
            "translated_code": translated_code,
            "original_output": original_result,
            "translated_output": translated_result,
            "translation_successful": translated_result["success"],
            "outputs_match": self._compare_outputs(original_result, translated_result),
        }

    def _test_code(self, code: str, language: str) -> dict:
        """Test code execution in the specified language."""
        try:
            with SandboxSession(lang=language) as session:
                result = session.run(code)
                return {
                    "success": result.exit_code == 0,
                    "output": result.stdout,
                    "error": result.stderr,
                }
        except Exception as e:
            return {
                "success": False,
                "output": "",
                "error": str(e),
            }

    def _compare_outputs(self, original: dict, translated: dict) -> bool:
        """Compare outputs to verify translation accuracy."""
        if not (original["success"] and translated["success"]):
            return False
        # Simple output comparison (can be enhanced for specific needs)
        return original["output"].strip() == translated["output"].strip()


# Example usage
translator = CodeTranslator("your-api-key")

python_code = """
def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

# Test the function
for i in range(10):
    print(f"fib({i}) = {fibonacci(i)}")
"""

translation = translator.translate_code(python_code, "python", "javascript")

print(f"Translation successful: {translation['translation_successful']}")
print(f"Outputs match: {translation['outputs_match']}")
print("Translated code:", translation['translated_code'])
Security and Monitoring¶
Secure Code Execution Service¶
from llm_sandbox import SandboxSession
from llm_sandbox.security import SecurityPolicy, RestrictedModule, SecurityIssueSeverity
import hashlib
import time
import logging


class SecureAICodeExecutor:
    """Production-ready secure execution service for AI-generated code."""

    def __init__(self):
        self.execution_log = []
        self.security_policy = self._create_security_policy()
        self.logger = logging.getLogger(__name__)

    def _create_security_policy(self) -> SecurityPolicy:
        """Create a comprehensive security policy for AI-generated code."""
        return SecurityPolicy(
            severity_threshold=SecurityIssueSeverity.MEDIUM,
            restricted_modules=[
                RestrictedModule("os", "Operating system access", SecurityIssueSeverity.HIGH),
                RestrictedModule("subprocess", "Process execution", SecurityIssueSeverity.HIGH),
                RestrictedModule("socket", "Network operations", SecurityIssueSeverity.MEDIUM),
                RestrictedModule("ctypes", "Foreign function library", SecurityIssueSeverity.HIGH),
            ],
        )

    def execute_ai_code(
        self,
        code: str,
        user_id: str,
        ai_model: str,
        language: str = "python",
        timeout: int = 30,
    ) -> dict:
        """Execute AI-generated code with comprehensive security and monitoring."""
        execution_id = hashlib.sha256(f"{user_id}{time.time()}{code}".encode()).hexdigest()[:16]

        # Log the execution attempt
        log_entry = {
            "execution_id": execution_id,
            "user_id": user_id,
            "ai_model": ai_model,
            "language": language,
            "timestamp": time.time(),
            "code_length": len(code),
            "code_hash": hashlib.sha256(code.encode()).hexdigest(),
        }

        try:
            with SandboxSession(
                lang=language,
                security_policy=self.security_policy,
                runtime_configs={
                    "timeout": timeout,
                    "mem_limit": "256m",
                    "cpu_count": 1,
                    "network_mode": "none",
                    "read_only": True,
                },
            ) as session:
                # Security check before execution
                is_safe, violations = session.is_safe(code)
                if not is_safe:
                    log_entry["security_violations"] = [v.description for v in violations]
                    self.execution_log.append(log_entry)
                    return {
                        "success": False,
                        "execution_id": execution_id,
                        "error": "Security policy violations detected",
                        "violations": [
                            {"description": v.description, "severity": v.severity.name}
                            for v in violations
                        ],
                    }

                # Execute the code
                result = session.run(code)

                log_entry["success"] = result.exit_code == 0
                log_entry["execution_time"] = time.time() - log_entry["timestamp"]
                self.execution_log.append(log_entry)

                return {
                    "success": result.exit_code == 0,
                    "execution_id": execution_id,
                    "output": result.stdout[:5000],  # Limit output size
                    "error": result.stderr[:1000] if result.stderr else None,
                    "execution_time": log_entry["execution_time"],
                }

        except Exception as e:
            log_entry["error"] = str(e)
            self.execution_log.append(log_entry)
            return {
                "success": False,
                "execution_id": execution_id,
                "error": f"Execution failed: {e}",
            }

    def get_execution_stats(self, user_id: str | None = None) -> dict:
        """Get execution statistics, optionally filtered by user."""
        logs = self.execution_log
        if user_id:
            logs = [log for log in logs if log["user_id"] == user_id]

        if not logs:
            return {"total": 0}

        total = len(logs)
        successful = sum(1 for log in logs if log.get("success", False))
        violations = sum(1 for log in logs if "security_violations" in log)

        return {
            "total_executions": total,
            "successful_executions": successful,
            "security_violations": violations,
            "success_rate": successful / total if total > 0 else 0,
            "violation_rate": violations / total if total > 0 else 0,
        }


# Example usage
executor = SecureAICodeExecutor()

# Example AI-generated code execution
ai_code = """
import numpy as np
import matplotlib.pyplot as plt

# Generate data
x = np.linspace(0, 2*np.pi, 100)
y = np.sin(x)

# Create plot
plt.figure(figsize=(10, 6))
plt.plot(x, y, 'b-', linewidth=2, label='sin(x)')
plt.xlabel('x')
plt.ylabel('sin(x)')
plt.title('Sine Wave Generated by AI')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print("Successfully generated sine wave plot!")
"""

result = executor.execute_ai_code(
    code=ai_code,
    user_id="ai_agent_001",
    ai_model="gpt-4",
    language="python",
)

print(f"Execution successful: {result['success']}")
if result['success']:
    print("Output:", result['output'])
else:
    print("Error:", result['error'])

# Get statistics
stats = executor.get_execution_stats()
print(f"Success rate: {stats['success_rate']:.2%}")
Performance Optimization¶
Parallel AI Code Processing¶
from llm_sandbox import SandboxSession
import concurrent.futures
import time


class ParallelAICodeProcessor:
    """Process multiple AI-generated code snippets in parallel."""

    def __init__(self, max_workers: int = 4):
        self.max_workers = max_workers

    def process_code_batch(self, code_tasks: list) -> list:
        """Process multiple code tasks in parallel."""

        def execute_single_task(task):
            task_id, code, language = task["id"], task["code"], task.get("language", "python")
            start_time = time.time()
            try:
                with SandboxSession(
                    lang=language,
                    runtime_configs={"timeout": 30, "mem_limit": "128m"},
                ) as session:
                    result = session.run(code)
                    return {
                        "task_id": task_id,
                        "success": result.exit_code == 0,
                        "output": result.stdout,
                        "error": result.stderr,
                        "execution_time": time.time() - start_time,
                    }
            except Exception as e:
                return {
                    "task_id": task_id,
                    "success": False,
                    "output": "",
                    "error": str(e),
                    "execution_time": time.time() - start_time,
                }

        # Execute tasks in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_task = {
                executor.submit(execute_single_task, task): task
                for task in code_tasks
            }
            results = []
            for future in concurrent.futures.as_completed(future_to_task):
                results.append(future.result())

        return sorted(results, key=lambda x: x["task_id"])


# Example usage
processor = ParallelAICodeProcessor(max_workers=3)

# Batch of AI-generated code tasks
code_tasks = [
    {
        "id": 1,
        "code": "print('Task 1: Hello from AI!')\nprint(sum(range(100)))",
        "language": "python",
    },
    {
        "id": 2,
        "code": "import math\nprint(f'Task 2: Pi = {math.pi:.6f}')",
        "language": "python",
    },
    {
        "id": 3,
        "code": "console.log('Task 3: JavaScript execution')\nconsole.log(Array.from({length: 10}, (_, i) => i * 2))",
        "language": "javascript",
    },
]

results = processor.process_code_batch(code_tasks)

for result in results:
    print(f"Task {result['task_id']}: {'✓' if result['success'] else '✗'}")
    print(f"  Execution time: {result['execution_time']:.3f}s")
    if result['success']:
        print(f"  Output: {result['output'][:100]}...")
    else:
        print(f"  Error: {result['error']}")
Best Practices¶
1. Code Validation Pipeline¶
import re

from llm_sandbox import SandboxSession
import openai


class AICodeValidator:
    """Validate AI-generated code before execution."""

    def __init__(self, api_key: str):
        self.client = openai.OpenAI(api_key=api_key)

    def validate_and_improve_code(self, code: str, requirements: str) -> dict:
        """Validate code and suggest improvements."""
        validation_prompt = f"""
Review this code for:
1. Syntax errors
2. Logic issues
3. Security concerns
4. Performance problems
5. Best practices compliance

Requirements: {requirements}

Code: {code}

Provide:
- Issues found (if any)
- Improved version of the code
- Explanation of changes
"""

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a senior code reviewer. Identify issues and provide improved code."},
                {"role": "user", "content": validation_prompt},
            ],
        )
        review_result = response.choices[0].message.content

        # Test both the original and the improved code
        original_test = self._test_code(code)

        # Extract improved code from the review (simplified extraction)
        improved_code = self._extract_improved_code(review_result)
        improved_test = self._test_code(improved_code) if improved_code else None

        return {
            "original_code": code,
            "review_feedback": review_result,
            "improved_code": improved_code,
            "original_test_result": original_test,
            "improved_test_result": improved_test,
            "improvement_successful": bool(improved_test and improved_test["success"]),
        }

    def _test_code(self, code: str) -> dict:
        """Test code execution."""
        try:
            with SandboxSession(lang="python") as session:
                result = session.run(code)
                return {
                    "success": result.exit_code == 0,
                    "output": result.stdout,
                    "error": result.stderr,
                }
        except Exception as e:
            return {"success": False, "output": "", "error": str(e)}

    def _extract_improved_code(self, review_text: str) -> str | None:
        """Extract improved code from the review text."""
        # Simple extraction: take the last ```python code block
        code_blocks = re.findall(r'```python\n(.*?)\n```', review_text, re.DOTALL)
        return code_blocks[-1] if code_blocks else None


# Example usage
validator = AICodeValidator("your-api-key")

ai_generated_code = """
def calculate_average(numbers):
    return sum(numbers) / len(numbers)

numbers = [1, 2, 3, 4, 5]
print(calculate_average(numbers))
"""

validation = validator.validate_and_improve_code(
    code=ai_generated_code,
    requirements="Function should handle edge cases like empty lists and non-numeric inputs",
)

print("Validation Results:")
print("Original successful:", validation["original_test_result"]["success"])
print("Improvement successful:", validation["improvement_successful"])
print("Review feedback:", validation["review_feedback"])
Next Steps¶
- Learn about Security Best Practices for AI-generated code
- Explore Configuration Options for production deployments
- Check API Reference for advanced features
- Read about Contributing to extend functionality
For more examples and use cases, visit our GitHub repository.