Claude Opus 4 — Extended thinking
pip install anthropic · Best for: agentic coding, multi-file changes
import anthropic
client = anthropic.Anthropic()
def generate_code(task: str, codebase_context: str = "") -> str:
    """Generate code using Claude Opus 4 with extended thinking."""
    messages = [
        {
            "role": "user",
            "content": f"""You are a senior software engineer.
Context from the codebase:
{codebase_context}
Task: {task}
Requirements:
- Write production-quality code with error handling
- Follow existing code patterns from the context
- Include type hints and docstrings
- Add inline comments for non-obvious logic"""
        }
    ]

    # Use extended thinking for complex tasks
    response = client.messages.create(
        model="claude-opus-4-20250514",
        max_tokens=16000,
        thinking={
            "type": "enabled",
            "budget_tokens": 10000  # Let the model reason deeply
        },
        messages=messages
    )
    # Extract the text response (thinking is internal)
    for block in response.content:
        if block.type == "text":
            return block.text
    return ""
GPT-5 — Structured outputs
pip install openai pydantic · Best for: structured analysis + generation
from openai import OpenAI
from pydantic import BaseModel
client = OpenAI()
class CodeResponse(BaseModel):
    code: str
    language: str
    explanation: str
    complexity: str  # O(n), O(n log n), etc.
    edge_cases: list[str]
response = client.beta.chat.completions.parse(
    model="gpt-5",
    messages=[
        {"role": "system", "content": "Generate code and analysis in the required format."},
        {"role": "user", "content": "Implement a thread-safe LRU cache with TTL support in Python"}
    ],
    response_format=CodeResponse,
)
result = response.choices[0].message.parsed
print(f"Complexity: {result.complexity}")
print(f"Edge cases: {result.edge_cases}")
print(result.code)
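Because CodeResponse is a Pydantic model, the parsed result serializes directly; note that .parsed can be None if the model refuses, so guard before use. A minimal sketch (the output filename is illustrative):

# Persist the structured analysis; filename is a placeholder
if result is not None:
    with open("code_review.json", "w") as f:
        f.write(result.model_dump_json(indent=2))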
Gemini 2.5 Pro — 1M context
pip install google-generativeai · Best for: algorithmic reasoning, whole-codebase analysis
import google.generativeai as genai
genai.configure(api_key="YOUR_API_KEY")
def generate_code(task: str, thinking: bool = True) -> str:
    """Generate code using Gemini 2.5 Pro with thinking mode."""
    model = genai.GenerativeModel("gemini-2.5-pro-preview-03-25")
    config = genai.GenerationConfig(
        temperature=0,
        max_output_tokens=8192,
    )
    if thinking:
        # Thinking-budget support depends on your SDK version; the newer
        # google-genai package exposes it via a ThinkingConfig type.
        config.thinking_config = {"thinking_budget": 8000}
    response = model.generate_content(
        f"""Solve this programming problem step by step.
{task}
Provide:
1. Your approach and reasoning
2. Clean, optimized code
3. Time and space complexity analysis
4. Test cases covering edge cases""",
        generation_config=config
    )
    return response.text
# Leverage 1M context for whole-codebase analysis
def analyze_codebase(files: dict[str, str], question: str) -> str:
    model = genai.GenerativeModel("gemini-2.5-pro-preview-03-25")
    context = "\n\n".join(f"--- {p} ---\n{c}" for p, c in files.items())
    response = model.generate_content(
        f"Codebase:\n\n{context}\n\nQuestion: {question}",
        generation_config=genai.GenerationConfig(temperature=0)
    )
    return response.text
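One way to assemble the files dict (the directory and glob pattern are illustrative):

from pathlib import Path

# Load every Python file in the repo; path and pattern are placeholders
files = {str(p): p.read_text() for p in Path("my_project").rglob("*.py")}
print(analyze_codebase(files, "Where is authentication handled, and which modules would a refactor touch?"))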
DeepSeek-V3 — API + self-hosted
pip install openai · Best for: budget, privacy-first deployments
from openai import OpenAI
# DeepSeek uses an OpenAI-compatible API
client = OpenAI(api_key="YOUR_DEEPSEEK_KEY", base_url="https://api.deepseek.com")
def generate_code(task: str) -> str:
    response = client.chat.completions.create(
        model="deepseek-chat",  # Points to DeepSeek-V3
        messages=[
            {"role": "system", "content": "You are an expert programmer."},
            {"role": "user", "content": task}
        ],
        temperature=0,
        max_tokens=4096,
    )
    return response.choices[0].message.content
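Because the endpoint is OpenAI-compatible, streaming works the usual way; a minimal sketch:

# Stream tokens as they arrive (standard OpenAI-style streaming)
def generate_code_streaming(task: str) -> str:
    chunks = []
    stream = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": task}],
        temperature=0,
        max_tokens=4096,
        stream=True,
    )
    for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.content:
            chunks.append(chunk.choices[0].delta.content)
            print(chunk.choices[0].delta.content, end="", flush=True)
    return "".join(chunks)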
# Self-hosted via vLLM for full privacy:
# vllm serve deepseek-ai/DeepSeek-V3 --tensor-parallel-size 8
def generate_code_self_hosted(task: str) -> str:
    local = OpenAI(api_key="not-needed", base_url="http://localhost:8000/v1")
    return local.chat.completions.create(
        model="deepseek-ai/DeepSeek-V3",
        messages=[{"role": "user", "content": task}],
        temperature=0, max_tokens=4096
    ).choices[0].message.content
Qwen2.5-Coder-32B — self-hosted
pip install transformers torch accelerate · Best for: self-hosted autocomplete, fine-tuning
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
model_name = "Qwen/Qwen2.5-Coder-32B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
def generate_code(task: str) -> str:
    messages = [
        {"role": "system", "content": "You are an expert programmer."},
        {"role": "user", "content": task},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=4096, do_sample=False, num_beams=1)
    return tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
# For production: serve with vLLM
# vllm serve Qwen/Qwen2.5-Coder-32B-Instruct --tensor-parallel-size 2
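Once served, vLLM exposes an OpenAI-compatible endpoint; a sketch assuming the default localhost:8000 port:

from openai import OpenAI

# Query the vLLM server; host and port are vLLM defaults, adjust as needed
vllm_client = OpenAI(api_key="not-needed", base_url="http://localhost:8000/v1")
completion = vllm_client.chat.completions.create(
    model="Qwen/Qwen2.5-Coder-32B-Instruct",
    messages=[{"role": "user", "content": "Write a function that merges two sorted lists."}],
    temperature=0,
    max_tokens=1024,
)
print(completion.choices[0].message.content)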
Codestral 25.01 — Fill-in-the-Middle
pip install mistralai · Best for: real-time autocomplete, IDE integration
from mistralai import Mistral
client = Mistral(api_key="YOUR_MISTRAL_KEY")
# Fill-in-the-Middle — Codestral's killer feature
def code_completion(prefix: str, suffix: str) -> str:
    response = client.fim.complete(
        model="codestral-latest",
        prompt=prefix,
        suffix=suffix,
        temperature=0,
        max_tokens=512,
    )
    return response.choices[0].message.content
prefix = '''def binary_search(arr: list[int], target: int) -> int:
    """Find target in sorted array. Returns index or -1."""
    left, right = 0, len(arr) - 1
    while left <= right:
'''

suffix = '''
    return -1

assert binary_search([1, 3, 5, 7, 9], 5) == 2
assert binary_search([1, 3, 5, 7, 9], 4) == -1
'''

middle = code_completion(prefix, suffix)
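The returned middle slots between the two fragments; stitching them back together gives the full function, and the asserts in the suffix double as a quick smoke test:

# Reassemble the completed source; running it would exercise the asserts
completed = prefix + middle + suffix
print(completed)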