§ 05 · Code
Four small programs, one per path: self-hosted Whisper, the lowest-latency commercial stream, the richest analysis pipeline, and a multimodal option.
Whisper large-v3 — self-hosted with faster-whisper.
whisper_self_hosted.py

```python
# Self-hosted Whisper with faster-whisper (CTranslate2 backend)
# pip install faster-whisper
from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="cuda", compute_type="float16")

segments, info = model.transcribe(
    "meeting.wav",
    beam_size=5,
    language="en",
    vad_filter=True,  # Skip silence for faster processing
    vad_parameters=dict(
        min_silence_duration_ms=500,
    ),
)

print(f"Detected language: {info.language} (prob: {info.language_probability:.2f})")
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
```
Deepgram Nova-3 — real-time streaming with diarisation.
deepgram_streaming.py

```python
# Deepgram Nova-3 real-time streaming
# pip install deepgram-sdk
import asyncio

from deepgram import DeepgramClient, LiveOptions, LiveTranscriptionEvents

async def transcribe_stream():
    dg = DeepgramClient("YOUR_API_KEY")
    connection = dg.listen.asynclive.v("1")

    async def on_message(self, result, **kwargs):
        transcript = result.channel.alternatives[0].transcript
        if transcript:
            print(f"[{result.start:.2f}s] {transcript}")

    connection.on(LiveTranscriptionEvents.Transcript, on_message)

    options = LiveOptions(
        model="nova-3",
        language="en",
        smart_format=True,
        diarize=True,
        encoding="linear16",
        sample_rate=16000,
    )
    await connection.start(options)

    # Stream audio chunks from microphone or file
    with open("call_recording.wav", "rb") as f:
        while chunk := f.read(4096):
            await connection.send(chunk)  # send is a coroutine on the async client
            # 4096 bytes of 16 kHz 16-bit mono audio is 0.128 s;
            # sleep that long per chunk to simulate real-time pacing
            await asyncio.sleep(0.128)
    await connection.finish()

asyncio.run(transcribe_stream())
```
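With `diarize=True`, speaker turns arrive as per-word labels inside each result rather than as separate events. A sketch of a handler that surfaces them; the field names follow Deepgram's documented response shape, so treat them as an assumption to verify against your SDK version:

```python
# Per-word speaker labels from a diarized streaming result (assumed shape)
async def on_message(self, result, **kwargs):
    alt = result.channel.alternatives[0]
    if not alt.transcript:
        return
    for word in alt.words:
        # word.speaker is the integer speaker index assigned by diarization;
        # punctuated_word is populated because smart_format=True
        print(f"[speaker {word.speaker}] {word.punctuated_word}")
```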
AssemblyAI Universal-2 — diarisation, chapters, sentiment, PII redaction.
assemblyai_universal2.py

```python
# AssemblyAI Universal-2 with speaker diarization
# pip install assemblyai
import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"

config = aai.TranscriptionConfig(
    speech_model=aai.SpeechModel.best,  # Universal-2
    speaker_labels=True,                # Diarization
    auto_chapters=True,                 # Chapter summaries
    entity_detection=True,              # PII detection
    sentiment_analysis=True,
)

transcriber = aai.Transcriber()
transcript = transcriber.transcribe("podcast_episode.mp3", config=config)

# Print with speaker labels
for utterance in transcript.utterances:
    print(f"Speaker {utterance.speaker}: {utterance.text}")

# Auto-generated chapters
for chapter in transcript.chapters:
    print(f"\n## {chapter.headline}")
    print(f"   {chapter.summary}")
    print(f"   [{chapter.start/1000:.0f}s - {chapter.end/1000:.0f}s]")
```
Gemini 2.5 Pro — transcription and reasoning in a single call.
gemini_transcribe.py

```python
# Gemini 2.5 Pro audio transcription + analysis
# pip install google-genai
from google import genai

client = genai.Client(api_key="YOUR_API_KEY")

audio_file = client.files.upload(file="earnings_call.mp3")

response = client.models.generate_content(
    model="gemini-2.5-pro",
    contents=[
        audio_file,
        """Transcribe this audio precisely, then provide:
        1. Full transcript with timestamps
        2. Key topics discussed
        3. Action items mentioned
        4. Overall sentiment per speaker""",
    ],
)
print(response.text)
```
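Free-form text is fine for reading, less so for piping into another system. The same call can be constrained to JSON through the SDK's generation config; a sketch, with the keys in the prompt being an assumption to adapt to your pipeline:

```python
# Same request, constrained to a machine-readable JSON response
from google.genai import types

response = client.models.generate_content(
    model="gemini-2.5-pro",
    contents=[
        audio_file,
        "Transcribe this audio and return JSON with keys "
        "'transcript', 'topics', and 'action_items'.",  # Hypothetical schema
    ],
    config=types.GenerateContentConfig(response_mime_type="application/json"),
)
print(response.text)  # A JSON string, ready for json.loads()
```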