How-To · Problem-oriented
Mistral OCR 3 How-To Guides
Step-by-step solutions for specific tasks: batch processing, invoice extraction, table parsing, and LLM integration.
How to Batch Process Multiple PDFs
Use the Batch API for 50% cost savings when processing many documents:
from mistralai import Mistral
import base64
import os
import time
from pathlib import Path
from typing import Optional

client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])


def process_batch(
    pdf_paths: list[str],
    poll_interval: float = 5.0,
    timeout: Optional[float] = None,
) -> dict:
    """Process multiple PDFs using the Batch API (50% cheaper).

    Args:
        pdf_paths: Paths of the PDF files to submit. Each document is keyed
            by its file stem (name without directory or extension), so the
            stems must be unique.
        poll_interval: Seconds to sleep between status checks.
        timeout: Maximum number of seconds to wait for the job to reach the
            "completed" status. ``None`` waits indefinitely.

    Returns:
        A mapping of document ID (file stem) to the OCR content of that
        document. Empty when ``pdf_paths`` is empty.

    Raises:
        ValueError: If two paths share the same file stem, which would make
            one result silently overwrite the other.
        TimeoutError: If ``timeout`` elapses before the job completes.
    """
    # NOTE(review): the `client.ocr.batch_process` / `client.ocr.get_batch`
    # calls and the {"type": "pdf", "data": ...} document shape are kept
    # exactly as this guide shows them -- confirm against the installed
    # mistralai SDK version.
    if not pdf_paths:
        # Nothing to do; avoid submitting an empty batch job.
        return {}

    # Prepare documents
    documents = []
    seen_ids = set()
    for path in pdf_paths:
        doc_id = Path(path).stem
        if doc_id in seen_ids:
            raise ValueError(
                f"Duplicate document ID {doc_id!r}: file stems must be unique"
            )
        seen_ids.add(doc_id)
        with open(path, "rb") as f:
            doc_data = base64.b64encode(f.read()).decode()
        documents.append({
            "id": doc_id,
            "document": {"type": "pdf", "data": doc_data},
        })

    # Submit batch job
    batch_job = client.ocr.batch_process(
        model="mistral-ocr-2512",
        documents=documents,
    )

    # Poll for results (in production, use webhooks)
    deadline = None if timeout is None else time.monotonic() + timeout
    while batch_job.status != "completed":
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Batch job {batch_job.id} did not complete within {timeout}s "
                f"(last status: {batch_job.status})"
            )
        time.sleep(poll_interval)
        batch_job = client.ocr.get_batch(batch_job.id)

    # Return results mapped by document ID
    return {r.id: r.content for r in batch_job.results}


# Usage
pdf_files = ["invoice1.pdf", "invoice2.pdf", "invoice3.pdf"]
results = process_batch(pdf_files)
for doc_id, content in results.items():
    print(f"--- {doc_id} ---")
    print(content[:500])

# Cost Comparison: Standard API = $2/1000 pages. Batch API = $1/1000 pages (50% savings).
How to Extract Invoice Data to JSON
Combine Mistral OCR 3 with an LLM for structured extraction:
from mistralai import Mistral
import json
import base64
import os

client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

# Target schema shown to the LLM. Kept as a plain (non-f) string so that its
# literal braces are not parsed as f-string replacement fields.
_INVOICE_SCHEMA = """{
  "invoice_number": string,
  "date": string (YYYY-MM-DD),
  "vendor": {"name": string, "address": string},
  "customer": {"name": string, "address": string},
  "line_items": [{"description": string, "quantity": number, "unit_price": number, "total": number}],
  "subtotal": number,
  "tax": number,
  "total": number
}"""


def extract_invoice(pdf_path: str) -> dict:
    """Extract structured invoice data from a PDF.

    The PDF is OCR'd first, then the OCR output is sent to a chat model
    with a JSON schema prompt and a JSON-object response format.

    Args:
        pdf_path: Path of the invoice PDF to read.

    Returns:
        The chat model's reply parsed with ``json.loads``.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # Step 1: OCR the document
    with open(pdf_path, "rb") as f:
        doc_data = base64.b64encode(f.read()).decode()

    ocr_response = client.ocr.process(
        model="mistral-ocr-2512",
        document={"type": "pdf", "data": doc_data},
    )
    markdown = ocr_response.content

    # Step 2: Extract structured data with LLM.
    # The schema and the document are interpolated as values, so braces that
    # appear in either of them are passed through verbatim.
    extraction_prompt = (
        "Extract invoice data as JSON:\n"
        f"{_INVOICE_SCHEMA}\n\n"
        "Document:\n"
        f"{markdown}"
    )

    chat_response = client.chat.complete(
        model="mistral-large-latest",
        messages=[{"role": "user", "content": extraction_prompt}],
        response_format={"type": "json_object"},
    )
    return json.loads(chat_response.choices[0].message.content)


# Usage
invoice = extract_invoice("invoice.pdf")
print(json.dumps(invoice, indent=2))

# How to Parse Tables to CSV/DataFrame
Extract HTML tables from OCR output and convert to pandas DataFrames:
from mistralai import Mistral
import pandas as pd
from bs4 import BeautifulSoup
import base64
import os
import re
from io import StringIO

client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])


def extract_tables(pdf_path: str) -> list[pd.DataFrame]:
    """Extract all tables from a PDF as DataFrames.

    Args:
        pdf_path: Path of the PDF to OCR.

    Returns:
        One DataFrame per ``<table>`` element found in the OCR output, in
        document order. Empty when the output contains no tables.
    """
    # OCR the document
    with open(pdf_path, "rb") as f:
        doc_data = base64.b64encode(f.read()).decode()

    response = client.ocr.process(
        model="mistral-ocr-2512",
        document={"type": "pdf", "data": doc_data},
    )

    # Extract HTML tables from markdown
    tables = []
    soup = BeautifulSoup(response.content, "html.parser")
    for table_html in soup.find_all("table"):
        # Convert HTML table to DataFrame. pandas deprecated passing literal
        # HTML strings to read_html (2.1+), so wrap the markup in StringIO.
        df = pd.read_html(StringIO(str(table_html)))[0]
        tables.append(df)
    return tables


# Usage
tables = extract_tables("report.pdf")
for i, df in enumerate(tables):
    print(f"Table {i+1}:")
    print(df)
    df.to_csv(f"table_{i+1}.csv", index=False)

# How to Handle Multi-Language Documents
Mistral OCR 3 automatically detects and handles multiple languages:
English
94.6%
Text accuracy
Chinese
86.1%
Text accuracy
Mixed
86.2%
Text accuracy
# Multi-language documents need no special configuration:
# Mistral OCR 3 auto-detects the language.
pdf_document = {"type": "pdf", "data": doc_data}
response = client.ocr.process(model="mistral-ocr-2512", document=pdf_document)

# The output will contain text in whatever language
# was present in the document.
print(response.content)

# Note: For best Chinese results, consider using documents with clear fonts. Accuracy drops for handwritten or stylized text.
How to Integrate with LLMs for Q&A
Use OCR output as context for LLM-powered document Q&A:
from mistralai import Mistral
import base64
import os

client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])


def ask_document(pdf_path: str, question: str) -> str:
    """Answer a question about a PDF by feeding its OCR output to an LLM.

    Args:
        pdf_path: Path of the PDF to read.
        question: The question to ask about the document.

    Returns:
        The message content of the first choice in the chat response.
    """
    # Read and base64-encode the PDF, then OCR it.
    with open(pdf_path, "rb") as pdf_file:
        encoded_pdf = base64.b64encode(pdf_file.read()).decode()

    ocr_result = client.ocr.process(
        model="mistral-ocr-2512",
        document={"type": "pdf", "data": encoded_pdf},
    )

    # The OCR output becomes the context for the chat model.
    system_message = {
        "role": "system",
        "content": "Answer questions based on the document provided.",
    }
    user_message = {
        "role": "user",
        "content": f"""Document:
{ocr_result.content}

Question: {question}""",
    }

    reply = client.chat.complete(
        model="mistral-large-latest",
        messages=[system_message, user_message],
    )
    return reply.choices[0].message.content


# Usage
answer = ask_document("contract.pdf", "What is the termination clause?")
print(answer)

# How to Handle Errors and Retries
Robust error handling for production use:
from mistralai import Mistral
# NOTE(review): import path kept as shown in this guide -- confirm that the
# installed mistralai SDK exposes these names from `mistralai.exceptions`.
from mistralai.exceptions import APIError, RateLimitError
import time
import base64
import os

client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])


def process_with_retry(pdf_path: str, max_retries: int = 3) -> str:
    """Process a PDF, retrying on rate limits and API errors.

    Rate-limit errors are retried with exponential backoff (1s, 2s, 4s, ...);
    other API errors are retried after a fixed 1s pause.

    Args:
        pdf_path: Path of the PDF to OCR.
        max_retries: Total number of attempts to make. Must be at least 1.

    Returns:
        The OCR content of the document.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        APIError: If the final attempt fails with a non-rate-limit API error.
        Exception: "Max retries exceeded" if every attempt was rate limited;
            the last ``RateLimitError`` is attached as ``__cause__``.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    with open(pdf_path, "rb") as f:
        doc_data = base64.b64encode(f.read()).decode()

    last_error = None
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            response = client.ocr.process(
                model="mistral-ocr-2512",
                document={"type": "pdf", "data": doc_data},
            )
        except RateLimitError as e:
            last_error = e
            if is_last_attempt:
                # No further attempt follows, so sleeping would only delay
                # the failure.
                break
            wait_time = 2 ** attempt  # Exponential backoff
            print(f"Rate limited. Waiting {wait_time}s...")
            time.sleep(wait_time)
        except APIError as e:
            if is_last_attempt:
                raise
            print(f"API error: {e}. Retrying...")
            time.sleep(1)
        else:
            return response.content

    # Chain the last rate-limit error so the root cause is not lost.
    raise Exception("Max retries exceeded") from last_error


# Usage
try:
    content = process_with_retry("document.pdf")
    print(content)
except Exception as e:
    print(f"Failed to process: {e}")