# Docling How-To Guides
Step-by-step solutions for specific tasks: OCR configuration, table extraction, RAG integration, and performance optimization.
## How to Configure OCR Engines
Docling supports multiple OCR backends. Choose based on your needs:
### EasyOCR (Multi-language)
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = EasyOcrOptions(
    lang=["en", "fr", "de"],  # Languages to detect
    use_gpu=True,
    confidence_threshold=0.5,
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
```

Install: `pip install "docling[easyocr]"`
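Continuing from the snippet above, conversion then works exactly as with the default pipeline (the file name is a placeholder):

```python
# Convert a multilingual PDF with the EasyOCR-configured converter
result = converter.convert("multilingual.pdf")  # placeholder path
print(result.document.export_to_markdown())
```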
### Tesseract (System OCR)
```python
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractOcrOptions

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = TesseractOcrOptions()

# Requires a system Tesseract installation:
#   macOS:  brew install tesseract
#   Ubuntu: apt-get install tesseract-ocr
```

Install: `pip install "docling[tesserocr]"`
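The Tesseract options plug into a converter the same way as in the EasyOCR example; a minimal sketch, continuing from the snippet above:

```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
```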
### RapidOCR (Default, ONNX-based)
```python
import os

from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from huggingface_hub import snapshot_download

# Download custom models from Hugging Face
download_path = snapshot_download(repo_id="SWHL/RapidOCR")

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = RapidOcrOptions(
    det_model_path=os.path.join(download_path, "PP-OCRv4", "en_PP-OCRv3_det_infer.onnx"),
    rec_model_path=os.path.join(download_path, "PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx"),
)
```
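Wiring is identical to the other engines; continuing from the snippet above:

```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("document.pdf")  # placeholder path
```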
## How to Process Scanned PDFs

Force full-page OCR for scanned documents where text extraction fails:
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Force OCR on every page (for scanned documents)
pipeline_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

result = converter.convert("scanned_document.pdf")
print(result.document.export_to_markdown())
```

## How to Extract Tables to CSV/Excel
```python
from docling.document_converter import DocumentConverter
import pandas as pd

converter = DocumentConverter()
result = converter.convert("report.pdf")

# Extract all tables
for i, table in enumerate(result.document.tables):
    # To DataFrame
    df = table.export_to_dataframe()

    # Save as CSV
    df.to_csv(f"table_{i}.csv", index=False)

    # Save as Excel
    df.to_excel(f"table_{i}.xlsx", index=False)

    # Or get as Markdown
    print(table.export_to_markdown())
```
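To collect every table into a single workbook instead of one file per table, pandas' `ExcelWriter` works well; a sketch continuing from the snippet above (sheet names are arbitrary):

```python
import pandas as pd

# One workbook, one sheet per extracted table
with pd.ExcelWriter("tables.xlsx") as writer:
    for i, table in enumerate(result.document.tables):
        table.export_to_dataframe().to_excel(writer, sheet_name=f"table_{i}", index=False)
```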
## How to Extract Invoice Data

Convert the invoice to Markdown, then use an LLM to extract structured fields:
```python
import json

from docling.document_converter import DocumentConverter
from openai import OpenAI

# Step 1: Convert PDF to Markdown
converter = DocumentConverter()
result = converter.convert("invoice.pdf")
markdown = result.document.export_to_markdown()

# Step 2: Use LLM to extract structured data
client = OpenAI()
response = client.chat.completions.create(
    model="gpt-5",  # or your preferred model
    messages=[{
        "role": "user",
        # Note the doubled braces: literal JSON braces must be escaped in an f-string
        "content": f"""Extract invoice data as JSON:
{{"invoice_number": str, "date": str, "vendor": str, "total": float,
 "line_items": [{{"description": str, "qty": int, "price": float}}]}}

Document: {markdown}""",
    }],
    response_format={"type": "json_object"},
)

invoice_data = json.loads(response.choices[0].message.content)
print(json.dumps(invoice_data, indent=2))
```
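LLM output can drift from the requested schema, so it is worth validating the parsed JSON before relying on it. A minimal sketch continuing from the snippet above, using pydantic (the model classes are illustrative, not part of Docling):

```python
from pydantic import BaseModel, ValidationError

class LineItem(BaseModel):
    description: str
    qty: int
    price: float

class Invoice(BaseModel):
    invoice_number: str
    date: str
    vendor: str
    total: float
    line_items: list[LineItem]

try:
    invoice = Invoice.model_validate(invoice_data)  # pydantic v2 API
except ValidationError as err:
    print(f"LLM output did not match the expected schema: {err}")
```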
## How to Enable GPU Acceleration

### NVIDIA CUDA
```python
# Install with CUDA support:
#   pip install docling[cuda]

# Use vLLM for fast inference
from docling.datamodel import vlm_model_specs
from docling.datamodel.pipeline_options import VlmPipelineOptions

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.GRANITEDOCLING_VLLM
)
```

### Apple Silicon (M1/M2/M3)
```python
# Install with MLX support:
#   pip install "docling[vlm]" mlx

# Use MLX-optimized model
from docling.datamodel import vlm_model_specs
from docling.datamodel.pipeline_options import VlmPipelineOptions

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.GRANITEDOCLING_MLX
)
```
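In both cases the options are consumed by a `DocumentConverter` running Docling's VLM pipeline; a minimal sketch based on Docling's `VlmPipeline` API (shown with the vLLM spec):

```python
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions(vlm_options=vlm_model_specs.GRANITEDOCLING_VLLM)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,  # run the VLM pipeline instead of the default PDF pipeline
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("document.pdf")  # placeholder path
print(result.document.export_to_markdown())
```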
## How to Integrate with LangChain

```python
from langchain_docling import DoclingLoader

# Load documents using Docling
loader = DoclingLoader(file_path="document.pdf")
documents = loader.load()

# Use with LangChain
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
chunks = text_splitter.split_documents(documents)

# Create vector store
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(chunks, embeddings)

# Query
results = vectorstore.similarity_search("What is the total revenue?")
```

Install: `pip install langchain-docling`
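Instead of `RecursiveCharacterTextSplitter`, DoclingLoader can emit Docling's structure-aware chunks directly; a sketch assuming the langchain-docling package's `ExportType` and Docling's `HybridChunker`:

```python
from docling.chunking import HybridChunker
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

# One LangChain document per structure-aware Docling chunk
loader = DoclingLoader(
    file_path="document.pdf",
    export_type=ExportType.DOC_CHUNKS,
    chunker=HybridChunker(),
)
chunks = loader.load()
```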
## How to Integrate with LlamaIndex

```python
from llama_index.core import VectorStoreIndex
from llama_index.readers.docling import DoclingReader

# Load with Docling reader
reader = DoclingReader()
documents = reader.load_data(file_path="document.pdf")

# Create index
index = VectorStoreIndex.from_documents(documents)

# Query
query_engine = index.as_query_engine()
response = query_engine.query("Summarize the key findings")
print(response)
```

## How to Handle Large Documents
For documents with hundreds of pages, configure the pipeline for memory efficiency and stream output to disk (a batched, page-range variant is sketched after the example below):
```python
import gc

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Configure for memory efficiency
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 0.5  # Reduce image resolution

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

# Process large document
result = converter.convert("large_document.pdf")

# Stream output to disk to avoid holding large strings in memory
with open("output.md", "w") as f:
    f.write(result.document.export_to_markdown())

# Clean up
del result
gc.collect()
```
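If a single pass is still too heavy, recent Docling releases accept a `page_range` argument to `convert`, which allows processing in slices; a sketch continuing from the converter above (the total page count is a placeholder, and `page_range` support depends on your Docling version):

```python
# Convert a large PDF in 50-page slices
TOTAL_PAGES = 500  # placeholder: substitute your document's actual length

for start in range(1, TOTAL_PAGES + 1, 50):
    result = converter.convert("large_document.pdf", page_range=(start, start + 49))
    with open(f"output_{start:04d}.md", "w") as f:
        f.write(result.document.export_to_markdown())
    del result
    gc.collect()
```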