Skip to main content
Follow these steps to set up your account and integrate with our API.
1

Sign Up and Create an API Key

  1. Visit Chunkr AI
  2. Click on “Login” and create your account
  3. Once logged in, navigate to “API Keys” in the dashboard
2

Install our client SDK

The Python SDK is currently in alpha. The --pre flag is required to install pre-release versions.
pip install --pre chunkr-ai
3

Parse a document

import os
import time

from chunkr_ai import Chunkr
from pydantic import BaseModel

# Initialize the client; the API key is read from the CHUNKR_API_KEY
# environment variable (a KeyError is raised if it is not set)
client = Chunkr(api_key=os.environ["CHUNKR_API_KEY"])

# Create a parse task using a file URL
parse_task = client.tasks.parse.create(
    file="https://s3.us-east-1.amazonaws.com/chunkr-web/uploads/invoice.pdf"
)

# Alternatively, upload a local file first
# with open('path/to/doc.pdf', 'rb') as f:
#     uploaded = client.files.create(file=f)
# parse_task = client.tasks.parse.create(file=uploaded.url)

# Wait for parse task to complete. Sleep *before* each poll so the loop
# exits as soon as a completed status is seen, with no trailing delay.
while not parse_task.completed:
    time.sleep(3)
    parse_task = client.tasks.parse.get(task_id=parse_task.task_id)
    print(f"Parse Status: {parse_task.status}")

if parse_task.status == "Succeeded":
    # Do something with the output
    pass
else:  # Could be "Failed" or "Cancelled"
    print(f"Parse Status: {parse_task.status}")
4

Extract structured data

# Schema of the fields to extract from the parsed invoice. Its JSON schema is
# passed to the extract task, and the extract results are validated against it.
class Invoice(BaseModel):
    invoice_number: str
    invoice_date: str  # kept as a plain string, not a date type
    total_amount: float

# Use the parse task ID to create an extract task
extract_task = client.tasks.extract.create(
    file=parse_task.task_id,
    schema=Invoice.model_json_schema(),  # Convert Pydantic model to JSON schema
)

# Wait for extract task to complete. Sleep *before* each poll so the loop
# exits as soon as a completed status is seen, with no trailing delay.
while not extract_task.completed:
    time.sleep(3)
    extract_task = client.tasks.extract.get(task_id=extract_task.task_id)
    print(f"Extract Status: {extract_task.status}")
5

Explore Output

# Preview the parse results: up to 200 characters from each of the first 5 chunks
parsed = parse_task.output
if parsed is not None:
    for chunk in parsed.chunks[:5]:
        text = chunk.content
        if text is None:
            continue
        print(text[:200])

# Show the extracted invoice, but only for a successful task that has output
extraction_ok = extract_task.status == "Succeeded"
if extraction_ok and extract_task.output is not None:
    # Check the raw results against the Invoice schema
    invoice = Invoice.model_validate(extract_task.output.results)
    # Do something with the invoice
    print(invoice)
You can also explore the output through our web interface in more detail.
import os
import time

from chunkr_ai import Chunkr
from pydantic import BaseModel

# Initialize the client; the API key is read from the CHUNKR_API_KEY
# environment variable (a KeyError is raised if it is not set)
client = Chunkr(api_key=os.environ["CHUNKR_API_KEY"])

# Create a parse task using a file URL
parse_task = client.tasks.parse.create(
    file="https://s3.us-east-1.amazonaws.com/chunkr-web/uploads/invoice.pdf"
)

# Alternatively, upload a local file first
# with open('path/to/doc.pdf', 'rb') as f:
#     uploaded = client.files.create(file=f)
# parse_task = client.tasks.parse.create(file=uploaded.url)

# Wait for parse task to complete. Sleep *before* each poll so the loop
# exits as soon as a completed status is seen, with no trailing delay.
while not parse_task.completed:
    time.sleep(3)
    parse_task = client.tasks.parse.get(task_id=parse_task.task_id)
    print(f"Parse Status: {parse_task.status}")

if parse_task.status == "Succeeded":
    # Do something with the output
    pass
else:  # Could be "Failed" or "Cancelled"
    print(f"Parse Status: {parse_task.status}")

# Schema of the fields to extract from the parsed invoice. Its JSON schema is
# passed to the extract task, and the extract results are validated against it.
class Invoice(BaseModel):
    invoice_number: str
    invoice_date: str  # kept as a plain string, not a date type
    total_amount: float


# Use the parse task ID to create an extract task
extract_task = client.tasks.extract.create(
    file=parse_task.task_id,
    schema=Invoice.model_json_schema(),  # Convert Pydantic model to JSON schema
)

# Wait for extract task to complete. Sleep *before* each poll so the loop
# exits as soon as a completed status is seen, with no trailing delay.
while not extract_task.completed:
    time.sleep(3)
    extract_task = client.tasks.extract.get(task_id=extract_task.task_id)
    print(f"Extract Status: {extract_task.status}")

# Preview the parse results: up to 200 characters from each of the first 5 chunks
parsed = parse_task.output
if parsed is not None:
    for chunk in parsed.chunks[:5]:
        text = chunk.content
        if text is None:
            continue
        print(text[:200])

# Show the extracted invoice, but only for a successful task that has output
extraction_ok = extract_task.status == "Succeeded"
if extraction_ok and extract_task.output is not None:
    # Check the raw results against the Invoice schema
    invoice = Invoice.model_validate(extract_task.output.results)
    # Do something with the invoice
    print(invoice)

Next Steps

Get to production with our task system and webhooks.
I