1
Sign Up and Create an API Key
- Visit Chunkr AI
- Click on “Login” and create your account
- Once logged in, navigate to “API Keys” in the dashboard
2
Install our client SDK
The Python SDK is currently in alpha. The `--pre` flag is required to install pre-release versions.
Copy
Ask AI
pip install --pre chunkr-ai
3
Parse a document
Copy
Ask AI
import os
import time
from chunkr_ai import Chunkr
from pydantic import BaseModel
# Read the API key from the environment and build the client
api_key = os.environ["CHUNKR_API_KEY"]
client = Chunkr(api_key=api_key)

# Start a parse task for a document that is reachable by URL
document_url = "https://s3.us-east-1.amazonaws.com/chunkr-web/uploads/invoice.pdf"
parse_task = client.tasks.parse.create(file=document_url)

# For a local file, upload it first and parse the uploaded file's URL:
# with open('path/to/doc.pdf', 'rb') as f:
#     uploaded = client.files.create(file=f)
#     parse_task = client.tasks.parse.create(file=uploaded.url)

# Refresh the task every 3 seconds until it reports completion
while not parse_task.completed:
    parse_task = client.tasks.parse.get(task_id=parse_task.task_id)
    print(f"Parse Status: {parse_task.status}")
    time.sleep(3)

if parse_task.status != "Succeeded":
    # Could be "Failed" or "Cancelled"
    print(f"Parse Status: {parse_task.status}")
else:
    # Do something with the output
    pass
4
Extract structured data
Copy
Ask AI
# Schema listing the fields to extract from the parsed invoice
class Invoice(BaseModel):
    invoice_number: str
    invoice_date: str
    total_amount: float


# Convert the Pydantic model to a JSON schema, then create an extract task
# that refers to the parse task by its ID
invoice_schema = Invoice.model_json_schema()
extract_task = client.tasks.extract.create(
    file=parse_task.task_id,
    schema=invoice_schema,
)

# Refresh the extract task every 3 seconds until it reports completion
while not extract_task.completed:
    extract_task = client.tasks.extract.get(task_id=extract_task.task_id)
    print(f"Extract Status: {extract_task.status}")
    time.sleep(3)
5
Explore Output
Copy
Ask AI
# Print up to 200 characters from each of the first 5 parsed chunks
parse_output = parse_task.output
if parse_output is not None:
    for chunk in parse_output.chunks[:5]:
        if chunk.content is None:
            # Nothing to print for this chunk
            continue
        print(chunk.content[:200])

# Print the extracted fields once the extract task has succeeded
if extract_task.status == "Succeeded":
    extract_output = extract_task.output
    if extract_output is not None:
        # Validate the results against the schema
        invoice = Invoice.model_validate(extract_output.results)
        # Do something with the invoice
        print(invoice)
Full example (copy-paste)
Full example (copy-paste)
Copy
Ask AI
import os
import time
from chunkr_ai import Chunkr
from pydantic import BaseModel
# Schema listing the fields to extract from the parsed invoice.
# Defined up front so the whole script reads top to bottom:
# schema -> parse -> extract -> output.
class Invoice(BaseModel):
    invoice_number: str
    invoice_date: str
    total_amount: float


# Initialize the client (the API key is read from the CHUNKR_API_KEY environment variable)
client = Chunkr(api_key=os.environ["CHUNKR_API_KEY"])

# Create a parse task using a file URL
parse_task = client.tasks.parse.create(
    file="https://s3.us-east-1.amazonaws.com/chunkr-web/uploads/invoice.pdf"
)

# Alternatively, upload a local file first
# with open('path/to/doc.pdf', 'rb') as f:
#     uploaded = client.files.create(file=f)
#     parse_task = client.tasks.parse.create(file=uploaded.url)

# Wait for parse task to complete, refreshing its state every 3 seconds
while not parse_task.completed:
    parse_task = client.tasks.parse.get(task_id=parse_task.task_id)
    print(f"Parse Status: {parse_task.status}")
    time.sleep(3)

# Everything below builds on the parse task, so stop here (with a non-zero
# exit code) instead of creating an extract task from an unsuccessful parse.
if parse_task.status != "Succeeded":
    # Could be "Failed" or "Cancelled"
    raise SystemExit(f"Parse Status: {parse_task.status}")

# Use the parse task ID to create an extract task
extract_task = client.tasks.extract.create(
    file=parse_task.task_id,
    schema=Invoice.model_json_schema(),  # Convert Pydantic model to JSON schema
)

# Wait for extract task to complete, refreshing its state every 3 seconds
while not extract_task.completed:
    extract_task = client.tasks.extract.get(task_id=extract_task.task_id)
    print(f"Extract Status: {extract_task.status}")
    time.sleep(3)

# Get parse results and print first 5 chunk contents (up to 200 characters each)
if parse_task.output is not None:
    for chunk in parse_task.output.chunks[:5]:
        if chunk.content is not None:
            print(chunk.content[:200])

# Get extract results and print schema fields
if extract_task.status == "Succeeded" and extract_task.output is not None:
    # Validate the results against the schema
    invoice = Invoice.model_validate(extract_task.output.results)
    # Do something with the invoice
    print(invoice)