import os
import time
from chunkr_ai import Chunkr
from pydantic import BaseModel
# Initialize the client. The API key is read from the CHUNKR_API_KEY
# environment variable; a missing variable raises KeyError right here.
client = Chunkr(api_key=os.environ["CHUNKR_API_KEY"])

# Seconds to wait between two status fetches of the parse task.
PARSE_POLL_SECONDS = 3

# Create a parse task using a file URL
parse_task = client.tasks.parse.create(
    file="https://s3.us-east-1.amazonaws.com/chunkr-web/uploads/invoice.pdf",
)

# Alternatively, upload a local file first
# with open('path/to/doc.pdf', 'rb') as f:
#     uploaded = client.files.create(file=f)
# parse_task = client.tasks.parse.create(file=uploaded.url)

# Wait for parse task to complete. Sleep *before* each re-fetch so that no
# delay is spent after the task has already been observed as completed.
while not parse_task.completed:
    time.sleep(PARSE_POLL_SECONDS)
    parse_task = client.tasks.parse.get(task_id=parse_task.task_id)
    print(f"Parse Status: {parse_task.status}")

if parse_task.status != "Succeeded":  # Could be "Failed" or "Cancelled"
    print(f"Parse Status: {parse_task.status}")
    # Stop here: the rest of the script builds on this parse task, so
    # continuing after a failed parse would only produce follow-up errors.
    raise SystemExit(1)

# Parse succeeded: do something with the output here.
# Schema describing the fields to pull out of the parsed invoice.
# Documented with comments rather than a class docstring so the JSON schema
# generated by `Invoice.model_json_schema()` stays exactly as before.
class Invoice(BaseModel):
    # Identifier of the invoice, kept as text.
    invoice_number: str
    # Date of the invoice, kept as text (no date parsing is applied here).
    invoice_date: str
    # Invoice total as a floating-point number.
    total_amount: float
# Seconds to wait between two status fetches of the extract task.
EXTRACT_POLL_SECONDS = 3

# Create an extract task. The parse task's ID is passed as the `file`
# argument (presumably so extraction works on the already-parsed document;
# confirm against the Chunkr API reference).
extract_task = client.tasks.extract.create(
    file=parse_task.task_id,
    schema=Invoice.model_json_schema(),  # Convert Pydantic model to JSON schema
)

# Wait for extract task to complete. Sleep *before* each re-fetch so that no
# delay is spent after the task has already been observed as completed.
while not extract_task.completed:
    time.sleep(EXTRACT_POLL_SECONDS)
    extract_task = client.tasks.extract.get(task_id=extract_task.task_id)
    print(f"Extract Status: {extract_task.status}")

# Get parse results and print first 5 chunk contents
# (each truncated to its first 200 characters).
if parse_task.output is not None:
    for chunk in parse_task.output.chunks[:5]:
        if chunk.content is not None:
            print(chunk.content[:200])

# Get extract results and print schema fields
if extract_task.status == "Succeeded" and extract_task.output is not None:
    # Validate the raw results against the Invoice model before using them.
    invoice = Invoice.model_validate(extract_task.output.results)
    # Do something with the invoice
    print(invoice)
else:
    # Report the outcome explicitly instead of ending without any output.
    print(f"No extraction results available (status: {extract_task.status})")