Once logged in, navigate to “API Keys” in the dashboard
2
Install our client SDK
The Python SDK is currently in alpha. The --pre flag is required to install pre-release versions.
pip install --pre chunkr-ai
3
Parse a document
import os
import time

from chunkr_ai import Chunkr
from pydantic import BaseModel

# Initialize the client; the API key is read from the CHUNKR_API_KEY
# environment variable
client = Chunkr(api_key=os.environ["CHUNKR_API_KEY"])

# Create a parse task using a file URL
parse_task = client.tasks.parse.create(
    file="https://s3.us-east-1.amazonaws.com/chunkr-web/uploads/invoice.pdf"
)

# Alternatively, upload a local file first
# with open('path/to/doc.pdf', 'rb') as f:
#     uploaded = client.files.create(file=f)
#     parse_task = client.tasks.parse.create(file=uploaded.url)

# Wait for parse task to complete, re-fetching it every 3 seconds
while not parse_task.completed:
    parse_task = client.tasks.parse.get(task_id=parse_task.task_id)
    print(f"Parse Status: {parse_task.status}")
    time.sleep(3)

if parse_task.status != "Succeeded":
    # Could be "Failed" or "Cancelled". Stop here instead of falling through:
    # extraction needs the ID of a successfully parsed document.
    raise SystemExit(f"Parse Status: {parse_task.status}")

# Parsing succeeded: do something with the output here
4
Extract structured data
# Fields to pull out of the parsed document
class Invoice(BaseModel):
    invoice_number: str
    invoice_date: str
    total_amount: float


# Convert Pydantic model to JSON schema
invoice_schema = Invoice.model_json_schema()

# Use the parse task ID to create an extract task
extract_task = client.tasks.extract.create(
    file=parse_task.task_id,
    schema=invoice_schema,
)

# Wait for extract task to complete, re-fetching it every 3 seconds
while not extract_task.completed:
    extract_task = client.tasks.extract.get(task_id=extract_task.task_id)
    print(f"Extract Status: {extract_task.status}")
    time.sleep(3)
5
Explore Output
# Get parse results and print the first 200 characters of the first 5 chunks
if parse_task.output is not None:
    for chunk in parse_task.output.chunks[:5]:
        # Skip chunks that carry no content
        if chunk.content is None:
            continue
        print(chunk.content[:200])

# Get extract results and print schema fields
if extract_task.status == "Succeeded" and extract_task.output is not None:
    # Validate the results against the schema
    invoice = Invoice.model_validate(extract_task.output.results)

    # Do something with the invoice
    print(invoice)
import os
import time

from chunkr_ai import Chunkr
from pydantic import BaseModel

# Initialize the client; the API key is read from the CHUNKR_API_KEY
# environment variable
client = Chunkr(api_key=os.environ["CHUNKR_API_KEY"])

# Create a parse task using a file URL
parse_task = client.tasks.parse.create(
    file="https://s3.us-east-1.amazonaws.com/chunkr-web/uploads/invoice.pdf"
)

# Alternatively, upload a local file first
# with open('path/to/doc.pdf', 'rb') as f:
#     uploaded = client.files.create(file=f)
#     parse_task = client.tasks.parse.create(file=uploaded.url)

# Wait for parse task to complete, re-fetching it every 3 seconds
while not parse_task.completed:
    parse_task = client.tasks.parse.get(task_id=parse_task.task_id)
    print(f"Parse Status: {parse_task.status}")
    time.sleep(3)

if parse_task.status != "Succeeded":
    # Could be "Failed" or "Cancelled". Stop here instead of falling through:
    # the extract task below needs the ID of a successfully parsed document.
    raise SystemExit(f"Parse Status: {parse_task.status}")

# Parsing succeeded: do something with the output here


class Invoice(BaseModel):
    invoice_number: str
    invoice_date: str
    total_amount: float


# Use the parse task ID to create an extract task
extract_task = client.tasks.extract.create(
    file=parse_task.task_id,
    schema=Invoice.model_json_schema(),  # Convert Pydantic model to JSON schema
)

# Wait for extract task to complete, re-fetching it every 3 seconds
while not extract_task.completed:
    extract_task = client.tasks.extract.get(task_id=extract_task.task_id)
    print(f"Extract Status: {extract_task.status}")
    time.sleep(3)

# Get parse results and print first 5 chunk contents
if parse_task.output is not None:
    for chunk in parse_task.output.chunks[:5]:
        if chunk.content is not None:
            print(chunk.content[:200])

# Get extract results and print schema fields
if extract_task.status == "Succeeded" and extract_task.output is not None:
    # Validate the results against the schema
    invoice = Invoice.model_validate(extract_task.output.results)

    # Do something with the invoice
    print(invoice)