Here’s how to efficiently process multiple files using Chunkr AI’s async capabilities.

Process a Directory

Here’s a simple script that uploads every file in a directory (anything matching `*.*`) concurrently and saves each result as a JSON file:

import asyncio
from chunkr_ai import Chunkr
import os
from pathlib import Path

# Module-level client shared by every process_file() call below.
# NOTE(review): constructed with no arguments — presumably the SDK picks up
# its credentials from the environment; confirm against the Chunkr docs.
chunkr = Chunkr()

async def process_directory(input_dir: str, output_dir: str) -> None:
    """Process every file in ``input_dir`` concurrently.

    Each matching file is handed to ``process_file`` together with the
    module-level ``chunkr`` client, and all uploads are awaited together.
    Progress and errors are reported with ``print``; nothing is raised to
    the caller.

    Args:
        input_dir: Directory to scan (non-recursively) for entries matching
            ``*.*``.
        output_dir: Directory handed to ``process_file`` for its output;
            created if it does not already exist.
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # `*.*` also matches sub-directories whose names contain a dot;
        # keep regular files only so those are not uploaded.
        files = [
            path for path in Path(input_dir).glob('*.*') if path.is_file()
        ]
        print(f"Found {len(files)} files to process")

        # gather() schedules every coroutine as a task, so the files are
        # processed concurrently and results come back in input order.
        results = await asyncio.gather(
            *(process_file(chunkr, file_path, output_dir) for file_path in files)
        )

        # process_file returns None for a failed file, so count only the
        # entries that actually produced output.
        succeeded = sum(result is not None for result in results)
        print(f"Completed processing {succeeded} of {len(results)} files")
    except Exception as e:
        # Top-level boundary of the script: report and return rather than crash.
        print(f"Error processing directory: {e}")

async def process_file(chunkr, file_path, output_dir):
    """Upload one file and write its result as JSON into ``output_dir``.

    Args:
        chunkr: Client object exposing an awaitable ``upload(path)``.
        file_path: Path object of the file to upload; its ``name`` is used
            to build the output file name.
        output_dir: Directory in which ``<file name>.json`` is written.

    Returns:
        The file's name on success, or ``None`` when the upload reports a
        ``"Failed"`` status or any exception occurs (both cases are printed).
    """
    try:
        response = await chunkr.upload(file_path)

        if response.status == "Failed":
            # The service accepted the request but could not process the file.
            print(f"Failed to process file {file_path}: {response.message}")
        else:
            # Keep the original extension in the output name (e.g. a.pdf.json).
            name = file_path.name
            destination = Path(output_dir, f"{name}.json")
            response.json(destination)
            return name
    except Exception as exc:
        print(f"Error processing file {file_path}: {exc}")

    # Reached on either a failed status or an exception.
    return None


# Script entry point: process the sample dataset into ./processed/
if __name__ == "__main__":
    INPUT_DIR, OUTPUT_DIR = "/data/Chunkr/dataset/files", "processed/"
    asyncio.run(process_directory(input_dir=INPUT_DIR, output_dir=OUTPUT_DIR))