> ## Documentation Index
> Fetch the complete documentation index at: https://docs.chunkr.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Create Extract Task

> Queues a document/parsed task for extraction and returns an `ExtractTaskResponse` with the
> assigned `task_id`, initial configuration, file metadata, and timestamps.
> The initial status is `Starting`.

Creates an extract task and returns its metadata immediately.


## OpenAPI

````yaml https://api.chunkr.ai/docs/openapi.json post /tasks/extract
# OpenAPI 3.1 document header: API metadata, server, global security and tags.
openapi: 3.1.0
info:
  title: Chunkr API
  description: >-
    API service for document layout analysis and chunking to convert document
    into RAG/LLM-ready data.
  contact:
    name: Chunkr
    url: https://chunkr.ai
    email: ishaan@lumina.sh
  license:
    # NOTE(review): the license name is an empty string — confirm whether a
    # real license name should be published here.
    name: ''
  version: 2.62.0
# Base URL against which the documented paths are resolved.
servers:
  - url: https://api.chunkr.ai
    description: Chunkr API
# No security requirement is declared at the document level; the operation
# under `paths` declares its own requirement.
security: []
# Tag groups; the operation in this document is filed under `Tasks`.
tags:
  - name: Files
    description: Endpoints for uploading and managing files
  - name: Health
    description: Endpoint for checking the health of the service.
  - name: Tasks
    description: Endpoints for uploading and managing tasks
  - name: Webhook
    description: Endpoints for managing webhooks
# Path items. This document describes a single operation: POST /tasks/extract.
paths:
  /tasks/extract:
    post:
      tags:
        - Tasks
      summary: Create Extract Task
      # The description names `ExtractTaskResponse` so that it agrees with the
      # schema referenced by the 200 response of this operation.
      description: >-
        Queues a document/parsed task for extraction and returns an
        `ExtractTaskResponse` with the

        assigned `task_id`, initial configuration, file metadata, and
        timestamps.

        The initial status is `Starting`.


        Creates an extract task and returns its metadata immediately.
      operationId: create_extract_task_route
      # The JSON body is mandatory and follows `CreateExtractForm`.
      requestBody:
        description: JSON request to create an extract task
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateExtractForm'
        required: true
      responses:
        # Success returns the task record as JSON.
        '200':
          description: Task created successfully.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ExtractTaskResponse'
        # All error responses below are plain-text strings.
        '400':
          description: Invalid request or unsupported file type
          content:
            text/plain:
              schema:
                type: string
        '401':
          description: Unauthorized
          content:
            text/plain:
              schema:
                type: string
        '429':
          description: Usage limit exceeded or rate limit exceeded
          content:
            text/plain:
              schema:
                type: string
        '500':
          description: Server error
          content:
            text/plain:
              schema:
                type: string
      # This operation requires the `api_key` security scheme.
      security:
        - api_key: []
components:
  schemas:
    # Request body for creating an extract task: every `ExtractConfiguration`
    # field plus the input `file` reference (required) and the optional
    # `expires_in` / `file_name` settings.
    CreateExtractForm:
      allOf:
        - $ref: '#/components/schemas/ExtractConfiguration'
        - type: object
          required:
            - file
          properties:
            # Optional (nullable) task lifetime in seconds.
            expires_in:
              type:
                - integer
                - 'null'
              format: int32
              description: >-
                The number of seconds until task is deleted.

                Expired tasks can **not** be updated, polled or accessed via web
                interface.
            # Single string field that accepts several input forms, listed in
            # the description.
            file:
              type: string
              description: >-
                The file to be extracted. Supported inputs:

                - `ch://files/{file_id}`: Reference to an existing file. Upload
                via the Files API

                - `http(s)://...`: Remote URL to fetch

                - `data:*;base64,...` or raw base64 string

                - `task_id`: Reference to an existing `parse` task.
            file_name:
              type:
                - string
                - 'null'
              description: >-
                The name of the file to be extracted. If not set a name will be
                generated.

                Can not be provided if the `file` is a `task_id`.
    # Task record for an extract task: identifiers, lifecycle timestamps,
    # status, the configuration used, input file info and the output (if any).
    ExtractTaskResponse:
      type: object
      required: [configuration, completed, created_at, file_info, message,
                 status, task_type, task_id, version_info]
      properties:
        completed:
          type: boolean
          description: >-
            True when the task reaches a terminal state i.e. `status` is
            `Succeeded` or `Failed` or `Cancelled`
        configuration:
          $ref: '#/components/schemas/ExtractConfiguration'
        created_at:
          type: string
          format: date-time
          description: The date and time when the task was created and queued.
        # Nullable timestamp.
        expires_at:
          type: [string, 'null']
          format: date-time
          description: The date and time when the task will expire.
        file_info:
          $ref: '#/components/schemas/FileInfo'
        # Nullable timestamp.
        finished_at:
          type: [string, 'null']
          format: date-time
          description: The date and time when the task was finished.
        # Deprecated field; the description points to `file_info.url`.
        input_file_url:
          type: [string, 'null']
          description: |-
            The presigned URL of the input file.
            Deprecated use `file_info.url` instead.
          deprecated: true
        message:
          type: string
          description: A message describing the task's status or any errors that occurred.
        # Either null or the extraction output.
        output:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/ExtractOutputResponse'
        parse_task_id:
          type: [string, 'null']
          description: The ID of the source `parse` task that was used for extraction
        # Nullable timestamp.
        started_at:
          type: [string, 'null']
          format: date-time
          description: The date and time when the task was started.
        status:
          $ref: '#/components/schemas/Status'
        task_id:
          type: string
          description: The unique identifier for the task.
        task_type:
          $ref: '#/components/schemas/TaskType'
        task_url:
          type: [string, 'null']
          description: The presigned URL of the task.
        version_info:
          $ref: '#/components/schemas/VersionInfo'
    # Extraction settings; referenced by both the request form
    # (`CreateExtractForm`) and the task record (`ExtractTaskResponse`).
    ExtractConfiguration:
      title: Extract
      type: object
      required: [schema]
      properties:
        # Either null or a `ParseConfiguration`.
        parse_configuration:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/ParseConfiguration'
              description: |-
                Optional configuration for the `parse` task.
                Can not be used if `file` is a `task_id`.
        # The only required property.
        schema:
          type: object
          description: The schema to be used for the extraction.
        system_prompt:
          type: [string, 'null']
          description: The system prompt to be used for the extraction.
          default: >-
            You are an expert at structured data extraction. You will be given
            parsed text from a document and should convert it into the given
            structure.
    # Metadata about the input file; only `url` is required, every other
    # property is nullable.
    FileInfo:
      type: object
      description: Information about the input file.
      required: [url]
      properties:
        mime_type:
          type: [string, 'null']
          description: The MIME type of the file.
        name:
          type: [string, 'null']
          description: The name of the file.
        # Non-negative 32-bit count, or null.
        page_count:
          type: [integer, 'null']
          format: int32
          minimum: 0
          description: The number of pages in the file.
        # Non-negative 32-bit count, or null.
        ss_cell_count:
          type: [integer, 'null']
          format: int32
          minimum: 0
          description: The number of cells in the file. Only used for spreadsheets.
        url:
          type: string
          description: The presigned URL/Base64 encoded URL of the input file.
    # Output payload of an extract task: `results` plus two structures that
    # the descriptions call mirrors of it (`citations`, `metrics`). All three
    # are required.
    ExtractOutputResponse:
      type: object
      title: Extract
      description: >-
        The processed results of a document extraction task.


        Shapes:

        - `results`: JSON matching the user-provided schema.

        - `citations`: mirror of `results`; only leaf positions (primitive or
        array-of-primitives) contain a `Vec<Citation>` supporting that field.

        - `metrics`: mirror of `results`; only leaf positions contain a
        `Metrics` object for that field.
      required:
        - results
        - citations
        - metrics
      # None of the properties below declares a `type`; per their descriptions
      # their shape follows the user-provided schema.
      properties:
        citations:
          description: >-
            Mirror of `results`; leaves are `Vec<Citation>` for the
            corresponding field


            Example:


            ```json

            {
              "field_name": [
                {
                  "citation_id": "abc1234",
                  "citation_type": "Segment",
                  "bboxes": [
                    {
                      "left": 10,
                      "top": 20,
                      "width": 100,
                      "height": 18
                    }
                  ],
                  "content": "Example content",
                  "segment_id": "seg_001",
                  "segment_type": "Text",
                  "page_number": 1,
                  "page_height": 297,
                  "page_width": 210,
                  "ss_ranges": ["A1:C10"],
                  "ss_sheet_name": "Sheet1"
                }
              ]
            }

            ```
        metrics:
          description: >-
            Mirror of `results`; leaves contain a `Metrics` object for the
            corresponding field


            Example:


            ```json

            { "field_name": { "confidence": "High" } }

            ```
        results:
          description: |-
            JSON data that matches the provided schema

            Example:

            ```json
            { "field_name": "value" }
            ```
    # Task lifecycle states.
    Status:
      type: string
      description: The status of the task.
      enum: [Starting, Processing, Succeeded, Failed, Cancelled]
    # Kind of task: parse or extract.
    TaskType:
      type: string
      enum: [Parse, Extract]
    # Server and client versions recorded for a task; both are required.
    VersionInfo:
      type: object
      description: Version information for the task.
      required: [server_version, client_version]
      properties:
        client_version:
          $ref: '#/components/schemas/ClientVersion'
          description: The version of the client.
        server_version:
          type: string
          description: The version of the server.
    # Configuration for a `parse` task. No property is required and each one
    # declares a default; `segment_processing` is the only property whose
    # schema also admits null.
    ParseConfiguration:
      type: object
      title: Parse
      properties:
        chunk_processing:
          oneOf:
            - $ref: '#/components/schemas/ChunkProcessing'
          # The tokenizer default uses the object form (`Enum: Word`) defined
          # by `TokenizerType`.
          default:
            ignore_headers_and_footers: null
            target_length: 4096
            tokenizer:
              Enum: Word
        error_handling:
          oneOf:
            - $ref: '#/components/schemas/ErrorHandlingStrategy'
          default: Fail
        ocr_strategy:
          oneOf:
            - $ref: '#/components/schemas/OcrStrategy'
          default: All
        # `PipelineType` is itself marked deprecated.
        pipeline:
          oneOf:
            - $ref: '#/components/schemas/PipelineType'
          default: Chunkr
        segment_processing:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/SegmentProcessing'
          default: null
        segmentation_strategy:
          oneOf:
            - $ref: '#/components/schemas/SegmentationStrategy'
          default: LayoutAnalysis
    # One of four variants: two bare strings (`Legacy`, `Unspecified`) and two
    # single-key objects (`ManualSdk`, `GeneratedSdk`) carrying a version string.
    ClientVersion:
      description: Represents different types of SDK clients and their versions
      oneOf:
        - title: Legacy
          type: string
          description: Legacy SDK without version information (< 0.3.3)
          enum: [Legacy]
        - title: ManualSdk
          type: object
          description: Version of the current manually-maintained SDK
          required: [ManualSdk]
          properties:
            ManualSdk:
              type: string
              description: Version of the current manually-maintained SDK
        - title: GeneratedSdk
          type: object
          description: Version of the auto-generated SDK
          required: [GeneratedSdk]
          properties:
            GeneratedSdk:
              type: string
              description: Version of the auto-generated SDK
        - title: Unspecified
          type: string
          description: Unspecified/raw API request without any client version headers
          enum: [Unspecified]
    # Chunking settings: target chunk length, tokenizer, and a deprecated
    # header/footer flag.
    ChunkProcessing:
      type: object
      description: Controls the setting for the chunking and post-processing of each chunk.
      properties:
        # Deprecated; the description points to `segment_processing.ignore`.
        ignore_headers_and_footers:
          type:
            - boolean
            - 'null'
          description: 'DEPRECATED: use `segment_processing.ignore` instead'
          deprecated: true
        target_length:
          type: integer
          format: int32
          description: >-
            The target number of words in each chunk. If 0, each chunk will
            contain a single segment.
          default: 4096
          minimum: 0
        tokenizer:
          oneOf:
            - $ref: '#/components/schemas/TokenizerType'
              description: The tokenizer to use for the chunking process.
          # `TokenizerType` only admits objects keyed by `Enum` or `String`,
          # so the default is written in the `Enum` object form. This matches
          # the `chunk_processing` default given in `ParseConfiguration`.
          default:
            Enum: Word
    # Two-valued switch for how processing errors are treated.
    ErrorHandlingStrategy:
      type: string
      enum: [Fail, Continue]
      description: >-
        Controls how errors are handled during processing:

        - `Fail`: Stops processing and fails the task when any error occurs

        - `Continue`: Attempts to continue processing despite non-critical
        errors (eg. LLM refusals etc.)
    # OCR mode: run on every page (`All`) or selectively (`Auto`).
    OcrStrategy:
      type: string
      enum: [All, Auto]
      description: >-
        Controls the Optical Character Recognition (OCR) strategy.

        - `All`: Processes all pages with OCR. (Latency penalty: ~0.5 seconds
        per page)

        - `Auto`: Selectively applies OCR only to pages with missing or
        low-quality text. When text layer is present the bounding boxes from the
        text layer are used.
    # Deprecated pipeline selector.
    PipelineType:
      type: string
      deprecated: true
      enum: [Azure, Chunkr]
    # Per-segment-type overrides. Every property is keyed by a segment type
    # name and is either null (the declared default) or a `GenerationConfig`.
    SegmentProcessing:
      type: object
      description: >-
        Configuration for how each document segment is processed and formatted.


        Each segment has sensible defaults, but you can override specific
        settings:

        - `format`: Output as `Html` or `Markdown`

        - `strategy`: `Auto` (rule-based), `LLM` (AI-generated), or `Ignore`
        (skip)

        - `crop_image`: Whether to crop images to segment bounds

        - `extended_context`: Use full page as context for LLM processing

        - `description`: Generate descriptions for segments


        **Defaults per segment type:** Check the documentation for more details.


        Only specify the fields you want to change - everything else uses the
        defaults.
      properties:
        Caption:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        Footnote:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        # NOTE(review): the description on this entry reads like an
        # implementation note rather than user-facing text — confirm whether
        # it should be published.
        FormRegion:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
              description: >-
                New segment types - must be Optional for backwards
                compatibility.
          default: null
        Formula:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        GraphicalItem:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        Legend:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        LineNumber:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        ListItem:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        Page:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        PageFooter:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        PageHeader:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        PageNumber:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        Picture:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        Table:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        Text:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        Title:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
        Unknown:
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationConfig'
          default: null
    # Segmentation mode: layout-element detection or one segment per page.
    SegmentationStrategy:
      type: string
      enum: [LayoutAnalysis, Page]
      description: >-
        Controls the segmentation strategy:

        - `LayoutAnalysis`: Analyzes pages for layout elements (e.g., `Table`,
        `Picture`, `Formula`, etc.) using bounding boxes. Provides fine-grained
        segmentation and better chunking.

        - `Page`: Treats each page as a single segment. Faster processing, but
        without layout element detection and only simple chunking.
    # Tokenizer selector with two object variants: `{Enum: <Tokenizer>}` for a
    # predefined tokenizer, or `{String: <model id>}` for a Hugging Face one.
    TokenizerType:
      oneOf:
        - title: Enum
          type: object
          description: Use one of the predefined tokenizer types
          required: [Enum]
          properties:
            Enum:
              $ref: '#/components/schemas/Tokenizer'
              description: Use one of the predefined tokenizer types
        - title: String
          type: object
          description: |-
            Use any Hugging Face tokenizer by specifying its model ID
            Examples: "Qwen/Qwen-tokenizer", "facebook/bart-large"
          required: [String]
          properties:
            String:
              type: string
              description: |-
                Use any Hugging Face tokenizer by specifying its model ID
                Examples: "Qwen/Qwen-tokenizer", "facebook/bart-large"
      description: >-
        Specifies which tokenizer to use for the chunking process.


        This type supports two ways of specifying a tokenizer:

        1. Using a predefined tokenizer from the `Tokenizer` enum

        2. Using any Hugging Face tokenizer by providing its model ID as a
        string
           (e.g. "facebook/bart-large", "Qwen/Qwen-tokenizer", etc.)

        When using a string, any valid Hugging Face tokenizer ID can be
        specified,

        which will be loaded using the Hugging Face tokenizers library.
    # Per-segment generation options. Every property is nullable and, apart
    # from the deprecated `llm`, declares a null default.
    GenerationConfig:
      type: object
      description: >-
        Controls the processing and generation for the segment.

        - `crop_image` controls whether to crop the file's images to the
        segment's bounding box.
          The cropped image will be stored in the segment's `image` field. Use `All` to always crop,
          or `Auto` to only crop when needed for post-processing.
        - `format` specifies the output format: `Html` or `Markdown`

        - `strategy` determines how the content is generated: `Auto`, `LLM`, or
        `Ignore`
          - `Auto`: Process content automatically
          - `LLM`: Use large language models for processing
          - `Ignore`: Exclude segments from final output
        - `description` enables LLM-generated descriptions for segments.
          **Note:** This uses chunkr's own VLM models and is not configurable via LLM processing configuration.
        - `extended_context` uses the full page image as context for LLM
        generation.
      properties:
        crop_image:
          default: null
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/CroppingStrategy'
        description:
          type: [boolean, 'null']
          default: null
          description: Generate LLM descriptions for this segment
        extended_context:
          type: [boolean, 'null']
          default: null
          description: Use the full page image as context for LLM generation
        format:
          default: null
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/SegmentFormat'
        # Deprecated; no description or default is declared.
        llm:
          type: [string, 'null']
          deprecated: true
        strategy:
          default: null
          oneOf:
            - type: 'null'
            - $ref: '#/components/schemas/GenerationStrategy'
    # Predefined tokenizer names accepted by the `Enum` variant of
    # `TokenizerType`.
    Tokenizer:
      type: string
      enum: [Word, Cl100kBase, XlmRobertaBase, BertBaseUncased]
      description: >-
        Common tokenizers used for text processing.


        These values represent standard tokenization approaches and popular
        pre-trained

        tokenizers from the Hugging Face ecosystem.
    # Image cropping mode: always (`All`) or only when needed (`Auto`).
    CroppingStrategy:
      type: string
      enum: [All, Auto]
      description: |-
        Controls the cropping strategy for an item (e.g. segment, chunk, etc.)
        - `All` crops all images in the item
        - `Auto` crops images only if required for post-processing
    # Output format of a segment's `content` field.
    SegmentFormat:
      type: string
      enum: [Html, Markdown]
      description: The format for the `content` field of a segment.
    # How a segment's `content` field is produced.
    GenerationStrategy:
      type: string
      enum: [LLM, Auto, Ignore]
      description: The strategy for generating the `content` field of a segment.
  # Authentication schemes. `api_key` is an API key sent in the
  # `Authorization` request header.
  securitySchemes:
    api_key:
      type: apiKey
      in: header
      name: Authorization

````