---
# OpenAPI description of the extraction-worker HTTP API.
openapi: 3.1.0
info:
  title: extraction-worker
  version: 1.0.0
  contact:
    name: AdviceOS
  # Folded scalar: the lines below are joined with single spaces and the
  # trailing newline is stripped, so the value is one long sentence.
  description: >-
    Structured data extraction from parsed documents using LlamaCloud Extract
    API with schema generation from Fibery artifact definitions

# Base URLs the API is served from.
# NOTE(review): "your-subdomain" looks like a template placeholder — confirm
# the real host before publishing this document.
servers:
  - description: Production
    url: https://extraction-worker.your-subdomain.workers.dev

# No document-wide security requirement is declared here; each protected
# operation under `paths` lists its own `security` alternatives instead.
security: []

# Tag groups used by the `tags` lists of the operations under `paths`.
tags:
  - description: Service health checks
    name: Health
  - description: Create and retrieve extractions
    name: Extractions
  - description: Service configuration
    name: Config
  - description: Debug and diagnostic endpoints
    name: Debug

paths:
  # Health check endpoint; callable without credentials.
  /health:
    get:
      tags:
        - Health
      summary: Health check
      operationId: getHealth
      # Empty requirement list: no security scheme applies to this operation.
      security: []
      responses:
        "200":
          description: Service is healthy
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/HealthResponse"

  # Upload a document as multipart form data together with a Fibery artifact
  # ID and receive the extraction result.
  /v1/extractions:
    post:
      operationId: createExtraction
      summary: Create an extraction job
      tags: [Extractions]
      # The entries are alternatives: satisfying any one scheme is sufficient.
      security:
        - ApiKeyAuth: []
        - WorkflowAuth: []
        - OrchestratorAuth: []
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              type: object
              # Both form parts are mandatory; the 400 response below covers
              # requests that omit either one.
              required:
                - file
                - artifactId
              properties:
                file:
                  type: string
                  # `format: binary` is the OAS 3.0 convention and is only an
                  # annotation under OAS 3.1. `contentMediaType` is the 3.1
                  # way to mark this part as raw binary content. Both are
                  # kept so tooling that understands either keeps working.
                  format: binary
                  contentMediaType: application/octet-stream
                  description: Document file to extract from
                artifactId:
                  type: string
                  description: Fibery artifact ID for schema resolution
      responses:
        "200":
          description: Extraction completed successfully
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ExtractionResponse"
        "400":
          description: Invalid request (missing file or artifactId)
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"
        "401":
          description: Unauthorized
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"
        "500":
          description: Extraction failed
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"

  # Fetch the status (and results, when present) of one extraction, addressed
  # by its document UUID.
  /v1/extractions/{documentId}:
    get:
      tags:
        - Extractions
      summary: Get extraction status and results
      operationId: getExtraction
      parameters:
        - in: path
          name: documentId
          description: Document UUID
          required: true
          schema:
            type: string
            format: uuid
      # The entries are alternatives: satisfying any one scheme is sufficient.
      security:
        - ApiKeyAuth: []
        - WorkflowAuth: []
        - OrchestratorAuth: []
      # Responses are listed in ascending status-code order.
      responses:
        "200":
          description: Extraction status and results
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ExtractionStatusResponse"
        "401":
          description: Unauthorized
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"
        "404":
          description: Extraction not found
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"

  # Read-only view of the extraction configuration currently in effect.
  /v1/config:
    get:
      operationId: getConfig
      summary: Get current extraction configuration
      tags: [Config]
      # The entries are alternatives: satisfying any one scheme is sufficient.
      security:
        - ApiKeyAuth: []
        - WorkflowAuth: []
        - OrchestratorAuth: []
      responses:
        "200":
          description: Current configuration
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConfigResponse"
        # The operation requires credentials, so the unauthorized outcome is
        # documented the same way as on the other secured operations.
        "401":
          description: Unauthorized
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"

  # Diagnostic pass-through for the raw LlamaCloud job status.
  /v1/debug/job/{jobId}:
    get:
      tags:
        - Debug
      summary: Debug endpoint for raw LlamaCloud job status
      operationId: getDebugJob
      parameters:
        - in: path
          name: jobId
          description: LlamaCloud job ID
          required: true
          schema:
            type: string
      # Unlike the other secured operations, only ApiKeyAuth is accepted here.
      security:
        - ApiKeyAuth: []
      responses:
        "200":
          description: Raw LlamaCloud job status
          content:
            application/json:
              # Free-form object: no fixed property set is declared.
              schema:
                type: object
                additionalProperties: true
        "401":
          description: Unauthorized
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"
        "404":
          description: Job not found
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ErrorResponse"

components:
  # Header-based API-key schemes referenced by the `security` lists of the
  # operations under `paths`. Each scheme reads a different request header.
  securitySchemes:
    ApiKeyAuth:
      description: Universal API key for requireUniversalApiAccess
      type: apiKey
      name: X-Universal-Api-Key
      in: header
    WorkflowAuth:
      description: Internal workflow authentication key
      type: apiKey
      name: X-Workflow-Internal-Api-Key
      in: header
    OrchestratorAuth:
      description: Orchestrator service authentication key
      type: apiKey
      name: X-Orchestrator-Api-Key
      in: header

  schemas:
    # Body of the 200 response from GET /health. Both fields are pinned to a
    # single allowed value.
    HealthResponse:
      type: object
      properties:
        ok:
          type: boolean
          enum:
            - true
        service:
          type: string
          enum:
            - extraction-worker
      required:
        - ok
        - service

    # Body of the 200 response from POST /v1/extractions. Every declared
    # property is required, and `status` only allows `completed`.
    ExtractionResponse:
      type: object
      properties:
        ok:
          type: boolean
        documentId:
          description: Unique identifier for the extracted document
          type: string
          format: uuid
        artifactId:
          description: Fibery artifact ID used for schema resolution
          type: string
        schemaId:
          description: Identifier of the schema used for extraction
          type: string
          format: uuid
        schemaGenerated:
          description: Whether the schema was freshly generated (vs cached)
          type: boolean
        status:
          description: Extraction job status
          type: string
          enum:
            - completed
        extractedData:
          description: Schema-conformant structured extraction results
          type: object
          additionalProperties: true
        metadata:
          $ref: "#/components/schemas/ExtractionMetadata"
      required:
        - ok
        - documentId
        - artifactId
        - schemaId
        - schemaGenerated
        - status
        - extractedData
        - metadata

    # Metadata object referenced by the `metadata` property of
    # ExtractionResponse and ExtractionStatusResponse. No property is
    # required.
    ExtractionMetadata:
      type: object
      properties:
        confidence:
          description: Overall confidence score for the extraction
          type: number
          minimum: 0
          maximum: 1
        extract_metadata:
          description: Provider-level extraction metadata
          type: object
          additionalProperties: true
        pagesProcessed:
          description: Number of pages processed during extraction
          type: integer
          minimum: 0
        providerJobId:
          description: LlamaCloud job identifier
          type: string
        providerFileId:
          description: LlamaCloud file identifier
          type: string

    # Body of the 200 response from GET /v1/extractions/{documentId}. Only
    # `ok`, `documentId` and `status` are required; the remaining properties
    # are optional.
    ExtractionStatusResponse:
      type: object
      properties:
        ok:
          type: boolean
        documentId:
          description: Document UUID
          type: string
          format: uuid
        artifactId:
          description: Fibery artifact ID
          type: string
        schemaId:
          description: Schema identifier used
          type: string
          format: uuid
        status:
          description: Current extraction status
          type: string
          enum:
            - pending
            - processing
            - completed
            - failed
        extractedData:
          description: Extracted data (present when status is completed)
          type: object
          additionalProperties: true
        metadata:
          $ref: "#/components/schemas/ExtractionMetadata"
        error:
          description: Error message (present when status is failed)
          type: string
      required:
        - ok
        - documentId
        - status

    # Body of the 200 response from GET /v1/config. Every declared property
    # is required.
    ConfigResponse:
      type: object
      properties:
        ok:
          type: boolean
        schemaPolicyVersion:
          description: Active schema generation policy version
          type: string
        extractTier:
          description: Configured extraction tier
          type: string
        parseTier:
          description: Configured parse tier
          type: string
        baseUrl:
          description: LlamaCloud API base URL
          type: string
          format: uri
      required:
        - ok
        - schemaPolicyVersion
        - extractTier
        - parseTier
        - baseUrl

    # Error body referenced by the 4xx/5xx responses under `paths`; `ok` is
    # pinned to false.
    ErrorResponse:
      type: object
      properties:
        ok:
          type: boolean
          enum:
            - false
        error:
          description: Error message
          type: string
      required:
        - ok
        - error
