# OpenAPI 3.1 description of the data-normalization API.
# Version-like scalars are quoted so every parser reads them as strings.
openapi: "3.1.0"
info:
  title: data-normalization
  version: "1.0.0"
  # Folded scalar (`>-`): the two lines below are joined with a single space
  # and carry no trailing newline.
  description: >-
    Cross-document field normalization and conflict resolution that
    reconciles extracted data from multiple documents into canonical values

# Base URL of the deployment. The host's subdomain segment is exposed as a
# server variable so tooling can substitute a real value; its default keeps
# the resolved URL identical to the previous hard-coded placeholder.
servers:
  - url: "https://data-normalization.{subdomain}.workers.dev"
    variables:
      subdomain:
        default: your-subdomain
        description: Subdomain segment of the workers.dev host name

paths:
  # Health probe. This operation declares no `security` requirement of its own.
  /health:
    get:
      operationId: getHealth
      summary: Health check
      responses:
        "200":
          description: Service is healthy
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/HealthResponse"

  # Submits a set of documents for normalization; requires a bearer token.
  /v1/normalizations:
    post:
      operationId: createNormalization
      summary: Create a normalization run
      security:
        - bearerAuth: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/NormalizationRequest"
      responses:
        # NOTE(review): the run is only queued at this point, yet the success
        # code is 200 — confirm the service does not actually answer 201/202.
        "200":
          description: Normalization run created and queued
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/NormalizationResponse"
        # The error responses below declare no body schema.
        "400":
          description: Invalid request body
        "401":
          description: Unauthorized

  # Fetches a single normalization run by ID; requires a bearer token.
  /v1/normalizations/{normalizationId}:
    get:
      operationId: getNormalization
      summary: Get normalization run status and results
      security:
        - bearerAuth: []
      parameters:
        # Fills the {normalizationId} segment of the path template.
        - name: normalizationId
          in: path
          required: true
          schema:
            type: string
      responses:
        "200":
          description: Normalization run details
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/NormalizationDetailResponse"
        # The error responses below declare no body schema.
        "401":
          description: Unauthorized
        "404":
          description: Normalization run not found

  # Records a manual resolution for one conflict issue of a normalization run;
  # requires a bearer token.
  /v1/normalizations/{normalizationId}/issues/{issueId}/resolve:
    post:
      operationId: resolveIssue
      summary: Manually resolve a conflict issue
      security:
        - bearerAuth: []
      parameters:
        # One entry per templated segment of the path.
        - name: normalizationId
          in: path
          required: true
          schema:
            type: string
        - name: issueId
          in: path
          required: true
          schema:
            type: string
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/IssueResolveRequest"
      responses:
        "200":
          description: Issue resolved successfully
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/IssueResolveResponse"
        # The error responses below declare no body schema.
        "400":
          description: Invalid request body
        "401":
          description: Unauthorized
        "404":
          description: Normalization or issue not found

  # Read-only view of the service configuration; requires a bearer token.
  /v1/config:
    get:
      operationId: getConfig
      summary: Get current configuration
      security:
        - bearerAuth: []
      responses:
        "200":
          description: Current service configuration
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConfigResponse"
        # Declares no body schema.
        "401":
          description: Unauthorized

components:
  # Authentication schemes referenced by the operations' `security` entries.
  securitySchemes:
    # HTTP authentication using the "bearer" scheme.
    bearerAuth:
      scheme: bearer
      type: http

  schemas:
    # Body of the 200 response of GET /health; both fields are mandatory.
    HealthResponse:
      type: object
      required: [ok, service]
      properties:
        ok:
          type: boolean
        service:
          type: string

    # Request body of POST /v1/normalizations; only `documents` is mandatory.
    NormalizationRequest:
      type: object
      required: [documents]
      properties:
        normalizationId:
          description: Optional custom ID for the normalization run
          type: string
        documents:
          # At least one input document must be supplied (minItems).
          type: array
          minItems: 1
          items:
            $ref: "#/components/schemas/InputDocument"
        options:
          $ref: "#/components/schemas/NormalizationOptions"

    # One element of NormalizationRequest.documents.
    InputDocument:
      type: object
      required: [documentId, extractedData]
      properties:
        documentId:
          type: string
        artifactId:
          type: string
        # Both objects below are free-form: no inner properties are declared.
        extractedData:
          description: Extracted fields from the document (may be nested)
          type: object
        metadata:
          description: Optional metadata about the document
          type: object

    # Optional switches for a normalization run (NormalizationRequest.options).
    # No property is required; each one declares a default.
    NormalizationOptions:
      type: object
      properties:
        autoResolve:
          description: Whether to automatically resolve conflicts using heuristics
          type: boolean
          default: true
        useLlm:
          description: Whether to use LLM adjudication for unresolved conflicts
          type: boolean
          default: false

    # Body of the 200 response of POST /v1/normalizations; every field is
    # mandatory.
    NormalizationResponse:
      type: object
      required: [ok, normalizationId, status, documentCount]
      properties:
        ok:
          type: boolean
        normalizationId:
          type: string
        status:
          # `queued` is the only value this schema permits.
          type: string
          enum: [queued]
        documentCount:
          type: integer

    # Body of the 200 response of GET /v1/normalizations/{normalizationId}.
    # Only `ok`, `normalizationId` and `status` are mandatory; the remaining
    # fields may be omitted.
    NormalizationDetailResponse:
      type: object
      required: [ok, normalizationId, status]
      properties:
        ok:
          type: boolean
        normalizationId:
          type: string
        status:
          type: string
          enum: [queued, running, completed, failed]
        documentCount:
          type: integer
        normalizedData:
          # Map from arbitrary property name to a NormalizedField.
          # NOTE(review): keys are presumably canonical field names — confirm.
          type: object
          additionalProperties:
            $ref: "#/components/schemas/NormalizedField"
        issues:
          type: array
          items:
            $ref: "#/components/schemas/ConflictIssue"
        createdAt:
          type: string
          format: date-time
        completedAt:
          type: string
          format: date-time

    # Value type of NormalizationDetailResponse.normalizedData; every field is
    # mandatory.
    NormalizedField:
      type: object
      required: [value, resolutionMethod, sources, confidence]
      properties:
        # No `type` is declared here, so the schema does not restrict the
        # value's JSON type.
        value:
          description: The canonical resolved value
        resolutionMethod:
          type: string
          enum: [single_source, duplicate, heuristic, llm, manual]
        sources:
          type: array
          items:
            $ref: "#/components/schemas/FieldSource"
        confidence:
          # Bounded to the closed interval [0, 1].
          type: number
          minimum: 0
          maximum: 1

    # Provenance record for one source value; element type of
    # NormalizedField.sources and ConflictIssue.conflictingValues.
    FieldSource:
      type: object
      required: [documentId, originalKey, originalValue]
      properties:
        documentId:
          type: string
        artifactId:
          type: string
        originalKey:
          description: The field key as it appeared in the source document
          type: string
        # No `type` is declared here, so the schema does not restrict the
        # value's JSON type.
        originalValue:
          description: The value as it appeared in the source document

    # Element type of NormalizationDetailResponse.issues. The resolved*
    # fields and `rationale` are not in the required list.
    ConflictIssue:
      type: object
      required: [issueId, canonicalKey, conflictingValues, status]
      properties:
        issueId:
          type: string
        canonicalKey:
          description: The canonical field key where the conflict was detected
          type: string
        conflictingValues:
          type: array
          items:
            $ref: "#/components/schemas/FieldSource"
        status:
          type: string
          enum: [open, resolved]
        # No `type` is declared here, so the schema does not restrict the
        # value's JSON type.
        resolvedValue:
          description: The resolved canonical value (present when status is resolved)
        resolvedBy:
          type: string
        resolvedAt:
          type: string
          format: date-time
        rationale:
          type: string

    # Request body of POST .../issues/{issueId}/resolve; only `resolvedValue`
    # is mandatory.
    IssueResolveRequest:
      type: object
      required: [resolvedValue]
      properties:
        # No `type` is declared here, so the schema does not restrict the
        # value's JSON type.
        resolvedValue:
          description: The canonical value to use for this field
        resolvedBy:
          description: Identifier of the person resolving the issue
          type: string
        rationale:
          description: Reason for choosing this value
          type: string

    # Body of the 200 response of POST .../issues/{issueId}/resolve.
    IssueResolveResponse:
      type: object
      required: [ok, issueId, status]
      properties:
        ok:
          type: boolean
        issueId:
          type: string
        status:
          # `resolved` is the only value this schema permits.
          type: string
          enum: [resolved]
        # No `type` is declared here, so the schema does not restrict the
        # value's JSON type.
        resolvedValue:
          description: The resolved canonical value
        resolvedBy:
          type: string
        resolvedAt:
          type: string
          format: date-time

    # Body of the 200 response of GET /v1/config; every field is mandatory.
    ConfigResponse:
      type: object
      required: [ok, model, timeout]
      properties:
        ok:
          type: boolean
        model:
          description: The LLM model used for normalization adjudication
          type: string
        timeout:
          description: Timeout in milliseconds for LLM normalization calls
          type: integer
