# ai/backend.old/src/indicator/schema.py

"""
Data models for the Indicator system.

Defines schemas for input/output specifications, computation context,
and metadata for AI agent discovery.
"""

from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, Field

from datasource.schema import ColumnInfo


class InputSchema(BaseModel):
    """
    Declares the required input columns for an Indicator.

    Indicators match against any data source (DataSource or other Indicator)
    that provides columns satisfying this schema.
    """

    # Reject unknown fields at construction time instead of ignoring them.
    model_config = {"extra": "forbid"}

    required_columns: List[ColumnInfo] = Field(
        description="Columns that must be present in the input data"
    )
    optional_columns: List[ColumnInfo] = Field(
        default_factory=list,
        description="Columns that may be used if present but are not required"
    )
    time_column: str = Field(
        default="time",
        description="Name of the timestamp column (must be present)"
    )

    def matches(self, available_columns: List[ColumnInfo]) -> bool:
        """
        Check whether the available columns satisfy this input schema.

        The time column only has to exist (its type is not compared). Every
        required column has to exist under the same name with an equal
        ``type``. Optional columns are never consulted.

        Args:
            available_columns: Columns provided by a data source

        Returns:
            True if the time column and all required columns are present and
            each required column's type equals the available column's type
        """
        # If a name occurs more than once, the last occurrence wins.
        available_map = {col.name: col for col in available_columns}

        if self.time_column not in available_map:
            return False

        for required in self.required_columns:
            provided = available_map.get(required.name)
            if provided is None or provided.type != required.type:
                return False

        return True

    def get_missing_columns(self, available_columns: List[ColumnInfo]) -> List[str]:
        """
        Get the names of required columns that the data source lacks.

        Only presence by name is checked. A column that exists but has a
        different type is not reported here; use ``matches`` for that.

        Args:
            available_columns: Columns provided by a data source

        Returns:
            Missing column names without duplicates: the time column first
            (if missing), then required columns in declaration order
        """
        available_names = {col.name for col in available_columns}

        # The time column may also be declared in required_columns, and
        # required_columns may repeat a name. dict.fromkeys() drops repeats
        # while keeping first-seen order, so each name is reported once.
        wanted = dict.fromkeys(
            [self.time_column, *(col.name for col in self.required_columns)]
        )
        return [name for name in wanted if name not in available_names]


class OutputSchema(BaseModel):
    """
    Declares the output columns produced by an Indicator.

    Column names will be automatically prefixed with the indicator instance name
    to avoid collisions in the pipeline.
    """

    # Reject unknown fields at construction time instead of ignoring them.
    model_config = {"extra": "forbid"}

    columns: List[ColumnInfo] = Field(
        description="Output columns produced by this indicator"
    )
    time_column: str = Field(
        default="time",
        description="Name of the timestamp column (passed through from input)"
    )

    def with_prefix(self, prefix: str) -> "OutputSchema":
        """
        Build a copy of this schema whose column names carry ``prefix``.

        Each column is renamed to ``"<prefix>_<name>"``. A column whose name
        equals ``time_column`` keeps its name. The receiver is not modified.

        Args:
            prefix: Prefix to add (e.g., indicator instance name)

        Returns:
            New OutputSchema with prefixed column names and the same
            ``time_column``
        """
        renamed: List[ColumnInfo] = []
        for column in self.columns:
            if column.name == self.time_column:
                # The timestamp column is shared, so it is never prefixed.
                new_name = column.name
            else:
                new_name = f"{prefix}_{column.name}"

            renamed.append(
                ColumnInfo(
                    name=new_name,
                    type=column.type,
                    description=column.description,
                    unit=column.unit,
                    nullable=column.nullable,
                )
            )

        return OutputSchema(columns=renamed, time_column=self.time_column)


class IndicatorParameter(BaseModel):
    """
    Metadata for a configurable indicator parameter.

    Used for AI agent discovery and dynamic indicator instantiation.
    """

    # Reject unknown fields at construction time instead of ignoring them.
    model_config = {"extra": "forbid"}

    # --- Identity: both fields must be supplied by the caller ---------------
    name: str = Field(
        description="Parameter name",
    )
    type: Literal["int", "float", "string", "bool"] = Field(
        description="Parameter type",
    )
    description: str = Field(
        description="Human and LLM-readable description",
    )

    # --- Defaulting ----------------------------------------------------------
    # `default` is typed as Any; this class does not check it against `type`.
    default: Optional[Any] = Field(
        default=None,
        description="Default value if not specified",
    )
    required: bool = Field(
        default=False,
        description="Whether this parameter is required",
    )

    # --- Numeric bounds ------------------------------------------------------
    # Declarative only: nothing in this class enforces min_value <= max_value
    # or checks a value against the bounds.
    min_value: Optional[float] = Field(
        default=None,
        description="Minimum value (for numeric types)",
    )
    max_value: Optional[float] = Field(
        default=None,
        description="Maximum value (for numeric types)",
    )


class IndicatorMetadata(BaseModel):
    """
    Rich metadata for an Indicator class.

    Enables AI agents to discover, understand, and instantiate indicators.
    """

    # Reject unknown fields at construction time instead of ignoring them.
    model_config = {"extra": "forbid"}

    # --- Required descriptive fields ----------------------------------------
    name: str = Field(
        description="Unique indicator class name (e.g., 'RSI', 'SMA', 'BollingerBands')",
    )
    display_name: str = Field(
        description="Human-readable display name",
    )
    description: str = Field(
        description="Detailed description of what this indicator computes and why it's useful",
    )
    # Free-form string: the examples in the description are not enforced.
    category: str = Field(
        description="Indicator category (e.g., 'momentum', 'trend', 'volatility', 'volume', 'custom')",
    )

    # --- Optional collections -----------------------------------------------
    # default_factory=list gives each instance its own empty list.
    parameters: List[IndicatorParameter] = Field(
        description="Configurable parameters for this indicator",
        default_factory=list,
    )
    use_cases: List[str] = Field(
        description="Common use cases and trading scenarios where this indicator is helpful",
        default_factory=list,
    )
    references: List[str] = Field(
        description="URLs or citations for indicator methodology",
        default_factory=list,
    )
    tags: List[str] = Field(
        description="Searchable tags (e.g., 'oscillator', 'mean-reversion', 'price-based')",
        default_factory=list,
    )


class ComputeContext(BaseModel):
    """
    Context passed to an Indicator's compute() method.

    Contains the input data and metadata about what changed (for incremental updates).
    """

    # Reject unknown fields at construction time instead of ignoring them.
    model_config = {"extra": "forbid"}

    data: List[Dict[str, Any]] = Field(
        description="Input data rows (time-ordered). Each dict is {column_name: value, time: timestamp}"
    )
    is_incremental: bool = Field(
        default=False,
        description="True if this is an incremental update (only new/changed rows), False for full recompute"
    )
    # Nothing in this class ties this field to `is_incremental`; it may be
    # None even when is_incremental is True.
    updated_from_time: Optional[int] = Field(
        default=None,
        description="Unix timestamp (ms) of the earliest updated row (for incremental updates)"
    )

    def get_column(self, name: str) -> List[Any]:
        """
        Extract a single column as a list of values.

        Rows that do not contain ``name`` contribute ``None``, so the result
        always has one entry per row.

        Args:
            name: Column name

        Returns:
            List of values in row order
        """
        return [row.get(name) for row in self.data]

    def get_times(self, time_column: str = "time") -> List[int]:
        """
        Get the time column as a list.

        Unlike ``get_column``, a missing timestamp is treated as an error
        rather than being replaced with ``None``.

        Args:
            time_column: Key under which each row stores its timestamp.
                Defaults to "time"; pass the schema's ``time_column`` when
                the pipeline uses a different name.

        Returns:
            List of timestamps in row order

        Raises:
            KeyError: If any row lacks ``time_column``
        """
        return [row[time_column] for row in self.data]


class ComputeResult(BaseModel):
    """
    Result from an Indicator's compute() method.

    Contains the computed output data with proper column naming.
    """

    # Reject unknown fields at construction time instead of ignoring them.
    model_config = {"extra": "forbid"}

    data: List[Dict[str, Any]] = Field(
        description="Output data rows (time-ordered). Must include time column."
    )
    is_partial: bool = Field(
        default=False,
        description="True if this result only contains updates (for incremental computation)"
    )

    def merge_with_prefix(
        self,
        prefix: str,
        existing_data: List[Dict[str, Any]],
        time_column: str = "time",
    ) -> List[Dict[str, Any]]:
        """
        Merge this result into existing data with column name prefixing.

        The output has exactly one row per row of ``existing_data``, in the
        same order. Result rows whose timestamp does not occur in
        ``existing_data`` are dropped. If several result rows share a
        timestamp, the last one is used. Input rows are shallow-copied, so
        neither ``existing_data`` nor ``self.data`` is modified.

        Args:
            prefix: Prefix to add to all column names except the time column
            existing_data: Existing data to merge with (matched by time)
            time_column: Key under which rows store their timestamp.
                Defaults to "time"; pass the schema's ``time_column`` when
                the pipeline uses a different name.

        Returns:
            Merged data with prefixed columns added

        Raises:
            KeyError: If a row in ``self.data`` or ``existing_data`` lacks
                ``time_column``
        """
        # Index result rows by timestamp so each lookup is constant-time.
        updates_by_time = {row[time_column]: row for row in self.data}

        merged: List[Dict[str, Any]] = []
        for existing_row in existing_data:
            update = updates_by_time.get(existing_row[time_column])
            combined = existing_row.copy()

            if update is not None:
                # The time column is the join key and is already present in
                # the existing row, so it is skipped.
                combined.update(
                    (f"{prefix}_{key}", value)
                    for key, value in update.items()
                    if key != time_column
                )

            merged.append(combined)

        return merged