# ai/client-py/dexorder/iceberg_client.py

"""
IcebergClient - Query OHLC data from Iceberg warehouse (Iceberg 1.10.1)
"""

from typing import Optional, List, Tuple
import pandas as pd
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import (
    And,
    EqualTo,
    GreaterThanOrEqual,
    LessThanOrEqual
)


class IcebergClient:
    """
    Client for querying OHLC data from Iceberg warehouse (Iceberg 1.10.1).

    Note: Iceberg 1.x does not enforce primary keys at the table level.
    Deduplication is expected to be handled by:
    - Flink upsert mode with equality delete files
    - The reader filtering deleted rows during queries
      (NOTE(review): equality-delete support on read depends on the
      PyIceberg version in use - confirm)
    - Last-write-wins semantics for duplicates

    Provides:
    - Query OHLC data by ticker, period, and time range
    - Identify missing data gaps
    - Cheap existence checks (single-row scans)
    """

    # Name of the OHLC table inside the configured namespace.
    TABLE_NAME = "ohlc"

    def __init__(
        self,
        catalog_uri: str,
        namespace: str = "trading",
        s3_endpoint: Optional[str] = None,
        s3_access_key: Optional[str] = None,
        s3_secret_key: Optional[str] = None,
    ):
        """
        Initialize Iceberg client.

        Loads the catalog and the OHLC table eagerly, so construction
        performs I/O against the catalog.

        Args:
            catalog_uri: URI of the Iceberg catalog (e.g., "http://iceberg-catalog:8181")
            namespace: Iceberg namespace (default: "trading")
            s3_endpoint: S3/MinIO endpoint URL (e.g., "http://localhost:9000")
            s3_access_key: S3/MinIO access key
            s3_secret_key: S3/MinIO secret key
        """
        self.catalog_uri = catalog_uri
        self.namespace = namespace

        # Only pass the S3 properties that were actually supplied so that
        # the catalog's own defaults apply otherwise.
        catalog_props = {"uri": catalog_uri}
        if s3_endpoint:
            catalog_props["s3.endpoint"] = s3_endpoint
            catalog_props["s3.path-style-access"] = "true"
        if s3_access_key:
            catalog_props["s3.access-key-id"] = s3_access_key
        if s3_secret_key:
            catalog_props["s3.secret-access-key"] = s3_secret_key

        self.catalog = load_catalog("trading", **catalog_props)
        self.table = self._load_table()

    def _load_table(self):
        """Load (or reload) the OHLC table for the configured namespace."""
        return self.catalog.load_table(f"{self.namespace}.{self.TABLE_NAME}")

    @staticmethod
    def _row_filter(
        ticker: str,
        period_seconds: int,
        start_time: int,
        end_time: int
    ):
        """Build the scan predicate: exact ticker/period, inclusive time range."""
        return And(
            EqualTo("ticker", ticker),
            EqualTo("period_seconds", period_seconds),
            GreaterThanOrEqual("timestamp", start_time),
            LessThanOrEqual("timestamp", end_time)
        )

    def query_ohlc(
        self,
        ticker: str,
        period_seconds: int,
        start_time: int,
        end_time: int
    ) -> pd.DataFrame:
        """
        Query OHLC data for a specific ticker, period, and time range.

        Both time bounds are inclusive.

        Args:
            ticker: Market identifier (e.g., "BINANCE:BTC/USDT")
            period_seconds: OHLC period in seconds (60, 300, 3600, etc.)
            start_time: Start timestamp in microseconds
            end_time: End timestamp in microseconds

        Returns:
            DataFrame with OHLC data sorted by timestamp
        """
        # Reload table metadata to pick up snapshots committed after this client was initialized
        self.table = self._load_table()

        df = self.table.scan(
            row_filter=self._row_filter(ticker, period_seconds, start_time, end_time)
        ).to_pandas()

        if not df.empty:
            df = df.sort_values("timestamp")

        return df

    def find_missing_ranges(
        self,
        ticker: str,
        period_seconds: int,
        start_time: int,
        end_time: int
    ) -> List[Tuple[int, int]]:
        """
        Identify missing data ranges in the requested time period.

        Expected candle timestamps are start_time, start_time + period,
        start_time + 2 * period, ... up to and including end_time; the grid
        is anchored at start_time, so callers should pass a period-aligned
        start_time. Consecutive missing candles are merged into one range.

        Args:
            ticker: Market identifier
            period_seconds: OHLC period in seconds (must be positive)
            start_time: Start timestamp in microseconds
            end_time: End timestamp in microseconds

        Returns:
            List of (start_time, end_time) tuples for missing ranges, in
            ascending order. When no data exists at all, the whole requested
            range is returned as a single tuple.

        Raises:
            ValueError: If period_seconds is not positive.
        """
        # A non-positive period would either crash range() with an opaque
        # message (zero) or silently report "no gaps" (negative).
        if period_seconds <= 0:
            raise ValueError(
                f"period_seconds must be positive, got {period_seconds}"
            )

        df = self.query_ohlc(ticker, period_seconds, start_time, end_time)

        if df.empty:
            # All data is missing
            return [(start_time, end_time)]

        # Convert period to microseconds
        period_micros = period_seconds * 1_000_000

        # NOTE(review): assumes the "timestamp" column holds integer
        # microseconds comparable with the expected grid - confirm against
        # the table schema.
        actual_timestamps = set(df["timestamp"].values)

        # Walk the expected grid once, in order; a run of consecutive
        # missing candles becomes one (first_missing, last_missing) range.
        ranges: List[Tuple[int, int]] = []
        range_start = None
        prev_missing = None

        for ts in range(start_time, end_time + 1, period_micros):
            if ts in actual_timestamps:
                if range_start is not None:
                    # A present candle ends the current run of gaps
                    ranges.append((range_start, prev_missing))
                    range_start = None
                continue
            if range_start is None:
                range_start = ts
            prev_missing = ts

        # Close a run that extends to the end of the requested window
        if range_start is not None:
            ranges.append((range_start, prev_missing))

        return ranges

    def has_data(
        self,
        ticker: str,
        period_seconds: int,
        start_time: int,
        end_time: int
    ) -> bool:
        """
        Check if any data exists for the given parameters.

        Uses a scan limited to a single row so that the full result set is
        never materialized just to test for existence.

        Args:
            ticker: Market identifier
            period_seconds: OHLC period in seconds
            start_time: Start timestamp in microseconds
            end_time: End timestamp in microseconds

        Returns:
            True if at least one candle exists, False otherwise
        """
        # Reload table metadata so recently committed snapshots are visible
        self.table = self._load_table()

        probe = self.table.scan(
            row_filter=self._row_filter(ticker, period_seconds, start_time, end_time),
            limit=1,
        ).to_pandas()

        return not probe.empty