Files
ai/client-py/dexorder/iceberg_client.py
2026-03-11 18:47:11 -04:00

180 lines
5.5 KiB
Python

"""
IcebergClient - Query OHLC data from Iceberg warehouse (Iceberg 1.10.1)
"""
from typing import Optional, List, Tuple
import pandas as pd
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import (
And,
EqualTo,
GreaterThanOrEqual,
LessThanOrEqual
)
class IcebergClient:
    """
    Client for querying OHLC data from Iceberg warehouse (Iceberg 1.10.1).

    Note: Iceberg 1.x does not enforce primary keys at the table level.
    Per the pipeline design notes, deduplication is handled by:
    - Flink upsert mode with equality delete files
    - PyIceberg automatically filters deleted rows during queries
    - Last-write-wins semantics for duplicates
    This class performs no deduplication of its own.

    Provides:
    - Query OHLC data by ticker, period, and time range
    - Identify missing data gaps
    - Efficient partition pruning for large datasets
    """

    # Name of the OHLC table inside the configured namespace.
    _TABLE_NAME = "ohlc"

    def __init__(
        self,
        catalog_uri: str,
        namespace: str = "trading",
        s3_endpoint: Optional[str] = None,
        s3_access_key: Optional[str] = None,
        s3_secret_key: Optional[str] = None,
    ):
        """
        Initialize Iceberg client.

        Connects to the catalog and loads the OHLC table immediately, so
        construction performs network I/O.

        Args:
            catalog_uri: URI of the Iceberg catalog (e.g., "http://iceberg-catalog:8181")
            namespace: Iceberg namespace (default: "trading")
            s3_endpoint: S3/MinIO endpoint URL (e.g., "http://localhost:9000")
            s3_access_key: S3/MinIO access key
            s3_secret_key: S3/MinIO secret key
        """
        self.catalog_uri = catalog_uri
        self.namespace = namespace

        catalog_props = {"uri": catalog_uri}
        if s3_endpoint:
            catalog_props["s3.endpoint"] = s3_endpoint
            # Path-style access is only requested together with a custom endpoint.
            catalog_props["s3.path-style-access"] = "true"
        if s3_access_key:
            catalog_props["s3.access-key-id"] = s3_access_key
        if s3_secret_key:
            catalog_props["s3.secret-access-key"] = s3_secret_key

        self.catalog = load_catalog("trading", **catalog_props)
        self.table = self._load_table()

    def _load_table(self):
        """Load the OHLC table from the catalog using the configured namespace."""
        return self.catalog.load_table(f"{self.namespace}.{self._TABLE_NAME}")

    def _scan(
        self,
        ticker: str,
        period_seconds: int,
        start_time: int,
        end_time: int,
        limit: Optional[int] = None,
    ):
        """Build a table scan for one ticker/period over [start_time, end_time] (inclusive)."""
        # Reload table metadata to pick up snapshots committed after this client was initialized
        self.table = self._load_table()
        return self.table.scan(
            row_filter=And(
                EqualTo("ticker", ticker),
                EqualTo("period_seconds", period_seconds),
                GreaterThanOrEqual("timestamp", start_time),
                LessThanOrEqual("timestamp", end_time),
            ),
            limit=limit,
        )

    @staticmethod
    def _timestamps_as_micros(timestamps: pd.Series) -> set:
        """Return the values of a timestamp column as a set of integer microseconds."""
        if pd.api.types.is_datetime64_any_dtype(timestamps):
            # A datetime column would never compare equal to the integer grid;
            # .values yields a plain datetime64 array (tz-aware data comes back in UTC).
            micros = timestamps.values.astype("datetime64[us]").astype("int64")
            return set(micros.tolist())
        return set(timestamps.tolist())

    @staticmethod
    def _missing_ranges(
        actual: set,
        start_time: int,
        end_time: int,
        period_micros: int,
    ) -> List[Tuple[int, int]]:
        """Walk the expected grid once and merge consecutive absent timestamps into ranges."""
        ranges = []
        range_start = None
        last_missing = None
        for ts in range(start_time, end_time + 1, period_micros):
            if ts in actual:
                # A present candle ends any open run of missing ones.
                if range_start is not None:
                    ranges.append((range_start, last_missing))
                    range_start = None
                continue
            if range_start is None:
                range_start = ts
            last_missing = ts
        # Close final range
        if range_start is not None:
            ranges.append((range_start, last_missing))
        return ranges

    def query_ohlc(
        self,
        ticker: str,
        period_seconds: int,
        start_time: int,
        end_time: int
    ) -> pd.DataFrame:
        """
        Query OHLC data for a specific ticker, period, and time range.

        Both time bounds are inclusive. Table metadata is reloaded on every
        call so that recently committed snapshots are visible.

        Args:
            ticker: Market identifier (e.g., "BINANCE:BTC/USDT")
            period_seconds: OHLC period in seconds (60, 300, 3600, etc.)
            start_time: Start timestamp in microseconds
            end_time: End timestamp in microseconds

        Returns:
            DataFrame with OHLC data sorted by timestamp
        """
        df = self._scan(ticker, period_seconds, start_time, end_time).to_pandas()
        if not df.empty:
            df = df.sort_values("timestamp")
        return df

    def find_missing_ranges(
        self,
        ticker: str,
        period_seconds: int,
        start_time: int,
        end_time: int
    ) -> List[Tuple[int, int]]:
        """
        Identify missing data ranges in the requested time period.

        Returns list of (start, end) tuples for missing ranges.
        Expected candles are calculated based on period_seconds: the expected
        grid is start_time, start_time + period, ... up to end_time inclusive.
        The grid is anchored at start_time, so start_time should coincide with
        a candle timestamp; otherwise no stored candle can match the grid.

        Args:
            ticker: Market identifier
            period_seconds: OHLC period in seconds
            start_time: Start timestamp in microseconds
            end_time: End timestamp in microseconds

        Returns:
            List of (start_time, end_time) tuples for missing ranges. When no
            data exists at all, the single tuple (start_time, end_time) is
            returned as given.

        Raises:
            ValueError: If period_seconds is not positive.
        """
        if period_seconds <= 0:
            # A zero step breaks range(); a negative one yields an empty grid
            # that would be misreported as "no gaps".
            raise ValueError(f"period_seconds must be positive, got {period_seconds}")

        df = self.query_ohlc(ticker, period_seconds, start_time, end_time)
        if df.empty:
            # All data is missing
            return [(start_time, end_time)]

        # Convert period to microseconds
        period_micros = period_seconds * 1_000_000
        actual_timestamps = self._timestamps_as_micros(df["timestamp"])
        return self._missing_ranges(actual_timestamps, start_time, end_time, period_micros)

    def has_data(
        self,
        ticker: str,
        period_seconds: int,
        start_time: int,
        end_time: int
    ) -> bool:
        """
        Check if any data exists for the given parameters.

        Args:
            ticker: Market identifier
            period_seconds: OHLC period in seconds
            start_time: Start timestamp in microseconds
            end_time: End timestamp in microseconds

        Returns:
            True if at least one candle exists, False otherwise
        """
        # Only existence matters, so ask the scan for at most one row instead
        # of loading the whole range into memory.
        probe = self._scan(ticker, period_seconds, start_time, end_time, limit=1).to_pandas()
        return not probe.empty