180 lines
5.5 KiB
Python
180 lines
5.5 KiB
Python
"""
|
|
IcebergClient - Query OHLC data from Iceberg warehouse (Iceberg 1.10.1)
|
|
"""
|
|
|
|
from typing import Optional, List, Tuple
|
|
import pandas as pd
|
|
from pyiceberg.catalog import load_catalog
|
|
from pyiceberg.expressions import (
|
|
And,
|
|
EqualTo,
|
|
GreaterThanOrEqual,
|
|
LessThanOrEqual
|
|
)
|
|
|
|
|
|
class IcebergClient:
    """
    Client for querying OHLC data from Iceberg warehouse (Iceberg 1.10.1).

    Note: Iceberg 1.x does not enforce primary keys at the table level.
    Deduplication is handled by:
    - Flink upsert mode with equality delete files
    - PyIceberg automatically filters deleted rows during queries
    - Last-write-wins semantics for duplicates

    Provides:
    - Query OHLC data by ticker, period, and time range
    - Identify missing data gaps
    - Cheap existence checks (row-limited scans)
    """

    # Name of the OHLC table inside the configured namespace.
    _TABLE_NAME = "ohlc"

    def __init__(
        self,
        catalog_uri: str,
        namespace: str = "trading",
        s3_endpoint: Optional[str] = None,
        s3_access_key: Optional[str] = None,
        s3_secret_key: Optional[str] = None,
    ):
        """
        Initialize Iceberg client.

        Connects to the catalog and loads the ``<namespace>.ohlc`` table, so
        construction performs network I/O.

        Args:
            catalog_uri: URI of the Iceberg catalog (e.g., "http://iceberg-catalog:8181")
            namespace: Iceberg namespace (default: "trading")
            s3_endpoint: S3/MinIO endpoint URL (e.g., "http://localhost:9000")
            s3_access_key: S3/MinIO access key
            s3_secret_key: S3/MinIO secret key
        """
        self.catalog_uri = catalog_uri
        self.namespace = namespace

        # Only forward the S3 settings that were actually supplied, so the
        # catalog's own defaults apply for everything else.
        catalog_props = {"uri": catalog_uri}
        if s3_endpoint:
            catalog_props["s3.endpoint"] = s3_endpoint
            # Path-style access is enabled whenever a custom endpoint is set.
            catalog_props["s3.path-style-access"] = "true"
        if s3_access_key:
            catalog_props["s3.access-key-id"] = s3_access_key
        if s3_secret_key:
            catalog_props["s3.secret-access-key"] = s3_secret_key

        self.catalog = load_catalog("trading", **catalog_props)
        self.table = self._load_table()

    def _load_table(self):
        """Load the OHLC table from the catalog with its current metadata."""
        return self.catalog.load_table(f"{self.namespace}.{self._TABLE_NAME}")

    def query_ohlc(
        self,
        ticker: str,
        period_seconds: int,
        start_time: int,
        end_time: int,
        limit: Optional[int] = None,
    ) -> pd.DataFrame:
        """
        Query OHLC data for a specific ticker, period, and time range.

        Args:
            ticker: Market identifier (e.g., "BINANCE:BTC/USDT")
            period_seconds: OHLC period in seconds (60, 300, 3600, etc.)
            start_time: Start timestamp in microseconds (inclusive)
            end_time: End timestamp in microseconds (inclusive)
            limit: Optional maximum number of rows to read. The limit is
                handed to the table scan, i.e. it is applied before the
                result is sorted, so which rows are returned is up to the
                scan. None (default) reads every matching row.

        Returns:
            DataFrame with OHLC data sorted by timestamp
        """
        # Reload table metadata to pick up snapshots committed after this client was initialized
        self.table = self._load_table()

        row_filter = And(
            EqualTo("ticker", ticker),
            EqualTo("period_seconds", period_seconds),
            GreaterThanOrEqual("timestamp", start_time),
            LessThanOrEqual("timestamp", end_time),
        )
        df = self.table.scan(row_filter=row_filter, limit=limit).to_pandas()

        if not df.empty:
            df = df.sort_values("timestamp")

        return df

    def find_missing_ranges(
        self,
        ticker: str,
        period_seconds: int,
        start_time: int,
        end_time: int
    ) -> List[Tuple[int, int]]:
        """
        Identify missing data ranges in the requested time period.

        Returns list of (start, end) tuples for missing ranges.
        Expected candles are start_time, start_time + period, ... up to and
        including end_time. The grid is anchored at start_time, so a
        start_time that does not coincide with a stored candle timestamp
        makes every expected candle count as missing.

        Args:
            ticker: Market identifier
            period_seconds: OHLC period in seconds (must be positive)
            start_time: Start timestamp in microseconds
            end_time: End timestamp in microseconds

        Returns:
            List of (start_time, end_time) tuples for missing ranges. Both
            ends of a tuple are timestamps of missing candles, except when
            no data exists at all, in which case the single tuple
            (start_time, end_time) is returned as given.

        Raises:
            ValueError: If period_seconds is not positive.
        """
        # A non-positive period cannot describe a candle grid: zero makes
        # range() fail and a negative step yields an empty grid, which would
        # be misreported as "nothing missing". Fail before querying.
        if period_seconds <= 0:
            raise ValueError(
                f"period_seconds must be positive, got {period_seconds}"
            )

        df = self.query_ohlc(ticker, period_seconds, start_time, end_time)

        if df.empty:
            # All data is missing
            return [(start_time, end_time)]

        # Convert period to microseconds
        period_micros = period_seconds * 1_000_000

        actual_timestamps = set(df["timestamp"].values)

        # Walk the expected grid in order and keep what is absent; the result
        # is already sorted, so no intermediate list/set or sort is needed.
        missing = [
            ts
            for ts in range(start_time, end_time + 1, period_micros)
            if ts not in actual_timestamps
        ]

        if not missing:
            return []

        # Consolidate consecutive missing timestamps into ranges
        ranges = []
        range_start = prev_ts = missing[0]

        for ts in missing[1:]:
            if ts > prev_ts + period_micros:
                # Gap in missing data - close previous range
                ranges.append((range_start, prev_ts))
                range_start = ts
            prev_ts = ts

        # Close final range
        ranges.append((range_start, prev_ts))

        return ranges

    def has_data(
        self,
        ticker: str,
        period_seconds: int,
        start_time: int,
        end_time: int
    ) -> bool:
        """
        Check if any data exists for the given parameters.

        Args:
            ticker: Market identifier
            period_seconds: OHLC period in seconds
            start_time: Start timestamp in microseconds
            end_time: End timestamp in microseconds

        Returns:
            True if at least one candle exists, False otherwise
        """
        # One row is enough to answer the question; limiting the scan avoids
        # materializing (and sorting) the whole range.
        df = self.query_ohlc(
            ticker, period_seconds, start_time, end_time, limit=1
        )
        return not df.empty
|