data fixes; indicator=>workspace sync

This commit is contained in:
2026-03-31 20:29:12 -04:00
parent 998f69fa1a
commit cd28e18e52
45 changed files with 1324 additions and 1239 deletions

View File

@@ -10,7 +10,7 @@ from pyiceberg.expressions import (
And,
EqualTo,
GreaterThanOrEqual,
LessThanOrEqual
LessThan,
)
log = logging.getLogger(__name__)
@@ -98,7 +98,7 @@ class IcebergClient:
EqualTo("ticker", ticker),
EqualTo("period_seconds", period_seconds),
GreaterThanOrEqual("timestamp", start_time),
LessThanOrEqual("timestamp", end_time)
LessThan("timestamp", end_time) # end_time is exclusive
)
)
@@ -110,6 +110,10 @@ class IcebergClient:
if not df.empty:
df = df.sort_values("timestamp")
# Convert integer microsecond timestamps to DatetimeIndex
df.index = pd.to_datetime(df["timestamp"], unit="us", utc=True)
df.index.name = "datetime"
df = df.drop(columns=["timestamp"])
# Apply price/volume conversion if metadata client available
if self.metadata_client is not None:
df = self._apply_denominators(df, ticker)
@@ -186,9 +190,9 @@ class IcebergClient:
# Convert period to microseconds
period_micros = period_seconds * 1_000_000
# Generate expected timestamps
expected_timestamps = list(range(start_time, end_time + 1, period_micros))
actual_timestamps = set(df['timestamp'].values)
# Generate expected timestamps — end_time is exclusive
expected_timestamps = list(range(start_time, end_time, period_micros))
actual_timestamps = set(df.index.view('int64') // 1000)
# Find gaps
missing = sorted(set(expected_timestamps) - actual_timestamps)

View File

@@ -12,6 +12,8 @@ from .symbol_metadata_client import SymbolMetadataClient
log = logging.getLogger(__name__)
log = logging.getLogger(__name__)
class OHLCClient:
"""
@@ -118,6 +120,11 @@ class OHLCClient:
TimeoutError: If historical data request times out
ValueError: If request fails
"""
# Align times to period boundaries: [ceil(start), ceil(end)) exclusive
period_micros = period_seconds * 1_000_000
start_time = ((start_time + period_micros - 1) // period_micros) * period_micros
end_time = ((end_time + period_micros - 1) // period_micros) * period_micros # exclusive
# Step 1: Check Iceberg for existing data
df = self.iceberg.query_ohlc(ticker, period_seconds, start_time, end_time)
@@ -128,7 +135,7 @@ class OHLCClient:
if not missing_ranges:
# All data exists in Iceberg
return df
return self._forward_fill_gaps(df, period_seconds)
# Step 3: Request missing data for each range
# For simplicity, request entire range (relay can merge adjacent requests)
@@ -147,6 +154,39 @@ class OHLCClient:
# Step 5: Query Iceberg again for complete dataset
df = self.iceberg.query_ohlc(ticker, period_seconds, start_time, end_time)
return self._forward_fill_gaps(df, period_seconds)
def _forward_fill_gaps(self, df: pd.DataFrame, period_seconds: int) -> pd.DataFrame:
    """
    Forward-fill interior gap bars with the last known close.

    A gap bar is a row whose ``close`` is null. A gap bar is *interior* when
    at least one real (non-null close) bar sorts before it and at least one
    sorts after it. For every interior gap bar the previous real close is
    written into ``close`` and, when those columns exist, into ``open``,
    ``high`` and ``low``; ``volume`` (when present) is set to 0.0.

    Edge gaps -- gap rows before the first real bar or after the last real
    bar -- are returned untouched, including their volume.

    Only rows already present in ``df`` are filled. Rows are never inserted
    for timestamps that are absent from the index.

    Args:
        df: OHLC frame; must contain a ``close`` column. It is sorted by
            index before filling and the caller's frame is not modified.
        period_seconds: Bar period in seconds. Currently unused; kept so the
            signature stays stable for callers.

    Returns:
        ``df`` unchanged when it is empty; otherwise a frame sorted by index
        with interior gap bars filled.
    """
    if df.empty:
        return df

    # sort_index() returns a new frame, so the assignments below never
    # touch the caller's object.
    df = df.sort_index()

    close = df['close']
    is_gap = close.isna()
    if not is_gap.any():
        return df

    # ffill() is non-null only where a real bar precedes the row, bfill()
    # only where one follows; a gap row satisfying both is interior.
    carried = close.ffill()
    interior = is_gap & carried.notna() & close.bfill().notna()
    if not interior.any():
        return df

    # Flat bar: the carried close becomes open/high/low/close of the gap.
    df['close'] = close.where(~interior, carried)
    for col in ('open', 'high', 'low'):
        if col in df.columns:
            df[col] = df[col].where(~interior, carried)

    # No trading happened in a synthesized bar.
    if 'volume' in df.columns:
        df['volume'] = df['volume'].where(~interior, 0.0)

    return df
async def __aenter__(self):