Optimize OHLC queries: run Iceberg scans in threads and reuse DataFrames to avoid redundant scans
This commit is contained in:
@@ -169,7 +169,8 @@ class IcebergClient:
|
||||
ticker: str,
|
||||
period_seconds: int,
|
||||
start_time: int,
|
||||
end_time: int
|
||||
end_time: int,
|
||||
df: Optional[pd.DataFrame] = None,
|
||||
) -> List[Tuple[int, int]]:
|
||||
"""
|
||||
Identify missing data ranges in the requested time period.
|
||||
@@ -179,11 +180,13 @@ class IcebergClient:
|
||||
period_seconds: OHLC period in seconds
|
||||
start_time: Start timestamp in nanoseconds
|
||||
end_time: End timestamp in nanoseconds
|
||||
df: Optional pre-fetched DataFrame to avoid a redundant Iceberg scan
|
||||
|
||||
Returns:
|
||||
List of (start_time, end_time) tuples for missing ranges (nanoseconds)
|
||||
"""
|
||||
df = self.query_ohlc(ticker, period_seconds, start_time, end_time)
|
||||
if df is None:
|
||||
df = self.query_ohlc(ticker, period_seconds, start_time, end_time)
|
||||
|
||||
if df.empty:
|
||||
return [(start_time, end_time)]
|
||||
|
||||
@@ -122,12 +122,12 @@ class OHLCClient:
|
||||
start_time = ((start_time + period_nanos - 1) // period_nanos) * period_nanos
|
||||
end_time = ((end_time + period_nanos - 1) // period_nanos) * period_nanos # exclusive
|
||||
|
||||
# Step 1: Check Iceberg for existing data
|
||||
df = self.iceberg.query_ohlc(ticker, period_seconds, start_time, end_time)
|
||||
# Step 1: Check Iceberg for existing data (run in thread — scan.to_pandas() blocks ~3-5s)
|
||||
df = await asyncio.to_thread(self.iceberg.query_ohlc, ticker, period_seconds, start_time, end_time)
|
||||
|
||||
# Step 2: Identify missing ranges
|
||||
# Step 2: Identify missing ranges — pass df to avoid a redundant Iceberg scan
|
||||
missing_ranges = self.iceberg.find_missing_ranges(
|
||||
ticker, period_seconds, start_time, end_time
|
||||
ticker, period_seconds, start_time, end_time, df=df
|
||||
)
|
||||
|
||||
if not missing_ranges:
|
||||
@@ -148,8 +148,8 @@ class OHLCClient:
|
||||
if result['status'] == 'ERROR':
|
||||
raise ValueError(f"Historical data request failed: {result['error_message']}")
|
||||
|
||||
# Step 5: Query Iceberg again for complete dataset
|
||||
df = self.iceberg.query_ohlc(ticker, period_seconds, start_time, end_time)
|
||||
# Step 5: Query Iceberg again for complete dataset (run in thread)
|
||||
df = await asyncio.to_thread(self.iceberg.query_ohlc, ticker, period_seconds, start_time, end_time)
|
||||
|
||||
return self._apply_decimal_correction(ticker, df)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user