|
import logging
from contextlib import contextmanager
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
|
|
|
logger = logging.getLogger(__name__)


class SyntheticDataGenerator:
    """
    Generates synthetic market data for testing and development purposes.

    Prices follow a random walk driven by three configurable parameters read
    from ``config['synthetic_data']``: ``volatility`` (open-to-close move),
    ``noise_level`` (close-to-next-open gap) and ``trend`` (drift). All random
    draws come from the global ``np.random`` state, so callers can make runs
    reproducible with ``np.random.seed``.
    """

    # Column order of the frames returned by generate_ohlcv_data().
    _OHLCV_COLUMNS = ('timestamp', 'symbol', 'open', 'high', 'low', 'close', 'volume')

    # Accepted ``frequency`` spellings -> pandas offset alias.
    _FREQUENCY_ALIASES = {
        '1min': '1min', '1m': '1min',
        '5min': '5min', '5m': '5min',
        '1H': '1h', '1h': '1h',
        '1D': '1D', '1d': '1D',
    }

    def __init__(self, config: Optional[Dict] = None):
        """
        Read generator parameters from the configuration.

        Args:
            config: Configuration mapping. The optional ``synthetic_data``
                section may define ``base_price`` (default 100.0),
                ``volatility`` (0.02), ``trend`` (0.001) and ``noise_level``
                (0.005). A missing/None config or section falls back to the
                defaults.
        """
        self.config = config if config is not None else {}

        settings = self._section('synthetic_data')
        self.base_price = settings.get('base_price', 100.0)
        self.volatility = settings.get('volatility', 0.02)
        self.trend = settings.get('trend', 0.001)
        self.noise_level = settings.get('noise_level', 0.005)

        logger.info(f"Initialized SyntheticDataGenerator with base_price={self.base_price}, "
                    f"volatility={self.volatility}, trend={self.trend}")

    def _section(self, name: str) -> Dict:
        """Return config section ``name``, or an empty dict if absent or None."""
        return self.config.get(name) or {}

    @contextmanager
    def _temporary_overrides(self, **overrides):
        """Temporarily set instance attributes, restoring them even on error."""
        saved = {attr: getattr(self, attr) for attr in overrides}
        for attr, value in overrides.items():
            setattr(self, attr, value)
        try:
            yield
        finally:
            # Restore the saved values rather than inverting the arithmetic,
            # which would accumulate floating-point error across calls.
            for attr, value in saved.items():
                setattr(self, attr, value)

    def generate_ohlcv_data(self,
                            symbol: str = 'AAPL',
                            start_date: str = '2024-01-01',
                            end_date: str = '2024-12-31',
                            frequency: str = '1min') -> pd.DataFrame:
        """
        Generate synthetic OHLCV (Open, High, Low, Close, Volume) data.

        Each bar opens at the previous close perturbed by ``noise_level`` and
        closes after a move drawn with ``volatility`` plus a drift term.
        ``trend`` is treated as the total drift over the generated range (the
        same meaning it has in ``_generate_price_series``) and is spread
        evenly across the bars.

        NOTE: ``volatility`` and ``noise_level`` are applied per bar and are
        not rescaled for the bar frequency.

        Args:
            symbol: Stock symbol
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            frequency: Data frequency ('1min'/'1m', '5min'/'5m', '1H'/'1h',
                '1D'/'1d')

        Returns:
            DataFrame with columns timestamp, symbol, open, high, low, close,
            volume; empty (but with those columns) if the range has no bars.

        Raises:
            ValueError: If ``frequency`` is not one of the supported values.
        """
        logger.info(f"Generating synthetic OHLCV data for {symbol} from {start_date} to {end_date}")

        start_dt = pd.to_datetime(start_date)
        end_dt = pd.to_datetime(end_date)

        pandas_freq = self._FREQUENCY_ALIASES.get(frequency)
        if pandas_freq is None:
            raise ValueError(f"Unsupported frequency: {frequency}")

        timestamps = pd.date_range(start=start_dt, end=end_dt, freq=pandas_freq)

        if len(timestamps) == 0:
            df = pd.DataFrame(columns=list(self._OHLCV_COLUMNS))
        else:
            df = self._simulate_bars(symbol, timestamps)

        logger.info(f"Generated {len(df)} data points for {symbol}")
        return df

    def _simulate_bars(self, symbol: str, timestamps: pd.DatetimeIndex) -> pd.DataFrame:
        """Build the OHLCV frame for a non-empty sequence of bar timestamps."""
        n = len(timestamps)

        # Per-bar drift so that the compounded drift over all bars is ~trend.
        drift = self.trend / n

        gap_returns = np.random.normal(0, self.noise_level, n)            # prev close -> open
        bar_returns = drift + np.random.normal(0, self.volatility, n)     # open -> close

        # close[i] = base_price * prod_{k<=i} (1 + gap[k]) * (1 + bar[k]),
        # i.e. the bar-by-bar recurrence expressed as a cumulative product.
        close = self.base_price * np.cumprod((1 + gap_returns) * (1 + bar_returns))

        prev_close = np.empty(n)
        prev_close[0] = self.base_price
        prev_close[1:] = close[:-1]
        open_ = prev_close * (1 + gap_returns)

        # Wicks extend beyond the candle body by a random share of its size.
        spread = np.abs(close - open_) * np.random.uniform(1.5, 3.0, n)
        high = np.maximum(open_, close) + spread * np.random.uniform(0, 0.5, n)
        low = np.minimum(open_, close) - spread * np.random.uniform(0, 0.5, n)

        # Volume grows with the relative size of the bar's move. For positive
        # prices |close - open| / open equals |bar_returns|; using the return
        # directly avoids a division that fails once prices underflow to 0.
        volume = np.random.randint(1000, 100000, n) * (1 + np.abs(bar_returns))

        return pd.DataFrame({
            'timestamp': timestamps,
            'symbol': [symbol] * n,
            'open': np.round(open_, 2),
            'high': np.round(high, 2),
            'low': np.round(low, 2),
            'close': np.round(close, 2),
            'volume': volume.astype(np.int64),  # truncates, like int()
        })

    def generate_tick_data(self,
                           symbol: str = 'AAPL',
                           duration_minutes: int = 60,
                           tick_interval_ms: int = 1000) -> pd.DataFrame:
        """
        Generate high-frequency tick data for testing.

        Ticks start at the current local time. Prices follow
        ``_generate_price_series`` with twice the configured volatility, with
        per-tick jitter of half the configured noise level on top.

        Args:
            symbol: Stock symbol
            duration_minutes: Duration in minutes
            tick_interval_ms: Interval between ticks in milliseconds

        Returns:
            DataFrame with columns timestamp, symbol, price, volume

        Raises:
            ValueError: If ``tick_interval_ms`` is not positive or
                ``duration_minutes`` is negative.
        """
        logger.info(f"Generating tick data for {symbol} for {duration_minutes} minutes")

        if tick_interval_ms <= 0:
            raise ValueError(f"tick_interval_ms must be positive, got {tick_interval_ms}")
        if duration_minutes < 0:
            raise ValueError(f"duration_minutes must be non-negative, got {duration_minutes}")

        num_ticks = int((duration_minutes * 60 * 1000) // tick_interval_ms)
        timestamps = pd.date_range(
            start=datetime.now(),
            periods=num_ticks,
            freq=f'{tick_interval_ms}ms'
        )

        base_prices = self._generate_price_series(num_ticks, volatility=self.volatility * 2)
        jitter = np.random.normal(0, self.noise_level * 0.5, num_ticks)

        df = pd.DataFrame({
            'timestamp': timestamps,
            'symbol': [symbol] * num_ticks,
            'price': np.round(base_prices * (1 + jitter), 4),
            'volume': np.random.randint(1, 100, num_ticks),
        })
        logger.info(f"Generated {len(df)} tick data points for {symbol}")
        return df

    def _generate_price_series(self, length: int, volatility: Optional[float] = None) -> np.ndarray:
        """
        Generate a price series using geometric Brownian motion on t in [0, 1].

        The series starts exactly at ``base_price`` (W(0) = 0); ``trend`` and
        ``volatility`` are the drift and diffusion over the whole unit
        interval.

        Args:
            length: Number of price points
            volatility: Price volatility (if None, uses self.volatility)

        Returns:
            Array of ``length`` prices (empty when ``length`` is 0)

        Raises:
            ValueError: If ``length`` is negative.
        """
        if length < 0:
            raise ValueError(f"length must be non-negative, got {length}")
        if length == 0:
            return np.empty(0, dtype=float)

        mu = self.trend
        sigma = self.volatility if volatility is None else volatility

        t = np.linspace(0.0, 1.0, length)

        # length points span length - 1 increments; the first point has no
        # increment so the path starts at base_price.
        steps = length - 1
        W = np.zeros(length)
        if steps:
            dt = 1.0 / steps
            W[1:] = np.cumsum(np.random.normal(0.0, np.sqrt(dt), steps))

        return self.base_price * np.exp((mu - 0.5 * sigma ** 2) * t + sigma * W)

    def save_to_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """
        Save generated data to CSV file (without the index column).

        Args:
            df: DataFrame to save
            filepath: Path to save the CSV file
        """
        df.to_csv(filepath, index=False)
        logger.info(f"Saved synthetic data to {filepath}")

    def generate_market_scenarios(self, scenario_type: str = 'normal', **ohlcv_kwargs) -> pd.DataFrame:
        """
        Generate data for different market scenarios.

        The scenario temporarily adjusts the generator parameters:
        'volatile' triples volatility, 'trending' multiplies trend by five,
        'crash' multiplies volatility by five and sets trend to -0.01. The
        original parameters are always restored afterwards, including when
        generation fails.

        Args:
            scenario_type: Type of scenario ('normal', 'volatile', 'trending', 'crash')
            **ohlcv_kwargs: Optional keyword arguments forwarded to
                ``generate_ohlcv_data`` (symbol, start_date, end_date,
                frequency); its defaults apply when omitted.

        Returns:
            DataFrame with scenario-specific data

        Raises:
            ValueError: If ``scenario_type`` is unknown.
        """
        logger.info(f"Generating {scenario_type} market scenario")

        if scenario_type == 'normal':
            overrides = {}
        elif scenario_type == 'volatile':
            overrides = {'volatility': self.volatility * 3}
        elif scenario_type == 'trending':
            overrides = {'trend': self.trend * 5}
        elif scenario_type == 'crash':
            overrides = {'volatility': self.volatility * 5, 'trend': -0.01}
        else:
            raise ValueError(f"Unknown scenario type: {scenario_type}")

        with self._temporary_overrides(**overrides):
            return self.generate_ohlcv_data(**ohlcv_kwargs)

    def generate_data(self) -> pd.DataFrame:
        """
        Generate synthetic OHLCV data using config defaults.

        Reads ``symbol`` from the ``trading`` config section and
        ``start_date``, ``end_date`` and ``frequency`` from the
        ``synthetic_data`` section, falling back to 'AAPL', '2024-01-01',
        '2024-12-31' and '1min'.

        Returns:
            DataFrame with OHLCV data
        """
        trading = self._section('trading')
        settings = self._section('synthetic_data')

        return self.generate_ohlcv_data(
            symbol=trading.get('symbol', 'AAPL'),
            start_date=settings.get('start_date', '2024-01-01'),
            end_date=settings.get('end_date', '2024-12-31'),
            frequency=settings.get('frequency', '1min'),
        )