File size: 9,290 Bytes
859af74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7db56ad
859af74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63f74a3
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
from typing import Dict, List, Optional

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class SyntheticDataGenerator:
    """
    Generates synthetic market data for testing and development purposes.

    Produces OHLCV bars and tick series from random multiplicative price
    moves. All randomness is drawn from the global ``np.random`` state, so
    callers that need reproducible output should seed it beforehand.
    """

    # Accepted ``frequency`` spellings mapped to the pandas offset alias that
    # is used to build the timestamp index.
    _FREQUENCY_ALIASES = {
        '1min': '1min',
        '1m': '1min',
        '5min': '5min',
        '5m': '5min',
        '1H': '1h',
        '1h': '1h',
        '1D': '1D',
        '1d': '1D',
    }

    def __init__(self, config: Dict):
        """
        Read generator parameters from ``config['synthetic_data']``.

        Args:
            config: Configuration mapping. The ``synthetic_data`` section may
                be missing or ``None``; every parameter then falls back to
                its default (base_price=100.0, volatility=0.02, trend=0.001,
                noise_level=0.005).
        """
        self.config = config
        # ``or {}`` also covers a section that is present but set to None.
        settings = config.get('synthetic_data') or {}
        self.base_price = settings.get('base_price', 100.0)
        self.volatility = settings.get('volatility', 0.02)
        self.trend = settings.get('trend', 0.001)
        self.noise_level = settings.get('noise_level', 0.005)

        logger.info(f"Initialized SyntheticDataGenerator with base_price={self.base_price}, "
                   f"volatility={self.volatility}, trend={self.trend}")

    def generate_ohlcv_data(self,
                           symbol: str = 'AAPL',
                           start_date: str = '2024-01-01',
                           end_date: str = '2024-12-31',
                           frequency: str = '1min') -> pd.DataFrame:
        """
        Generate synthetic OHLCV (Open, High, Low, Close, Volume) data.

        Each bar opens at the previous close (the first bar at
        ``self.base_price``) perturbed by N(0, noise_level), and closes at the
        open perturbed by N(0, volatility). High/low extend beyond the
        open/close body by a random fraction of it, and volume is a random
        integer in [1000, 100000) scaled up by the bar's relative move.

        NOTE(review): ``self.trend`` is not applied to the bars here, so the
        output has no drift term. Confirm the intended drift scale before
        wiring it in.

        Args:
            symbol: Stock symbol
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            frequency: Data frequency ('1min', '5min', '1H', '1D'; the
                lowercase/short forms '1m', '5m', '1h', '1d' are accepted too)

        Returns:
            DataFrame with columns timestamp, symbol, open, high, low, close,
            volume. Empty (but with those columns) when the date range
            contains no timestamps.

        Raises:
            ValueError: If ``frequency`` is not one of the supported values.
        """
        logger.info(f"Generating synthetic OHLCV data for {symbol} from {start_date} to {end_date}")

        # Create datetime range
        start_dt = pd.to_datetime(start_date)
        end_dt = pd.to_datetime(end_date)

        pandas_freq = self._FREQUENCY_ALIASES.get(frequency) if isinstance(frequency, str) else None
        if pandas_freq is None:
            raise ValueError(f"Unsupported frequency: {frequency}")

        timestamps = pd.date_range(start=start_dt, end=end_dt, freq=pandas_freq)
        n = len(timestamps)

        # Per-bar multiplicative factors: the opening gap relative to the
        # previous close, then the open-to-close move.
        gap = 1 + np.random.normal(0, self.noise_level, n)
        move = 1 + np.random.normal(0, self.volatility, n)

        # Interleave the factors (gap_0, move_0, gap_1, move_1, ...) and take
        # a running product seeded with the base price. This reproduces the
        # bar-by-bar chain "open = previous close * gap; close = open * move"
        # without a Python-level loop.
        factors = np.empty(2 * n)
        factors[0::2] = gap
        factors[1::2] = move
        path = np.cumprod(np.concatenate(([self.base_price], factors)))[1:]
        open_price = path[0::2]
        close_price = path[1::2]

        # High and low overshoot the open/close body by up to half of a
        # "range" that is 1.5x-3x the body size.
        body = np.abs(close_price - open_price)
        price_range = body * np.random.uniform(1.5, 3.0, n)
        high_price = np.maximum(open_price, close_price) + price_range * np.random.uniform(0, 0.5, n)
        low_price = np.minimum(open_price, close_price) - price_range * np.random.uniform(0, 0.5, n)

        # Volume is correlated with the size of the price movement.
        volume = np.random.randint(1000, 100000, size=n) * (1 + body / open_price)

        df = pd.DataFrame({
            'timestamp': timestamps,
            'symbol': symbol,
            'open': np.round(open_price, 2),
            'high': np.round(high_price, 2),
            'low': np.round(low_price, 2),
            'close': np.round(close_price, 2),
            # Truncate toward zero, like int() on a positive float.
            'volume': volume.astype(np.int64),
        })
        logger.info(f"Generated {len(df)} data points for {symbol}")
        return df

    def generate_tick_data(self,
                          symbol: str = 'AAPL',
                          duration_minutes: int = 60,
                          tick_interval_ms: int = 1000) -> pd.DataFrame:
        """
        Generate high-frequency tick data for testing.

        Timestamps start at ``datetime.now()`` and are spaced
        ``tick_interval_ms`` apart. Prices follow ``_generate_price_series``
        with doubled volatility, plus an independent per-tick perturbation of
        N(0, noise_level / 2).

        Args:
            symbol: Stock symbol
            duration_minutes: Duration in minutes
            tick_interval_ms: Interval between ticks in milliseconds

        Returns:
            DataFrame with columns timestamp, symbol, price, volume. Empty
            when the duration is shorter than one tick interval.

        Raises:
            ValueError: If ``tick_interval_ms`` is not positive or
                ``duration_minutes`` is negative.
        """
        logger.info(f"Generating tick data for {symbol} for {duration_minutes} minutes")

        if tick_interval_ms <= 0:
            raise ValueError(f"tick_interval_ms must be positive, got {tick_interval_ms}")
        if duration_minutes < 0:
            raise ValueError(f"duration_minutes must be non-negative, got {duration_minutes}")

        num_ticks = (duration_minutes * 60 * 1000) // tick_interval_ms
        timestamps = pd.date_range(
            start=datetime.now(),
            periods=num_ticks,
            freq=f'{tick_interval_ms}ms'
        )

        # Generate price series with more noise for tick data
        base_prices = self._generate_price_series(num_ticks, volatility=self.volatility * 2)

        # Add micro-movements on top of the underlying path
        jitter = np.random.normal(0, self.noise_level * 0.5, num_ticks)
        tick_price = base_prices * (1 + jitter)

        df = pd.DataFrame({
            'timestamp': timestamps,
            'symbol': symbol,
            'price': np.round(tick_price, 4),
            'volume': np.random.randint(1, 100, size=num_ticks).astype(np.int64),
        })
        logger.info(f"Generated {len(df)} tick data points for {symbol}")
        return df

    def _generate_price_series(self, length: int, volatility: Optional[float] = None) -> np.ndarray:
        """
        Generate a price series using geometric Brownian motion.

        The series is simulated over the unit time interval [0, 1] split into
        ``length`` steps, with drift ``self.trend`` and starting level
        ``self.base_price``.

        Args:
            length: Number of price points
            volatility: Price volatility (if None, uses self.volatility)

        Returns:
            Array of ``length`` prices (empty when ``length`` is 0)
        """
        if length == 0:
            # Nothing to simulate; also avoids dividing by zero for dt below.
            return np.empty(0, dtype=float)

        if volatility is None:
            volatility = self.volatility

        # Geometric Brownian motion parameters
        mu = self.trend  # drift
        sigma = volatility  # volatility

        # Time grid over the unit interval
        dt = 1.0 / length
        t = np.linspace(0, 1, length)

        # Brownian motion: cumulative sum of N(0, dt) increments
        dW = np.random.normal(0, np.sqrt(dt), length)
        W = np.cumsum(dW)

        # Geometric Brownian motion
        return self.base_price * np.exp((mu - 0.5 * sigma**2) * t + sigma * W)

    def save_to_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """
        Save generated data to CSV file (without the DataFrame index).

        Args:
            df: DataFrame to save
            filepath: Path to save the CSV file
        """
        df.to_csv(filepath, index=False)
        logger.info(f"Saved synthetic data to {filepath}")

    def generate_market_scenarios(self, scenario_type: str = 'normal') -> pd.DataFrame:
        """
        Generate data for different market scenarios.

        Each scenario calls ``generate_ohlcv_data()`` with its default
        arguments while temporarily adjusting the generator parameters:

        - 'normal': no adjustment
        - 'volatile': volatility x3
        - 'trending': trend x5
        - 'crash': volatility x5 and trend set to -0.01

        The original ``volatility`` and ``trend`` values are restored exactly
        afterwards, even if generation raises.

        NOTE(review): ``generate_ohlcv_data`` does not read ``self.trend``, so
        the trend adjustments currently do not change the generated bars.

        Args:
            scenario_type: Type of scenario ('normal', 'volatile', 'trending', 'crash')

        Returns:
            DataFrame with scenario-specific data

        Raises:
            ValueError: If ``scenario_type`` is not recognised.
        """
        logger.info(f"Generating {scenario_type} market scenario")

        if scenario_type == 'normal':
            return self.generate_ohlcv_data()
        if scenario_type == 'volatile':
            return self._generate_with_overrides(volatility=self.volatility * 3)
        if scenario_type == 'trending':
            return self._generate_with_overrides(trend=self.trend * 5)
        if scenario_type == 'crash':
            return self._generate_with_overrides(volatility=self.volatility * 5, trend=-0.01)
        raise ValueError(f"Unknown scenario type: {scenario_type}")

    def _generate_with_overrides(self,
                                 volatility: Optional[float] = None,
                                 trend: Optional[float] = None) -> pd.DataFrame:
        """Run ``generate_ohlcv_data()`` with temporary volatility/trend, then restore them."""
        saved_volatility, saved_trend = self.volatility, self.trend
        if volatility is not None:
            self.volatility = volatility
        if trend is not None:
            self.trend = trend
        try:
            return self.generate_ohlcv_data()
        finally:
            # Restore the saved values rather than dividing back, which would
            # both drift in floating point and be skipped on an exception.
            self.volatility = saved_volatility
            self.trend = saved_trend

    def generate_data(self) -> pd.DataFrame:
        """
        Generate synthetic OHLCV data using config defaults.

        Reads ``symbol`` from the ``trading`` config section and
        ``start_date``, ``end_date`` and ``frequency`` from the
        ``synthetic_data`` section, falling back to 'AAPL', '2024-01-01',
        '2024-12-31' and '1min' respectively.

        Returns:
            DataFrame with OHLCV data
        """
        trading = self.config.get('trading') or {}
        settings = self.config.get('synthetic_data') or {}
        return self.generate_ohlcv_data(
            symbol=trading.get('symbol', 'AAPL'),
            start_date=settings.get('start_date', '2024-01-01'),
            end_date=settings.get('end_date', '2024-12-31'),
            frequency=settings.get('frequency', '1min'),
        )