File size: 13,935 Bytes
0a856ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65c5351
 
 
0a856ce
 
7df5b53
30c5519
65c5351
 
0a856ce
7df5b53
65c5351
 
 
 
 
 
 
0a856ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219bd2a
0a856ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65c5351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f27090c
65c5351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a856ce
65c5351
 
 
 
 
0a856ce
65c5351
 
0a856ce
 
65c5351
 
 
 
0a856ce
65c5351
 
 
0a856ce
65c5351
 
 
 
 
 
0a856ce
 
65c5351
 
 
 
 
0a856ce
65c5351
 
0a856ce
65c5351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a856ce
65c5351
 
 
 
 
 
 
 
0a856ce
 
f27090c
0a856ce
db29da7
 
 
84112ce
65c5351
0a856ce
65c5351
 
 
 
 
 
 
 
0a856ce
 
 
 
 
 
65c5351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a856ce
65c5351
 
 
 
 
0a856ce
 
 
65c5351
 
 
 
 
 
 
 
 
 
 
 
 
 
0a856ce
65c5351
 
 
 
0a856ce
 
 
65c5351
 
0a856ce
65c5351
0a856ce
 
 
65c5351
 
 
 
0a856ce
65c5351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
import os
import pandas as pd
from typing import Tuple
from PIL import Image
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI
import matplotlib.pyplot as plt
import json
from datetime import datetime
from huggingface_hub import HfApi
import uuid

# FORCE reload environment variables
# (override=True ensures values edited in .env win over stale shell exports)
load_dotenv(override=True)

# Get API keys with explicit None handling and debugging
# NOTE: any of these may be None if the variable is unset. Groq_Token and
# gemini_token are only module-level fallbacks — ask_question() re-reads
# fresh values on every call. hf_token is used by log_interaction().
Groq_Token = os.getenv("GROQ_API_KEY")
hf_token = os.getenv("HF_TOKEN")
gemini_token = os.getenv("GEMINI_TOKEN")

# Debug print (remove in production)
# print(f"Debug - Groq Token: {'Present' if Groq_Token else 'Missing'}")
# print(f"Debug - Groq Token Value: {Groq_Token[:10] + '...' if Groq_Token else 'None'}")
# print(f"Debug - Gemini Token: {'Present' if gemini_token else 'Missing'}")

# Registry of user-facing model names -> provider model identifiers.
# Keys containing "gemini" are routed to ChatGoogleGenerativeAI; all other
# keys are routed to ChatGroq (see ask_question()'s provider branch).
models = {
    "gpt-oss-120b": "openai/gpt-oss-120b",
     "qwen3-32b": "qwen/qwen3-32b",
    "gpt-oss-20b": "openai/gpt-oss-20b",
    "llama4 maverik":"meta-llama/llama-4-maverick-17b-128e-instruct",
    "llama3.3": "llama-3.3-70b-versatile",
    "deepseek-R1": "deepseek-r1-distill-llama-70b",
    "gemini-2.5-flash": "gemini-2.5-flash",
    "gemini-2.5-pro": "gemini-2.5-pro",
    "gemini-2.5-flash-lite": "gemini-2.5-flash-lite",
    "gemini-2.0-flash": "gemini-2.0-flash",
    "gemini-2.0-flash-lite": "gemini-2.0-flash-lite",
    # "llama4 scout":"meta-llama/llama-4-scout-17b-16e-instruct"
    # "llama3.1": "llama-3.1-8b-instant"
}

def log_interaction(user_query, model_name, response_content, generated_code, execution_time, error_message=None, is_image=False):
    """Log a single user interaction to the Hugging Face logs dataset.

    Builds a one-row DataFrame, writes it to a uniquely named parquet file
    under /tmp, uploads it to the VayuChat_logs dataset repo, and removes
    the local copy. Best-effort: every failure (including a missing
    HF_TOKEN) is caught and printed, never raised to the caller.

    Args:
        user_query: The question the user asked.
        model_name: Friendly model key (see `models`).
        response_content: Final assistant response (stringified for storage).
        generated_code: Code produced by the LLM, or None/"" if none.
        execution_time: Wall-clock seconds for the whole request.
        error_message: Error text if the request failed, else None.
        is_image: True when the response is a path to a generated plot.

    Returns:
        None.
    """
    try:
        # Skip silently (with a warning) when logging is not configured.
        if not hf_token or hf_token.strip() == "":
            print("Warning: HF_TOKEN not available, skipping logging")
            return

        # Create log entry; `success` is derived from the absence of an error.
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "session_id": str(uuid.uuid4()),
            "user_query": user_query,
            "model_name": model_name,
            "response_content": str(response_content),
            "generated_code": generated_code or "",
            "execution_time_seconds": execution_time,
            "error_message": error_message or "",
            "is_image_output": is_image,
            "success": error_message is None
        }

        # Create single-row DataFrame for parquet serialization
        df = pd.DataFrame([log_entry])

        # Create unique filename with timestamp + random suffix so concurrent
        # sessions never collide.
        timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        random_id = str(uuid.uuid4())[:8]
        filename = f"interaction_log_{timestamp_str}_{random_id}.parquet"

        # Save locally first.
        # BUG FIX: the path previously hard-coded the literal "(unknown)"
        # instead of the generated unique filename, so every upload wrote to
        # the same repo path and overwrote earlier logs. Use `filename`.
        local_path = f"/tmp/{filename}"
        df.to_parquet(local_path, index=False)

        # Upload to Hugging Face dataset repo under data/.
        api = HfApi(token=hf_token)
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=f"data/{filename}",
            repo_id="SustainabilityLabIITGN/VayuChat_logs",
            repo_type="dataset",
        )

        # Clean up local file
        if os.path.exists(local_path):
            os.remove(local_path)

        print(f"Successfully logged interaction to HuggingFace: {filename}")

    except Exception as e:
        # Logging must never break the main request flow.
        print(f"Error logging interaction: {e}")

def preprocess_and_load_df(path: str) -> pd.DataFrame:
    """Load the air-quality CSV at *path* and parse its Timestamp column.

    Args:
        path: Filesystem path to a CSV file that contains a "Timestamp"
            column parseable by ``pd.to_datetime``.

    Returns:
        The loaded DataFrame with "Timestamp" converted to datetime64.

    Raises:
        Exception: wrapping any read/parse failure, chained to the original
            error so the root cause stays visible in the traceback.
    """
    try:
        df = pd.read_csv(path)
        df["Timestamp"] = pd.to_datetime(df["Timestamp"])
        return df
    except Exception as e:
        # Chain the cause (`from e`) instead of discarding the traceback.
        raise Exception(f"Error loading dataframe: {e}") from e


def get_from_user(prompt):
    """Wrap a raw prompt string as a user-role chat message dict."""
    return dict(role="user", content=prompt)


def ask_question(model_name: str, question: str) -> dict:
    """Answer a natural-language question about the air-quality data.

    Pipeline: reload env vars -> pick provider from *model_name* (keys
    containing "gemini" go to Google, everything else to Groq) -> smoke-test
    the API key -> build a code-completion prompt describing the three
    CSV-backed DataFrames -> ask the LLM for pandas/matplotlib code ->
    execute that code locally -> log and return the result.

    Returns a chat-message dict with keys: role, content, gen_code,
    ex_code, last_prompt, error (error is None on success).

    NOTE(review): run_safe_exec() runs LLM-generated code via exec() with
    full interpreter privileges — only acceptable in a trusted deployment;
    consider sandboxing if user input is untrusted.
    """
    start_time = datetime.now()
    # ------------------------
    # Helper functions
    # ------------------------
    def make_error_response(msg, log_msg, content=None):
        """Build error response + log it.

        *content* (if given) is the user-facing message; *log_msg* is the
        raw error stored in the log and the returned dict's "error" field.
        """
        execution_time = (datetime.now() - start_time).total_seconds()
        log_interaction(
            user_query=question,
            model_name=model_name,
            response_content=content or msg,
            generated_code="",
            execution_time=execution_time,
            error_message=log_msg,
            is_image=False
        )
        return {
            "role": "assistant",
            "content": content or msg,
            "gen_code": "",
            "ex_code": "",
            "last_prompt": question,
            "error": log_msg
        }
    def validate_api_token(token, token_name, msg_if_missing):
        """Check for missing/empty API tokens; returns an error dict or None."""
        if not token or token.strip() == "":
            return make_error_response(
                msg="Missing or empty API token",
                log_msg="Missing or empty API token",
                content=msg_if_missing
            )
        return None  # OK
    def run_safe_exec(full_code, df=None, extra_globals=None):
        """Execute generated code; returns (answer, None) or (None, error str).

        The generated code is expected to store its result (text, number,
        or a saved-plot filename) in a variable named 'answer'.
        """
        local_vars = {}
        # Globals visible to the generated code: plotting + data libraries
        # and the pre-loaded DataFrame.
        global_vars = {
            'pd': pd, 'plt': plt, 'os': os,
            'sns': __import__('seaborn'),
            'uuid': __import__('uuid'),
            'calendar': __import__('calendar'),
            'np': __import__('numpy'),
            'df': df  # <-- pass your DataFrame here
        }

        # allow user to inject more globals (optional)
        if extra_globals:
            global_vars.update(extra_globals)

        try:
            # SECURITY: exec() of LLM-generated code — trusted-environment only.
            exec(full_code, global_vars, local_vars)
            return (
                local_vars.get('answer', "Code executed but no result was saved in 'answer' variable"),
                None
            )
        except Exception as code_error:
            return None, str(code_error)

    # ------------------------
    # Step 1: Reload env vars
    # ------------------------
    # Fresh read on every call so a rotated key takes effect without restart.
    load_dotenv(override=True)
    fresh_groq_token = os.getenv("GROQ_API_KEY")
    fresh_gemini_token = os.getenv("GEMINI_TOKEN")
    # ------------------------
    # Step 2: Init LLM
    # ------------------------
    try:
        if "gemini" in model_name:
            token_error = validate_api_token(
                fresh_gemini_token,
                "GEMINI_TOKEN",
                "Gemini API token not available or empty. Please set GEMINI_TOKEN in your environment variable."
            )
            if token_error:
                return token_error

            try:
                llm = ChatGoogleGenerativeAI(
                    model=models[model_name],
                    google_api_key=fresh_gemini_token,
                    temperature=0
                )
                # Minimal round-trip to fail fast on a bad/expired key.
                llm.invoke("Test")
                # print("Gemini API key test successful")
            except Exception as api_error:
                # Distinguish auth problems (friendlier message) from other errors.
                return make_error_response(
                    msg="API Connection Error",
                    log_msg=str(api_error),
                    content="API Key Error: Your Gemini API key appears to be invalid, expired, or restricted. Please check your GEMINI_TOKEN in the .env file."
                    if "organization_restricted"in str(api_error).lower() or "unauthorized" in str(api_error).lower()
                    else f"API Connection Error: {api_error}"
                )

        else:
            token_error = validate_api_token(
                fresh_groq_token,
                "GROQ_API_KEY",
                "Groq API token not available or empty. Please set GROQ_API_KEY in your environment variables and restart the application."
            )
            if token_error:
                return token_error

            try:
                llm = ChatGroq(
                    model=models[model_name],
                    api_key=fresh_groq_token,
                    temperature=0
                )
                llm.invoke("Test")  # test API key
                # print("Groq API key test successful")
            except Exception as api_error:
                return make_error_response(
                    msg="API Connection Error",
                    log_msg=str(api_error),
                    content="API Key Error: Your Groq API key appears to be invalid, expired, or restricted. Please check your GROQ_API_KEY in the .env file."
                    if "organization_restricted"in str(api_error).lower() or "unauthorized" in str(api_error).lower()
                    else f"API Connection Error: {api_error}"
                )
    except Exception as e:
        return make_error_response(str(e), str(e))
    # ------------------------
    # Step 3: Check AQ_met_data.csv
    # ------------------------
    # Data files are expected in the current working directory.
    if not os.path.exists("AQ_met_data.csv"):
        return make_error_response(
            msg="Data file not found",
            log_msg="Data file not found",
            content="AQ_met_data.csv file not found. Please ensure the data file is in the correct location."
        )

    df = pd.read_csv("AQ_met_data.csv")
    df["Timestamp"] = pd.to_datetime(df["Timestamp"])
    # Helper for embedding newlines inside the f-string template below
    # (backslashes are not allowed in f-string expressions on older Pythons).
    new_line = "\n"
    # NOTE(review): states_data.csv / ncap_funding_data.csv are read without
    # an existence check, unlike AQ_met_data.csv — confirm they always ship
    # together.
    states_df = pd.read_csv("states_data.csv")
    ncap_df = pd.read_csv("ncap_funding_data.csv")

    # Template for user query: a partial script whose header (imports +
    # data loading + schema comments) is re-used verbatim when executing
    # the LLM's completion in Step 5.
    template = f"""```python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import uuid
import calendar
import numpy as np
# Set professional matplotlib styling
plt.style.use('vayuchat.mplstyle')
df = pd.read_csv("AQ_met_data.csv")
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
states_df = pd.read_csv("states_data.csv")
ncap_df = pd.read_csv("ncap_funding_data.csv")
# df is pandas DataFrame with air quality data from India. Data frequency is daily from 2017 to 2024. The data has the following columns and data types:
{new_line.join(map(lambda x: '# '+x, str(df.dtypes).split(new_line)))}
# states_df is a pandas DataFrame of state-wise population, area and whether state is union territory or not of India.
{new_line.join(map(lambda x: '# '+x, str(states_df.dtypes).split(new_line)))}
# ncap_df is a pandas DataFrame of funding given to the cities of India from 2019-2022, under The National Clean Air Program (NCAP).
{new_line.join(map(lambda x: '# '+x, str(ncap_df.dtypes).split(new_line)))}
# Question: {question.strip()}
# Generate code to answer the question and save result in 'answer' variable
# If creating a plot, save it with a unique filename and store the filename in 'answer'
# If returning text/numbers, store the result directly in 'answer'
```"""

    # Read system prompt from txt file (kept external so it can be tuned
    # without code changes).
    with open("new_system_prompt.txt", "r", encoding="utf-8") as f:
        system_prompt = f.read().strip()

    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": f"""Complete the following code to answer the user's question: 
            {template}"""
        }
    ]

    # ------------------------
    # Step 4: Call model
    # ------------------------
    try:
        response = llm.invoke(messages)
        answer = response.content
    except Exception as e:
        return make_error_response(f"Error: {e}", str(e))

    # ------------------------
    # Step 5: Extract code
    # ------------------------
    # Pull the fenced code block out of the reply; if there is no fence,
    # treat the whole reply as code.
    code_part = answer.split("```python")[1].split("```")[0] if "```python" in answer else answer
    # Prepend the template's header so the completion runs with the same
    # imports and pre-loaded DataFrames it was shown.
    full_code = f"""
{template.split("```python")[1].split("```")[0]}
{code_part}
"""
    answer_result, code_error = run_safe_exec(full_code, df, extra_globals={'states_df': states_df, 'ncap_df': ncap_df})

    execution_time = (datetime.now() - start_time).total_seconds()
    if code_error:
        # Friendly error messages mapped from common failure substrings.
        msg = "I encountered an error while analyzing your data. "
        if "syntax" in code_error.lower():
            msg += "There was a syntax error in the generated code. Please try rephrasing your question."
        elif "not defined" in code_error.lower():
            msg += "Variable naming error occurred. Please try asking the question again."
        elif "division by zero" in code_error.lower():
            msg += "Calculation involved division by zero, possibly due to missing data."
        elif "no data" in code_error.lower() or "empty" in code_error.lower():
            msg += "No relevant data was found for your query."
        else:
            msg += f"Technical error: {code_error}"

        msg += "\n\n💡 **Suggestions:**\n- Try rephrasing your question\n- Use simpler terms\n- Check if the data exists for your specified criteria"

        log_interaction(
            user_query=question,
            model_name=model_name,
            response_content=msg,
            generated_code=full_code,
            execution_time=execution_time,
            error_message=code_error,
            is_image=False
        )
        return {
            "role": "assistant",
            "content": msg,
            "gen_code": full_code,
            "ex_code": full_code,
            "last_prompt": question,
            "error": code_error
        }

    # ------------------------
    # Step 7: Success logging
    # ------------------------
    # A string result ending in an image extension means the generated code
    # saved a plot and stored its filename in 'answer'.
    is_image = isinstance(answer_result, str) and answer_result.endswith(('.png', '.jpg', '.jpeg'))
    log_interaction(
        user_query=question,
        model_name=model_name,
        response_content=str(answer_result),
        generated_code=full_code,
        execution_time=execution_time,
        error_message=None,
        is_image=is_image
    )

    return {
        "role": "assistant",
        "content": answer_result,
        "gen_code": full_code,
        "ex_code": full_code,
        "last_prompt": question,
        "error": None
    }