Spaces:

SustainabilityLabIITGN
/

VayuChat

Running

App Files Files Community

VayuChat / src.py

Nipun

Add critical numpy indexing fixes to system prompt

91be342 7 days ago

raw

history blame

17.2 kB

	import os
	import pandas as pd
	from typing import Tuple
	from PIL import Image
	from dotenv import load_dotenv
	from langchain_groq import ChatGroq
	from langchain_google_genai import ChatGoogleGenerativeAI
	import matplotlib.pyplot as plt
	import json
	from datetime import datetime
	from huggingface_hub import HfApi
	import uuid

	# Force a reload so values in .env override anything already in the process environment.
	load_dotenv(override=True)

	# API credentials; each is None when the corresponding variable is unset.
	Groq_Token = os.getenv("GROQ_API_KEY")
	hf_token = os.getenv("HF_TOKEN")
	gemini_token = os.getenv("GEMINI_TOKEN")

	# Startup diagnostics report presence only. Never echo any part of a key:
	# stdout ends up in shared application logs, and even a prefix is secret material.
	print(f"Debug - Groq Token: {'Present' if Groq_Token else 'Missing'}")
	print(f"Debug - Gemini Token: {'Present' if gemini_token else 'Missing'}")

	# Display name -> provider model identifier for models that ask_question
	# routes through ChatGroq (everything except "gemini-pro").
	_GROQ_MODEL_IDS = {
	    "gpt-oss-20b": "openai/gpt-oss-20b",
	    "gpt-oss-120b": "openai/gpt-oss-120b",
	    "llama3.1": "llama-3.1-8b-instant",
	    "llama3.3": "llama-3.3-70b-versatile",
	    "deepseek-R1": "deepseek-r1-distill-llama-70b",
	    "llama4 maverik": "meta-llama/llama-4-maverick-17b-128e-instruct",
	    "llama4 scout": "meta-llama/llama-4-scout-17b-16e-instruct",
	}

	# Display name -> model identifier for the model served through ChatGoogleGenerativeAI.
	_GEMINI_MODEL_IDS = {
	    "gemini-pro": "gemini-1.5-pro",
	}

	# Public lookup table; insertion order (Groq entries first, Gemini last) is preserved.
	models = {**_GROQ_MODEL_IDS, **_GEMINI_MODEL_IDS}

	def log_interaction(user_query, model_name, response_content, generated_code, execution_time, error_message=None, is_image=False):
	    """Log one user interaction to the Hugging Face logs dataset (best effort).

	    The entry is written to a single-row parquet file under /tmp, uploaded to
	    the ``SustainabilityLabIITGN/VayuChat_logs`` dataset repo under ``data/``,
	    and the local file is removed afterwards. Any failure is printed and
	    swallowed so that logging can never break the caller.

	    Args:
	        user_query: The question the user asked.
	        model_name: Key of the model that handled the question.
	        response_content: The answer or error text shown to the user; stored via ``str()``.
	        generated_code: Code that was generated/executed; falsy values are stored as "".
	        execution_time: Elapsed time in seconds.
	        error_message: Error description, or None on success. ``success`` in the
	            log entry is True exactly when this is None.
	        is_image: Whether the response was an image file.

	    Returns:
	        None.
	    """
	    try:
	        if not hf_token or not hf_token.strip():
	            print("Warning: HF_TOKEN not available, skipping logging")
	            return

	        now = datetime.now()

	        # NOTE: "session_id" is a fresh UUID on every call, so it identifies the
	        # log entry rather than a user session.
	        log_entry = {
	            "timestamp": now.isoformat(),
	            "session_id": str(uuid.uuid4()),
	            "user_query": user_query,
	            "model_name": model_name,
	            "response_content": str(response_content),
	            "generated_code": generated_code or "",
	            "execution_time_seconds": execution_time,
	            "error_message": error_message or "",
	            "is_image_output": is_image,
	            "success": error_message is None,
	        }
	        df = pd.DataFrame([log_entry])

	        # Timestamp plus a random suffix keeps filenames unique across concurrent calls.
	        timestamp_str = now.strftime("%Y%m%d_%H%M%S")
	        random_id = str(uuid.uuid4())[:8]
	        filename = f"interaction_log_{timestamp_str}_{random_id}.parquet"
	        local_path = f"/tmp/{filename}"

	        try:
	            df.to_parquet(local_path, index=False)

	            api = HfApi(token=hf_token)
	            api.upload_file(
	                path_or_fileobj=local_path,
	                path_in_repo=f"data/{filename}",
	                repo_id="SustainabilityLabIITGN/VayuChat_logs",
	                repo_type="dataset",
	            )
	        finally:
	            # Remove the temp file even when writing or uploading fails, so
	            # repeated failures do not pile up files in /tmp.
	            if os.path.exists(local_path):
	                os.remove(local_path)

	        print(f"Successfully logged interaction to HuggingFace: {filename}")

	    except Exception as e:
	        # Deliberate best-effort boundary: report and continue.
	        print(f"Error logging interaction: {e}")

	def preprocess_and_load_df(path: str) -> pd.DataFrame:
	    """Load the CSV at *path* and parse its ``Timestamp`` column as datetimes.

	    Args:
	        path: Location of the CSV file; it must contain a ``Timestamp`` column.

	    Returns:
	        The loaded DataFrame with ``Timestamp`` converted via ``pd.to_datetime``.

	    Raises:
	        Exception: If reading or parsing fails. The message is prefixed with
	            "Error loading dataframe:" and the underlying error is attached
	            as ``__cause__``.
	    """
	    try:
	        df = pd.read_csv(path)
	        df["Timestamp"] = pd.to_datetime(df["Timestamp"])
	    except Exception as e:
	        # Keep the generic exception type callers already catch, but chain the
	        # original error explicitly so the root cause stays in the traceback.
	        raise Exception(f"Error loading dataframe: {e}") from e
	    return df



	def get_from_user(prompt):
	    """Wrap *prompt* in a chat-message dict carrying the ``user`` role."""
	    message = dict(role="user", content=prompt)
	    return message




	# Instructions sent to the model ahead of the code template. This is a plain
	# (non-f) string, so the braces in the filename example are written singly:
	# the model must see a working f-string, not an escaped one that would
	# evaluate to a constant filename.
	_CODE_GEN_SYSTEM_PROMPT = """Generate Python code to answer the user's question about air quality data.

	IMPORTANT: Only generate Python code - no explanations, no thinking, just clean code.

	WHEN TO CREATE PLOTS vs TEXT ANSWERS:
	- Questions asking "Which", "What", specific values → TEXT ANSWERS (store text in 'answer')
	- Questions asking "Plot", "Show", "Visualize", "Chart" → PLOTS (store filename in 'answer')
	- Questions asking for comparisons of many items → PLOTS
	- Simple direct questions → TEXT ANSWERS

	SAFETY & ROBUSTNESS RULES:
	- Always check if data exists before processing: if df.empty: answer = "No data available"
	- Handle missing values: use .dropna() or .fillna() appropriately
	- Use try-except blocks for risky operations like indexing
	- Validate city/location names exist in data before filtering
	- Check for empty results after filtering: if filtered_df.empty: answer = "No data found for specified criteria"
	- Use .round(2) for numerical results to avoid long decimals
	- Handle division by zero: check denominators before division
	- Validate date ranges exist in data
	- Use proper string formatting for answers with units (μg/m³)

	CRITICAL: NUMPY INDEXING FIXES:
	- ALWAYS convert pandas/numpy values to int before using as list indices
	- Example: calendar.month_name[int(month_value)] NOT calendar.month_name[month_value]
	- Use int() conversion for ANY value used as index: int(row['month']), int(max_idx), etc.
	- When accessing pandas iloc results, wrap in int(): int(df.loc[idx, 'column'])

	TECHNICAL REQUIREMENTS:
	- Save final result in variable called 'answer'
	- For TEXT: Store the direct answer as a string in 'answer'
	- For PLOTS: Save with unique filename f"plot_{uuid.uuid4().hex[:8]}.png" and store filename in 'answer'
	- Always use .iloc or .loc properly for pandas indexing
	- Close matplotlib figures with plt.close() to prevent memory leaks
	- Use proper column name checks before accessing columns
	"""


	def _failure_reply(question, model_name, start_time, *, content, error, logged_content=None, gen_code=""):
	    """Log a failed interaction and build the assistant-style error payload.

	    ``content`` is what the user sees; ``logged_content`` (defaults to
	    ``content``) is what goes into the interaction log.
	    """
	    execution_time = (datetime.now() - start_time).total_seconds()
	    log_interaction(
	        user_query=question,
	        model_name=model_name,
	        response_content=content if logged_content is None else logged_content,
	        generated_code=gen_code,
	        execution_time=execution_time,
	        error_message=error,
	        is_image=False,
	    )
	    return {
	        "role": "assistant",
	        "content": content,
	        "gen_code": gen_code,
	        "ex_code": gen_code,
	        "last_prompt": question,
	        "error": error,
	    }


	def ask_question(model_name, question):
	    """Answer *question* about Data.csv by generating and executing Python code.

	    The function reloads API tokens from the environment, builds the LLM
	    client for *model_name*, sends the system prompt plus a code template, and
	    executes the returned code together with the template preamble. Every
	    outcome, success or failure, is recorded through ``log_interaction``.

	    Args:
	        model_name: A key of ``models``. "gemini-pro" is served through
	            ChatGoogleGenerativeAI; every other name goes through ChatGroq.
	        question: The user's natural-language question.

	    Returns:
	        A dict with keys ``role`` ("assistant"), ``content`` (the value of the
	        generated code's ``answer`` variable, or an error message),
	        ``gen_code`` and ``ex_code`` (the executed source, "" when nothing was
	        executed), ``last_prompt`` (the original question) and ``error``
	        (None on success, otherwise a description).
	    """
	    start_time = datetime.now()
	    try:
	        # Reload environment variables to get fresh values
	        load_dotenv(override=True)
	        fresh_groq_token = os.getenv("GROQ_API_KEY")
	        fresh_gemini_token = os.getenv("GEMINI_TOKEN")

	        print(f"ask_question - Fresh Groq Token: {'Present' if fresh_groq_token else 'Missing'}")

	        # Build the client for the requested provider, bailing out early when
	        # the matching token is absent or blank.
	        if model_name == "gemini-pro":
	            if not fresh_gemini_token or fresh_gemini_token.strip() == "":
	                return _failure_reply(
	                    question,
	                    model_name,
	                    start_time,
	                    content="Gemini API token not available or empty. Please set GEMINI_TOKEN in your environment variables.",
	                    logged_content="Gemini API token not available or empty",
	                    error="Missing or empty API token",
	                )
	            llm = ChatGoogleGenerativeAI(
	                model=models[model_name],
	                google_api_key=fresh_gemini_token,
	                temperature=0,
	            )
	        else:
	            if not fresh_groq_token or fresh_groq_token.strip() == "":
	                return _failure_reply(
	                    question,
	                    model_name,
	                    start_time,
	                    content="Groq API token not available or empty. Please set GROQ_API_KEY in your environment variables and restart the application.",
	                    logged_content="Groq API token not available or empty",
	                    error="Missing or empty API token",
	                )

	            try:
	                llm = ChatGroq(
	                    model=models[model_name],
	                    api_key=fresh_groq_token,
	                    temperature=0.1,
	                )
	                # Probe request to verify the key before the real query.
	                # NOTE: this costs one extra API round trip per question.
	                llm.invoke("Test")
	                print("API key test successful")
	            except Exception as api_error:
	                error_msg = str(api_error)
	                lowered = error_msg.lower()

	                if "organization_restricted" in lowered or "unauthorized" in lowered:
	                    response_content = "API Key Error: Your Groq API key appears to be invalid, expired, or restricted. Please check your API key in the .env file."
	                    log_error_msg = f"API key validation failed: {error_msg}"
	                else:
	                    response_content = f"API Connection Error: {error_msg}"
	                    log_error_msg = error_msg

	                return _failure_reply(
	                    question,
	                    model_name,
	                    start_time,
	                    content=response_content,
	                    error=log_error_msg,
	                )

	        # Check if data file exists
	        if not os.path.exists("Data.csv"):
	            return _failure_reply(
	                question,
	                model_name,
	                start_time,
	                content="Data.csv file not found. Please ensure the data file is in the correct location.",
	                logged_content="Data.csv file not found",
	                error="Data file not found",
	            )

	        # Only the column dtypes are needed for the prompt; keep a 5-row sample.
	        df_check = pd.read_csv("Data.csv")
	        df_check["Timestamp"] = pd.to_datetime(df_check["Timestamp"])
	        df_check = df_check.head(5)

	        new_line = "\n"
	        dtype_comments = new_line.join("# " + line for line in str(df_check.dtypes).split(new_line))

	        # The question is embedded in a '#' comment of code that is executed
	        # below, so it must stay on one line; otherwise any extra lines of a
	        # multi-line question would run as code.
	        question_line = " ".join(question.strip().splitlines())

	        # Preamble that is both shown to the model and executed ahead of its
	        # code. Kept separate from the fenced prompt so it never has to be
	        # re-extracted by splitting on the fence markers (which would truncate
	        # it if the question itself contained backticks).
	        template_code = f"""
	import pandas as pd
	import matplotlib.pyplot as plt
	import uuid
	import calendar
	import numpy as np

	# Set professional matplotlib styling
	plt.rcParams.update({{
	'font.size': 12,
	'figure.dpi': 400,
	'figure.facecolor': 'white',
	'axes.facecolor': 'white',
	'axes.edgecolor': '#e2e8f0',
	'axes.linewidth': 1.2,
	'axes.labelcolor': '#374151',
	'axes.spines.top': False,
	'axes.spines.right': False,
	'axes.spines.left': True,
	'axes.spines.bottom': True,
	'axes.grid': True,
	'grid.color': '#f1f5f9',
	'grid.linewidth': 0.8,
	'grid.alpha': 0.7,
	'xtick.color': '#6b7280',
	'ytick.color': '#6b7280',
	'text.color': '#374151',
	'figure.figsize': [12, 6],
	'axes.prop_cycle': plt.cycler('color', ['#3b82f6', '#ef4444', '#10b981', '#f59e0b', '#8b5cf6', '#06b6d4'])
	}})

	df = pd.read_csv("Data.csv")
	df["Timestamp"] = pd.to_datetime(df["Timestamp"])

	# Available columns and data types:
	{dtype_comments}

	# Question: {question_line}
	# Generate code to answer the question and save result in 'answer' variable
	# If creating a plot, save it with a unique filename and store the filename in 'answer'
	# If returning text/numbers, store the result directly in 'answer'
	"""
	        template = f"```python{template_code}```"

	        query = f"""{_CODE_GEN_SYSTEM_PROMPT}

	Complete the following code to answer the user's question:

	{template}
	"""

	        # Make API call (both providers expose the same invoke interface)
	        response = llm.invoke(query)
	        answer = response.content

	        # Extract and execute code
	        full_code = ""
	        try:
	            if "```python" in answer:
	                code_part = answer.split("```python")[1].split("```")[0]
	            else:
	                code_part = answer

	            full_code = f"""
	{template_code}
	{code_part}
	"""

	            local_vars = {}
	            global_vars = {
	                'pd': pd,
	                'plt': plt,
	                'os': os,
	                'uuid': __import__('uuid'),
	                'calendar': __import__('calendar'),
	                'np': __import__('numpy'),
	            }

	            # SECURITY NOTE(review): this executes model-generated code in-process
	            # with full builtins and the os module available; it is not a sandbox.
	            # Run this service only in an isolated environment.
	            exec(full_code, global_vars, local_vars)

	            # Get the answer
	            if 'answer' in local_vars:
	                answer_result = local_vars['answer']
	            else:
	                answer_result = "No answer variable found in generated code"

	            execution_time = (datetime.now() - start_time).total_seconds()

	            # Determine if output is an image
	            is_image = isinstance(answer_result, str) and answer_result.endswith(('.png', '.jpg', '.jpeg'))

	            # Log successful interaction
	            log_interaction(
	                user_query=question,
	                model_name=model_name,
	                response_content=str(answer_result),
	                generated_code=full_code,
	                execution_time=execution_time,
	                error_message=None,
	                is_image=is_image,
	            )

	            return {
	                "role": "assistant",
	                "content": answer_result,
	                "gen_code": full_code,
	                "ex_code": full_code,
	                "last_prompt": question,
	                "error": None,
	            }

	        except Exception as code_error:
	            error_msg = str(code_error)
	            return _failure_reply(
	                question,
	                model_name,
	                start_time,
	                content=f"Error executing generated code: {error_msg}",
	                error=error_msg,
	                gen_code=full_code,
	            )

	    except Exception as e:
	        # Top-level boundary: turn any remaining failure (API call, CSV read,
	        # prompt building) into an error payload instead of raising.
	        error_msg = str(e)

	        # Handle specific API errors
	        if "organization_restricted" in error_msg:
	            response_content = "API Organization Restricted: Your API key access has been restricted. Please check your Groq API key or try generating a new one."
	            log_error_msg = "API access restricted"
	        elif "rate_limit" in error_msg.lower():
	            response_content = "Rate limit exceeded. Please wait a moment and try again."
	            log_error_msg = "Rate limit exceeded"
	        else:
	            response_content = f"Error: {error_msg}"
	            log_error_msg = error_msg

	        return _failure_reply(
	            question,
	            model_name,
	            start_time,
	            content=response_content,
	            error=log_error_msg,
	        )