Spaces:
Sleeping
Sleeping
File size: 8,253 Bytes
de716c3 8a33362 2e654c9 de716c3 8a33362 de716c3 8a33362 de716c3 8a33362 2e654c9 de716c3 9bdf024 ad7ff43 1bf821b de716c3 2e654c9 de716c3 8111a53 a8d643d 2e654c9 8a33362 2e654c9 f9bc1b8 fcf406d 0cb6df9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
import os
import streamlit as st
from st_aggrid import AgGrid
import pandas as pd
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
# --- Page chrome -----------------------------------------------------------
st.set_page_config(layout="wide")

# Hide Streamlit's default header and tighten the block spacing; the unused
# `.font` class is kept for parity with the original stylesheet.
_PAGE_CSS = '''
<style>
body {background-color: #F5F5F5; color: #000000;}
header {visibility: hidden;}
div.block-container {padding-top:4rem;}
section[data-testid="stSidebar"] div:first-child {
padding-top: 0;
}
.font {
text-align:center;
font-family:sans-serif;font-size: 1.25rem;}
</style>
'''
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

# Right-aligned title, attribution line, and TAPAS size-limit note,
# rendered in the same order as before.
for _banner in (
    '<p style="font-family:sans-serif;font-size: 1.5rem;text-align: right;"> HertogAI Table Q&A using TAPAS and Model Language</p>',
    '<p style="font-family:sans-serif;font-size: 0.7rem;text-align: right;"> This code is based on Jordan Skinner. I enhanced his work for Data analysis COUNT, AVG, TOTAL, MEAN & StdDev </p>',
    "<p style='font-family:sans-serif;font-size: 0.6rem;text-align: right;'>Pre-trained TAPAS model runs on max 64 rows and 32 columns data. Make sure the file data doesn't exceed these dimensions.</p>",
):
    st.markdown(_banner, unsafe_allow_html=True)
# --- Models and input widget ----------------------------------------------
_TAPAS_MODEL = "google/tapas-large-finetuned-wtq"
_T5_MODEL = "t5-small"

# Table question-answering pipeline (TAPAS fine-tuned on WikiTableQuestions),
# pinned to CPU.
tqa = pipeline(
    task="table-question-answering",
    model=_TAPAS_MODEL,
    device="cpu",
)

# Small T5 model + tokenizer, used later to rephrase the raw TAPAS answer
# into a fluent sentence.
t5_tokenizer = T5Tokenizer.from_pretrained(_T5_MODEL)
t5_model = T5ForConditionalGeneration.from_pretrained(_T5_MODEL)

# Sidebar upload widget; yields None until the user supplies a file.
file_name = st.sidebar.file_uploader("Upload file:", type=['csv', 'xlsx'])
# --- File ingestion --------------------------------------------------------
# Load the uploaded table, coerce numeric-looking text columns, and show it.
if file_name is None:
    st.markdown('<p class="custom-font">Please click left side bar to upload an excel or csv file </p>', unsafe_allow_html=True)
else:
    try:
        # Pick the reader from the file extension.
        if file_name.name.endswith('.csv'):
            # NOTE(review): separator ';' and ISO-8859-1 encoding are
            # hard-coded — confirm they match the expected input files.
            df = pd.read_csv(file_name, sep=';', encoding='ISO-8859-1')
        elif file_name.name.endswith('.xlsx'):
            df = pd.read_excel(file_name, engine='openpyxl')
        else:
            st.error("Unsupported file type")
            df = None

        # Continue only if a table was actually loaded.
        if df is not None:
            # Convert object (string) columns that hold numeric text into
            # real numbers; columns that fail conversion are left untouched.
            # pd.to_numeric(errors='ignore') is deprecated in pandas >= 2.2,
            # so the same semantics are reproduced with try/except.
            object_columns = df.select_dtypes(include=['object']).columns
            for col in object_columns:
                try:
                    df[col] = pd.to_numeric(df[col])
                except (ValueError, TypeError):
                    pass

            st.write("Original Data:")
            st.write(df)

            # Keep a numeric copy for pandas aggregations; TAPAS itself
            # requires every cell to be a string.
            df_numeric = df.copy()
            df = df.astype(str)

            # Show the first 5 rows in an editable grid.
            grid_response = AgGrid(
                df.head(5),
                columns_auto_size_mode='FIT_CONTENTS',
                editable=True,
                height=300,
                width='100%',
            )
    except Exception as e:
        st.error(f"Error reading file: {str(e)}")
# --- Question answering ----------------------------------------------------
# Ask TAPAS, rephrase its raw answer with T5, then recompute the aggregate
# with pandas (TAPAS only knows NONE/SUM/AVERAGE/COUNT, so MEDIAN/MIN/MAX/
# TOTAL/STD requests are detected from the question text and computed here).
question = st.text_input('Type your question')

if st.button('Answer'):
    if file_name is None:
        # No upload yet means `df` was never created; answering now would
        # raise a NameError, so bail out with a clear message instead.
        st.warning("Please upload a csv or xlsx file before asking a question.")
    else:
        with st.spinner('Analyzing your question...'):
            try:
                # Query TAPAS against the all-string table built at load time.
                raw_answer = tqa(table=df, query=question, truncation=True)
                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result From TAPAS: </p>",
                            unsafe_allow_html=True)
                st.success(raw_answer)

                # Unpack the TAPAS result.
                answer = raw_answer['answer']
                aggregator = raw_answer.get('aggregator', '')
                coordinates = raw_answer.get('coordinates', [])
                cells = raw_answer.get('cells', [])

                # Templated sentence describing the raw result.
                base_sentence = f"The {question.lower()} of the selected data is {answer}."
                if coordinates and cells:
                    rows_info = [f"Row {coordinate[0] + 1}, Column '{df.columns[coordinate[1]]}' with value {cell}"
                                 for coordinate, cell in zip(coordinates, cells)]
                    rows_description = " and ".join(rows_info)
                    base_sentence += f" This includes the following data: {rows_description}."

                # Let T5 rephrase the templated sentence into fluent text.
                input_text = f"Given the question: '{question}', generate a more human-readable response: {base_sentence}"
                inputs = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
                summary_ids = t5_model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)
                generated_text = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Final Generated Response with LLM: </p>", unsafe_allow_html=True)
                st.success(generated_text)

                # ---- Pandas-verified aggregate ----------------------------
                # Map the question's wording onto the aggregate the user
                # actually asked for (TAPAS often mislabels these as AVERAGE
                # or SUM, and never emits MEDIAN/MIN/MAX/STD itself).
                if 'MEDIAN' in question.upper() and 'AVERAGE' in aggregator.upper():
                    aggregator = 'MEDIAN'
                elif 'MIN' in question.upper() and 'AVERAGE' in aggregator.upper():
                    aggregator = 'MIN'
                elif 'MAX' in question.upper() and 'AVERAGE' in aggregator.upper():
                    aggregator = 'MAX'
                elif 'TOTAL' in question.upper() and 'SUM' in aggregator.upper():
                    aggregator = 'SUM'
                elif 'STD' in question.upper():
                    # Was unreachable before: nothing ever set STD_DEV even
                    # though the UI advertises StdDev support.
                    aggregator = 'STD_DEV'
                summary_type = aggregator.lower()

                # Column TAPAS selected; previously this was never assigned,
                # so every branch below raised NameError.
                column_name = df.columns[coordinates[0][1]] if coordinates else None

                # Recompute the value with pandas on the numeric copy.
                if column_name is not None and column_name in df_numeric.columns:
                    series = df_numeric[column_name]
                    agg_funcs = {
                        'sum': series.sum,
                        'max': series.max,
                        'min': series.min,
                        'average': series.mean,
                        'count': series.count,
                        'median': series.median,
                        'std_dev': series.std,
                    }
                    if summary_type in agg_funcs:
                        numeric_value = agg_funcs[summary_type]()
                    else:
                        numeric_value = answer  # no aggregate requested
                else:
                    numeric_value = answer  # fall back to TAPAS's own answer

                # Natural-language summary. Keys now match summary_type
                # ('max'/'min'); the old chain tested 'maximum'/'minimum'
                # and could never fire.
                templates = {
                    'sum': f"The total {column_name} is {numeric_value}.",
                    'max': f"The highest {column_name} is {numeric_value}.",
                    'min': f"The lowest {column_name} is {numeric_value}.",
                    'average': f"The average {column_name} is {numeric_value}.",
                    'count': f"The number of entries in {column_name} is {numeric_value}.",
                    'median': f"The median {column_name} is {numeric_value}.",
                    'std_dev': f"The standard deviation of {column_name} is {numeric_value}.",
                }
                natural_language_answer = templates.get(
                    summary_type, f"The value for {column_name} is {numeric_value}.")

                # Display the result to the user.
                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Analysis Results: </p>", unsafe_allow_html=True)
                st.success(f"""
• Answer: {natural_language_answer}
Data Location:
• Column: {column_name}
Additional Context:
• Query Asked: "{question}"
""")
            except Exception as e:
                # Keep the friendly hint, but surface the underlying error
                # instead of silently swallowing it.
                st.warning("Please retype your question and make sure to use the column name and cell value correctly.")
                st.caption(f"Details: {e}")
|