File size: 12,496 Bytes
de716c3
 
8a33362
2e654c9
de716c3
 
8a33362
 
de716c3
8a33362
de716c3
 
8a33362
 
 
 
2e654c9
 
 
 
 
de716c3
 
 
 
9bdf024
ad7ff43
 
1bf821b
de716c3
 
 
 
 
 
 
 
 
 
2e654c9
 
de716c3
8111a53
 
 
a8d643d
2e654c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a33362
2e654c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9bc1b8
fcf406d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e654c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import os
import streamlit as st
from st_aggrid import AgGrid
import pandas as pd
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer

# Set the page layout for Streamlit (wide mode uses the full browser width).
st.set_page_config(layout="wide")

# CSS styling injected as raw HTML: hides the default Streamlit header,
# tightens top padding, and defines a `.font` class for centered captions.
style = '''
    <style>
        body {background-color: #F5F5F5; color: #000000;}
        header {visibility: hidden;}
        div.block-container {padding-top:4rem;}
        section[data-testid="stSidebar"] div:first-child {
        padding-top: 0;
    }
     .font {                                          
    text-align:center;
    font-family:sans-serif;font-size: 1.25rem;}
    </style>
'''
st.markdown(style, unsafe_allow_html=True)

# Right-aligned HTML headings: title, attribution, and a note about the
# pre-trained TAPAS model's table-size limit (64 rows x 32 columns).
st.markdown('<p style="font-family:sans-serif;font-size: 1.5rem;text-align: right;"> HertogAI Table Q&A using TAPAS and Model Language</p>', unsafe_allow_html=True)
st.markdown('<p style="font-family:sans-serif;font-size: 0.7rem;text-align: right;"> This code is based on Jordan Skinner. I enhanced his work for Data analysis COUNT, AVG, TOTAL, MEAN & StdDev </p>', unsafe_allow_html=True)
st.markdown("<p style='font-family:sans-serif;font-size: 0.6rem;text-align: right;'>Pre-trained TAPAS model runs on max 64 rows and 32 columns data. Make sure the file data doesn't exceed these dimensions.</p>", unsafe_allow_html=True)


# Initialize TAPAS pipeline
tqa = pipeline(task="table-question-answering", 
              model="google/tapas-large-finetuned-wtq",
              device="cpu")

# Initialize T5 tokenizer and model for text generation
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# File uploader in the sidebar
file_name = st.sidebar.file_uploader("Upload file:", type=['csv', 'xlsx'])

# File processing and question answering.
# Reads the uploaded csv/xlsx into a DataFrame, shows it in an editable grid,
# then answers free-text questions with TAPAS and rephrases the result via T5.
if file_name is None:

    st.markdown('<p class="custom-font">Please click left side bar to upload an excel or csv file </p>', unsafe_allow_html=True)
else:
    # Pre-declare so the question/answer section below can safely test them
    # even when reading the file fails (previously this raised NameError,
    # silently masked by the broad except in the answer handler).
    df = None
    df_numeric = None
    try:
        # Check file type and handle reading accordingly
        if file_name.name.endswith('.csv'):
            df = pd.read_csv(file_name, sep=';', encoding='ISO-8859-1')  # Adjust encoding if needed
        elif file_name.name.endswith('.xlsx'):
            df = pd.read_excel(file_name, engine='openpyxl')  # Use openpyxl to read .xlsx files
        else:
            st.error("Unsupported file type")

        # Continue with further processing if df is loaded
        if df is not None:
            # Convert object-dtype columns that are fully numeric; columns with
            # any non-numeric value are left untouched. This replaces
            # pd.to_numeric(..., errors='ignore'), deprecated in pandas 2.x,
            # with the behavior-equivalent try/except form.
            for col in df.select_dtypes(include=['object']).columns:
                try:
                    df[col] = pd.to_numeric(df[col])
                except (ValueError, TypeError):
                    pass

            st.write("Original Data:")
            st.write(df)

            # Keep a numeric copy for pandas aggregations; TAPAS itself
            # requires a table of strings.
            df_numeric = df.copy()
            df = df.astype(str)

            # Display the first 5 rows of the dataframe in an editable grid
            grid_response = AgGrid(
                df.head(5),
                columns_auto_size_mode='FIT_CONTENTS',
                editable=True,
                height=300,
                width='100%',
            )

    except Exception as e:
        st.error(f"Error reading file: {str(e)}")

    # User input for the question
    question = st.text_input('Type your question')

    # Process the answer using TAPAS and T5
    with st.spinner():
        # Only run inference when a table was actually loaded.
        if st.button('Answer') and df is not None:
            try:
                # Get the raw answer from TAPAS
                raw_answer = tqa(table=df, query=question, truncation=True)

                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result From TAPAS: </p>",
                           unsafe_allow_html=True)
                st.success(raw_answer)

                # Extract relevant information from the TAPAS result
                answer = raw_answer['answer']
                aggregator = raw_answer.get('aggregator', '')
                coordinates = raw_answer.get('coordinates', [])
                cells = raw_answer.get('cells', [])

                # Construct a base sentence describing the answer and, when
                # TAPAS reports cell coordinates, the exact cells it used.
                base_sentence = f"The {question.lower()} of the selected data is {answer}."
                if coordinates and cells:
                    rows_info = [f"Row {coordinate[0] + 1}, Column '{df.columns[coordinate[1]]}' with value {cell}"
                                 for coordinate, cell in zip(coordinates, cells)]
                    rows_description = " and ".join(rows_info)
                    base_sentence += f" This includes the following data: {rows_description}."

                # Generate a fluent response using the T5 model, rephrasing the base sentence
                input_text = f"Given the question: '{question}', generate a more human-readable response: {base_sentence}"

                # Tokenize the input and generate a fluent response using T5
                inputs = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
                summary_ids = t5_model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)

                # Decode the generated text
                generated_text = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

                # Display the final generated response
                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Final Generated Response with LLM: </p>", unsafe_allow_html=True)
                st.success(generated_text)

            except Exception:
                st.warning("Please retype your question and make sure to use the column name and cell value correctly.")
  
           
# NOTE(review): this section originally ran at module level but referenced
# names (`question`, `aggregator`, `df_numeric`, `column_name`,
# `processed_answer`) that exist only inside the answer handler above, so it
# always failed with NameError. It is repackaged here as reusable helpers
# with explicit parameters, preserving the original logic.

def correct_aggregator(question, aggregator):
    """Manually fix the aggregator when TAPAS returns an incorrect one.

    TAPAS only predicts NONE/SUM/AVERAGE/COUNT, so keyword hints in the
    question are used to recover MEDIAN/MIN/MAX and to normalize TOTAL.
    Returns the corrected aggregator string, or the input unchanged.
    """
    q = question.upper()
    agg = aggregator.upper()
    if 'MEDIAN' in q and 'AVERAGE' in agg:
        return 'MEDIAN'
    if 'MIN' in q and 'AVERAGE' in agg:
        return 'MIN'
    if 'MAX' in q and 'AVERAGE' in agg:
        return 'MAX'
    if 'TOTAL' in q and 'SUM' in agg:
        return 'SUM'
    return aggregator


def compute_summary_value(df_numeric, column_name, summary_type, fallback=None):
    """Calculate the correct value with pandas for the corrected aggregator.

    `summary_type` is the lower-cased aggregator ('sum', 'max', 'min',
    'average', 'count', 'median', 'std_dev'). Returns `fallback` when the
    aggregator is unrecognized (mirrors the original else branch).
    """
    series = df_numeric[column_name]
    operations = {
        'sum': series.sum,
        'max': series.max,
        'min': series.min,
        'average': series.mean,
        'count': series.count,
        'median': series.median,
        'std_dev': series.std,
    }
    operation = operations.get(summary_type)
    return operation() if operation is not None else fallback


def format_summary_sentence(summary_type, column_name, numeric_value):
    """Construct a natural language response for the computed value.

    Bug fix vs. the original: it compared against 'maximum'/'minimum' while
    `summary_type` holds 'max'/'min', so those branches could never match.
    """
    templates = {
        'sum': f"The total {column_name} is {numeric_value}.",
        'max': f"The highest {column_name} is {numeric_value}.",
        'min': f"The lowest {column_name} is {numeric_value}.",
        'average': f"The average {column_name} is {numeric_value}.",
        'count': f"The number of entries in {column_name} is {numeric_value}.",
        'median': f"The median {column_name} is {numeric_value}.",
        'std_dev': f"The standard deviation of {column_name} is {numeric_value}.",
    }
    return templates.get(summary_type, f"The value for {column_name} is {numeric_value}.")


def display_summary(question, column_name, natural_language_answer):
    """Display the analysis result to the user (same markup as the original)."""
    st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Analysis Results: </p>", unsafe_allow_html=True)
    st.success(f"""
    • Answer: {natural_language_answer}

    Data Location:
    • Column: {column_name}

    Additional Context:
    • Query Asked: "{question}"
""")




                

                # Display raw result for debugging purposes
                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result: </p>", unsafe_allow_html=True)
                st.success(raw_answer)

                # Processing the raw_answer
                processed_answer = raw_answer['answer'].replace(';', ' ')  # Clean the answer text
                row_idx = raw_answer['coordinates'][0][0]  # Row index from TAPAS
                col_idx = raw_answer['coordinates'][0][1]  # Column index from TAPAS
                column_name = df.columns[col_idx]  # Column name from the DataFrame
                row_data = df.iloc[row_idx].to_dict()  # Row data corresponding to the row index

                # Handle different types of answers (e.g., 'SUM', 'MAX', 'MIN', 'AVG', etc.)
                if 'SUM' in processed_answer:
                    summary_type = 'sum'
                    numeric_value = df_numeric[column_name].sum()
                elif 'MAX' in processed_answer:
                    summary_type = 'maximum'
                    numeric_value = df_numeric[column_name].max()
                elif 'MIN' in processed_answer:
                    summary_type = 'minimum'
                    numeric_value = df_numeric[column_name].min()
                elif 'AVG' in processed_answer or 'AVERAGE' in processed_answer:
                    summary_type = 'average'
                    numeric_value = df_numeric[column_name].mean()
                elif 'COUNT' in processed_answer:
                    summary_type = 'count'
                    numeric_value = df_numeric[column_name].count()
                elif 'MEDIAN' in processed_answer:
                    summary_type = 'median'
                    numeric_value = df_numeric[column_name].median()
                elif 'STD' in processed_answer or 'STANDARD DEVIATION' in processed_answer:
                    summary_type = 'std_dev'
                    numeric_value = df_numeric[column_name].std()
                else:
                    summary_type = 'value'
                    numeric_value = processed_answer  # In case of a general answer

                # Build a natural language response based on the aggregation type
                if summary_type == 'sum':
                    natural_language_answer = f"The total {column_name} is {numeric_value}."
                elif summary_type == 'maximum':
                    natural_language_answer = f"The highest {column_name} is {numeric_value}, recorded for '{row_data.get('Name', 'Unknown')}'."
                elif summary_type == 'minimum':
                    natural_language_answer = f"The lowest {column_name} is {numeric_value}, recorded for '{row_data.get('Name', 'Unknown')}'."
                elif summary_type == 'average':
                    natural_language_answer = f"The average {column_name} is {numeric_value}."
                elif summary_type == 'count':
                    natural_language_answer = f"The number of entries in {column_name} is {numeric_value}."
                elif summary_type == 'median':
                    natural_language_answer = f"The median {column_name} is {numeric_value}."
                elif summary_type == 'std_dev':
                    natural_language_answer = f"The standard deviation of {column_name} is {numeric_value}."
                else:
                    natural_language_answer = f"The {column_name} value is {numeric_value} for '{row_data.get('Name', 'Unknown')}'."

                # Display the final natural language answer
                st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Analysis Results: </p>", unsafe_allow_html=True)
                st.success(f"""
                    • Answer: {natural_language_answer}

                    Data Location:
                    • Row: {row_idx + 1}
                    • Column: {column_name}

                    Additional Context:
                    • Full Row Data: {row_data}
                    • Query Asked: "{question}"
                """)

            except Exception as e:
                st.warning("Please retype your question and make sure to use the column name and cell value correctly.")