Spaces:
Sleeping
Sleeping
Update my_model/tabs/dataset_analysis.py
Browse files
my_model/tabs/dataset_analysis.py
CHANGED
|
@@ -246,33 +246,37 @@ class OKVQADatasetAnalyzer:
|
|
| 246 |
|
| 247 |
|
| 248 |
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
datasets_comparison_table = pd.read_excel(config.DATASET_ANALYSES_PATH, sheet_name="VQA Datasets Comparison")
|
| 252 |
okvqa_dataset_characteristics = pd.read_excel(config.DATASET_ANALYSES_PATH, sheet_name="OK-VQA Dataset Characteristics")
|
| 253 |
-
|
| 254 |
-
val_data = process_okvqa_dataset(config.DATASET_VAL_QUESTIONS_PATH, config.DATASET_VAL_ANNOTATIONS_PATH,
|
| 255 |
-
save_to_csv=False)
|
| 256 |
-
train_data = process_okvqa_dataset(config.DATASET_TRAIN_QUESTIONS_PATH, config.DATASET_TRAIN_ANNOTATIONS_PATH ,
|
| 257 |
-
save_to_csv=False)
|
| 258 |
|
|
|
|
|
|
|
|
|
|
| 259 |
|
|
|
|
|
|
|
| 260 |
|
| 261 |
-
|
| 262 |
-
config.DATASET_VAL_QUESTIONS_PATH, 'train_test')
|
| 263 |
-
|
| 264 |
with st.container():
|
| 265 |
st.markdown("## Overview of KB-VQA Datasets")
|
| 266 |
col1, col2 = st.columns([2, 1])
|
| 267 |
with col1:
|
| 268 |
st.write(" ")
|
| 269 |
with st.expander("1 - Knowledge-Based VQA (KB-VQA)"):
|
| 270 |
-
st.markdown(""" [Knowledge-Based VQA (KB-VQA)](https://arxiv.org/abs/1511.02570): One of the earliest
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
and the KB.\n""")
|
| 276 |
with st.expander("2 - Factual VQA (FVQA)"):
|
| 277 |
st.markdown(""" [Factual VQA (FVQA)](https://arxiv.org/abs/1606.05433): This dataset includes 2,190
|
| 278 |
images and 5,826 questions, accompanied by a knowledge base containing 193,449 facts.
|
|
@@ -296,6 +300,8 @@ def run_dataset_analyzer():
|
|
| 296 |
st.markdown("#### KB-VQA Datasets Comparison")
|
| 297 |
st.write(datasets_comparison_table, use_column_width=True)
|
| 298 |
st.write("-----------------------")
|
|
|
|
|
|
|
| 299 |
with st.container():
|
| 300 |
st.write("\n" * 10)
|
| 301 |
st.markdown("## OK-VQA Dataset")
|
|
@@ -307,16 +313,14 @@ def run_dataset_analyzer():
|
|
| 307 |
with st.expander("Questions Distribution over Knowledge Category"):
|
| 308 |
df = pd.read_excel(config.DATASET_ANALYSES_PATH, sheet_name="Question Category Dist")
|
| 309 |
st.markdown("#### Questions Distribution over Knowledge Category")
|
| 310 |
-
dataset_analyzer.plot_bar_chart(df, "Knowledge Category", "Percentage", "Questions Distribution over "
|
| 311 |
-
"Knowledge Category")
|
| 312 |
|
| 313 |
with st.expander("Distribution of Question Keywords"):
|
| 314 |
-
|
| 315 |
-
#with st.expander("Distribution of Question Keywords"):
|
| 316 |
dataset_analyzer.categorize_questions()
|
| 317 |
st.markdown("#### Distribution of Question Keywords")
|
| 318 |
dataset_analyzer.plot_question_distribution()
|
| 319 |
|
|
|
|
| 320 |
with st.container():
|
| 321 |
with st.expander("Show Dataset Samples"):
|
| 322 |
st.write(train_data[:10])
|
|
|
|
| 246 |
|
| 247 |
|
| 248 |
|
| 249 |
+
|
| 250 |
+
def run_dataset_analyzer() -> None:
|
| 251 |
+
"""
|
| 252 |
+
Executes the dataset analysis process and displays the results using Streamlit.
|
| 253 |
+
This function provides an overview of the dataset and uses the OKVQADatasetAnalyzer to visualize
|
| 254 |
+
the data.
|
| 255 |
+
"""
|
| 256 |
+
|
| 257 |
+
# Load datasets from Excel
|
| 258 |
datasets_comparison_table = pd.read_excel(config.DATASET_ANALYSES_PATH, sheet_name="VQA Datasets Comparison")
|
| 259 |
okvqa_dataset_characteristics = pd.read_excel(config.DATASET_ANALYSES_PATH, sheet_name="OK-VQA Dataset Characteristics")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
|
| 261 |
+
# Process OK-VQA datasets for validation and training
|
| 262 |
+
val_data = process_okvqa_dataset(config.DATASET_VAL_QUESTIONS_PATH, config.DATASET_VAL_ANNOTATIONS_PATH, save_to_csv=False)
|
| 263 |
+
train_data = process_okvqa_dataset(config.DATASET_TRAIN_QUESTIONS_PATH, config.DATASET_TRAIN_ANNOTATIONS_PATH, save_to_csv=False)
|
| 264 |
|
| 265 |
+
# Initialize the dataset analyzer
|
| 266 |
+
dataset_analyzer = OKVQADatasetAnalyzer(config.DATASET_TRAIN_QUESTIONS_PATH, config.DATASET_VAL_QUESTIONS_PATH, 'train_test')
|
| 267 |
|
| 268 |
+
# Display KB-VQA datasets overview
|
|
|
|
|
|
|
| 269 |
with st.container():
|
| 270 |
st.markdown("## Overview of KB-VQA Datasets")
|
| 271 |
col1, col2 = st.columns([2, 1])
|
| 272 |
with col1:
|
| 273 |
st.write(" ")
|
| 274 |
with st.expander("1 - Knowledge-Based VQA (KB-VQA)"):
|
| 275 |
+
st.markdown(""" [Knowledge-Based VQA (KB-VQA)](https://arxiv.org/abs/1511.02570): One of the earliest datasets in this domain, KB-VQA
|
| 276 |
+
comprises 700 images and 2,402 questions, with each question associated with both an image
|
| 277 |
+
and a knowledge base (KB). The KB encapsulates facts about the world, including object
|
| 278 |
+
names, properties, and relationships, aiming to foster models capable of answering
|
| 279 |
+
questions through reasoning over both the image and the KB.\n""")
|
|
|
|
| 280 |
with st.expander("2 - Factual VQA (FVQA)"):
|
| 281 |
st.markdown(""" [Factual VQA (FVQA)](https://arxiv.org/abs/1606.05433): This dataset includes 2,190
|
| 282 |
images and 5,826 questions, accompanied by a knowledge base containing 193,449 facts.
|
|
|
|
| 300 |
st.markdown("#### KB-VQA Datasets Comparison")
|
| 301 |
st.write(datasets_comparison_table, use_column_width=True)
|
| 302 |
st.write("-----------------------")
|
| 303 |
+
|
| 304 |
+
# Display OK-VQA dataset details
|
| 305 |
with st.container():
|
| 306 |
st.write("\n" * 10)
|
| 307 |
st.markdown("## OK-VQA Dataset")
|
|
|
|
| 313 |
with st.expander("Questions Distribution over Knowledge Category"):
|
| 314 |
df = pd.read_excel(config.DATASET_ANALYSES_PATH, sheet_name="Question Category Dist")
|
| 315 |
st.markdown("#### Questions Distribution over Knowledge Category")
|
| 316 |
+
dataset_analyzer.plot_bar_chart(df, "Knowledge Category", "Percentage", "Questions Distribution over Knowledge Category")
|
|
|
|
| 317 |
|
| 318 |
with st.expander("Distribution of Question Keywords"):
|
|
|
|
|
|
|
| 319 |
dataset_analyzer.categorize_questions()
|
| 320 |
st.markdown("#### Distribution of Question Keywords")
|
| 321 |
dataset_analyzer.plot_question_distribution()
|
| 322 |
|
| 323 |
+
# Display sample data
|
| 324 |
with st.container():
|
| 325 |
with st.expander("Show Dataset Samples"):
|
| 326 |
st.write(train_data[:10])
|