import streamlit as st
import json
from collections import Counter
import contractions
import csv
import altair as alt
import pandas as pd
from typing import Tuple, List, Optional
from my_model.dataset.dataset_processor import process_okvqa_dataset
from my_model.config import dataset_config as config


class OKVQADatasetAnalyzer:
    """
    Provides tools for analyzing and visualizing the distribution of question types within the given question
    datasets. It supports operations such as loading data, categorizing questions based on keywords, visualizing
    the question distribution, and exporting the data to CSV files.

    Attributes:
        train_file_path (str): Path to the training dataset file.
        test_file_path (str): Path to the testing dataset file.
        data_choice (str): Choice of dataset(s) to analyze; options include 'train', 'test', or 'train_test'.
        questions (List[str]): List of questions aggregated based on the dataset choice.
        question_types (Counter): Counter object tracking the frequency of each question type.
        Qs (Dict[str, List[str]]): Dictionary mapping question types to lists of corresponding questions.
    """

    def __init__(self, train_file_path: str, test_file_path: str, data_choice: str):
        """
        Initializes the OKVQADatasetAnalyzer with paths to the dataset files and a choice of which datasets to
        analyze.

        Parameters:
            train_file_path (str): Path to the training dataset JSON file. This file should contain a list of
                questions.
            test_file_path (str): Path to the testing dataset JSON file. This file should also contain a list of
                questions.
            data_choice (str): Specifies which dataset(s) to load and analyze. Valid options are 'train', 'test',
                or 'train_test', indicating whether to load the training data, the testing data, or both.

        The constructor stores the paths, selects the dataset(s) based on the choice, and loads the initial data by
        calling the `load_data` method. It also prepares the structures used for categorizing questions and storing
        the results.
        """
        self.train_file_path = train_file_path
        self.test_file_path = test_file_path
        self.data_choice = data_choice
        self.questions = []
        self.question_types = Counter()
        self.Qs = {keyword: [] for keyword in config.QUESTION_KEYWORDS}
        self.load_data()

    def load_data(self) -> None:
        """
        Loads the dataset(s) from the specified JSON file(s) based on the user's choice of 'train', 'test', or
        'train_test'. This method updates the internal list of questions depending on the chosen dataset.
        """
        if self.data_choice in ['train', 'train_test']:
            with open(self.train_file_path, 'r') as file:
                train_data = json.load(file)
                self.questions += [q['question'] for q in train_data['questions']]

        if self.data_choice in ['test', 'train_test']:
            with open(self.test_file_path, 'r') as file:
                test_data = json.load(file)
                self.questions += [q['question'] for q in test_data['questions']]

    def categorize_questions(self) -> None:
        """
        Categorizes each question in the loaded data into predefined categories based on keywords.
        This method updates the internal dictionary `self.Qs` and the Counter `self.question_types` with the
        categorized questions.
        """
        question_keywords = config.QUESTION_KEYWORDS

        for question in self.questions:
            # Expand contractions (e.g. "what's" -> "what is") before keyword matching.
            question = contractions.fix(question)
            words = question.lower().split()
            question_keyword = None

            # Questions starting with "name the" form their own category; otherwise use
            # the first keyword that appears anywhere in the question.
            if words[:2] == ['name', 'the']:
                question_keyword = 'name the'
            else:
                for word in words:
                    if word in question_keywords:
                        question_keyword = word
                        break

            if question_keyword:
                self.question_types[question_keyword] += 1
                self.Qs[question_keyword].append(question)
            else:
                self.question_types["others"] += 1
                self.Qs["others"].append(question)

    def plot_question_distribution(self) -> None:
        """
        Plots an interactive bar chart of question types using Altair and Streamlit, displaying the count and
        percentage of each type.
        The chart sorts question types by count in descending order and includes detailed tooltips for interaction.
        This method is intended for visualization within a Streamlit application.
        """
        # Prepare the data: count and percentage per question keyword.
        total_questions = sum(self.question_types.values())
        items = [(key, value, (value / total_questions) * 100) for key, value in self.question_types.items()]
        df = pd.DataFrame(items, columns=['Question Keyword', 'Count', 'Percentage'])

        # Sort the data and handle the 'others' category specifically if present.
        df = df[df['Question Keyword'] != 'others'].sort_values('Count', ascending=False)
        if 'others' in self.question_types:
            others_df = pd.DataFrame([('others', self.question_types['others'],
                                       (self.question_types['others'] / total_questions) * 100)],
                                     columns=['Question Keyword', 'Count', 'Percentage'])
            df = pd.concat([df, others_df], ignore_index=True)

        # Explicitly set the order of the x-axis based on the sorted DataFrame.
        order = df['Question Keyword'].tolist()

        # Create the bar chart.
        bars = alt.Chart(df).mark_bar().encode(
            x=alt.X('Question Keyword:N', sort=order, title='Question Keyword', axis=alt.Axis(labelAngle=-45)),
            y=alt.Y('Count:Q', title='Frequency'),
            color=alt.Color('Question Keyword:N', scale=alt.Scale(scheme='category20'), legend=None),
            tooltip=[alt.Tooltip('Question Keyword:N', title='Type'),
                     alt.Tooltip('Count:Q', title='Count'),
                     alt.Tooltip('Percentage:Q', title='Percentage', format='.1f')]
        )

        # Create text labels for the bars with count and percentage.
        text = bars.mark_text(
            align='center',
            baseline='bottom',
            dy=-5  # Nudges the text up so it appears above the bar.
        ).encode(
            text=alt.Text('PercentageText:N')
        ).transform_calculate(
            PercentageText="datum.Count + ' (' + format(datum.Percentage, '.1f') + '%)'"
        )

        # Combine the bar and text layers.
        chart = (bars + text).properties(
            width=700,
            height=400,
            title='Distribution of Question Keywords'
        ).configure_title(fontSize=20).configure_axis(
            labelFontSize=12,
            titleFontSize=14
        )

        # Display the chart in Streamlit.
        st.altair_chart(chart, use_container_width=True)

    def export_to_csv(self, qs_filename: str, question_types_filename: str) -> None:
        """
        Exports the categorized questions and their counts to two separate CSV files.

        Parameters:
            qs_filename (str): The filename or path for exporting the `self.Qs` dictionary data.
            question_types_filename (str): The filename or path for exporting the `self.question_types` Counter data.

        This method writes the contents of `self.Qs` and `self.question_types` to the specified files in CSV format.
        Each CSV file includes a header row to make the exported data easier to understand and reuse.
        """
        # Export the self.Qs dictionary, one row per (question type, question) pair.
        with open(qs_filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Question Type', 'Questions'])
            for q_type, questions in self.Qs.items():
                for question in questions:
                    writer.writerow([q_type, question])

        # Export the self.question_types Counter, one row per question type with its count.
        with open(question_types_filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Question Type', 'Count'])
            for q_type, count in self.question_types.items():
                writer.writerow([q_type, count])
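

# --- Usage sketch (illustrative only, not part of the original module) ---
# A minimal example of how the analyzer might be driven end to end. The JSON paths and
# CSV filenames below are placeholders, and it assumes config.QUESTION_KEYWORDS contains
# the 'others' and 'name the' categories used during categorization. Note that
# plot_question_distribution() renders via Streamlit, so it should be called from within
# a running Streamlit app.
if __name__ == "__main__":
    analyzer = OKVQADatasetAnalyzer(
        train_file_path="train_questions.json",  # placeholder path
        test_file_path="test_questions.json",    # placeholder path
        data_choice="train_test",
    )
    analyzer.categorize_questions()
    analyzer.plot_question_distribution()
    analyzer.export_to_csv("questions_by_type.csv", "question_type_counts.csv")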
