# Zerotoheroinmachinelearning / pages /LIFE_CYCLE_OF_MACHINE_LEARNING.py
# ronakreddy18's picture
# Update pages/LIFE_CYCLE_OF_MACHINE_LEARNING.py
# 4914bcc verified
# raw
# history blame
# 12.4 kB
import streamlit as st
import pandas as pd
import json
import xml.etree.ElementTree as ET
# Stylesheet applied to elements matching `.stButton>button`: green
# background, white text, full container width.
_BUTTON_CSS = """
<style>
.stButton>button {
background-color: #4CAF50;
color: white;
width: 100%;
}
</style>
"""
st.markdown(_BUTTON_CSS, unsafe_allow_html=True)

# Navigation state: only seed the default when no page has been stored yet,
# so an already-selected page survives script reruns.
if "page" not in st.session_state:
    st.session_state.page = "home"  # landing page
# ----------------- Home Page -----------------
def home_page():
    """Render the landing page listing the stages of an ML project.

    The "Data Collection" button navigates to the data-collection page.
    Every other stage button shows a short description directly below the
    button that was clicked.
    """
    st.title(":green[Lifecycle of a Machine Learning Project]")
    st.markdown("Click on a stage to learn more about it.")

    if st.button(":blue[📊 Data Collection]"):
        st.session_state.page = "data_collection"
        # The current run has already drawn this page; rerun now so the
        # router renders the new page on this click rather than the next one.
        st.rerun()

    # (button label, markdown shown under the button when it is clicked).
    # The emoji were restored from mis-decoded UTF-8 bytes in the labels.
    # NOTE(review): the last byte of the "Monitoring" emoji was lost, so the
    # memo icon is a best guess -- confirm against the intended design.
    stage_descriptions = (
        (
            ":blue[🌟 Problem Statement]",
            "### Problem Statement\nIdentify the problem you want to solve and set clear objectives and success criteria.",
        ),
        (
            ":blue[🛠️ Simple EDA]",
            "### Simple EDA\nPerform exploratory data analysis to understand data distributions and relationships.",
        ),
        (
            ":blue[Data Pre-Processing]",
            "### Data Pre-Processing\nConvert raw data into cleaned data.",
        ),
        (
            ":blue[📈 Exploratory Data Analysis (EDA)]",
            "### Exploratory Data Analysis (EDA)\nVisualize and analyze the data to understand its distributions and relationships.",
        ),
        (
            ":blue[🏋️ Feature Engineering]",
            "### Feature Engineering\nCreate new features from existing data.",
        ),
        (
            ":blue[🤖 Model Training]",
            "### Model Training\nTrain the model using the training data and optimize its parameters.",
        ),
        (
            ":blue[🔧 Model Testing]",
            "### Model Testing\nAssess the model's performance using various metrics and cross-validation techniques.",
        ),
        (
            ":blue[🚀 Model Deployment]",
            "### Model Deployment\nIntegrate the trained model into a production environment and monitor its performance.",
        ),
        (
            ":blue[📝 Monitoring]",
            "### Monitoring\nPeriodically retrain the model with new data and update features as needed.",
        ),
    )
    for label, description in stage_descriptions:
        if st.button(label):
            st.markdown(description)
# ----------------- Data Collection Page -----------------
def data_collection_page():
    """Render the Data Collection page with links to each data-type page.

    Each button stores the target page name in ``st.session_state.page`` and
    triggers a rerun so the router shows that page immediately.
    """
    st.title(":red[Data Collection]")
    st.markdown("### Data Collection\nThis page discusses the process of Data Collection.")
    st.markdown("Types of Data: **Structured**, **Unstructured**, **Semi-Structured**")

    # (button label, page name stored in session state). The emoji were
    # restored from mis-decoded UTF-8 bytes in the labels.
    destinations = (
        (":blue[🌟 Structured Data]", "structured_data"),
        (":blue[📷 Unstructured Data]", "unstructured_data"),
        (":blue[🗃️ Semi-Structured Data]", "semi_structured_data"),
        ("Back to Home", "home"),
    )
    for label, target in destinations:
        if st.button(label):
            st.session_state.page = target
            # Without a rerun the already-drawn page stays on screen until
            # the user interacts with the app a second time.
            st.rerun()
# ----------------- Structured Data Page -----------------
def structured_data_page():
    """Render the Structured Data page with links to the Excel, CSV and JSON pages.

    Each button stores the target page name in ``st.session_state.page`` and
    triggers a rerun so the router shows that page immediately.
    """
    st.title(":blue[Structured Data]")
    st.markdown("""
Structured data is highly organized and typically stored in tables like spreadsheets or databases. It is easy to search and analyze.
""")
    st.markdown("### Examples: Excel files, CSV files, JSON files")

    # The labels previously used "\ud83d\udcca"-style escapes. In a Python
    # str those are two lone surrogates, not one emoji, and cannot be encoded
    # as UTF-8. The real code points are used here instead.
    destinations = (
        (":green[📊 Excel]", "excel"),
        (":green[📄 CSV]", "csv"),
        (":green[🔹 JSON]", "json"),
        ("Back to Data Collection", "data_collection"),
    )
    for label, target in destinations:
        if st.button(label):
            st.session_state.page = target
            # Rerun so the router draws the new page on this click.
            st.rerun()
# ----------------- Excel Data Page -----------------
def excel_page():
    """Render the Excel tutorial page.

    Shows how to read an Excel file with pandas, common problems and their
    fixes, a link to a companion notebook, and a button back to the
    Structured Data page.
    """
    st.title(":green[Excel Data Format]")
    st.write("### What is Excel?")
    st.write("Excel is a spreadsheet tool for storing data in tabular format with rows and columns. Common file extensions: .xls, .xlsx.")

    st.write("### How to Read Excel Files")
    st.code("""
import pandas as pd
# Read an Excel file
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
print(df)
""", language='python')

    st.write("### Issues Encountered")
    st.write("""
- **File not found**: Incorrect file path.
- **Sheet name error**: Specified sheet doesn't exist.
- **Missing libraries**: openpyxl or xlrd might be missing.
""")

    st.write("### Solutions to These Issues")
    # The snippet is shown to the reader verbatim, so the try/except bodies
    # must be indented for it to be valid Python.
    st.code("""
# Install required libraries
# pip install openpyxl xlrd
# Handle missing file
try:
    df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
except FileNotFoundError:
    print("File not found. Check the file path.")
# List available sheet names
excel_file = pd.ExcelFile('data.xlsx')
print(excel_file.sheet_names)
""", language='python')

    st.link_button("Jupyter Notebook", "https://colab.research.google.com/drive/1ZTKWTknL-4IQ9QbAfcyKzIP-_lNxmz2P?usp=sharing")

    if st.button("Back to Structured Data"):
        st.session_state.page = "structured_data"
        # Rerun so the router draws the new page on this click.
        st.rerun()
# ----------------- CSV Data Page -----------------
def csv_page():
    """Render the CSV tutorial page.

    Shows how to read a CSV file with pandas, an error-handling example, a
    link to a companion notebook, and a button back to the Structured Data
    page.
    """
    st.title(":green[CSV Data Format]")
    st.write("### What is CSV?")
    st.write("CSV (Comma-Separated Values) files store tabular data in plain text, where each line is a data record and columns are separated by commas.")

    st.write("### How to Read CSV Files")
    st.code("""
import pandas as pd
# Read a CSV file
df = pd.read_csv('data.csv')
print(df)
""", language='python')

    st.write("### Error Handling for CSV Files")
    # The snippet is shown to the reader verbatim, so the try/except bodies
    # must be indented for it to be valid Python.
    st.code("""
import pandas as pd
try:
    df = pd.read_csv('data.csv', encoding='utf-8', delimiter=',')
    print("CSV File Loaded Successfully!")
    print(df)
except FileNotFoundError:
    print("Error: File not found. Please check the file path.")
except pd.errors.ParserError:
    print("Error: The file is not a valid CSV format.")
except UnicodeDecodeError:
    print("Error: Encoding issue. Try specifying a different encoding like 'latin1' or 'utf-8'.")
""", language='python')

    # TODO: "your_csv_guide_link" is a placeholder, not a real notebook URL.
    st.link_button("Jupyter Notebook", "https://colab.research.google.com/drive/your_csv_guide_link")

    if st.button("Back to Structured Data"):
        st.session_state.page = "structured_data"
        # Rerun so the router draws the new page on this click.
        st.rerun()
# ----------------- JSON Data Page -----------------
def json_page():
    """Render the JSON tutorial page.

    Shows a short description of JSON, an example of reading a JSON file, a
    link to a companion notebook, and a button back to the Structured Data
    page.
    """
    st.title(":green[JSON Data Format]")
    st.write("### What is JSON?")
    st.write("""
JSON (JavaScript Object Notation) is a lightweight data-interchange format.
""")

    # The snippet is shown to the reader verbatim, so the `with` body must be
    # indented for it to be valid Python.
    st.code("""
import json
# Read a JSON file
with open('data.json', 'r') as file:
    data = json.load(file)
print(data)
""", language='python')

    # TODO: "your_json_guide_link" is a placeholder, not a real notebook URL.
    st.link_button("Jupyter Notebook", "https://colab.research.google.com/drive/your_json_guide_link")

    if st.button("Back to Structured Data"):
        # The page name was previously cut off mid-literal ("structured),
        # which is a syntax error; the parent page is "structured_data".
        st.session_state.page = "structured_data"
        # Rerun so the router draws the new page on this click.
        st.rerun()
# ----------------- Unstructured Data Page -----------------
def unstructured_data_page():
    """Render the Unstructured Data page.

    Describes unstructured data, shows example snippets for text, image,
    video and audio handling, lists challenges and solutions, and offers a
    button back to the Data Collection page.

    The section-header emoji were restored from mis-decoded UTF-8 bytes, and
    the displayed snippets are indented so they are valid Python.
    """
    st.title(":blue[Unstructured Data]")
    st.markdown("""
**Unstructured data** does not have a predefined format. It consists of various data types like text, images, videos, and audio files.
Examples include:
- Text documents (e.g., .txt, .docx)
- Images (e.g., .jpg, .png)
- Videos (e.g., .mp4, .avi)
- Audio files (e.g., .mp3, .wav)
- Social media posts
""")

    st.header("📄 Handling Text Data")
    st.markdown("""
Text data can be analyzed using Natural Language Processing (NLP) techniques.
""")
    st.code("""
# Reading text data
with open('sample.txt', 'r') as file:
    text = file.read()
print(text)
# Basic text processing using NLTK
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
tokens = word_tokenize(text)
print(tokens)
""", language='python')

    st.header("🖼️ Handling Image Data")
    st.markdown("""
Image data can be processed using libraries like OpenCV and PIL (Pillow).
""")
    st.code("""
from PIL import Image
# Open an image file
image = Image.open('sample_image.jpg')
image.show()
# Convert image to grayscale
gray_image = image.convert('L')
gray_image.show()
""", language='python')

    st.header("🎥 Handling Video Data")
    st.markdown("""
Videos can be processed frame by frame using OpenCV.
""")
    st.code("""
import cv2
# Capture video
video = cv2.VideoCapture('sample_video.mp4')
while video.isOpened():
    ret, frame = video.read()
    if not ret:
        break
    cv2.imshow('Frame', frame)
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break
video.release()
cv2.destroyAllWindows()
""", language='python')

    st.header("🔊 Handling Audio Data")
    st.markdown("""
Audio data can be handled using libraries like librosa.
""")
    st.code("""
import librosa
import librosa.display
import matplotlib.pyplot as plt
# Load audio file
y, sr = librosa.load('sample_audio.mp3')
librosa.display.waveshow(y, sr=sr)
plt.title('Waveform')
plt.show()
""", language='python')

    st.markdown("### Challenges with Unstructured Data")
    st.write("""
- **Noise and Inconsistency**: Data is often incomplete or noisy.
- **Storage Requirements**: Large size and variability in data types.
- **Processing Time**: Analyzing unstructured data is computationally expensive.
""")

    st.markdown("### Solutions")
    st.write("""
- **Data Cleaning**: Preprocess data to remove noise.
- **Efficient Storage**: Use NoSQL databases (e.g., MongoDB) or cloud storage.
- **Parallel Processing**: Utilize frameworks like Apache Spark.
""")

    # Back to Data Collection
    if st.button("Back to Data Collection"):
        st.session_state.page = "data_collection"
        # Rerun so the router draws the new page on this click.
        st.rerun()
# ----------------- Semi-Structured Data Page -----------------
def semi_structured_data_page():
    """Render the Semi-Structured Data page.

    Describes semi-structured data, shows JSON and XML parsing snippets,
    lists challenges and solutions, and offers a button back to the Data
    Collection page.

    The section-header emoji were restored from mis-decoded UTF-8 bytes, and
    the displayed snippets are indented so they are valid Python.
    """
    st.title(":blue[Semi-Structured Data]")
    st.markdown("""
**Semi-structured data** does not conform strictly to a tabular structure but contains tags or markers to separate elements. Examples include:
- JSON (JavaScript Object Notation) files
- XML (Extensible Markup Language) files
- YAML (Yet Another Markup Language)
""")

    st.header("🔹 JSON Data")
    st.markdown("""
JSON is a popular format for storing and exchanging data.
""")
    st.code("""
# Sample JSON data
data = '''
{
    "name": "Alice",
    "age": 25,
    "skills": ["Python", "Machine Learning"]
}
'''
# Parse JSON
parsed_data = json.loads(data)
print(parsed_data['name']) # Output: Alice
""", language='python')

    st.header("🔹 Reading JSON Files")
    st.code("""
# Reading a JSON file
with open('data.json', 'r') as file:
    data = json.load(file)
print(data)
""", language='python')

    st.header("🔹 XML Data")
    st.markdown("""
XML is a markup language that defines a set of rules for encoding documents.
""")
    st.code("""
import xml.etree.ElementTree as ET
# Sample XML data
xml_data = '''
<person>
    <name>Bob</name>
    <age>30</age>
    <city>New York</city>
</person>
'''
# Parse XML
root = ET.fromstring(xml_data)
print(root.find('name').text) # Output: Bob
""", language='python')

    st.markdown("### Challenges with Semi-Structured Data")
    st.write("""
- **Complex Parsing**: Requires specialized parsers.
- **Nested Data**: Can be deeply nested, making it harder to process.
""")

    st.markdown("### Solutions")
    st.write("""
- **Libraries**: Use libraries like json, xml.etree.ElementTree, and yaml for parsing.
- **Validation**: Validate data formats to avoid parsing errors.
""")

    # Back to Data Collection
    if st.button("Back to Data Collection"):
        st.session_state.page = "data_collection"
        # Rerun so the router draws the new page on this click.
        st.rerun()
# ----------------- Router -----------------
def router():
    """Render the page named by ``st.session_state.page``.

    Includes the "json" route, which the Structured Data page navigates to
    but which was previously missing, leaving that page blank. An
    unrecognised page name falls back to the home page instead of rendering
    nothing.
    """
    # Page name stored in session state -> function that draws that page.
    pages = {
        "home": home_page,
        "data_collection": data_collection_page,
        "structured_data": structured_data_page,
        "excel": excel_page,
        "csv": csv_page,
        "json": json_page,
        "unstructured_data": unstructured_data_page,
        "semi_structured_data": semi_structured_data_page,
    }
    render_page = pages.get(st.session_state.page, home_page)
    render_page()
# Run the router only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    router()