# Page header captured with this file (not part of the program):
# Spaces: ronakreddy18 / Zerotoheroinmachinelearning (Sleeping)
# File size: 12,428 Bytes

import streamlit as st
import pandas as pd
import json
import xml.etree.ElementTree as ET

# Custom CSS applied to every Streamlit button: green background, white text,
# stretched to the full width of its container.
_BUTTON_STYLE = """
    <style>
    .stButton>button {
        background-color: #4CAF50;
        color: white;
        width: 100%;
    }
    </style>
    """
st.markdown(_BUTTON_STYLE, unsafe_allow_html=True)

# Seed the navigation state once per session; "home" is the landing page.
if "page" not in st.session_state:
    st.session_state["page"] = "home"

# ----------------- Home Page -----------------
def home_page():
    """Render the landing page listing the stages of an ML project lifecycle.

    The "Data Collection" button navigates to its own page. Every other stage
    button reveals a short description directly beneath itself when clicked.
    """
    st.title(":green[Lifecycle of a Machine Learning Project]")
    st.markdown("Click on a stage to learn more about it.")

    if st.button(":blue[📊 Data Collection]"):
        st.session_state.page = "data_collection"
        # Rerun immediately so the new page is drawn on this click; without it
        # the home page stays on screen until the next interaction.
        st.rerun()

    # (button label, markdown shown under the button when it is clicked)
    stage_summaries = (
        (
            ":blue[🌟 Problem Statement]",
            "### Problem Statement\nIdentify the problem you want to solve and set clear objectives and success criteria.",
        ),
        (
            ":blue[🛠️ Simple EDA]",
            "### Simple EDA\nPerform exploratory data analysis to understand data distributions and relationships.",
        ),
        (
            ":blue[Data Pre-Processing]",
            "### Data Pre-Processing\nConvert raw data into cleaned data.",
        ),
        (
            ":blue[📈 Exploratory Data Analysis (EDA)]",
            "### Exploratory Data Analysis (EDA)\nVisualize and analyze the data to understand its distributions and relationships.",
        ),
        (
            ":blue[🏋️ Feature Engineering]",
            "### Feature Engineering\nCreate new features from existing data.",
        ),
        (
            ":blue[🤖 Model Training]",
            "### Model Training\nTrain the model using the training data and optimize its parameters.",
        ),
        (
            ":blue[🔧 Model Testing]",
            "### Model Testing\nAssess the model's performance using various metrics and cross-validation techniques.",
        ),
        (
            ":blue[🚀 Model Deployment]",
            "### Model Deployment\nIntegrate the trained model into a production environment and monitor its performance.",
        ),
        (
            ":blue[📝 Monitoring]",
            "### Monitoring\nPeriodically retrain the model with new data and update features as needed.",
        ),
    )
    for label, summary in stage_summaries:
        if st.button(label):
            st.markdown(summary)

# ----------------- Data Collection Page -----------------
def data_collection_page():
    """Render the Data Collection page with links to each data-type page.

    Each button stores its target page name in ``st.session_state.page`` and
    reruns the script so the target page is shown on the same click.
    """
    st.title(":red[Data Collection]")
    st.markdown("### Data Collection\nThis page discusses the process of Data Collection.")
    st.markdown("Types of Data: **Structured**, **Unstructured**, **Semi-Structured**")

    # (button label, page name stored in session state); order is render order.
    destinations = (
        (":blue[🌟 Structured Data]", "structured_data"),
        (":blue[📷 Unstructured Data]", "unstructured_data"),
        (":blue[🗃️ Semi-Structured Data]", "semi_structured_data"),
        ("Back to Home", "home"),
    )
    for label, target in destinations:
        if st.button(label):
            st.session_state.page = target
            # Without an immediate rerun the current page stays visible until
            # the user interacts a second time.
            st.rerun()


# ----------------- Structured Data Page -----------------
def structured_data_page():
    """Render the Structured Data page with links to the Excel, CSV and JSON pages.

    Each button stores its target page name in ``st.session_state.page`` and
    reruns the script so the target page is shown on the same click.
    """
    st.title(":blue[Structured Data]")
    st.markdown("""
    Structured data is highly organized and typically stored in tables like spreadsheets or databases. It is easy to search and analyze.
    """)
    st.markdown("### Examples: Excel files, CSV files, JSON files")

    # Labels use the real emoji characters. JSON-style surrogate-pair escapes
    # ("\\ud83d\\udcca") become two lone surrogates in a Python str, which are
    # not the emoji and cannot be encoded as UTF-8.
    destinations = (
        (":green[📊 Excel]", "excel"),
        (":green[📄 CSV]", "csv"),
        (":green[🔹 JSON]", "json"),
        ("Back to Data Collection", "data_collection"),
    )
    for label, target in destinations:
        if st.button(label):
            st.session_state.page = target
            # Without an immediate rerun the current page stays visible until
            # the user interacts a second time.
            st.rerun()

# ----------------- Excel Data Page -----------------
def excel_page():
    """Render the Excel tutorial page.

    Shows what Excel files are, a pandas snippet for reading them, common
    problems with their fixes, a link to a notebook, and a button back to the
    Structured Data page.
    """
    st.title(":green[Excel Data Format]")

    st.write("### What is Excel?")
    st.write("Excel is a spreadsheet tool for storing data in tabular format with rows and columns. Common file extensions: .xls, .xlsx.")

    st.write("### How to Read Excel Files")
    st.code("""
import pandas as pd

# Read an Excel file
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
print(df)
    """, language='python')

    st.write("### Issues Encountered")
    st.write("""
- **File not found**: Incorrect file path.
- **Sheet name error**: Specified sheet doesn't exist.
- **Missing libraries**: openpyxl or xlrd might be missing.
""")

    st.write("### Solutions to These Issues")
    st.code("""
# Install required libraries
# pip install openpyxl xlrd

# Handle missing file
try:
    df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
except FileNotFoundError:
    print("File not found. Check the file path.")

# List available sheet names
excel_file = pd.ExcelFile('data.xlsx')
print(excel_file.sheet_names)
    """, language='python')

    st.link_button("Jupyter Notebook", "https://colab.research.google.com/drive/1ZTKWTknL-4IQ9QbAfcyKzIP-_lNxmz2P?usp=sharing")

    if st.button("Back to Structured Data"):
        st.session_state.page = "structured_data"
        # Rerun immediately so the target page is drawn on this click instead
        # of after the next interaction.
        st.rerun()

# ----------------- CSV Data Page -----------------
def csv_page():
    """Render the CSV tutorial page.

    Shows what CSV files are, pandas snippets for reading them with and
    without error handling, a link to a notebook, and a button back to the
    Structured Data page.
    """
    st.title(":green[CSV Data Format]")

    st.write("### What is CSV?")
    st.write("CSV (Comma-Separated Values) files store tabular data in plain text, where each line is a data record and columns are separated by commas.")

    st.write("### How to Read CSV Files")
    st.code("""
import pandas as pd

# Read a CSV file
df = pd.read_csv('data.csv')
print(df)
    """, language='python')

    st.write("### Error Handling for CSV Files")
    st.code("""
import pandas as pd

try:
    df = pd.read_csv('data.csv', encoding='utf-8', delimiter=',')
    print("CSV File Loaded Successfully!")
    print(df)
except FileNotFoundError:
    print("Error: File not found. Please check the file path.")
except pd.errors.ParserError:
    print("Error: The file is not a valid CSV format.")
except UnicodeDecodeError:
    print("Error: Encoding issue. Try specifying a different encoding like 'latin1' or 'utf-8'.")
    """, language='python')

    # NOTE(review): this URL looks like a placeholder - confirm the real notebook link.
    st.link_button("Jupyter Notebook", "https://colab.research.google.com/drive/your_csv_guide_link")

    if st.button("Back to Structured Data"):
        st.session_state.page = "structured_data"
        # Rerun immediately so the target page is drawn on this click instead
        # of after the next interaction.
        st.rerun()

# ----------------- JSON Data Page -----------------
def json_page():
    """Render the JSON tutorial page.

    Shows what JSON is, a snippet for reading a JSON file, a link to a
    notebook, and a button back to the Structured Data page.
    """
    st.title(":green[JSON Data Format]")

    st.write("### What is JSON?")
    st.write("""
    JSON (JavaScript Object Notation) is a lightweight data-interchange format.
    """)

    st.code("""
import json

# Read a JSON file
with open('data.json', 'r') as file:
    data = json.load(file)
    print(data)
    """, language='python')

    # NOTE(review): this URL looks like a placeholder - confirm the real notebook link.
    st.link_button("Jupyter Notebook", "https://colab.research.google.com/drive/your_json_guide_link")

    if st.button("Back to Structured Data"):
        # The original line was cut off mid-string; the target matches the
        # other "Back to Structured Data" buttons in this file.
        st.session_state.page = "structured_data"
        # Rerun immediately so the target page is drawn on this click instead
        # of after the next interaction.
        st.rerun()


# ----------------- Unstructured Data Page -----------------
def unstructured_data_page():
    """Render the Unstructured Data tutorial page.

    Covers text, image, video and audio handling with one code sample each,
    lists common challenges and solutions, and ends with a button back to the
    Data Collection page.
    """
    st.title(":blue[Unstructured Data]")
    
    st.markdown("""
    **Unstructured data** does not have a predefined format. It consists of various data types like text, images, videos, and audio files.
    Examples include:
    - Text documents (e.g., .txt, .docx)
    - Images (e.g., .jpg, .png)
    - Videos (e.g., .mp4, .avi)
    - Audio files (e.g., .mp3, .wav)
    - Social media posts
    """)

    st.header("📄 Handling Text Data")
    st.markdown("""
    Text data can be analyzed using Natural Language Processing (NLP) techniques.
    """)
    st.code("""
# Reading text data
with open('sample.txt', 'r') as file:
    text = file.read()
    print(text)

# Basic text processing using NLTK
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
tokens = word_tokenize(text)
print(tokens)
    """, language='python')

    st.header("🖼️ Handling Image Data")
    st.markdown("""
    Image data can be processed using libraries like OpenCV and PIL (Pillow).
    """)
    st.code("""
from PIL import Image

# Open an image file
image = Image.open('sample_image.jpg')
image.show()

# Convert image to grayscale
gray_image = image.convert('L')
gray_image.show()
    """, language='python')

    st.header("🎥 Handling Video Data")
    st.markdown("""
    Videos can be processed frame by frame using OpenCV.
    """)
    st.code("""
import cv2

# Capture video
video = cv2.VideoCapture('sample_video.mp4')

while video.isOpened():
    ret, frame = video.read()
    if not ret:
        break
    cv2.imshow('Frame', frame)
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break

video.release()
cv2.destroyAllWindows()
    """, language='python')

    st.header("🔊 Handling Audio Data")
    st.markdown("""
    Audio data can be handled using libraries like librosa.
    """)
    st.code("""
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Load audio file
y, sr = librosa.load('sample_audio.mp3')
librosa.display.waveshow(y, sr=sr)
plt.title('Waveform')
plt.show()
    """, language='python')

    st.markdown("### Challenges with Unstructured Data")
    st.write("""
    - **Noise and Inconsistency**: Data is often incomplete or noisy.
    - **Storage Requirements**: Large size and variability in data types.
    - **Processing Time**: Analyzing unstructured data is computationally expensive.
    """)

    st.markdown("### Solutions")
    st.write("""
    - **Data Cleaning**: Preprocess data to remove noise.
    - **Efficient Storage**: Use NoSQL databases (e.g., MongoDB) or cloud storage.
    - **Parallel Processing**: Utilize frameworks like Apache Spark.
    """)

    # Back to Data Collection
    if st.button("Back to Data Collection"):
        st.session_state.page = "data_collection"
        # Rerun immediately so the target page is drawn on this click instead
        # of after the next interaction.
        st.rerun()

# ----------------- Semi-Structured Data Page -----------------
def semi_structured_data_page():
    """Render the Semi-Structured Data tutorial page.

    Covers JSON and XML with code samples, lists common challenges and
    solutions, and ends with a button back to the Data Collection page.
    """
    st.title(":blue[Semi-Structured Data]")
    
    st.markdown("""
    **Semi-structured data** does not conform strictly to a tabular structure but contains tags or markers to separate elements. Examples include:
    - JSON (JavaScript Object Notation) files
    - XML (Extensible Markup Language) files
    - YAML (Yet Another Markup Language)
    """)

    st.header("🔹 JSON Data")
    st.markdown("""
    JSON is a popular format for storing and exchanging data.
    """)
    # The sample imports json itself so it runs when copied on its own.
    st.code("""
import json

# Sample JSON data
data = '''
{
    "name": "Alice",
    "age": 25,
    "skills": ["Python", "Machine Learning"]
}
'''

# Parse JSON
parsed_data = json.loads(data)
print(parsed_data['name'])  # Output: Alice
    """, language='python')

    st.header("🔹 Reading JSON Files")
    st.code("""
# Reading a JSON file
with open('data.json', 'r') as file:
    data = json.load(file)
    print(data)
    """, language='python')

    st.header("🔹 XML Data")
    st.markdown("""
    XML is a markup language that defines a set of rules for encoding documents.
    """)
    st.code("""
import xml.etree.ElementTree as ET

# Sample XML data
xml_data = '''
<person>
    <name>Bob</name>
    <age>30</age>
    <city>New York</city>
</person>
'''

# Parse XML
root = ET.fromstring(xml_data)
print(root.find('name').text)  # Output: Bob
    """, language='python')

    st.markdown("### Challenges with Semi-Structured Data")
    st.write("""
    - **Complex Parsing**: Requires specialized parsers.
    - **Nested Data**: Can be deeply nested, making it harder to process.
    """)

    st.markdown("### Solutions")
    st.write("""
    - **Libraries**: Use libraries like json, xml.etree.ElementTree, and yaml for parsing.
    - **Validation**: Validate data formats to avoid parsing errors.
    """)

    # Back to Data Collection
    if st.button("Back to Data Collection"):
        st.session_state.page = "data_collection"
        # Rerun immediately so the target page is drawn on this click instead
        # of after the next interaction.
        st.rerun()

# ----------------- Router -----------------
def router():
    """Render the page named by ``st.session_state.page``.

    Page names map to their render functions through a dispatch table. An
    unrecognised name resets the state to ``"home"`` and renders the home page
    instead of leaving the app blank.
    """
    pages = {
        "home": home_page,
        "data_collection": data_collection_page,
        "structured_data": structured_data_page,
        "excel": excel_page,
        "csv": csv_page,
        # structured_data_page navigates to "json", so it needs a route too.
        "json": json_page,
        "unstructured_data": unstructured_data_page,
        "semi_structured_data": semi_structured_data_page,
    }

    render = pages.get(st.session_state.page)
    if render is None:
        # Unknown page name: recover to the landing page rather than render nothing.
        st.session_state.page = "home"
        render = home_page
    render()


# Run the router function
if __name__ == "__main__":
    router()