|
import streamlit as st |
|
import pandas as pd |
|
import json |
|
import xml.etree.ElementTree as ET |
|
|
|
|
|
st.markdown(""" |
|
<style> |
|
.stButton>button { |
|
background-color: #4CAF50; |
|
color: white; |
|
width: 100%; |
|
} |
|
</style> |
|
""", unsafe_allow_html=True) |
|
|
|
|
|
if 'page' not in st.session_state: |
|
st.session_state.page = "home" |
|
|
|
|
|
def home_page(): |
|
st.title(":green[Lifecycle of a Machine Learning Project]") |
|
st.markdown("Click on a stage to learn more about it.") |
|
|
|
|
|
if st.button(":blue[π Data Collection]"): |
|
st.session_state.page = "data_collection" |
|
|
|
if st.button(":blue[π Problem Statement]"): |
|
st.markdown("### Problem Statement\nIdentify the problem you want to solve and set clear objectives and success criteria.") |
|
|
|
if st.button(":blue[π οΈ Simple EDA]"): |
|
st.markdown("### Simple EDA\nPerform exploratory data analysis to understand data distributions and relationships.") |
|
|
|
if st.button(":blue[π§Ή Data Pre-Processing]"): |
|
st.markdown("### Data Pre-Processing\nConvert raw data into cleaned data.") |
|
|
|
if st.button(":blue[π Exploratory Data Analysis (EDA)]"): |
|
st.markdown("### Exploratory Data Analysis (EDA)\nVisualize and analyze the data to understand its distributions and relationships.") |
|
|
|
if st.button(":blue[ποΈ Feature Engineering]"): |
|
st.markdown("### Feature Engineering\nCreate new features from existing data.") |
|
|
|
if st.button(":blue[π€ Model Training]"): |
|
st.markdown("### Model Training\nTrain the model using the training data and optimize its parameters.") |
|
|
|
if st.button(":blue[π§ Model Testing]"): |
|
st.markdown("### Model Testing\nAssess the model's performance using various metrics and cross-validation techniques.") |
|
|
|
if st.button(":blue[π Model Deployment]"): |
|
st.markdown("### Model Deployment\nIntegrate the trained model into a production environment and monitor its performance.") |
|
|
|
if st.button(":blue[π Monitoring]"): |
|
st.markdown("### Monitoring\nPeriodically retrain the model with new data and update features as needed.") |
|
|
|
|
|
def data_collection_page(): |
|
st.title(":red[Data Collection]") |
|
st.markdown("### Data Collection\nThis page discusses the process of Data Collection.") |
|
st.markdown("Types of Data: **Structured**, **Unstructured**, **Semi-Structured**") |
|
|
|
if st.button(":blue[π Structured Data]"): |
|
st.session_state.page = "structured_data" |
|
|
|
if st.button(":blue[π· Unstructured Data]"): |
|
st.session_state.page = "unstructured_data" |
|
|
|
if st.button(":blue[ποΈ Semi-Structured Data]"): |
|
st.session_state.page = "semi_structured_data" |
|
|
|
if st.button("Back to Home"): |
|
st.session_state.page = "home" |
|
|
|
|
|
def structured_data_page(): |
|
st.title(":blue[Structured Data]") |
|
st.markdown(""" |
|
Structured data is highly organized and typically stored in tables like spreadsheets or databases. It is easy to search and analyze. |
|
""") |
|
st.markdown("### Examples: Excel files") |
|
|
|
if st.button(":green[π Excel]"): |
|
st.session_state.page = "excel" |
|
|
|
if st.button("Back to Data Collection"): |
|
st.session_state.page = "data_collection" |
|
|
|
|
|
def excel_page(): |
|
st.title(":green[Excel Data Format]") |
|
|
|
st.write("### What is Excel?") |
|
st.write("Excel is a spreadsheet tool for storing data in tabular format with rows and columns. Common file extensions: .xls, .xlsx.") |
|
|
|
st.write("### How to Read Excel Files") |
|
st.code(""" |
|
import pandas as pd |
|
|
|
# Read an Excel file |
|
df = pd.read_excel('data.xlsx', sheet_name='Sheet1') |
|
print(df) |
|
""", language='python') |
|
|
|
st.write("### Issues Encountered") |
|
st.write(""" |
|
- **File not found**: Incorrect file path. |
|
- **Sheet name error**: Specified sheet doesn't exist. |
|
- **Missing libraries**: openpyxl or xlrd might be missing. |
|
""") |
|
|
|
st.write("### Solutions to These Issues") |
|
st.code(""" |
|
# Install required libraries |
|
# pip install openpyxl xlrd |
|
|
|
# Handle missing file |
|
try: |
|
df = pd.read_excel('data.xlsx', sheet_name='Sheet1') |
|
except FileNotFoundError: |
|
print("File not found. Check the file path.") |
|
|
|
# List available sheet names |
|
excel_file = pd.ExcelFile('data.xlsx') |
|
print(excel_file.sheet_names) |
|
""", language='python') |
|
|
|
st.markdown('[Jupyter Notebook](https://colab.research.google.com/drive/1Dv68m9hcRzXsLRlRit0uZc-8CB8U6VV3?usp=sharing)') |
|
|
|
|
|
if st.button("Back to Structured Data"): |
|
st.session_state.page = "structured_data" |
|
|
|
|
|
def unstructured_data_page(): |
|
st.title(":blue[Unstructured Data]") |
|
|
|
st.markdown(""" |
|
**Unstructured data** does not have a predefined format. It consists of various data types like text, images, videos, and audio files. |
|
Examples include: |
|
- Text documents (e.g., .txt, .docx) |
|
- Images (e.g., .jpg, .png) |
|
- Videos (e.g., .mp4, .avi) |
|
- Audio files (e.g., .mp3, .wav) |
|
- Social media posts |
|
""") |
|
|
|
st.header("π Handling Text Data") |
|
st.markdown(""" |
|
Text data can be analyzed using Natural Language Processing (NLP) techniques. |
|
""") |
|
st.code(""" |
|
# Reading text data |
|
with open('sample.txt', 'r') as file: |
|
text = file.read() |
|
print(text) |
|
|
|
# Basic text processing using NLTK |
|
import nltk |
|
from nltk.tokenize import word_tokenize |
|
|
|
nltk.download('punkt') |
|
tokens = word_tokenize(text) |
|
print(tokens) |
|
""", language='python') |
|
|
|
st.header("πΌοΈ Handling Image Data") |
|
st.markdown(""" |
|
Image data can be processed using libraries like OpenCV and PIL (Pillow). |
|
""") |
|
st.code(""" |
|
from PIL import Image |
|
|
|
# Open an image file |
|
image = Image.open('sample_image.jpg') |
|
image.show() |
|
|
|
# Convert image to grayscale |
|
gray_image = image.convert('L') |
|
gray_image.show() |
|
""", language='python') |
|
|
|
st.header("π₯ Handling Video Data") |
|
st.markdown(""" |
|
Videos can be processed frame by frame using OpenCV. |
|
""") |
|
st.code(""" |
|
import cv2 |
|
|
|
# Capture video |
|
video = cv2.VideoCapture('sample_video.mp4') |
|
|
|
while video.isOpened(): |
|
ret, frame = video.read() |
|
if not ret: |
|
break |
|
cv2.imshow('Frame', frame) |
|
if cv2.waitKey(25) & 0xFF == ord('q'): |
|
break |
|
|
|
video.release() |
|
cv2.destroyAllWindows() |
|
""", language='python') |
|
|
|
st.header("π Handling Audio Data") |
|
st.markdown(""" |
|
Audio data can be handled using libraries like librosa. |
|
""") |
|
st.code(""" |
|
import librosa |
|
import librosa.display |
|
import matplotlib.pyplot as plt |
|
|
|
# Load audio file |
|
y, sr = librosa.load('sample_audio.mp3') |
|
librosa.display.waveshow(y, sr=sr) |
|
plt.title('Waveform') |
|
plt.show() |
|
""", language='python') |
|
|
|
st.markdown("### Challenges with Unstructured Data") |
|
st.write(""" |
|
- **Noise and Inconsistency**: Data is often incomplete or noisy. |
|
- **Storage Requirements**: Large size and variability in data types. |
|
- **Processing Time**: Analyzing unstructured data is computationally expensive. |
|
""") |
|
|
|
st.markdown("### Solutions") |
|
st.write(""" |
|
- **Data Cleaning**: Preprocess data to remove noise. |
|
- **Efficient Storage**: Use NoSQL databases (e.g., MongoDB) or cloud storage. |
|
- **Parallel Processing**: Utilize frameworks like Apache Spark. |
|
""") |
|
|
|
|
|
if st.button("Back to Data Collection"): |
|
st.session_state.page = "data_collection" |
|
|
|
|
|
def semi_structured_data_page(): |
|
st.title(":orange[Semi-Structured Data]") |
|
st.markdown(""" |
|
Semi-structured data does not follow the rigid structure of relational databases but still has some organizational properties. Examples include: |
|
- JSON files |
|
- XML files |
|
""") |
|
|
|
if st.button(":green[πΎ JSON]"): |
|
st.session_state.page = "json" |
|
|
|
if st.button(":green[π CSV]"): |
|
st.session_state.page = "csv" |
|
|
|
if st.button(":green[π XML]"): |
|
st.session_state.page = "xml" |
|
|
|
if st.button("Back to Data Collection"): |
|
st.session_state.page = "data_collection" |
|
|
|
|
|
def json_page(): |
|
st.title(":green[JSON Data Format]") |
|
|
|
st.write("### What is JSON?") |
|
st.write(""" |
|
JSON (JavaScript Object Notation) is a lightweight data-interchange format that's easy for humans to read and write, and easy for machines to parse and generate. JSON is often used in APIs, configuration files, and data transfer applications. |
|
""") |
|
|
|
st.write("### Reading JSON Files") |
|
st.code(""" |
|
import json |
|
# Read a JSON file |
|
with open('data.json', 'r') as file: |
|
data = json.load(file) |
|
print(data) |
|
""", language='python') |
|
|
|
st.write("### Writing JSON Files") |
|
st.code(""" |
|
import json |
|
# Write data to JSON file |
|
data = { |
|
"name": "Alice", |
|
"age": 25, |
|
"skills": ["Python", "Machine Learning"] |
|
} |
|
with open('data.json', 'w') as file: |
|
json.dump(data, file, indent=4) |
|
""", language='python') |
|
|
|
st.markdown("### Tips for Handling JSON Files") |
|
st.write(""" |
|
- JSON files can be nested, so you might need to navigate through dictionaries and lists. |
|
- If the structure is complex, you can use libraries like json_normalize() in pandas to flatten the JSON into a more tabular format for easier analysis. |
|
- JSON supports both strings and numbers, and other types like arrays and booleans, making it versatile for various data types. |
|
""") |
|
|
|
st.markdown('[Jupyter Notebook](https://huggingface.co/spaces/ronakreddy18/Zerotoheroinmachinelearning/blob/main/pages/json_file__handling.ipynb)') |
|
|
|
if st.button("Back to Semi-Structured Data"): |
|
st.session_state.page = "semi_structured_data" |
|
|
|
|
|
def csv_page(): |
|
st.title(":green[CSV Data Format]") |
|
|
|
st.write("### What is CSV?") |
|
st.write(""" |
|
CSV (Comma-Separated Values) files store tabular data in plain text, where each line is a data record and columns are separated by commas. |
|
""") |
|
|
|
st.write("### Reading CSV Files") |
|
st.code(""" |
|
import pandas as pd |
|
|
|
# Read a CSV file |
|
df = pd.read_csv('data.csv') |
|
print(df) |
|
""", language='python') |
|
|
|
st.write("### Error Handling for CSV Files") |
|
st.code(""" |
|
import pandas as pd |
|
|
|
try: |
|
df = pd.read_csv('data.csv', encoding='utf-8', delimiter=',') |
|
print("CSV File Loaded Successfully!") |
|
print(df) |
|
except FileNotFoundError: |
|
print("Error: File not found. Please check the file path.") |
|
except pd.errors.ParserError: |
|
print("Error: The file is not a valid CSV format.") |
|
except UnicodeDecodeError: |
|
print("Error: Encoding issue. Try specifying a different encoding like 'latin1' or 'utf-8'.") |
|
""", language='python') |
|
|
|
st.markdown('[Jupyter Notebook](https://huggingface.co/spaces/ronakreddy18/Zerotoheroinmachinelearning/blob/main/pages/CSV_HANDLING_GUIDE.ipynb)') |
|
|
|
if st.button("Back to Semi-Structured Data"): |
|
st.session_state.page = "semi_structured_data" |
|
|
|
|
|
def xml_page(): |
|
st.title(":green[XML Data Format]") |
|
|
|
st.write("### What is XML?") |
|
st.write(""" |
|
XML (Extensible Markup Language) is a markup language used for storing and exchanging structured data. It uses a hierarchical structure with tags to define elements. |
|
""") |
|
|
|
st.write("### Reading XML Files") |
|
st.code(""" |
|
import xml.etree.ElementTree as ET |
|
|
|
# Load and parse an XML file |
|
tree = ET.parse('data.xml') |
|
root = tree.getroot() |
|
|
|
# Access elements |
|
for child in root: |
|
print(child.tag, child.text) |
|
""", language='python') |
|
|
|
st.write("### Sample XML Data") |
|
st.code(""" |
|
<company> |
|
<employee> |
|
<name>John Doe</name> |
|
<role>Developer</role> |
|
</employee> |
|
<employee> |
|
<name>Jane Smith</name> |
|
<role>Manager</role> |
|
</employee> |
|
</company> |
|
""", language='xml') |
|
|
|
st.write("### Issues Encountered") |
|
st.write(""" |
|
- **File not found**: The specified XML file path is incorrect. |
|
- **Malformed XML**: The XML structure has syntax errors. |
|
- **XPath Errors**: Incorrect XPath expressions when querying data. |
|
""") |
|
|
|
st.write("### Solutions to These Issues") |
|
st.code(""" |
|
# Handle missing file |
|
try: |
|
tree = ET.parse('data.xml') |
|
except FileNotFoundError: |
|
print("File not found. Check the file path.") |
|
|
|
# Validate XML structure |
|
try: |
|
root = ET.fromstring(xml_data) |
|
except ET.ParseError: |
|
print("Malformed XML.") |
|
""", language='python') |
|
|
|
st.markdown('[Jupyter Notebook](https://colab.research.google.com/drive/1Dv68m9hcRzXsLRlRit0uZc-8CB8U6VV3?usp=sharing)') |
|
|
|
|
|
|
|
if st.button("Back to Semi-Structured Data"): |
|
st.session_state.page = "semi_structured_data" |
|
|
|
|
|
if st.session_state.page == "home": |
|
home_page() |
|
elif st.session_state.page == "data_collection": |
|
data_collection_page() |
|
elif st.session_state.page == "structured_data": |
|
structured_data_page() |
|
elif st.session_state.page == "excel": |
|
excel_page() |
|
elif st.session_state.page == "csv": |
|
csv_page() |
|
elif st.session_state.page == "json": |
|
json_page() |
|
elif st.session_state.page == "unstructured_data": |
|
unstructured_data_page() |
|
elif st.session_state.page == "semi_structured_data": |
|
semi_structured_data_page() |
|
elif st.session_state.page == "xml": |
|
xml_page() |
|
|