Spaces:
Build error
Build error
Update pages/LIFE_CYCLE_OF_MACHINE_LEARNING.py
Browse files
pages/LIFE_CYCLE_OF_MACHINE_LEARNING.py
CHANGED
|
@@ -227,6 +227,252 @@ plt.show()
|
|
| 227 |
|
| 228 |
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
# ----------------- Semi-Structured Data Page -----------------
|
| 231 |
def semi_structured_data_page():
|
| 232 |
st.title(":orange[Semi-Structured Data]")
|
|
@@ -273,7 +519,8 @@ import json
|
|
| 273 |
data = {
|
| 274 |
"name": "Alice",
|
| 275 |
"age": 25,
|
| 276 |
-
"skills
|
|
|
|
| 277 |
}
|
| 278 |
with open('data.json', 'w') as file:
|
| 279 |
json.dump(data, file, indent=4)
|
|
|
|
| 227 |
|
| 228 |
|
| 229 |
|
| 230 |
+
# ----------------- Semi-Structured Data Page -----------------
|
| 231 |
+
def semi_structured_data_page():
    """Render the Semi-Structured Data page.

    Shows a short description of semi-structured data followed by one button
    per format. Clicking a button stores the name of the target page in
    ``st.session_state.page``.
    """
    # NOTE(review): another ``def semi_structured_data_page()`` is visible
    # further down this file; the definition that comes last is the one in
    # effect. Confirm this copy is still needed.
    st.title(":orange[Semi-Structured Data]")
    st.markdown("""
    Semi-structured data does not follow the rigid structure of relational databases but still has some organizational properties. Examples include:
    - JSON files
    - XML files
    """)

    # NOTE(review): the label icons were reconstructed from mis-decoded text
    # (UTF-8 rendered as ISO-8859-7). The JSON icon decodes cleanly; the CSV
    # and XML icons are best guesses and should be confirmed.
    format_buttons = (
        (":green[💾 JSON]", "json"),
        (":green[📄 CSV]", "csv"),
        (":green[📑 XML]", "xml"),
    )
    for label, target_page in format_buttons:
        if st.button(label):
            st.session_state.page = target_page
|
| 247 |
+
import streamlit as st
|
| 248 |
+
import pandas as pd
|
| 249 |
+
import json
|
| 250 |
+
import xml.etree.ElementTree as ET
|
| 251 |
+
|
| 252 |
+
# Custom CSS applied to Streamlit buttons: green background, white text,
# full available width.
_BUTTON_CSS = """
<style>
.stButton>button {
background-color: #4CAF50;
color: white;
width: 100%;
}
</style>
"""
st.markdown(_BUTTON_CSS, unsafe_allow_html=True)

# Navigation state: when no page has been recorded yet, start on "home".
if "page" not in st.session_state:
    st.session_state["page"] = "home"
|
| 266 |
+
|
| 267 |
+
# ----------------- Home Page -----------------
|
| 268 |
+
def home_page():
    """Render the home page listing the stages of an ML project.

    The "Data Collection" button stores ``"data_collection"`` in
    ``st.session_state.page``. Every other stage button, when clicked,
    renders a short markdown summary directly beneath itself.
    """
    st.title(":green[Lifecycle of a Machine Learning Project]")
    st.markdown("Click on a stage to learn more about it.")

    # NOTE(review): label icons were reconstructed from mis-decoded text
    # (UTF-8 rendered as ISO-8859-7). The Simple EDA, Data Pre-Processing and
    # Model Training icons decode cleanly; the others are best guesses and
    # should be confirmed.
    if st.button(":blue[📊 Data Collection]"):
        st.session_state.page = "data_collection"

    # (button label, markdown shown when that button is clicked), in display order.
    stage_summaries = (
        (
            ":blue[📝 Problem Statement]",
            "### Problem Statement\nIdentify the problem you want to solve and set clear objectives and success criteria.",
        ),
        (
            ":blue[🛠️ Simple EDA]",
            "### Simple EDA\nPerform exploratory data analysis to understand data distributions and relationships.",
        ),
        (
            ":blue[🧹 Data Pre-Processing]",
            "### Data Pre-Processing\nConvert raw data into cleaned data.",
        ),
        (
            ":blue[🔍 Exploratory Data Analysis (EDA)]",
            "### Exploratory Data Analysis (EDA)\nVisualize and analyze the data to understand its distributions and relationships.",
        ),
        (
            ":blue[🏗️ Feature Engineering]",
            "### Feature Engineering\nCreate new features from existing data.",
        ),
        (
            ":blue[🤖 Model Training]",
            "### Model Training\nTrain the model using the training data and optimize its parameters.",
        ),
        (
            ":blue[🔧 Model Testing]",
            "### Model Testing\nAssess the model's performance using various metrics and cross-validation techniques.",
        ),
        (
            ":blue[🚀 Model Deployment]",
            "### Model Deployment\nIntegrate the trained model into a production environment and monitor its performance.",
        ),
        (
            ":blue[📈 Monitoring]",
            "### Monitoring\nPeriodically retrain the model with new data and update features as needed.",
        ),
    )
    for label, summary in stage_summaries:
        if st.button(label):
            st.markdown(summary)
|
| 302 |
+
|
| 303 |
+
# ----------------- Data Collection Page -----------------
|
| 304 |
+
def data_collection_page():
    """Render the Data Collection page.

    Shows an introduction and one button per data type plus a "Back to Home"
    button. Clicking a button stores the name of the target page in
    ``st.session_state.page``.
    """
    st.title(":red[Data Collection]")
    st.markdown("### Data Collection\nThis page discusses the process of Data Collection.")
    st.markdown("Types of Data: **Structured**, **Unstructured**, **Semi-Structured**")

    # NOTE(review): label icons were reconstructed from mis-decoded text
    # (UTF-8 rendered as ISO-8859-7); they are best guesses and should be
    # confirmed against the intended emoji.
    navigation = (
        (":blue[📋 Structured Data]", "structured_data"),
        (":blue[📷 Unstructured Data]", "unstructured_data"),
        (":blue[🗂️ Semi-Structured Data]", "semi_structured_data"),
        ("Back to Home", "home"),
    )
    for label, target_page in navigation:
        if st.button(label):
            st.session_state.page = target_page
|
| 320 |
+
|
| 321 |
+
# ----------------- Structured Data Page -----------------
|
| 322 |
+
def structured_data_page():
    """Render the Structured Data page.

    Shows a description of structured data, a button leading to the Excel
    page, and a button leading back to Data Collection. Clicking a button
    stores the name of the target page in ``st.session_state.page``.
    """
    st.title(":blue[Structured Data]")
    st.markdown("""
    Structured data is highly organized and typically stored in tables like spreadsheets or databases. It is easy to search and analyze.
    """)
    st.markdown("### Examples: Excel files")

    # NOTE(review): the Excel icon was reconstructed from mis-decoded text
    # (UTF-8 rendered as ISO-8859-7); it is a best guess and should be
    # confirmed.
    if st.button(":green[📊 Excel]"):
        st.session_state.page = "excel"

    if st.button("Back to Data Collection"):
        st.session_state.page = "data_collection"
|
| 334 |
+
|
| 335 |
+
# ----------------- Excel Data Page -----------------
|
| 336 |
+
# Code samples displayed by excel_page(). They are kept at module level so
# the displayed text never depends on the indentation of the function body.
_EXCEL_READ_SAMPLE = """
import pandas as pd

# Read an Excel file
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
print(df)
"""

# The try/except bodies are indented so the displayed sample is valid Python.
_EXCEL_FIXES_SAMPLE = """
# Install required libraries
# pip install openpyxl xlrd

# Handle missing file
try:
    df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
except FileNotFoundError:
    print("File not found. Check the file path.")

# List available sheet names
excel_file = pd.ExcelFile('data.xlsx')
print(excel_file.sheet_names)
"""


def excel_page():
    """Render the Excel Data Format page.

    Displays what Excel is, a sample for reading Excel files with pandas,
    common issues and their fixes, a link to a notebook, and a button that
    stores ``"structured_data"`` in ``st.session_state.page``.
    """
    st.title(":green[Excel Data Format]")

    st.write("### What is Excel?")
    st.write("Excel is a spreadsheet tool for storing data in tabular format with rows and columns. Common file extensions: .xls, .xlsx.")

    st.write("### How to Read Excel Files")
    st.code(_EXCEL_READ_SAMPLE, language='python')

    st.write("### Issues Encountered")
    st.write("""
    - **File not found**: Incorrect file path.
    - **Sheet name error**: Specified sheet doesn't exist.
    - **Missing libraries**: openpyxl or xlrd might be missing.
    """)

    st.write("### Solutions to These Issues")
    st.code(_EXCEL_FIXES_SAMPLE, language='python')

    st.markdown('[Jupyter Notebook](https://colab.research.google.com/drive/1Dv68m9hcRzXsLRlRit0uZc-8CB8U6VV3?usp=sharing)')

    if st.button("Back to Structured Data"):
        st.session_state.page = "structured_data"
|
| 379 |
+
|
| 380 |
+
# ----------------- Unstructured Data Page -----------------
|
| 381 |
+
|
| 382 |
+
from PIL import Image
|
| 383 |
+
import numpy as np
|
| 384 |
+
import matplotlib.pyplot as plt
|
| 385 |
+
|
| 386 |
+
# Code sample displayed by unstructured_data_page(). It is kept at module
# level so the displayed text never depends on the indentation of the
# function body.
_IMAGE_HANDLING_SAMPLE = """
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Open an image file
image = Image.open('sample_image.jpg')
image.show()

# Convert image to grayscale
gray_image = image.convert('L')
gray_image.show()

# Resize the image
resized_image = image.resize((200, 200))
resized_image.show()

# Rotate the image by 90 degrees
rotated_image = image.rotate(90)
rotated_image.show()

# Convert the image to a NumPy array and display its shape
image_array = np.array(image)
print(image_array.shape)

# Display the image array as a plot
plt.imshow(image)
plt.title("Original Image")
plt.axis('off')
plt.show()
"""


def unstructured_data_page():
    """Render the Unstructured Data page.

    Displays a description of unstructured data, an image-handling section
    with a code sample, lists of challenges and solutions, and two buttons.
    Clicking a button stores the name of the target page in
    ``st.session_state.page``.
    """
    st.title(":blue[Unstructured Data]")

    st.markdown("""
    *Unstructured data* does not have a predefined format. It consists of various data types like text, images, videos, and audio files.
    Examples include:
    - Images (e.g., .jpg, .png)
    - Videos (e.g., .mp4, .avi)
    - Social media posts
    """)

    # --- Handling image data ---
    # NOTE(review): the header icon was reconstructed from mis-decoded text
    # (UTF-8 rendered as ISO-8859-7); confirm it is the intended emoji.
    st.header("🖼️ Handling Image Data")
    st.markdown("""
    Image data can be processed using libraries like OpenCV and PIL (Pillow). Images often need to be preprocessed for tasks like analysis, classification, or feature extraction. Common operations include:
    - **Reading and displaying images**
    - **Converting to grayscale**
    - **Resizing and cropping**
    - **Rotating and flipping**
    - **Applying filters**
    - **Edge detection and other transformations**
    """)

    st.code(_IMAGE_HANDLING_SAMPLE, language='python')

    st.markdown("""
    **Common Image Processing Techniques:**
    - **Resizing**: Adjust the dimensions of an image for uniformity in models.
    - **Cropping**: Extract a region of interest (ROI) from an image.
    - **Grayscale Conversion**: Simplify image data by reducing it to a single channel.
    - **Rotation/Flipping**: Perform augmentations to increase the dataset for model training.
    - **Edge Detection**: Identify edges in images using filters like the Sobel or Canny filters.
    """)

    # --- Challenges and solutions ---
    st.markdown("### Challenges with Unstructured Data")
    st.write("""
    - *Noise and Inconsistency*: Data is often incomplete or noisy.
    - *Storage Requirements*: Large size and variability in data types.
    - *Processing Time*: Analyzing unstructured data is computationally expensive.
    """)

    st.markdown("### Solutions")
    st.write("""
    - *Data Cleaning*: Preprocess data to remove noise.
    - *Efficient Storage*: Use NoSQL databases (e.g., MongoDB) or cloud storage.
    - *Parallel Processing*: Utilize frameworks like Apache Spark.
    """)

    # --- Navigation ---
    if st.button("Introduction to Image"):
        st.session_state.page = "introduction_to_image"

    if st.button("Back to Data Collection"):
        st.session_state.page = "data_collection"
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
|
| 476 |
# ----------------- Semi-Structured Data Page -----------------
|
| 477 |
def semi_structured_data_page():
|
| 478 |
st.title(":orange[Semi-Structured Data]")
|
|
|
|
| 519 |
data = {
|
| 520 |
"name": "Alice",
|
| 521 |
"age": 25,
|
| 522 |
+
"skills
|
| 523 |
+
: ["Python", "Machine Learning"]
|
| 524 |
}
|
| 525 |
with open('data.json', 'w') as file:
|
| 526 |
json.dump(data, file, indent=4)
|