Update pages/LIFE_CYCLE_OF_MACHINE_LEARNING.py
Browse files
pages/LIFE_CYCLE_OF_MACHINE_LEARNING.py
CHANGED
@@ -227,6 +227,252 @@ plt.show()
|
|
227 |
|
228 |
|
229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
# ----------------- Semi-Structured Data Page -----------------
|
231 |
def semi_structured_data_page():
|
232 |
st.title(":orange[Semi-Structured Data]")
|
@@ -273,7 +519,8 @@ import json
|
|
273 |
data = {
|
274 |
"name": "Alice",
|
275 |
"age": 25,
|
276 |
-
"skills
|
|
|
277 |
}
|
278 |
with open('data.json', 'w') as file:
|
279 |
json.dump(data, file, indent=4)
|
|
|
227 |
|
228 |
|
229 |
|
230 |
+
# ----------------- Semi-Structured Data Page -----------------
|
231 |
+
def semi_structured_data_page():
    """Render the Semi-Structured Data page.

    Shows the page title and a short description, followed by one button
    per data format. Clicking a button stores that format's page key in
    ``st.session_state.page``.
    """
    # NOTE(review): the symbols at the start of the button labels look
    # mis-encoded in this view of the file; confirm them against the
    # original bytes.
    st.title(":orange[Semi-Structured Data]")
    st.markdown(
        "\n"
        "Semi-structured data does not follow the rigid structure of relational databases but still has some organizational properties. Examples include:\n"
        "- JSON files\n"
        "- XML files\n"
    )

    # (button label, page key) pairs in display order.
    format_pages = (
        (":green[πΎ JSON]", "json"),
        (":green[π CSV]", "csv"),
        (":green[π XML]", "xml"),
    )
    for label, page_key in format_pages:
        if st.button(label):
            st.session_state.page = page_key
|
247 |
+
import streamlit as st
|
248 |
+
import pandas as pd
|
249 |
+
import json
|
250 |
+
import xml.etree.ElementTree as ET
|
251 |
+
|
252 |
+
# Style every Streamlit button: green background, white text, full width.
st.markdown(
    "\n"
    "<style>\n"
    ".stButton>button {\n"
    "background-color: #4CAF50;\n"
    "color: white;\n"
    "width: 100%;\n"
    "}\n"
    "</style>\n",
    unsafe_allow_html=True,
)

# First run of a session: start on the "home" page.
if "page" not in st.session_state:
    st.session_state["page"] = "home"
|
266 |
+
|
267 |
+
# ----------------- Home Page -----------------
|
268 |
+
def home_page():
    """Render the landing page listing the ML project lifecycle stages.

    The Data Collection button sets ``st.session_state.page`` to
    ``"data_collection"``. Each remaining stage button renders a short
    markdown description of that stage when clicked.
    """
    # NOTE(review): the symbols at the start of the button labels look
    # mis-encoded in this view of the file; confirm them against the
    # original bytes.
    st.title(":green[Lifecycle of a Machine Learning Project]")
    st.markdown("Click on a stage to learn more about it.")

    # The only stage that navigates to a dedicated page.
    if st.button(":blue[π Data Collection]"):
        st.session_state.page = "data_collection"

    # (button label, markdown shown on click) for the informational stages,
    # in display order.
    stage_notes = (
        (
            ":blue[π Problem Statement]",
            "### Problem Statement\nIdentify the problem you want to solve and set clear objectives and success criteria.",
        ),
        (
            ":blue[π οΈ Simple EDA]",
            "### Simple EDA\nPerform exploratory data analysis to understand data distributions and relationships.",
        ),
        (
            ":blue[π§Ή Data Pre-Processing]",
            "### Data Pre-Processing\nConvert raw data into cleaned data.",
        ),
        (
            ":blue[π Exploratory Data Analysis (EDA)]",
            "### Exploratory Data Analysis (EDA)\nVisualize and analyze the data to understand its distributions and relationships.",
        ),
        (
            ":blue[ποΈ Feature Engineering]",
            "### Feature Engineering\nCreate new features from existing data.",
        ),
        (
            ":blue[π€ Model Training]",
            "### Model Training\nTrain the model using the training data and optimize its parameters.",
        ),
        (
            ":blue[π§ Model Testing]",
            "### Model Testing\nAssess the model's performance using various metrics and cross-validation techniques.",
        ),
        (
            ":blue[π Model Deployment]",
            "### Model Deployment\nIntegrate the trained model into a production environment and monitor its performance.",
        ),
        (
            ":blue[π Monitoring]",
            "### Monitoring\nPeriodically retrain the model with new data and update features as needed.",
        ),
    )
    for label, note in stage_notes:
        if st.button(label):
            st.markdown(note)
|
302 |
+
|
303 |
+
# ----------------- Data Collection Page -----------------
|
304 |
+
def data_collection_page():
    """Render the Data Collection page and its navigation buttons.

    Shows an introduction and the three data types, then one button per
    data type plus a button back to the home page. Clicking a button
    stores the matching page key in ``st.session_state.page``.
    """
    # NOTE(review): the symbols at the start of the button labels look
    # mis-encoded in this view of the file; confirm them against the
    # original bytes.
    st.title(":red[Data Collection]")
    st.markdown("### Data Collection\nThis page discusses the process of Data Collection.")
    st.markdown("Types of Data: **Structured**, **Unstructured**, **Semi-Structured**")

    # (button label, page key) pairs in display order.
    destinations = (
        (":blue[π Structured Data]", "structured_data"),
        (":blue[π· Unstructured Data]", "unstructured_data"),
        (":blue[ποΈ Semi-Structured Data]", "semi_structured_data"),
        ("Back to Home", "home"),
    )
    for label, page_key in destinations:
        if st.button(label):
            st.session_state.page = page_key
|
320 |
+
|
321 |
+
# ----------------- Structured Data Page -----------------
|
322 |
+
def structured_data_page():
    """Render the Structured Data page.

    Shows a description of structured data, then a button leading to the
    Excel page and a button back to Data Collection. Clicking a button
    stores the matching page key in ``st.session_state.page``.
    """
    # NOTE(review): the symbol at the start of the Excel button label looks
    # mis-encoded in this view of the file; confirm it against the original
    # bytes.
    st.title(":blue[Structured Data]")
    st.markdown(
        "\n"
        "Structured data is highly organized and typically stored in tables like spreadsheets or databases. It is easy to search and analyze.\n"
    )
    st.markdown("### Examples: Excel files")

    # (button label, page key) pairs in display order.
    destinations = (
        (":green[π Excel]", "excel"),
        ("Back to Data Collection", "data_collection"),
    )
    for label, page_key in destinations:
        if st.button(label):
            st.session_state.page = page_key
|
334 |
+
|
335 |
+
# ----------------- Excel Data Page -----------------
|
336 |
+
def excel_page():
    """Render the Excel Data Format page.

    Walks through what Excel is, how to read a workbook with pandas, the
    common problems, and example fixes, then links to a notebook. The
    final button sets ``st.session_state.page`` to ``"structured_data"``.
    """
    st.title(":green[Excel Data Format]")

    st.write("### What is Excel?")
    st.write("Excel is a spreadsheet tool for storing data in tabular format with rows and columns. Common file extensions: .xls, .xlsx.")

    st.write("### How to Read Excel Files")
    st.code(
        "\n"
        "import pandas as pd\n"
        "\n"
        "# Read an Excel file\n"
        "df = pd.read_excel('data.xlsx', sheet_name='Sheet1')\n"
        "print(df)\n",
        language='python',
    )

    st.write("### Issues Encountered")
    st.write(
        "\n"
        "- **File not found**: Incorrect file path.\n"
        "- **Sheet name error**: Specified sheet doesn't exist.\n"
        "- **Missing libraries**: openpyxl or xlrd might be missing.\n"
    )

    st.write("### Solutions to These Issues")
    # The bodies of try/except are indented so the snippet shown to the
    # reader is valid, copy-pasteable Python.
    st.code(
        "\n"
        "# Install required libraries\n"
        "# pip install openpyxl xlrd\n"
        "\n"
        "# Handle missing file\n"
        "try:\n"
        "    df = pd.read_excel('data.xlsx', sheet_name='Sheet1')\n"
        "except FileNotFoundError:\n"
        '    print("File not found. Check the file path.")\n'
        "\n"
        "# List available sheet names\n"
        "excel_file = pd.ExcelFile('data.xlsx')\n"
        "print(excel_file.sheet_names)\n",
        language='python',
    )

    st.markdown('[Jupyter Notebook](https://colab.research.google.com/drive/1Dv68m9hcRzXsLRlRit0uZc-8CB8U6VV3?usp=sharing)')

    if st.button("Back to Structured Data"):
        st.session_state.page = "structured_data"
|
379 |
+
|
380 |
+
# ----------------- Unstructured Data Page -----------------
|
381 |
+
|
382 |
+
from PIL import Image
|
383 |
+
import numpy as np
|
384 |
+
import matplotlib.pyplot as plt
|
385 |
+
|
386 |
+
def unstructured_data_page():
    """Render the Unstructured Data page.

    Shows an overview of unstructured data, an image-handling section with
    a Pillow/NumPy/matplotlib code sample, and lists of challenges and
    solutions. The two buttons at the bottom store either
    ``"introduction_to_image"`` or ``"data_collection"`` in
    ``st.session_state.page``.
    """
    # NOTE(review): the symbols at the start of the header text look
    # mis-encoded in this view of the file; confirm them against the
    # original bytes.
    st.title(":blue[Unstructured Data]")

    overview = (
        "\n"
        "*Unstructured data* does not have a predefined format. It consists of various data types like text, images, videos, and audio files.\n"
        "Examples include:\n"
        "- Images (e.g., .jpg, .png)\n"
        "- Videos (e.g., .mp4, .avi)\n"
        "- Social media posts\n"
    )
    st.markdown(overview)

    # --- Image handling section ---
    st.header("πΌοΈ Handling Image Data")
    image_intro = (
        "\n"
        "Image data can be processed using libraries like OpenCV and PIL (Pillow). Images often need to be preprocessed for tasks like analysis, classification, or feature extraction. Common operations include:\n"
        "- **Reading and displaying images**\n"
        "- **Converting to grayscale**\n"
        "- **Resizing and cropping**\n"
        "- **Rotating and flipping**\n"
        "- **Applying filters**\n"
        "- **Edge detection and other transformations**\n"
    )
    st.markdown(image_intro)

    image_sample = (
        "\n"
        "from PIL import Image\n"
        "import numpy as np\n"
        "import matplotlib.pyplot as plt\n"
        "\n"
        "# Open an image file\n"
        "image = Image.open('sample_image.jpg')\n"
        "image.show()\n"
        "\n"
        "# Convert image to grayscale\n"
        "gray_image = image.convert('L')\n"
        "gray_image.show()\n"
        "\n"
        "# Resize the image\n"
        "resized_image = image.resize((200, 200))\n"
        "resized_image.show()\n"
        "\n"
        "# Rotate the image by 90 degrees\n"
        "rotated_image = image.rotate(90)\n"
        "rotated_image.show()\n"
        "\n"
        "# Convert the image to a NumPy array and display its shape\n"
        "image_array = np.array(image)\n"
        "print(image_array.shape)\n"
        "\n"
        "# Display the image array as a plot\n"
        "plt.imshow(image)\n"
        'plt.title("Original Image")\n'
        "plt.axis('off')\n"
        "plt.show()\n"
    )
    st.code(image_sample, language='python')

    techniques = (
        "\n"
        "**Common Image Processing Techniques:**\n"
        "- **Resizing**: Adjust the dimensions of an image for uniformity in models.\n"
        "- **Cropping**: Extract a region of interest (ROI) from an image.\n"
        "- **Grayscale Conversion**: Simplify image data by reducing it to a single channel.\n"
        "- **Rotation/Flipping**: Perform augmentations to increase the dataset for model training.\n"
        "- **Edge Detection**: Identify edges in images using filters like the Sobel or Canny filters.\n"
    )
    st.markdown(techniques)

    # --- Challenges and solutions section ---
    st.markdown("### Challenges with Unstructured Data")
    challenges = (
        "\n"
        "- *Noise and Inconsistency*: Data is often incomplete or noisy.\n"
        "- *Storage Requirements*: Large size and variability in data types.\n"
        "- *Processing Time*: Analyzing unstructured data is computationally expensive.\n"
    )
    st.write(challenges)

    st.markdown("### Solutions")
    solutions = (
        "\n"
        "- *Data Cleaning*: Preprocess data to remove noise.\n"
        "- *Efficient Storage*: Use NoSQL databases (e.g., MongoDB) or cloud storage.\n"
        "- *Parallel Processing*: Utilize frameworks like Apache Spark.\n"
    )
    st.write(solutions)

    # Navigation: (button label, page key) pairs in display order.
    destinations = (
        ("Introduction to Image", "introduction_to_image"),
        ("Back to Data Collection", "data_collection"),
    )
    for label, page_key in destinations:
        if st.button(label):
            st.session_state.page = page_key
|
472 |
+
|
473 |
+
|
474 |
+
|
475 |
+
|
476 |
# ----------------- Semi-Structured Data Page -----------------
|
477 |
def semi_structured_data_page():
|
478 |
st.title(":orange[Semi-Structured Data]")
|
|
|
519 |
data = {
|
520 |
"name": "Alice",
|
521 |
"age": 25,
|
522 |
+
"skills
|
523 |
+
: ["Python", "Machine Learning"]
|
524 |
}
|
525 |
with open('data.json', 'w') as file:
|
526 |
json.dump(data, file, indent=4)
|