diff --git "a/Data Analitics/Week 3/.ipynb_checkpoints/TU257-Lab2-1-Automated-Data-Profiling-checkpoint.ipynb" "b/Data Analitics/Week 3/.ipynb_checkpoints/TU257-Lab2-1-Automated-Data-Profiling-checkpoint.ipynb" new file mode 100644--- /dev/null +++ "b/Data Analitics/Week 3/.ipynb_checkpoints/TU257-Lab2-1-Automated-Data-Profiling-checkpoint.ipynb" @@ -0,0 +1,11863 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import display, HTML\n", + "display(HTML(\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
6701McCarthy, Mr. Timothy Jmale54.0001746351.8625E46S
7803Palsson, Master. Gosta Leonardmale2.03134990921.0750NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "5 6 0 3 \n", + "6 7 0 1 \n", + "7 8 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "5 Moran, Mr. James male NaN 0 \n", + "6 McCarthy, Mr. Timothy J male 54.0 0 \n", + "7 Palsson, Master. Gosta Leonard male 2.0 3 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S \n", + "5 0 330877 8.4583 NaN Q \n", + "6 0 17463 51.8625 E46 S \n", + "7 1 349909 21.0750 NaN S " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "#Change this next command to the location of train.csv on your Computer\n", + "df = pd.read_csv(\"/Users/brendan.tierney/Dropbox/4-Datasets/titanic/train.csv\")\n", + "#df = pd.read_csv(\"C:\\Studies\\TU257\\DataAnalytics\\Week2\\train.csv\")\n", + "df.head(8)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedPclassSexAgeSibSpParchTicketCabinEmbarked
003male22.010A/5 21171NaNS
111female38.010PC 17599C85C
213female26.000STON/O2. 3101282NaNS
311female35.010113803C123S
403male35.000373450NaNS
503maleNaN00330877NaNQ
601male54.00017463E46S
703male2.031349909NaNS
\n", + "
" + ], + "text/plain": [ + " Survived Pclass Sex Age SibSp Parch Ticket Cabin \\\n", + "0 0 3 male 22.0 1 0 A/5 21171 NaN \n", + "1 1 1 female 38.0 1 0 PC 17599 C85 \n", + "2 1 3 female 26.0 0 0 STON/O2. 3101282 NaN \n", + "3 1 1 female 35.0 1 0 113803 C123 \n", + "4 0 3 male 35.0 0 0 373450 NaN \n", + "5 0 3 male NaN 0 0 330877 NaN \n", + "6 0 1 male 54.0 0 0 17463 E46 \n", + "7 0 3 male 2.0 3 1 349909 NaN \n", + "\n", + " Embarked \n", + "0 S \n", + "1 C \n", + "2 S \n", + "3 S \n", + "4 S \n", + "5 Q \n", + "6 S \n", + "7 S " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = df.iloc[:,[1,2,4,5,6,7,8,10,11]]\n", + "df2.head(8)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedPclassAgeSibSpParch
count891.000000891.000000714.000000891.000000891.000000
mean0.3838382.30864229.6991180.5230080.381594
std0.4865920.83607114.5264971.1027430.806057
min0.0000001.0000000.4200000.0000000.000000
25%0.0000002.00000020.1250000.0000000.000000
50%0.0000003.00000028.0000000.0000000.000000
75%1.0000003.00000038.0000001.0000000.000000
max1.0000003.00000080.0000008.0000006.000000
\n", + "
" + ], + "text/plain": [ + " Survived Pclass Age SibSp Parch\n", + "count 891.000000 891.000000 714.000000 891.000000 891.000000\n", + "mean 0.383838 2.308642 29.699118 0.523008 0.381594\n", + "std 0.486592 0.836071 14.526497 1.102743 0.806057\n", + "min 0.000000 1.000000 0.420000 0.000000 0.000000\n", + "25% 0.000000 2.000000 20.125000 0.000000 0.000000\n", + "50% 0.000000 3.000000 28.000000 0.000000 0.000000\n", + "75% 1.000000 3.000000 38.000000 1.000000 0.000000\n", + "max 1.000000 3.000000 80.000000 8.000000 6.000000" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countmeanstdmin25%50%75%max
Survived891.00.3838380.4865920.000.0000.01.01.0
Pclass891.02.3086420.8360711.002.0003.03.03.0
Age714.029.69911814.5264970.4220.12528.038.080.0
SibSp891.00.5230081.1027430.000.0000.01.08.0
Parch891.00.3815940.8060570.000.0000.00.06.0
\n", + "
" + ], + "text/plain": [ + " count mean std min 25% 50% 75% max\n", + "Survived 891.0 0.383838 0.486592 0.00 0.000 0.0 1.0 1.0\n", + "Pclass 891.0 2.308642 0.836071 1.00 2.000 3.0 3.0 3.0\n", + "Age 714.0 29.699118 14.526497 0.42 20.125 28.0 38.0 80.0\n", + "SibSp 891.0 0.523008 1.102743 0.00 0.000 0.0 1.0 8.0\n", + "Parch 891.0 0.381594 0.806057 0.00 0.000 0.0 0.0 6.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.describe().transpose()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6fc6de048dc740cdad086460002c82d3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Summarize dataset: 0%| | 0/5 [00:00" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Make sure to install 'ydata_profiling' library before running the following\n", + "#see Lab Notes\n", + "\n", + "from ydata_profiling import ProfileReport\n", + "\n", + "profile = ProfileReport(df2, title=\"Profiling Report\")\n", + "profile" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "#Can you save the Data Profile Report to a file?\n", + "#Check the package Github site for examples (link to this is in the Lab Notes)\n", + "# https://github.com/ydataai/ydata-profiling\n", + "# Scroll to the bottom of the main GitHub page for examples of saving the report\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "#Enter the code here\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, 
+ "source": [ + "### See lots more examples of using this library/package for analysing datasets on the ydata-profiling GitHub page. Scroll to the bottom of the main page to get the links." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}