{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "28c5c43d-ab8e-42e0-b335-37eae30f075a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "        <script type=\"text/javascript\">\n",
       "        window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
       "        if (window.MathJax && window.MathJax.Hub && window.MathJax.Hub.Config) {window.MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
       "        if (typeof require !== 'undefined') {\n",
       "        require.undef(\"plotly\");\n",
       "        requirejs.config({\n",
       "            paths: {\n",
       "                'plotly': ['https://cdn.plot.ly/plotly-2.35.2.min']\n",
       "            }\n",
       "        });\n",
       "        require(['plotly'], function(Plotly) {\n",
       "            window._Plotly = Plotly;\n",
       "        });\n",
       "        }\n",
       "        </script>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#import all the necessary packages.\n",
    "from PIL import Image\n",
    "import requests\n",
    "from io import BytesIO\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import warnings\n",
    "from bs4 import BeautifulSoup\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "import nltk\n",
    "import math\n",
    "import time\n",
    "import re\n",
    "import os\n",
    "import seaborn as sns\n",
    "from collections import Counter\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity  \n",
    "from sklearn.metrics import pairwise_distances\n",
    "from matplotlib import gridspec\n",
    "from scipy.sparse import hstack\n",
    "import plotly\n",
    "import plotly.figure_factory as ff\n",
    "from plotly.graph_objs import Scatter, Layout\n",
    "\n",
    "plotly.offline.init_notebook_mode(connected=True)\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "771a96b6-5cb1-4d33-84ea-3191ea4c4a14",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of data points :  183138 Number of features/variables: 19\n",
      "Index(['sku', 'asin', 'product_type_name', 'formatted_price', 'author',\n",
      "       'color', 'brand', 'publisher', 'availability', 'reviews',\n",
      "       'large_image_url', 'availability_type', 'small_image_url',\n",
      "       'editorial_review', 'title', 'model', 'medium_image_url',\n",
      "       'manufacturer', 'editorial_reivew'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "from pprint import pprint\n",
    "data = pd.read_json('tops_fashion.json')\n",
    "print ('Number of data points : ', data.shape[0], \\\n",
    "       'Number of features/variables:', data.shape[1])\n",
    "\n",
    "pprint(data.columns)\n",
    "data.to_pickle('180k_apparel_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a8fb6af7-9020-4a42-8787-dbe8e4bdffdd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of data points :  183138 Number of features: 7\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>asin</th>\n",
       "      <th>brand</th>\n",
       "      <th>color</th>\n",
       "      <th>medium_image_url</th>\n",
       "      <th>product_type_name</th>\n",
       "      <th>title</th>\n",
       "      <th>formatted_price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>B016I2TS4W</td>\n",
       "      <td>FNC7C</td>\n",
       "      <td>None</td>\n",
       "      <td>https://images-na.ssl-images-amazon.com/images...</td>\n",
       "      <td>SHIRT</td>\n",
       "      <td>Minions Como Superheroes Ironman Long Sleeve R...</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>B01N49AI08</td>\n",
       "      <td>FIG Clothing</td>\n",
       "      <td>None</td>\n",
       "      <td>https://images-na.ssl-images-amazon.com/images...</td>\n",
       "      <td>SHIRT</td>\n",
       "      <td>FIG Clothing Womens Izo Tunic</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>B01JDPCOHO</td>\n",
       "      <td>FIG Clothing</td>\n",
       "      <td>None</td>\n",
       "      <td>https://images-na.ssl-images-amazon.com/images...</td>\n",
       "      <td>SHIRT</td>\n",
       "      <td>FIG Clothing Womens Won Top</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>B01N19U5H5</td>\n",
       "      <td>Focal18</td>\n",
       "      <td>None</td>\n",
       "      <td>https://images-na.ssl-images-amazon.com/images...</td>\n",
       "      <td>SHIRT</td>\n",
       "      <td>Focal18 Sailor Collar Bubble Sleeve Blouse Shi...</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>B004GSI2OS</td>\n",
       "      <td>FeatherLite</td>\n",
       "      <td>Onyx Black/ Stone</td>\n",
       "      <td>https://images-na.ssl-images-amazon.com/images...</td>\n",
       "      <td>SHIRT</td>\n",
       "      <td>Featherlite Ladies' Long Sleeve Stain Resistan...</td>\n",
       "      <td>$26.26</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         asin         brand              color  \\\n",
       "0  B016I2TS4W         FNC7C               None   \n",
       "1  B01N49AI08  FIG Clothing               None   \n",
       "2  B01JDPCOHO  FIG Clothing               None   \n",
       "3  B01N19U5H5       Focal18               None   \n",
       "4  B004GSI2OS   FeatherLite  Onyx Black/ Stone   \n",
       "\n",
       "                                    medium_image_url product_type_name  \\\n",
       "0  https://images-na.ssl-images-amazon.com/images...             SHIRT   \n",
       "1  https://images-na.ssl-images-amazon.com/images...             SHIRT   \n",
       "2  https://images-na.ssl-images-amazon.com/images...             SHIRT   \n",
       "3  https://images-na.ssl-images-amazon.com/images...             SHIRT   \n",
       "4  https://images-na.ssl-images-amazon.com/images...             SHIRT   \n",
       "\n",
       "                                               title formatted_price  \n",
       "0  Minions Como Superheroes Ironman Long Sleeve R...            None  \n",
       "1                      FIG Clothing Womens Izo Tunic            None  \n",
       "2                        FIG Clothing Womens Won Top            None  \n",
       "3  Focal18 Sailor Collar Bubble Sleeve Blouse Shi...            None  \n",
       "4  Featherlite Ladies' Long Sleeve Stain Resistan...          $26.26  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]\n",
    "print ('Number of data points : ', data.shape[0], \\\n",
    "       'Number of features:', data.shape[1])\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2770f2c1-8f64-4e46-89ab-56fe0aee98f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_pickle('180k_apparel_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "be51c67c-f8f7-41a4-afbc-6675a65d5ac0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of data points After eliminating price=NULL : 28385\n",
      "Number of data points After eliminating color=NULL : 28385\n"
     ]
    }
   ],
   "source": [
    "data = data.loc[~data['formatted_price'].isnull()]\n",
    "print('Number of data points After eliminating price=NULL :', data.shape[0])\n",
    "\n",
    "data =data.loc[~data['color'].isnull()]\n",
    "print('Number of data points After eliminating color=NULL :', data.shape[0])\n",
    "\n",
    "data.to_pickle('28k_apparel_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7a73f305-e11d-4bc4-b63a-0695259c5546",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After removal of products with short description: 27949\n"
     ]
    }
   ],
   "source": [
    "# Remove All products with very few words in title\n",
    "data_sorted = data[data['title'].apply(lambda x: len(x.split())>4)]\n",
    "print(\"After removal of products with short description:\", data_sorted.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c833f607-b356-496e-8877-2bb67eec9f76",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The number of entries with duplicate title is 2325\n",
      "Number of data points now is:  17593\n"
     ]
    }
   ],
   "source": [
    "data = pd.read_pickle('28k_apparel_data')\n",
    "print('The number of entries with duplicate title is %d'%sum(data.duplicated('title')))\n",
    "# Sort the whole data based on title (alphabetical order of title) \n",
    "data_sorted.sort_values('title',inplace=True, ascending=False)\n",
    "\n",
    "indices = []\n",
    "for i,row in data_sorted.iterrows():\n",
    "    indices.append(i)\n",
    "import itertools\n",
    "stage1_dedupe_asins = []\n",
    "i = 0\n",
    "j = 0\n",
    "num_data_points = data_sorted.shape[0]\n",
    "while i < num_data_points and j < num_data_points:    \n",
    "    previous_i = i\n",
    "    # store the list of words of ith string in a, ex: a = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']\n",
    "    a = data['title'].loc[indices[i]].split()\n",
    "    # search for the similar products sequentially \n",
    "    j = i+1\n",
    "    while j < num_data_points:\n",
    "        # store the list of words of jth string in b, ex: b = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'Small']\n",
    "        b = data['title'].loc[indices[j]].split()\n",
    "        # store the maximum length of two strings\n",
    "        length = max(len(a), len(b))\n",
    "        # count is used to store the number of words that are matched in both strings\n",
    "        count  = 0\n",
    "        # itertools.zip_longest(a,b): will map the corresponding words in both strings, it will appened None in case of unequal strings\n",
    "        # example: a =['a', 'b', 'c', 'd']\n",
    "        # b = ['a', 'b', 'd']\n",
    "        # itertools.zip_longest(a,b): will give [('a','a'), ('b','b'), ('c','d'), ('d', None)]\n",
    "        for k in itertools.zip_longest(a,b): \n",
    "            if (k[0] == k[1]):\n",
    "                count += 1\n",
    "        # if the number of words in which both strings differ are > 2 , we are considering it as those two apperals are different\n",
    "        # if the number of words in which both strings differ are < 2 , we are considering it as those two apperals are same, hence we are ignoring them\n",
    "        if (length - count) > 2: # number of words in which both sensences differ\n",
    "            # if both strings are differ by more than 2 words we include the 1st string index\n",
    "            stage1_dedupe_asins.append(data_sorted['asin'].loc[indices[i]])\n",
    "            # if the comaprision between is between num_data_points, num_data_points-1 strings and they differ in more than 2 words we include both\n",
    "            if j == num_data_points-1: stage1_dedupe_asins.append(data_sorted['asin'].loc[indices[j]])\n",
    "            # start searching for similar apperals corresponds 2nd string\n",
    "            i = j\n",
    "            break\n",
    "        else:\n",
    "            j += 1\n",
    "    if previous_i == i:\n",
    "        break\n",
    "\n",
    "data = data.loc[data['asin'].isin(stage1_dedupe_asins)]\n",
    "print('Number of data points now is: ', data.shape[0])\n",
    "data.to_pickle('17k_apparrel_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "59569acf-a5ab-4426-83fa-a06e73f477c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of data points after stage two of dedupe:  16435\n"
     ]
    }
   ],
   "source": [
    "data = pd.read_pickle('17k_apparrel_data')\n",
    "indices = []\n",
    "for i,row in data.iterrows():\n",
    "    indices.append(i)\n",
    "\n",
    "stage2_dedupe_asins = []\n",
    "while len(indices)!=0:\n",
    "    i = indices.pop()\n",
    "    stage2_dedupe_asins.append(data['asin'].loc[i])\n",
    "    # consider the first apperal's title\n",
    "    a = data['title'].loc[i].split()\n",
    "    # store the list of words of ith string in a, ex: a = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']\n",
    "    for j in indices:        \n",
    "        b = data['title'].loc[j].split()\n",
    "        # store the list of words of jth string in b, ex: b = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']\n",
    "        length = max(len(a),len(b))        \n",
    "        # count is used to store the number of words that are matched in both strings\n",
    "        count  = 0\n",
    "        # itertools.zip_longest(a,b): will map the corresponding words in both strings, it will appened None in case of unequal strings\n",
    "        # example: a =['a', 'b', 'c', 'd']\n",
    "        # b = ['a', 'b', 'd']\n",
    "        # itertools.zip_longest(a,b): will give [('a','a'), ('b','b'), ('c','d'), ('d', None)]\n",
    "        for k in itertools.zip_longest(a,b): \n",
    "            if (k[0]==k[1]):\n",
    "                count += 1\n",
    "        # if the number of words in which both strings differ are < 3 , we are considering it as those two apperals are same, hence we are ignoring them\n",
    "        if (length - count) < 3:\n",
    "            indices.remove(j)\n",
    "            \n",
    "data = data.loc[data['asin'].isin(stage2_dedupe_asins)]\n",
    "print('Number of data points after stage two of dedupe: ',data.shape[0])\n",
    "data.to_pickle('16k_apperal_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "c9817bf6-78ac-4d6a-8ea6-07c4692a28a7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "list of stop words: {'doesn', 'haven', 'out', 'between', 'who', 'do', 'me', 'into', 'shan', \"you'll\", 'which', 'up', 'while', 'until', 't', \"hadn't\", 'off', \"should've\", 'with', 'had', 'here', 'your', 'are', \"you're\", 'so', 'under', 'before', 'aren', 'll', 'both', 'been', 'again', 'couldn', 'ma', \"that'll\", 'on', 'other', 'if', 'you', 'hasn', \"you've\", 'own', 'or', 'further', 'our', \"doesn't\", 'm', 'he', \"mustn't\", 'them', 'it', \"isn't\", \"weren't\", 'won', \"it's\", 'y', 'was', 'ours', 'an', 'no', 'against', 's', \"shouldn't\", 'should', \"didn't\", 'isn', 've', 'does', 'just', 'wasn', 'my', 'him', 'doing', 'same', 'about', 'to', 'being', \"needn't\", 'why', 'hadn', 'having', 'most', 'am', 'from', 'itself', 'as', 'her', 'mustn', 'that', 'mightn', 'too', 'don', 'below', 'theirs', 'herself', 'some', 'myself', 'weren', \"won't\", 'wouldn', 'can', 'then', 'what', \"couldn't\", 'once', 'how', 'those', 'of', \"mightn't\", 'themselves', 'at', 'where', 'such', 'is', 'did', 'ain', \"aren't\", 'the', 'have', 'any', 'because', \"haven't\", 'she', 'after', 'shouldn', 'its', 'when', 'nor', 'few', 'needn', 'not', 'hers', 'd', 'yourself', 'there', 'o', 'only', 'their', 'yourselves', 'ourselves', 'we', 'above', 'through', 'over', 'were', \"you'd\", 'whom', 'more', 'very', \"don't\", \"she's\", 'a', 'now', 're', 'his', 'all', 'for', 'i', 'himself', \"hasn't\", 'has', 'down', 'by', 'didn', 'will', 'but', 'during', \"shan't\", 'yours', 'in', 'than', \"wasn't\", 'these', 'each', 'they', 'and', \"wouldn't\", 'be', 'this'}\n"
     ]
    }
   ],
   "source": [
    "data = pd.read_pickle('16k_apperal_data')\n",
    "# we use the list of stop words that are downloaded from nltk lib.\n",
    "stop_words = set(stopwords.words('english'))\n",
    "print ('list of stop words:', stop_words)\n",
    "\n",
    "def nlp_preprocessing(total_text, index, column):\n",
    "    if type(total_text) is not int:\n",
    "        string = \"\"\n",
    "        for words in total_text.split():\n",
    "            # remove the special chars in review like '\"#$@!%^&*()_+-~?>< etc.\n",
    "            word = (\"\".join(e for e in words if e.isalnum()))\n",
    "            # Conver all letters to lower-case\n",
    "            word = word.lower()\n",
    "            # stop-word removal\n",
    "            if not word in stop_words:\n",
    "                string += word + \" \"\n",
    "        data[column][index] = string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "f42e07a2-f411-451d-b414-8e6f0a4ea16f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7.099361000582576 seconds\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "start_time = time.perf_counter()\n",
    "# we take each title and we text-preprocess it.\n",
    "for index, row in data.iterrows():\n",
    "    nlp_preprocessing(row['title'], index, 'title')\n",
    "# we print the time it took to preprocess whole titles \n",
    "print(time.perf_counter() - start_time, \"seconds\")\n",
    "\n",
    "data.head()\n",
    "data.to_pickle('16k_apparel_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "67c3fb82-aa21-4146-90ec-c37ce40a951f",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]\n",
    "data.to_pickle('16k_apparel_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "22545da7-367e-45fe-98cd-e6b67f575097",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>asin</th>\n",
       "      <th>brand</th>\n",
       "      <th>color</th>\n",
       "      <th>medium_image_url</th>\n",
       "      <th>product_type_name</th>\n",
       "      <th>title</th>\n",
       "      <th>formatted_price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>B004GSI2OS</td>\n",
       "      <td>FeatherLite</td>\n",
       "      <td>Onyx Black/ Stone</td>\n",
       "      <td>https://images-na.ssl-images-amazon.com/images...</td>\n",
       "      <td>SHIRT</td>\n",
       "      <td>featherlite ladies long sleeve stain resistant...</td>\n",
       "      <td>$26.26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>B012YX2ZPI</td>\n",
       "      <td>HX-Kingdom Fashion T-shirts</td>\n",
       "      <td>White</td>\n",
       "      <td>https://images-na.ssl-images-amazon.com/images...</td>\n",
       "      <td>SHIRT</td>\n",
       "      <td>womens unique 100 cotton  special olympics wor...</td>\n",
       "      <td>$9.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>B003BSRPB0</td>\n",
       "      <td>FeatherLite</td>\n",
       "      <td>White</td>\n",
       "      <td>https://images-na.ssl-images-amazon.com/images...</td>\n",
       "      <td>SHIRT</td>\n",
       "      <td>featherlite ladies moisture free mesh sport sh...</td>\n",
       "      <td>$20.54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>B014ICEJ1Q</td>\n",
       "      <td>FNC7C</td>\n",
       "      <td>Purple</td>\n",
       "      <td>https://images-na.ssl-images-amazon.com/images...</td>\n",
       "      <td>SHIRT</td>\n",
       "      <td>supernatural chibis sam dean castiel neck tshi...</td>\n",
       "      <td>$7.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>B01NACPBG2</td>\n",
       "      <td>Fifth Degree</td>\n",
       "      <td>Black</td>\n",
       "      <td>https://images-na.ssl-images-amazon.com/images...</td>\n",
       "      <td>SHIRT</td>\n",
       "      <td>fifth degree womens gold foil graphic tees jun...</td>\n",
       "      <td>$6.95</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          asin                        brand              color  \\\n",
       "4   B004GSI2OS                  FeatherLite  Onyx Black/ Stone   \n",
       "6   B012YX2ZPI  HX-Kingdom Fashion T-shirts              White   \n",
       "15  B003BSRPB0                  FeatherLite              White   \n",
       "27  B014ICEJ1Q                        FNC7C             Purple   \n",
       "46  B01NACPBG2                 Fifth Degree              Black   \n",
       "\n",
       "                                     medium_image_url product_type_name  \\\n",
       "4   https://images-na.ssl-images-amazon.com/images...             SHIRT   \n",
       "6   https://images-na.ssl-images-amazon.com/images...             SHIRT   \n",
       "15  https://images-na.ssl-images-amazon.com/images...             SHIRT   \n",
       "27  https://images-na.ssl-images-amazon.com/images...             SHIRT   \n",
       "46  https://images-na.ssl-images-amazon.com/images...             SHIRT   \n",
       "\n",
       "                                                title formatted_price  \n",
       "4   featherlite ladies long sleeve stain resistant...          $26.26  \n",
       "6   womens unique 100 cotton  special olympics wor...           $9.99  \n",
       "15  featherlite ladies moisture free mesh sport sh...          $20.54  \n",
       "27  supernatural chibis sam dean castiel neck tshi...           $7.39  \n",
       "46  fifth degree womens gold foil graphic tees jun...           $6.95  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38fc94f6-3682-4d38-94b7-370a81afe631",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}