{ "cells": [ { "cell_type": "code", "execution_count": 33, "id": "28c5c43d-ab8e-42e0-b335-37eae30f075a", "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#import all the necessary packages.\n", "from PIL import Image\n", "import requests\n", "from io import BytesIO\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import warnings\n", "from bs4 import BeautifulSoup\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "import nltk\n", "import math\n", "import time\n", "import re\n", "import os\n", "import seaborn as sns\n", "from collections import Counter\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity \n", "from sklearn.metrics import pairwise_distances\n", "from matplotlib import gridspec\n", "from scipy.sparse import hstack\n", "import plotly\n", "import plotly.figure_factory as ff\n", "from plotly.graph_objs import Scatter, Layout\n", "\n", "plotly.offline.init_notebook_mode(connected=True)\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "771a96b6-5cb1-4d33-84ea-3191ea4c4a14", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of data points : 183138 Number of features/variables: 19\n", "Index(['sku', 'asin', 'product_type_name', 'formatted_price', 'author',\n", " 'color', 'brand', 'publisher', 'availability', 'reviews',\n", " 'large_image_url', 'availability_type', 'small_image_url',\n", " 'editorial_review', 'title', 'model', 'medium_image_url',\n", " 'manufacturer', 'editorial_reivew'],\n", " dtype='object')\n" ] } ], "source": [ "from pprint import pprint\n", "data = pd.read_json('tops_fashion.json')\n", "print ('Number of data points : ', data.shape[0], \\\n", " 'Number of features/variables:', data.shape[1])\n", "\n", "pprint(data.columns)\n", "data.to_pickle('180k_apparel_data')" ] }, { "cell_type": "code", "execution_count": 4, "id": "a8fb6af7-9020-4a42-8787-dbe8e4bdffdd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of data points : 183138 Number of features: 7\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
asinbrandcolormedium_image_urlproduct_type_nametitleformatted_price
0B016I2TS4WFNC7CNonehttps://images-na.ssl-images-amazon.com/images...SHIRTMinions Como Superheroes Ironman Long Sleeve R...None
1B01N49AI08FIG ClothingNonehttps://images-na.ssl-images-amazon.com/images...SHIRTFIG Clothing Womens Izo TunicNone
2B01JDPCOHOFIG ClothingNonehttps://images-na.ssl-images-amazon.com/images...SHIRTFIG Clothing Womens Won TopNone
3B01N19U5H5Focal18Nonehttps://images-na.ssl-images-amazon.com/images...SHIRTFocal18 Sailor Collar Bubble Sleeve Blouse Shi...None
4B004GSI2OSFeatherLiteOnyx Black/ Stonehttps://images-na.ssl-images-amazon.com/images...SHIRTFeatherlite Ladies' Long Sleeve Stain Resistan...$26.26
\n", "
" ], "text/plain": [ " asin brand color \\\n", "0 B016I2TS4W FNC7C None \n", "1 B01N49AI08 FIG Clothing None \n", "2 B01JDPCOHO FIG Clothing None \n", "3 B01N19U5H5 Focal18 None \n", "4 B004GSI2OS FeatherLite Onyx Black/ Stone \n", "\n", " medium_image_url product_type_name \\\n", "0 https://images-na.ssl-images-amazon.com/images... SHIRT \n", "1 https://images-na.ssl-images-amazon.com/images... SHIRT \n", "2 https://images-na.ssl-images-amazon.com/images... SHIRT \n", "3 https://images-na.ssl-images-amazon.com/images... SHIRT \n", "4 https://images-na.ssl-images-amazon.com/images... SHIRT \n", "\n", " title formatted_price \n", "0 Minions Como Superheroes Ironman Long Sleeve R... None \n", "1 FIG Clothing Womens Izo Tunic None \n", "2 FIG Clothing Womens Won Top None \n", "3 Focal18 Sailor Collar Bubble Sleeve Blouse Shi... None \n", "4 Featherlite Ladies' Long Sleeve Stain Resistan... $26.26 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]\n", "print ('Number of data points : ', data.shape[0], \\\n", " 'Number of features:', data.shape[1])\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "2770f2c1-8f64-4e46-89ab-56fe0aee98f1", "metadata": {}, "outputs": [], "source": [ "data = pd.read_pickle('180k_apparel_data')" ] }, { "cell_type": "code", "execution_count": 11, "id": "be51c67c-f8f7-41a4-afbc-6675a65d5ac0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of data points After eliminating price=NULL : 28385\n", "Number of data points After eliminating color=NULL : 28385\n" ] } ], "source": [ "data = data.loc[~data['formatted_price'].isnull()]\n", "print('Number of data points After eliminating price=NULL :', data.shape[0])\n", "\n", "data =data.loc[~data['color'].isnull()]\n", "print('Number of data points After eliminating color=NULL :', data.shape[0])\n", "\n", "data.to_pickle('28k_apparel_data')" ] }, { "cell_type": "code", "execution_count": 12, "id": "7a73f305-e11d-4bc4-b63a-0695259c5546", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "After removal of products with short description: 27949\n" ] } ], "source": [ "# Remove All products with very few words in title\n", "data_sorted = data[data['title'].apply(lambda x: len(x.split())>4)]\n", "print(\"After removal of products with short description:\", data_sorted.shape[0])" ] }, { "cell_type": "code", "execution_count": 13, "id": "c833f607-b356-496e-8877-2bb67eec9f76", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The number of entries with duplicate title is 2325\n", "Number of data points now is: 17593\n" ] } ], "source": [ "data = pd.read_pickle('28k_apparel_data')\n", "print('The number of entries with duplicate title is %d'%sum(data.duplicated('title')))\n", "# Sort the whole data based on title (alphabetical order of title) \n", "data_sorted.sort_values('title',inplace=True, ascending=False)\n", "\n", "indices = []\n", "for i,row in data_sorted.iterrows():\n", " indices.append(i)\n", "import itertools\n", "stage1_dedupe_asins = []\n", "i = 0\n", "j = 0\n", "num_data_points = data_sorted.shape[0]\n", "while i < num_data_points and j < num_data_points: \n", " previous_i = i\n", " # store the list of words of ith string in a, ex: a = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']\n", " a = data['title'].loc[indices[i]].split()\n", " # search for the similar products sequentially \n", " j = i+1\n", " while j < num_data_points:\n", " # store the list of words of jth string in b, ex: b = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'Small']\n", " b = data['title'].loc[indices[j]].split()\n", " # store the maximum length of two strings\n", " length = max(len(a), len(b))\n", " # count is used to store the number of words that are matched in both strings\n", " count = 0\n", " # itertools.zip_longest(a,b): will map the corresponding words in both strings, it will appened None in case of unequal strings\n", " # example: a =['a', 'b', 'c', 'd']\n", " # b = ['a', 'b', 'd']\n", " # itertools.zip_longest(a,b): will give [('a','a'), ('b','b'), ('c','d'), ('d', None)]\n", " for k in itertools.zip_longest(a,b): \n", " if (k[0] == k[1]):\n", " count += 1\n", " # if the number of words in which both strings differ are > 2 , we are considering it as those two apperals are different\n", " # if the number of words in which both strings differ are < 2 , we are considering it as those two apperals are same, hence we are ignoring them\n", " if (length - count) > 2: # number of words in which both sensences differ\n", " # if both strings are differ by more than 2 words we include the 1st string index\n", " stage1_dedupe_asins.append(data_sorted['asin'].loc[indices[i]])\n", " # if the comaprision between is between num_data_points, num_data_points-1 strings and they differ in more than 2 words we include both\n", " if j == num_data_points-1: stage1_dedupe_asins.append(data_sorted['asin'].loc[indices[j]])\n", " # start searching for similar apperals corresponds 2nd string\n", " i = j\n", " break\n", " else:\n", " j += 1\n", " if previous_i == i:\n", " break\n", "\n", "data = data.loc[data['asin'].isin(stage1_dedupe_asins)]\n", "print('Number of data points now is: ', data.shape[0])\n", "data.to_pickle('17k_apparrel_data')" ] }, { "cell_type": "code", "execution_count": 14, "id": "59569acf-a5ab-4426-83fa-a06e73f477c7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of data points after stage two of dedupe: 16435\n" ] } ], "source": [ "data = pd.read_pickle('17k_apparrel_data')\n", "indices = []\n", "for i,row in data.iterrows():\n", " indices.append(i)\n", "\n", "stage2_dedupe_asins = []\n", "while len(indices)!=0:\n", " i = indices.pop()\n", " stage2_dedupe_asins.append(data['asin'].loc[i])\n", " # consider the first apperal's title\n", " a = data['title'].loc[i].split()\n", " # store the list of words of ith string in a, ex: a = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']\n", " for j in indices: \n", " b = data['title'].loc[j].split()\n", " # store the list of words of jth string in b, ex: b = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']\n", " length = max(len(a),len(b)) \n", " # count is used to store the number of words that are matched in both strings\n", " count = 0\n", " # itertools.zip_longest(a,b): will map the corresponding words in both strings, it will appened None in case of unequal strings\n", " # example: a =['a', 'b', 'c', 'd']\n", " # b = ['a', 'b', 'd']\n", " # itertools.zip_longest(a,b): will give [('a','a'), ('b','b'), ('c','d'), ('d', None)]\n", " for k in itertools.zip_longest(a,b): \n", " if (k[0]==k[1]):\n", " count += 1\n", " # if the number of words in which both strings differ are < 3 , we are considering it as those two apperals are same, hence we are ignoring them\n", " if (length - count) < 3:\n", " indices.remove(j)\n", " \n", "data = data.loc[data['asin'].isin(stage2_dedupe_asins)]\n", "print('Number of data points after stage two of dedupe: ',data.shape[0])\n", "data.to_pickle('16k_apperal_data')" ] }, { "cell_type": "code", "execution_count": 16, "id": "c9817bf6-78ac-4d6a-8ea6-07c4692a28a7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "list of stop words: {'doesn', 'haven', 'out', 'between', 'who', 'do', 'me', 'into', 'shan', \"you'll\", 'which', 'up', 'while', 'until', 't', \"hadn't\", 'off', \"should've\", 'with', 'had', 'here', 'your', 'are', \"you're\", 'so', 'under', 'before', 'aren', 'll', 'both', 'been', 'again', 'couldn', 'ma', \"that'll\", 'on', 'other', 'if', 'you', 'hasn', \"you've\", 'own', 'or', 'further', 'our', \"doesn't\", 'm', 'he', \"mustn't\", 'them', 'it', \"isn't\", \"weren't\", 'won', \"it's\", 'y', 'was', 'ours', 'an', 'no', 'against', 's', \"shouldn't\", 'should', \"didn't\", 'isn', 've', 'does', 'just', 'wasn', 'my', 'him', 'doing', 'same', 'about', 'to', 'being', \"needn't\", 'why', 'hadn', 'having', 'most', 'am', 'from', 'itself', 'as', 'her', 'mustn', 'that', 'mightn', 'too', 'don', 'below', 'theirs', 'herself', 'some', 'myself', 'weren', \"won't\", 'wouldn', 'can', 'then', 'what', \"couldn't\", 'once', 'how', 'those', 'of', \"mightn't\", 'themselves', 'at', 'where', 'such', 'is', 'did', 'ain', \"aren't\", 'the', 'have', 'any', 'because', \"haven't\", 'she', 'after', 'shouldn', 'its', 'when', 'nor', 'few', 'needn', 'not', 'hers', 'd', 'yourself', 'there', 'o', 'only', 'their', 'yourselves', 'ourselves', 'we', 'above', 'through', 'over', 'were', \"you'd\", 'whom', 'more', 'very', \"don't\", \"she's\", 'a', 'now', 're', 'his', 'all', 'for', 'i', 'himself', \"hasn't\", 'has', 'down', 'by', 'didn', 'will', 'but', 'during', \"shan't\", 'yours', 'in', 'than', \"wasn't\", 'these', 'each', 'they', 'and', \"wouldn't\", 'be', 'this'}\n" ] } ], "source": [ "data = pd.read_pickle('16k_apperal_data')\n", "# we use the list of stop words that are downloaded from nltk lib.\n", "stop_words = set(stopwords.words('english'))\n", "print ('list of stop words:', stop_words)\n", "\n", "def nlp_preprocessing(total_text, index, column):\n", " if type(total_text) is not int:\n", " string = \"\"\n", " for words in total_text.split():\n", " # remove the special chars in review like '\"#$@!%^&*()_+-~?>< etc.\n", " word = (\"\".join(e for e in words if e.isalnum()))\n", " # Conver all letters to lower-case\n", " word = word.lower()\n", " # stop-word removal\n", " if not word in stop_words:\n", " string += word + \" \"\n", " data[column][index] = string" ] }, { "cell_type": "code", "execution_count": 19, "id": "f42e07a2-f411-451d-b414-8e6f0a4ea16f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "7.099361000582576 seconds\n" ] } ], "source": [ "import time\n", "start_time = time.perf_counter()\n", "# we take each title and we text-preprocess it.\n", "for index, row in data.iterrows():\n", " nlp_preprocessing(row['title'], index, 'title')\n", "# we print the time it took to preprocess whole titles \n", "print(time.perf_counter() - start_time, \"seconds\")\n", "\n", "data.head()\n", "data.to_pickle('16k_apparel_data')" ] }, { "cell_type": "code", "execution_count": 21, "id": "67c3fb82-aa21-4146-90ec-c37ce40a951f", "metadata": {}, "outputs": [], "source": [ "data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]\n", "data.to_pickle('16k_apparel_data')" ] }, { "cell_type": "code", "execution_count": 22, "id": "22545da7-367e-45fe-98cd-e6b67f575097", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
asinbrandcolormedium_image_urlproduct_type_nametitleformatted_price
4B004GSI2OSFeatherLiteOnyx Black/ Stonehttps://images-na.ssl-images-amazon.com/images...SHIRTfeatherlite ladies long sleeve stain resistant...$26.26
6B012YX2ZPIHX-Kingdom Fashion T-shirtsWhitehttps://images-na.ssl-images-amazon.com/images...SHIRTwomens unique 100 cotton special olympics wor...$9.99
15B003BSRPB0FeatherLiteWhitehttps://images-na.ssl-images-amazon.com/images...SHIRTfeatherlite ladies moisture free mesh sport sh...$20.54
27B014ICEJ1QFNC7CPurplehttps://images-na.ssl-images-amazon.com/images...SHIRTsupernatural chibis sam dean castiel neck tshi...$7.39
46B01NACPBG2Fifth DegreeBlackhttps://images-na.ssl-images-amazon.com/images...SHIRTfifth degree womens gold foil graphic tees jun...$6.95
\n", "
" ], "text/plain": [ " asin brand color \\\n", "4 B004GSI2OS FeatherLite Onyx Black/ Stone \n", "6 B012YX2ZPI HX-Kingdom Fashion T-shirts White \n", "15 B003BSRPB0 FeatherLite White \n", "27 B014ICEJ1Q FNC7C Purple \n", "46 B01NACPBG2 Fifth Degree Black \n", "\n", " medium_image_url product_type_name \\\n", "4 https://images-na.ssl-images-amazon.com/images... SHIRT \n", "6 https://images-na.ssl-images-amazon.com/images... SHIRT \n", "15 https://images-na.ssl-images-amazon.com/images... SHIRT \n", "27 https://images-na.ssl-images-amazon.com/images... SHIRT \n", "46 https://images-na.ssl-images-amazon.com/images... SHIRT \n", "\n", " title formatted_price \n", "4 featherlite ladies long sleeve stain resistant... $26.26 \n", "6 womens unique 100 cotton special olympics wor... $9.99 \n", "15 featherlite ladies moisture free mesh sport sh... $20.54 \n", "27 supernatural chibis sam dean castiel neck tshi... $7.39 \n", "46 fifth degree womens gold foil graphic tees jun... $6.95 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "38fc94f6-3682-4d38-94b7-370a81afe631", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 5 }