import os
import re
import pandas as pd
import numpy as np
from typing import List, Tuple
import faiss
from faiss import write_index, read_index
import gradio as gr
from fuzzywuzzy import process
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
# Global variables to store loaded data
dataset = None
faiss_index = None
book_titles = None
ratings_by_isbn = None
def is_valid_isbn(isbn):
    # Accept either a 13-digit ISBN starting with 978/979 or a 10-character
    # ISBN-10 (nine digits plus a final digit or 'X' check character).
    pattern = r'^(?:(?:978|979)\d{10}|\d{9}[0-9X])$'
    return bool(re.match(pattern, isbn))
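# Illustrative checks (hypothetical ISBNs; digits only, no hyphens):
#   is_valid_isbn("9780452284241")  # True  (ISBN-13)
#   is_valid_isbn("043942089X")     # True  (ISBN-10 with 'X' check character)
#   is_valid_isbn("0-452-28424-4")  # False (hyphenated forms are rejected)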
def load_data(ratings_path, books_path) -> Tuple[pd.DataFrame, pd.DataFrame]:
ratings = pd.read_csv(ratings_path, encoding='cp1251', sep=';', on_bad_lines='skip')
ratings = ratings[ratings['Book-Rating'] != 0]
books = pd.read_csv(books_path, encoding='cp1251', sep=';', on_bad_lines='skip')
return ratings, books
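# Assumption: the two CSVs follow the Book-Crossing (BX) schema implied by the
# column names used below, roughly:
#   BX-Book-Ratings.csv -> "User-ID";"ISBN";"Book-Rating"
#   BX-Books.csv        -> "ISBN";"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";...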
def preprocess_data(ratings: pd.DataFrame, books: pd.DataFrame) -> pd.DataFrame:
dataset = pd.merge(ratings, books, on=['ISBN'])
return dataset.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)
def create_embedding(dataset):
    model_name = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    print("creating tokens")
    tokens = [tokenizer(i, padding="max_length", truncation=True, max_length=10, return_tensors='pt')
              for i in dataset]
    print("\ncreating embedding\n")
    emb = []
    for i in tqdm(tokens):
        # Flatten the token-level hidden states into one vector per title
        emb.append(model(**i)["last_hidden_state"].detach().numpy().squeeze().reshape(-1))
    emb = np.vstack(emb)
    # L2-normalize each row so that inner-product search equals cosine similarity
    normalized_data = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    return normalized_data
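# Quick sanity check (assumes bert-tiny's hidden size of 128; with
# max_length=10, each flattened title embedding has 10 * 128 = 1280 dims):
#   create_embedding(["the hobbit"]).shape                    # -> (1, 1280)
#   np.linalg.norm(create_embedding(["the hobbit"]), axis=1)  # -> ~[1.0]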
def build_faiss_index(dataset: pd.DataFrame) -> faiss.IndexFlatIP:
    # Reuse a previously built index if one exists on disk
    if os.path.exists("books.index"):
        return read_index("books.index")
    normalized_data = create_embedding(dataset["Book-Title"])
    print("creating index")
    # Create a Faiss inner-product index
    dimension = normalized_data.shape[-1]
    index = faiss.IndexFlatIP(dimension)
    # Add vectors to the index (Faiss expects float32)
    index.add(normalized_data.astype('float32'))
    write_index(index, "books.index")
    return index
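# Minimal sketch of the IndexFlatIP behavior relied on here: with L2-normalized
# vectors, inner product equals cosine similarity, so a stored vector matches
# itself with a score of ~1.0:
#   v = np.ones((1, 4), dtype='float32'); v /= np.linalg.norm(v)
#   demo = faiss.IndexFlatIP(4); demo.add(v)
#   scores, ids = demo.search(v, 1)  # scores[0][0] ~= 1.0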
def compute_correlations_faiss(index: faiss.IndexFlatIP, book_titles: List[str],
                               target_book: str) -> pd.DataFrame:
    emb = create_embedding([target_book])
    # Search the whole index so every title gets a similarity score
    k = len(book_titles)
    similarities, I = index.search(emb.astype('float32'), k)
corr_df = pd.DataFrame({
'book': [book_titles[i] for i in I[0]],
'corr': similarities[0]
})
return corr_df.sort_values('corr', ascending=False)
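# Illustrative output (values invented): every indexed title ranked by
# similarity to the query, most similar first, e.g.
#         book               corr
#   42    the hobbit         0.97
#   7     the two towers     0.81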
def load_and_prepare_data():
    global dataset, faiss_index, book_titles, ratings_by_isbn
    # Load the local BX data files
    ratings_path = "BX-Book-Ratings.csv"
    books_path = "BX-Books.csv"
    ratings, books = load_data(ratings_path, books_path)
    dataset = preprocess_data(ratings, books)
    ratings = ratings[ratings['ISBN'].apply(is_valid_isbn)]
    dataset = dataset[dataset['ISBN'].apply(is_valid_isbn)]
    # Average each book's non-zero ratings per ISBN
    ratings_by_isbn = ratings[ratings["Book-Rating"] > 0].drop(columns="User-ID")
    ratings_by_isbn = ratings_by_isbn.groupby('ISBN')["Book-Rating"].mean().reset_index()
    dataset = dataset.drop(columns=["User-ID", "Book-Rating"])
    dataset = dataset[dataset['ISBN'].isin(ratings_by_isbn['ISBN'])]
    dataset = dataset.drop_duplicates(subset=['ISBN'])
    # Re-attach the mean rating to each remaining book
    dataset = preprocess_data(dataset, ratings_by_isbn)
    # Build Faiss index
    faiss_index = build_faiss_index(dataset)
    book_titles = dataset["Book-Title"].tolist()
def recommend_books(target_book: str, num_recommendations: int = 10):
    global dataset, faiss_index, book_titles, ratings_by_isbn
    if dataset is None or faiss_index is None or book_titles is None:
        load_and_prepare_data()
    target_book = target_book.lower()
    # Fuzzy match the input to the closest book title
    closest_match, _ = process.extractOne(target_book, book_titles)
    correlations = compute_correlations_faiss(faiss_index, book_titles, closest_match)
    # Drop the queried title itself, then keep the top matches
    recommendations = correlations[correlations['book'] != closest_match]
    recommendations = recommendations.head(int(num_recommendations))
    # "ISBN";"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S"
    rows = []
    seen_isbns = set()
    for _, row in recommendations.iterrows():
        book = dataset.loc[dataset['Book-Title'] == row['book']].iloc[0]
        # Skip titles whose ISBN has already been recommended
        if book['ISBN'] in seen_isbns:
            continue
        seen_isbns.add(book['ISBN'])
        rating = ratings_by_isbn.loc[ratings_by_isbn['ISBN'] == book['ISBN'], 'Book-Rating'].values[0]
        rows.append({
            "Rank": len(rows) + 1,
            "Title": book['Book-Title'],
            "Author": book['Book-Author'],
            # Cast to plain Python types so the JSON output serializes cleanly
            "Year": str(book['Year-Of-Publication']),
            "Publisher": book['Publisher'],
            "ISBN": book['ISBN'],
            "Rating": float(rating),
        })
    result_df = pd.DataFrame(rows)
    # Return the table plus a JSON-friendly copy for the second output component
    return result_df, rows
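# Example invocation outside the UI (hypothetical title; assumes the BX CSV
# files are present in the working directory):
#   table, details = recommend_books("the hobbit", num_recommendations=5)
#   print(table)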
# Create Gradio interface
iface = gr.Interface(
fn=recommend_books,
inputs=[
gr.Textbox(label="Enter a book title"),
gr.Slider(minimum=1, maximum=20, step=1, label="Number of recommendations", value=10)
],
outputs=[
gr.Dataframe(
headers=["Rank", "Title", "Author", "Year", "Publisher", "ISBN", "Rating"],
type="pandas",
),
gr.JSON(label="Detailed Recommendations")
],
title="Book Recommender",
description="Enter a book title to get recommendations based on user ratings and book similarities."
)
# Launch the app
iface.launch()
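# Note: launch() serves the app locally by default; on a Hugging Face Space the
# platform handles hosting. For a temporary public link when running locally,
# Gradio also supports iface.launch(share=True).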