import os
import re
import numpy as np
import pandas as pd
from typing import List, Tuple
import faiss
from faiss import write_index, read_index
import gradio as gr
from fuzzywuzzy import process
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
# Global variables to store loaded data
dataset = None
faiss_index = None
book_titles = None
def is_valid_isbn(isbn):
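    """Return True for a plausible ISBN-13 starting with 978/979, or an
    ISBN-10 (nine digits plus a digit or 'X' check character)."""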
pattern = r'^(?:(?:978|979)\d{10}|\d{9}[0-9X])$'
return bool(re.match(pattern, isbn))
def load_data(ratings_path, books_path) -> Tuple[pd.DataFrame, pd.DataFrame]:
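    """Read the semicolon-separated Book-Crossing CSV dumps and drop
    rows with an implicit rating of 0, keeping only explicit ratings."""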
ratings = pd.read_csv(ratings_path, encoding='cp1251', sep=';', on_bad_lines='skip')
ratings = ratings[ratings['Book-Rating'] != 0]
books = pd.read_csv(books_path, encoding='cp1251', sep=';', on_bad_lines='skip')
return ratings, books
def preprocess_data(ratings: pd.DataFrame, books: pd.DataFrame) -> pd.DataFrame:
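    """Join ratings and books on ISBN and lower-case all string columns
    so that title matching is case-insensitive."""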
dataset = pd.merge(ratings, books, on=['ISBN'])
return dataset.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)
def create_embedding(dataset):
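    """Encode each title with a small BERT model and L2-normalize the
    vectors so that inner product equals cosine similarity."""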
model_name = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
print("creating tokens")
tokens = [tokenizer(i, padding="max_length", truncation=True, max_length=10, return_tensors='pt')
for i in dataset]
print("\ncreating embedding\n")
    emb = []
    for t in tqdm(tokens):
        emb.append(model(**t)["last_hidden_state"].detach().numpy().squeeze().reshape(-1))
    emb = np.vstack(emb)
    # L2-normalize each row (not the whole matrix) so inner product equals cosine similarity
    normalized_data = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    return normalized_data
def build_faiss_index(dataset: pd.DataFrame) -> faiss.IndexFlatIP:
    # Reuse a cached index from disk if one exists
    if os.path.exists("data/books.index"):
        return read_index("data/books.index")
    normalized_data = create_embedding(dataset["Book-Title"])
    print("creating index")
    # Create a Faiss inner-product index over the normalized title embeddings
    dimension = normalized_data.shape[-1]
    index = faiss.IndexFlatIP(dimension)
    # Faiss requires float32 vectors
    index.add(normalized_data.astype('float32'))
    os.makedirs("data", exist_ok=True)
    write_index(index, "data/books.index")
    return index
def compute_correlations_faiss(index: faiss.IndexFlatIP, book_titles: List[str],
                               target_book: str) -> pd.DataFrame:
    """Embed the target title and rank every book by inner-product similarity."""
    emb = create_embedding([target_book])
    # Search the whole index; Faiss expects float32 query vectors
    k = len(book_titles)
    similarities, I = index.search(emb.astype('float32'), k)
    # (Optional) A PCA projection of the index and query vectors could be
    # plotted here to visualize the embedding space.
corr_df = pd.DataFrame({
'book': [book_titles[i] for i in I[0]],
'corr': similarities[0]
})
return corr_df.sort_values('corr', ascending=False)
def load_and_prepare_data():
    global dataset, faiss_index, book_titles
    # Load the Book-Crossing CSV files (expected next to the app)
    ratings, books = load_data("BX-Book-Ratings.csv", "BX-Books.csv")
    dataset = preprocess_data(ratings, books)
    # Keep only rows with a well-formed ISBN
    ratings = ratings[ratings['ISBN'].apply(is_valid_isbn)]
    dataset = dataset[dataset['ISBN'].apply(is_valid_isbn)]
    # Average each book's explicit (non-zero) ratings per ISBN
    ratings_by_isbn = ratings.drop(columns="User-ID")
    ratings_by_isbn = ratings_by_isbn[ratings_by_isbn["Book-Rating"] > 0]
    ratings_by_isbn = ratings_by_isbn.groupby('ISBN')["Book-Rating"].mean().reset_index()
    # Collapse to one row per ISBN and attach the mean rating
    dataset = dataset.drop(columns=["User-ID", "Book-Rating"])
    dataset = dataset[dataset['ISBN'].isin(ratings_by_isbn['ISBN'])]
    dataset = dataset.drop_duplicates(subset=['ISBN'])
    dataset = preprocess_data(dataset, ratings_by_isbn)
    # Build (or load) the Faiss index and cache the titles
    faiss_index = build_faiss_index(dataset)
    book_titles = dataset["Book-Title"].tolist()
def recommend_books(target_book: str, num_recommendations: int = 10) -> str:
    global dataset, faiss_index, book_titles
    if dataset is None or faiss_index is None or book_titles is None:
        load_and_prepare_data()
    target_book = target_book.lower()
    # Fuzzy match the input to the closest title in the catalogue
    closest_match, score = process.extractOne(target_book, book_titles)
    if score < 50:  # fuzzywuzzy score threshold (0-100)
        return f"No close match found for '{target_book}'. Please try a different title."
    correlations = compute_correlations_faiss(faiss_index, book_titles, closest_match)
    # Exclude the matched title itself from the recommendations
    recommendations = correlations[correlations['book'] != closest_match].head(num_recommendations)
result = f"Top {num_recommendations} recommendations for '{target_book}':\n\n"
for i, (_, row) in enumerate(recommendations.iterrows(), 1):
result += f"{i}. {row['book']} (Correlation: {row['corr']:.2f})\n"
return result
# Create Gradio interface
iface = gr.Interface(
fn=recommend_books,
inputs=[
gr.Textbox(label="Enter a book title"),
gr.Slider(minimum=1, maximum=20, step=1, label="Number of recommendations", value=10)
],
outputs=gr.Textbox(label="Recommendations"),
title="Book Recommender",
description="Enter a book title to get recommendations based on user ratings and book similarities."
)
# Launch the app
iface.launch()
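# Example (hypothetical title): calling recommend_books("the hobbit", 5) directly
# returns the same formatted recommendation string the UI displays.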