File size: 4,847 Bytes
53f5531
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import pandas as pd
import numpy as np

from typing import List, Tuple
import faiss
from faiss import write_index, read_index
import gradio as gr
from fuzzywuzzy import process

# Module-level caches for the loaded data; all start unset (None) and are
# filled in by load_and_prepare_data().
dataset = faiss_index = normalized_data = book_titles = None


def load_data(ratings_path: str, books_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the ratings and books CSV files.

    Both files are parsed as ``;``-separated, cp1251-encoded text.

    Args:
        ratings_path: Path of the ratings CSV; must contain a ``Book-Rating`` column.
        books_path: Path of the books CSV; malformed lines are skipped.

    Returns:
        ``(ratings, books)`` where ``ratings`` has every row with a
        ``Book-Rating`` of 0 removed.
    """
    csv_options = {'encoding': 'cp1251', 'sep': ';'}

    all_ratings = pd.read_csv(ratings_path, **csv_options)
    # Keep only rows whose rating is non-zero.
    rated = all_ratings.loc[all_ratings['Book-Rating'] != 0]

    catalogue = pd.read_csv(books_path, on_bad_lines='skip', **csv_options)
    return rated, catalogue


def preprocess_data(ratings: pd.DataFrame, books: pd.DataFrame) -> pd.DataFrame:
    """Join ratings with book metadata and lowercase every text column.

    Args:
        ratings: Ratings frame containing an ``ISBN`` column.
        books: Books frame containing an ``ISBN`` column.

    Returns:
        The inner merge of ``ratings`` and ``books`` on ``ISBN`` in which
        every text column has been lowercased; other columns are unchanged.
    """
    merged = pd.merge(ratings, books, on=['ISBN'])

    def _lowercase_if_text(column: pd.Series) -> pd.Series:
        # Text may be stored as plain ``object`` or as pandas' dedicated
        # string dtype; checking only for 'object' would silently skip the
        # latter and leave those columns in their original case.
        is_text = (
            pd.api.types.is_object_dtype(column.dtype)
            or isinstance(column.dtype, pd.StringDtype)
        )
        return column.str.lower() if is_text else column

    return merged.apply(_lowercase_if_text)


def get_books_to_compare(data: pd.DataFrame, min_ratings: int = 8) -> List[str]:
    """List the book titles that have at least ``min_ratings`` ratings.

    Args:
        data: Frame with ``Book-Title`` and ``User-ID`` columns.
        min_ratings: Minimum number of (non-null) ``User-ID`` entries a title
            needs in order to be kept.

    Returns:
        The qualifying titles, in the order produced by the group-by.
    """
    ratings_per_title = data.groupby('Book-Title')['User-ID'].count()
    has_enough = ratings_per_title >= min_ratings
    popular = ratings_per_title.loc[has_enough]
    return popular.index.to_list()


def prepare_correlation_dataset(data: pd.DataFrame, books_to_compare: List[str]) -> pd.DataFrame:
    """Build a user-by-book rating matrix for the selected titles.

    Args:
        data: Frame with ``User-ID``, ``Book-Rating`` and ``Book-Title`` columns.
        books_to_compare: Titles to keep; rows for other titles are dropped.

    Returns:
        A frame indexed by ``User-ID`` with one column per title. Each cell is
        the mean rating that user gave that title, or 0 where there is none.
    """
    wanted_columns = ['User-ID', 'Book-Rating', 'Book-Title']
    is_selected = data['Book-Title'].isin(books_to_compare)
    selected = data.loc[is_selected, wanted_columns]

    # Collapse repeated (user, title) pairs into a single mean rating.
    mean_ratings = selected.groupby(['User-ID', 'Book-Title'], as_index=False)['Book-Rating'].mean()

    matrix = mean_ratings.pivot(index='User-ID', columns='Book-Title', values='Book-Rating')
    return matrix.fillna(0)


def build_faiss_index(data: pd.DataFrame) -> Tuple[faiss.IndexFlatIP, np.ndarray]:
    """Build (or load from disk) an inner-product FAISS index over the columns of ``data``.

    Each column of ``data`` becomes one L2-normalised vector, so inner
    products between indexed vectors are cosine similarities.

    Args:
        data: Numeric frame; every column is indexed as one vector.

    Returns:
        ``(index, normalized_data)`` where ``normalized_data`` has one
        normalised row per column of ``data``.

    Side effects:
        Reads ``books.index`` from the working directory when it exists and
        matches the shape of the current data; otherwise (re)writes it.
    """
    transposed_data = data.T.values
    norms = np.linalg.norm(transposed_data, axis=1, keepdims=True)
    # An all-zero row has norm 0; dividing by it would produce NaNs that
    # poison every similarity. Leave such rows as zero vectors instead.
    safe_norms = np.where(norms == 0, 1.0, norms)
    normalized_data = transposed_data / safe_norms

    n_vectors, dimension = normalized_data.shape

    index_file = "books.index"
    if os.path.exists(index_file):
        cached = read_index(index_file)
        # Only reuse the cache when it was built for data of the same shape;
        # a stale index would map search results onto the wrong titles.
        # NOTE: a shape match cannot detect changed contents of equal shape.
        if cached.d == dimension and cached.ntotal == n_vectors:
            return cached, normalized_data

    index = faiss.IndexFlatIP(dimension)
    # FAISS expects a C-contiguous float32 matrix; the transposed view is not
    # guaranteed to be contiguous.
    index.add(np.ascontiguousarray(normalized_data, dtype='float32'))
    write_index(index, index_file)
    return index, normalized_data


def compute_correlations_faiss(index: faiss.IndexFlatIP, data: np.ndarray, book_titles: List[str],
                               target_book: str) -> pd.DataFrame:
    """Rank every indexed book by its similarity to ``target_book``.

    Args:
        index: FAISS index to search.
        data: Matrix whose row ``i`` is the vector for ``book_titles[i]``.
        book_titles: Titles, positionally aligned with the rows of ``data``.
        target_book: Title to use as the query; must be in ``book_titles``.

    Returns:
        A frame with columns ``book``, ``corr`` (the score returned by the
        index search) and ``avg_rating`` (the mean of that book's row in
        ``data``), sorted by ``corr`` in descending order.

    Raises:
        ValueError: If ``target_book`` is not in ``book_titles``.
    """
    target_index = book_titles.index(target_book)
    query = np.ascontiguousarray(data[target_index].reshape(1, -1), dtype='float32')

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with label -1, which would silently resolve to the last title.
    k = min(len(book_titles), index.ntotal)
    if k == 0:
        return pd.DataFrame({'book': [], 'corr': [], 'avg_rating': []})

    similarities, neighbours = index.search(query, k)
    scores, labels = similarities[0], neighbours[0]

    # Drop padding (-1) and any label that has no corresponding title.
    in_range = (labels >= 0) & (labels < len(book_titles))
    scores, labels = scores[in_range], labels[in_range]

    avg_ratings = np.mean(data, axis=1)
    corr_df = pd.DataFrame({
        'book': [book_titles[i] for i in labels],
        'corr': scores,
        'avg_rating': avg_ratings[labels],
    })
    return corr_df.sort_values('corr', ascending=False)


def load_and_prepare_data():
    """Load the CSV data and populate the module-level caches.

    Sets the globals ``dataset``, ``faiss_index``, ``normalized_data`` and
    ``book_titles``. Returns nothing.
    """
    global dataset, faiss_index, normalized_data, book_titles

    # The CSV files are read from these paths, relative to the working directory.
    ratings_frame, books_frame = load_data("BX-Book-Ratings.csv", "BX-Books.csv")

    dataset = preprocess_data(ratings_frame, books_frame)

    # Restrict the user-by-book matrix to the titles selected for comparison.
    user_book_matrix = prepare_correlation_dataset(dataset, get_books_to_compare(dataset))

    faiss_index, normalized_data = build_faiss_index(user_book_matrix)
    # Column order of the matrix gives the title for each indexed vector.
    book_titles = user_book_matrix.columns.tolist()


def recommend_books(target_book: str, num_recommendations: int = 10) -> str:
    """Return a formatted list of books similar to ``target_book``.

    The input is lowercased and fuzzy-matched against the known titles; the
    best match is used as the query for the similarity search.

    Args:
        target_book: Title entered by the user.
        num_recommendations: How many recommendations to list.

    Returns:
        A human-readable message: either the numbered recommendations
        (preceded by a "Closest match" line when the input was not an exact
        title) or a notice that no sufficiently close title was found.
    """
    if dataset is None or faiss_index is None or normalized_data is None or book_titles is None:
        load_and_prepare_data()

    # UI sliders may deliver a float; head() and the header need an integer.
    num_recommendations = int(num_recommendations)

    target_book = target_book.lower()
    # Fuzzy match the input to the closest book title
    match = process.extractOne(target_book, book_titles)

    # extractOne yields None when there is nothing to match against.
    if match is None or match[1] < 50:  # You can adjust this threshold
        return f"No close match found for '{target_book}'. Please try a different title."

    closest_match, score = match

    if closest_match != target_book:
        prefix = f"Closest match: '{closest_match}' (similarity: {score}%)\n\n"
    else:
        prefix = ""

    correlations = compute_correlations_faiss(faiss_index, normalized_data, book_titles, closest_match)

    # Exclude the matched book itself (not the raw input, which may differ
    # from the matched title) so a book is never recommended to itself.
    recommendations = correlations[correlations['book'] != closest_match].head(num_recommendations)

    header = f"Top {num_recommendations} recommendations for '{target_book}':\n\n"
    lines = [
        f"{rank}. {book} (Correlation: {corr:.2f})\n"
        for rank, (book, corr) in enumerate(zip(recommendations['book'], recommendations['corr']), 1)
    ]

    # Append to the prefix rather than replacing it, so the "Closest match"
    # line is actually shown to the user.
    return prefix + header + "".join(lines)


# Assemble the Gradio UI: a title box and a count slider feed
# recommend_books, whose text result is shown in an output box.
title_input = gr.Textbox(label="Enter a book title")
count_input = gr.Slider(minimum=1, maximum=20, step=1, label="Number of recommendations", value=10)
recommendations_output = gr.Textbox(label="Recommendations")

iface = gr.Interface(
    fn=recommend_books,
    inputs=[title_input, count_input],
    outputs=recommendations_output,
    title="Book Recommender",
    description="Enter a book title to get recommendations based on user ratings and book similarities.",
)

# Start serving the interface as soon as this module is executed.
iface.launch(share=True)