danschnurp committed
Commit 2197ffd · verified · 1 Parent(s): 48484af

Upload 2 files

Files changed (2):
  1. app.py +85 -44
  2. requirements.txt +4 -1
app.py CHANGED
@@ -7,6 +7,8 @@ import faiss
 from faiss import write_index, read_index
 import gradio as gr
 from fuzzywuzzy import process
+from tqdm import tqdm
+from transformers import BertTokenizerFast, BertModel, AutoTokenizer, AutoModel
 
 # Global variables to store loaded data
 dataset = None
@@ -15,10 +17,18 @@ normalized_data = None
 book_titles = None
 
 
-def load_data(ratings_path: str, books_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    ratings = pd.read_csv(ratings_path, encoding='cp1251', sep=';')
+def is_valid_isbn(isbn):
+    pattern = r'^(?:(?:978|979)\d{10}|\d{9}[0-9X])$'
+    return bool(re.match(pattern, isbn))
+
+
+
+def load_data(ratings_path: Path, books_path: Path) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    ratings = pd.read_csv(ratings_path, encoding='cp1251', sep=';', on_bad_lines='skip')
     ratings = ratings[ratings['Book-Rating'] != 0]
+
     books = pd.read_csv(books_path, encoding='cp1251', sep=';', on_bad_lines='skip')
+
     return ratings, books
 
 
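Note on the added helper: `is_valid_isbn` calls `re.match`, and the new `load_data` signature is annotated with `Path`, but this diff adds neither import; unless they already appear earlier in app.py, the module will fail at import or call time. A minimal sketch of the helper with those imports made explicit (the sample ISBNs are illustrative only):

```python
import re                  # used by is_valid_isbn; not added in this diff
from pathlib import Path   # used by the new load_data type hints

def is_valid_isbn(isbn: str) -> bool:
    # ISBN-13 starting with 978/979, or an ISBN-10: nine digits plus a 0-9/X check digit
    pattern = r'^(?:(?:978|979)\d{10}|\d{9}[0-9X])$'
    return bool(re.match(pattern, isbn))

print(is_valid_isbn("9780306406157"))  # True: 978 prefix plus ten digits
print(is_valid_isbn("030640615X"))     # True: ISBN-10 form with X check digit
print(is_valid_isbn("12345"))          # False: wrong length
```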
 
@@ -27,60 +37,95 @@ def preprocess_data(ratings: pd.DataFrame, books: pd.DataFrame) -> pd.DataFrame:
     return dataset.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)
 
 
-def get_books_to_compare(data: pd.DataFrame, min_ratings: int = 8) -> List[str]:
-    book_ratings = data.groupby('Book-Title')['User-ID'].count()
-    return book_ratings[book_ratings >= min_ratings].index.tolist()
+def create_embedding(dataset):
+    model_name = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModel.from_pretrained(model_name)
+    print("creating tokens")
+    tokens = [tokenizer(i, padding="max_length", truncation=True, max_length=10, return_tensors='pt')
+              for i in dataset]
+    print("\ncreating embedding\n")
+    emb = []
+    for i in tqdm(tokens):
+        emb.append(model(**i,)["last_hidden_state"].detach().numpy().squeeze().reshape(-1))
+    # Normalize the data
+    normalized_data = emb / np.linalg.norm(emb)
+    return normalized_data
+
+
+def build_faiss_index(dataset: pd.DataFrame) -> Tuple[faiss.IndexFlatIP, np.ndarray]:
+    if os.path.exists("data/books.index"):
+        return read_index("data/books.index")
+
+    dataset["embedding"] = create_embedding(dataset["Book-Title"])
+    print("creating index")
+    normalized_data = dataset["embedding"]
+    # Create a Faiss index
+    dimension = normalized_data.shape[-1]
+    index = faiss.IndexFlatIP(dimension)
+
+    # Add vectors to the index
+    index.add(normalized_data.astype('float16'))
 
+    write_index(index, "data/books.index")
 
-def prepare_correlation_dataset(data: pd.DataFrame, books_to_compare: List[str]) -> pd.DataFrame:
-    ratings_data = data.loc[data['Book-Title'].isin(books_to_compare), ['User-ID', 'Book-Rating', 'Book-Title']]
-    ratings_mean = ratings_data.groupby(['User-ID', 'Book-Title'])['Book-Rating'].mean().reset_index()
-    return ratings_mean.pivot(index='User-ID', columns='Book-Title', values='Book-Rating').fillna(0)
+    return index
 
 
-def build_faiss_index(data: pd.DataFrame) -> Tuple[faiss.IndexFlatIP, np.ndarray]:
-    transposed_data = data.T.values
-    normalized_data = transposed_data / np.linalg.norm(transposed_data, axis=1)[:, np.newaxis]
+def compute_correlations_faiss(index: faiss.IndexFlatIP, book_titles: List[str],
+                               target_book: str, ) -> pd.DataFrame:
+    emb = create_embedding([target_book])
+    # target_vector = book_titles.index(emb)
+
+
+    # Perform the search
+    k = len(book_titles)  # Search for all books
+    similarities, I = index.search(emb.astype('float16'), k)
+
+    # # Reduce database and query vectors to 2D for visualization
+    # pca = PCA(n_components=2)
+    # reduced_db = pca.fit_transform(data)
+    # reduced_query = pca.transform(target_vector)
+    #
+    # # Scatter plot
+    # plt.scatter(reduced_db[:, 0], reduced_db[:, 1], label='Database Vectors', alpha=0.5)
+    # plt.scatter(reduced_query[:, 0], reduced_query[:, 1], label='Query Vectors', marker='X', color='red')
+    # plt.legend()
+    # plt.title("PCA Projection of IndexFlatIP Vectors")
+    # plt.show()
+
 
-    index_file = "books.index"
-    if os.path.exists(index_file):
-        return read_index(index_file), normalized_data
 
-    dimension = normalized_data.shape[1]
-    index = faiss.IndexFlatIP(dimension)
-    index.add(normalized_data.astype('float32'))
-    write_index(index, index_file)
-    return index, normalized_data
-
-
-def compute_correlations_faiss(index: faiss.IndexFlatIP, data: np.ndarray, book_titles: List[str],
-                               target_book: str) -> pd.DataFrame:
-    target_index = book_titles.index(target_book)
-    target_vector = data[target_index].reshape(1, -1)
-    k = len(book_titles)
-    similarities, I = index.search(target_vector.astype('float32'), k)
-    avg_ratings = np.mean(data, axis=1)
     corr_df = pd.DataFrame({
         'book': [book_titles[i] for i in I[0]],
-        'corr': similarities[0],
-        'avg_rating': avg_ratings[I[0]]
+        'corr': similarities[0]
     })
     return corr_df.sort_values('corr', ascending=False)
 
 
+
 def load_and_prepare_data():
     global dataset, faiss_index, normalized_data, book_titles
 
     # Download data files from Hugging Face
-    ratings_file = "BX-Book-Ratings.csv"
-    books_file = "BX-Books.csv"
+    ratings = "BX-Book-Ratings.csv"
+    books = "BX-Books.csv"
 
-    ratings, books = load_data(ratings_file, books_file)
     dataset = preprocess_data(ratings, books)
+    ratings = ratings[ratings['ISBN'].apply(is_valid_isbn)]
+    dataset = dataset[dataset['ISBN'].apply(is_valid_isbn)]
+
+    ratings_by_isbn = ratings.drop(columns="User-ID")[ratings.drop(columns="User-ID")["Book-Rating"] > 0]
+    ratings_by_isbn = ratings_by_isbn.groupby('ISBN')["Book-Rating"].mean().reset_index()
+    ratings_by_isbn = ratings_by_isbn.drop_duplicates(subset=['ISBN'])
+    dataset = dataset.drop(columns=["User-ID", "Book-Rating"])
+    dataset = dataset[dataset['ISBN'].isin(ratings_by_isbn['ISBN'])]
+    dataset = dataset.drop_duplicates(subset=['ISBN'])
+    dataset = preprocess_data(dataset, ratings_by_isbn)
+    # Build Faiss index
+    faiss_index = build_faiss_index(dataset)
+
+    book_titles = dataset["Book-Title"]
 
 
 def recommend_books(target_book: str, num_recommendations: int = 10) -> str:
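Review note on this hunk, for anyone reusing the code. First, `np.linalg.norm(emb)` without an `axis` argument returns a single scalar for the whole stack of vectors, so rows are not unit length and the `IndexFlatIP` inner products are no longer cosine similarities (the deleted `build_faiss_index` normalized per row with `axis=1`). Second, faiss CPU indexes expect `float32` input, so `astype('float16')` will fail, and assigning the 2-D embedding matrix to the single column `dataset["embedding"]` is something pandas refuses. Third, `read_index`/`write_index` point at `"data/books.index"` but nothing creates the `data/` directory. A sketch of an indexing path that keeps the diff's function names; the per-row normalization, the stacking, and the directory handling are my assumptions rather than part of the commit:

```python
import os
import numpy as np
import faiss
from faiss import write_index, read_index

def build_faiss_index(dataset):
    index_file = "data/books.index"
    if os.path.exists(index_file):
        return read_index(index_file)

    # Stack the per-title vectors into an (n_titles, dim) float32 matrix;
    # faiss rejects float16 and cannot ingest a Series of arrays directly.
    vectors = np.vstack(create_embedding(dataset["Book-Title"])).astype('float32')
    # Per-row L2 normalization makes inner product equal cosine similarity.
    vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)

    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    os.makedirs("data", exist_ok=True)  # write_index does not create directories
    write_index(index, index_file)
    return index
```

Separately, with the `load_data` call deleted, `load_and_prepare_data` now passes the bare filename strings straight into `preprocess_data`; restoring `ratings, books = load_data(ratings, books)` before that call would keep the rest of the hunk working as written.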
@@ -93,13 +138,9 @@ def recommend_books(target_book: str, num_recommendations: int = 10) -> str:
     # Fuzzy match the input to the closest book title
     closest_match, score = process.extractOne(target_book, book_titles)
 
-    if score < 50: # You can adjust this threshold
+    if score < 50: # threshold
         return f"No close match found for '{target_book}'. Please try a different title."
 
-    if closest_match != target_book:
-        result = f"Closest match: '{closest_match}' (similarity: {score}%)\n\n"
-    else:
-        result = ""
 
     correlations = compute_correlations_faiss(faiss_index, normalized_data, book_titles, closest_match)
 
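One mismatch worth noting: this context line still calls `compute_correlations_faiss(faiss_index, normalized_data, book_titles, closest_match)`, but the function rewritten above now takes `(index, book_titles, target_book)` and embeds the query itself, and `normalized_data` is never assigned in the new flow. Also, `book_titles` is now a pandas Series, which fuzzywuzzy may treat as a mapping rather than a plain list of choices. A hedged sketch of the adjusted lookup (the wrapper name is illustrative):

```python
def recommend_books_lookup(target_book: str):
    # book_titles is a pandas Series after this commit; a plain list keeps
    # fuzzywuzzy's extractOne in its simple (choice, score) mode
    closest_match, score = process.extractOne(target_book, list(book_titles))
    if score < 50:  # threshold
        return f"No close match found for '{target_book}'. Please try a different title."
    # drop normalized_data to match the new three-parameter signature
    return compute_correlations_faiss(faiss_index, list(book_titles), closest_match)
```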
 
@@ -125,4 +166,4 @@ iface = gr.Interface(
 )
 
 # Launch the app
-iface.launch(share=True)
+iface.launch()
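Removing `share=True` fits the deployment target: `share=True` asks Gradio to open a temporary public tunnel (a *.gradio.live URL), which is meant for sharing a locally running demo; on a hosted Space the app already has a public URL, so a plain `launch()` is the idiomatic call there.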
 
requirements.txt CHANGED
@@ -3,7 +3,10 @@
 faiss-cpu
 pandas
 numpy
+
 gradio
 huggingface_hub
 fuzzywuzzy
-python-Levenshtein
+python-Levenshtein
+transformers
+tqdm
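A possible gap in the dependency list: `transformers` does not install a deep-learning backend by itself, and the new `create_embedding` builds PyTorch tensors (`return_tensors='pt'`), so `torch` presumably belongs in requirements.txt as well, unless the Space's base image already provides it.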