import warnings
warnings.filterwarnings('ignore')

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from datetime import datetime
from typing import List, Dict, Any
from functools import partial

# Configure GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize session state
if 'history' not in st.session_state:
    st.session_state.history = []
if 'feedback' not in st.session_state:
    st.session_state.feedback = {}

# Define subset size and batch size for optimization
SUBSET_SIZE = 500  # Subset for faster precomputation
BATCH_SIZE = 8     # Smaller batch size to reduce memory overhead


@st.cache_resource
def load_model_and_tokenizer_with_progress():
    """
    Load the pre-trained model and tokenizer using Hugging Face Transformers,
    with a progress bar for better user experience.
    """
    progress_bar = st.progress(0)
    status_text = st.empty()
    try:
        progress_bar.progress(10)
        status_text.text("Loading tokenizer...")
        model_name = "Salesforce/codet5-small"
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        progress_bar.progress(50)
        status_text.text("Loading model...")
        model = AutoModel.from_pretrained(model_name).to(device)
        model.eval()

        progress_bar.progress(100)
        status_text.text("Model loaded successfully!")
    finally:
        progress_bar.empty()
        status_text.empty()
    return tokenizer, model


@st.cache_resource
def load_data():
    """
    Load and sample the dataset from Hugging Face.
    Ensures the 'text' column is created for embedding precomputation.
    """
    dataset = load_dataset("frankjosh/filtered_dataset")
    data = pd.DataFrame(dataset['train'])
    # Take a random subset of data
    data = data.sample(n=min(SUBSET_SIZE, len(data)), random_state=42).reset_index(drop=True)
    # Create a 'text' column by combining relevant fields
    data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
    return data


@st.cache_resource
def precompute_embeddings(data: pd.DataFrame, _tokenizer, _model, batch_size=BATCH_SIZE):
    """
    Precompute embeddings for repository metadata to optimize query performance.
    The tokenizer and model are excluded from caching as they are unhashable.
""" class TextDataset(Dataset): def __init__(self, texts: List[str], tokenizer, max_length=512): self.texts = texts self.tokenizer = tokenizer self.max_length = max_length def __len__(self): return len(self.texts) def __getitem__(self, idx): return self.tokenizer( self.texts[idx], padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt" ) def collate_fn(batch, pad_token_id): max_length = max(inputs['input_ids'].shape[1] for inputs in batch) input_ids, attention_mask = [], [] for inputs in batch: input_ids.append(torch.nn.functional.pad( inputs['input_ids'].squeeze(), (0, max_length - inputs['input_ids'].shape[1]), value=pad_token_id )) attention_mask.append(torch.nn.functional.pad( inputs['attention_mask'].squeeze(), (0, max_length - inputs['attention_mask'].shape[1]), value=0 )) return { 'input_ids': torch.stack(input_ids), 'attention_mask': torch.stack(attention_mask) } def generate_embeddings_batch(model, batch, device): with torch.no_grad(): batch = {k: v.to(device) for k, v in batch.items()} outputs = model.encoder(**batch) return outputs.last_hidden_state.mean(dim=1).cpu().numpy() dataset = TextDataset(data['text'].tolist(), _tokenizer) dataloader = DataLoader( dataset, batch_size=batch_size, shuffle=False, collate_fn=partial(collate_fn, pad_token_id=_tokenizer.pad_token_id) ) embeddings = [] progress_bar = st.progress(0) # Progress bar for embedding computation for i, batch in enumerate(dataloader): batch_embeddings = generate_embeddings_batch(_model, batch, device) embeddings.extend(batch_embeddings) progress_bar.progress((i + 1) / len(dataloader)) progress_bar.empty() data['embedding'] = embeddings return data @torch.no_grad() def generate_query_embedding(model, tokenizer, query: str) -> np.ndarray: """ Generate embedding for a user query using the pre-trained model. """ inputs = tokenizer( query, return_tensors="pt", padding=True, truncation=True, max_length=512 ).to(device) outputs = model.encoder(**inputs) return outputs.last_hidden_state.mean(dim=1).cpu().numpy() def find_similar_repos(query_embedding: np.ndarray, data: pd.DataFrame, top_n=5) -> pd.DataFrame: """ Compute cosine similarity and return the top N most similar repositories. """ # Reshape query_embedding to 2D query_embedding = query_embedding.reshape(1, -1) # Convert data['embedding'] to a 2D array embeddings = np.vstack(data['embedding'].values) # Compute cosine similarity similarities = cosine_similarity(query_embedding, embeddings)[0] # Add similarity scores to the DataFrame data['similarity'] = similarities return data.nlargest(top_n, 'similarity') def display_recommendations(recommendations: pd.DataFrame): """ Display the recommended repositories in the Streamlit app interface. """ st.markdown("### 🎯 Top Recommendations") for idx, row in recommendations.iterrows(): st.markdown(f"### {idx + 1}. {row['repo']}") st.metric("Match Score", f"{row['similarity']:.2%}") st.markdown(f"[View Repository]({row['url']})") # Main workflow st.title("Repository Recommender System 🚀") st.caption("Find repositories based on your project description.") # Load resources with progress bar tokenizer, model = load_model_and_tokenizer_with_progress() # Load data and precompute embeddings data = load_data() data = precompute_embeddings(data, tokenizer, model) # User input user_query = st.text_area( "Describe your project:", height=150, placeholder="Example: A machine learning project for customer churn prediction..." 
)

if st.button("🔍 Search Repositories"):
    if user_query.strip():
        with st.spinner("Finding relevant repositories..."):
            query_embedding = generate_query_embedding(model, tokenizer, user_query)
            recommendations = find_similar_repos(query_embedding, data)
            display_recommendations(recommendations)
    else:
        st.error("Please provide a project description.")
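
# Usage note (assuming this script is saved as app.py): launch the app locally with
#   streamlit run app.py
# On first run the CodeT5 checkpoint and the Hugging Face dataset are downloaded and the
# embeddings are precomputed; @st.cache_resource keeps them in memory, so later reruns
# and queries skip that work.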