"""Streamlit app that recommends GitHub repositories for a natural-language query,
using CodeT5 encoder embeddings and cosine similarity."""

import numpy as np
import pandas as pd
import requests
import streamlit as st
import torch
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

# Set page configuration
st.set_page_config(page_title="Repository Recommender", layout="wide")


# Load model and tokenizer
@st.cache_resource
def load_model():
    model_name = "Salesforce/codet5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Use the GPU if one is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModel.from_pretrained(model_name).to(device)
    return tokenizer, model, device


def generate_embedding(text, tokenizer, model, device):
    """Generate an embedding for a given text with the CodeT5 encoder."""
    if not text.strip():
        return np.zeros(model.config.d_model)  # Handle empty input gracefully
    inputs = tokenizer(text, return_tensors="pt", padding=True,
                       truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    # Mean-pool the encoder hidden states into a single fixed-size vector
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()


# Load dataset
@st.cache_data
def load_data(_tokenizer, _model, _device):
    # Leading underscores tell Streamlit not to hash these arguments
    dataset = load_dataset("frankjosh/filtered_dataset", split="train")
    df = pd.DataFrame(dataset).head(500)  # Limit to 500 repositories

    # Fill missing values to avoid errors
    for column in ("docstring", "summary"):
        df[column] = df[column].fillna("") if column in df.columns else ""

    # Generate an embedding for each row
    def compute_embedding(row):
        text = f"{row['docstring']} {row['summary']}"
        return generate_embedding(text, _tokenizer, _model, _device)

    df['embedding'] = df.apply(compute_embedding, axis=1)
    return df


def fetch_readme(repo_url):
    """Fetch the README of a GitHub repository (assumes the default branch is main)."""
    try:
        # Request the raw file so we get Markdown rather than the rendered HTML page
        raw_url = repo_url.rstrip("/").replace(
            "github.com", "raw.githubusercontent.com") + "/main/README.md"
        response = requests.get(raw_url, timeout=10)
        if response.status_code == 200:
            return response.text
        return "README not available."
    except requests.exceptions.RequestException as e:
        return f"Error fetching README: {e}"


# Main application logic
def main():
    st.title("Repository Recommender System")
    st.write("Find Python repositories to learn production-level coding practices.")

    # Load resources
    tokenizer, model, device = load_model()
    with st.spinner("Loading dataset and generating embeddings. This may take a moment..."):
        try:
            data = load_data(tokenizer, model, device)
        except Exception as e:
            st.error(f"Error loading dataset: {e}")
            return

    # Input user query
    user_query = st.text_input(
        "Describe your project or learning goal:",
        "I am working on a project to recommend music using pandas and numpy.",
    )

    if user_query:
        with st.spinner("Processing your query..."):
            query_embedding = generate_embedding(user_query, tokenizer, model, device)

        try:
            # Compute similarity between the query and every repository embedding
            data['similarity'] = data['embedding'].apply(
                lambda emb: cosine_similarity([query_embedding], [np.array(emb)])[0][0]
            )

            # Sort by similarity and keep the top recommendations
            top_recommendations = (
                data.sort_values(by='similarity', ascending=False)
                .head(5)
            )

            # Display recommendations
            st.subheader("Top Recommendations")
            for idx, row in top_recommendations.iterrows():
                st.markdown(f"### {row['repo']}")
                st.write(f"**Path:** {row['path']}")
                st.write(f"**Summary:** {row['summary']}")
                st.write(f"**Similarity Score:** {row['similarity']:.2f}")
                st.markdown(f"[Repository Link]({row['url']})")

                # Fetch and display README
                st.subheader("Repository README")
                readme_content = fetch_readme(row['url'])
                st.code(readme_content)
        except Exception as e:
            st.error(f"Error computing recommendations: {e}")


if __name__ == "__main__":
    main()
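
# A minimal way to run this app locally (assumes the file is saved as app.py, and that
# streamlit, pandas, numpy, scikit-learn, transformers, torch, requests and datasets
# are installed):
#
#   streamlit run app.py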