Update app.py
app.py CHANGED
@@ -2,8 +2,8 @@ import requests
 from bs4 import BeautifulSoup
 import pandas as pd
 import gradio as gr
-import
-from
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 import creds  # Assuming creds.py holds your API key as creds.api_key
 
 # Step 1: Scrape the free courses from Analytics Vidhya
@@ -36,55 +36,40 @@ for course_card in soup.find_all('header', class_='course-card__img-container'):
 # Step 2: Create DataFrame
 df = pd.DataFrame(courses)
 
-# Step 3:
-
+# Step 3: Text Processing for Improved Relevance
+def preprocess_text(text):
+    text = text.lower()
+    text = text.replace("-", " ")
+    return text
 
-
-try:
-    # Prepare the prompt for Groq
-    prompt = f"""Given the following query: "{query}"
-    Please analyze the query and rank the following courses based on their relevance to the query.
-    Prioritize courses from Analytics Vidhya. Provide a relevance score from 0 to 1 for each course.
-    Only return courses with a relevance score of 0.5 or higher.
-    Return the results in the following format:
-    Title: [Course Title]
-    Relevance: [Score]
-
-    Courses:
-    {df['title'].to_string(index=False)}
-    """
-
-    # Get response from Groq
-    response = client.chat.completions.create(
-        model="mixtral-8x7b-32768",
-        messages=[{"role": "system", "content": "You are an AI assistant specialized in course recommendations."},
-                  {"role": "user", "content": prompt}],
-        temperature=0.2,
-        max_tokens=1000
-    )
+df['processed_title'] = df['title'].apply(preprocess_text)
 
-
-
-
-        if line.startswith('Title:'):
-            title = line.split('Title:')[1].strip()
-        elif line.startswith('Relevance:'):
-            relevance = float(line.split('Relevance:')[1].strip())
-            if relevance >= 0.5:
-                matching_courses = df[df['title'] == title]
-                if not matching_courses.empty:
-                    course = matching_courses.iloc[0]
-                    results.append({
-                        'title': title,
-                        'image_url': course['image_url'],
-                        'course_link': course['course_link'],
-                        'score': relevance
-                    })
+# Step 4: Generate TF-IDF Vectors for Titles
+vectorizer = TfidfVectorizer(stop_words='english')
+tfidf_matrix = vectorizer.fit_transform(df['processed_title'])
 
-
-
-
-
+def search_courses(query):
+    # Process query and generate its TF-IDF vector
+    processed_query = preprocess_text(query)
+    query_vector = vectorizer.transform([processed_query])
+
+    # Calculate cosine similarity
+    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
+    df['relevance_score'] = similarities
+
+    # Filter and sort courses based on relevance score
+    relevant_courses = df[df['relevance_score'] >= 0.3].sort_values(by='relevance_score', ascending=False)
+    results = []
+
+    for _, course in relevant_courses.iterrows():
+        results.append({
+            'title': course['title'],
+            'image_url': course['image_url'],
+            'course_link': course['course_link'],
+            'score': course['relevance_score']
+        })
+
+    return results[:10]
 
 def gradio_search(query):
     result_list = search_courses(query)
@@ -115,8 +100,8 @@ def gradio_search(query):
 custom_css = """
 body {
     font-family: Arial, sans-serif;
-    background-color: #121212;
-    color: #E0E0E0;
+    background-color: #121212;
+    color: #E0E0E0;
 }
 .container {
     max-width: 800px;
@@ -130,7 +115,7 @@ body {
     justify-content: space-between;
 }
 .course-card {
-    background-color: #1E1E1E;
+    background-color: #1E1E1E;
     border-radius: 8px;
     box-shadow: 0 2px 4px rgba(0, 0, 0, 0.5);
     margin-bottom: 20px;
@@ -152,10 +137,10 @@ body {
 .course-info h3 {
     margin-top: 0;
     font-size: 18px;
-    color: #E0E0E0;
+    color: #E0E0E0;
 }
 .course-info p {
-    color: #B0B0B0;
+    color: #B0B0B0;
     font-size: 14px;
     margin-bottom: 10px;
 }