Update app.py
app.py CHANGED
@@ -2,8 +2,8 @@ import requests
 from bs4 import BeautifulSoup
 import pandas as pd
 import gradio as gr
-import
-from
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 import creds  # Assuming creds.py holds your API key as creds.api_key
 
 # Step 1: Scrape the free courses from Analytics Vidhya
@@ -36,55 +36,40 @@ for course_card in soup.find_all('header', class_='course-card__img-container'):
 # Step 2: Create DataFrame
 df = pd.DataFrame(courses)
 
-# Step 3:
-
+# Step 3: Text Processing for Improved Relevance
+def preprocess_text(text):
+    text = text.lower()
+    text = text.replace("-", " ")
+    return text
 
-
-try:
-    # Prepare the prompt for Groq
-    prompt = f"""Given the following query: "{query}"
-    Please analyze the query and rank the following courses based on their relevance to the query.
-    Prioritize courses from Analytics Vidhya. Provide a relevance score from 0 to 1 for each course.
-    Only return courses with a relevance score of 0.5 or higher.
-    Return the results in the following format:
-    Title: [Course Title]
-    Relevance: [Score]
-
-    Courses:
-    {df['title'].to_string(index=False)}
-    """
-
-    # Get response from Groq
-    response = client.chat.completions.create(
-        model="mixtral-8x7b-32768",
-        messages=[{"role": "system", "content": "You are an AI assistant specialized in course recommendations."},
-                  {"role": "user", "content": prompt}],
-        temperature=0.2,
-        max_tokens=1000
-    )
+df['processed_title'] = df['title'].apply(preprocess_text)
 
-
-
-
-        if line.startswith('Title:'):
-            title = line.split('Title:')[1].strip()
-        elif line.startswith('Relevance:'):
-            relevance = float(line.split('Relevance:')[1].strip())
-            if relevance >= 0.5:
-                matching_courses = df[df['title'] == title]
-                if not matching_courses.empty:
-                    course = matching_courses.iloc[0]
-                    results.append({
-                        'title': title,
-                        'image_url': course['image_url'],
-                        'course_link': course['course_link'],
-                        'score': relevance
-                    })
+# Step 4: Generate TF-IDF Vectors for Titles
+vectorizer = TfidfVectorizer(stop_words='english')
+tfidf_matrix = vectorizer.fit_transform(df['processed_title'])
 
-
-
-
-
+def search_courses(query):
+    # Process query and generate its TF-IDF vector
+    processed_query = preprocess_text(query)
+    query_vector = vectorizer.transform([processed_query])
+
+    # Calculate cosine similarity
+    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
+    df['relevance_score'] = similarities
+
+    # Filter and sort courses based on relevance score
+    relevant_courses = df[df['relevance_score'] >= 0.3].sort_values(by='relevance_score', ascending=False)
+    results = []
+
+    for _, course in relevant_courses.iterrows():
+        results.append({
+            'title': course['title'],
+            'image_url': course['image_url'],
+            'course_link': course['course_link'],
+            'score': course['relevance_score']
+        })
+
+    return results[:10]
 
 def gradio_search(query):
     result_list = search_courses(query)
@@ -115,8 +100,8 @@ def gradio_search(query):
 custom_css = """
 body {
     font-family: Arial, sans-serif;
-    background-color: #121212;
-    color: #E0E0E0;
+    background-color: #121212;
+    color: #E0E0E0;
 }
 .container {
     max-width: 800px;
@@ -130,7 +115,7 @@ body {
     justify-content: space-between;
 }
 .course-card {
-    background-color: #1E1E1E;
+    background-color: #1E1E1E;
     border-radius: 8px;
     box-shadow: 0 2px 4px rgba(0, 0, 0, 0.5);
     margin-bottom: 20px;
@@ -152,10 +137,10 @@ body {
 .course-info h3 {
     margin-top: 0;
     font-size: 18px;
-    color: #E0E0E0;
+    color: #E0E0E0;
 }
 .course-info p {
-    color: #B0B0B0;
+    color: #B0B0B0;
     font-size: 14px;
     margin-bottom: 10px;
 }