aryan79 committed on
Commit
2bdf49a
·
verified ·
1 Parent(s): 8ae7ecd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -53
app.py CHANGED
@@ -2,8 +2,8 @@ import requests
2
  from bs4 import BeautifulSoup
3
  import pandas as pd
4
  import gradio as gr
5
- import os
6
- from groq import Groq
7
  import creds # Assuming creds.py holds your API key as creds.api_key
8
 
9
  # Step 1: Scrape the free courses from Analytics Vidhya
@@ -36,55 +36,40 @@ for course_card in soup.find_all('header', class_='course-card__img-container'):
36
  # Step 2: Create DataFrame
37
  df = pd.DataFrame(courses)
38
 
39
- # Step 3: Initialize the Groq client and set the API key
40
- client = Groq(api_key=creds.api_key)
 
 
 
41
 
42
- def search_courses(query):
43
- try:
44
- # Prepare the prompt for Groq
45
- prompt = f"""Given the following query: "{query}"
46
- Please analyze the query and rank the following courses based on their relevance to the query.
47
- Prioritize courses from Analytics Vidhya. Provide a relevance score from 0 to 1 for each course.
48
- Only return courses with a relevance score of 0.5 or higher.
49
- Return the results in the following format:
50
- Title: [Course Title]
51
- Relevance: [Score]
52
-
53
- Courses:
54
- {df['title'].to_string(index=False)}
55
- """
56
-
57
- # Get response from Groq
58
- response = client.chat.completions.create(
59
- model="mixtral-8x7b-32768",
60
- messages=[{"role": "system", "content": "You are an AI assistant specialized in course recommendations."},
61
- {"role": "user", "content": prompt}],
62
- temperature=0.2,
63
- max_tokens=1000
64
- )
65
 
66
- # Parse Groq's response
67
- results = []
68
- for line in response.choices[0].message.content.split('\n'):
69
- if line.startswith('Title:'):
70
- title = line.split('Title:')[1].strip()
71
- elif line.startswith('Relevance:'):
72
- relevance = float(line.split('Relevance:')[1].strip())
73
- if relevance >= 0.5:
74
- matching_courses = df[df['title'] == title]
75
- if not matching_courses.empty:
76
- course = matching_courses.iloc[0]
77
- results.append({
78
- 'title': title,
79
- 'image_url': course['image_url'],
80
- 'course_link': course['course_link'],
81
- 'score': relevance
82
- })
83
 
84
- return sorted(results, key=lambda x: x['score'], reverse=True)[:10]
85
-
86
- except Exception as e:
87
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  def gradio_search(query):
90
  result_list = search_courses(query)
@@ -115,8 +100,8 @@ def gradio_search(query):
115
  custom_css = """
116
  body {
117
  font-family: Arial, sans-serif;
118
- background-color: #121212; /* Dark background */
119
- color: #E0E0E0; /* Light text color for dark background */
120
  }
121
  .container {
122
  max-width: 800px;
@@ -130,7 +115,7 @@ body {
130
  justify-content: space-between;
131
  }
132
  .course-card {
133
- background-color: #1E1E1E; /* Darker card background */
134
  border-radius: 8px;
135
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.5);
136
  margin-bottom: 20px;
@@ -152,10 +137,10 @@ body {
152
  .course-info h3 {
153
  margin-top: 0;
154
  font-size: 18px;
155
- color: #E0E0E0; /* Light text color */
156
  }
157
  .course-info p {
158
- color: #B0B0B0; /* Slightly darker text color for contrast */
159
  font-size: 14px;
160
  margin-bottom: 10px;
161
  }
 
2
  from bs4 import BeautifulSoup
3
  import pandas as pd
4
  import gradio as gr
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
  import creds # Assuming creds.py holds your API key as creds.api_key
8
 
9
  # Step 1: Scrape the free courses from Analytics Vidhya
 
36
  # Step 2: Create DataFrame
37
  df = pd.DataFrame(courses)
38
 
39
+ # Step 3: Text Processing for Improved Relevance
40
+ def preprocess_text(text):
41
+ text = text.lower()
42
+ text = text.replace("-", " ")
43
+ return text
44
 
45
+ df['processed_title'] = df['title'].apply(preprocess_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
+ # Step 4: Generate TF-IDF Vectors for Titles
48
+ vectorizer = TfidfVectorizer(stop_words='english')
49
+ tfidf_matrix = vectorizer.fit_transform(df['processed_title'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
+ def search_courses(query):
52
+ # Process query and generate its TF-IDF vector
53
+ processed_query = preprocess_text(query)
54
+ query_vector = vectorizer.transform([processed_query])
55
+
56
+ # Calculate cosine similarity
57
+ similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
58
+ df['relevance_score'] = similarities
59
+
60
+ # Filter and sort courses based on relevance score
61
+ relevant_courses = df[df['relevance_score'] >= 0.3].sort_values(by='relevance_score', ascending=False)
62
+ results = []
63
+
64
+ for _, course in relevant_courses.iterrows():
65
+ results.append({
66
+ 'title': course['title'],
67
+ 'image_url': course['image_url'],
68
+ 'course_link': course['course_link'],
69
+ 'score': course['relevance_score']
70
+ })
71
+
72
+ return results[:10]
73
 
74
  def gradio_search(query):
75
  result_list = search_courses(query)
 
100
  custom_css = """
101
  body {
102
  font-family: Arial, sans-serif;
103
+ background-color: #121212;
104
+ color: #E0E0E0;
105
  }
106
  .container {
107
  max-width: 800px;
 
115
  justify-content: space-between;
116
  }
117
  .course-card {
118
+ background-color: #1E1E1E;
119
  border-radius: 8px;
120
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.5);
121
  margin-bottom: 20px;
 
137
  .course-info h3 {
138
  margin-top: 0;
139
  font-size: 18px;
140
+ color: #E0E0E0;
141
  }
142
  .course-info p {
143
+ color: #B0B0B0;
144
  font-size: 14px;
145
  margin-bottom: 10px;
146
  }