Kolumbus Lindh committed
Commit 05e61be · Parent: 9267889

gradio like/dislike

Files changed (1)
  1. gradio.py +155 -104
gradio.py CHANGED
@@ -2,21 +2,25 @@ import gradio as gr
 import PyPDF2
 import docx2txt
 import re
-import os
 from typing import Optional
 from datetime import datetime
 
-# If these come from separate modules, import them:
+# --- Import your custom modules
 from pinecone_handler import PineconeHandler
 from time_handling import read_timestamp
 from settings import DATE_FORMAT
 
 # ------------------------------------------------------------------
-# Original helper functions
+# Global or session-level store for job data
 # ------------------------------------------------------------------
+MAX_RESULTS = 10 # Up to 10 job ads displayed
+JOBS_CACHE = [None] * MAX_RESULTS # Each element will hold (ad_id, ad_metadata, full_resume_text)
 
+
+# ------------------------------------------------------------------
+# Helper functions (same as your original ones)
+# ------------------------------------------------------------------
 def extract_text_from_pdf(pdf_file) -> str:
-    """Extract text content from PDF file"""
     pdf_reader = PyPDF2.PdfReader(pdf_file)
     text = ""
     for page in pdf_reader.pages:
@@ -24,26 +28,20 @@ def extract_text_from_pdf(pdf_file) -> str:
     return text
 
 def extract_text_from_docx(docx_file) -> str:
-    """Extract text content from DOCX file"""
     text = docx2txt.process(docx_file)
     return text
 
 def extract_resume_text(uploaded_file) -> Optional[str]:
-    """Extract text from uploaded resume file"""
     if uploaded_file is None:
         return None
 
-    # Extract filename from the Gradio file object
     file_extension = uploaded_file.name.split('.')[-1].lower()
-
     try:
         if file_extension == 'pdf':
-            # Gradio’s uploaded_file is a tempfile-like object
             return extract_text_from_pdf(uploaded_file)
         elif file_extension in ['docx', 'doc']:
             return extract_text_from_docx(uploaded_file.name)
         elif file_extension == 'txt':
-            # Read entire text
             return uploaded_file.read().decode("utf-8", errors="replace")
         else:
             return f"ERROR: Unsupported file format: {file_extension}"
@@ -51,7 +49,6 @@ def extract_resume_text(uploaded_file) -> Optional[str]:
         return f"ERROR: {str(e)}"
 
 def clean_resume_text(text: str) -> str:
-    """Clean and process resume text"""
     if not text:
         return ""
     # Remove special characters and extra whitespace
@@ -59,16 +56,14 @@ def clean_resume_text(text: str) -> str:
     return text.strip()
 
 def is_description_truncated(description: str) -> bool:
-    """Check if the description appears to be truncated"""
     truncation_indicators = [
-        lambda x: len(x) >= 995, # Close to the 1000 char limit
+        lambda x: len(x) >= 995, # close to 1000 char limit
         lambda x: x.rstrip().endswith(('...', '…')),
        lambda x: re.search(r'\w+$', x) and not re.search(r'[.!?]$', x),
     ]
     return any(indicator(description) for indicator in truncation_indicators)
 
 def format_job_description(description: str, truncated: bool = False) -> str:
-    """Format job description text with sections, line breaks, etc."""
     if not description:
         return ""
 
@@ -82,15 +77,10 @@ def format_job_description(description: str, truncated: bool = False) -> str:
     formatted_text = description
     for section in sections:
         pattern = re.compile(f'({section}:?)', re.IGNORECASE)
-        formatted_text = pattern.sub(r'\n\n\1', formatted_text)
+        formatted_text = pattern.sub(r'\n\n\\1', formatted_text)
 
-    # Handle bullet points
     formatted_text = re.sub(r'[•-]\s*', '\n• ', formatted_text)
-
-    # Add line breaks for sentences that look like list items
     formatted_text = re.sub(r'(?<=\w)\.(?=\s*[A-Z])', '.\n', formatted_text)
-
-    # Reduce triple+ newlines to double
     formatted_text = re.sub(r'\n{3,}', '\n\n', formatted_text)
 
     if truncated:
@@ -98,128 +88,189 @@ def format_job_description(description: str, truncated: bool = False) -> str:
 
     return formatted_text.strip()
 
+
 # ------------------------------------------------------------------
-# Main Gradio function to handle user input and produce output
+# Callback for Like/Dislike
 # ------------------------------------------------------------------
+def user_interaction(index_in_cache, action):
+    """
+    index_in_cache: which job row's button was clicked (0..MAX_RESULTS-1)
+    action: 'like' or 'dislike'
+
+    We'll retrieve:
+    - ad_id
+    - resume_text
+    - possibly do something with them (e.g. store in DB)
+    """
+    if index_in_cache < 0 or index_in_cache >= MAX_RESULTS:
+        return "Invalid job index."
+
+    cached = JOBS_CACHE[index_in_cache]
+    if not cached:
+        return "No job data at this slot."
+
+    ad_id, metadata, full_resume_text = cached
+
+    # Example logging or storing
+    # In reality, you might store this info in a database or call an API
+    print(f"[USER_INTERACTION] Action={action}, AdID={ad_id}, CV length={len(full_resume_text)} chars.")
+
+    return f"You {action}d job {ad_id}."
 
+
+# ------------------------------------------------------------------
+# Callback to search jobs
+# ------------------------------------------------------------------
 def search_jobs(resume_file, num_results, city_filter):
     """
     1) Extract + clean resume
     2) Query Pinecone
-    3) Format the matching job ads
-    4) Return results as text (or HTML)
+    3) Populate the placeholders for up to MAX_RESULTS job ads
+    4) Return status message
     """
-    # If no file was uploaded
+    # Clear out global cache
+    for i in range(MAX_RESULTS):
+        JOBS_CACHE[i] = None
+
     if resume_file is None:
         return "Please upload a resume first."
-
+
     resume_text = extract_resume_text(resume_file)
     if resume_text is None or resume_text.startswith("ERROR"):
         return f"Error processing file: {resume_text}"
-
+
     clean_text = clean_resume_text(resume_text)
     if not clean_text:
         return "No text extracted from resume or file is invalid."
-
-    # Pinecone init
-    try:
-        handler = PineconeHandler()
-    except Exception as e:
-        return f"Error connecting to Pinecone: {str(e)}"
-
-    # Attempt to read timestamp for “Database Status”
-    database_info = ""
+
+    # Attempt to read the database update time
     try:
         last_update = read_timestamp()
         last_update_dt = datetime.strptime(last_update, DATE_FORMAT)
-        database_info = f"**Database last update:** {last_update_dt.strftime('%B %d, %Y at %I:%M %p')} (Stockholm Time)\n\n"
+        db_info = f"**Database last update:** {last_update_dt.strftime('%B %d, %Y at %I:%M %p')} (Stockholm Time)\n\n"
     except Exception as e:
-        database_info = f"Error reading timestamp: {str(e)}\n\n"
-
-    # Query Pinecone
+        db_info = f"Error reading timestamp: {str(e)}\n\n"
+
+    # Pinecone init
+    try:
+        handler = PineconeHandler()
+    except Exception as e:
+        return f"{db_info}Error connecting to Pinecone: {str(e)}"
+
+    # Search
     try:
         results = handler.search_similar_ads(
             clean_text, top_k=num_results, city=city_filter.strip()
         )
     except Exception as e:
-        return f"{database_info}Error searching jobs: {str(e)}"
-
+        return f"{db_info}Error searching jobs: {str(e)}"
+
     if not results:
-        return f"{database_info}No matching jobs found. Try a different city or fewer results."
-
-    # Build a nice text/HTML output
-    output_lines = []
-    output_lines.append(database_info)
-    output_lines.append(f"**Found {len(results)} matching jobs:**\n")
-
-    for i, match in enumerate(results, 1):
+        return f"{db_info}No matching jobs found."
+
+    # Fill up to MAX_RESULTS
+    text_output = [db_info + f"**Found {len(results)} matching jobs:**\n"]
+
+    for i, match in enumerate(results[:MAX_RESULTS]):
         metadata = match.metadata
         score = match.score
 
-        # Basic info
-        output_lines.append(f"### {i}. {metadata['headline']}")
-        output_lines.append(f"Match Score (Cosine): {score:.2f}")
-
-        if metadata.get('logo_url'):
-            # Gradio can't directly “embed” an image in text, but we can supply a link:
-            output_lines.append(f"Logo: {metadata['logo_url']}")
-
-        output_lines.append(f"**Location:** {metadata['city']}")
-        output_lines.append(f"**Occupation:** {metadata['occupation']}")
-        output_lines.append(f"**Published:** {metadata['published']}")
-
-        # Handle description
-        description = metadata['description']
-        is_trunc = is_description_truncated(description)
-        snippet = description[:2000] if is_trunc else description
-
-        formatted_desc = format_job_description(snippet, truncated=is_trunc)
-        output_lines.append(formatted_desc)
-
-        if is_trunc:
-            output_lines.append(
-                "> **Note**: The full description seems truncated. Please visit the original posting."
+        # We'll store data in our global JOBS_CACHE so user_interaction can retrieve it
+        # You might have an 'id' or something in metadata that you treat as the ad_id
+        ad_id = str(metadata.get('job_id', f"Unknown_{i}"))
+        JOBS_CACHE[i] = (ad_id, metadata, clean_text)
+
+        headline = metadata.get('headline', 'Untitled')
+        city = metadata.get('city', 'Unknown City')
+        occupation = metadata.get('occupation', 'Unknown Occupation')
+        published = metadata.get('published', 'Unknown Date')
+        desc = metadata.get('description', '')
+        truncated = is_description_truncated(desc)
+        snippet = desc[:2000] if truncated else desc
+        formatted_desc = format_job_description(snippet, truncated=truncated)
+
+        text_output.append(f"### {i+1}. {headline}")
+        text_output.append(f"**Ad ID**: `{ad_id}`")
+        text_output.append(f"**Match Score (Cosine)**: {score:.2f}")
+        text_output.append(f"**Location**: {city}")
+        text_output.append(f"**Occupation**: {occupation}")
+        text_output.append(f"**Published**: {published}")
+        text_output.append(formatted_desc or "*No description*")
+
+        if truncated:
+            text_output.append(
+                "> **Note**: Description truncated. See original link for full details."
             )
-        if metadata.get('webpage_url'):
-            output_lines.append(f"[View Original Job Posting]({metadata['webpage_url']})")
-
-        output_lines.append(f"**Contact:** {metadata['email']}")
-        output_lines.append("---")
+        if 'webpage_url' in metadata:
+            text_output.append(f"[View Original]({metadata['webpage_url']})")
 
-    return "\n".join(output_lines)
+        text_output.append("---")
+
+    return "\n".join(text_output)
 
 
 # ------------------------------------------------------------------
-# Build the Gradio interface
+# Build Gradio interface
 # ------------------------------------------------------------------
+def build_interface():
+    with gr.Blocks() as demo:
+        gr.Markdown("# AI-Powered Job Search (Gradio with Like/Dislike)")
 
-# We’ll combine the user inputs into a single function call
-with gr.Blocks() as demo:
-    gr.Markdown("# AI-Powered Job Search (Gradio Version)")
-    gr.Markdown(
-        "Tired of searching for jobs? Upload your resume and discover perfectly matched opportunities!"
-    )
-
-    with gr.Row():
-        resume_input = gr.File(label="Upload your resume (PDF, DOCX, DOC, or TXT)")
-        num_results_slider = gr.Slider(
-            minimum=1, maximum=20, value=5, step=1, label="Number of results"
-        )
-        city_input = gr.Textbox(
-            label="Filter by city (optional)",
-            placeholder="Enter a city to filter job results by location"
+        with gr.Row():
+            resume_input = gr.File(label="Upload your resume (PDF, DOCX, DOC, or TXT)")
+            num_results_slider = gr.Slider(
+                minimum=1, maximum=MAX_RESULTS, value=5,
+                step=1, label="Number of results"
+            )
+            city_input = gr.Textbox(
+                label="Filter by city (optional)",
+                placeholder="Enter a city to filter job results by location"
+            )
+
+        search_button = gr.Button("Search Jobs")
+        results_markdown = gr.Markdown()
+
+        # We create up to MAX_RESULTS rows for like/dislike
+        # Each row has two buttons that map to user_interaction
+        # We'll label them with the index so we can pass it to user_interaction
+        output_messages = []
+        for i in range(MAX_RESULTS):
+            with gr.Row(visible=True) as row_i:
+                # Each row: "Like" & "Dislike"
+                btn_like = gr.Button(f"Like #{i+1}", variant="secondary", visible=True)
+                btn_dislike = gr.Button(f"Dislike #{i+1}", variant="secondary", visible=True)
+
+            # user_interaction callback => returns a small message
+            msg = gr.Markdown(visible=True)
+            output_messages.append(msg)
+
+            # Wire the buttons to user_interaction
+            # We pass:
+            # - The index in the JOBS_CACHE
+            # - The literal string 'like' or 'dislike'
+            # The function returns a small text update
+            btn_like.click(
+                fn=user_interaction,
+                inputs=[gr.State(i), gr.State("like")],
+                outputs=[msg]
+            )
+            btn_dislike.click(
+                fn=user_interaction,
+                inputs=[gr.State(i), gr.State("dislike")],
+                outputs=[msg]
+            )
+
+        # On search click => call search_jobs
+        # outputs => results_markdown (which displays the job list)
+        search_button.click(
+            fn=search_jobs,
+            inputs=[resume_input, num_results_slider, city_input],
+            outputs=[results_markdown]
         )
-
-    search_button = gr.Button("Search Jobs")
-    output_box = gr.Markdown()
 
-    # When the user clicks the button, call search_jobs()
-    search_button.click(
-        fn=search_jobs,
-        inputs=[resume_input, num_results_slider, city_input],
-        outputs=[output_box]
-    )
+    return demo
 
 
 if __name__ == "__main__":
-    demo.launch()
+    demo_app = build_interface()
+    demo_app.launch()
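
For reference, a minimal, self-contained sketch of the wiring pattern this commit introduces: each Like/Dislike button passes constant gr.State values (a row index and an action string) into one shared callback. The names below (on_vote, the three-row count) are illustrative placeholders and are not taken from gradio.py.

import gradio as gr

def on_vote(index: int, action: str) -> str:
    # index and action arrive as the constant values held by the gr.State inputs
    return f"You {action}d item #{index + 1}."

with gr.Blocks() as demo:
    for i in range(3):  # placeholder row count instead of MAX_RESULTS
        with gr.Row():
            like = gr.Button(f"Like #{i + 1}")
            dislike = gr.Button(f"Dislike #{i + 1}")
        msg = gr.Markdown()
        like.click(fn=on_vote, inputs=[gr.State(i), gr.State("like")], outputs=[msg])
        dislike.click(fn=on_vote, inputs=[gr.State(i), gr.State("dislike")], outputs=[msg])

if __name__ == "__main__":
    demo.launch()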