ElegantSolutions committed on
Commit
eb340f9
Β·
verified Β·
1 Parent(s): e551906

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +168 -0
app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import requests
4
+ import re
5
+ import tempfile
6
+ import shutil
7
+ import os
8
+ from difflib import SequenceMatcher
9
+ import json
10
+ from urllib.parse import quote_plus
11
+ import base64
12
+
13
+ # -----------------------------------------------
14
+ # πŸ”§ UTILITY FUNCTIONS
15
+ # -----------------------------------------------
16
+
17
def construct_query(row):
    """Build the Google search string for one applicant row.

    Always starts with the applicant's name, appends any optional fields
    that are present and non-blank, and ends with the literal "linkedin".
    """
    parts = [str(row['Applicant Name'])]
    for col in ('Job Title', 'State', 'City', 'Skills'):
        if col in row and pd.notna(row[col]):
            text = str(row[col]).strip()
            if text:
                parts.append(text)
    query = " ".join(parts) + " linkedin"
    print(f"[DEBUG] Search Query: {query}")
    return query
30
+
31
def get_name_from_url(link):
    """Return the profile slug of a LinkedIn URL as a spaced name, or None."""
    found = re.search(r'linkedin\.com/in/([a-zA-Z0-9-]+)', link)
    if not found:
        return None
    # Slugs use hyphens between name parts; swap them for spaces.
    profile_name = found.group(1).replace('-', ' ')
    print(f"[DEBUG] Extracted profile name from URL: {profile_name}")
    return profile_name
39
+
40
def calculate_similarity(name1, name2):
    """Return a 0..1 ratio of how alike two names are, ignoring case/padding."""
    left = name1.lower().strip()
    right = name2.lower().strip()
    similarity = SequenceMatcher(None, left, right).ratio()
    print(f"[DEBUG] Similarity between '{name1}' and '{name2}' = {similarity}")
    return similarity
45
+
46
+ # -----------------------------------------------
47
+ # πŸ” LINKEDIN SCRAPER FUNCTION
48
+ # -----------------------------------------------
49
+
50
def fetch_linkedin_links(query, api_key, applicant_name):
    """Fetch the best-matching LinkedIn profile link for an applicant.

    Routes the Google search `query` through BrightData's SERP scraping API,
    scans the raw result HTML for linkedin.com/in/ URLs, and returns the
    first link whose profile slug is at least 50% similar to the applicant's
    name (per calculate_similarity).

    Args:
        query: Full Google search string (see construct_query).
        api_key: BrightData API token, sent as a Bearer credential.
        applicant_name: Name compared against each candidate profile slug.

    Returns:
        The matching profile URL, or None when nothing matches or the
        request fails — errors are logged, never raised, so one bad row
        cannot abort a whole batch.
    """
    try:
        print(f"[DEBUG] Sending request to BrightData for query: {query}")
        url = "https://api.brightdata.com/request"
        google_url = f"https://www.google.com/search?q={quote_plus(query)}"

        payload = {
            "zone": "serp_api2",
            "url": google_url,
            "method": "GET",
            "country": "us",
            "format": "raw",
            "data_format": "html"
        }

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        # BUG FIX: the original had no timeout, so a stalled upstream request
        # would hang the Streamlit app indefinitely. A timeout raises
        # requests.Timeout, which the except below converts to None.
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        html = response.text

        # Optional 2-3 letter country subdomains (uk., www is not matched
        # but plain linkedin.com is) followed by a profile slug.
        linkedin_regex = r'https://(?:[a-z]{2,3}\.)?linkedin\.com/in/[a-zA-Z0-9\-_/]+'
        matches = re.findall(linkedin_regex, html)
        print(f"[DEBUG] Found {len(matches)} LinkedIn link(s) in search result")

        for link in matches:
            profile_name = get_name_from_url(link)
            if profile_name:
                similarity = calculate_similarity(applicant_name, profile_name)
                # 0.5 threshold: accept slugs that are at least half-similar
                # to the applicant's name.
                if similarity >= 0.5:
                    print(f"[DEBUG] Match found: {link}")
                    return link
        print(f"[DEBUG] No matching LinkedIn profile found for: {applicant_name}")
        return None

    except Exception as e:
        # Deliberate best-effort: degrade any network/HTTP failure to "no link".
        print(f"[ERROR] Error fetching LinkedIn link for query '{query}': {e}")
        return None
92
+
93
+ # -----------------------------------------------
94
+ # πŸ“‚ PROCESS FILE FUNCTION
95
+ # -----------------------------------------------
96
+
97
def process_file(file, api_key):
    """Read an applicant Excel file and annotate each row with a LinkedIn link.

    Args:
        file: File-like object or path accepted by pandas.read_excel;
              must contain an 'Applicant Name' column.
        api_key: BrightData API token, forwarded to fetch_linkedin_links.

    Returns:
        Path to a CSV (with added 'Search Query' and 'LinkedIn Link'
        columns) inside a freshly created temp directory — the caller owns
        cleanup — or None on any failure (the error is also shown in the UI).

    Raises:
        Nothing: all exceptions are caught, logged, and surfaced via st.error.
    """
    try:
        df = pd.read_excel(file)
        print(f"[DEBUG] Input file read successfully. Rows: {len(df)}")

        if 'Applicant Name' not in df.columns:
            raise ValueError("Missing required column: 'Applicant Name'")

        # Drop missing and blank names. BUG FIX: astype(str) is required —
        # the .str accessor yields NaN for non-string cells (e.g. purely
        # numeric names), and `NaN != ''` evaluates False in pandas, so such
        # rows were silently discarded by the original filter.
        df = df[df['Applicant Name'].notna()]
        df = df[df['Applicant Name'].astype(str).str.strip() != '']
        print(f"[DEBUG] Valid applicant rows after filtering: {len(df)}")

        df['Search Query'] = df.apply(construct_query, axis=1)
        # One network round-trip per row; slow for large files.
        df['LinkedIn Link'] = df.apply(
            lambda row: fetch_linkedin_links(row['Search Query'], api_key, row['Applicant Name']),
            axis=1
        )

        temp_dir = tempfile.mkdtemp()
        output_file = os.path.join(temp_dir, "updated_with_linkedin_links.csv")
        df.to_csv(output_file, index=False)
        print(f"[DEBUG] Output written to: {output_file}")
        return output_file

    except Exception as e:
        print(f"[ERROR] Error processing file: {e}")
        st.error(f"Error processing file: {e}")
        return None
126
+
127
+ # -----------------------------------------------
128
+ # 🌐 STREAMLIT INTERFACE
129
+ # -----------------------------------------------
130
+
131
st.set_page_config(page_title="LinkedIn Profile Scraper", layout="centered")
st.title("πŸ”— LinkedIn Profile Link Scraper")
st.markdown("Upload an Excel file with applicant details to fetch best-matching LinkedIn profile links.")

api_key = st.text_input("Enter your BrightData SERP API Key", type="password")
uploaded_file = st.file_uploader("Upload Excel File (.xlsx)", type=["xlsx"])

if uploaded_file and api_key:
    st.info("⏳ Processing file... This may take a moment.")
    output_file = process_file(uploaded_file, api_key)
    if output_file:
        # Read the whole CSV into memory once; the temp dir is removed below,
        # so every later use must go through these bytes, not the file.
        with open(output_file, "rb") as f:
            csv_bytes = f.read()

        # Hidden anchor with a base64 data URI to trigger an automatic download.
        b64 = base64.b64encode(csv_bytes).decode()
        href = f'<a href="data:text/csv;base64,{b64}" download="updated_with_linkedin_links.csv" id="auto-download-link"></a>'

        st.markdown(href, unsafe_allow_html=True)
        # NOTE(review): Streamlit sanitizes <script> tags in markdown, so this
        # auto-click likely never fires — the manual button below is the
        # reliable path. Confirm before relying on auto-download.
        st.markdown(
            """
            <script>
            document.getElementById('auto-download-link').click();
            </script>
            """,
            unsafe_allow_html=True
        )

        st.success("βœ… Processing complete. Your file is downloading automatically! Or click below if it didn't start.")
        st.download_button(
            label="πŸ“₯ Download CSV with LinkedIn Links",
            # BUG FIX: the original passed `data=f`, a file handle already
            # closed by the `with` block, which raises on read. Pass the
            # bytes that were read while the file was open.
            data=csv_bytes,
            file_name="updated_with_linkedin_links.csv",
            mime="text/csv"
        )
        shutil.rmtree(os.path.dirname(output_file))  # Cleanup temp directory
elif not api_key:
    st.warning("⚠️ Please enter your BrightData SERP API key to proceed.")