ElegantSolutions committed on
Commit 774bf29 · verified · 1 Parent(s): d495373

Create app.py

Files changed (1)
app.py +151 -0
app.py ADDED
@@ -0,0 +1,151 @@
+ import streamlit as st
+ import pandas as pd
+ import requests
+ import re
+ import tempfile
+ import shutil
+ import os
+ from difflib import SequenceMatcher
+ import json
+ from urllib.parse import quote_plus
+
+ # -----------------------------------------------
+ # 🔧 UTILITY FUNCTIONS
+ # -----------------------------------------------
+
+ def construct_query(row):
+     """Constructs the Google search query using applicant data."""
+     query = str(row['Applicant Name'])
+     optional_fields = ['Job Title', 'State', 'City', 'Skills']
+
+     for field in optional_fields:
+         if field in row and pd.notna(row[field]):
+             value = row[field]
+             query += f" {str(value).strip()}" if str(value).strip() else ""
+
+     query += " linkedin"
+     print(f"[DEBUG] Search Query: {query}")
+     return query
+
+ def get_name_from_url(link):
+     """Extracts the name part from a LinkedIn profile URL."""
+     match = re.search(r'linkedin\.com/in/([a-zA-Z0-9-]+)', link)
+     if match:
+         profile_name = match.group(1).replace('-', ' ')
+         print(f"[DEBUG] Extracted profile name from URL: {profile_name}")
+         return profile_name
+     return None
+
+ def calculate_similarity(name1, name2):
+     """Calculates similarity between two names."""
+     similarity = SequenceMatcher(None, name1.lower().strip(), name2.lower().strip()).ratio()
+     print(f"[DEBUG] Similarity between '{name1}' and '{name2}' = {similarity}")
+     return similarity
+
+ # -----------------------------------------------
+ # 🔍 LINKEDIN SCRAPER FUNCTION
+ # -----------------------------------------------
+
+ def fetch_linkedin_links(query, api_key, applicant_name):
+     """Fetches LinkedIn profile links using BrightData SERP scraping API."""
+     try:
+         print(f"[DEBUG] Sending request to BrightData for query: {query}")
+         url = "https://api.brightdata.com/request"
+         google_url = f"https://www.google.com/search?q={quote_plus(query)}"
+
+         payload = {
+             "zone": "serp_api2",
+             "url": google_url,
+             "method": "GET",
+             "country": "us",
+             "format": "raw",
+             "data_format": "html"
+         }
+
+         headers = {
+             "Authorization": f"Bearer {api_key}",
+             "Content-Type": "application/json"
+         }
+
+         response = requests.post(url, headers=headers, json=payload)
+         response.raise_for_status()
+         html = response.text
+
+         linkedin_regex = r'https://(?:[a-z]{2,3}\.)?linkedin\.com/in/[a-zA-Z0-9\-_/]+'
+         matches = re.findall(linkedin_regex, html)
+         print(f"[DEBUG] Found {len(matches)} LinkedIn link(s) in search result")
+
+         for link in matches:
+             profile_name = get_name_from_url(link)
+             if profile_name:
+                 similarity = calculate_similarity(applicant_name, profile_name)
+                 if similarity >= 0.5:
+                     print(f"[DEBUG] Match found: {link}")
+                     return link
+         print(f"[DEBUG] No matching LinkedIn profile found for: {applicant_name}")
+         return None
+
+     except Exception as e:
+         print(f"[ERROR] Error fetching LinkedIn link for query '{query}': {e}")
+         return None
+
+ # -----------------------------------------------
+ # 📂 PROCESS FILE FUNCTION
+ # -----------------------------------------------
+
+ def process_file(file, api_key):
+     """Processes the uploaded Excel file to fetch LinkedIn profile links."""
+     try:
+         df = pd.read_excel(file)
+         print(f"[DEBUG] Input file read successfully. Rows: {len(df)}")
+
+         if 'Applicant Name' not in df.columns:
+             raise ValueError("Missing required column: 'Applicant Name'")
+
+         df = df[df['Applicant Name'].notna()]
+         df = df[df['Applicant Name'].str.strip() != '']
+         print(f"[DEBUG] Valid applicant rows after filtering: {len(df)}")
+
+         df['Search Query'] = df.apply(construct_query, axis=1)
+         df['LinkedIn Link'] = df.apply(
+             lambda row: fetch_linkedin_links(row['Search Query'], api_key, row['Applicant Name']),
+             axis=1
+         )
+
+         temp_dir = tempfile.mkdtemp()
+         output_file = os.path.join(temp_dir, "updated_with_linkedin_links.csv")
+         df.to_csv(output_file, index=False)
+         print(f"[DEBUG] Output written to: {output_file}")
+         return output_file
+
+     except Exception as e:
+         print(f"[ERROR] Error processing file: {e}")
+         st.error(f"Error processing file: {e}")
+         return None
+
+ # -----------------------------------------------
+ # 🌐 STREAMLIT INTERFACE
+ # -----------------------------------------------
+
+ st.set_page_config(page_title="LinkedIn Profile Scraper", layout="centered")
+ st.title("🔗 LinkedIn Profile Link Scraper")
+ st.markdown("Upload an Excel file with applicant details to fetch best-matching LinkedIn profile links.")
+
+ api_key = st.text_input("Enter your BrightData SERP API Key", type="password")
+ uploaded_file = st.file_uploader("Upload Excel File (.xlsx)", type=["xlsx"])
+
+ if uploaded_file and api_key:
+     st.info("⏳ Processing file... This may take a moment.")
+     output_file = process_file(uploaded_file, api_key)
+     if output_file:
+         with open(output_file, "rb") as f:
+             st.success("✅ Processing complete. Download the updated file below.")
+             st.download_button(
+                 label="📥 Download CSV with LinkedIn Links",
+                 data=f,
+                 file_name="updated_with_linkedin_links.csv",
+                 mime="text/csv"
+             )
+         shutil.rmtree(os.path.dirname(output_file))  # Cleanup temp directory
+ elif not api_key:
+     st.warning("⚠️ Please enter your BrightData SERP API key to proceed.")
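
Note on usage: process_file() requires an 'Applicant Name' column and treats 'Job Title', 'State', 'City', and 'Skills' as optional extras appended to the Google query; candidate links are accepted when the SequenceMatcher ratio between the applicant name and the /in/ slug (hyphens replaced by spaces) is at least 0.5. A minimal sketch of a compatible input file follows; the names, values, and file name are hypothetical, and writing .xlsx with DataFrame.to_excel assumes openpyxl is installed:

import pandas as pd

# Hypothetical sample input; only 'Applicant Name' is required by process_file().
# Optional columns, when present and non-empty, are added to the search query.
sample = pd.DataFrame([
    {"Applicant Name": "Jane Doe", "Job Title": "Data Analyst",
     "State": "California", "City": "San Francisco", "Skills": "SQL, Python"},
    {"Applicant Name": "John Smith"},  # optional columns may be omitted entirely
])
sample.to_excel("applicants.xlsx", index=False)  # hypothetical file name

The app itself is started with `streamlit run app.py`; at runtime it expects a BrightData SERP API key for the zone named "serp_api2", as hard-coded in the request payload.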