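"""Streamlit app that reads applicant details from an uploaded Excel file,
searches Google (via BrightData) for matching LinkedIn profiles, and returns
the results as a downloadable CSV."""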
import streamlit as st
import pandas as pd
import requests
import re
import tempfile
import shutil
import os
from difflib import SequenceMatcher
import json
from urllib.parse import quote_plus
def construct_query(row):
    """Constructs the Google search query using applicant data."""
    query = str(row['Applicant Name'])
    optional_fields = ['Job Title', 'State', 'City', 'Skills']
    for field in optional_fields:
        if field in row and pd.notna(row[field]):
            value = row[field]
            if isinstance(value, str) and value.strip():
                query += f" {value.strip()}"
            elif not isinstance(value, str):
                query += f" {str(value).strip()}"
    query += " linkedin"
    return query
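# Example with a hypothetical row:
#   {'Applicant Name': 'Jane Doe', 'Job Title': 'Data Analyst', 'City': 'Austin'}
#   -> "Jane Doe Data Analyst Austin linkedin"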
def get_name_from_url(link):
    """Extracts the name part from a LinkedIn profile URL."""
    match = re.search(r'linkedin\.com/in/([a-zA-Z0-9-]+)', link)
    if match:
        return match.group(1).replace('-', ' ')
    return None
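# Example: get_name_from_url("https://www.linkedin.com/in/jane-doe-12345")
#   -> "jane doe 12345" (hyphens in the URL slug become spaces)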
def calculate_similarity(name1, name2):
    """Calculates similarity between two names."""
    return SequenceMatcher(None, name1.lower().strip(), name2.lower().strip()).ratio()
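# Example: calculate_similarity("Jane Doe", "jane doe 12345") is roughly 0.7,
# so it clears the 0.5 threshold used below; identical names score 1.0.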
# Previous implementation using the BrightData SERP API endpoint, kept for reference:
# def fetch_linkedin_links(query, api_key, applicant_name):
#     """Fetches LinkedIn profile links using BrightData SERP API."""
#     linkedin_regex = r'https://(www|[a-z]{2})\.linkedin\.com/.*'
#     try:
#         response = requests.get(
#             "https://serpapi.brightdata.com/google/search",
#             params={
#                 "q": query,
#                 "num": 5,
#                 "api_key": api_key
#             }
#         )
#         response.raise_for_status()
#         results = response.json()
#         organic_results = results.get("organic_results", [])
#         for result in organic_results:
#             link = result.get("link")
#             if re.match(linkedin_regex, link):
#                 profile_name = get_name_from_url(link)
#                 if profile_name:
#                     similarity = calculate_similarity(applicant_name, profile_name)
#                     if similarity >= 0.5:
#                         return link
#         return None
#     except Exception as e:
#         st.error(f"Error fetching link for query '{query}': {e}")
#         return None
def fetch_linkedin_links(query, api_key, applicant_name):
    """Fetches LinkedIn profile links using the BrightData SERP scraping API."""
    try:
        url = "https://api.brightdata.com/request"
        google_url = f"https://www.google.com/search?q={quote_plus(query)}"
        payload = {
            "zone": "serp_api2",  # BrightData zone name; must match a SERP zone configured in your account
            "url": google_url,
            "method": "GET",
            "country": "us",
            "format": "raw",
            "data_format": "html"
        }
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()
        html = response.text
        # Match standard LinkedIn profile URLs in the raw search-results HTML.
        linkedin_regex = r'https://(?:[a-z]{2,3}\.)?linkedin\.com/in/[a-zA-Z0-9\-_/]+'
        matches = re.findall(linkedin_regex, html)
        for link in matches:
            profile_name = get_name_from_url(link)
            if profile_name:
                similarity = calculate_similarity(applicant_name, profile_name)
                # Accept the first profile whose URL slug is reasonably close to the applicant's name.
                if similarity >= 0.5:
                    return link
        return None
    except Exception as e:
        st.error(f"Error fetching link for query '{query}': {e}")
        return None
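# Standalone usage sketch (hypothetical key and query; normally called from process_file):
#   link = fetch_linkedin_links("Jane Doe Data Analyst Austin linkedin", "YOUR_BRIGHTDATA_KEY", "Jane Doe")
# Returns the first linkedin.com/in/... URL whose slug is at least 50% similar to the
# applicant name, or None if nothing matches or the request fails.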
def process_file(file, api_key):
    """Processes the uploaded Excel file to fetch LinkedIn profile links."""
    try:
        df = pd.read_excel(file)
        # Drop rows with missing or blank applicant names.
        df = df[df['Applicant Name'].notna()]
        df = df[df['Applicant Name'].astype(str).str.strip() != '']
        df['Search Query'] = df.apply(construct_query, axis=1)
        df['LinkedIn Link'] = df.apply(
            lambda row: fetch_linkedin_links(row['Search Query'], api_key, row['Applicant Name']),
            axis=1
        )
        temp_dir = tempfile.mkdtemp()
        output_file = os.path.join(temp_dir, "updated_with_linkedin_links.csv")
        df.to_csv(output_file, index=False)
        return output_file
    except Exception as e:
        st.error(f"Error processing file: {e}")
        return None
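# process_file writes the annotated CSV into a fresh temporary directory and returns its
# path; the Streamlit handler below deletes that directory after offering the download.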
# Streamlit UI
st.title("LinkedIn Profile Link Scraper")
st.markdown("Upload an Excel file with applicant details, and get a CSV with LinkedIn profile links.")
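# The uploaded sheet must contain an 'Applicant Name' column; 'Job Title', 'State',
# 'City', and 'Skills' columns are optional and, when present, refine the search query.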
api_key = st.text_input("Enter your BrightData SERP API Key", type="password")
uploaded_file = st.file_uploader("Upload Excel File", type=["xlsx"])
if uploaded_file and api_key:
    st.write("Processing file...")
    output_file = process_file(uploaded_file, api_key)
    if output_file:
        with open(output_file, "rb") as f:
            st.download_button(
                label="Download Updated CSV",
                data=f,
                file_name="updated_with_linkedin_links.csv",
                mime="text/csv"
            )
        shutil.rmtree(os.path.dirname(output_file))
elif not api_key:
    st.warning("Please enter your BrightData SERP API key to proceed.")