"""Streamlit app: look up LinkedIn profile links for applicants in an Excel file.

The user uploads an ``.xlsx`` file with an ``Applicant Name`` column (plus
optional ``Job Title``, ``State``, ``City`` and ``Skills`` columns). For each
applicant a Google query is sent through the BrightData SERP endpoint, and the
first LinkedIn profile whose URL slug resembles the applicant's name is written
to a ``LinkedIn Link`` column of a downloadable CSV.
"""

import os
import re
import shutil
import tempfile
from difflib import SequenceMatcher

import pandas as pd
import requests
import streamlit as st

# Columns appended to the search query when present and non-empty.
_OPTIONAL_QUERY_FIELDS = ('Job Title', 'State', 'City', 'Skills')

# Any LinkedIn URL on the "www" host or a two-letter country subdomain.
_LINKEDIN_URL_RE = re.compile(r'https://(www|[a-z]{2})\.linkedin\.com/.*')

# Captures the profile slug from ".../in/<slug>".
_LINKEDIN_PROFILE_RE = re.compile(r'linkedin\.com/in/([a-zA-Z0-9-]+)')

_SERP_ENDPOINT = "https://serpapi.brightdata.com/google/search"

# Without a timeout a single stalled connection blocks the whole run.
_REQUEST_TIMEOUT_SECONDS = 30

# Minimum SequenceMatcher ratio between applicant name and URL slug.
_MIN_NAME_SIMILARITY = 0.5

_OUTPUT_FILENAME = "updated_with_linkedin_links.csv"


def construct_query(row):
    """Build the Google search query for one applicant row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) containing
            'Applicant Name' and, optionally, any of the columns in
            ``_OPTIONAL_QUERY_FIELDS``.

    Returns:
        The applicant name, followed by every present, non-null, non-blank
        optional field, followed by the word "linkedin", space separated.
    """
    parts = [str(row['Applicant Name'])]
    for field in _OPTIONAL_QUERY_FIELDS:
        if field in row and pd.notna(row[field]):
            text = str(row[field]).strip()
            if text:
                parts.append(text)
    parts.append("linkedin")
    return " ".join(parts)


def get_name_from_url(link):
    """Extract the name-like slug from a LinkedIn profile URL.

    Args:
        link: Candidate URL. Non-string values are tolerated.

    Returns:
        The ``/in/<slug>`` part with hyphens replaced by spaces, or None if
        *link* is not a string or does not contain a profile slug.
    """
    if not isinstance(link, str):
        return None
    match = _LINKEDIN_PROFILE_RE.search(link)
    if match:
        return match.group(1).replace('-', ' ')
    return None


def calculate_similarity(name1, name2):
    """Return the SequenceMatcher ratio (0.0-1.0) between two names.

    Both values are coerced to ``str`` first, because spreadsheet cells may
    arrive as numbers; comparison is case-insensitive and ignores
    leading/trailing whitespace.
    """
    return SequenceMatcher(
        None,
        str(name1).lower().strip(),
        str(name2).lower().strip(),
    ).ratio()


def fetch_linkedin_links(query, api_key, applicant_name):
    """Return the first matching LinkedIn profile link for a search query.

    Sends *query* to the BrightData SERP endpoint and walks the organic
    results in order. A result is accepted when its link is a LinkedIn URL,
    contains a profile slug, and that slug is at least
    ``_MIN_NAME_SIMILARITY`` similar to *applicant_name*.

    Args:
        query: Search string, as produced by ``construct_query``.
        api_key: BrightData SERP API key.
        applicant_name: Name used to validate the profile slug.

    Returns:
        The matching link, or None when nothing matches or the lookup fails.
        Failures are reported through ``st.error`` rather than raised, so one
        bad row does not abort the whole file.
    """
    try:
        response = requests.get(
            _SERP_ENDPOINT,
            params={
                "q": query,
                "num": 5,
                "api_key": api_key,
            },
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        results = response.json()
        # `or []` also covers an explicit null value for the key.
        organic_results = results.get("organic_results") or []

        for result in organic_results:
            link = result.get("link")
            # Results without a usable link must be skipped: re.match(None)
            # raises and would discard the remaining candidates.
            if not isinstance(link, str) or not _LINKEDIN_URL_RE.match(link):
                continue
            profile_name = get_name_from_url(link)
            if not profile_name:
                continue
            similarity = calculate_similarity(applicant_name, profile_name)
            if similarity >= _MIN_NAME_SIMILARITY:
                return link
        return None
    except Exception as e:
        # Deliberately broad: this is a per-row, best-effort boundary.
        st.error(f"Error fetching link for query '{query}': {e}")
        return None


def process_file(file, api_key):
    """Read the uploaded Excel file and write a CSV with LinkedIn links.

    Rows whose 'Applicant Name' is missing or blank are dropped. Two columns
    are added: 'Search Query' and 'LinkedIn Link'.

    Args:
        file: Path or file-like object accepted by ``pandas.read_excel``.
        api_key: BrightData SERP API key.

    Returns:
        Path of the CSV, located in a fresh temporary directory that the
        caller is responsible for removing, or None on failure (the error is
        reported through ``st.error`` and the temporary directory, if any,
        is removed).
    """
    temp_dir = None
    try:
        df = pd.read_excel(file)
        if 'Applicant Name' not in df.columns:
            st.error("Error processing file: missing required 'Applicant Name' column")
            return None

        df = df[df['Applicant Name'].notna()]
        # astype(str) keeps the .str accessor valid for numeric columns.
        names = df['Applicant Name'].astype(str).str.strip()
        # .copy() so the column assignments below do not write to a slice.
        df = df[names != ''].copy()

        # Built as lists rather than DataFrame.apply(axis=1): on an empty
        # frame apply returns a DataFrame, which cannot be assigned to a
        # single column.
        df['Search Query'] = [construct_query(row) for _, row in df.iterrows()]
        df['LinkedIn Link'] = [
            fetch_linkedin_links(search_query, api_key, name)
            for search_query, name in zip(df['Search Query'], df['Applicant Name'])
        ]

        temp_dir = tempfile.mkdtemp()
        output_file = os.path.join(temp_dir, _OUTPUT_FILENAME)
        df.to_csv(output_file, index=False)
        return output_file
    except Exception as e:
        if temp_dir is not None:
            # Do not leak the directory when writing the CSV fails.
            shutil.rmtree(temp_dir, ignore_errors=True)
        st.error(f"Error processing file: {e}")
        return None


# Streamlit UI
st.title("LinkedIn Profile Link Scraper")
st.markdown("Upload an Excel file with applicant details, and get a CSV with LinkedIn profile links.")

api_key = st.text_input("Enter your BrightData SERP API Key", type="password")
uploaded_file = st.file_uploader("Upload Excel File", type=["xlsx"])

# NOTE(review): Streamlit re-executes this script on every widget interaction,
# including a click on the download button, so the lookups below are repeated
# on each rerun. Consider caching the result (e.g. in st.session_state).
if uploaded_file and api_key:
    st.write("Processing file...")
    output_file = process_file(uploaded_file, api_key)

    if output_file:
        try:
            with open(output_file, "rb") as f:
                csv_bytes = f.read()
        finally:
            # Always remove the temporary directory, even if reading fails.
            shutil.rmtree(os.path.dirname(output_file), ignore_errors=True)

        st.download_button(
            label="Download Updated CSV",
            data=csv_bytes,
            file_name="updated_with_linkedin_links.csv",
            mime="text/csv",
        )
elif not api_key:
    st.warning("Please enter your BrightData SERP API key to proceed.")