BrightData_SerpAPI_LinkedIn_Profile_Scraping

Running

BrightData_SerpAPI_LinkedIn_Profile_Scraping

File size: 3,812 Bytes

import streamlit as st
import pandas as pd
import requests
import re
import tempfile
import shutil
import os
from difflib import SequenceMatcher

def construct_query(row):
    """Build the search string for one applicant row.

    The query starts with the applicant's name, then appends every optional
    column ('Job Title', 'State', 'City', 'Skills') that is present in the
    row and holds a usable value, and always ends with the word "linkedin".
    Parts are separated by single spaces.

    Args:
        row: Mapping-like record (e.g. a pandas Series or dict) that has an
            'Applicant Name' entry.

    Returns:
        The assembled query string.
    """
    pieces = [str(row['Applicant Name'])]

    for column in ('Job Title', 'State', 'City', 'Skills'):
        if column not in row:
            continue
        cell = row[column]
        if not pd.notna(cell):
            continue
        if isinstance(cell, str):
            trimmed = cell.strip()
            # Blank text cells carry no information; leave them out.
            if not trimmed:
                continue
            pieces.append(trimmed)
        else:
            # Non-text cells (numbers, dates, ...) are included as text.
            pieces.append(str(cell).strip())

    pieces.append("linkedin")
    return " ".join(pieces)

def get_name_from_url(link):
    """Return the profile slug of a LinkedIn '/in/' URL with hyphens as spaces.

    Args:
        link: URL text to inspect.

    Returns:
        The slug following 'linkedin.com/in/' (letters, digits and hyphens
        only) with each hyphen replaced by a space, or None when the URL
        contains no such segment.
    """
    found = re.search(r'linkedin\.com/in/([a-zA-Z0-9-]+)', link)
    if found is None:
        return None
    slug = found.group(1)
    return ' '.join(slug.split('-'))

def calculate_similarity(name1, name2):
    """Score how alike two names are, ignoring case and outer whitespace.

    Args:
        name1: First name string.
        name2: Second name string.

    Returns:
        The difflib SequenceMatcher ratio of the two normalized names, a
        float between 0.0 (nothing in common) and 1.0 (identical).
    """
    left = name1.lower().strip()
    right = name2.lower().strip()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

def fetch_linkedin_links(query, api_key, applicant_name):
    """Fetch the first LinkedIn profile link that plausibly matches the applicant.

    Sends `query` to the BrightData SERP search endpoint and walks the
    `organic_results` list of the JSON response in order. A result is accepted
    when its `link` is a LinkedIn URL, a profile slug can be extracted from it
    with `get_name_from_url`, and that slug is at least 50% similar to
    `applicant_name` according to `calculate_similarity`.

    Args:
        query: Search text to submit.
        api_key: API key sent as the `api_key` request parameter.
        applicant_name: Name to compare against each profile slug.

    Returns:
        The first matching link, or None when nothing matches or the request
        fails. Failures are reported through `st.error` rather than raised so
        that one bad lookup does not abort the remaining ones.
    """
    linkedin_regex = r'https://(www|[a-z]{2})\.linkedin\.com/.*'
    # Names read from a spreadsheet may arrive as numbers; compare as text.
    applicant_name = str(applicant_name)

    try:
        response = requests.get(
            "https://serpapi.brightdata.com/google/search",
            params={
                "q": query,
                "num": 5,
                "api_key": api_key,
            },
            # requests waits forever by default; bound it so a stalled
            # connection cannot hang the whole run.
            timeout=30,
        )
        response.raise_for_status()
        results = response.json()
        # `or []` also covers an explicit `"organic_results": null`.
        organic_results = results.get("organic_results") or []

        for result in organic_results:
            link = result.get("link")
            # Skip entries without a usable link instead of letting re.match
            # raise and discard the remaining results.
            if not isinstance(link, str):
                continue
            if not re.match(linkedin_regex, link):
                continue
            profile_name = get_name_from_url(link)
            if not profile_name:
                continue
            if calculate_similarity(applicant_name, profile_name) >= 0.5:
                return link
        return None
    except Exception as e:
        # Deliberately broad: this is the per-lookup boundary, and any failure
        # (network, HTTP status, malformed JSON) should surface in the UI.
        st.error(f"Error fetching link for query '{query}': {e}")
        return None

def process_file(file, api_key):
    """Add search queries and LinkedIn links to an uploaded spreadsheet.

    Reads the Excel file, keeps only rows with a non-blank 'Applicant Name',
    adds a 'Search Query' column (via `construct_query`) and a 'LinkedIn Link'
    column (via `fetch_linkedin_links`), and writes the result as CSV into a
    freshly created temporary directory.

    Args:
        file: Path or file-like object accepted by `pandas.read_excel`.
        api_key: API key forwarded to `fetch_linkedin_links`.

    Returns:
        Path of the written CSV file, or None on failure (the error is shown
        with `st.error`). The caller owns the file's parent directory and is
        responsible for deleting it.
    """
    temp_dir = None
    try:
        df = pd.read_excel(file)
        if 'Applicant Name' not in df.columns:
            st.error("Error processing file: missing required 'Applicant Name' column")
            return None

        names = df['Applicant Name']
        # Compare as text so a column holding numbers does not break the
        # `.str` accessor; missing values are excluded by notna().
        has_name = names.notna() & (names.astype(str).str.strip() != '')
        # Copy so the column assignments below modify an independent frame.
        df = df[has_name].copy()

        # Build the columns row by row: DataFrame.apply(axis=1) returns a
        # DataFrame rather than a Series when no rows remain, which would
        # break the column assignment for files without any valid applicant.
        df['Search Query'] = [construct_query(row) for _, row in df.iterrows()]
        df['LinkedIn Link'] = [
            fetch_linkedin_links(query, api_key, str(name))
            for query, name in zip(df['Search Query'], df['Applicant Name'])
        ]

        temp_dir = tempfile.mkdtemp()
        output_file = os.path.join(temp_dir, "updated_with_linkedin_links.csv")
        df.to_csv(output_file, index=False)
        return output_file
    except Exception as e:
        # Do not leave an orphaned temp directory behind when writing fails.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        st.error(f"Error processing file: {e}")
        return None

# Streamlit UI
# Collects the API key and the spreadsheet, runs the lookup once per
# (file, key) combination, and offers the resulting CSV for download.
st.title("LinkedIn Profile Link Scraper")
st.markdown("Upload an Excel file with applicant details, and get a CSV with LinkedIn profile links.")

api_key = st.text_input("Enter your BrightData SERP API Key", type="password")
uploaded_file = st.file_uploader("Upload Excel File", type=["xlsx"])

if uploaded_file and api_key:
    # Streamlit re-executes this script on every widget interaction, including
    # a click on the download button. Keep the finished CSV in session state so
    # a rerun does not repeat every search request for the same upload.
    request_key = (uploaded_file.name, uploaded_file.size, api_key)
    cached = st.session_state.get("linkedin_csv_cache")
    if cached is None or cached[0] != request_key:
        cached = None
        st.write("Processing file...")
        output_file = process_file(uploaded_file, api_key)
        if output_file:
            try:
                with open(output_file, "rb") as f:
                    cached = (request_key, f.read())
            finally:
                # Remove the temporary directory even if reading fails.
                shutil.rmtree(os.path.dirname(output_file), ignore_errors=True)
            st.session_state["linkedin_csv_cache"] = cached

    if cached is not None:
        st.download_button(
            label="Download Updated CSV",
            data=cached[1],
            file_name="updated_with_linkedin_links.csv",
            mime="text/csv"
        )
elif not api_key:
    st.warning("Please enter your BrightData SERP API key to proceed.")