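"""Streamlit app that reads applicant details from an uploaded Excel file,
searches Google (via BrightData) for matching LinkedIn profiles, and returns
the results as a downloadable CSV."""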
import streamlit as st
import pandas as pd
import requests
import re
import tempfile
import shutil
import os
from difflib import SequenceMatcher
import json
from urllib.parse import quote_plus
def construct_query(row):
    """Constructs the Google search query using applicant data."""
    query = str(row['Applicant Name'])
    optional_fields = ['Job Title', 'State', 'City', 'Skills']
    for field in optional_fields:
        if field in row and pd.notna(row[field]):
            value = row[field]
            if isinstance(value, str) and value.strip():
                query += f" {value.strip()}"
            elif not isinstance(value, str):
                query += f" {str(value).strip()}"
    query += " linkedin"
    return query
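# Example with a hypothetical row:
#   {'Applicant Name': 'Jane Doe', 'Job Title': 'Data Analyst', 'City': 'Austin'}
#   -> "Jane Doe Data Analyst Austin linkedin"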
def get_name_from_url(link):
    """Extracts the name part from a LinkedIn profile URL."""
    match = re.search(r'linkedin\.com/in/([a-zA-Z0-9-]+)', link)
    if match:
        return match.group(1).replace('-', ' ')
    return None
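# Example: get_name_from_url("https://www.linkedin.com/in/jane-doe-12345")
#   -> "jane doe 12345" (hyphens in the URL slug become spaces)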
def calculate_similarity(name1, name2):
    """Calculates similarity between two names."""
    return SequenceMatcher(None, name1.lower().strip(), name2.lower().strip()).ratio()
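# Example: calculate_similarity("Jane Doe", "jane doe 12345") is roughly 0.7,
# so it clears the 0.5 threshold used below; identical names score 1.0.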
# Previous implementation using the BrightData SERP API endpoint, kept for reference:
# def fetch_linkedin_links(query, api_key, applicant_name):
#     """Fetches LinkedIn profile links using BrightData SERP API."""
#     linkedin_regex = r'https://(www|[a-z]{2})\.linkedin\.com/.*'
#     try:
#         response = requests.get(
#             "https://serpapi.brightdata.com/google/search",
#             params={
#                 "q": query,
#                 "num": 5,
#                 "api_key": api_key
#             }
#         )
#         response.raise_for_status()
#         results = response.json()
#         organic_results = results.get("organic_results", [])
#         for result in organic_results:
#             link = result.get("link")
#             if re.match(linkedin_regex, link):
#                 profile_name = get_name_from_url(link)
#                 if profile_name:
#                     similarity = calculate_similarity(applicant_name, profile_name)
#                     if similarity >= 0.5:
#                         return link
#         return None
#     except Exception as e:
#         st.error(f"Error fetching link for query '{query}': {e}")
#         return None
def fetch_linkedin_links(query, api_key, applicant_name):
    """Fetches LinkedIn profile links using the BrightData SERP scraping API."""
    try:
        url = "https://api.brightdata.com/request"
        google_url = f"https://www.google.com/search?q={quote_plus(query)}"
        payload = {
            "zone": "serp_api2",  # BrightData zone name; must match a SERP zone configured in your account
            "url": google_url,
            "method": "GET",
            "country": "us",
            "format": "raw",
            "data_format": "html"
        }
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()
        html = response.text
        # Match standard LinkedIn profile URLs in the raw search-results HTML.
        linkedin_regex = r'https://(?:[a-z]{2,3}\.)?linkedin\.com/in/[a-zA-Z0-9\-_/]+'
        matches = re.findall(linkedin_regex, html)
        for link in matches:
            profile_name = get_name_from_url(link)
            if profile_name:
                similarity = calculate_similarity(applicant_name, profile_name)
                # Accept the first profile whose URL slug is reasonably close to the applicant's name.
                if similarity >= 0.5:
                    return link
        return None
    except Exception as e:
        st.error(f"Error fetching link for query '{query}': {e}")
        return None
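# Standalone usage sketch (hypothetical key and query; normally called from process_file):
#   link = fetch_linkedin_links("Jane Doe Data Analyst Austin linkedin", "YOUR_BRIGHTDATA_KEY", "Jane Doe")
# Returns the first linkedin.com/in/... URL whose slug is at least 50% similar to the
# applicant name, or None if nothing matches or the request fails.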
def process_file(file, api_key):
    """Processes the uploaded Excel file to fetch LinkedIn profile links."""
    try:
        df = pd.read_excel(file)
        # Drop rows with missing or blank applicant names.
        df = df[df['Applicant Name'].notna()]
        df = df[df['Applicant Name'].astype(str).str.strip() != '']
        df['Search Query'] = df.apply(construct_query, axis=1)
        df['LinkedIn Link'] = df.apply(
            lambda row: fetch_linkedin_links(row['Search Query'], api_key, row['Applicant Name']),
            axis=1
        )
        temp_dir = tempfile.mkdtemp()
        output_file = os.path.join(temp_dir, "updated_with_linkedin_links.csv")
        df.to_csv(output_file, index=False)
        return output_file
    except Exception as e:
        st.error(f"Error processing file: {e}")
        return None
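# process_file writes the annotated CSV into a fresh temporary directory and returns its
# path; the Streamlit handler below deletes that directory after offering the download.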
# Streamlit UI
st.title("LinkedIn Profile Link Scraper")
st.markdown("Upload an Excel file with applicant details, and get a CSV with LinkedIn profile links.")
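# The uploaded sheet must contain an 'Applicant Name' column; 'Job Title', 'State',
# 'City', and 'Skills' columns are optional and, when present, refine the search query.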
api_key = st.text_input("Enter your BrightData SERP API Key", type="password")
uploaded_file = st.file_uploader("Upload Excel File", type=["xlsx"])
if uploaded_file and api_key:
    st.write("Processing file...")
    output_file = process_file(uploaded_file, api_key)
    if output_file:
        with open(output_file, "rb") as f:
            st.download_button(
                label="Download Updated CSV",
                data=f,
                file_name="updated_with_linkedin_links.csv",
                mime="text/csv"
            )
        shutil.rmtree(os.path.dirname(output_file))
elif not api_key:
    st.warning("Please enter your BrightData SERP API key to proceed.")