File size: 5,317 Bytes
a9ddd8f f953f49 a9ddd8f dbe16be a9ddd8f 80c04b5 29f7e0a f953f49 a9ddd8f 29f7e0a dbe16be 80c04b5 29f7e0a dbe16be 80c04b5 a9ddd8f f953f49 80c04b5 dbe16be a9ddd8f f953f49 dbe16be 80c04b5 dbe16be f953f49 dbe16be a9ddd8f 29f7e0a a9ddd8f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
import streamlit as st
import pandas as pd
import requests
import re
import tempfile
import shutil
import os
from difflib import SequenceMatcher
import json
from urllib.parse import quote_plus
def construct_query(row):
    """Build the Google search string for one applicant row.

    The query starts with the applicant's name, followed by whichever of the
    optional columns ('Job Title', 'State', 'City', 'Skills') are present and
    non-null in ``row``, and always ends with the word "linkedin".

    Args:
        row: Mapping-like record (e.g. a pandas Series) that contains at least
            the 'Applicant Name' key.

    Returns:
        The space-separated search query string.
    """
    parts = [str(row['Applicant Name'])]
    for column in ('Job Title', 'State', 'City', 'Skills'):
        # Skip columns that are absent from the row or hold a null/NaN cell.
        if column not in row or not pd.notna(row[column]):
            continue
        cell = row[column]
        if not isinstance(cell, str):
            # Non-string cells (numbers, dates, ...) are stringified as-is.
            parts.append(str(cell).strip())
            continue
        text = cell.strip()
        if text:
            # Blank / whitespace-only strings contribute nothing.
            parts.append(text)
    return " ".join(parts) + " linkedin"
def get_name_from_url(link):
    """Turn the slug of a LinkedIn ``/in/`` profile URL into a spaced name.

    Args:
        link: URL (or any text) expected to contain ``linkedin.com/in/<slug>``.

    Returns:
        The slug with hyphens replaced by spaces, or ``None`` when ``link``
        does not contain a LinkedIn profile path.
    """
    found = re.search(r'linkedin\.com/in/([a-zA-Z0-9-]+)', link)
    if found is None:
        return None
    slug = found.group(1)
    # Splitting on '-' and re-joining with ' ' swaps every hyphen for a space.
    return ' '.join(slug.split('-'))
def calculate_similarity(name1, name2):
    """Score how alike two names are, ignoring case and outer whitespace.

    Args:
        name1: First name string.
        name2: Second name string.

    Returns:
        The ``difflib.SequenceMatcher`` ratio of the normalized names, a float
        between 0.0 (nothing in common) and 1.0 (identical).
    """
    left = name1.lower().strip()
    right = name2.lower().strip()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
# def fetch_linkedin_links(query, api_key, applicant_name):
# """Fetches LinkedIn profile links using BrightData SERP API."""
# linkedin_regex = r'https://(www|[a-z]{2})\.linkedin\.com/.*'
# try:
# response = requests.get(
# "https://serpapi.brightdata.com/google/search",
# params={
# "q": query,
# "num": 5,
# "api_key": api_key
# }
# )
# response.raise_for_status()
# results = response.json()
# organic_results = results.get("organic_results", [])
# for result in organic_results:
# link = result.get("link")
# if re.match(linkedin_regex, link):
# profile_name = get_name_from_url(link)
# if profile_name:
# similarity = calculate_similarity(applicant_name, profile_name)
# if similarity >= 0.5:
# return link
# return None
# except Exception as e:
# st.error(f"Error fetching link for query '{query}': {e}")
# return None
def fetch_linkedin_links(query, api_key, applicant_name, *, zone="serp_api2", timeout=30):
    """Find a LinkedIn profile URL for an applicant via the BrightData SERP API.

    Sends ``query`` to Google through BrightData's ``/request`` endpoint,
    scans the returned HTML for LinkedIn ``/in/`` profile URLs, and returns
    the first one whose URL slug is at least 50% similar to
    ``applicant_name``.

    Args:
        query: Search text to submit to Google.
        api_key: BrightData API token, sent as a Bearer token.
        applicant_name: Name that the profile URL slug is compared against.
        zone: BrightData zone to route the request through.
        timeout: Seconds to wait for BrightData before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection and freeze the whole app run.

    Returns:
        The matching LinkedIn URL, or ``None`` when nothing matches or the
        request fails. Failures are reported with ``st.error`` instead of
        being raised, so one bad row does not abort a batch.
    """
    try:
        url = "https://api.brightdata.com/request"
        google_url = f"https://www.google.com/search?q={quote_plus(query)}"
        payload = {
            "zone": zone,
            "url": google_url,
            "method": "GET",
            "country": "us",
            "format": "raw",
            "data_format": "html",
        }
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        html = response.text

        # Match standard LinkedIn profile URLs
        linkedin_regex = r'https://(?:[a-z]{2,3}\.)?linkedin\.com/in/[a-zA-Z0-9\-_/]+'
        # The same URL usually appears several times in a results page;
        # dict.fromkeys drops the repeats while keeping result order, so each
        # candidate is scored only once.
        candidates = dict.fromkeys(re.findall(linkedin_regex, html))
        for link in candidates:
            profile_name = get_name_from_url(link)
            if not profile_name:
                continue
            if calculate_similarity(applicant_name, profile_name) >= 0.5:
                return link
        return None
    except Exception as e:
        # Deliberately broad: this is a per-row best-effort lookup.
        st.error(f"Error fetching link for query '{query}': {e}")
        return None
def process_file(file, api_key):
    """Look up a LinkedIn link for every applicant in an Excel sheet.

    Reads the workbook, keeps only rows with a non-blank 'Applicant Name',
    adds a 'Search Query' and a 'LinkedIn Link' column, and writes the result
    as CSV into a fresh temporary directory.

    Args:
        file: Path or file-like object accepted by ``pandas.read_excel``.
        api_key: BrightData API token passed to ``fetch_linkedin_links``.

    Returns:
        Path of the written CSV file, or ``None`` when processing failed (the
        failure is reported through ``st.error``). The caller is responsible
        for deleting the file's parent directory.
    """
    try:
        df = pd.read_excel(file)
        if 'Applicant Name' not in df.columns:
            # A bare KeyError message ("'Applicant Name'") is not actionable.
            st.error("Error processing file: missing required 'Applicant Name' column")
            return None

        names = df['Applicant Name']
        # astype(str) keeps the blank check working when the column holds
        # non-string cells; notna() is applied separately because NaN would
        # otherwise stringify to the non-blank text "nan".
        has_name = names.notna() & (names.astype(str).str.strip() != '')
        # Copy so the column assignments below act on an independent frame
        # rather than on a filtered view of the original.
        df = df[has_name].copy()

        # Plain list comprehensions are used instead of DataFrame.apply(axis=1)
        # because apply returns a DataFrame, not a Series, when no rows remain.
        df['Search Query'] = [construct_query(row) for _, row in df.iterrows()]
        df['LinkedIn Link'] = [
            fetch_linkedin_links(search_query, api_key, str(name))
            for search_query, name in zip(df['Search Query'], df['Applicant Name'])
        ]

        temp_dir = tempfile.mkdtemp()
        try:
            output_file = os.path.join(temp_dir, "updated_with_linkedin_links.csv")
            df.to_csv(output_file, index=False)
        except Exception:
            # Do not leave an orphaned temp directory behind on a failed write.
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise
        return output_file
    except Exception as e:
        st.error(f"Error processing file: {e}")
        return None
# Streamlit UI
st.title("LinkedIn Profile Link Scraper")
st.markdown("Upload an Excel file with applicant details, and get a CSV with LinkedIn profile links.")
api_key = st.text_input("Enter your BrightData SERP API Key", type="password")
uploaded_file = st.file_uploader("Upload Excel File", type=["xlsx"])

if uploaded_file and api_key:
    # Streamlit re-executes this script on every interaction, including the
    # click on the download button. The finished CSV is kept in session state,
    # keyed by the upload and API key, so the scrape is not repeated (and
    # billed again) on each rerun.
    cache_key = (uploaded_file.name, uploaded_file.size, api_key)
    if st.session_state.get("linkedin_result_key") != cache_key:
        st.write("Processing file...")
        output_file = process_file(uploaded_file, api_key)
        if output_file:
            try:
                with open(output_file, "rb") as f:
                    csv_bytes = f.read()
            finally:
                # Remove the temp directory even if reading the file fails.
                shutil.rmtree(os.path.dirname(output_file), ignore_errors=True)
            # Only successful runs are cached; a failed run is retried on the
            # next rerun.
            st.session_state["linkedin_result_key"] = cache_key
            st.session_state["linkedin_result_csv"] = csv_bytes

    if st.session_state.get("linkedin_result_key") == cache_key:
        st.download_button(
            label="Download Updated CSV",
            data=st.session_state["linkedin_result_csv"],
            file_name="updated_with_linkedin_links.csv",
            mime="text/csv",
        )
elif not api_key:
    st.warning("Please enter your BrightData SERP API key to proceed.")