ElegantSolutions committed
Commit a9ddd8f · verified · 1 Parent(s): a9239bd

Create app.py

Files changed (1)
  1. app.py +152 -0
app.py ADDED
@@ -0,0 +1,152 @@
+ import streamlit as st
+ import pandas as pd
+ import requests
+ import re
+ import tempfile
+ import shutil
+ import os
+ import json
+ from difflib import SequenceMatcher
+ from urllib.parse import quote_plus
+
+
+ def construct_query(row):
+     """Constructs the Google search query using applicant data."""
+     query = str(row['Applicant Name'])
+     optional_fields = ['Job Title', 'State', 'City', 'Skills']
+     for field in optional_fields:
+         if field in row and pd.notna(row[field]):
+             value = row[field]
+             if isinstance(value, str) and value.strip():
+                 query += f" {value.strip()}"
+             elif not isinstance(value, str):
+                 query += f" {str(value).strip()}"
+     query += " linkedin"
+     return query
+
+
+ def get_name_from_url(link):
+     """Extracts the name part from a LinkedIn profile URL."""
+     match = re.search(r'linkedin\.com/in/([a-zA-Z0-9-]+)', link)
+     if match:
+         return match.group(1).replace('-', ' ')
+     return None
+
+
+ def calculate_similarity(name1, name2):
+     """Calculates similarity between two names."""
+     return SequenceMatcher(None, name1.lower().strip(), name2.lower().strip()).ratio()
+
+
+ # def fetch_linkedin_links(query, api_key, applicant_name):
+ #     """Fetches LinkedIn profile links using the BrightData SERP API."""
+ #     linkedin_regex = r'https://(www|[a-z]{2})\.linkedin\.com/.*'
+ #
+ #     try:
+ #         response = requests.get(
+ #             "https://serpapi.brightdata.com/google/search",
+ #             params={
+ #                 "q": query,
+ #                 "num": 5,
+ #                 "api_key": api_key
+ #             }
+ #         )
+ #         response.raise_for_status()
+ #         results = response.json()
+ #         organic_results = results.get("organic_results", [])
+ #
+ #         for result in organic_results:
+ #             link = result.get("link")
+ #             if link and re.match(linkedin_regex, link):
+ #                 profile_name = get_name_from_url(link)
+ #                 if profile_name:
+ #                     similarity = calculate_similarity(applicant_name, profile_name)
+ #                     if similarity >= 0.5:
+ #                         return link
+ #         return None
+ #     except Exception as e:
+ #         st.error(f"Error fetching link for query '{query}': {e}")
+ #         return None
+
+
+ def fetch_linkedin_links(query, api_key, applicant_name):
+     """Fetches LinkedIn profile links using a BrightData proxy request to Google."""
+     # Non-capturing group so re.findall() returns the full URL rather than just the
+     # captured subdomain; the character class stops the match at the end of the URL.
+     linkedin_regex = r'https://(?:www|[a-z]{2})\.linkedin\.com/[^\s"\'<>]+'
+
+     try:
+         headers = {
+             "Content-Type": "application/json",
+             "Authorization": f"Bearer {api_key}"
+         }
+
+         payload = {
+             "zone": "serp_api2",  # Or your configured BrightData zone name
+             "url": f"https://www.google.com/search?q={quote_plus(query)}",
+             "format": "json"  # Or "raw" if you want HTML
+         }
+
+         response = requests.post("https://api.brightdata.com/request", headers=headers, data=json.dumps(payload))
+         response.raise_for_status()
+
+         # Search for LinkedIn links in the response content (parsed JSON or raw HTML).
+         links = re.findall(linkedin_regex, response.text)
+         for link in links:
+             profile_name = get_name_from_url(link)
+             if profile_name:
+                 similarity = calculate_similarity(applicant_name, profile_name)
+                 if similarity >= 0.5:
+                     return link
+
+         return None
+
+     except Exception as e:
+         st.error(f"Error fetching link for query '{query}': {e}")
+         return None
+
+
+ def process_file(file, api_key):
+     """Processes the uploaded Excel file to fetch LinkedIn profile links."""
+     try:
+         df = pd.read_excel(file)
+         df = df[df['Applicant Name'].notna()]
+         df = df[df['Applicant Name'].astype(str).str.strip() != '']
+         df['Search Query'] = df.apply(construct_query, axis=1)
+         df['LinkedIn Link'] = df.apply(
+             lambda row: fetch_linkedin_links(row['Search Query'], api_key, row['Applicant Name']),
+             axis=1
+         )
+
+         temp_dir = tempfile.mkdtemp()
+         output_file = os.path.join(temp_dir, "updated_with_linkedin_links.csv")
+         df.to_csv(output_file, index=False)
+         return output_file
+     except Exception as e:
+         st.error(f"Error processing file: {e}")
+         return None
+
+
+ # Streamlit UI
+ st.title("LinkedIn Profile Link Scraper")
+ st.markdown("Upload an Excel file with applicant details, and get a CSV with LinkedIn profile links.")
+
+ api_key = st.text_input("Enter your BrightData SERP API Key", type="password")
+ uploaded_file = st.file_uploader("Upload Excel File", type=["xlsx"])
+
+ if uploaded_file and api_key:
+     st.write("Processing file...")
+     output_file = process_file(uploaded_file, api_key)
+     if output_file:
+         with open(output_file, "rb") as f:
+             st.download_button(
+                 label="Download Updated CSV",
+                 data=f,
+                 file_name="updated_with_linkedin_links.csv",
+                 mime="text/csv"
+             )
+         shutil.rmtree(os.path.dirname(output_file))
+ elif not api_key:
+     st.warning("Please enter your BrightData SERP API key to proceed.")
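
For reference, a minimal sketch of the input spreadsheet this app expects. The column names ('Applicant Name' is required; 'Job Title', 'State', 'City', and 'Skills' are optional) come from construct_query and process_file above; the sample rows and the applicants.xlsx file name are purely illustrative.

import pandas as pd

# Illustrative rows only; any spreadsheet with these columns will work.
sample = pd.DataFrame([
    {"Applicant Name": "Jane Doe", "Job Title": "Data Analyst", "State": "CA", "City": "San Jose", "Skills": "SQL"},
    {"Applicant Name": "John Smith", "Job Title": "ML Engineer", "State": "NY", "City": "New York", "Skills": "PyTorch"},
])
# Writing .xlsx needs openpyxl, which pd.read_excel in the app also requires.
sample.to_excel("applicants.xlsx", index=False)  # upload this file in the Streamlit UI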