ElegantSolutions committed on
Commit
94bd43b
·
verified ·
1 Parent(s): 804b409

Creating app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -0
app.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st # Import Streamlit for creating a web app interface
2
+ import pandas as pd # Import pandas for data manipulation
3
+ from serpapi import GoogleSearch # Import SerpAPI to perform Google searches
4
+ import re # Import regex module for pattern matching
5
+ import tempfile # Import tempfile for creating temporary files
6
+ import shutil # Import shutil for file operations
7
+ import os # Import os for handling file paths
8
+ from difflib import SequenceMatcher # Import SequenceMatcher to calculate string similarity
9
+
10
+ # Function to construct a Google search query from applicant data
11
def construct_query(row):
    """Build the Google search string for a single applicant.

    The query is the applicant's name, followed by whichever of the
    'Job Title', 'State', 'City' and 'Skills' fields are present and
    non-missing, followed by the word "linkedin".

    Args:
        row: Mapping-like record (for example a DataFrame row) that has an
            'Applicant Name' entry and, optionally, the fields listed above.

    Returns:
        The space-separated search query string.
    """
    applicant = row['Applicant Name']
    print(f"Constructing query for Applicant Name: {applicant}")

    # Collect the pieces first and join once at the end.
    pieces = [str(applicant)]
    for column in ('Job Title', 'State', 'City', 'Skills'):
        if column not in row or not pd.notna(row[column]):
            continue  # Field absent or NaN: nothing to add.
        cell = row[column]
        if not isinstance(cell, str):
            # Non-string values (numbers, dates, ...) are stringified as-is.
            pieces.append(str(cell).strip())
        elif cell.strip():
            # Strings contribute only when they are not blank.
            pieces.append(cell.strip())
    pieces.append("linkedin")  # Steers the search towards LinkedIn profiles.

    query = " ".join(pieces)
    print(f"Constructed query: {query}")
    return query
28
+
29
+ # Function to extract the name from a LinkedIn profile URL
30
def get_name_from_url(link):
    """Return the human-readable slug of a LinkedIn ``/in/`` profile URL.

    Args:
        link: URL string to inspect.

    Returns:
        The profile slug with dashes turned into spaces, or ``None`` when
        the URL contains no ``linkedin.com/in/<slug>`` segment.
    """
    print(f"Extracting name from LinkedIn URL: {link}")
    found = re.search(r'linkedin\.com/in/([a-zA-Z0-9-]+)', link)
    if found is None:
        print("No name extracted from URL.")
        return None

    # The slug separates words with dashes; present them as spaces instead.
    slug = found.group(1)
    name = " ".join(slug.split('-'))
    print(f"Extracted name: {name}")
    return name
40
+
41
+ # Function to calculate similarity between two names
42
def calculate_similarity(name1, name2):
    """Score how alike two names are, ignoring case and outer whitespace.

    Args:
        name1: First name string.
        name2: Second name string.

    Returns:
        The ``difflib.SequenceMatcher`` ratio of the normalised names, a
        float between 0.0 (nothing in common) and 1.0 (identical).
    """
    left = name1.lower().strip()
    right = name2.lower().strip()
    matcher = SequenceMatcher(None, left, right)
    score = matcher.ratio()
    print(f"Calculated similarity between '{name1}' and '{name2}': {score}")
    return score
47
+
48
+ # Function to fetch LinkedIn links using SerpAPI
49
def fetch_linkedin_links(query, api_key, applicant_name):
    """Search Google via SerpAPI and return the first plausible LinkedIn profile.

    Each organic result is accepted only if its URL looks like a LinkedIn
    URL, a profile name can be extracted from it, and that name is at
    least 50% similar to ``applicant_name``.

    Args:
        query: Search string to send to SerpAPI.
        api_key: SerpAPI key used to authenticate the request.
        applicant_name: Name the profile slug is compared against.

    Returns:
        The URL of the first accepted result, or ``None`` when no result
        qualifies or when the search raises (the error is printed and
        shown in the Streamlit UI).
    """
    linkedin_regex = r'https://(www|[a-z]{2})\.linkedin\.com/.*'  # www or 2-letter country subdomain
    try:
        print(f"Fetching LinkedIn links for query: {query}")
        search = GoogleSearch({
            "q": query,          # The search query
            "num": 5,            # Number of search results
            "api_key": api_key,  # API key for SerpAPI
        })

        # Execute the search and keep only the organic results.
        results = search.get_dict()
        organic_results = results.get("organic_results", [])
        print(f"Raw search results: {organic_results}")

        for result in organic_results:
            link = result.get("link")
            print(f"Checking link: {link}")

            # A result without a usable URL must not abort the whole search:
            # re.match(pattern, None) raises TypeError, which the handler
            # below would turn into "no link" for every remaining result.
            if not isinstance(link, str):
                continue

            if not re.match(linkedin_regex, link):
                print(f"Link does not match LinkedIn regex: {link}")
                continue

            profile_name = get_name_from_url(link)
            if not profile_name:
                continue  # LinkedIn URL that is not an /in/ profile page.

            similarity = calculate_similarity(applicant_name, profile_name)
            if similarity >= 0.5:  # Accept once the similarity threshold is met.
                print(f"Valid LinkedIn link found: {link} (Similarity: {similarity})")
                return link
            print(f"Rejected link: {link} (Similarity: {similarity})")

        print("No valid LinkedIn link found.")
        return None
    except Exception as e:
        # Top-level boundary for one applicant: report the failure and move
        # on so a single bad lookup does not stop the batch.
        print(f"Error fetching link for query '{query}': {e}")
        st.error(f"Error fetching link for query '{query}': {e}")
        return None
87
+
88
+ # Function to process the uploaded Excel file
89
def process_file(file, api_key):
    """Read an applicant spreadsheet and write a CSV enriched with LinkedIn links.

    Rows whose 'Applicant Name' is missing or blank are dropped. For every
    remaining row a search query is built with ``construct_query`` and a
    profile link is looked up with ``fetch_linkedin_links``. The result is
    stored in new 'Search Query' and 'LinkedIn Link' columns.

    Args:
        file: Path or file-like object accepted by ``pandas.read_excel``.
        api_key: SerpAPI key forwarded to ``fetch_linkedin_links``.

    Returns:
        Path of the generated CSV inside a fresh temporary directory (the
        caller is responsible for removing that directory), or ``None`` if
        anything fails; the error is printed and shown in the Streamlit UI.
    """
    try:
        print("Reading uploaded Excel file...")
        df = pd.read_excel(file)
        print(f"Initial DataFrame:\n{df.head()}")

        # Fail with a readable message instead of a bare KeyError.
        if 'Applicant Name' not in df.columns:
            raise ValueError("The uploaded file has no 'Applicant Name' column.")

        # Filter out rows with empty or missing applicant names. astype(str)
        # keeps the blank check working when the column is not string-typed,
        # and .copy() makes the later column assignments act on an
        # independent frame rather than a slice of the original.
        names = df['Applicant Name']
        has_name = names.notna() & (names.astype(str).str.strip() != '')
        df = df[has_name].copy()
        print(f"Filtered DataFrame:\n{df.head()}")

        # Build the new columns with explicit loops: unlike
        # DataFrame.apply(axis=1), a list comprehension also yields a valid
        # (empty) column when no rows survive the filter.
        df['Search Query'] = [construct_query(row) for _, row in df.iterrows()]
        print(f"DataFrame with Search Queries:\n{df[['Applicant Name', 'Search Query']].head()}")

        df['LinkedIn Link'] = [
            fetch_linkedin_links(query, api_key, name)
            for query, name in zip(df['Search Query'], df['Applicant Name'])
        ]
        print(f"DataFrame with LinkedIn Links:\n{df.head()}")

        # Save the updated DataFrame to a temporary file.
        temp_dir = tempfile.mkdtemp()
        output_file = os.path.join(temp_dir, "updated_with_linkedin_links.csv")
        try:
            df.to_csv(output_file, index=False)
        except Exception:
            # Do not leak the directory when the write itself fails.
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise
        print(f"CSV file created at: {output_file}")

        return output_file
    except Exception as e:
        print(f"Error processing file: {e}")
        st.error(f"Error processing file: {e}")
        return None
123
+
124
+ # Streamlit UI setup
125
# Streamlit UI: collect the SerpAPI key and the spreadsheet, run the lookup
# once per (file, key) pair, and offer the resulting CSV for download.
st.title("LinkedIn Profile Link Scraper")
st.markdown("Upload an Excel file with applicant details, and get a CSV with LinkedIn profile links.")

# Input for SerpAPI key (masked in the UI).
api_key = st.text_input("Enter your SerpAPI Key", type="password")

# File uploader widget, restricted to Excel workbooks.
uploaded_file = st.file_uploader("Upload Excel File", type=["xlsx"])

if uploaded_file and api_key:
    # Streamlit reruns this script on every widget interaction, including a
    # click on the download button. Cache the finished CSV in session state
    # so a rerun does not repeat every SerpAPI query for the same upload.
    cache_key = (uploaded_file.name, uploaded_file.size, api_key)
    if st.session_state.get("linkedin_cache_key") != cache_key:
        st.write("Processing file...")
        output_file = process_file(uploaded_file, api_key)

        csv_bytes = None
        if output_file:
            try:
                with open(output_file, "rb") as f:
                    csv_bytes = f.read()
            finally:
                # The bytes are now in memory (or reading failed); either
                # way the temporary directory is no longer needed.
                shutil.rmtree(os.path.dirname(output_file), ignore_errors=True)
                print("Temporary files cleaned up.")

        # Only remember successful runs so a failed attempt is retried.
        st.session_state["linkedin_cache_key"] = cache_key if csv_bytes is not None else None
        st.session_state["linkedin_csv_bytes"] = csv_bytes

    csv_bytes = st.session_state.get("linkedin_csv_bytes")
    if csv_bytes is not None:
        st.download_button(
            label="Download Updated CSV",
            data=csv_bytes,
            file_name="updated_with_linkedin_links.csv",
            mime="text/csv",
        )
        print("File ready for download.")
elif not api_key:
    st.warning("Please enter your SerpAPI key to proceed.")