import string

import pandas as pd
from nltk.corpus import stopwords


def preprocess_text(text):
    """Tokenize and clean the input text.

    Lowercases the text, splits on whitespace, strips leading/trailing
    punctuation from each token, then drops tokens that are empty,
    non-alphanumeric, or English stopwords.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Cleaned tokens.
    """
    tokens = text.lower().split()
    # NOTE: requires the NLTK 'stopwords' corpus to be downloaded
    # (nltk.download('stopwords')).
    stop_words = set(stopwords.words('english'))
    cleaned = []
    for word in tokens:
        # Strip surrounding punctuation (e.g. "word," -> "word") instead of
        # discarding the whole token — a bare isalnum() filter would drop
        # every token that merely touches punctuation.
        word = word.strip(string.punctuation)
        if word and word.isalnum() and word not in stop_words:
            cleaned.append(word)
    return cleaned


# Function to calculate similarity score between text and bag of words
def similarity_score(text, bow):
    """Calculate similarity score between text and BoW.

    The score is the sum of the BoW frequencies of the distinct words the
    text shares with the BoW table.

    Args:
        text: Raw input string to score.
        bow: pandas DataFrame expected to contain 'Word' and 'Frequency'
            columns.

    Returns:
        int/float frequency sum, or 0 when the DataFrame lacks the
        required columns.
    """
    tokens = preprocess_text(text)
    # Ensure the BoW contains the 'Word' and 'Frequency' columns
    if 'Word' not in bow.columns or 'Frequency' not in bow.columns:
        print("Invalid BoW format. Ensure it contains 'Word' and 'Frequency' columns.")
        return 0
    # Build a word -> frequency lookup once, instead of re-filtering the
    # whole DataFrame for every matched word (O(n) per word before).
    # A well-formed BoW has unique words; on duplicates the last row wins.
    freq_lookup = dict(zip(bow['Word'], bow['Frequency']))
    # Calculate similarity score over the distinct shared words
    common_words = set(tokens) & freq_lookup.keys()
    # Sum the frequencies of matching words
    return sum(freq_lookup[word] for word in common_words)


# Function to classify text domain using bag of words
def classify_text_domain(text):
    """Classify text domain based on similarity score with BoW files.

    Loads per-domain BoW CSV files from the working directory and returns
    the domain whose BoW best matches the text.

    Args:
        text: Raw input string to classify.

    Returns:
        str: The best-matching domain name, or "Unknown" when the BoW file
        is missing or empty.
    """
    # Load BoW CSV files for different domains
    try:
        reliance_bow = pd.read_csv("reliance_bow.csv")
    except FileNotFoundError:
        print("BoW file not found.")
        return "Unknown"
    # Ensure CSV files are not empty
    if reliance_bow.empty:
        print("BoW file is empty.")
        return "Unknown"
    # Calculate similarity scores
    scores = {
        "Reliance": similarity_score(text, reliance_bow)
    }
    # Determine the domain with the highest similarity score
    domain = max(scores, key=scores.get)
    print(f"Scores: {scores}")  # Display
    # NOTE(review): the original source appears truncated at this point; a
    # success-path return is required for consistency with the "Unknown"
    # early returns above — confirm `return domain` against the full file.
    return domain