Spaces:
Sleeping
Sleeping
import string
from functools import lru_cache

import pandas as pd
from nltk.corpus import stopwords
# Preprocessing function
def preprocess_text(text):
    """Tokenize and clean the input text.

    The text is lowercased and split on whitespace. Leading and trailing
    ASCII punctuation is stripped from each token; tokens that are then not
    purely alphanumeric, or that are English stop words, are discarded.

    Args:
        text: The raw input string.

    Returns:
        A list of cleaned, lowercase tokens in their original order.
        Empty or whitespace-only input yields an empty list.
    """
    raw_tokens = text.lower().split()
    if not raw_tokens:
        # Nothing to filter, so skip loading the stop-word corpus entirely.
        return []

    stop_words = _english_stop_words()
    # Strip surrounding punctuation so "reliance," counts as "reliance"
    # instead of being rejected outright by the isalnum() check below.
    stripped = (raw.strip(string.punctuation) for raw in raw_tokens)
    return [
        word for word in stripped
        if word.isalnum() and word not in stop_words
    ]


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once."""
    # NOTE(review): NLTK is expected to raise LookupError here when the
    # 'stopwords' corpus has not been downloaded -- confirm deployment setup.
    return frozenset(stopwords.words('english'))
# Function to calculate similarity score between text and bag of words
def similarity_score(text, bow):
    """Calculate similarity score between text and BoW.

    The score is the sum of the 'Frequency' values of every distinct token
    of ``text`` (as produced by ``preprocess_text``) that also appears in
    the 'Word' column of ``bow``.

    Args:
        text: Raw input text to score.
        bow: DataFrame expected to contain 'Word' and 'Frequency' columns.

    Returns:
        The summed frequency of the matching words. Returns 0 when no token
        matches, or when ``bow`` lacks a required column (in which case a
        diagnostic message is printed).
    """
    # Validate first so a malformed BoW is rejected without tokenizing.
    if 'Word' not in bow.columns or 'Frequency' not in bow.columns:
        print("Invalid BoW format. Ensure it contains 'Word' and 'Frequency' columns.")
        return 0

    # Build the lookup in a single pass instead of filtering the whole
    # DataFrame once per matching word. setdefault keeps the first row
    # when a word is listed more than once.
    frequency_by_word = {}
    for word, frequency in zip(bow['Word'], bow['Frequency']):
        frequency_by_word.setdefault(word, frequency)

    # NOTE(review): tokens are lowercased by preprocess_text, so BoW words
    # containing uppercase letters can never match -- confirm the BoW files
    # store lowercase words.
    tokens = set(preprocess_text(text))

    # Sum the frequencies of matching words
    return sum(
        frequency_by_word[token]
        for token in tokens
        if token in frequency_by_word
    )
# Function to classify text domain using bag of words | |
def classify_text_domain(text): | |
"""Classify text domain based on similarity score with BoW files""" | |
# Load BoW CSV files for different domains | |
try: | |
reliance_bow = pd.read_csv("reliance_bow.csv") | |
except FileNotFoundError: | |
print("BoW file not found.") | |
return "Unknown" | |
# Ensure CSV files are not empty | |
if reliance_bow.empty: | |
print("BoW file is empty.") | |
return "Unknown" | |
# Calculate similarity scores | |
scores = { | |
"Reliance": similarity_score(text, reliance_bow) | |
} | |
# Determine the domain with the highest similarity score | |
domain = max(scores, key=scores.get) | |
print(f"Scores: {scores}") # Display | |