Spaces:
Sleeping
Sleeping
File size: 1,812 Bytes
706ec74 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import pandas as pd
from nltk.corpus import stopwords
# Preprocessing function
def preprocess_text(text):
    """Lowercase *text*, split it on whitespace and filter the pieces.

    A piece is kept only when it is purely alphanumeric and is not an
    English stopword.  Note that a piece carrying any other character
    (for example a word with a trailing comma) is discarded as a whole;
    the punctuation is not stripped from it.

    Returns:
        A list of the surviving lowercase tokens, in their original order.
    """
    pieces = text.lower().split()
    english_stops = set(stopwords.words('english'))

    kept = []
    for piece in pieces:
        # Skip anything that is not strictly letters/digits.
        if not piece.isalnum():
            continue
        # Skip common English filler words.
        if piece in english_stops:
            continue
        kept.append(piece)
    return kept
# Function to calculate similarity score between text and bag of words
def similarity_score(text, bow):
    """Score how strongly *text* overlaps with a bag-of-words table.

    Args:
        text: Raw input text; it is tokenized with ``preprocess_text``.
        bow: DataFrame expected to carry a 'Word' column and a
            'Frequency' column.

    Returns:
        The sum of the 'Frequency' values of every distinct token of
        *text* that also appears in the 'Word' column.  If a word is listed
        more than once, only its first row is counted.  Returns 0 (after
        printing a message) when either required column is missing.
    """
    # Validate the table before tokenizing so a malformed BoW is rejected
    # without paying for (or depending on) the stopword lookup.
    if 'Word' not in bow.columns or 'Frequency' not in bow.columns:
        print("Invalid BoW format. Ensure it contains 'Word' and 'Frequency' columns.")
        return 0

    tokens = preprocess_text(text)

    # Build the word -> frequency lookup in one pass over the table instead
    # of re-scanning the whole DataFrame for every matching word.
    # setdefault keeps the first row seen for a duplicated word.
    frequency_by_word = {}
    for word, frequency in zip(bow['Word'].values, bow['Frequency'].values):
        frequency_by_word.setdefault(word, frequency)

    # set(tokens) ensures a word repeated in the text is counted once.
    return sum(
        frequency_by_word[token]
        for token in set(tokens)
        if token in frequency_by_word
    )
# Function to classify text domain using bag of words
def classify_text_domain(text):
    """Classify *text* by comparing it against per-domain bag-of-words files.

    The BoW table is read from ``reliance_bow.csv`` in the current working
    directory.  Only the "Reliance" domain is configured here, so whenever
    the table loads successfully that is the domain selected.

    Args:
        text: Raw input text to classify.

    Returns:
        The name of the highest-scoring domain, or "Unknown" when the BoW
        file is missing or contains no rows.
    """
    # Load BoW CSV files for different domains
    try:
        reliance_bow = pd.read_csv("reliance_bow.csv")
    except FileNotFoundError:
        print("BoW file not found.")
        return "Unknown"
    except pd.errors.EmptyDataError:
        # A zero-byte file makes read_csv raise instead of returning an
        # empty frame; treat it the same as a header-only file.
        print("BoW file is empty.")
        return "Unknown"

    # Ensure CSV files are not empty
    if reliance_bow.empty:
        print("BoW file is empty.")
        return "Unknown"

    # Calculate similarity scores
    scores = {
        "Reliance": similarity_score(text, reliance_bow),
    }

    # Determine the domain with the highest similarity score
    domain = max(scores, key=scores.get)
    print(f"Scores: {scores}")  # Display

    # Hand the result back to the caller; previously the success path fell
    # off the end and returned None.
    return domain
|