# Stopword-removal script (Turkish text preprocessing).
# (Page-extraction artifacts — "Build error", file size, line-number gutter — removed.)
import pandas as pd
def load_stopwords(file_path):
    """Load stopwords from a UTF-8 text file, one word per line.

    Returns the words as a set so membership tests during filtering are O(1).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return {entry for entry in handle.read().splitlines()}
# Module-level stopword set consumed by remove_stopwords_without_nltk below.
stop_words = load_stopwords('stopwords.txt')
# Input corpus; assumes a 'cleaned_text' column exists — TODO confirm against the upstream cleaning step.
df = pd.read_csv('veriler_cleaned.csv')
def remove_stopwords_without_nltk(text, stop_set=None):
    """Remove stopwords from *text* using simple whitespace tokenization.

    Parameters
    ----------
    text : str or any
        Input text. Non-string values (e.g. NaN floats that pandas passes
        for missing cells) yield "".
    stop_set : set of str, optional
        Stopwords to filter out, compared case-insensitively via
        ``str.lower()``. Defaults to the module-level ``stop_words`` set,
        preserving the original call signature for existing callers.

    Returns
    -------
    str
        The input with stopword tokens removed, re-joined by single spaces.

    Notes
    -----
    ``str.lower()`` does not apply Turkish dotted/dotless-i casing rules
    (e.g. 'I' -> 'i', not 'ı') — NOTE(review): confirm whether that matters
    for this stopword list.
    """
    if stop_set is None:
        stop_set = stop_words  # fall back to the module-level set
    if not isinstance(text, str):
        # Normalize missing/non-text values to an empty string.
        return ""
    return ' '.join(word for word in text.split() if word.lower() not in stop_set)
# Apply stopword removal row-wise, keeping the result alongside the original column.
df['stopwords_text'] = df['cleaned_text'].apply(remove_stopwords_without_nltk)
# Spot-check the first rows, then persist the processed dataset.
print(df[['cleaned_text', 'stopwords_text']].head())
df.to_csv('temizlenmis_veri.csv', index=False)
"""
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('turkish'))
def load_custom_stopwords(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
custom_stopwords = f.read().splitlines()
return set(custom_stopwords)
custom_stopwords = load_custom_stopwords('stopwords.txt')
stop_words.update(custom_stopwords)
df = pd.read_csv('veriler_cleaned.csv')
def remove_stopwords(text):
if isinstance(text, str):
words = word_tokenize(text)
filtered_words = [word for word in words if word.lower() not in stop_words] # Stopwords'leri çıkar
return ' '.join(filtered_words)
else:
return ""
df['stopwords_text'] = df['cleaned_text'].apply(remove_stopwords)
print(df[['cleaned_text', 'stopwords_text']].head())
df.to_csv('temizlenmis_veri.csv', index=False)
""" |