import pandas as pd


def load_stopwords(file_path):
    """Load a custom stopword list from a text file, one word per line."""
    with open(file_path, 'r', encoding='utf-8') as f:
        stopwords = f.read().splitlines()  # each line in the file is one stopword
    return set(stopwords)


# Custom stopword list (one word per line)
stop_words = load_stopwords('stopwords.txt')


# Previously cleaned dataset
df = pd.read_csv('veriler_cleaned.csv')


def remove_stopwords_without_nltk(text):
    """Remove stopwords using simple whitespace tokenization (no NLTK required)."""
    if isinstance(text, str):
        words = text.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return ' '.join(filtered_words)
    else:
        return ""  # non-string values (e.g. NaN) become an empty string


# Apply stopword removal to the cleaned text column
df['stopwords_text'] = df['cleaned_text'].apply(remove_stopwords_without_nltk)


# Quick check of the result
print(df[['cleaned_text', 'stopwords_text']].head())


# Save the stopword-filtered data
df.to_csv('temizlenmis_veri.csv', index=False)




"""

import pandas as pd

import nltk

from nltk.tokenize import word_tokenize



nltk.download('stopwords')

nltk.download('punkt')





from nltk.corpus import stopwords

stop_words = set(stopwords.words('turkish'))



def load_custom_stopwords(file_path):

    with open(file_path, 'r', encoding='utf-8') as f:

        custom_stopwords = f.read().splitlines()

    return set(custom_stopwords)





custom_stopwords = load_custom_stopwords('stopwords.txt')

stop_words.update(custom_stopwords)





df = pd.read_csv('veriler_cleaned.csv')





def remove_stopwords(text):

    if isinstance(text, str): 

        words = word_tokenize(text)  

        filtered_words = [word for word in words if word.lower() not in stop_words]  # Stopwords'leri çıkar

        return ' '.join(filtered_words)  

    else:

        return ""  





df['stopwords_text'] = df['cleaned_text'].apply(remove_stopwords)





print(df[['cleaned_text', 'stopwords_text']].head())





df.to_csv('temizlenmis_veri.csv', index=False)

"""