Spaces:

yonkasoft
/

makaleChatbotu

Build error

App Files Files Community

yonkasoft committed on Aug 17, 2024

Commit

201583f

verified ·

1 Parent(s): f93efba

Upload datasets.ipynb

Browse files

Files changed (1) hide show

datasets.ipynb +140 -12

datasets.ipynb CHANGED Viewed

@@ -287,15 +287,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
    "source": [
     "from pymongo import MongoClient\n",
     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "from textblob import TextBlob as tb\n",
     "import numpy as np\n",
     "import math\n",
     "\n",
     "class Database:\n",
     "    @staticmethod\n",
@@ -311,7 +324,12 @@
     "        cursor = collection.find().limit(limit)\n",
     "        documents = [doc for doc in cursor]\n",
     "        document_count = len(documents)\n",
-    "        return documents, document_count\n",
     "\n",
     "class Tf:\n",
     "    @staticmethod\n",
@@ -335,24 +353,134 @@
     "        return Database.get_input_documents(limit)\n",
     "\n",
     "# Kullanım örneği\n",
-    "documents, document_count = Tf.get_input_documents(limit=3)\n",
     "\n",
     "# Dokümanları işleyerek TF-IDF hesaplama\n",
     "\n",
-    "blobs = [tb(doc.get('text', '')) for doc in documents]  # veya 'title' kullanarak başlıkları işleyebilirsiniz\n",
-    "all_words = set(word for blob in blobs for word in blob.words)\n",
     "\n",
-    "tfidf_scores = {}\n",
-    "for word in all_words:\n",
-    "    tfidf_scores[word] = [Tf.tfidf(word, blob, blobs) for blob in blobs]\n",
     "\n",
-    "print(\"TF-IDF Skorları:\")\n",
-    "for word, scores in tfidf_scores.items():\n",
-    "    print(f\"Kelime: {word}, Skorlar: {scores}\")\n",
     "\n",
     "\n",
     "\n",
     "\n",
     "\"\"\"turkish_stop_words = set([\n",
     "        'ad', 'adım', 'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \n",
     "     'b', 'bazı', 'belirli', 'ben', 'bence', 'bunu', 'burada', 'biz', 'bu', 'buna', 'çünkü', \n",

   },
   {
    "cell_type": "code",
+   "execution_count": 20,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"turkish_stop_words = set([\\n        'ad', 'adım', 'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \\n     'b', 'bazı', 'belirli', 'ben', 'bence', 'bunu', 'burada', 'biz', 'bu', 'buna', 'çünkü', \\n    'da', 'de', 'demek', 'den', 'derken', 'değil', 'daha', 'dolayı',  'edilir', 'eğer', 'en', 'fakat', \\n    'genellikle', 'gibi', 'hem', 'her', 'herhangi', 'hiç', 'ise', 'işte', 'itibaren', 'iyi', 'kadar', \\n    'karşı', 'ki', 'kime', 'kısaca', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'niye', 'o', 'olabilir', 'oluşur', \\n    'önce', 'şu', 'sadece', 'se', 'şey', 'şimdi', 'tabi', 'tüm', 've', 'ya', 'ya da', 'yani', 'yine'\\n])\\ndef calculate_tfidf(documents):\\n    vectorizer = TfidfVectorizer(stop_words=turkish_stop_words, max_features=10000)  # max_features ile özellik sayısını sınırlıyoruz\\n    tfidf_matrix = vectorizer.fit_transform(documents)\\n    feature_names = vectorizer.get_feature_names_out()\\n    return tfidf_matrix, feature_names\\n\\n#feature_names lerin belirlenmesi grekir \\ntfidf_matrix, feature_names=calculate_tfidf(documents)\\n\\n\\n\\n# En yüksek TF-IDF skorlarına sahip anahtar kelimeleri çıkarın\\n#sıkışık format kullanmarak tf-ıdf matrisini işleme \\ndef get_top_n_keywords_sparse(n=10):\\n\\n    # TF-IDF hesaplayıcı oluşturun\\n    vectorizer = TfidfVectorizer()\\n\\n    # Başlıklar ve metinler ile TF-IDF matrisini oluşturun\\n    texts = Database.get_input_texts()\\n    titles = Database.get_input_titles()\\n    \\n\\n    #title ve text değerlerini alarak vektörleştirdik.\\n    tfidf_matrix = vectorizer.fit_transform(documents)\\n\\n    # Özellik adlarını (kelimeleri) alın\\n\\n    feature_names = vectorizer.get_feature_names_out()\\n\\n    # TF-IDF sonuçlarını DataFrame'e dönüştürün\\n    df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)\\n    print(df)\\n    keywords = {}\\n    for i in range(tfidf_matrix.shape[0]):\\n        row = tfidf_matrix[i].toarray().flatten() #list yapısından çıkarma \\n        
sorted_indices = row.argsort()[::-1]  # Büyükten küçüğe sıralama\\n        top_indices = sorted_indices[:n]\\n        top_keywords = [feature_names[idx] for idx in top_indices]\\n        keywords[i] = top_keywords\\n    return keywords\""
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "from pymongo import MongoClient\n",
     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "from textblob import TextBlob as tb\n",
     "import numpy as np\n",
     "import math\n",
+    "import nltk \n",
+    "import matplotlib.pyplot as plt \n",
     "\n",
     "class Database:\n",
     "    @staticmethod\n",
     "        cursor = collection.find().limit(limit)\n",
     "        documents = [doc for doc in cursor]\n",
     "        document_count = len(documents)\n",
+    "        \n",
+    "        # Dökümanları isimlendir\n",
+    "        named_documents = {f'döküman {i+1}': doc for i, doc in enumerate(documents)}\n",
+    "        \n",
+    "        return named_documents, document_count\n",
+    "\n",
     "\n",
     "class Tf:\n",
     "    @staticmethod\n",
     "        return Database.get_input_documents(limit)\n",
     "\n",
     "# Kullanım örneği\n",
+    "named_documents, document_count = Tf.get_input_documents(limit=3)\n",
+    "\n",
+    "# Select keywords from within the documents using TF-IDF.\n",
+    "\n",
+    "def extract_keywords(tfidf_matrix, feature_names, top_n=10):\n",
+    "    \"\"\"\n",
+    "    Select the highest-scoring TF-IDF terms of every document.\n",
+    "\n",
+    "    :param tfidf_matrix: TF-IDF matrix with one row per document (SciPy sparse or dense 2-D array)\n",
+    "    :param feature_names: term for each matrix column, indexable by column number\n",
+    "    :param top_n: maximum number of keywords to keep per document\n",
+    "    :return: dict mapping 'document_<n>' (1-based) to a list of (term, score) pairs\n",
+    "             ordered by descending score; terms whose score is 0 are left out\n",
+    "    \"\"\"\n",
+    "    keywords = {}\n",
+    "    for doc_idx in range(tfidf_matrix.shape[0]):\n",
+    "        row = tfidf_matrix[doc_idx]\n",
+    "        # Sparse rows are densified first; dense rows are used as they are.\n",
+    "        dense_row = row.toarray() if hasattr(row, 'toarray') else row\n",
+    "        scores = np.asarray(dense_row, dtype=float).ravel()\n",
+    "        # A stable sort on the negated scores keeps tied terms in column order.\n",
+    "        ranked = np.argsort(-scores, kind='stable')[:top_n]\n",
+    "        # A zero score carries no weight in this document, so it is not reported as a keyword.\n",
+    "        keywords[f'document_{doc_idx+1}'] = [\n",
+    "            (feature_names[idx], scores[idx]) for idx in ranked if scores[idx] > 0\n",
+    "        ]\n",
+    "    \n",
+    "    return keywords\n",
     "\n",
     "# Dokümanları işleyerek TF-IDF hesaplama\n",
+    "# bloblist holds one TextBlob per fetched document; it used to be an empty list,\n",
+    "# so the ranking loop below never executed.\n",
+    "bloblist = [\n",
+    "    tb((doc.get('text') or '') if isinstance(doc, dict) else doc)\n",
+    "    for doc in named_documents.values()\n",
+    "]\n",
+    "for i, blob in enumerate(bloblist):\n",
+    "    print(\"Top words in document {}\".format(i + 1))\n",
+    "    # Score every distinct word of this document against the whole collection.\n",
+    "    scores = {word: Tf.tfidf(word, blob, bloblist) for word in blob.words}\n",
+    "    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)\n",
+    "    for word, score in sorted_words[:3]:\n",
+    "        print(\"\\tWord: {}, TF-IDF: {}\".format(word, round(score, 5)))\n",
+    "\n",
+    "# Hand-written Turkish stop words used as the starting stop-word list; each entry appears once.\n",
+    "turkish_stop_words = [\n",
+    "    'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında',\n",
+    "    'b', 'başlayan', 'bağlı', 'bazı', 'belirli', 'ben', 'bence', 'birkaç', 'birlikte', 'bunu', 'burada', 'biten', 'biz', 'bu', 'buna', 'çünkü',\n",
+    "    'da', 'de', 'demek', 'den', 'derken', 'değil', 'daha', 'dolayı', 'edilir', 'eğer', 'en', 'fakat',\n",
+    "    'genellikle', 'gibi', 'hem', 'her', 'herhangi', 'hiç', 'ise', 'işte', 'itibaren', 'iyi', 'kadar',\n",
+    "    'karşı', 'ki', 'kime', 'kısaca', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'niye', 'o', 'olasılıkla', 'olabilir', 'oluşur',\n",
+    "    'önce', 'şu', 'sadece', 'se', 'şey', 'şimdi', 'tabi', 'tüm', 've', 'ya', 'ya da', 'yanı', 'yani', 'yılında', 'yetenekli', 'yine',\n",
+    "]\n",
+    "\n",
+    "def calculate_tfidf(documents, stop_words):\n",
+    "    \"\"\"\n",
+    "    Fit a TfidfVectorizer on the documents and return its matrix and vocabulary.\n",
+    "\n",
+    "    :param documents: iterable of document texts\n",
+    "    :param stop_words: 'english', None, or any iterable of words to ignore\n",
+    "    :return: (tfidf_matrix, feature_names) as produced by fit_transform and\n",
+    "             get_feature_names_out; the vocabulary is capped at 10000 terms\n",
+    "    \"\"\"\n",
+    "    # TfidfVectorizer only accepts 'english', a list or None, so sets and tuples\n",
+    "    # are turned into a (sorted, hence reproducible) list first.\n",
+    "    if stop_words is not None and not isinstance(stop_words, (str, list)):\n",
+    "        stop_words = sorted(stop_words)\n",
+    "    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000)\n",
+    "    tfidf_matrix = vectorizer.fit_transform(documents)\n",
+    "    feature_names = vectorizer.get_feature_names_out()\n",
+    "    return tfidf_matrix, feature_names\n",
+    "\n",
+    "\n",
+    "def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
+    "    \"\"\"\n",
+    "    Return the terms whose mean TF-IDF score over all documents is below a threshold.\n",
+    "\n",
+    "    :param tfidf_matrix: TF-IDF matrix with one row per document (SciPy sparse or dense 2-D array)\n",
+    "    :param feature_names: term for each matrix column, indexable by column number\n",
+    "    :param threshold: terms with a mean score strictly below this value are returned\n",
+    "    :return: list of low-scoring terms in column order\n",
+    "    \"\"\"\n",
+    "    # np.asarray(...).ravel() flattens the np.matrix produced by sparse matrices as well as\n",
+    "    # plain ndarrays; the previous `.A1` attribute exists on np.matrix only.\n",
+    "    avg_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()\n",
+    "    return [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
+    "\n",
+    "# Merging newly found words into the stop-word list.\n",
+    "def update_stop_words(existing_stop_words, low_tfidf_words):\n",
+    "    \"\"\"\n",
+    "    Combine the current stop words with the newly found low-TF-IDF words.\n",
+    "\n",
+    "    :param existing_stop_words: iterable of stop words already in use\n",
+    "    :param low_tfidf_words: iterable of words to add\n",
+    "    :return: list holding every distinct word from both inputs (order unspecified)\n",
+    "    \"\"\"\n",
+    "    current = set(existing_stop_words)\n",
+    "    additions = set(low_tfidf_words)\n",
+    "    return list(current.union(additions))\n",
+    "\n",
+    "\n",
+    "def iterative_update(documents, initial_stop_words, iterations=5):\n",
+    "    \"\"\"\n",
+    "    Repeatedly add low-scoring TF-IDF terms to the stop-word list.\n",
+    "\n",
+    "    :param documents: iterable of document texts\n",
+    "    :param initial_stop_words: iterable of stop words to start from\n",
+    "    :param iterations: maximum number of refinement passes\n",
+    "    :return: list of the initial stop words plus every low-scoring term found\n",
+    "    \"\"\"\n",
+    "    # Keep a list (duplicates dropped): the vectorizer does not accept a set for stop_words.\n",
+    "    stop_words = list(dict.fromkeys(initial_stop_words))\n",
+    "    for _ in range(iterations):\n",
+    "        tfidf_matrix, feature_names = calculate_tfidf(documents, stop_words)\n",
+    "        low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
+    "        if not low_tfidf_words:\n",
+    "            # Nothing new to exclude, so further passes would give the same result.\n",
+    "            break\n",
+    "        stop_words = update_stop_words(stop_words, low_tfidf_words)\n",
+    "    return list(stop_words)\n",
+    "\n",
+    "\n",
+    "# Module-level default stop words. This name used to be bound to the iterative_update\n",
+    "# function itself, which is not a valid stop-word collection.\n",
+    "stop_words = list(turkish_stop_words)\n",
+    "\n",
+    "\n",
+    "def main():\n",
+    "    \"\"\"\n",
+    "    Fetch documents, extend the stop-word list, and print per-word TF-IDF scores.\n",
+    "\n",
+    "    Prints, in order: the updated stop-word list, the named documents, the list of\n",
+    "    document texts, and for every word its Tf.tfidf score in each document.\n",
+    "    \"\"\"\n",
+    "    # TODO: add a semantic check that ranks words by how similar they are to the title.\n",
+    "\n",
+    "    # Fetch the documents first: they used to be read before this assignment, which\n",
+    "    # raised UnboundLocalError because the name is local to this function.\n",
+    "    named_documents, document_count = Database.get_input_documents(limit=3)\n",
+    "\n",
+    "    # Take the documents as a list of texts.\n",
+    "    documents_list = [doc.get('text', '') if isinstance(doc, dict) else doc for doc in named_documents.values()]\n",
+    "\n",
+    "    # The hand-written Turkish list is the starting point for the stop words.\n",
+    "    initial_stop_words = turkish_stop_words\n",
+    "\n",
+    "    # Compute TF-IDF with the initial stop words (not the module-level stop_words name).\n",
+    "    tfidf_matrix, feature_names = calculate_tfidf(documents_list, initial_stop_words)\n",
+    "\n",
+    "    # Iteratively extend the stop-word list.\n",
+    "    final_stop_words = iterative_update(documents_list, initial_stop_words)\n",
+    "\n",
+    "    print(\"Güncellenmiş Stop-Words Listesi:\", final_stop_words)\n",
+    "\n",
+    "\n",
+    "    # Print the results.\n",
+    "    print(\"İsimlendirilmiş Dökümanlar:\")\n",
+    "    for name, doc in named_documents.items():\n",
+    "        print(f\"{name}: {doc}\")\n",
+    "\n",
+    "    print(\"\\nDökümanlar Listesi:\")\n",
+    "    print(documents_list)\n",
+    "\n",
+    "    # ---------------------------------------------------------\n",
+    "    # One TextBlob per document text; titles could be processed the same way.\n",
+    "    blobs = [tb(doc) for doc in documents_list]\n",
+    "    all_words = set(word for blob in blobs for word in blob.words)\n",
+    "\n",
+    "    # For every word, its TF-IDF score in each document.\n",
+    "    tfidf_scores = {}\n",
+    "    for word in all_words:\n",
+    "        tfidf_scores[word] = [Tf.tfidf(word, blob, blobs) for blob in blobs]\n",
+    "\n",
+    "    print(\"TF-IDF Skorları:\")\n",
+    "    for word, scores in tfidf_scores.items():\n",
+    "        print(f\"Kelime: {word}, Skorlar: {scores}\")\n",
+    "\n",
+    "\n",
+    "\n",
     "\n",
     "\n",
+    "#----------------------------------------------\n",
     "\n",
     "\n",
     "\n",
     "\n",
     "\n",
+    "#alternatif keywordleri belirleme \n",
+    "#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
     "\"\"\"turkish_stop_words = set([\n",
     "        'ad', 'adım', 'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \n",
     "     'b', 'bazı', 'belirli', 'ben', 'bence', 'bunu', 'burada', 'biz', 'bu', 'buna', 'çünkü', \n",