Spaces:

yonkasoft
/

makaleChatbotu

Build error

File size: 9,095 Bytes

3a1020a

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                    _id       id  \\\n",
      "0  {'$oid': '66c33a8c3b8bd216bd8ea93a'}  3525037   \n",
      "1  {'$oid': '66c33a8c3b8bd216bd8ea93b'}  3532700   \n",
      "2  {'$oid': '66c33a8c3b8bd216bd8ea93c'}  3203545   \n",
      "3  {'$oid': '66c33a8c3b8bd216bd8ea93d'}  1765445   \n",
      "4  {'$oid': '66c33a8c3b8bd216bd8ea93e'}   575462   \n",
      "\n",
      "                                                 url            title  \\\n",
      "0  https://tr.wikipedia.org/wiki/P%C5%9F%C4%B1qo%...    Pşıqo Ahecaqo   \n",
      "1      https://tr.wikipedia.org/wiki/Craterolophinae  Craterolophinae   \n",
      "2           https://tr.wikipedia.org/wiki/Notocrabro       Notocrabro   \n",
      "3    https://tr.wikipedia.org/wiki/Ibrahim%20Sissoko  Ibrahim Sissoko   \n",
      "4        https://tr.wikipedia.org/wiki/Salah%20Cedid      Salah Cedid   \n",
      "\n",
      "                                                text  no  \n",
      "0  Pşıqo Ahecaqo (), Çerkes siyasetçi, askeri kom...   0  \n",
      "1  Craterolophinae, Depastridae familyasına bağlı...   1  \n",
      "2  Notocrabro Crabronina oymağına bağlı bir cinst...   2  \n",
      "3  İbrahim Sissoko (d. 30 Kasım 1991), Fildişi Sa...   3  \n",
      "4  Salah Cedid (1926-1993) (Arapça: صلاح جديد) Su...   4  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the source CSV into a DataFrame.\n",
    "df = pd.read_csv('veriler.csv')\n",
    "\n",
    "# Add a 'no' column that copies the frame's index, giving each row a 0-based number.\n",
    "df = df.assign(no=df.index)\n",
    "\n",
    "# Save the result to a new CSV file without writing the index.\n",
    "df.to_csv('data_with_id.csv', index=False)\n",
    "\n",
    "# Show the first few rows as a sanity check.\n",
    "print(df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Boş değer sayısı:\n",
      "_id      0\n",
      "id       0\n",
      "url      0\n",
      "title    0\n",
      "text     0\n",
      "dtype: int64\n",
      "Tekrarlanan değer sayısı:\n",
      "0\n",
      "Eşleşmeyen 'title' sayısı: 0\n",
      "Eşleşmeyen 'text' sayısı: 0\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the data.\n",
    "df = pd.read_csv(\"common_400000.csv\")\n",
    "\n",
    "# Count missing values per column.\n",
    "print(\"Boş değer sayısı:\")\n",
    "print(df.isna().sum())\n",
    "\n",
    "# Count rows whose (title, text) pair repeats an earlier row.\n",
    "print(\"Tekrarlanan değer sayısı:\")\n",
    "print(df.duplicated(subset=['title', 'text']).sum())\n",
    "\n",
    "# Rows with a missing 'text', i.e. titles that have no matching text.\n",
    "unmatched_titles = df.loc[df['text'].isna()]\n",
    "print(f\"Eşleşmeyen 'title' sayısı: {len(unmatched_titles)}\")\n",
    "\n",
    "# Rows with a missing 'title', i.e. texts that have no matching title.\n",
    "unmatched_texts = df.loc[df['title'].isna()]\n",
    "print(f\"Eşleşmeyen 'text' sayısı: {len(unmatched_texts)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace missing 'text' values with a placeholder string.\n",
    "df = df.assign(text=df['text'].fillna(\"Missing Text\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First drop rows whose (title, text) pair repeats an earlier row, then keep\n",
    "# only the first remaining row for each title.\n",
    "df = (\n",
    "    df\n",
    "    .drop_duplicates(subset=['title', 'text'])\n",
    "    .drop_duplicates(subset=['title'], keep='first')\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows that have a 'text' value.\n",
    "df = df[df['text'].notna()]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Veriler başarıyla converted_file.csv olarak kaydedildi.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Input (JSON) and output (CSV) file names.\n",
    "json_file = 'EgitimDatabase.train.json'\n",
    "csv_file = 'converted_file.csv'\n",
    "\n",
    "# Read the JSON file into a DataFrame.\n",
    "df = pd.read_json(json_file)\n",
    "\n",
    "# Write the frame as UTF-8 CSV, leaving out the index.\n",
    "df.to_csv(csv_file, index=False, encoding='utf-8')\n",
    "\n",
    "print(f\"Veriler başarıyla {csv_file} olarak kaydedildi.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 416434 entries, 0 to 416433\n",
      "Data columns (total 5 columns):\n",
      " #   Column  Non-Null Count   Dtype \n",
      "---  ------  --------------   ----- \n",
      " 0   _id     416434 non-null  object\n",
      " 1   id      416434 non-null  int64 \n",
      " 2   url     416434 non-null  object\n",
      " 3   title   416434 non-null  object\n",
      " 4   text    416434 non-null  object\n",
      "dtypes: int64(1), object(4)\n",
      "memory usage: 15.9+ MB\n"
     ]
    }
   ],
   "source": [
    "# Summarize the frame: index range, column dtypes, non-null counts and memory usage.\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Boş text satırları: 1\n",
      "Boş title satırları: 0\n"
     ]
    }
   ],
   "source": [
    "# Count rows whose text is empty once surrounding whitespace is stripped.\n",
    "empty_text_rows = df.loc[df['text'].str.strip().eq(\"\")]\n",
    "print(f\"Boş text satırları: {len(empty_text_rows)}\")\n",
    "\n",
    "# Same check for the title column.\n",
    "empty_title_rows = df.loc[df['title'].str.strip().eq(\"\")]\n",
    "print(f\"Boş title satırları: {len(empty_title_rows)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Geçerli satır sayısı: 416434\n"
     ]
    }
   ],
   "source": [
    "# Select the rows where both title and text are actually filled in.\n",
    "# notna() alone still accepts empty or whitespace-only strings, so values that\n",
    "# are blank after stripping are excluded as well. astype(str) keeps the check\n",
    "# usable when a column holds non-string objects.\n",
    "title_filled = df['title'].notna() & df['title'].astype(str).str.strip().ne(\"\")\n",
    "text_filled = df['text'].notna() & df['text'].astype(str).str.strip().ne(\"\")\n",
    "valid_rows = df[title_filled & text_filled]\n",
    "print(f\"Geçerli satır sayısı: {len(valid_rows)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert dict values in 'text' and '_id' to their string form; other values are kept as-is.\n",
    "# NOTE(review): presumably needed so hash-based checks such as duplicated() work on these columns - confirm.\n",
    "for column in ('text', '_id'):\n",
    "    df[column] = df[column].apply(lambda value: str(value) if isinstance(value, dict) else value)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tekrarlayan satır sayısı: 0\n"
     ]
    }
   ],
   "source": [
    "# Count rows that duplicate an earlier row across all columns.\n",
    "duplicated_rows = df.loc[df.duplicated()]\n",
    "print(f\"Tekrarlayan satır sayısı: {len(duplicated_rows)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Empty DataFrame\n",
      "Columns: [_id, id, url, title, text]\n",
      "Index: []\n"
     ]
    }
   ],
   "source": [
    "# Show any rows where title or text is missing.\n",
    "print(df[df[['title', 'text']].isna().any(axis=1)])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace missing 'text' values with the placeholder 'Eksik veri'.\n",
    "df = df.assign(text=df['text'].fillna('Eksik veri'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "416434\n"
     ]
    }
   ],
   "source": [
    "# Count the distinct non-missing values in the title column.\n",
    "print(df['title'].nunique(dropna=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "414397\n"
     ]
    }
   ],
   "source": [
    "# Count the distinct non-missing values in the text column.\n",
    "print(df['text'].nunique(dropna=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}