diff --git "a/eda.ipynb" "b/eda.ipynb" new file mode 100644--- /dev/null +++ "b/eda.ipynb" @@ -0,0 +1,648 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from datasets import load_dataset\n", + "import os\n", + "import sys\n", + "import pandas as pd\n", + "sys.path.append(\"/Users/soumechenadaradjane/Documents/Frugal_challenge/\")\n", + "from src.utils.eda_functions import process_text, generate_word_clouds_by_category\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report\n", + "from sentence_transformers import SentenceTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "train = pd.read_csv(\"/Users/soumechenadaradjane/Documents/Frugal_challenge/outputs/train_v1.csv\", sep=\";\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | quote | \n", + "label | \n", + "source | \n", + "url | \n", + "language | \n", + "subsource | \n", + "id | \n", + "__index_level_0__ | \n", + "processed_quote | \n", + "
---|---|---|---|---|---|---|---|---|---|
0 | \n", + "There is clear, compelling evidence that many ... | \n", + "5_science_unreliable | \n", + "FLICC | \n", + "https://huggingface.co/datasets/fzanartu/FLICC... | \n", + "en | \n", + "CARDS | \n", + "NaN | \n", + "0 | \n", + "['clear', 'compelling', 'evidence', 'many', 'm... | \n", + "
1 | \n", + "For most of the Holocene (last 10k years), sea... | \n", + "1_not_happening | \n", + "FLICC | \n", + "https://huggingface.co/datasets/fzanartu/FLICC... | \n", + "en | \n", + "hamburg_test1 | \n", + "NaN | \n", + "1 | \n", + "['holocene', 'last', '10k', 'year', 'sea', 'le... | \n", + "
2 | \n", + "China, which hosts U.N. climate talks next wee... | \n", + "4_solutions_harmful_unnecessary | \n", + "FLICC | \n", + "https://huggingface.co/datasets/fzanartu/FLICC... | \n", + "en | \n", + "CARDS | \n", + "NaN | \n", + "2 | \n", + "['china', 'host', 'un', 'climate', 'talk', 'ne... | \n", + "
3 | \n", + "And the fabricated documents (which Dr. Mann a... | \n", + "0_not_relevant | \n", + "FLICC | \n", + "https://huggingface.co/datasets/fzanartu/FLICC... | \n", + "en | \n", + "CARDS | \n", + "NaN | \n", + "3 | \n", + "['fabricate', 'document', 'dr', 'mann', 'appar... | \n", + "
4 | \n", + "It's going to be 42 here today and the hottest... | \n", + "1_not_happening | \n", + "FLICC | \n", + "https://huggingface.co/datasets/fzanartu/FLICC... | \n", + "en | \n", + "hamburg_test3 | \n", + "NaN | \n", + "4 | \n", + "['be', 'go', '42', 'today', 'hot', 'summer', '... | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
6086 | \n", + "There is that sense of the Senate that climate... | \n", + "6_proponents_biased | \n", + "Desmog | \n", + "https://www.desmog.com/kevin-cramer/ | \n", + "en | \n", + "NaN | \n", + "NaN | \n", + "6431 | \n", + "['sense', 'senate', 'climate', 'change', 'real... | \n", + "
6087 | \n", + "This is the biggest scientific hoax being perp... | \n", + "2_not_human | \n", + "Desmog | \n", + "https://www.desmog.com/tad-murty/ | \n", + "en | \n", + "NaN | \n", + "NaN | \n", + "6432 | \n", + "['big', 'scientific', 'hoax', 'perpetrate', 'h... | \n", + "
6088 | \n", + "A world with no tobacco might sound appealing ... | \n", + "0_not_relevant | \n", + "Desmog | \n", + "https://www.desmog.com/drew-johnson/ | \n", + "en | \n", + "NaN | \n", + "NaN | \n", + "6433 | \n", + "['world', 'tobacco', 'might', 'sound', 'appeal... | \n", + "
6089 | \n", + "The relationship between CO2 and temperature i... | \n", + "5_science_unreliable | \n", + "Desmog | \n", + "https://www.desmog.com/junkscience-com/ | \n", + "en | \n", + "NaN | \n", + "NaN | \n", + "6435 | \n", + "['relationship', 'co2', 'temperature', 'logari... | \n", + "
6090 | \n", + "Whatever is happening to the weather at the mo... | \n", + "1_not_happening | \n", + "Desmog | \n", + "https://www.desmog.com/boris-johnson/ | \n", + "en | \n", + "NaN | \n", + "NaN | \n", + "6436 | \n", + "['whatever', 'happen', 'weather', 'moment', 's... | \n", + "
6091 rows × 9 columns
\n", + "