{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/homebrew/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from datasets import load_dataset\n", "import os\n", "import sys\n", "import pandas as pd\n", "\n", "# Root of the Frugal_challenge checkout. Set the FRUGAL_CHALLENGE_ROOT environment\n", "# variable to run this notebook on another machine; the default is the original\n", "# author's local path, so existing setups behave exactly as before.\n", "PROJECT_ROOT = os.environ.get(\n", "    \"FRUGAL_CHALLENGE_ROOT\",\n", "    \"/Users/soumechenadaradjane/Documents/Frugal_challenge/\",\n", ")\n", "\n", "# Make the project's `src` package importable. The membership check keeps\n", "# sys.path free of duplicate entries when this cell is re-run.\n", "if PROJECT_ROOT not in sys.path:\n", "    sys.path.append(PROJECT_ROOT)\n", "\n", "from src.utils.eda_functions import process_text, generate_word_clouds_by_category\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import classification_report\n", "from sentence_transformers import SentenceTransformer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load the training set; the file is semicolon-delimited.\n", "# The path is built from PROJECT_ROOT (defined in the imports cell) instead of\n", "# repeating the absolute location.\n", "train = pd.read_csv(os.path.join(PROJECT_ROOT, \"outputs\", \"train_v1.csv\"), sep=\";\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | quote | \n", "label | \n", "source | \n", "url | \n", "language | \n", "subsource | \n", "id | \n", "__index_level_0__ | \n", "processed_quote | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "There is clear, compelling evidence that many ... | \n", "5_science_unreliable | \n", "FLICC | \n", "https://huggingface.co/datasets/fzanartu/FLICC... | \n", "en | \n", "CARDS | \n", "NaN | \n", "0 | \n", "['clear', 'compelling', 'evidence', 'many', 'm... | \n", "
1 | \n", "For most of the Holocene (last 10k years), sea... | \n", "1_not_happening | \n", "FLICC | \n", "https://huggingface.co/datasets/fzanartu/FLICC... | \n", "en | \n", "hamburg_test1 | \n", "NaN | \n", "1 | \n", "['holocene', 'last', '10k', 'year', 'sea', 'le... | \n", "
2 | \n", "China, which hosts U.N. climate talks next wee... | \n", "4_solutions_harmful_unnecessary | \n", "FLICC | \n", "https://huggingface.co/datasets/fzanartu/FLICC... | \n", "en | \n", "CARDS | \n", "NaN | \n", "2 | \n", "['china', 'host', 'un', 'climate', 'talk', 'ne... | \n", "
3 | \n", "And the fabricated documents (which Dr. Mann a... | \n", "0_not_relevant | \n", "FLICC | \n", "https://huggingface.co/datasets/fzanartu/FLICC... | \n", "en | \n", "CARDS | \n", "NaN | \n", "3 | \n", "['fabricate', 'document', 'dr', 'mann', 'appar... | \n", "
4 | \n", "It's going to be 42 here today and the hottest... | \n", "1_not_happening | \n", "FLICC | \n", "https://huggingface.co/datasets/fzanartu/FLICC... | \n", "en | \n", "hamburg_test3 | \n", "NaN | \n", "4 | \n", "['be', 'go', '42', 'today', 'hot', 'summer', '... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
6086 | \n", "There is that sense of the Senate that climate... | \n", "6_proponents_biased | \n", "Desmog | \n", "https://www.desmog.com/kevin-cramer/ | \n", "en | \n", "NaN | \n", "NaN | \n", "6431 | \n", "['sense', 'senate', 'climate', 'change', 'real... | \n", "
6087 | \n", "This is the biggest scientific hoax being perp... | \n", "2_not_human | \n", "Desmog | \n", "https://www.desmog.com/tad-murty/ | \n", "en | \n", "NaN | \n", "NaN | \n", "6432 | \n", "['big', 'scientific', 'hoax', 'perpetrate', 'h... | \n", "
6088 | \n", "A world with no tobacco might sound appealing ... | \n", "0_not_relevant | \n", "Desmog | \n", "https://www.desmog.com/drew-johnson/ | \n", "en | \n", "NaN | \n", "NaN | \n", "6433 | \n", "['world', 'tobacco', 'might', 'sound', 'appeal... | \n", "
6089 | \n", "The relationship between CO2 and temperature i... | \n", "5_science_unreliable | \n", "Desmog | \n", "https://www.desmog.com/junkscience-com/ | \n", "en | \n", "NaN | \n", "NaN | \n", "6435 | \n", "['relationship', 'co2', 'temperature', 'logari... | \n", "
6090 | \n", "Whatever is happening to the weather at the mo... | \n", "1_not_happening | \n", "Desmog | \n", "https://www.desmog.com/boris-johnson/ | \n", "en | \n", "NaN | \n", "NaN | \n", "6436 | \n", "['whatever', 'happen', 'weather', 'moment', 's... | \n", "
6091 rows × 9 columns
\n", "