{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"../../FinNLP\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "import shutil\n",
    "import pandas as pd\n",
    "from finnlp.data_engineering.data_cleaning import * "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Downloading sample data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_parquet_files(url_list, local_dir):\n",
    "    \"\"\"Download every URL in ``url_list`` into ``local_dir``.\n",
    "\n",
    "    Each file is saved under the last path segment of its URL. A URL that\n",
    "    does not answer with HTTP 200 is reported on stdout and skipped, so one\n",
    "    failed download does not abort the remaining ones.\n",
    "\n",
    "    Args:\n",
    "        url_list: Iterable of URLs to fetch.\n",
    "        local_dir: Target directory; created if missing as soon as there\n",
    "            is a URL to download.\n",
    "    \"\"\"\n",
    "    for url in url_list:\n",
    "        # exist_ok avoids the check-then-create race of os.path.exists().\n",
    "        os.makedirs(local_dir, exist_ok=True)\n",
    "        local_file = os.path.join(local_dir, url.split('/')[-1])\n",
    "\n",
    "        # The context manager releases the streamed connection even when\n",
    "        # the status check or the file write fails.\n",
    "        with requests.get(url, stream=True) as r:\n",
    "            if r.status_code != 200:\n",
    "                print('download failed: ', url)\n",
    "                continue\n",
    "            # Let urllib3 undo gzip/deflate content encoding while copying.\n",
    "            r.raw.decode_content = True\n",
    "            with open(local_file, 'wb') as f:\n",
    "                shutil.copyfileobj(r.raw, f)\n",
    "\n",
    "\n",
    "def web_data_prepare(name, config=None):\n",
    "    \"\"\"Fetch the parquet export of a Hugging Face dataset as two DataFrames.\n",
    "\n",
    "    The datasets-server ``/parquet`` endpoint is queried for ``name``. The\n",
    "    ``train`` files of the selected configuration are downloaded into\n",
    "    ``train_dataset`` and the ``validation`` files into ``test_dataset``;\n",
    "    each directory is then loaded with pandas.\n",
    "\n",
    "    Args:\n",
    "        name: Dataset identifier sent to the endpoint.\n",
    "        config: Configuration whose files are selected. Defaults to\n",
    "            ``name``, i.e. the configuration that was previously hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(train_dataset, test_dataset)`` of pandas DataFrames.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the endpoint answers with an error status.\n",
    "    \"\"\"\n",
    "    if config is None:\n",
    "        config = name\n",
    "\n",
    "    # Passing the name through ``params`` lets requests URL-encode it.\n",
    "    r = requests.get(\n",
    "        \"https://datasets-server.huggingface.co/parquet\",\n",
    "        params={'dataset': name},\n",
    "    )\n",
    "    # Fail with the HTTP error instead of a KeyError on 'parquet_files'.\n",
    "    r.raise_for_status()\n",
    "    parquet_files = r.json()['parquet_files']\n",
    "\n",
    "    def split_urls(split):\n",
    "        # URLs of the selected configuration that belong to ``split``.\n",
    "        return [f['url'] for f in parquet_files\n",
    "                if f['config'] == config and f['split'] == split]\n",
    "\n",
    "    download_parquet_files(split_urls('train'), 'train_dataset')\n",
    "    download_parquet_files(split_urls('validation'), 'test_dataset')\n",
    "\n",
    "    # NOTE(review): the whole directory is handed to read_parquet, so files\n",
    "    # left over from an earlier call are presumably loaded too - confirm.\n",
    "    train_dataset = pd.read_parquet('./train_dataset', engine='pyarrow')\n",
    "    test_dataset = pd.read_parquet('./test_dataset', engine='pyarrow')\n",
    "    return train_dataset, test_dataset"
   ]
  },
  { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | text | \n", "date | \n", "gender | \n", "age | \n", "horoscope | \n", "job | \n", "
---|---|---|---|---|---|---|
395587 | \n", "it has just hit me that in a matter of a few s... | \n", "04,August,2004 | \n", "female | \n", "27 | \n", "Virgo | \n", "indUnk | \n", "
275435 | \n", "Ok, Dear Dopugie/ Ben/ Matt/ anyone who can he... | \n", "27,June,2004 | \n", "male | \n", "14 | \n", "Sagittarius | \n", "Student | \n", "
634637 | \n", "ooyeah/Happy Roctober = Miriam? Anyway, just ... | \n", "03,August,2004 | \n", "female | \n", "24 | \n", "Libra | \n", "Arts | \n", "
264675 | \n", "Election season is in the air! And I know that... | \n", "19,February,2004 | \n", "male | \n", "24 | \n", "Gemini | \n", "indUnk | \n", "
628907 | \n", "His face bore the the signs of the many battle... | \n", "03,February,2004 | \n", "male | \n", "24 | \n", "Libra | \n", "indUnk | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
41392 | \n", "Everything You Wanted to Know About Oscillatin... | \n", "04,October,2003 | \n", "female | \n", "40 | \n", "Gemini | \n", "Law | \n", "
71956 | \n", "Sick Chickens March has not been a good mont... | \n", "08,August,2004 | \n", "male | \n", "26 | \n", "Capricorn | \n", "Communications-Media | \n", "
21415 | \n", "Gunshot as you are more likely to die quickly ... | \n", "26,March,2003 | \n", "male | \n", "17 | \n", "Taurus | \n", "Technology | \n", "
255686 | \n", "Well I am sad to report that my Uncle has pass... | \n", "07,March,2004 | \n", "female | \n", "27 | \n", "Capricorn | \n", "indUnk | \n", "
401130 | \n", "urlLink A picture from the 'Precious Moment... | \n", "10,June,2004 | \n", "male | \n", "33 | \n", "Gemini | \n", "Technology | \n", "
6898 rows × 6 columns
\n", "