diff --git "a/external/FinGPT/fingpt/FinGPT_Benchmark/data/prepare_data.ipynb" "b/external/FinGPT/fingpt/FinGPT_Benchmark/data/prepare_data.ipynb" new file mode 100644--- /dev/null +++ "b/external/FinGPT/fingpt/FinGPT_Benchmark/data/prepare_data.ipynb" @@ -0,0 +1,2844 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "id": "f4ff218d-f4eb-4e4b-9f4c-205351349fd2", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import re\n", + "import os\n", + "import random\n", + "import datasets\n", + "import pandas as pd\n", + "from glob import glob\n", + "from tqdm.notebook import tqdm\n", + "from datasets import load_dataset, load_from_disk, Dataset, DatasetDict\n", + "from transformers import AutoTokenizer, AutoConfig" + ] + }, + { + "cell_type": "markdown", + "id": "dc6368fa-4fcd-4336-a548-f37bda3a5d2f", + "metadata": { + "tags": [] + }, + "source": [ + "# Sentiment Datasets (following FinGPT v3)" + ] + }, + { + "cell_type": "markdown", + "id": "90c1b2b0-550c-47a7-8486-c7f5601bc0a1", + "metadata": {}, + "source": [ + "### 1. FPB" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0d79a0e4-f8f8-4e20-811e-42256b0362df", + "metadata": {}, + "outputs": [], + "source": [ + "dic = {\n", + " 0:\"negative\",\n", + " 1:'neutral',\n", + " 2:'positive',\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "85a3d5c8-6a06-40e5-8ae7-06d1fd93f479", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['input', 'output', 'instruction'],\n", + " num_rows: 3634\n", + "})" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fpb_datasets = load_dataset(\"financial_phrasebank\", \"sentences_50agree\")\n", + "fpb_datasets = load_from_disk('../data/financial_phrasebank-sentences_50agree/')\n", + "fpb_datasets = fpb_datasets[\"train\"]\n", + "fpb_datasets = fpb_datasets.to_pandas()\n", + "fpb_datasets.columns = [\"input\", \"output\"]\n", + "fpb_datasets[\"output\"] = fpb_datasets[\"output\"].apply(lambda x:dic[x])\n", + "fpb_datasets[\"instruction\"] = \"What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.\"\n", + "fpb_datasets = datasets.Dataset.from_pandas(fpb_datasets)\n", + "fpb_datasets = fpb_datasets.train_test_split(seed=42)['train']\n", + "fpb_datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8d8d28e4-f380-423b-8a1b-51bfcb0b7004", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['input', 'output', 'instruction'],\n", + " num_rows: 21804\n", + "})" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_dataset = datasets.concatenate_datasets([fpb_datasets]*6) # we want each data source have similar number of samples\n", + "train_dataset" + ] + }, + { + "cell_type": "markdown", + "id": "9097f1f3-598c-47d3-8b96-a08254da4c4e", + "metadata": {}, + "source": [ + "### 2. FiQA SA" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "55d618cc-9e05-4053-b354-e7b159c6906b", + "metadata": {}, + "outputs": [], + "source": [ + "def make_label(x):\n", + " if x < - 0.1:\n", + " return \"negative\"\n", + " elif -0.1 <= x < 0.1:\n", + " return \"neutral\"\n", + " else:\n", + " return \"positive\"\n", + "\n", + "def add_instructions(x):\n", + " if x == \"post\":\n", + " return \"What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.\"\n", + " else:\n", + " return \"What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4b81fb40-1275-4205-9d42-dbfc425dd4a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['input', 'output', 'instruction'],\n", + " num_rows: 938\n", + "})" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dataset = load_dataset('pauri32/fiqa-2018')\n", + "dataset = load_from_disk('../data/fiqa-2018/')\n", + "dataset = datasets.concatenate_datasets([dataset[\"train\"], dataset[\"validation\"] ,dataset[\"test\"] ])\n", + "dataset = dataset.to_pandas()\n", + "dataset[\"output\"] = dataset.sentiment_score.apply(make_label)\n", + "dataset[\"instruction\"] = dataset.format.apply(add_instructions)\n", + "dataset = dataset[['sentence', 'output', \"instruction\"]]\n", + "dataset.columns = [\"input\", \"output\", \"instruction\"]\n", + "dataset = datasets.Dataset.from_pandas(dataset)\n", + "dataset = dataset.train_test_split(0.226, seed=42)['train']\n", + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "80349c7a-3c0c-4c99-b2c3-f1b2dd377e64", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "19698\n" + ] + }, + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['input', 'output', 'instruction'],\n", + " num_rows: 41502\n", + "})" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp_dataset = datasets.concatenate_datasets([dataset]*21)\n", + "train_dataset = datasets.concatenate_datasets([train_dataset, tmp_dataset]) \n", + "print(tmp_dataset.num_rows)\n", + "train_dataset" + ] + }, + { + "cell_type": "markdown", + "id": "a4887301-4f6a-49b0-9e42-aa5bdcd19856", + "metadata": {}, + "source": [ + "### 3. TFNS" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e707ffbf-7f92-4b45-8570-a6d63bc64bc7", + "metadata": {}, + "outputs": [], + "source": [ + "dic = {\n", + " 0:\"negative\",\n", + " 1:'positive',\n", + " 2:'neutral',\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "41388d35-c6ae-4da8-b92d-13fb0b1db21e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['input', 'output', 'instruction'],\n", + " num_rows: 9543\n", + "})" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# social_media_dataset = load_dataset('zeroshot/twitter-financial-news-sentiment')\n", + "social_media_dataset = load_from_disk('../data/twitter-financial-news-sentiment')\n", + "social_media_dataset = social_media_dataset['train']\n", + "social_media_dataset = social_media_dataset.to_pandas()\n", + "social_media_dataset['label'] = social_media_dataset['label'].apply(lambda x:dic[x])\n", + "social_media_dataset['instruction'] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.'\n", + "social_media_dataset.columns = ['input', 'output', 'instruction']\n", + "social_media_dataset = datasets.Dataset.from_pandas(social_media_dataset)\n", + "social_media_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9f1ef9b7-af0c-407b-b270-7c4b2180036c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "19086\n" + ] + }, + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['input', 'output', 'instruction'],\n", + " num_rows: 60588\n", + "})" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp_dataset = datasets.concatenate_datasets([social_media_dataset]*2)\n", + "train_dataset = datasets.concatenate_datasets([train_dataset,tmp_dataset]) \n", + "print(tmp_dataset.num_rows)\n", + "train_dataset" + ] + }, + { + "cell_type": "markdown", + "id": "aee85f1d-7c0e-4642-adbd-f145adb23ef1", + "metadata": {}, + "source": [ + "### 4. NWGI" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "642fcaf0-b909-4562-a913-877557d829bb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['input', 'output', 'instruction'],\n", + " num_rows: 16184\n", + "})" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# finance_dataset = load_dataset('oliverwang15/news_with_gpt_instructions')\n", + "finance_dataset = load_from_disk('../data/news_with_gpt_instructions/')\n", + "finance_dataset = finance_dataset['train'].to_pandas()\n", + "finance_dataset['output'] = finance_dataset['label']\n", + "finance_dataset[\"input\"] = finance_dataset[\"news\"]\n", + "finance_dataset[\"instruction\"] = 'What is the sentiment of this news? Please choose an answer from {strong negative/moderately negative/mildly negative/neutral/mildly positive/moderately positive/strong positive}.'\n", + "finance_dataset = finance_dataset[['input', 'output', 'instruction']]\n", + "finance_dataset = datasets.Dataset.from_pandas(finance_dataset)\n", + "finance_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "98e24280-119d-4748-8c1d-709d7d3cf50d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(76772, 3)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_dataset = datasets.concatenate_datasets([train_dataset, finance_dataset])\n", + "all_dataset = train_dataset.shuffle(seed=42)\n", + "all_dataset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a8f4d2a8-c0ac-49ce-8933-cce0de14f99f", + "metadata": {}, + "outputs": [], + "source": [ + "# from huggingface_hub import notebook_login\n", + "# notebook_login()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4509a524-d694-4bcf-813c-4a28551bed39", + "metadata": {}, + "outputs": [], + "source": [ + "# all_dataset.push_to_hub(\"fingpt_chatglm2_sentiment_instruction_lora_ft_dataset\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b7774a58-e4e6-49ac-81fb-ef72dc46446e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "76772" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "21804 + 19698 + 19086 + 16184" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "1c740a64-d41d-43de-b20b-7826f01e9421", + "metadata": {}, + "outputs": [ + { + "data": { + "application/json": { + "ascii": false, + "bar_format": null, + "colour": null, + "elapsed": 0.004731893539428711, + "initial": 0, + "n": 0, + "ncols": null, + "nrows": null, + "postfix": null, + "prefix": "Saving the dataset (0/1 shards)", + "rate": null, + "total": 76772, + "unit": " examples", + "unit_divisor": 1000, + "unit_scale": false + }, + "application/vnd.jupyter.widget-view+json": { + "model_id": "24c8597611564c9a8f2aa8055b126bf7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Saving the dataset (0/1 shards): 0%| | 0/76772 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatesURLNewsPrice or NotPrice Direction UpPrice Direction ConstantPrice Direction DownPastPriceFuturePricePastNewsFutureNewsAsset ComparisionPrice Sentiment
028-01-2016http://www.marketwatch.com/story/april-gold-do...april gold down 20 cents to settle at $1,116.1...100110000negative
113-09-2017http://www.marketwatch.com/story/gold-prices-s...gold suffers third straight daily decline100110000negative
226-07-2016http://www.marketwatch.com/story/gold-futures-...Gold futures edge up after two-session decline110010000positive
328-02-2018https://www.metalsdaily.com/link/277199/dent-r...dent research : is gold's day in the sun comin...000000010none
406-09-2017http://www.marketwatch.com/story/gold-steadies...Gold snaps three-day rally as Trump, lawmakers...110010000positive
\n", + "" + ], + "text/plain": [ + " Dates URL \\\n", + "0 28-01-2016 http://www.marketwatch.com/story/april-gold-do... \n", + "1 13-09-2017 http://www.marketwatch.com/story/gold-prices-s... \n", + "2 26-07-2016 http://www.marketwatch.com/story/gold-futures-... \n", + "3 28-02-2018 https://www.metalsdaily.com/link/277199/dent-r... \n", + "4 06-09-2017 http://www.marketwatch.com/story/gold-steadies... \n", + "\n", + " News Price or Not \\\n", + "0 april gold down 20 cents to settle at $1,116.1... 1 \n", + "1 gold suffers third straight daily decline 1 \n", + "2 Gold futures edge up after two-session decline 1 \n", + "3 dent research : is gold's day in the sun comin... 0 \n", + "4 Gold snaps three-day rally as Trump, lawmakers... 1 \n", + "\n", + " Price Direction Up Price Direction Constant Price Direction Down \\\n", + "0 0 0 1 \n", + "1 0 0 1 \n", + "2 1 0 0 \n", + "3 0 0 0 \n", + "4 1 0 0 \n", + "\n", + " PastPrice FuturePrice PastNews FutureNews Asset Comparision \\\n", + "0 1 0 0 0 0 \n", + "1 1 0 0 0 0 \n", + "2 1 0 0 0 0 \n", + "3 0 0 0 1 0 \n", + "4 1 0 0 0 0 \n", + "\n", + " Price Sentiment \n", + "0 negative \n", + "1 negative \n", + "2 positive \n", + "3 none \n", + "4 positive " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('gold-dataset-sinha-khandait.csv')\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "2e848299-c202-4163-b46a-336af9b98495", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9129\n", + "2283\n" + ] + } + ], + "source": [ + "train_dataset, test_dataset = {}, {}\n", + "inputs, outputs, instructions = [], [], []\n", + "for index, row in df.iterrows():\n", + " \n", + " if index + 1 >= len(df) * 0.8 and not train_dataset:\n", + " train_dataset['input'] = inputs\n", + " train_dataset['output'] = outputs\n", + " train_dataset['instruction'] = instructions\n", + " inputs, outputs, instructions = [], [], []\n", + " \n", + " inputs.extend([row['News']] * 9)\n", + " # price or not\n", + " instructions.append('Does the news headline talk about price? Please choose an answer from {Yes/No}.')\n", + " outputs.append('Yes' if row['Price Direction Constant'] else 'No')\n", + " # price up\n", + " instructions.append('Does the news headline talk about price going up? Please choose an answer from {Yes/No}.')\n", + " outputs.append('Yes' if row['Price Direction Up'] else 'No')\n", + " # price stable\n", + " instructions.append('Does the news headline talk about price staying constant? Please choose an answer from {Yes/No}.')\n", + " outputs.append('Yes' if row['Price Direction Constant'] else 'No')\n", + " # price down\n", + " instructions.append('Does the news headline talk about price going down? Please choose an answer from {Yes/No}.')\n", + " outputs.append('Yes' if row['Price Direction Down'] else 'No')\n", + " # past price\n", + " instructions.append('Does the news headline talk about price in the past? Please choose an answer from {Yes/No}.')\n", + " outputs.append('Yes' if row['PastPrice'] else 'No')\n", + " # future price\n", + " instructions.append('Does the news headline talk about price in the future? Please choose an answer from {Yes/No}.')\n", + " outputs.append('Yes' if row['FuturePrice'] else 'No')\n", + " # past general\n", + " instructions.append('Does the news headline talk about a general event (apart from prices) in the past? Please choose an answer from {Yes/No}.')\n", + " outputs.append('Yes' if row['PastNews'] else 'No')\n", + " # future general\n", + " instructions.append('Does the news headline talk about a general event (apart from prices) in the future? Please choose an answer from {Yes/No}.')\n", + " outputs.append('Yes' if row['FutureNews'] else 'No')\n", + " # asset comparison\n", + " instructions.append('Does the news headline compare gold with any other asset? Please choose an answer from {Yes/No}.')\n", + " outputs.append('Yes' if row['Asset Comparision'] else 'No')\n", + " \n", + "test_dataset['input'] = inputs\n", + "test_dataset['output'] = outputs\n", + "test_dataset['instruction'] = instructions\n", + "\n", + "print(len(train_dataset['input']) // 9)\n", + "print(len(test_dataset['input']) // 9)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "6ae82176-fd34-43b9-a3ad-7c85bf35ee08", + "metadata": {}, + "outputs": [ + { + "data": { + "application/json": { + "ascii": false, + "bar_format": null, + "colour": null, + "elapsed": 0.004788875579833984, + "initial": 0, + "n": 0, + "ncols": null, + "nrows": null, + "postfix": null, + "prefix": "Saving the dataset (0/1 shards)", + "rate": null, + "total": 82161, + "unit": " examples", + "unit_divisor": 1000, + "unit_scale": false + }, + "application/vnd.jupyter.widget-view+json": { + "model_id": "670ca6e61e0141cfbe00f348caa2bdf5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Saving the dataset (0/1 shards): 0%| | 0/82161 [00:00', '-')\n", + " context = f'{pre_text} {table} {post_text}\\n'\n", + " questions, answers, turn_ind = annos['dialogue_break'], annos['exe_ans_list'], annos['turn_ind']\n", + " for i in range(turn_ind):\n", + " context += f'Question: {questions[i]}\\n'\n", + " context += f'Answer: {answers[i]}\\n'\n", + " context += f'Question: {questions[turn_ind]}\\n'\n", + " outputs.append(str(answers[turn_ind]))\n", + " instructions.append(instruction)\n", + " inputs.append(context)\n", + " \n", + " return Dataset.from_dict({\n", + " 'input': inputs,\n", + " 'output': outputs,\n", + " 'instruction': instructions\n", + " })\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "e817bc45-9825-4af6-bd34-94a6307c15fc", + "metadata": {}, + "outputs": [ + { + "data": { + "application/json": { + "ascii": false, + "bar_format": null, + "colour": null, + "elapsed": 0.004643678665161133, + "initial": 0, + "n": 0, + "ncols": null, + "nrows": null, + "postfix": null, + "prefix": "Saving the dataset (0/1 shards)", + "rate": null, + "total": 11104, + "unit": " examples", + "unit_divisor": 1000, + "unit_scale": false + }, + "application/vnd.jupyter.widget-view+json": { + "model_id": "4bebad1197d14912ba83461801e19b9a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Saving the dataset (0/1 shards): 0%| | 0/11104 [00:001-2002200120002net sales$ 5742$ 5363$ 79833cost of sales4139412858174gross margin$ 1603$ 1235$ 21665gross margin percentage28% ( 28 % )23% ( 23 % )27% ( 27 % ) .\\nQuestion: what was the total of net sales in 2001?\\nAnswer: 5363.0\\nQuestion: and what was that in 2000?\\n',\n", + " 'output': '7983.0',\n", + " 'instruction': 'Read the following texts and table with financial data from an S&P 500 earnings report carefully.Based on the question-answer history (if provided), answer the last question. The answer may require mathematical calculation based on the data provided.\\n'}" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "convfinqa_dataset['train'][9]" + ] + }, + { + "cell_type": "markdown", + "id": "44f2a7ea-b832-462f-8103-eca6ee4046bb", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "# FinEval" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "7f42f8eb-4174-46e7-b2fe-7c8a6487f084", + "metadata": {}, + "outputs": [ + { + "data": { + "application/json": { + "ascii": false, + "bar_format": null, + "colour": null, + "elapsed": 0.004954338073730469, + "initial": 0, + "n": 0, + "ncols": null, + "nrows": null, + "postfix": null, + "prefix": "Saving the dataset (0/1 shards)", + "rate": null, + "total": 1056, + "unit": " examples", + "unit_divisor": 1000, + "unit_scale": false + }, + "application/vnd.jupyter.widget-view+json": { + "model_id": "3784d06ad3f94b04a2e185d139aa9f33", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Saving the dataset (0/1 shards): 0%| | 0/1056 [00:00