Commit
·
da00d1b
1
Parent(s):
38ff862
Update README.md to include detailed model information for FinSight AI, a financial advisory chatbot. Added sections on model details, usage examples, training details, limitations, and future improvements. Changed license from Apache-2.0 to MIT and updated language and tags for better categorization.
Browse files- LICENSE +21 -0
- README.md +234 -3
- adapter_config.json +31 -0
- adapter_model.safetensors +3 -0
- all_results.json +12 -0
- eval_results.json +7 -0
- inference.py +212 -0
- merges.txt +0 -0
- requirements.txt +11 -0
- special_tokens_map.json +34 -0
- tokenizer.json +0 -0
- tokenizer_config.json +155 -0
- train_results.json +8 -0
- trainer_state.json +326 -0
- training_args.bin +3 -0
- vocab.json +0 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2025 Zahemen
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,3 +1,234 @@
|
|
1 |
-
---
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language:
|
3 |
+
- en
|
4 |
+
license: mit
|
5 |
+
library_name: transformers
|
6 |
+
tags:
|
7 |
+
- financial-analysis
|
8 |
+
- conversational
|
9 |
+
- finance
|
10 |
+
- qlora
|
11 |
+
- financial-advice
|
12 |
+
- text-generation
|
13 |
+
pipeline_tag: text-generation
|
14 |
+
model-index:
|
15 |
+
- name: FinSight AI
|
16 |
+
results:
|
17 |
+
- task:
|
18 |
+
type: text-generation
|
19 |
+
name: Financial Advisory Generation
|
20 |
+
dataset:
|
21 |
+
type: custom
|
22 |
+
name: Financial Conversations
|
23 |
+
metrics:
|
24 |
+
- type: rouge1
|
25 |
+
value: 12.57%
|
26 |
+
name: ROUGE-1 Improvement
|
27 |
+
- type: rouge2
|
28 |
+
value: 79.48%
|
29 |
+
name: ROUGE-2 Improvement
|
30 |
+
- type: rougeL
|
31 |
+
value: 24.00%
|
32 |
+
name: ROUGE-L Improvement
|
33 |
+
- type: bleu
|
34 |
+
value: 135.36%
|
35 |
+
name: BLEU Improvement
|
36 |
+
base_model: HuggingFaceTB/SmolLM2-1.7B-Instruct
|
37 |
+
---
|
38 |
+
|
39 |
+
# FinSight AI - Financial Advisory Chatbot
|
40 |
+
|
41 |
+
A fine-tuned version of SmolLM2-1.7B optimized for financial advice and discussion.
|
42 |
+
|
43 |
+
<div align="center">
|
44 |
+
<a href="https://pytorch.org/" target="_blank">
|
45 |
+
<img src="https://img.shields.io/badge/PyTorch-EE4C2C?style=for-the-badge&logo=pytorch&logoColor=white" alt="PyTorch" />
|
46 |
+
</a>
|
47 |
+
<a href="https://huggingface.co/transformers/" target="_blank">
|
48 |
+
<img src="https://img.shields.io/badge/🤗%20Transformers-FFAE33?style=for-the-badge&logoColor=white" alt="Transformers" />
|
49 |
+
</a>
|
50 |
+
<a href="https://huggingface.co/" target="_blank">
|
51 |
+
<img src="https://img.shields.io/badge/🤗%20Hugging%20Face-0050C5?style=for-the-badge&logoColor=white" alt="Hugging Face" />
|
52 |
+
</a>
|
53 |
+
<a href="https://github.com/microsoft/LoRA" target="_blank">
|
54 |
+
<img src="https://img.shields.io/badge/LoRA-2088FF?style=for-the-badge&logo=github&logoColor=white" alt="LoRA" />
|
55 |
+
</a>
|
56 |
+
<a href="https://github.com/TimDettmers/bitsandbytes" target="_blank">
|
57 |
+
<img src="https://img.shields.io/badge/BitsAndBytes-4D4D4D?style=for-the-badge&logo=github&logoColor=white" alt="BitsAndBytes" />
|
58 |
+
</a>
|
59 |
+
</div>
|
60 |
+
|
61 |
+
|
62 |
+
<div align="center">
|
63 |
+
<h4><a href="https://github.com/zahemen9900/Datasets-for-Finsight/blob/97d7cacfff62e7b6099ef3bb0af9cf3d044a5b35/metrics/model_paper.md" target="_blank">📄 Read Model Paper 📄</a></h4>
|
64 |
+
</div>
|
65 |
+
|
66 |
+
## Model Details
|
67 |
+
|
68 |
+
- **Base Model**: HuggingFaceTB/SmolLM2-1.7B-Instruct
|
69 |
+
- **Task**: Financial Advisory and Discussion
|
70 |
+
- **Training Data**: Curated dataset of ~11,000 financial conversations (~16.5M tokens)
|
71 |
+
- **Training Method**: QLoRA (4-bit quantization with LoRA)
|
72 |
+
- **Language**: English
|
73 |
+
- **License**: MIT
|
74 |
+
|
75 |
+
## Model Description
|
76 |
+
|
77 |
+
FinSight AI is a specialized financial advisory assistant built by fine-tuning SmolLM2-1.7B-Instruct using QLoRA (Quantized Low-Rank Adaptation). The model has been trained on a comprehensive dataset of financial conversations to provide accurate, concise, and helpful information across various financial domains including personal finance, investing, market analysis, and financial planning.
|
78 |
+
|
79 |
+
Our evaluation demonstrates significant performance improvements across all standard NLP metrics **(ROUGE-1, ROUGE-2, ROUGE-L & BLEU)**, showcasing the effectiveness of our domain-specific training approach. The model exhibits enhanced capabilities with richer financial terminology usage, more precise responses, improved handling of numerical data, and greater technical accuracy - all while maintaining a compact, resource-efficient architecture suitable for deployment on consumer hardware.
|
80 |
+
|
81 |
+
## Usage
|
82 |
+
|
83 |
+
```python
|
84 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from peft import PeftModel

# For 4-bit quantized inference (recommended)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# First load the base model with quantization
base_model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    quantization_config=bnb_config,
    device_map="auto"
)

# Then load the adapter weights (LoRA)
model = PeftModel.from_pretrained(base_model, "zahemen9900/finsight-ai")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

# Example usage
prompt = "What's a good strategy for long-term investing?"

# The model is chat-tuned, so wrap the question in the tokenizer's chat template
# (the same formatting inference.py applies) instead of feeding raw text.
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

# Pass the whole encoding as keyword arguments so the attention mask is included;
# pad and eos are the same token, so the mask cannot be inferred from input_ids.
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

# Decode only the newly generated tokens so the prompt is not echoed back
response = tokenizer.decode(
    outputs[0][inputs.input_ids.shape[1]:],
    skip_special_tokens=True
)
print(response)
|
119 |
+
```
|
120 |
+
|
121 |
+
### For streaming options:
|
122 |
+
|
123 |
+
```python
|
124 |
+
from transformers import TextIteratorStreamer
import threading

# Setup model and tokenizer (same as above)

prompt = "What's a good strategy for long-term investing?"

# Format the question with the chat template, as the model is chat-tuned
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

# Create a streamer
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

# Generate in a separate thread
generation_kwargs = {
    "input_ids": inputs.input_ids,
    # pad and eos are the same token, so pass the mask explicitly
    "attention_mask": inputs.attention_mask,
    "max_new_tokens": 512,
    "temperature": 0.7,
    "top_p": 0.95,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "streamer": streamer
}

thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

# Iterate over the generated text
print("Response: ", end="")
for text in streamer:
    print(text, end="", flush=True)
print()

# Wait for the generation thread to finish before continuing
thread.join()
|
152 |
+
|
153 |
+
```
|
154 |
+
|
155 |
+
## Example Interactions
|
156 |
+
|
157 |
+
**Query**: "What is dollar-cost averaging?"
|
158 |
+
|
159 |
+
**Response**: "Dollar-cost averaging (DCA) is an investment strategy in which an investor divides the total amount to be invested into equal parts and invests those parts at regular intervals, regardless of the asset's price. This approach aims to reduce the impact of market volatility by spreading out the investment over time. By purchasing more shares when prices are low and fewer shares when prices are high, DCA can potentially lower the average cost per share over the long term."
|
160 |
+
|
161 |
+
**Query**: "How do I start investing with a small budget?"
|
162 |
+
|
163 |
+
**Response**: "Starting to invest with a small budget is definitely possible. Here are some steps to get you started:
|
164 |
+
|
165 |
+
1. **Set clear goals**: Determine what you're investing for and your time horizon.
|
166 |
+
|
167 |
+
2. **Build an emergency fund first**: Before investing, ensure you have 3-6 months of living expenses saved.
|
168 |
+
|
169 |
+
3. **Use micro-investing apps**: Platforms like Acorns, Stash, or Robinhood allow you to start with very small amounts.
|
170 |
+
|
171 |
+
4. **Consider index ETFs**: These offer instant diversification and typically have low expense ratios.
|
172 |
+
|
173 |
+
5. **Utilize employer retirement plans**: If available, contribute to a 401(k), especially if your employer offers matching contributions.
|
174 |
+
|
175 |
+
6. **Implement dollar-cost averaging**: Invest fixed amounts regularly rather than trying to time the market.
|
176 |
+
|
177 |
+
7. **Reinvest dividends**: This allows your investments to compound over time.
|
178 |
+
|
179 |
+
8. **Be consistent**: Small regular investments can grow significantly over time due to compound interest.
|
180 |
+
|
181 |
+
Remember, the most important factor is getting started early and staying consistent, rather than the initial amount you invest."
|
182 |
+
|
183 |
+
## Training Details
|
184 |
+
|
185 |
+
The model was trained using the following configuration:
|
186 |
+
|
187 |
+
- **QLoRA Parameters**:
|
188 |
+
- Rank (r): 64
|
189 |
+
- Alpha: 16
|
190 |
+
- Target modules: Query, Key, Value, and Output projections, plus MLP layers (gate, up, and down projections)
|
191 |
+
- 4-bit NF4 quantization with double quantization
|
192 |
+
|
193 |
+
- **Training Hyperparameters**:
|
194 |
+
- Learning rate: 2e-4
|
195 |
+
- Epochs: 2
|
196 |
+
- Batch size: 2 (with gradient accumulation steps of 4)
|
197 |
+
- Weight decay: 0.05
|
198 |
+
- Scheduler: Cosine with restarts
|
199 |
+
- Warmup ratio: 0.15
|
200 |
+
|
201 |
+
- **Hardware**: Consumer-grade NVIDIA RTX 3050 GPU with 6GB VRAM
|
202 |
+
|
203 |
+
## Limitations
|
204 |
+
|
205 |
+
FinSight AI has the following known limitations:
|
206 |
+
|
207 |
+
- **Information Currency**: Financial data and knowledge within the model is limited to the training data cutoff date. Market conditions, regulations, and financial instruments may have changed since then.
|
208 |
+
- **No Real-time Information**: The model operates without internet connectivity and cannot access current market data, breaking news, or recent economic developments.
|
209 |
+
- **Not Financial Advice**: Responses should not be considered personalized financial advice. The model cannot account for individual financial situations, risk tolerances, or specific circumstances required for proper financial planning.
|
210 |
+
- **Language Limitations**: While optimized for English financial terminology, the model may have reduced performance with non-English financial terms or concepts specific to regional markets.
|
211 |
+
- **Regulatory Compliance**: The model is not updated with the latest financial regulations across different jurisdictions and cannot ensure compliance with local financial laws.
|
212 |
+
- **Complexity Handling**: May struggle with highly complex or niche financial scenarios that were underrepresented in the training data.
|
213 |
+
|
214 |
+
## Future Improvements
|
215 |
+
|
216 |
+
- **Retrieval Augmented Generation (RAG)**: Implementing RAG would allow the model to reference current financial data, market statistics, and regulatory information before generating responses, significantly improving accuracy and relevance.
|
217 |
+
- **Domain-Specific Fine-tuning**: Additional training on specialized financial domains like cryptocurrency, derivatives trading, and international tax regulations.
|
218 |
+
- **Multilingual Support**: Expanding capabilities to handle financial terminology and concepts across multiple languages and markets.
|
219 |
+
- **Personalization Framework**: Developing mechanisms to better contextualize responses based on stated user preferences while maintaining privacy.
|
220 |
+
|
221 |
+
## Citation
|
222 |
+
|
223 |
+
If you use FinSight AI in your research, please cite:
|
224 |
+
```bibtex
|
225 |
+
|
226 |
+
@misc{FinSightAI2025,
|
227 |
+
author = {Zahemen, FinsightAI Team},
|
228 |
+
title = {FinSight AI: Enhancing Financial Domain Performance of Small Language Models Through QLoRA Fine-tuning},
|
229 |
+
year = {2025},
|
230 |
+
publisher = {GitHub},
|
231 |
+
journal = {GitHub repository},
|
232 |
+
howpublished = {\url{https://github.com/zahemen9900/FinsightAI}}
|
233 |
+
}
|
234 |
+
```
|
adapter_config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": "gaussian",
|
9 |
+
"layers_pattern": null,
|
10 |
+
"layers_to_transform": null,
|
11 |
+
"lora_alpha": 16,
|
12 |
+
"lora_dropout": 0.05,
|
13 |
+
"modules_to_save": null,
|
14 |
+
"peft_type": "LORA",
|
15 |
+
"r": 64,
|
16 |
+
"rank_pattern": {},
|
17 |
+
"revision": null,
|
18 |
+
"target_modules": [
|
19 |
+
"v_proj",
|
20 |
+
"mixer_mlp",
|
21 |
+
"gate_proj",
|
22 |
+
"mixer_self_attention",
|
23 |
+
"up_proj",
|
24 |
+
"k_proj",
|
25 |
+
"mixer_cross_attention",
|
26 |
+
"q_proj",
|
27 |
+
"o_proj",
|
28 |
+
"down_proj"
|
29 |
+
],
|
30 |
+
"task_type": "CAUSAL_LM"
|
31 |
+
}
|
adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3fa488100b3b6a79d43bc6742f1938de7d2d14ad50dfc6b831faa2beb3056240
|
3 |
+
size 289452128
|
all_results.json
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 1.4679393049437102,
|
3 |
+
"eval_loss": 1.5792005062103271,
|
4 |
+
"eval_runtime": 1884.1302,
|
5 |
+
"eval_samples_per_second": 1.446,
|
6 |
+
"eval_steps_per_second": 0.723,
|
7 |
+
"total_flos": 1.105565365842985e+17,
|
8 |
+
"train_loss": 1.763797264099121,
|
9 |
+
"train_runtime": 30390.5778,
|
10 |
+
"train_samples_per_second": 0.395,
|
11 |
+
"train_steps_per_second": 0.049
|
12 |
+
}
|
eval_results.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 1.4679393049437102,
|
3 |
+
"eval_loss": 1.5792005062103271,
|
4 |
+
"eval_runtime": 1884.1302,
|
5 |
+
"eval_samples_per_second": 1.446,
|
6 |
+
"eval_steps_per_second": 0.723
|
7 |
+
}
|
inference.py
ADDED
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
FinSight AI - Inference script for financial advisory chatbot
|
4 |
+
This script provides a simple way to interact with the model via command line
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import argparse
|
9 |
+
import torch
|
10 |
+
from typing import List, Dict
|
11 |
+
from transformers import (
|
12 |
+
AutoModelForCausalLM,
|
13 |
+
AutoTokenizer,
|
14 |
+
TextStreamer,
|
15 |
+
BitsAndBytesConfig
|
16 |
+
)
|
17 |
+
|
18 |
+
class FinancialAdvisor:
    """Command-line chat wrapper around the FinSight AI causal language model.

    Loads a tokenizer and model, keeps a rolling conversation history, and
    generates replies either as a single call or in an interactive loop.
    """

    def __init__(
        self,
        model_id: str = "zahemen9900/finsight-ai",
        use_4bit: bool = True,
        device: str = None
    ):
        """Load the tokenizer and model and initialise the chat state.

        Args:
            model_id: Model ID or path passed to ``from_pretrained``.
            use_4bit: Request 4-bit NF4 quantization; only honoured on CUDA.
            device: Target device string. Defaults to ``"cuda"`` when
                available, otherwise ``"cpu"``.
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Treat indexed devices such as "cuda:0" as CUDA as well, so they get
        # the GPU loading path instead of silently staying on the CPU.
        on_cuda = str(self.device).startswith("cuda")

        # Configure quantization if requested and available
        if use_4bit and on_cuda:
            print("Loading model in 4-bit quantization mode")
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16
            )
        else:
            print("Loading model in standard mode")
            bnb_config = None

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto" if on_cuda else None,
            torch_dtype=torch.bfloat16 if on_cuda else torch.float32,
        )

        # Without a device_map the model is loaded on the CPU; move it to the
        # requested non-CUDA device explicitly (a no-op for "cpu").
        if not on_cuda:
            self.model = self.model.to(self.device)

        self.model.eval()
        # Alternating user/assistant message dicts, oldest first
        self.conversation_history = []
        self.system_message = {
            "role": "system",
            "content": (
                "You are FinSight AI, a helpful and knowledgeable financial assistant. "
                "You can provide information and guidance on financial topics, market trends, investment strategies, "
                "and personal finance management. Always strive to be accurate, informative, and helpful. "
                "Remember that you cannot provide personalized financial advice that would require knowing a person's "
                "complete financial situation or future market movements."
            )
        }

    def generate_response(
        self,
        prompt: str,
        temperature: float = 0.7,
        max_new_tokens: int = 512,
        stream: bool = True
    ) -> str:
        """Generate a reply to ``prompt`` and record the exchange in history.

        Args:
            prompt: The user's message.
            temperature: Sampling temperature forwarded to ``generate``.
            max_new_tokens: Upper bound on the number of generated tokens.
            stream: When True, tokens are printed as they are produced via a
                ``TextStreamer``; otherwise the full reply is printed once.

        Returns:
            The decoded assistant reply (new tokens only, prompt excluded).
        """
        # Manage conversation history (keep last 5 exchanges = 10 messages)
        if len(self.conversation_history) > 10:
            self.conversation_history = self.conversation_history[-10:]

        # Create messages with history
        messages = [self.system_message] + self.conversation_history
        messages.append({"role": "user", "content": prompt})

        # Format prompt using chat template
        formatted_prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Encode the input and place it on the device the model actually
        # lives on (with device_map="auto" that may differ from self.device).
        inputs = self.tokenizer(
            formatted_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=4096
        ).to(self.model.device)

        # Setup streamer if requested
        streamer = TextStreamer(
            self.tokenizer,
            skip_prompt=True,
            skip_special_tokens=True
        ) if stream else None

        # Generate response
        with torch.inference_mode():
            output_ids = self.model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_p=0.95,
                streamer=streamer,
                pad_token_id=self.tokenizer.eos_token_id,
                repetition_penalty=1.1
            )

        # Always decode the newly generated tokens, even when they were
        # already streamed to the console, so the real reply (not a
        # placeholder) is kept in the history used for follow-up turns.
        response = self.tokenizer.decode(
            output_ids[0][inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        )
        if not stream:
            print("\nAssistant:", response)

        # Update conversation history
        self.conversation_history.append({"role": "user", "content": prompt})
        self.conversation_history.append({"role": "assistant", "content": response})

        return response

    def start_chat_loop(self):
        """Run an interactive chat session until the user quits.

        ``quit``, ``exit`` or ``q`` ends the session, ``clear`` resets the
        conversation history, and blank input is ignored.
        """
        print("\nWelcome to FinSight AI - Your Financial Advisory Assistant!")
        print("Type 'quit', 'exit', or press Ctrl+C to end the conversation.\n")

        while True:
            try:
                user_input = input("\nYou: ").strip()

                # Nothing to answer; prompt again instead of querying the model
                if not user_input:
                    continue

                if user_input.lower() in ["quit", "exit", "q"]:
                    break

                if user_input.lower() == "clear":
                    self.conversation_history = []
                    print("Conversation history cleared.")
                    continue

                print("\nAssistant: ", end="", flush=True)
                self.generate_response(user_input)

            except (KeyboardInterrupt, EOFError):
                # EOFError: stdin was closed (Ctrl+D or piped input ended).
                # Without this it would hit the generic handler below and
                # loop forever, since input() keeps raising.
                print("\nExiting chat...")
                break
            except Exception as e:
                print(f"\nError: {e}")
                continue

        print("\nThank you for using FinSight AI. Goodbye!")
|
161 |
+
|
162 |
+
def main():
    """Parse command-line options and run single-query or interactive mode."""
    cli = argparse.ArgumentParser(description="FinSight AI Inference Script")

    # (flag, add_argument keyword arguments), registered in display order
    option_specs = (
        ("--model_id", {
            "type": str,
            "default": "zahemen9900/finsight-ai",
            "help": "Model ID or path to load",
        }),
        ("--no_quantize", {
            "action": "store_true",
            "help": "Disable 4-bit quantization (uses more memory)",
        }),
        ("--query", {
            "type": str,
            "help": "Single query mode: provide a question and get one response",
        }),
        ("--temperature", {
            "type": float,
            "default": 0.7,
            "help": "Sampling temperature (higher = more random)",
        }),
        ("--max_tokens", {
            "type": int,
            "default": 512,
            "help": "Maximum number of new tokens to generate",
        }),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    advisor = FinancialAdvisor(
        model_id=opts.model_id,
        use_4bit=not opts.no_quantize
    )

    # Interactive chat mode when no (or an empty) --query was given
    if not opts.query:
        advisor.start_chat_loop()
        return

    # Single query mode
    advisor.generate_response(
        opts.query,
        temperature=opts.temperature,
        max_new_tokens=opts.max_tokens
    )


if __name__ == "__main__":
    main()
|
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch>=2.0.0
|
2 |
+
transformers>=4.34.0
|
3 |
+
accelerate>=0.30.0
|
4 |
+
peft>=0.6.2
|
5 |
+
bitsandbytes>=0.40.0
|
6 |
+
sentencepiece
|
7 |
+
protobuf
|
8 |
+
einops
|
9 |
+
regex
|
10 |
+
numpy
|
11 |
+
tqdm
|
special_tokens_map.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|im_start|>",
|
4 |
+
"<|im_end|>"
|
5 |
+
],
|
6 |
+
"bos_token": {
|
7 |
+
"content": "<|im_start|>",
|
8 |
+
"lstrip": false,
|
9 |
+
"normalized": false,
|
10 |
+
"rstrip": false,
|
11 |
+
"single_word": false
|
12 |
+
},
|
13 |
+
"eos_token": {
|
14 |
+
"content": "<|im_end|>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false
|
19 |
+
},
|
20 |
+
"pad_token": {
|
21 |
+
"content": "<|im_end|>",
|
22 |
+
"lstrip": false,
|
23 |
+
"normalized": false,
|
24 |
+
"rstrip": false,
|
25 |
+
"single_word": false
|
26 |
+
},
|
27 |
+
"unk_token": {
|
28 |
+
"content": "<|endoftext|>",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false
|
33 |
+
}
|
34 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"added_tokens_decoder": {
|
4 |
+
"0": {
|
5 |
+
"content": "<|endoftext|>",
|
6 |
+
"lstrip": false,
|
7 |
+
"normalized": false,
|
8 |
+
"rstrip": false,
|
9 |
+
"single_word": false,
|
10 |
+
"special": true
|
11 |
+
},
|
12 |
+
"1": {
|
13 |
+
"content": "<|im_start|>",
|
14 |
+
"lstrip": false,
|
15 |
+
"normalized": false,
|
16 |
+
"rstrip": false,
|
17 |
+
"single_word": false,
|
18 |
+
"special": true
|
19 |
+
},
|
20 |
+
"2": {
|
21 |
+
"content": "<|im_end|>",
|
22 |
+
"lstrip": false,
|
23 |
+
"normalized": false,
|
24 |
+
"rstrip": false,
|
25 |
+
"single_word": false,
|
26 |
+
"special": true
|
27 |
+
},
|
28 |
+
"3": {
|
29 |
+
"content": "<repo_name>",
|
30 |
+
"lstrip": false,
|
31 |
+
"normalized": false,
|
32 |
+
"rstrip": false,
|
33 |
+
"single_word": false,
|
34 |
+
"special": true
|
35 |
+
},
|
36 |
+
"4": {
|
37 |
+
"content": "<reponame>",
|
38 |
+
"lstrip": false,
|
39 |
+
"normalized": false,
|
40 |
+
"rstrip": false,
|
41 |
+
"single_word": false,
|
42 |
+
"special": true
|
43 |
+
},
|
44 |
+
"5": {
|
45 |
+
"content": "<file_sep>",
|
46 |
+
"lstrip": false,
|
47 |
+
"normalized": false,
|
48 |
+
"rstrip": false,
|
49 |
+
"single_word": false,
|
50 |
+
"special": true
|
51 |
+
},
|
52 |
+
"6": {
|
53 |
+
"content": "<filename>",
|
54 |
+
"lstrip": false,
|
55 |
+
"normalized": false,
|
56 |
+
"rstrip": false,
|
57 |
+
"single_word": false,
|
58 |
+
"special": true
|
59 |
+
},
|
60 |
+
"7": {
|
61 |
+
"content": "<gh_stars>",
|
62 |
+
"lstrip": false,
|
63 |
+
"normalized": false,
|
64 |
+
"rstrip": false,
|
65 |
+
"single_word": false,
|
66 |
+
"special": true
|
67 |
+
},
|
68 |
+
"8": {
|
69 |
+
"content": "<issue_start>",
|
70 |
+
"lstrip": false,
|
71 |
+
"normalized": false,
|
72 |
+
"rstrip": false,
|
73 |
+
"single_word": false,
|
74 |
+
"special": true
|
75 |
+
},
|
76 |
+
"9": {
|
77 |
+
"content": "<issue_comment>",
|
78 |
+
"lstrip": false,
|
79 |
+
"normalized": false,
|
80 |
+
"rstrip": false,
|
81 |
+
"single_word": false,
|
82 |
+
"special": true
|
83 |
+
},
|
84 |
+
"10": {
|
85 |
+
"content": "<issue_closed>",
|
86 |
+
"lstrip": false,
|
87 |
+
"normalized": false,
|
88 |
+
"rstrip": false,
|
89 |
+
"single_word": false,
|
90 |
+
"special": true
|
91 |
+
},
|
92 |
+
"11": {
|
93 |
+
"content": "<jupyter_start>",
|
94 |
+
"lstrip": false,
|
95 |
+
"normalized": false,
|
96 |
+
"rstrip": false,
|
97 |
+
"single_word": false,
|
98 |
+
"special": true
|
99 |
+
},
|
100 |
+
"12": {
|
101 |
+
"content": "<jupyter_text>",
|
102 |
+
"lstrip": false,
|
103 |
+
"normalized": false,
|
104 |
+
"rstrip": false,
|
105 |
+
"single_word": false,
|
106 |
+
"special": true
|
107 |
+
},
|
108 |
+
"13": {
|
109 |
+
"content": "<jupyter_code>",
|
110 |
+
"lstrip": false,
|
111 |
+
"normalized": false,
|
112 |
+
"rstrip": false,
|
113 |
+
"single_word": false,
|
114 |
+
"special": true
|
115 |
+
},
|
116 |
+
"14": {
|
117 |
+
"content": "<jupyter_output>",
|
118 |
+
"lstrip": false,
|
119 |
+
"normalized": false,
|
120 |
+
"rstrip": false,
|
121 |
+
"single_word": false,
|
122 |
+
"special": true
|
123 |
+
},
|
124 |
+
"15": {
|
125 |
+
"content": "<jupyter_script>",
|
126 |
+
"lstrip": false,
|
127 |
+
"normalized": false,
|
128 |
+
"rstrip": false,
|
129 |
+
"single_word": false,
|
130 |
+
"special": true
|
131 |
+
},
|
132 |
+
"16": {
|
133 |
+
"content": "<empty_output>",
|
134 |
+
"lstrip": false,
|
135 |
+
"normalized": false,
|
136 |
+
"rstrip": false,
|
137 |
+
"single_word": false,
|
138 |
+
"special": true
|
139 |
+
}
|
140 |
+
},
|
141 |
+
"additional_special_tokens": [
|
142 |
+
"<|im_start|>",
|
143 |
+
"<|im_end|>"
|
144 |
+
],
|
145 |
+
"bos_token": "<|im_start|>",
|
146 |
+
"chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
147 |
+
"clean_up_tokenization_spaces": false,
|
148 |
+
"eos_token": "<|im_end|>",
|
149 |
+
"extra_special_tokens": {},
|
150 |
+
"model_max_length": 8192,
|
151 |
+
"pad_token": "<|im_end|>",
|
152 |
+
"tokenizer_class": "GPT2Tokenizer",
|
153 |
+
"unk_token": "<|endoftext|>",
|
154 |
+
"vocab_size": 49152
|
155 |
+
}
|
train_results.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 1.4679393049437102,
|
3 |
+
"total_flos": 1.105565365842985e+17,
|
4 |
+
"train_loss": 1.763797264099121,
|
5 |
+
"train_runtime": 30390.5778,
|
6 |
+
"train_samples_per_second": 0.395,
|
7 |
+
"train_steps_per_second": 0.049
|
8 |
+
}
|
trainer_state.json
ADDED
@@ -0,0 +1,326 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 1.5792005062103271,
|
3 |
+
"best_model_checkpoint": "qlora_output/checkpoint-1200",
|
4 |
+
"epoch": 1.4679393049437102,
|
5 |
+
"eval_steps": 600,
|
6 |
+
"global_step": 1500,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.03915810083210964,
|
13 |
+
"grad_norm": 0.07319402694702148,
|
14 |
+
"learning_rate": 3.555555555555556e-05,
|
15 |
+
"loss": 2.4428,
|
16 |
+
"step": 40
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.07831620166421928,
|
20 |
+
"grad_norm": 0.04330237954854965,
|
21 |
+
"learning_rate": 7.111111111111112e-05,
|
22 |
+
"loss": 2.268,
|
23 |
+
"step": 80
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.11747430249632893,
|
27 |
+
"grad_norm": 0.05867455527186394,
|
28 |
+
"learning_rate": 0.00010666666666666667,
|
29 |
+
"loss": 2.1806,
|
30 |
+
"step": 120
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.15663240332843856,
|
34 |
+
"grad_norm": 0.06936266273260117,
|
35 |
+
"learning_rate": 0.00014222222222222224,
|
36 |
+
"loss": 2.0778,
|
37 |
+
"step": 160
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.19579050416054822,
|
41 |
+
"grad_norm": 0.08056484907865524,
|
42 |
+
"learning_rate": 0.00017777777777777779,
|
43 |
+
"loss": 2.0382,
|
44 |
+
"step": 200
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.23494860499265785,
|
48 |
+
"grad_norm": 0.0779654011130333,
|
49 |
+
"learning_rate": 0.0001999317060143023,
|
50 |
+
"loss": 1.9227,
|
51 |
+
"step": 240
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.2741067058247675,
|
55 |
+
"grad_norm": 0.11802724003791809,
|
56 |
+
"learning_rate": 0.00019908312530915603,
|
57 |
+
"loss": 1.9139,
|
58 |
+
"step": 280
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.3132648066568771,
|
62 |
+
"grad_norm": 0.0852489247918129,
|
63 |
+
"learning_rate": 0.00019727282722446047,
|
64 |
+
"loss": 1.9423,
|
65 |
+
"step": 320
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.3524229074889868,
|
69 |
+
"grad_norm": 0.1409972459077835,
|
70 |
+
"learning_rate": 0.00019451838281608197,
|
71 |
+
"loss": 1.8484,
|
72 |
+
"step": 360
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.39158100832109644,
|
76 |
+
"grad_norm": 0.11129080504179001,
|
77 |
+
"learning_rate": 0.00019084652718195238,
|
78 |
+
"loss": 1.7694,
|
79 |
+
"step": 400
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.43073910915320607,
|
83 |
+
"grad_norm": 0.10179898887872696,
|
84 |
+
"learning_rate": 0.00018629289996673897,
|
85 |
+
"loss": 1.8026,
|
86 |
+
"step": 440
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.4698972099853157,
|
90 |
+
"grad_norm": 0.14124783873558044,
|
91 |
+
"learning_rate": 0.00018090169943749476,
|
92 |
+
"loss": 1.8217,
|
93 |
+
"step": 480
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.5090553108174254,
|
97 |
+
"grad_norm": 0.16184218227863312,
|
98 |
+
"learning_rate": 0.0001747252534878891,
|
99 |
+
"loss": 1.7847,
|
100 |
+
"step": 520
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.548213411649535,
|
104 |
+
"grad_norm": 0.11349498480558395,
|
105 |
+
"learning_rate": 0.00016782351173492342,
|
106 |
+
"loss": 1.6622,
|
107 |
+
"step": 560
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.5873715124816447,
|
111 |
+
"grad_norm": 0.08884529024362564,
|
112 |
+
"learning_rate": 0.00016026346363792567,
|
113 |
+
"loss": 1.7633,
|
114 |
+
"step": 600
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.5873715124816447,
|
118 |
+
"eval_loss": 1.6572695970535278,
|
119 |
+
"eval_runtime": 1912.2507,
|
120 |
+
"eval_samples_per_second": 1.425,
|
121 |
+
"eval_steps_per_second": 0.713,
|
122 |
+
"step": 600
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"epoch": 0.6265296133137542,
|
126 |
+
"grad_norm": 0.09996389597654343,
|
127 |
+
"learning_rate": 0.0001521184882876585,
|
128 |
+
"loss": 1.6764,
|
129 |
+
"step": 640
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"epoch": 0.6656877141458639,
|
133 |
+
"grad_norm": 0.12769252061843872,
|
134 |
+
"learning_rate": 0.00014346764217659653,
|
135 |
+
"loss": 1.7871,
|
136 |
+
"step": 680
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"epoch": 0.7048458149779736,
|
140 |
+
"grad_norm": 0.13380451500415802,
|
141 |
+
"learning_rate": 0.00013439489186339282,
|
142 |
+
"loss": 1.7167,
|
143 |
+
"step": 720
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"epoch": 0.7440039158100832,
|
147 |
+
"grad_norm": 0.11822285503149033,
|
148 |
+
"learning_rate": 0.0001249882989794231,
|
149 |
+
"loss": 1.6789,
|
150 |
+
"step": 760
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"epoch": 0.7831620166421929,
|
154 |
+
"grad_norm": 0.12109290808439255,
|
155 |
+
"learning_rate": 0.00011533916548786857,
|
156 |
+
"loss": 1.583,
|
157 |
+
"step": 800
|
158 |
+
},
|
159 |
+
{
|
160 |
+
"epoch": 0.8223201174743024,
|
161 |
+
"grad_norm": 0.12838001549243927,
|
162 |
+
"learning_rate": 0.000105541147491597,
|
163 |
+
"loss": 1.7412,
|
164 |
+
"step": 840
|
165 |
+
},
|
166 |
+
{
|
167 |
+
"epoch": 0.8614782183064121,
|
168 |
+
"grad_norm": 0.16042716801166534,
|
169 |
+
"learning_rate": 9.568934619137046e-05,
|
170 |
+
"loss": 1.6519,
|
171 |
+
"step": 880
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"epoch": 0.9006363191385218,
|
175 |
+
"grad_norm": 0.1427149474620819,
|
176 |
+
"learning_rate": 8.587938481769089e-05,
|
177 |
+
"loss": 1.6598,
|
178 |
+
"step": 920
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"epoch": 0.9397944199706314,
|
182 |
+
"grad_norm": 0.118178591132164,
|
183 |
+
"learning_rate": 7.620648049573815e-05,
|
184 |
+
"loss": 1.7378,
|
185 |
+
"step": 960
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"epoch": 0.9789525208027411,
|
189 |
+
"grad_norm": 0.1253277212381363,
|
190 |
+
"learning_rate": 6.676452005203406e-05,
|
191 |
+
"loss": 1.6451,
|
192 |
+
"step": 1000
|
193 |
+
},
|
194 |
+
{
|
195 |
+
"epoch": 1.0176211453744493,
|
196 |
+
"grad_norm": 0.15462452173233032,
|
197 |
+
"learning_rate": 5.764514873320761e-05,
|
198 |
+
"loss": 1.6475,
|
199 |
+
"step": 1040
|
200 |
+
},
|
201 |
+
{
|
202 |
+
"epoch": 1.056779246206559,
|
203 |
+
"grad_norm": 0.106235072016716,
|
204 |
+
"learning_rate": 4.893688068190932e-05,
|
205 |
+
"loss": 1.6686,
|
206 |
+
"step": 1080
|
207 |
+
},
|
208 |
+
{
|
209 |
+
"epoch": 1.0959373470386686,
|
210 |
+
"grad_norm": 0.09717393666505814,
|
211 |
+
"learning_rate": 4.072423980374452e-05,
|
212 |
+
"loss": 1.6824,
|
213 |
+
"step": 1120
|
214 |
+
},
|
215 |
+
{
|
216 |
+
"epoch": 1.1350954478707782,
|
217 |
+
"grad_norm": 0.13711334764957428,
|
218 |
+
"learning_rate": 3.308693936411421e-05,
|
219 |
+
"loss": 1.6147,
|
220 |
+
"step": 1160
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"epoch": 1.174253548702888,
|
224 |
+
"grad_norm": 0.1265803724527359,
|
225 |
+
"learning_rate": 2.6099108277934103e-05,
|
226 |
+
"loss": 1.6174,
|
227 |
+
"step": 1200
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"epoch": 1.174253548702888,
|
231 |
+
"eval_loss": 1.5792005062103271,
|
232 |
+
"eval_runtime": 1903.0333,
|
233 |
+
"eval_samples_per_second": 1.432,
|
234 |
+
"eval_steps_per_second": 0.716,
|
235 |
+
"step": 1200
|
236 |
+
},
|
237 |
+
{
|
238 |
+
"epoch": 1.2134116495349976,
|
239 |
+
"grad_norm": 0.09578167647123337,
|
240 |
+
"learning_rate": 1.982857160199334e-05,
|
241 |
+
"loss": 1.6246,
|
242 |
+
"step": 1240
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"epoch": 1.2525697503671072,
|
246 |
+
"grad_norm": 0.14227357506752014,
|
247 |
+
"learning_rate": 1.4336192213613742e-05,
|
248 |
+
"loss": 1.5548,
|
249 |
+
"step": 1280
|
250 |
+
},
|
251 |
+
{
|
252 |
+
"epoch": 1.2917278511992167,
|
253 |
+
"grad_norm": 0.1526080220937729,
|
254 |
+
"learning_rate": 9.675280065387116e-06,
|
255 |
+
"loss": 1.5454,
|
256 |
+
"step": 1320
|
257 |
+
},
|
258 |
+
{
|
259 |
+
"epoch": 1.3308859520313265,
|
260 |
+
"grad_norm": 0.17356757819652557,
|
261 |
+
"learning_rate": 5.891074749862857e-06,
|
262 |
+
"loss": 1.5555,
|
263 |
+
"step": 1360
|
264 |
+
},
|
265 |
+
{
|
266 |
+
"epoch": 1.3700440528634361,
|
267 |
+
"grad_norm": 0.1258653849363327,
|
268 |
+
"learning_rate": 3.0203063964990617e-06,
|
269 |
+
"loss": 1.5775,
|
270 |
+
"step": 1400
|
271 |
+
},
|
272 |
+
{
|
273 |
+
"epoch": 1.4092021536955457,
|
274 |
+
"grad_norm": 0.12249883264303207,
|
275 |
+
"learning_rate": 1.0908391628854041e-06,
|
276 |
+
"loss": 1.5619,
|
277 |
+
"step": 1440
|
278 |
+
},
|
279 |
+
{
|
280 |
+
"epoch": 1.4483602545276555,
|
281 |
+
"grad_norm": 0.1455027014017105,
|
282 |
+
"learning_rate": 1.2140078057101266e-07,
|
283 |
+
"loss": 1.5342,
|
284 |
+
"step": 1480
|
285 |
+
},
|
286 |
+
{
|
287 |
+
"epoch": 1.4679393049437102,
|
288 |
+
"step": 1500,
|
289 |
+
"total_flos": 1.105565365842985e+17,
|
290 |
+
"train_loss": 1.763797264099121,
|
291 |
+
"train_runtime": 30390.5778,
|
292 |
+
"train_samples_per_second": 0.395,
|
293 |
+
"train_steps_per_second": 0.049
|
294 |
+
}
|
295 |
+
],
|
296 |
+
"logging_steps": 40,
|
297 |
+
"max_steps": 1500,
|
298 |
+
"num_input_tokens_seen": 0,
|
299 |
+
"num_train_epochs": 2,
|
300 |
+
"save_steps": 600,
|
301 |
+
"stateful_callbacks": {
|
302 |
+
"EarlyStoppingCallback": {
|
303 |
+
"args": {
|
304 |
+
"early_stopping_patience": 3,
|
305 |
+
"early_stopping_threshold": 0.05
|
306 |
+
},
|
307 |
+
"attributes": {
|
308 |
+
"early_stopping_patience_counter": 0
|
309 |
+
}
|
310 |
+
},
|
311 |
+
"TrainerControl": {
|
312 |
+
"args": {
|
313 |
+
"should_epoch_stop": false,
|
314 |
+
"should_evaluate": false,
|
315 |
+
"should_log": false,
|
316 |
+
"should_save": true,
|
317 |
+
"should_training_stop": true
|
318 |
+
},
|
319 |
+
"attributes": {}
|
320 |
+
}
|
321 |
+
},
|
322 |
+
"total_flos": 1.105565365842985e+17,
|
323 |
+
"train_batch_size": 2,
|
324 |
+
"trial_name": null,
|
325 |
+
"trial_params": null
|
326 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f306e4b6253715e26a88fa57ccca63669e96880d48de4c2a1e644ce5571bcc5a
|
3 |
+
size 5880
|
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|