zhengr committed on
Commit
275ce9b
·
verified ·
1 Parent(s): 5732886

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +449 -0
app.py ADDED
@@ -0,0 +1,449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, TextStreamer
3
+ from peft import PeftModel
4
+ import re
5
+ import os
6
+ import akshare as ak
7
+ import pandas as pd
8
+ import random
9
+ import json
10
+ import requests
11
+ import math
12
+ from datetime import date
13
+ from datetime import date, datetime, timedelta
14
+
15
+
16
# Hugging Face access token read from the environment; a missing TOKEN
# variable raises KeyError here, at import time.
access_token = os.environ["TOKEN"]

# load model
# NOTE: `model` first holds the base checkpoint id and is rebound below to the
# loaded model object, so the order of the following statements matters.
model = "meta-llama/Llama-2-7b-chat-hf"
peft_model = "FinGPT/fingpt-forecaster_sz50_llama2-7B_lora"

tokenizer = AutoTokenizer.from_pretrained(model, token = access_token, trust_remote_code=True)
# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Streamer handed to `model.generate` in `ask`.
streamer = TextStreamer(tokenizer)

# Load the base model in 8-bit on CUDA, then wrap it with the LoRA adapter.
model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True, token = access_token, device_map="cuda", load_in_8bit=True, offload_folder="offload/")
model = PeftModel.from_pretrained(model, peft_model, offload_folder="offload/")

# Switch to evaluation mode for inference.
model = model.eval()

# Inference Data
# get company news online

# Instruction and system-prompt delimiters used when the final prompt is
# assembled in `get_all_prompts_online`.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
# System prompt (Chinese): asks the model to act as a stock analyst and to
# answer in three sections (positive developments, potential concerns,
# prediction and analysis).
SYSTEM_PROMPT = "你是一名经验丰富的股票市场分析师。你的任务是根据公司在过去几周内的相关新闻和季度财务状况,列出公司的积极发展和潜在担忧,然后结合你对整体金融经济市场的判断,对公司未来一周的股价变化提供预测和分析。" \
    "你的回答语言应为中文。你的回答格式应该如下:\n\n[积极发展]:\n1. ...\n\n[潜在担忧]:\n1. ...\n\n[预测和分析]:\n...\n"
40
+
41
+ # ------------------------------------------------------------------------------
42
+ # Utils
43
+ # ------------------------------------------------------------------------------
44
def get_curday():
    """Return today's local date as a compact ``YYYYMMDD`` string."""
    today = date.today()
    return f"{today:%Y%m%d}"
47
+
48
def n_weeks_before(date_string, n, format = "%Y%m%d"):
    """Shift a ``YYYYMMDD`` date back by ``n`` weeks and format the result.

    Args:
        date_string: Anchor date, always parsed as ``%Y%m%d``.
        n: Number of weeks to subtract; a negative value moves forward.
        format: ``strftime`` pattern applied to the shifted date.

    Returns:
        The shifted date rendered with ``format``.
    """
    anchor = datetime.strptime(date_string, "%Y%m%d")
    shifted = anchor - timedelta(days=7 * n)
    return shifted.strftime(format)
53
+
54
def check_news_quality(n, last_n, week_end_date, repeat_rate = 0.6):
    """Decide whether news item ``n`` should be kept for the prompt.

    An item is rejected when its content is unusable (empty, the string
    ``'nan'``, or starting with a digit), when it was published after
    ``week_end_date``, or when it largely repeats the previous item.
    Duplicates are assumed to be adjacent, so only ``last_n`` is compared.

    Args:
        n: Candidate news dict with keys ``'新闻内容'``, ``'新闻标题'`` and
            ``'发布时间'`` (a ``YYYYMMDD...`` string).
        last_n: The preceding news dict, with the same keys.
        week_end_date: Week end date; dashes are stripped before comparing.
        repeat_rate: Character-overlap ratio above which ``n`` is treated
            as a duplicate of ``last_n``.

    Returns:
        ``True`` if the item should be kept, ``False`` otherwise.

    Raises:
        Exception: ``"Check Error"`` when a field has an unexpected type;
            the original ``TypeError`` is attached as the cause.
    """
    try:
        # check content availability
        content = str(n['新闻内容'])
        # An empty body is rejected up front: indexing its first character
        # would raise IndexError.
        if not content or content == 'nan' or content[0].isdigit():
            return False
        if n['发布时间'][:8] > week_end_date.replace('-', ''):
            return False

        # check highly duplicated news
        # (assume the duplicated contents happened adjacent)
        if str(last_n['新闻内容']) == 'nan':
            return True

        head_overlap = len(set(n['新闻内容'][:20]) & set(last_n['新闻内容'][:20]))
        if head_overlap >= 20 * repeat_rate:
            return False

        last_title = last_n['新闻标题']
        # Skip the title ratio when the previous title is empty; dividing by
        # its length would raise ZeroDivisionError.
        if last_title:
            title_overlap = len(set(n['新闻标题']) & set(last_title)) / len(last_title)
            if title_overlap > repeat_rate:
                return False

        return True
    except TypeError as err:
        print(n)
        print(last_n)
        raise Exception("Check Error") from err
73
+
74
def sample_news(news, k=5):
    """Pick ``k`` random items from ``news`` while keeping their original order.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``k`` exceeds
            ``len(news)``.
    """
    chosen = random.sample(range(len(news)), k)
    chosen.sort()
    return [news[pos] for pos in chosen]
77
+
78
def return_transform(ret):
    """Bucket a fractional weekly return into a short Chinese label.

    The percentage magnitude is rounded up to a whole number. Zero maps to
    ``"平"``; otherwise the label is ``"涨"`` or ``"跌"`` followed by the
    bucket ``1``..``5``, or ``5+`` for anything above 5%.
    """
    magnitude = math.ceil(abs(100 * ret))
    if magnitude == 0:
        return "平"

    direction = '跌' if ret < 0 else '涨'
    bucket = '5+' if magnitude > 5 else str(magnitude)
    return direction + bucket
86
+
87
def map_return_label(return_lb):
    """Expand a compact return label (e.g. ``"涨3"``) into a readable phrase.

    Substitutions are applied in a fixed order. Each replacement text only
    contains digits already handled by an earlier step, so no later step
    rewrites it.
    """
    ordered_substitutions = (
        ('涨', '上涨'),
        ('跌', '下跌'),
        ('平', '股价持平'),
        ('1', '0-1%'),
        ('2', '1-2%'),
        ('3', '2-3%'),
        ('4', '3-4%'),
    )
    lb = return_lb
    for old, new in ordered_substitutions:
        lb = lb.replace(old, new)

    # A trailing '+' marks the open-ended top bucket.
    top_bucket = ('5+', '超过5%') if lb.endswith('+') else ('5', '4-5%')
    return lb.replace(*top_bucket)
102
+ # ------------------------------------------------------------------------------
103
+ # Get data from website
104
+ # ------------------------------------------------------------------------------
105
def stock_news_em(symbol: str = "300059", page = 1) -> pd.DataFrame:
    """Fetch one page of Eastmoney news search results for ``symbol``.

    Args:
        symbol: Search keyword, normally a stock code.
        page: 1-based result page index; each page requests 100 items.

    Returns:
        A DataFrame with the columns 关键词, 新闻标题, 新闻内容, 发布时间,
        文章来源 and 新闻链接, with ``<em>`` highlight markup removed.

    Raises:
        requests.exceptions.RequestException: When the HTTP request fails
            (a read timeout is retried once first).
        KeyError: When the response lacks the expected result fields or
            columns; ``get_news`` relies on this to stop paging.
        ValueError: When the response is not a parsable JSONP payload.
    """
    callback = "jQuery3510875346244069884_1668256937995"
    url = "https://search-api-web.eastmoney.com/search/jsonp"
    params = {
        "cb": callback,
        "param": '{"uid":"",'
        + f'"keyword":"{symbol}"'
        + ',"type":["cmsArticleWebOld"],"client":"web","clientType":"web","clientVersion":"curr","param":{"cmsArticleWebOld":{"searchScope":"default","sort":"default",' + f'"pageIndex":{page}'+ ',"pageSize":100,"preTag":"<em>","postTag":"</em>"}}}',
        "_": "1668256937996",
    }

    # Retry once on a read timeout. Any other failure is reported and
    # re-raised, so the code below never runs without a response object.
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            r = requests.get(url, params=params, timeout=30)
            break
        except requests.exceptions.ReadTimeout:
            if attempt == max_attempts:
                raise
            print("The request timed out. Trying again...")
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            raise

    # Unwrap the JSONP envelope `callback({...})` by taking the text between
    # the first "(" and the last ")".
    data_text = r.text
    payload = data_text[data_text.index("(") + 1:data_text.rindex(")")]
    data_json = json.loads(payload)

    temp_df = pd.DataFrame(data_json["result"]["cmsArticleWebOld"])
    temp_df.rename(
        columns={
            "date": "发布时间",
            "mediaName": "文章来源",
            "code": "-",
            "title": "新闻标题",
            "content": "新闻内容",
            "url": "新闻链接",
            "image": "-",
        },
        inplace=True,
    )
    temp_df["关键词"] = symbol
    # Copy so the column assignments below act on an independent frame
    # rather than on a slice of the renamed one.
    temp_df = temp_df[
        [
            "关键词",
            "新闻标题",
            "新闻内容",
            "发布时间",
            "文章来源",
            "新闻链接",
        ]
    ].copy()

    # Strip search-highlight markup: the parenthesised forms first, then any
    # bare tags that remain.
    highlight_patterns = (r"\(<em>", r"</em>\)", r"<em>", r"</em>")
    for column in ("新闻标题", "新闻内容"):
        cleaned = temp_df[column]
        for pattern in highlight_patterns:
            cleaned = cleaned.str.replace(pattern, "", regex=True)
        temp_df[column] = cleaned

    # Drop ideographic spaces and flatten Windows line breaks in the body.
    temp_df["新闻内容"] = temp_df["新闻内容"].str.replace(r"\u3000", "", regex=True)
    temp_df["新闻内容"] = temp_df["新闻内容"].str.replace(r"\r\n", " ", regex=True)
    return temp_df
175
+
176
def get_news(symbol, max_page = 3):
    """Collect news for ``symbol`` across several result pages.

    Pages ``1 .. max_page - 1`` are requested (``max_page`` itself is
    excluded). Paging stops early at the first page for which
    ``stock_news_em`` raises ``KeyError``.

    Args:
        symbol: Stock code passed to ``stock_news_em``.
        max_page: Exclusive upper bound on the page index.

    Returns:
        All fetched pages concatenated with a fresh index, or an empty
        DataFrame with the expected news columns when no page was fetched.
    """
    df_list = []
    for page in range(1, max_page):
        try:
            df_list.append(stock_news_em(symbol, page))
        except KeyError:
            # `page` is an int, so it is formatted rather than concatenated
            # with "+" to a string (which raises TypeError).
            print(f"{page - 1} pages obtained for symbol: {symbol}")
            break

    if not df_list:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame that still has the columns callers index into.
        return pd.DataFrame(
            columns=["关键词", "新闻标题", "新闻内容", "发布时间", "文章来源", "新闻链接"]
        )

    return pd.concat(df_list, ignore_index=True)
189
+
190
def get_cur_return(symbol, start_date, end_date, adjust="qfq"):
    """Build a table of week-over-week closing prices and returns.

    Daily history is loaded through ``ak.stock_zh_a_hist`` and its closing
    price is resampled to weekly labels with forward fill. Each output row
    pairs two consecutive weekly points as the start and settlement of one
    week.

    Args:
        symbol: Stock code.
        start_date: Start of the history window, forwarded to akshare.
        end_date: End of the history window; also caps the last settlement
            date.
        adjust: Price adjustment mode forwarded to akshare.

    Returns:
        DataFrame with columns 起始日期, 起始价, 结算日期, 结算价, 周收益
        and 简化周收益.
    """
    # load data
    daily = ak.stock_zh_a_hist(symbol=symbol, period="daily", start_date=start_date, end_date=end_date, adjust=adjust)

    # process timestamp
    daily["日期"] = pd.to_datetime(daily["日期"])
    daily = daily.set_index("日期")

    # resample and fill with forward data
    weekly_close = daily["收盘"].resample("W").ffill()
    opening = weekly_close[:-1]
    closing = weekly_close[1:]
    weekly_data = pd.DataFrame({
        '起始日期': opening.index,
        '起始价': opening.values,
        '结算日期': closing.index,
        '结算价': closing.values,
        '周收益': weekly_close.pct_change()[1:].values
    })
    weekly_data["简化周收益"] = weekly_data["周收益"].map(return_transform)

    # check end date: column 2 is 结算日期; the last weekly label can fall
    # after the requested end date, so cap it.
    cutoff = pd.to_datetime(end_date)
    if weekly_data.iloc[-1, 2] > cutoff:
        weekly_data.iloc[-1, 2] = cutoff

    return weekly_data
217
+
218
def get_basic(symbol, data):
    """Attach the latest quarterly financials to every weekly row.

    For each row of ``data``, the report with the greatest 报告期 that is
    strictly earlier than the row's 结算日期 is chosen and stored as a JSON
    string in a new 基本面 column. Rows with no earlier report get ``"{}"``.

    Args:
        symbol: Stock code passed to ``ak.stock_financial_abstract_ths``.
        data: Weekly DataFrame whose 结算日期 values support ``strftime``.
            It is modified in place.

    Returns:
        The same ``data`` object with the 基本面 column added.
    """
    key_financials = ['报告期', '净利润同比增长率', '营业总收入同比增长率', '流动比率', '速动比率', '资产负债率']

    # load quarterly basic data
    basic_quarter_financials = ak.stock_financial_abstract_ths(symbol = symbol, indicator="按单季度")
    # Keep only the selected indicators that have a truthy value. Iterating
    # records avoids assuming the frame has a 0..n-1 index.
    basic_fin_list = [
        {key: val for key, val in record.items() if key in key_financials and val}
        for record in basic_quarter_financials.to_dict("records")
    ]

    # match basic financial data to news dataframe
    matched_basic_fin = []
    for _, row in data.iterrows():
        newsweek_enddate = row['结算日期'].strftime("%Y-%m-%d")

        # Choose the most recent report by date, not by position, so the
        # result does not depend on the provider's row order.
        # NOTE(review): assumes 报告期 is a "YYYY-MM-DD" string so that
        # string comparison is chronological — confirm against akshare.
        earlier_reports = [
            basic for basic in basic_fin_list
            if "报告期" in basic and basic["报告期"] < newsweek_enddate
        ]
        matched_basic = max(earlier_reports, key=lambda basic: basic["报告期"], default={})
        matched_basic_fin.append(json.dumps(matched_basic, ensure_ascii=False))

    data['基本面'] = matched_basic_fin

    return data
244
+ # ------------------------------------------------------------------------------
245
+ # Structure Data
246
+ # ------------------------------------------------------------------------------
247
def cur_financial_data(symbol, start_date, end_date, with_basics = True):
    """Assemble weekly returns, matching news and optional financials.

    Args:
        symbol: Stock code.
        start_date: Start of the window, forwarded to ``get_cur_return``.
        end_date: End of the window, forwarded to ``get_cur_return``.
        with_basics: When true, quarterly financials are attached through
            ``get_basic``; otherwise every row gets an empty JSON object.

    Returns:
        The weekly DataFrame from ``get_cur_return`` with two extra columns:
        新闻 (JSON list of that week's news) and 基本面 (JSON object).
    """
    # get data
    data = get_cur_return(symbol=symbol, start_date=start_date, end_date=end_date)

    news_df = get_news(symbol=symbol)
    # Only the leading date part is parsed, so every timestamp is midnight.
    news_df["发布时间"] = pd.to_datetime(news_df["发布时间"], exact=False, format="%Y-%m-%d")
    news_df.sort_values(by=["发布时间"], inplace=True)

    # match weekly news for return data
    news_list = []
    for _, row in data.iterrows():
        week_start_date = row['起始日期'].strftime('%Y-%m-%d')
        week_end_date = row['结算日期'].strftime('%Y-%m-%d')
        print(symbol, ': ', week_start_date, ' - ', week_end_date)

        # Half-open window (start, end]. Timestamps are truncated to
        # midnight, so a strict "<" on the end would drop every item dated
        # on the end day (including today's for the latest week), and
        # boundary-day news would belong to no week at all.
        in_week = (news_df["发布时间"] > week_start_date) & (news_df["发布时间"] <= week_end_date)

        weekly_news = [
            {
                "发布时间": n["发布时间"].strftime('%Y%m%d'),
                "新闻标题": n['新闻标题'],
                "新闻内容": n['新闻内容'],
            }
            for _, n in news_df.loc[in_week].iterrows()
        ]
        news_list.append(json.dumps(weekly_news, ensure_ascii=False))

    data["新闻"] = news_list

    if with_basics:
        data = get_basic(symbol=symbol, data=data)
    else:
        data['基本面'] = [json.dumps({})] * len(data)

    return data
284
+ # ------------------------------------------------------------------------------
285
+ # Format Instruction
286
+ # ------------------------------------------------------------------------------
287
def get_company_prompt_new(symbol):
    """Build the company-introduction paragraph for the prompt.

    Args:
        symbol: Stock code passed to ``ak.stock_individual_info_em``.

    Returns:
        A ``(formatted_profile, stockname)`` tuple: the Chinese introduction
        text and the company's short name (股票简称).

    Raises:
        RuntimeError: When the company profile cannot be retrieved. The
            underlying exception is attached as the cause.
    """
    try:
        # The profile frame is read as (item, value) pairs.
        company_profile = dict(ak.stock_individual_info_em(symbol).values)
    except Exception as err:
        # Stop here: continuing would only hit an unbound `company_profile`.
        print("Company Info Request Time Out! Please wait and retry.")
        raise RuntimeError("Company Info Request Time Out! Please wait and retry.") from err

    company_profile["上市时间"] = pd.to_datetime(str(company_profile["上市时间"])).strftime("%Y年%m月%d日")

    template = "[公司介绍]:\n\n{股票简称}是一家在{行业}行业的领先实体,自{上市时间}成立并公开交易。截止今天,{股票简称}的总市值为{总市值}人民币,总股本数为{总股本},流通市值为{流通市值}人民币,流通股数为{流通股}。" \
        "\n\n{股票简称}主要在中国运营,以股票代码{股票代码}在交易所进行交易。"

    formatted_profile = template.format(**company_profile)
    stockname = company_profile['股票简称']
    return formatted_profile, stockname
300
+
301
def get_prompt_by_row_new(stock, row):
    """Turn one weekly row into prompt fragments.

    Args:
        stock: Name or code used to refer to the company in the text.
        row: Weekly record with 起始日期, 结算日期, 起始价, 结算价,
            简化周收益, 新闻 (JSON list) and 基本面 (JSON object).

    Returns:
        A ``(head, filtered_news, basics)`` tuple: the price-movement
        sentence, the list of formatted news strings that passed the quality
        filter, and the financial-basics paragraph.
    """
    week_start_date = row['起始日期'] if isinstance(row['起始日期'], str) else row['起始日期'].strftime('%Y-%m-%d')
    week_end_date = row['结算日期'] if isinstance(row['结算日期'], str) else row['结算日期'].strftime('%Y-%m-%d')
    # An unchanged price is worded as a decline here; the separate label
    # from map_return_label still reports it as flat.
    term = '上涨' if row['结算价'] > row['起始价'] else '下跌'
    chg = map_return_label(row['简化周收益'])
    head = "自{}至{},{}的股票价格由{:.2f}{}至{:.2f},涨跌幅为:{}。在此期间的公司新闻如下:\n\n".format(
        week_start_date, week_end_date, stock, row['起始价'], term, row['结算价'], chg)

    news = json.loads(row["新闻"])
    end_compact = week_end_date.replace('-', '')

    filtered_news = []
    for idx, n in enumerate(news):
        if idx == 0:
            # The first item has no predecessor, so only content and date
            # are checked. Empty content is rejected before its first
            # character is read, which would raise IndexError.
            content = str(n['新闻内容'])
            keep = (
                bool(content)
                and not content[0].isdigit()
                and content != 'nan'
                and n['发布时间'][:8] <= end_compact
            )
        else:
            # Later items are also compared with the item just before them.
            keep = check_news_quality(n, last_n = news[idx - 1], week_end_date= week_end_date, repeat_rate=0.5)

        if keep:
            filtered_news.append("[新闻标题]:{}\n[新闻内容]:{}\n".format(n['新闻标题'], n['新闻内容']))

    basics = json.loads(row['基本面'])
    if basics:
        # NOTE(review): the 'period' filter matches none of the keys of the
        # record read here (its date key is '报告期'), so the report period
        # is listed again below. Confirm before changing: it alters the
        # prompt text.
        basics = "如下所列为{}近期的一些金融基本面信息,记录时间为{}:\n\n[金融基本面]:\n\n".format(
            stock, basics['报告期']) + "\n".join(f"{k}: {v}" for k, v in basics.items() if k != 'period')
    else:
        basics = "[金融基本面]:\n\n 无金融基本面记录"

    return head, filtered_news, basics
339
+
340
# NOTE: this function is intentionally not wrapped in `spaces.GPU`. The
# `spaces` module is not imported by this file, so referencing it fails with
# NameError at import time, and the function only gathers data and builds
# strings.
def get_all_prompts_online(symbol, with_basics=True, max_news_perweek = 3, weeks_before = 2):
    """Build the full LLM prompt for ``symbol`` from live data.

    Args:
        symbol: Stock code.
        with_basics: Whether to include the latest quarterly financials.
        max_news_perweek: Maximum number of news items sampled per week.
        weeks_before: Number of past weeks of history to include.

    Returns:
        An ``(info, prompt)`` tuple: the gathered information shown to the
        user, and the complete instruction-formatted prompt for the model.
    """
    end_date = get_curday()
    start_date = n_weeks_before(end_date, weeks_before)

    company_prompt, stock = get_company_prompt_new(symbol)
    data = cur_financial_data(symbol=symbol, start_date=start_date, end_date=end_date, with_basics=with_basics)

    # One (head, news, basics) triple per week, oldest first.
    prev_rows = [get_prompt_by_row_new(symbol, row) for _, row in data.iterrows()]

    prompt = ""
    for head, week_news, _ in prev_rows:
        prompt += "\n" + head
        sampled_news = sample_news(
            week_news,
            min(max_news_perweek, len(week_news))
        )
        if sampled_news:
            prompt += "\n".join(sampled_news)
        else:
            prompt += "No relative news reported."

    # Forecast window: from today to one week ahead.
    next_date = n_weeks_before(end_date, -1, format="%Y-%m-%d")
    end_date = pd.to_datetime(end_date).strftime("%Y-%m-%d")
    period = "{}至{}".format(end_date, next_date)

    # Use the most recent week's basics. Fall back to the "no record" text
    # when basics are disabled or no weekly rows were produced, because
    # indexing an empty list would raise IndexError.
    if with_basics and prev_rows:
        basics = prev_rows[-1][2]
    else:
        basics = "[金融基本面]:\n\n 无金融基本面记录"

    info = company_prompt + '\n' + prompt + '\n' + basics

    new_system_prompt = SYSTEM_PROMPT.replace(':\n...', ':\n预测涨跌幅:...\n总结分析:...')
    prompt = B_INST + B_SYS + new_system_prompt + E_SYS + info + f"\n\n基于在{end_date}之前的所有信息,让我们首先分析{stock}的积极发展和潜在担忧。请简洁地陈述,分别提出2-4个最重要的因素。大部分所提及的因素应该从公司的相关新闻中推断出来。" \
        f"接下来请预测{symbol}下周({period})的股票涨跌幅,并提供一个总结分析来支持你的预测。" + E_INST

    return info, prompt
386
+
387
+
388
def ask(symbol, weeks_before, withbasic):
    """Gradio handler: build the prompt, run generation and return the answer.

    Args:
        symbol: Stock code entered by the user.
        weeks_before: Number of past weeks of history to include.
        withbasic: Whether to include the latest quarterly financials.

    Returns:
        An ``(info, answer)`` tuple: the information given to the model, and
        the decoded output with everything up to ``[/INST]`` removed.
    """
    # load inference data
    info, pt = get_all_prompts_online(symbol=symbol, weeks_before=weeks_before, with_basics=withbasic)

    # Tokenize and move every tensor to the model's device.
    encoded = tokenizer(pt, return_tensors='pt')
    inputs = {}
    for key, value in encoded.items():
        inputs[key] = value.to(model.device)
    print("Inputs loaded onto devices.")

    res = model.generate(**inputs, use_cache=True, streamer=streamer)

    # The decoded text still contains the prompt; keep only what follows the
    # last instruction terminator.
    output = tokenizer.decode(res[0], skip_special_tokens=True)
    output_cur = re.sub(r'.*\[/INST\]\s*', '', output, flags=re.DOTALL)
    return info, output_cur
406
+
407
# Gradio UI: three inputs (symbol, weeks of history, use-financials flag)
# mapped positionally onto ask(symbol, weeks_before, withbasic), and two text
# outputs matching ask's (info, answer) return tuple.
server = gr.Interface(
    ask,
    inputs=[
        gr.Textbox(
            label="Symbol",
            value="600519",
            info="Companys from SZ50 are recommended"
        ),
        gr.Slider(
            minimum=1,
            maximum=3,
            value=2,
            step=1,
            label="weeks_before",
            info="Due to the token length constraint, you are recommended to input with 2"
        ),
        gr.Checkbox(
            label="Use Latest Basic Financials",
            value=True,
            info="If checked, the latest quarterly reported basic financials of the company is taken into account."
        )
    ],
    outputs=[
        gr.Textbox(
            label="Information Provided"
        ),
        gr.Textbox(
            label="Response"
        )
    ],
    title="FinGPT-Forecaster-Chinese",
    description="""This version allows the prediction based on the most current date. We will upgrade it to allow customized date soon.

**The estimated time cost is 180s**

This demo has been downgraded to using T4 with 8-bit inference due to cost considerations, speed & performance may be affected.

**⚠️Warning**: This is just a demo showing what this model can do. During each individual inference, company news is randomly sampled from all the news from designated weeks, which might result in different predictions for the same period. We suggest users deploy the original model or clone this space and inference with more carefully selected news in their favorable ways.

**Disclaimer**: Nothing herein is financial advice, and NOT a recommendation to trade real money. Please use common sense and always first consult a professional before trading or investing."""
)

# Start the web server when the module is executed.
server.launch()