Jiangxz committed on
Commit
806933c
·
verified ·
1 Parent(s): 48843d4

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +157 -33
  2. requirements.txt +4 -2
app.py CHANGED
@@ -14,8 +14,10 @@ from langchain_community.document_loaders import WebBaseLoader, TextLoader
14
  from langchain.prompts import PromptTemplate
15
  from langchain.schema import Document
16
  import gradio as gr
 
17
  import re
18
  import time
 
19
 
20
  def load_documents(sources):
21
  documents = []
@@ -34,23 +36,23 @@ def load_documents(sources):
34
  return documents
35
 
36
  sources = [
37
- "TaxQADataSet_Slim1.txt",
38
- "TaxQADataSet_Slim2.txt",
39
- "TaxQADataSet_Slim3.txt",
40
- "TaxQADataSet_Slim4.txt",
41
- "TaxQADataSet_Slim5.txt",
42
- "TaxQADataSet_Slim6.txt",
43
- "TaxQADataSet_ntpc1.txt",
44
- "TaxQADataSet_ntpc2.txt",
45
- "TaxQADataSet_kctax.txt",
46
- "TaxQADataSet_chutax.txt",
47
- "LandTaxAct1100623.txt",
48
- "TheEnforcementRulesoftheLandTaxAct1100923.txt",
49
- "HouseTaxAct1130103.txt",
50
- "VehicleLicenseTaxAct1101230.txt",
51
- "TaxCollectionAct1101217.txt",
52
- "AmusementTaxAct960523.txt",
53
- "StampTaxAct910515.txt",
54
  "DeedTaxAct990505.txt"
55
  ]
56
 
@@ -69,21 +71,22 @@ split_docs = text_splitter.split_documents(documents)
69
  print(f"分割後的文件數量:{len(split_docs)}")
70
 
71
  embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
72
- print(f"\n成功初始化 Microsoft 嵌入模型")
73
 
74
  print(f"\n開始建立向量資料庫")
75
  vectorstore = Chroma.from_documents(split_docs, embeddings, persist_directory="./Knowledge-base")
76
- print(f"成功建立 Chroma 向量資料庫")
77
 
78
  retriever = vectorstore.as_retriever(
79
  search_type="mmr",
80
  search_kwargs={
81
- "k": 4,
82
- "fetch_k": 20,
83
  "lambda_mult": 0.8
84
  }
85
  )
86
  print(f"檢索演算法:Maximum Marginal Relevance Retrieval")
 
87
 
88
  template = """Let's work this out in a step by step way to be sure we have the right answer. Must reply to me in Taiwanese Traditional Chinese.
89
  在回答之前,請仔細分析檢索到的上下文,確保你的回答準確完整反映了上下文中的訊息,而不是依賴先前的知識,在回應的答案中絕對不要提到是根據上下文回答。
@@ -179,8 +182,93 @@ def answer_multiple_questions(query, api_key):
179
  selected_insight_questions = all_insight_questions[:3]
180
  return combined_answer, selected_insight_questions
181
 
182
- def convert_punctuation(text):
183
- return text.replace('?', '?').replace(',', ',').replace('!', '!').replace(' ', ' ')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
  def handle_interaction(query, api_key, state):
186
  gr.Info("開始處理問題,請稍待片刻......")
@@ -191,12 +279,32 @@ def handle_interaction(query, api_key, state):
191
  api_key = os.getenv("YOUR_API_KEY")
192
  query = convert_punctuation(query)
193
  answer, insight_questions = answer_multiple_questions(query, api_key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  state["history"].append((query, answer))
195
  while len(insight_questions) < 3:
196
  insight_questions.append("提供更多地方稅資訊")
197
  end_time = time.time()
198
  gr.Info(f"Model 已答覆,執行時間: {(end_time - start_time):.2f} 秒。")
199
- return answer, insight_questions[0], insight_questions[1], insight_questions[2], state, query
 
 
 
 
 
 
200
 
201
  custom_css = """
202
  .query-input {
@@ -231,6 +339,7 @@ custom_css = """
231
  }
232
  #submit-btn {
233
  border-radius: 10px !important;
 
234
  background-color: #ff4081 !important;
235
  color: white !important;
236
  font-weight: bold !important;
@@ -243,6 +352,7 @@ custom_css = """
243
  }
244
  .insight-btn {
245
  border-radius: 10px !important;
 
246
  background-color: #00bcd4 !important;
247
  }
248
  .insight-btn:hover {
@@ -259,6 +369,13 @@ custom_css = """
259
  border-radius: 10px !important;
260
  margin: 0 !important;
261
  }
 
 
 
 
 
 
 
262
  .clear-button {
263
  color: white !important;
264
  background-color: #000000 !important;
@@ -272,7 +389,7 @@ custom_css = """
272
  }
273
  """
274
 
275
- with gr.Blocks(theme=gr.themes.Monochrome(), css=custom_css) as iface:
276
  gr.Markdown("""
277
  # 地方稅知識庫系統 - 財政部財政資訊中心
278
  > ### **※ RAG-based 系統部署:江信宗,LLM:Llama-3.1-70B,以地方稅極少知識資料示範,僅供參考,準確資訊請依據地方稅稽徵機關回覆為準。**
@@ -280,33 +397,40 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=custom_css) as iface:
280
  with gr.Row():
281
  query_input = gr.Textbox(label="輸入您的問題,系統將基於學習到的知識資料提供相關答案。", placeholder="請輸入您的問題(支援同時輸入多個問題,例如:問題1?問題2?)", autofocus=True, scale=3, max_lines=5, elem_classes="query-input")
282
  api_key_input = gr.Textbox(label="請輸入您的 API Key", type="password", placeholder="API authentication key", scale=1, elem_classes="api-key-input")
283
- answer_output = gr.Textbox(label="答案:", interactive=False, max_lines=40, elem_classes="answer-box")
284
  with gr.Row():
285
  insight_q1 = gr.Button("洞察問題 1", visible=False, elem_classes=["insight-btn"])
286
  insight_q2 = gr.Button("洞察問題 2", visible=False, elem_classes=["insight-btn"])
287
  insight_q3 = gr.Button("洞察問題 3", visible=False, elem_classes=["insight-btn"])
288
  state = gr.State()
289
  current_question = gr.Textbox(lines=2, label="當前問題", visible=False)
 
290
  with gr.Row():
291
  submit_btn = gr.Button("傳送", variant="primary", scale=3, elem_id="submit-btn")
292
  clear_button = gr.Button("清除", variant="secondary", scale=1, elem_classes="clear-button")
293
- def update_ui(answer, q1, q2, q3, state, current_q):
 
 
 
 
 
294
  return [
295
  answer,
296
  gr.update(value=q1, visible=bool(q1)),
297
  gr.update(value=q2, visible=bool(q2)),
298
  gr.update(value=q3, visible=bool(q3)),
299
  state,
300
- current_q
 
301
  ]
302
  submit_btn.click(
303
  fn=handle_interaction,
304
  inputs=[query_input, api_key_input, state],
305
- outputs=[answer_output, insight_q1, insight_q2, insight_q3, state, current_question]
306
  ).then(
307
  fn=update_ui,
308
- inputs=[answer_output, insight_q1, insight_q2, insight_q3, state, current_question],
309
- outputs=[answer_output, insight_q1, insight_q2, insight_q3, state, current_question]
310
  )
311
  for btn in [insight_q1, insight_q2, insight_q3]:
312
  btn.click(
@@ -315,11 +439,11 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=custom_css) as iface:
315
  outputs=[query_input]
316
  )
317
  def clear_outputs():
318
- return "", ""
319
  clear_button.click(
320
  fn=clear_outputs,
321
  inputs=[],
322
- outputs=[query_input, answer_output]
323
  )
324
 
325
  if __name__ == "__main__":
 
14
  from langchain.prompts import PromptTemplate
15
  from langchain.schema import Document
16
  import gradio as gr
17
+ from openai import OpenAI
18
  import re
19
  import time
20
+ import requests
21
 
22
  def load_documents(sources):
23
  documents = []
 
36
  return documents
37
 
38
  sources = [
39
+ # "TaxQADataSet_Slim1.txt",
40
+ # "TaxQADataSet_Slim2.txt",
41
+ # "TaxQADataSet_Slim3.txt",
42
+ # "TaxQADataSet_Slim4.txt",
43
+ # "TaxQADataSet_Slim5.txt",
44
+ # "TaxQADataSet_Slim6.txt",
45
+ # "TaxQADataSet_ntpc1.txt",
46
+ # "TaxQADataSet_ntpc2.txt",
47
+ # "TaxQADataSet_kctax.txt",
48
+ # "TaxQADataSet_chutax.txt",
49
+ # "LandTaxAct1100623.txt",
50
+ # "TheEnforcementRulesoftheLandTaxAct1100923.txt",
51
+ # "HouseTaxAct1130103.txt",
52
+ # "VehicleLicenseTaxAct1101230.txt",
53
+ # "TaxCollectionAct1101217.txt",
54
+ # "AmusementTaxAct960523.txt",
55
+ # "StampTaxAct910515.txt",
56
  "DeedTaxAct990505.txt"
57
  ]
58
 
 
71
  print(f"分割後的文件數量:{len(split_docs)}")
72
 
73
  embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
74
+ print(f"\n成功初始化 Microsoft 嵌入型")
75
 
76
  print(f"\n開始建立向量資料庫")
77
  vectorstore = Chroma.from_documents(split_docs, embeddings, persist_directory="./Knowledge-base")
78
+ print(f"成功建立 Chroma 向量資料庫,共有 {len(split_docs)} 個文檔")
79
 
80
  retriever = vectorstore.as_retriever(
81
  search_type="mmr",
82
  search_kwargs={
83
+ "k": min(4, len(split_docs)),
84
+ "fetch_k": min(20, len(split_docs)),
85
  "lambda_mult": 0.8
86
  }
87
  )
88
  print(f"檢索演算法:Maximum Marginal Relevance Retrieval")
89
+ print(f"檢索文檔數量:k={min(4, len(split_docs))}, fetch_k={min(20, len(split_docs))}")
90
 
91
  template = """Let's work this out in a step by step way to be sure we have the right answer. Must reply to me in Taiwanese Traditional Chinese.
92
  在回答之前,請仔細分析檢索到的上下文,確保你的回答準確完整反映了上下文中的訊息,而不是依賴先前的知識,在回應的答案中絕對不要提到是根據上下文回答。
 
182
  selected_insight_questions = all_insight_questions[:3]
183
  return combined_answer, selected_insight_questions
184
 
185
def get_tax_law(tax_type):
    """Return the statute name that corresponds to a tax-item name.

    Args:
        tax_type: Tax-item name such as "地價稅". The caller takes this value
            from a parsed LLM reply, so it may carry stray whitespace or not
            be a string at all.

    Returns:
        The mapped statute name, or "無稅法" when the name is unknown or
        ``tax_type`` is not a string.
    """
    tax_law_dict = {
        "房屋稅": "房屋稅條例",
        "地價稅": "土地稅法",
        "契稅": "契稅條例",
        "娛樂稅": "娛樂稅法",
        "印花稅": "印花稅法",
        "牌照稅": "使用牌照稅法",
        "稅捐稽徵法": "稅捐稽徵法",
    }
    # Non-string values (None, lists, numbers) can never match a key; return
    # the fallback instead of letting an unhashable value raise TypeError.
    if not isinstance(tax_type, str):
        return "無稅法"
    # Tolerate padding around the name, e.g. " 地價稅\n".
    return tax_law_dict.get(tax_type.strip(), "無稅法")
196
+
197
def fetch_law_summary(tax_law, keywords, timeout=15):
    """Query the ttc.mof.gov.tw search API and render the hits as HTML.

    One POST request is sent per non-blank keyword. Hits from all keywords
    are pooled and the first 20 are rendered as collapsible ``<details>``
    sections.

    Args:
        tax_law: Value sent as ``ObjParams[TaxAct]``.
        keywords: Iterable of search terms sent as ``ObjParams[Content]``.
            Blank or non-string entries are skipped; ``None`` is treated as
            an empty list.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        An HTML string: either the rendered result list or a "nothing found"
        paragraph.
    """
    url = "https://ttc.mof.gov.tw/Api/GetData"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
    }
    gr.Info("檢索法令彙編函釋中......")
    all_results = []

    for keyword in keywords or []:
        # Keywords come from splitting an LLM reply on commas, so they may be
        # padded or empty ("".split(",") yields [""]); skip the empty ones.
        keyword = keyword.strip() if isinstance(keyword, str) else ""
        if not keyword:
            continue

        payload = {
            "FunctionID": "FB10001",
            "ObjParams[TaxAct]": tax_law,
            "ObjParams[TaxVer]": "請選擇",
            "ObjParams[Chapter]": "請選擇",
            "ObjParams[Article]": "請選擇",
            "ObjParams[Content]": keyword,
            "ObjParams[Operator01]": "0",
            "ObjParams[Content01]": "",
            "ObjParams[Operator02]": "0",
            "ObjParams[Content02]": ""
        }

        try:
            # Without a timeout, requests waits indefinitely and would hang
            # the UI callback if the remote service stalls.
            response = requests.post(url, data=payload, headers=headers, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # Best effort: one failed keyword must not abort the others.
            print(f"Error fetching law summary for keyword '{keyword}': {str(e)}")
            continue

        # Only accept the response shape the renderer below can handle.
        if not isinstance(data, dict) or data.get("Code") != "1":
            continue
        body = data.get("Data")
        table = body.get("Table") if isinstance(body, dict) else None
        if isinstance(table, list):
            all_results.extend(table)

    if not all_results:
        return "<p>未找到相關法令彙編函釋。</p>"

    # NOTE(review): Title/Content are inserted into the HTML unescaped, as in
    # the original. If the API can return untrusted markup this is an HTML
    # injection vector; confirm whether Content is meant to carry markup
    # before escaping it.
    sections = ["<h3>法令彙編函釋檢索結果:</h3>"]
    for result in all_results[:20]:  # Limit to first 20 results across all keywords
        if not isinstance(result, dict):
            continue
        title = result.get("Title") or ""
        content = result.get("Content") or ""
        sections.append(f"""
        <details>
            <summary style="cursor: pointer; color: #0066cc;">{title}</summary>
            <p>{content}</p>
        </details>
        """)
    return "".join(sections)
242
+
243
def llm_openai_api(query, answer):
    """Ask the chat model to name the tax item and keywords for a Q&A pair.

    Builds an OpenAI-compatible client pointed at the SambaNova endpoint,
    using the ``YOUR_API_TOKEN`` environment variable as the key, and sends
    one chat-completion request containing the question and its answer.

    Args:
        query: The user's question text, interpolated into the prompt.
        answer: The generated answer text, interpolated into the prompt.

    Returns:
        The model's reply with surrounding whitespace removed. If the request
        or reply handling raises, the error is printed and a dictionary-style
        string with empty ``TaxName`` and ``KeyWord`` values is returned.
    """
    sambanova = OpenAI(
        api_key=os.environ.get("YOUR_API_TOKEN"),
        base_url="https://api.sambanova.ai/v1",
    )

    user_prompt = f"""
    「題目:{query}
    答案:{answer}」
    請詳細分析答案內容後,依據與題目相關性最高的稅目名稱及最多3個重點關鍵字回應我,問題與答案中的稅目名稱列入TaxName,關鍵字列入KeyWord,只須根據格式回應,不要寫其他的。

    # 回應字典格式範例:
    {{"TaxName": "地價稅", "KeyWord": "宿舍用地,醫護人員"}}
    """

    chat_messages = [
        {"role": "system", "content": "Must reply to user in Traditional Chinese."},
        {"role": "user", "content": user_prompt},
    ]

    try:
        completion = sambanova.chat.completions.create(
            model='Meta-Llama-3.1-405B-Instruct',
            messages=chat_messages,
            temperature=0.7,
            top_p=1,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()
    except Exception as err:
        # Any failure falls back to an empty result so the caller can go on.
        print(f"請輸入正確的 API Key!Error: {str(err)}")
        return '{"TaxName": "", "KeyWord": ""}'
272
 
273
  def handle_interaction(query, api_key, state):
274
  gr.Info("開始處理問題,請稍待片刻......")
 
279
  api_key = os.getenv("YOUR_API_KEY")
280
  query = convert_punctuation(query)
281
  answer, insight_questions = answer_multiple_questions(query, api_key)
282
+ api_response = llm_openai_api(query, answer)
283
+ tax_name = ""
284
+ keywords = []
285
+ print(api_response)
286
+ try:
287
+ response_dict = eval(api_response)
288
+ tax_name = response_dict.get("TaxName", "")
289
+ keywords = response_dict.get("KeyWord", "").split(",")
290
+ except:
291
+ print("Error parsing api_response")
292
+
293
+ tax_law = get_tax_law(tax_name)
294
+ law_summary_content = fetch_law_summary(tax_law, keywords)
295
+
296
  state["history"].append((query, answer))
297
  while len(insight_questions) < 3:
298
  insight_questions.append("提供更多地方稅資訊")
299
  end_time = time.time()
300
  gr.Info(f"Model 已答覆,執行時間: {(end_time - start_time):.2f} 秒。")
301
+ return answer, insight_questions[0], insight_questions[1], insight_questions[2], state, query, law_summary_content
302
+
303
def convert_punctuation(text):
    """Return ``text`` with ASCII ?, and ! replaced by full-width forms.

    The substitutions are applied one after another, in the listed order.
    """
    substitutions = (
        ('?', '?'),
        (',', ','),
        ('!', '!'),
        # NOTE(review): this pair appears as space -> space in this view,
        # which is a no-op; confirm whether one side was a full-width space.
        (' ', ' '),
    )
    converted = text
    for before, after in substitutions:
        converted = converted.replace(before, after)
    return converted
305
+
306
def clear_outputs():
    """Return the reset values for a three-output clear action.

    Returns:
        A tuple of two empty strings followed by a ``gr.update`` that sets
        the value to an empty string and hides the component.
    """
    hidden_blank = gr.update(value="", visible=False)
    return "", "", hidden_blank
308
 
309
  custom_css = """
310
  .query-input {
 
339
  }
340
  #submit-btn {
341
  border-radius: 10px !important;
342
+ border: none !important;
343
  background-color: #ff4081 !important;
344
  color: white !important;
345
  font-weight: bold !important;
 
352
  }
353
  .insight-btn {
354
  border-radius: 10px !important;
355
+ border: none !important;
356
  background-color: #00bcd4 !important;
357
  }
358
  .insight-btn:hover {
 
369
  border-radius: 10px !important;
370
  margin: 0 !important;
371
  }
372
+ .text-background {
373
+ font-size: 18px !important;
374
+ padding: 5px !important;
375
+ border-radius: 10px !important;
376
+ border: 2px solid #B7E0FF !important;
377
+ margin: 0 !important;
378
+ }
379
  .clear-button {
380
  color: white !important;
381
  background-color: #000000 !important;
 
389
  }
390
  """
391
 
392
+ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as iface:
393
  gr.Markdown("""
394
  # 地方稅知識庫系統 - 財政部財政資訊中心
395
  > ### **※ RAG-based 系統部署:江信宗,LLM:Llama-3.1-70B,以地方稅極少知識資料示範,僅供參考,準確資訊請依據地方稅稽徵機關回覆為準。**
 
397
  with gr.Row():
398
  query_input = gr.Textbox(label="輸入您的問題,系統將基於學習到的知識資料提供相關答案。", placeholder="請輸入您的問題(支援同時輸入多個問題,例如:問題1?問題2?)", autofocus=True, scale=3, max_lines=5, elem_classes="query-input")
399
  api_key_input = gr.Textbox(label="請輸入您的 API Key", type="password", placeholder="API authentication key", scale=1, elem_classes="api-key-input")
400
+ answer_output = gr.Textbox(label="知識��答案", interactive=False, max_lines=40, elem_classes="answer-box")
401
  with gr.Row():
402
  insight_q1 = gr.Button("洞察問題 1", visible=False, elem_classes=["insight-btn"])
403
  insight_q2 = gr.Button("洞察問題 2", visible=False, elem_classes=["insight-btn"])
404
  insight_q3 = gr.Button("洞察問題 3", visible=False, elem_classes=["insight-btn"])
405
  state = gr.State()
406
  current_question = gr.Textbox(lines=2, label="當前問題", visible=False)
407
+ law_summary = gr.HTML(label="法令彙編函釋檢索", elem_classes="text-background", visible=False) # Set initial visibility to False
408
  with gr.Row():
409
  submit_btn = gr.Button("傳送", variant="primary", scale=3, elem_id="submit-btn")
410
  clear_button = gr.Button("清除", variant="secondary", scale=1, elem_classes="clear-button")
411
+ gr.HTML(
412
+ """
413
+ <span style="font-size: 18px; color: black;">※ 財政部各稅法令函釋檢索系統:</span><a href="https://ttc.mof.gov.tw/" title="財政部各稅法令函釋檢索系統" style="font-size: 18px; color: red;">https://ttc.mof.gov.tw/</a>
414
+ """
415
+ )
416
+ def update_ui(answer, q1, q2, q3, state, current_q, law_summary):
417
  return [
418
  answer,
419
  gr.update(value=q1, visible=bool(q1)),
420
  gr.update(value=q2, visible=bool(q2)),
421
  gr.update(value=q3, visible=bool(q3)),
422
  state,
423
+ current_q,
424
+ gr.update(value=law_summary, visible=bool(law_summary.strip()))
425
  ]
426
  submit_btn.click(
427
  fn=handle_interaction,
428
  inputs=[query_input, api_key_input, state],
429
+ outputs=[answer_output, insight_q1, insight_q2, insight_q3, state, current_question, law_summary]
430
  ).then(
431
  fn=update_ui,
432
+ inputs=[answer_output, insight_q1, insight_q2, insight_q3, state, current_question, law_summary],
433
+ outputs=[answer_output, insight_q1, insight_q2, insight_q3, state, current_question, law_summary]
434
  )
435
  for btn in [insight_q1, insight_q2, insight_q3]:
436
  btn.click(
 
439
  outputs=[query_input]
440
  )
441
  def clear_outputs():
442
+ return "", "", gr.update(value="", visible=False)
443
  clear_button.click(
444
  fn=clear_outputs,
445
  inputs=[],
446
+ outputs=[query_input, answer_output, law_summary]
447
  )
448
 
449
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -1,9 +1,11 @@
 
 
1
  langchain
2
  transformers
3
  langchain-groq
4
  chromadb
5
  langchain-community
6
  langchain-huggingface
7
- gradio
8
  python-dotenv
9
- beautifulsoup4
 
 
1
+ gradio
2
+ openai
3
  langchain
4
  transformers
5
  langchain-groq
6
  chromadb
7
  langchain-community
8
  langchain-huggingface
 
9
  python-dotenv
10
+ beautifulsoup4
11
+ requests