# -*- coding: utf-8 -*- # 財政部財政資訊中心 江信宗 import os from dotenv import load_dotenv load_dotenv() import gradio as gr from openai import OpenAI from langchain_community.utils import user_agent from langchain_groq import ChatGroq from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_huggingface import HuggingFaceEmbeddings from langchain_community.vectorstores import Chroma from langchain.chains import RetrievalQA from langchain_community.document_loaders import WebBaseLoader, TextLoader from langchain.prompts import PromptTemplate from langchain.schema import Document import resend import requests import re import time def load_documents(sources): documents = [] for source in sources: try: if isinstance(source, str): if source.startswith('http'): loader = WebBaseLoader(source) else: loader = TextLoader(source) documents.extend(loader.load()) elif isinstance(source, dict): documents.append(Document(page_content=source['content'], metadata=source.get('metadata', {}))) except Exception as e: print(f"Error loading source {source}: {str(e)}") return documents sources = [ "TaxQADataSet_Slim1.txt", "TaxQADataSet_Slim2.txt", "TaxQADataSet_Slim3.txt", "TaxQADataSet_Slim4.txt", "TaxQADataSet_Slim5.txt", "TaxQADataSet_Slim6.txt", "TaxQADataSet_ntpc1.txt", "TaxQADataSet_ntpc2.txt", "TaxQADataSet_kctax.txt", "TaxQADataSet_chutax.txt", "LandTaxAct1100623.txt", "TheEnforcementRulesoftheLandTaxAct1100923.txt", "HouseTaxAct1130103.txt", "VehicleLicenseTaxAct1101230.txt", "TaxCollectionAct1101217.txt", "AmusementTaxAct960523.txt", "StampTaxAct910515.txt", "DeedTaxAct990505.txt" ] documents = load_documents(sources) print(f"\n成功載入 {len(documents)} 個檔案") text_splitter = RecursiveCharacterTextSplitter( chunk_size=512, chunk_overlap=50, length_function=len, is_separator_regex=False, separators=["\n\n\n","\n\n", "\n", "。"] ) split_docs = text_splitter.split_documents(documents) print(f"分割後的文件數量:{len(split_docs)}") embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large") print(f"\n成功初始化 Microsoft 嵌入模型") print(f"\n開始建立向量資料庫") vectorstore = Chroma.from_documents(split_docs, embeddings, persist_directory="./Knowledge-base") print(f"成功建立 Chroma 向量資料庫,共有 {len(split_docs)} 個文檔") retriever = vectorstore.as_retriever( search_type="mmr", search_kwargs={ "k": min(4, len(split_docs)), "fetch_k": min(20, len(split_docs)), "lambda_mult": 0.8 } ) print(f"檢索演算法:Maximum Marginal Relevance Retrieval") print(f"檢索文檔數量:k={min(4, len(split_docs))}, fetch_k={min(20, len(split_docs))}") template = """Let's work this out in a step by step way to be sure we have the right answer. Must reply to me in Taiwanese Traditional Chinese. 在回答之前,請仔細分析檢索到的上下文,確保你的回答準確完整反映了上下文中的訊息,而不是依賴先前的知識,在回應的答案中絕對不要提到是根據上下文回答。 如果檢索到的多個上下文之間存在聯繫,請整合這些訊息以提供更全面的回答,但要避免過度推斷。 如果檢索到的上下文不包含足夠回答問題的訊息,請誠實的說明,不要試圖編造答案。 上下文: {context} 問題: {question} 答案:""" PROMPT = PromptTemplate( template=template, input_variables=["context", "question"] ) def create_chain(llm): return RetrievalQA.from_chain_type( llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True, chain_type_kwargs={"prompt": PROMPT} ) print(f"成功建立 RAG Chain") def initialize_llm(api_key): return ChatGroq( groq_api_key=api_key, model_name='llama-3.3-70b-versatile' ) def generate_insight_questions(query, api_key): llm = initialize_llm(api_key) prompt = f"""Let's work this out in a step by step way to be sure we have the right answer. Must reply to me in "Traditional Chinese". 根據以下回答,生成3個相關的洞察問題: 原始問題: {query} 請提供3個簡短但有深度的問題,這些問題應該符合: 1. 與原始問題緊密相關 2. 準確重新描述原始問題 3. 引導更深入的解決原始問題 請直接列出這3個問題,每個問題一行,不要添加編號或其他文字。 """ try: response = llm.invoke(prompt) if hasattr(response, 'content'): questions = response.content.split('\n') else: questions = str(response).split('\n') while len(questions) < 3: questions.append("提供更多地方稅資訊") return questions[:3] except Exception as e: print(f"Error generating insight questions:{str(e)}") return ["提供更多地方稅資訊", "提供其他地方稅問題", "還想了解什麼地方稅目"] def answer_question(query, api_key): try: gr.Info("檢索地方稅知識庫中......") llm = initialize_llm(api_key) chain = create_chain(llm) result = chain.invoke({"query": query}) answer = result["result"] insight_questions = generate_insight_questions(query, api_key) while len(insight_questions) < 3: insight_questions.append("提供更多地方稅資訊") return answer, insight_questions[:3] except Exception as e: return f"抱歉,處理您的問題時發生錯誤:{str(e)}", [] def split_questions(query): questions = re.split(r'[?!。 ]', query) return [q.strip() for q in questions if q.strip()] def answer_multiple_questions(query, api_key): questions = split_questions(query) all_answers = [] all_insight_questions = [] for question in questions: answer, insight_questions = answer_question(question, api_key) if len(questions) > 1: all_answers.append(f"【問題】{question}\n答案:{answer}") else: all_answers.append(answer) all_insight_questions.extend(insight_questions) if len(questions) > 1: combined_answer = "\n\n\n".join(all_answers) else: combined_answer = "\n".join(all_answers) selected_insight_questions = all_insight_questions[:3] return combined_answer, selected_insight_questions def get_tax_law(tax_type): tax_law_dict = { "房屋稅": "房屋稅條例", "地價稅": "土地稅法", "土地增值稅": "土地稅法", "增值稅": "土地稅法", "契稅": "契稅條例", "娛樂稅": "娛樂稅法", "印花稅": "印花稅法", "使用牌照稅": "使用牌照稅法", "牌照稅": "使用牌照稅法", "稅捐稽徵法": "稅捐稽徵法", "綜合所得稅": "所得稅法", "所得稅": "所得稅法", "遺產稅": "遺產及贈與稅法", "贈與稅": "遺產及贈與稅法", "營業稅": "營業稅法" } return tax_law_dict.get(tax_type, "無稅法") def fetch_law_summary(query, tax_law, keywords): url = "https://ttc.mof.gov.tw/Api/GetData" headers = { "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", "accept": "application/json, text/javascript, */*; q=0.01", "accept-encoding": "gzip, deflate, br, zstd", "accept-language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7", "referer": "https://ttc.mof.gov.tw/" } gr.Info("檢索法令彙編函釋中......") version_payload = { "FunctionID": "FB10001", "ObjParams[TaxAct]": tax_law, "ObjParams[TaxVer]": "請選擇", "ObjParams[Chapter]": "請選擇", "ObjParams[Article]": "請選擇", "ObjParams[Content]": "", "ObjParams[Operator01]": "0", "ObjParams[Content01]": "", "ObjParams[Operator02]": "0", "ObjParams[Content02]": "" } try: version_response = requests.post(url, data=version_payload, headers=headers) version_response.raise_for_status() version_data = version_response.json() if version_data["Code"] == "1" and "Table1" in version_data["Data"]: latest_version = "請選擇" for item in version_data["Data"]["Table1"]: if item["TaxAct"] == tax_law: latest_version = item["TaxVer"] break if latest_version == "請選擇": print(f"未找到 {tax_law} 的對應版本,使用預設選項。") else: gr.Warning("無法獲取稅法版本資訊,使用預設選項。") latest_version = "請選擇" except Exception as e: print(f"獲取稅法版本時發生錯誤:{str(e)}") latest_version = "請選擇" all_results = [] for keyword in keywords: payload = { "FunctionID": "FB10001", "ObjParams[TaxAct]": tax_law, "ObjParams[TaxVer]": latest_version, "ObjParams[Chapter]": "請選擇", "ObjParams[Article]": "請選擇", "ObjParams[Content]": keyword, "ObjParams[Operator01]": "0", "ObjParams[Content01]": "", "ObjParams[Operator02]": "0", "ObjParams[Content02]": "" } try: response = requests.post(url, data=payload, headers=headers) response.raise_for_status() data = response.json() if data["Code"] == "1" and "Table" in data["Data"]: all_results.extend(data["Data"]["Table"]) except Exception as e: print(f"檢索關鍵字 '{keyword}' 的法令彙編函釋時發生錯誤:{str(e)}") if all_results: summary = f"
{result['Content']}
未檢索到相關法令彙編函釋。
" def llm_openai_api(query, answer): client = OpenAI( api_key=os.environ.get("YOUR_API_TOKEN"), base_url="https://api.sambanova.ai/v1", ) user_prompt = f""" 「題目:{query} 答案:{answer}」 請詳細分析答案內容後,依據與題目相關性最高的稅目名稱及最多3個重點關鍵字回應我,提供的3個重點關鍵字不能與稅目名稱相同,問題與答案中的稅目名稱列入TaxName,關鍵字列入KeyWord,只須根據格式回應,不要寫其他的。 # 回應字典格式範例: {{"TaxName": "地價稅", "KeyWord": "宿舍用地,醫護人員"}} """ try: response = client.chat.completions.create( model='Meta-Llama-3.1-405B-Instruct', messages=[ {"role": "system", "content": "Must reply to user in Traditional Chinese."}, {"role": "user", "content": user_prompt} ], temperature=0.7, top_p=0.9 ) return response.choices[0].message.content.strip() except Exception as e: print(f"檢索法令彙編函釋 API Key!Error: {str(e)}") gr.Warning(f"檢索法令彙編函釋 API Key 額度不足!!") return '{"TaxName": "", "KeyWord": ""}' def handle_interaction(query, api_key, state): gr.Info("開始處理問題,請稍待片刻......") start_time = time.time() if state is None: state = {"history": []} if not api_key: resend.api_key = os.environ["YOUR_USE_API_KEY"] params: resend.Emails.SendParams = { "from": "Tax_KM