Rafa1986 committed on
Commit
5baddf7
·
verified ·
1 Parent(s): fbe2154

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -25
app.py CHANGED
@@ -8,6 +8,8 @@ from io import BytesIO
8
  from langchain_community.embeddings import OpenAIEmbeddings
9
  from langchain_community.vectorstores import FAISS
10
  from langchain_community.llms import OpenAI
 
 
11
 
12
  def detect_language(text):
13
  """Detects the language of the input text using OpenAI."""
@@ -23,20 +25,34 @@ def detect_language(text):
23
  # Set up OpenAI API key (replace with your key)
24
  openai.api_key = "YOUR_OPENAI_API_KEY"
25
 
26
- def extract_files_from_zip(zip_file):
27
- """Extracts PDF, TXT, and CSV files from a ZIP archive."""
28
  extracted_files = {"pdf": [], "txt": [], "csv": []}
29
- with zipfile.ZipFile(zip_file, 'r') as zip_ref:
 
30
  for file_name in zip_ref.namelist():
31
- with zip_ref.open(file_name) as file:
32
- if file_name.endswith(".pdf"):
33
- extracted_files["pdf"].append(BytesIO(file.read()))
34
- elif file_name.endswith(".txt"):
35
- extracted_files["txt"].append(BytesIO(file.read()))
36
- elif file_name.endswith(".csv"):
37
- extracted_files["csv"].append(BytesIO(file.read()))
 
 
38
  return extracted_files
39
 
 
 
 
 
 
 
 
 
 
 
 
40
  def get_text_from_pdf(pdf_files):
41
  text = ""
42
  for pdf in pdf_files:
@@ -65,7 +81,7 @@ def create_vector_database(text):
65
  vector_db = FAISS.from_texts(texts, embeddings)
66
  return vector_db
67
 
68
- def get_answer(question, vector_db):
69
  retriever = vector_db.as_retriever()
70
  docs = retriever.get_relevant_documents(question)
71
 
@@ -77,32 +93,33 @@ def get_answer(question, vector_db):
77
  response = openai.ChatCompletion.create(
78
  model="gpt-3.5-turbo",
79
  messages=[
80
- {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions."},
81
- {"role": "user", "content": question + "\n\nBased on the following context:\n" + context}
82
  ]
83
  )
84
  return response["choices"][0]["message"]["content"]
85
 
86
- def chatbot_interface(zip_file, question):
87
- text = ""
88
- if zip_file:
89
- extracted_files = extract_files_from_zip(zip_file)
90
- text += get_text_from_pdf(extracted_files["pdf"])
91
- text += get_text_from_txt(extracted_files["txt"])
92
- text += get_text_from_csv(extracted_files["csv"])
93
 
94
  if not text:
95
- return "Please upload a ZIP file containing PDFs, TXTs, or CSVs before asking questions."
96
 
 
97
  vector_db = create_vector_database(text)
98
- return get_answer(question, vector_db)
99
 
100
  # Gradio interface
101
  demo = gr.Interface(
102
  fn=chatbot_interface,
103
- inputs=[gr.File(file_types=[".zip"]),
104
- gr.Textbox(placeholder="Type your question here...")],
105
- outputs=gr.Textbox()
106
  )
107
 
108
  demo.launch()
 
 
8
  from langchain_community.embeddings import OpenAIEmbeddings
9
  from langchain_community.vectorstores import FAISS
10
  from langchain_community.llms import OpenAI
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+
13
 
14
  def detect_language(text):
15
  """Detects the language of the input text using OpenAI."""
 
25
  # Set up OpenAI API key (replace with your key)
26
  openai.api_key = "YOUR_OPENAI_API_KEY"
27
 
28
def extract_files_from_zip(zip_path):
    """Extract PDF, TXT, and CSV files from a ZIP archive, including subfolders.

    Extensions are matched case-insensitively, so ``REPORT.PDF`` is treated
    the same as ``report.pdf``.

    Args:
        zip_path: Path to the archive, or a file-like object accepted by
            ``zipfile.ZipFile``.

    Returns:
        A dict with the keys ``"pdf"``, ``"txt"`` and ``"csv"``. Each value is
        a list of ``BytesIO`` buffers holding the raw bytes of the matching
        archive members, in archive order.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a valid ZIP archive.
    """
    extracted_files = {"pdf": [], "txt": [], "csv": []}

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            # Archives created by macOS Finder carry "__MACOSX/._name.pdf"
            # metadata stubs that share the extension but are not documents.
            if file_name.startswith("__MACOSX/"):
                continue

            # Compare on a lowercased copy so upper/mixed-case extensions are
            # not silently dropped; the archive is still read by its real name.
            lowered_name = file_name.lower()
            for extension in extracted_files:
                if lowered_name.endswith("." + extension):
                    extracted_files[extension].append(
                        BytesIO(zip_ref.read(file_name))
                    )
                    break
    return extracted_files
44
 
45
def analyze_text(text):
    """Ask the OpenAI chat model for key points, links, and extra info in *text*.

    *text* is sent unmodified as the user message to ``gpt-3.5-turbo``; the
    content of the first returned choice is returned with surrounding
    whitespace stripped.
    """
    system_message = {
        "role": "system",
        "content": "Analyze this document and extract key points, links, and complementary information.",
    }
    user_message = {"role": "user", "content": text}

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()
55
+
56
  def get_text_from_pdf(pdf_files):
57
  text = ""
58
  for pdf in pdf_files:
 
81
  vector_db = FAISS.from_texts(texts, embeddings)
82
  return vector_db
83
 
84
+ def get_answer(question, vector_db, analysis):
85
  retriever = vector_db.as_retriever()
86
  docs = retriever.get_relevant_documents(question)
87
 
 
93
  response = openai.ChatCompletion.create(
94
  model="gpt-3.5-turbo",
95
  messages=[
96
+ {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents and their analyses to answer questions."},
97
+ {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\nAdditional insights:\n" + analysis}
98
  ]
99
  )
100
  return response["choices"][0]["message"]["content"]
101
 
102
def chatbot_interface(zip_file_path, question):
    """Answer *question* from the documents inside the uploaded ZIP archive.

    Returns a user-facing message instead of an answer when no archive was
    supplied, or when no text could be read from its PDF, TXT, or CSV members.
    """
    if not zip_file_path:
        return "Please upload a ZIP file before asking a question."

    files_by_type = extract_files_from_zip(zip_file_path)
    # Concatenate in the fixed order PDF -> TXT -> CSV.
    text = "".join((
        get_text_from_pdf(files_by_type["pdf"]),
        get_text_from_txt(files_by_type["txt"]),
        get_text_from_csv(files_by_type["csv"]),
    ))
    if not text:
        return "The ZIP file does not contain valid PDF, TXT, or CSV files. Please upload supported file types."

    # The analysis is requested before the vector index is built.
    analysis = analyze_text(text)
    vector_db = create_vector_database(text)
    return get_answer(question, vector_db, analysis)
115
 
116
  # Gradio interface
117
  demo = gr.Interface(
118
  fn=chatbot_interface,
119
+ inputs=[gr.File(label="Upload ZIP File"),
120
+ gr.Textbox(label="Ask a question", placeholder="Type your question here...")],
121
+ outputs=gr.Textbox(label="Answer")
122
  )
123
 
124
  demo.launch()
125
+