Rafa1986 committed on
Commit
a008248
·
verified ·
1 Parent(s): 5baddf7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -50
app.py CHANGED
@@ -3,14 +3,11 @@ import os
3
  import PyPDF2
4
  import pandas as pd
5
  import openai
6
- import zipfile
7
- from io import BytesIO
8
  from langchain_community.embeddings import OpenAIEmbeddings
9
  from langchain_community.vectorstores import FAISS
10
  from langchain_community.llms import OpenAI
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
 
13
-
14
  def detect_language(text):
15
  """Detects the language of the input text using OpenAI."""
16
  response = openai.ChatCompletion.create(
@@ -25,52 +22,42 @@ def detect_language(text):
25
  # Set up OpenAI API key (replace with your key)
26
  openai.api_key = "YOUR_OPENAI_API_KEY"
27
 
28
- def extract_files_from_zip(zip_path):
29
- """Extracts PDF, TXT, and CSV files from a ZIP archive, including subfolders."""
30
  extracted_files = {"pdf": [], "txt": [], "csv": []}
31
 
32
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
33
- for file_name in zip_ref.namelist():
34
- if file_name.endswith(('.pdf', '.txt', '.csv')):
35
- with zip_ref.open(file_name) as file:
36
- content = file.read()
37
- if file_name.endswith(".pdf"):
38
- extracted_files["pdf"].append(BytesIO(content))
39
- elif file_name.endswith(".txt"):
40
- extracted_files["txt"].append(BytesIO(content))
41
- elif file_name.endswith(".csv"):
42
- extracted_files["csv"].append(BytesIO(content))
43
  return extracted_files
44
 
45
- def analyze_text(text):
46
- """Uses OpenAI to analyze notes, links, and complementary information in the text."""
47
- response = openai.ChatCompletion.create(
48
- model="gpt-3.5-turbo",
49
- messages=[
50
- {"role": "system", "content": "Analyze this document and extract key points, links, and complementary information."},
51
- {"role": "user", "content": text}
52
- ]
53
- )
54
- return response["choices"][0]["message"]["content"].strip()
55
-
56
- def get_text_from_pdf(pdf_files):
57
  text = ""
58
- for pdf in pdf_files:
59
- reader = PyPDF2.PdfReader(pdf)
60
- for page in reader.pages:
61
- text += page.extract_text() + "\n"
62
  return text
63
 
64
- def get_text_from_txt(txt_files):
65
  text = ""
66
- for txt in txt_files:
67
- text += txt.read().decode("utf-8") + "\n"
 
 
 
68
  return text
69
 
70
  def get_text_from_csv(csv_files):
71
  text = ""
72
- for csv in csv_files:
73
- df = pd.read_csv(csv)
74
  text += df.to_string() + "\n"
75
  return text
76
 
@@ -81,7 +68,18 @@ def create_vector_database(text):
81
  vector_db = FAISS.from_texts(texts, embeddings)
82
  return vector_db
83
 
84
- def get_answer(question, vector_db, analysis):
 
 
 
 
 
 
 
 
 
 
 
85
  retriever = vector_db.as_retriever()
86
  docs = retriever.get_relevant_documents(question)
87
 
@@ -93,33 +91,33 @@ def get_answer(question, vector_db, analysis):
93
  response = openai.ChatCompletion.create(
94
  model="gpt-3.5-turbo",
95
  messages=[
96
- {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents and their analyses to answer questions."},
97
- {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\nAdditional insights:\n" + analysis}
98
  ]
99
  )
100
  return response["choices"][0]["message"]["content"]
101
 
102
- def chatbot_interface(zip_file_path, question):
103
- if not zip_file_path:
104
- return "Please upload a ZIP file before asking a question."
105
 
106
- extracted_files = extract_files_from_zip(zip_file_path)
107
- text = get_text_from_pdf(extracted_files["pdf"]) + get_text_from_txt(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"])
 
108
 
109
  if not text:
110
- return "The ZIP file does not contain valid PDF, TXT, or CSV files. Please upload supported file types."
111
 
112
- analysis = analyze_text(text)
113
  vector_db = create_vector_database(text)
114
- return get_answer(question, vector_db, analysis)
115
 
116
  # Gradio interface
117
  demo = gr.Interface(
118
  fn=chatbot_interface,
119
- inputs=[gr.File(label="Upload ZIP File"),
120
  gr.Textbox(label="Ask a question", placeholder="Type your question here...")],
121
  outputs=gr.Textbox(label="Answer")
122
  )
123
 
124
  demo.launch()
125
-
 
3
  import PyPDF2
4
  import pandas as pd
5
  import openai
 
 
6
  from langchain_community.embeddings import OpenAIEmbeddings
7
  from langchain_community.vectorstores import FAISS
8
  from langchain_community.llms import OpenAI
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
 
 
11
  def detect_language(text):
12
  """Detects the language of the input text using OpenAI."""
13
  response = openai.ChatCompletion.create(
 
22
  # Set up OpenAI API key (replace with your key)
23
  openai.api_key = "YOUR_OPENAI_API_KEY"
24
 
25
def extract_files_from_folder(folder_path):
    """Scan a folder and its subfolders for PDF, TXT, and CSV files.

    Args:
        folder_path: Directory to walk recursively.

    Returns:
        A dict with the keys "pdf", "txt" and "csv", each mapping to a
        sorted list of the matching file paths. Every list is empty when
        the folder does not exist or holds no supported files.
    """
    extracted_files = {"pdf": [], "txt": [], "csv": []}

    for root, _, files in os.walk(folder_path):
        for file_name in files:
            # Match on the lower-cased name so "REPORT.PDF" is not skipped.
            lowered = file_name.lower()
            for kind, paths in extracted_files.items():
                if lowered.endswith("." + kind):
                    paths.append(os.path.join(root, file_name))
                    break

    # os.walk yields entries in filesystem order; sort so the concatenated
    # document text is reproducible between runs.
    for paths in extracted_files.values():
        paths.sort()
    return extracted_files
39
 
40
def read_text_from_files(file_paths):
    """Read and concatenate the text content of a list of files.

    Args:
        file_paths: Iterable of paths to text files.

    Returns:
        The content of every file, in the given order, each followed by a
        newline. An empty iterable yields an empty string.
    """
    chunks = []
    for file_path in file_paths:
        # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM, which
        # would otherwise leak into the text as "\ufeff". errors="ignore"
        # drops undecodable bytes instead of aborting on one bad file.
        with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as file:
            chunks.append(file.read() + "\n")
    # Join once rather than growing a string with += on every file.
    return "".join(chunks)
47
 
48
def get_text_from_pdf(pdf_files):
    """Extract the text of every page from a list of PDF files.

    Args:
        pdf_files: Iterable of paths to PDF files.

    Returns:
        The extracted text of all pages, in order, each page followed by a
        newline. An empty iterable yields an empty string.
    """
    pages_text = []
    for pdf_path in pdf_files:
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                # Defensive: treat a missing extraction result as empty text
                # so a page without extractable text cannot raise on "+".
                pages_text.append((page.extract_text() or "") + "\n")
    # Join once rather than growing a string with += on every page.
    return "".join(pages_text)
56
 
57
def get_text_from_csv(csv_files):
    """Render a list of CSV files as plain text.

    Args:
        csv_files: Iterable of paths to CSV files.

    Returns:
        The ``DataFrame.to_string()`` rendering of every parsable file, in
        order, each followed by a newline. Files with no data at all are
        skipped; an empty iterable yields an empty string.
    """
    tables = []
    for csv_path in csv_files:
        try:
            df = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            # A zero-byte CSV has no columns to parse; skip it rather than
            # failing the whole request because of one empty file.
            continue
        tables.append(df.to_string() + "\n")
    # Join once rather than growing a string with += on every file.
    return "".join(tables)
63
 
 
68
  vector_db = FAISS.from_texts(texts, embeddings)
69
  return vector_db
70
 
71
def correct_exercises(text):
    """Use OpenAI to correct and complete exercises found in the documents.

    Args:
        text: Document text to send to the model.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string when ``text`` is empty or whitespace-only.
    """
    # Nothing to analyse: skip the API round trip entirely.
    if not text or not text.strip():
        return ""

    # Uses the same openai.ChatCompletion interface as the other calls in
    # this file.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Analyze the text and complete or correct any incomplete exercises."},
            {"role": "user", "content": text},
        ],
    )
    return response["choices"][0]["message"]["content"].strip()
81
+
82
+ def get_answer(question, vector_db, corrected_exercises):
83
  retriever = vector_db.as_retriever()
84
  docs = retriever.get_relevant_documents(question)
85
 
 
91
  response = openai.ChatCompletion.create(
92
  model="gpt-3.5-turbo",
93
  messages=[
94
+ {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions. Also, use the corrected exercises if relevant."},
95
+ {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\nCorrected Exercises:\n" + corrected_exercises}
96
  ]
97
  )
98
  return response["choices"][0]["message"]["content"]
99
 
100
def chatbot_interface(folder_path, question):
    """Answer a question using the documents found under a folder.

    Args:
        folder_path: Path to the folder holding PDF, TXT and CSV files.
        question: The user's question.

    Returns:
        The generated answer, or a user-facing message when no folder path
        was given or no usable document text was found.
    """
    # The path comes from a free-text box; tolerate None and stray whitespace.
    folder_path = (folder_path or "").strip()
    if not folder_path:
        return "Please provide a folder path before asking a question."

    extracted_files = extract_files_from_folder(folder_path)

    text = (
        get_text_from_pdf(extracted_files["pdf"])
        + read_text_from_files(extracted_files["txt"])
        + get_text_from_csv(extracted_files["csv"])
    )

    # The readers append a newline per file, so a folder holding only empty
    # files produces whitespace-only text; treat that as "no content" too.
    if not text.strip():
        return "The folder does not contain valid PDF, TXT, or CSV files. Please upload supported file types."

    corrected_exercises = correct_exercises(text)
    vector_db = create_vector_database(text)
    return get_answer(question, vector_db, corrected_exercises)
114
 
115
  # Gradio interface
116
  demo = gr.Interface(
117
  fn=chatbot_interface,
118
+ inputs=[gr.Textbox(label="Folder Path", placeholder="Enter the path to the folder containing the documents"),
119
  gr.Textbox(label="Ask a question", placeholder="Type your question here...")],
120
  outputs=gr.Textbox(label="Answer")
121
  )
122
 
123
  demo.launch()