Spaces: Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ import PyPDF2
|
|
4 |
import pandas as pd
|
5 |
import openai
|
6 |
import docx
|
|
|
7 |
from docx import Document
|
8 |
from langchain_community.embeddings import OpenAIEmbeddings
|
9 |
from langchain_community.vectorstores import FAISS
|
@@ -45,13 +46,31 @@ def extract_files_from_folder(folder_path):
|
|
45 |
print("Files found:", extracted_files) # Debugging log
|
46 |
return extracted_files
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
def read_text_from_files(file_paths):
|
49 |
"""Reads text content from a list of files."""
|
50 |
text = ""
|
51 |
for file_path in file_paths:
|
52 |
print(f"Reading text file: {file_path}") # Debugging log
|
53 |
with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
|
54 |
-
|
|
|
|
|
55 |
return text
|
56 |
|
57 |
def get_text_from_pdf(pdf_files):
|
@@ -103,26 +122,8 @@ def correct_exercises(text):
|
|
103 |
)
|
104 |
return response["choices"][0]["message"]["content"].strip()
|
105 |
|
106 |
-
def get_answer(question, vector_db, corrected_exercises):
    """Answer *question* from the vector store, falling back with a prompt to search externally.

    Retrieves relevant documents from ``vector_db``; if none are found, returns a
    fixed fallback message. Otherwise asks the chat model to answer in the
    question's detected language, grounded in the retrieved text and the
    corrected exercises.
    """
    matching_docs = vector_db.as_retriever().get_relevant_documents(question)

    # Guard: nothing retrieved — offer to look outside the indexed documents.
    if not matching_docs:
        return "I could not find the answer in the documents. Do you want me to search external sources?"

    document_context = "\n".join(doc.page_content for doc in matching_docs)
    reply_language = detect_language(question)

    system_prompt = f"You are a Data Analytics assistant. Answer in {reply_language}. Use the documents to answer questions. Also, use the corrected exercises if relevant."
    user_prompt = (
        question
        + "\n\nBased on the following document content:\n"
        + document_context
        + "\n\nCorrected Exercises:\n"
        + corrected_exercises
    )

    # NOTE(review): openai.ChatCompletion is the legacy (<1.0) client API — confirm the
    # pinned openai version before upgrading.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    return completion["choices"][0]["message"]["content"]
|
123 |
-
|
124 |
def chatbot_interface(question):
|
125 |
-
folder_path = "/mnt/data/
|
126 |
extracted_files = extract_files_from_folder(folder_path)
|
127 |
|
128 |
text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"]) + get_text_from_docx(extracted_files["docx"])
|
@@ -141,4 +142,4 @@ demo = gr.Interface(
|
|
141 |
outputs=gr.Textbox(label="Answer")
|
142 |
)
|
143 |
|
144 |
-
demo.launch()
|
|
|
4 |
import pandas as pd
|
5 |
import openai
|
6 |
import docx
|
7 |
+
import requests
|
8 |
from docx import Document
|
9 |
from langchain_community.embeddings import OpenAIEmbeddings
|
10 |
from langchain_community.vectorstores import FAISS
|
|
|
46 |
print("Files found:", extracted_files) # Debugging log
|
47 |
return extracted_files
|
48 |
|
49 |
+
def extract_links_from_text(text, max_chars=1000, timeout=5):
    """Extract http(s) URLs from *text* and fetch an excerpt of each page.

    Args:
        text: Arbitrary text that may contain URLs.
        max_chars: Maximum number of characters kept from each fetched page
            (previously hard-coded to 1000).
        timeout: Per-request timeout in seconds (previously hard-coded to 5).

    Returns:
        A single string containing, per link, either a tagged excerpt of the
        page body or a note that the link could not be accessed. Empty string
        when *text* contains no links.
    """
    import re

    links = re.findall(r'https?://\S+', text)

    # Accumulate parts in a list and join once — avoids quadratic str += in the loop.
    parts = []
    for link in links:
        try:
            response = requests.get(link, timeout=timeout)
            if response.status_code == 200:
                # Bound the excerpt so one huge page cannot blow up the prompt.
                parts.append(f"\n[Extracted from {link}]\n" + response.text[:max_chars])
        except requests.exceptions.RequestException:
            parts.append(f"\n[Could not access {link}]\n")
    return "".join(parts)
|
64 |
+
|
65 |
def read_text_from_files(file_paths):
    """Read and concatenate the contents of the given text files.

    Each file is decoded as UTF-8 (undecodable bytes ignored); after each
    file's text, any web content reachable from links inside it is appended
    via extract_links_from_text.
    """
    pieces = []
    for path in file_paths:
        print(f"Reading text file: {path}")  # debugging log
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            contents = handle.read()
        pieces.append(contents + "\n")
        pieces.append(extract_links_from_text(contents))  # extract and add web content
    return "".join(pieces)
|
75 |
|
76 |
def get_text_from_pdf(pdf_files):
|
|
|
122 |
)
|
123 |
return response["choices"][0]["message"]["content"].strip()
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
def chatbot_interface(question):
|
126 |
+
folder_path = "/mnt/data/New_Data_Analytics/"
|
127 |
extracted_files = extract_files_from_folder(folder_path)
|
128 |
|
129 |
text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"]) + get_text_from_docx(extracted_files["docx"])
|
|
|
142 |
outputs=gr.Textbox(label="Answer")
|
143 |
)
|
144 |
|
145 |
+
demo.launch()
|