Update app.py
app.py
CHANGED
@@ -4,6 +4,7 @@ import PyPDF2
 import pandas as pd
 import openai
 import docx
+import requests
 from docx import Document
 from langchain_community.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import FAISS
@@ -45,13 +46,31 @@ def extract_files_from_folder(folder_path):
     print("Files found:", extracted_files)  # Debugging log
     return extracted_files
 
+def extract_links_from_text(text):
+    """Extracts links from text files and fetches their content."""
+    import re
+    links = re.findall(r'https?://\S+', text)
+    extracted_content = ""
+
+    for link in links:
+        try:
+            response = requests.get(link, timeout=5)
+            if response.status_code == 200:
+                extracted_content += f"\n[Extracted from {link}]\n" + response.text[:1000]  # Limit to first 1000 chars
+        except requests.exceptions.RequestException:
+            extracted_content += f"\n[Could not access {link}]\n"
+
+    return extracted_content
+
 def read_text_from_files(file_paths):
     """Reads text content from a list of files."""
     text = ""
     for file_path in file_paths:
         print(f"Reading text file: {file_path}")  # Debugging log
         with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
-            text += file.read() + "\n"
+            file_text = file.read()
+            text += file_text + "\n"
+            text += extract_links_from_text(file_text)  # Extract and add web content
     return text
 
 def get_text_from_pdf(pdf_files):
@@ -103,26 +122,8 @@ def correct_exercises(text):
     )
     return response["choices"][0]["message"]["content"].strip()
 
-def get_answer(question, vector_db, corrected_exercises):
-    retriever = vector_db.as_retriever()
-    docs = retriever.get_relevant_documents(question)
-
-    if not docs:
-        return "I could not find the answer in the documents. Do you want me to search external sources?"
-
-    context = "\n".join([doc.page_content for doc in docs])
-    language = detect_language(question)
-    response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions. Also, use the corrected exercises if relevant."},
-            {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\nCorrected Exercises:\n" + corrected_exercises}
-        ]
-    )
-    return response["choices"][0]["message"]["content"]
-
 def chatbot_interface(question):
-    folder_path = "/mnt/data/"
+    folder_path = "/mnt/data/New_Data_Analytics/"
     extracted_files = extract_files_from_folder(folder_path)
 
     text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"]) + get_text_from_docx(extracted_files["docx"])
@@ -141,4 +142,4 @@ demo = gr.Interface(
     outputs=gr.Textbox(label="Answer")
 )
 
-demo.launch()
+demo.launch()
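For a quick standalone check of the helper this commit adds, the sketch below copies its logic into a self-contained script; the sample input string is hypothetical and only `requests` needs to be installed.

import re
import requests

def extract_links_from_text(text):
    """Same logic as the helper added in this commit."""
    links = re.findall(r'https?://\S+', text)
    extracted_content = ""
    for link in links:
        try:
            # Bounded fetch: 5-second timeout, keep only the first 1000 characters
            response = requests.get(link, timeout=5)
            if response.status_code == 200:
                extracted_content += f"\n[Extracted from {link}]\n" + response.text[:1000]
        except requests.exceptions.RequestException:
            extracted_content += f"\n[Could not access {link}]\n"
    return extracted_content

# Hypothetical sample input; any text containing http(s) URLs works.
print(extract_links_from_text("Course notes: https://example.com/syllabus"))

Note the trade-offs in the committed version: the regex `https?://\S+` will also capture trailing punctuation stuck to a URL, and `response.text[:1000]` returns raw HTML rather than visible page text, but both keep the fetch simple and bounded.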