Spaces:
Sleeping
Sleeping
File size: 3,689 Bytes
91c1e78 65881ce 94f3898 51ac55a 0302345 a52be2f 0530099 5baddf7 65881ce 91c1e78 65881ce 91c1e78 7e7a2a5 0302345 5baddf7 8b5a642 a008248 0302345 a008248 a64a105 0302345 8b5a642 fbe2154 0c80efa 0302345 0c80efa 0302345 65881ce 0302345 65881ce 0c80efa 91c1e78 65881ce a64a105 5baddf7 91c1e78 0c80efa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
import gradio as gr
import os
import PyPDF2
import pandas as pd
import openai
import docx
import requests
import json
from docx import Document
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
def detect_language(text):
    """Ask the OpenAI chat model which language ``text`` is written in.

    ``text`` is sent as the user message next to a system instruction asking
    the model to detect its language. The content of the first returned
    choice is handed back with surrounding whitespace stripped.
    """
    conversation = [
        {"role": "system", "content": "Detect the language of this text."},
        {"role": "user", "content": text},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()
# Set up the OpenAI API key. Prefer the OPENAI_API_KEY environment variable so
# the secret never has to be committed to source; when it is unset or empty,
# fall back to the placeholder below (replace with your key).
openai.api_key = os.environ.get("OPENAI_API_KEY") or "YOUR_OPENAI_API_KEY"
def extract_files_from_folder(folder_path):
    """Scan a folder and its subfolders for PDF, TXT, CSV, DOCX, and IPYNB files.

    Walks ``folder_path`` with ``os.walk`` and groups the paths of supported
    files by type. Extensions are matched case-insensitively (``REPORT.PDF``
    counts as a PDF), and subfolders and file names are visited in sorted
    order so the result does not depend on the file system's listing order.

    Args:
        folder_path: Root folder to scan. A missing folder produces empty
            lists, because ``os.walk`` ignores listing errors by default.

    Returns:
        A dict with the keys ``"pdf"``, ``"txt"``, ``"csv"``, ``"docx"`` and
        ``"ipynb"``, each mapping to a list of matching file paths.
    """
    suffix_to_kind = {
        ".pdf": "pdf",
        ".txt": "txt",
        ".csv": "csv",
        ".docx": "docx",
        ".ipynb": "ipynb",
    }
    extracted_files = {kind: [] for kind in suffix_to_kind.values()}
    print(f"Scanning folder: {folder_path}")
    for root, subdirs, files in os.walk(folder_path):
        # Sorting in place makes os.walk descend into subfolders in a stable order.
        subdirs.sort()
        print(f"Checking folder: {root}")  # Debugging log for subfolders
        for file_name in sorted(files):
            file_path = os.path.join(root, file_name)
            print(f"Found file: {file_path}")
            # Compare on the lower-cased name so ".PDF", ".Csv", etc. are not skipped.
            lowered_name = file_name.lower()
            for suffix, kind in suffix_to_kind.items():
                if lowered_name.endswith(suffix):
                    extracted_files[kind].append(file_path)
                    break
    print("Files found:", extracted_files)  # Debugging log
    return extracted_files
def combine_text_from_files(extracted_files):
    """Concatenate the text read from every group of extracted files.

    ``extracted_files`` is indexed by the keys "pdf", "txt", "csv", "docx"
    and "ipynb". Each list is passed to its reader and the results are joined
    with ``+`` in that order.

    NOTE(review): the five reader helpers called here are not defined in the
    visible part of this file - confirm they exist before relying on this.
    """
    combined = get_text_from_pdf(extracted_files["pdf"])
    combined = combined + read_text_from_files(extracted_files["txt"])
    combined = combined + get_text_from_csv(extracted_files["csv"])
    combined = combined + get_text_from_docx(extracted_files["docx"])
    combined = combined + get_text_from_ipynb(extracted_files["ipynb"])
    return combined
def generate_response(question, text):
    """Answer ``question`` with OpenAI, grounded in the extracted ``text``.

    The user message is the question followed by the document content; the
    reply of the first returned choice is handed back with surrounding
    whitespace stripped.
    """
    # Limit to 3000 characters to avoid excessive token usage.
    excerpt = text[:3000]
    user_prompt = f"{question}\n\nBased on the following document content:\n{excerpt}"
    conversation = [
        {
            "role": "system",
            "content": "You are a data analytics assistant. Answer the question based on the provided document content.",
        },
        {"role": "user", "content": user_prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()
def chatbot_interface(question):
    """Answer ``question`` from the documents under ``New_Data_Analytics/``.

    Scans the folder, combines the text of the files found, and forwards the
    question plus that text to ``generate_response``. When the combined text
    is empty or whitespace-only, a message about unsupported content is
    returned instead.
    """
    found_files = extract_files_from_folder("New_Data_Analytics/")
    corpus = combine_text_from_files(found_files)
    # Debugging log (first 500 chars).
    print("Final extracted text for chatbot processing:", corpus[:500])
    if corpus.strip():
        return generate_response(question, corpus)
    return "The folder does not contain valid PDF, TXT, CSV, DOCX, or IPYNB files. Please upload supported file types."
# Gradio interface: one textbox for the question, one for the answer,
# wired to chatbot_interface and launched as soon as the module runs.
question_box = gr.Textbox(
    label="Ask a question",
    placeholder="Type your question here...",
)
answer_box = gr.Textbox(label="Answer")
demo = gr.Interface(fn=chatbot_interface, inputs=question_box, outputs=answer_box)
demo.launch()
|