import os
import re

import gradio as gr
import openai
import pandas as pd
import PyPDF2
import requests
from docx import Document
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

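# Dependencies, inferred from the imports above (this list is an assumption,
# not taken from the Space's requirements.txt; openai is pinned below 1.0
# because the legacy ChatCompletion/api_key interface is used throughout):
#   pip install gradio "openai<1" PyPDF2 pandas python-docx requests \
#       langchain langchain-community faiss-cpu
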
# Note: defined but not currently called anywhere in this app.
def detect_language(text):
    """Detects the language of the input text using OpenAI."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Detect the language of this text."},
            {"role": "user", "content": text},
        ],
    )
    return response["choices"][0]["message"]["content"].strip()

# Set up the OpenAI API key (replace with your key).
# This script uses the legacy openai<1.0 ChatCompletion interface.
openai.api_key = "YOUR_OPENAI_API_KEY"

def extract_files_from_folder(folder_path):
    """Scans a folder and its subfolders for PDF, TXT, CSV, and DOCX files."""
    extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": []}
    print(f"Scanning folder: {folder_path}")
    for root, subdirs, files in os.walk(folder_path):
        print(f"Checking folder: {root}")  # Debugging log for subfolders
        for file_name in files:
            file_path = os.path.join(root, file_name)
            if file_name.endswith(".pdf"):
                extracted_files["pdf"].append(file_path)
            elif file_name.endswith(".txt"):
                extracted_files["txt"].append(file_path)
            elif file_name.endswith(".csv"):
                extracted_files["csv"].append(file_path)
            elif file_name.endswith(".docx"):
                extracted_files["docx"].append(file_path)
    print("Files found:", extracted_files)  # Debugging log
    return extracted_files

def extract_links_from_text(text):
    """Extracts links from text and fetches the content behind them."""
    links = re.findall(r'https?://\S+', text)
    extracted_content = ""
    for link in links:
        try:
            response = requests.get(link, timeout=5)
            if response.status_code == 200:
                extracted_content += f"\n[Extracted from {link}]\n" + response.text[:1000]  # Limit to first 1000 chars
        except requests.exceptions.RequestException:
            extracted_content += f"\n[Could not access {link}]\n"
    return extracted_content

def read_text_from_files(file_paths):
    """Reads text content from a list of files."""
    text = ""
    for file_path in file_paths:
        print(f"Reading text file: {file_path}")  # Debugging log
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            file_text = file.read()
            text += file_text + "\n"
            text += extract_links_from_text(file_text)  # Extract and add web content
    return text

def get_text_from_pdf(pdf_files):
    """Extracts text from a list of PDF files via PyPDF2."""
    text = ""
    for pdf_path in pdf_files:
        print(f"Reading PDF file: {pdf_path}")  # Debugging log
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
                else:
                    text += "[Could not extract text from this page]\n"
    return text

def get_text_from_csv(csv_files):
    """Renders each CSV file as a plain-text table via pandas."""
    text = ""
    for csv_path in csv_files:
        print(f"Reading CSV file: {csv_path}")  # Debugging log
        df = pd.read_csv(csv_path)
        text += df.to_string() + "\n"
    return text

def get_text_from_docx(docx_files):
    """Extracts paragraph text from a list of DOCX files via python-docx."""
    text = ""
    for docx_path in docx_files:
        print(f"Reading DOCX file: {docx_path}")  # Debugging log
        doc = Document(docx_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    return text

def create_vector_database(text):
    """Splits the text into overlapping chunks and indexes them in FAISS."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = splitter.split_text(text)
    embeddings = OpenAIEmbeddings()
    vector_db = FAISS.from_texts(texts, embeddings)
    return vector_db

def correct_exercises(text):
    """Uses OpenAI to correct and complete exercises found in the documents."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Analyze the text and complete or correct any incomplete exercises."},
            {"role": "user", "content": text},
        ],
    )
    return response["choices"][0]["message"]["content"].strip()

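# chatbot_interface() below calls get_answer(), but no definition survives in
# this file. The following is a minimal sketch, not the original implementation:
# it assumes retrieval via the FAISS vector store's similarity_search() and
# reuses the same legacy ChatCompletion API as the rest of the script.
def get_answer(question, vector_db, corrected_exercises):
    """Answers a question from the most relevant chunks plus corrected exercises."""
    # Retrieve the chunks most similar to the question (k=4 is an assumption).
    docs = vector_db.similarity_search(question, k=4)
    context = "\n\n".join(doc.page_content for doc in docs)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Answer the question using the provided context and corrected exercises."},
            {"role": "user", "content": f"Context:\n{context}\n\nCorrected exercises:\n{corrected_exercises}\n\nQuestion: {question}"},
        ],
    )
    return response["choices"][0]["message"]["content"].strip()
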
def chatbot_interface(question):
    """Builds the knowledge base from folder_path and answers the question."""
    folder_path = "/mnt/data/New_Data_Analytics/"
    extracted_files = extract_files_from_folder(folder_path)
    text = (
        get_text_from_pdf(extracted_files["pdf"])
        + read_text_from_files(extracted_files["txt"])
        + get_text_from_csv(extracted_files["csv"])
        + get_text_from_docx(extracted_files["docx"])
    )
    if not text:
        return "The folder does not contain valid PDF, TXT, CSV, or DOCX files. Please upload supported file types."
    corrected_exercises = correct_exercises(text)
    vector_db = create_vector_database(text)
    return get_answer(question, vector_db, corrected_exercises)

# Gradio interface
demo = gr.Interface(
    fn=chatbot_interface,
    inputs=gr.Textbox(label="Ask a question", placeholder="Type your question here..."),
    outputs=gr.Textbox(label="Answer"),
)

demo.launch()