Rafa1986 committed on
Commit
0302345
·
verified ·
1 Parent(s): 208be4a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -46
app.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
5
  import openai
6
  import docx
7
  import requests
 
8
  from docx import Document
9
  from langchain_community.embeddings import OpenAIEmbeddings
10
  from langchain_community.vectorstores import FAISS
@@ -26,14 +27,15 @@ def detect_language(text):
26
  openai.api_key = "YOUR_OPENAI_API_KEY"
27
 
28
  def extract_files_from_folder(folder_path):
29
- """Scans a folder and its subfolders for PDF, TXT, CSV, and DOCX files."""
30
- extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": []}
31
 
32
  print(f"Scanning folder: {folder_path}")
33
  for root, subdirs, files in os.walk(folder_path):
34
  print(f"Checking folder: {root}") # Debugging log for subfolders
35
  for file_name in files:
36
  file_path = os.path.join(root, file_name)
 
37
  if file_name.endswith(".pdf"):
38
  extracted_files["pdf"].append(file_path)
39
  elif file_name.endswith(".txt"):
@@ -42,26 +44,12 @@ def extract_files_from_folder(folder_path):
42
  extracted_files["csv"].append(file_path)
43
  elif file_name.endswith(".docx"):
44
  extracted_files["docx"].append(file_path)
 
 
45
 
46
  print("Files found:", extracted_files) # Debugging log
47
  return extracted_files
48
 
49
- def extract_links_from_text(text):
50
- """Extracts links from text files and fetches their content."""
51
- import re
52
- links = re.findall(r'https?://\S+', text)
53
- extracted_content = ""
54
-
55
- for link in links:
56
- try:
57
- response = requests.get(link, timeout=5)
58
- if response.status_code == 200:
59
- extracted_content += f"\n[Extracted from {link}]\n" + response.text[:1000] # Limit to first 1000 chars
60
- except requests.exceptions.RequestException:
61
- extracted_content += f"\n[Could not access {link}]\n"
62
-
63
- return extracted_content
64
-
65
  def read_text_from_files(file_paths):
66
  """Reads text content from a list of files."""
67
  text = ""
@@ -70,7 +58,7 @@ def read_text_from_files(file_paths):
70
  with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
71
  file_text = file.read()
72
  text += file_text + "\n"
73
- text += extract_links_from_text(file_text) # Extract and add web content
74
  return text
75
 
76
  def get_text_from_pdf(pdf_files):
@@ -83,8 +71,7 @@ def get_text_from_pdf(pdf_files):
83
  page_text = page.extract_text()
84
  if page_text:
85
  text += page_text + "\n"
86
- else:
87
- text += "[Could not extract text from this page]\n"
88
  return text
89
 
90
  def get_text_from_csv(csv_files):
@@ -93,6 +80,7 @@ def get_text_from_csv(csv_files):
93
  print(f"Reading CSV file: {csv_path}") # Debugging log
94
  df = pd.read_csv(csv_path)
95
  text += df.to_string() + "\n"
 
96
  return text
97
 
98
  def get_text_from_docx(docx_files):
@@ -102,38 +90,39 @@ def get_text_from_docx(docx_files):
102
  doc = Document(docx_path)
103
  for para in doc.paragraphs:
104
  text += para.text + "\n"
 
105
  return text
106
 
107
- def create_vector_database(text):
108
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
109
- texts = splitter.split_text(text)
110
- embeddings = OpenAIEmbeddings()
111
- vector_db = FAISS.from_texts(texts, embeddings)
112
- return vector_db
113
-
114
- def correct_exercises(text):
115
- """Uses OpenAI to correct and complete exercises found in the documents."""
116
- response = openai.ChatCompletion.create(
117
- model="gpt-3.5-turbo",
118
- messages=[
119
- {"role": "system", "content": "Analyze the text and complete or correct any incomplete exercises."},
120
- {"role": "user", "content": text}
121
- ]
122
- )
123
- return response["choices"][0]["message"]["content"].strip()
124
 
125
  def chatbot_interface(question):
126
- folder_path = "/mnt/data/New_Data_Analytics/"
127
  extracted_files = extract_files_from_folder(folder_path)
128
 
129
- text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"]) + get_text_from_docx(extracted_files["docx"])
 
 
 
 
 
 
 
 
130
 
131
- if not text:
132
- return "The folder does not contain valid PDF, TXT, CSV, or DOCX files. Please upload supported file types."
133
 
134
- corrected_exercises = correct_exercises(text)
135
- vector_db = create_vector_database(text)
136
- return get_answer(question, vector_db, corrected_exercises)
137
 
138
  # Gradio interface
139
  demo = gr.Interface(
@@ -142,4 +131,4 @@ demo = gr.Interface(
142
  outputs=gr.Textbox(label="Answer")
143
  )
144
 
145
- demo.launch()
 
5
  import openai
6
  import docx
7
  import requests
8
+ import json
9
  from docx import Document
10
  from langchain_community.embeddings import OpenAIEmbeddings
11
  from langchain_community.vectorstores import FAISS
 
27
  openai.api_key = "YOUR_OPENAI_API_KEY"
28
 
29
  def extract_files_from_folder(folder_path):
30
+ """Scans a folder and its subfolders for PDF, TXT, CSV, DOCX, and IPYNB files."""
31
+ extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": [], "ipynb": []}
32
 
33
  print(f"Scanning folder: {folder_path}")
34
  for root, subdirs, files in os.walk(folder_path):
35
  print(f"Checking folder: {root}") # Debugging log for subfolders
36
  for file_name in files:
37
  file_path = os.path.join(root, file_name)
38
+ print(f"Found file: {file_path}")
39
  if file_name.endswith(".pdf"):
40
  extracted_files["pdf"].append(file_path)
41
  elif file_name.endswith(".txt"):
 
44
  extracted_files["csv"].append(file_path)
45
  elif file_name.endswith(".docx"):
46
  extracted_files["docx"].append(file_path)
47
+ elif file_name.endswith(".ipynb"):
48
+ extracted_files["ipynb"].append(file_path)
49
 
50
  print("Files found:", extracted_files) # Debugging log
51
  return extracted_files
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def read_text_from_files(file_paths):
54
  """Reads text content from a list of files."""
55
  text = ""
 
58
  with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
59
  file_text = file.read()
60
  text += file_text + "\n"
61
+ print("Extracted text from TXT files:", text[:500]) # Debugging log (First 500 chars)
62
  return text
63
 
64
  def get_text_from_pdf(pdf_files):
 
71
  page_text = page.extract_text()
72
  if page_text:
73
  text += page_text + "\n"
74
+ print("Extracted text from PDF files:", text[:500]) # Debugging log (First 500 chars)
 
75
  return text
76
 
77
  def get_text_from_csv(csv_files):
 
80
  print(f"Reading CSV file: {csv_path}") # Debugging log
81
  df = pd.read_csv(csv_path)
82
  text += df.to_string() + "\n"
83
+ print("Extracted text from CSV files:", text[:500]) # Debugging log (First 500 chars)
84
  return text
85
 
86
  def get_text_from_docx(docx_files):
 
90
  doc = Document(docx_path)
91
  for para in doc.paragraphs:
92
  text += para.text + "\n"
93
+ print("Extracted text from DOCX files:", text[:500]) # Debugging log (First 500 chars)
94
  return text
95
 
96
def get_text_from_ipynb(ipynb_files):
    """Extract markdown-cell text from a list of Jupyter notebook files.

    Args:
        ipynb_files: Paths to ``.ipynb`` files (JSON in nbformat layout).

    Returns:
        str: Concatenated markdown text, one trailing newline per cell.
        Code cells and outputs are deliberately skipped.
    """
    text = ""
    for ipynb_path in ipynb_files:
        print(f"Reading IPYNB file: {ipynb_path}")  # Debugging log
        with open(ipynb_path, "r", encoding="utf-8") as file:
            notebook = json.load(file)
        for cell in notebook.get("cells", []):
            if cell.get("cell_type") == "markdown":
                source = cell.get("source", [])
                # nbformat allows "source" to be either a list of lines or a
                # single string; joining a string with "\n" would interleave a
                # newline between every character, so normalize both forms.
                if isinstance(source, str):
                    text += source + "\n"
                else:
                    text += "\n".join(source) + "\n"
    print("Extracted text from IPYNB files:", text[:500])  # Debugging log (First 500 chars)
    return text
 
 
 
 
 
 
107
 
108
def chatbot_interface(question):
    """Gradio callback: scan the data folder, pull text out of every
    supported file type, and report whether usable content was found.

    Args:
        question: The user's question from the Gradio textbox (currently
            unused beyond the placeholder response below).

    Returns:
        str: A status message for the Gradio output textbox.
    """
    folder_path = "New_Data_Analytics/"
    files_by_type = extract_files_from_folder(folder_path)

    # One (extractor, bucket-key) pair per supported format; concatenation
    # order matches the original pdf -> txt -> csv -> docx -> ipynb chain.
    extractors = (
        (get_text_from_pdf, "pdf"),
        (read_text_from_files, "txt"),
        (get_text_from_csv, "csv"),
        (get_text_from_docx, "docx"),
        (get_text_from_ipynb, "ipynb"),
    )
    text = "".join(extract(files_by_type[key]) for extract, key in extractors)

    print("Final extracted text for chatbot processing:", text[:500])  # Debugging log (First 500 chars)

    if not text.strip():
        return "The folder does not contain valid PDF, TXT, CSV, DOCX, or IPYNB files. Please upload supported file types."

    return "Files successfully read. Processing question..."
 
 
126
 
127
  # Gradio interface
128
  demo = gr.Interface(
 
131
  outputs=gr.Textbox(label="Answer")
132
  )
133
 
134
+ demo.launch()