Rafa1986 committed on
Commit
94f3898
·
verified ·
1 Parent(s): 8b5a642

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -3
app.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import PyPDF2
4
  import pandas as pd
5
  import openai
 
6
  from langchain_community.embeddings import OpenAIEmbeddings
7
  from langchain_community.vectorstores import FAISS
8
  from langchain_community.llms import OpenAI
@@ -59,7 +60,11 @@ def get_text_from_pdf(pdf_files):
59
  with open(pdf_path, "rb") as pdf_file:
60
  reader = PyPDF2.PdfReader(pdf_file)
61
  for page in reader.pages:
62
- text += page.extract_text() + "\n"
 
 
 
 
63
  return text
64
 
65
  def get_text_from_csv(csv_files):
@@ -70,6 +75,15 @@ def get_text_from_csv(csv_files):
70
  text += df.to_string() + "\n"
71
  return text
72
 
 
 
 
 
 
 
 
 
 
73
  def create_vector_database(text):
74
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
75
  texts = splitter.split_text(text)
@@ -110,7 +124,7 @@ def chatbot_interface(question):
110
  folder_path = "/mnt/data/Data Analitics/"
111
  extracted_files = extract_files_from_folder(folder_path)
112
 
113
- text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"])
114
 
115
  if not text:
116
  return "The folder does not contain valid PDF, TXT, CSV, or DOCX files. Please upload supported file types."
@@ -126,4 +140,4 @@ demo = gr.Interface(
126
  outputs=gr.Textbox(label="Answer")
127
  )
128
 
129
- demo.launch()
 
3
  import PyPDF2
4
  import pandas as pd
5
  import openai
6
+ import docx
7
  from langchain_community.embeddings import OpenAIEmbeddings
8
  from langchain_community.vectorstores import FAISS
9
  from langchain_community.llms import OpenAI
 
60
  with open(pdf_path, "rb") as pdf_file:
61
  reader = PyPDF2.PdfReader(pdf_file)
62
  for page in reader.pages:
63
+ page_text = page.extract_text()
64
+ if page_text:
65
+ text += page_text + "\n"
66
+ else:
67
+ text += "[Could not extract text from this page]\n"
68
  return text
69
 
70
  def get_text_from_csv(csv_files):
 
75
  text += df.to_string() + "\n"
76
  return text
77
 
78
+ def get_text_from_docx(docx_files):
79
+ text = ""
80
+ for docx_path in docx_files:
81
+ print(f"Reading DOCX file: {docx_path}") # Debugging log
82
+ doc = docx.Document(docx_path)
83
+ for para in doc.paragraphs:
84
+ text += para.text + "\n"
85
+ return text
86
+
87
  def create_vector_database(text):
88
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
89
  texts = splitter.split_text(text)
 
124
  folder_path = "/mnt/data/Data Analitics/"
125
  extracted_files = extract_files_from_folder(folder_path)
126
 
127
+ text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"]) + get_text_from_docx(extracted_files["docx"])
128
 
129
  if not text:
130
  return "The folder does not contain valid PDF, TXT, CSV, or DOCX files. Please upload supported file types."
 
140
  outputs=gr.Textbox(label="Answer")
141
  )
142
 
143
+ demo.launch()