Rafa1986 committed on
Commit
51ac55a
·
verified ·
1 Parent(s): 5b9b221

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -21
app.py CHANGED
@@ -4,6 +4,7 @@ import PyPDF2
4
  import pandas as pd
5
  import openai
6
  import docx
 
7
  from docx import Document
8
  from langchain_community.embeddings import OpenAIEmbeddings
9
  from langchain_community.vectorstores import FAISS
@@ -45,13 +46,31 @@ def extract_files_from_folder(folder_path):
45
  print("Files found:", extracted_files) # Debugging log
46
  return extracted_files
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def read_text_from_files(file_paths):
49
  """Reads text content from a list of files."""
50
  text = ""
51
  for file_path in file_paths:
52
  print(f"Reading text file: {file_path}") # Debugging log
53
  with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
54
- text += file.read() + "\n"
 
 
55
  return text
56
 
57
  def get_text_from_pdf(pdf_files):
@@ -103,26 +122,8 @@ def correct_exercises(text):
103
  )
104
  return response["choices"][0]["message"]["content"].strip()
105
 
106
- def get_answer(question, vector_db, corrected_exercises):
107
- retriever = vector_db.as_retriever()
108
- docs = retriever.get_relevant_documents(question)
109
-
110
- if not docs:
111
- return "I could not find the answer in the documents. Do you want me to search external sources?"
112
-
113
- context = "\n".join([doc.page_content for doc in docs])
114
- language = detect_language(question)
115
- response = openai.ChatCompletion.create(
116
- model="gpt-3.5-turbo",
117
- messages=[
118
- {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions. Also, use the corrected exercises if relevant."},
119
- {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\nCorrected Exercises:\n" + corrected_exercises}
120
- ]
121
- )
122
- return response["choices"][0]["message"]["content"]
123
-
124
  def chatbot_interface(question):
125
- folder_path = "/mnt/data/Data Analitics/"
126
  extracted_files = extract_files_from_folder(folder_path)
127
 
128
  text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"]) + get_text_from_docx(extracted_files["docx"])
@@ -141,4 +142,4 @@ demo = gr.Interface(
141
  outputs=gr.Textbox(label="Answer")
142
  )
143
 
144
- demo.launch()
 
4
  import pandas as pd
5
  import openai
6
  import docx
7
+ import requests
8
  from docx import Document
9
  from langchain_community.embeddings import OpenAIEmbeddings
10
  from langchain_community.vectorstores import FAISS
 
46
  print("Files found:", extracted_files) # Debugging log
47
  return extracted_files
48
 
49
+ def extract_links_from_text(text):
50
+ """Extracts links from text files and fetches their content."""
51
+ import re
52
+ links = re.findall(r'https?://\S+', text)
53
+ extracted_content = ""
54
+
55
+ for link in links:
56
+ try:
57
+ response = requests.get(link, timeout=5)
58
+ if response.status_code == 200:
59
+ extracted_content += f"\n[Extracted from {link}]\n" + response.text[:1000] # Limit to first 1000 chars
60
+ except requests.exceptions.RequestException:
61
+ extracted_content += f"\n[Could not access {link}]\n"
62
+
63
+ return extracted_content
64
+
65
  def read_text_from_files(file_paths):
66
  """Reads text content from a list of files."""
67
  text = ""
68
  for file_path in file_paths:
69
  print(f"Reading text file: {file_path}") # Debugging log
70
  with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
71
+ file_text = file.read()
72
+ text += file_text + "\n"
73
+ text += extract_links_from_text(file_text) # Extract and add web content
74
  return text
75
 
76
  def get_text_from_pdf(pdf_files):
 
122
  )
123
  return response["choices"][0]["message"]["content"].strip()
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def chatbot_interface(question):
126
+ folder_path = "/mnt/data/New_Data_Analytics/"
127
  extracted_files = extract_files_from_folder(folder_path)
128
 
129
  text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"]) + get_text_from_docx(extracted_files["docx"])
 
142
  outputs=gr.Textbox(label="Answer")
143
  )
144
 
145
+ demo.launch()