VishwaTechnologiesPvtLtd
new one
a2ff264
raw
history blame
1.74 kB
from .IDataReader import IDataReader
from PyPDF2 import PdfReader
from docx import Document
class DataReader(IDataReader):
    """Extract plain text from PDF, DOCX and TXT files.

    Implements ``IDataReader``. Every reader is best-effort: if anything goes
    wrong while opening or parsing the file, a diagnostic message is printed
    and an empty string is returned instead of an exception being raised.
    """

    def read_pdf(self, file_path: str) -> str:
        """
        Reads a PDF file and returns its text content.

        :param file_path: Path to the PDF file.
        :return: The text of every page that yields text, each page followed
            by a newline; an empty string if the file cannot be read.
        """
        try:
            with open(file_path, "rb") as f:
                reader = PdfReader(f)
                # Extract inside the ``with`` block: the reader pulls page
                # data from the still-open file handle.
                extracted = (page.extract_text() for page in reader.pages)
                # Pages with no extractable text (falsy result) are skipped.
                # A single join avoids repeated string concatenation.
                return "".join(
                    page_text + "\n" for page_text in extracted if page_text
                )
        except Exception as e:
            # Best-effort boundary: report the problem and fall back to "".
            print(f"Error reading PDF file: {e}")
            return ""

    def read_docx(self, file_path: str) -> str:
        """
        Reads a DOCX file and returns its text content.

        :param file_path: Path to the DOCX file.
        :return: The text of the document's paragraphs joined with newlines;
            an empty string if the file cannot be read.
        """
        try:
            doc = Document(file_path)
            # NOTE(review): only ``doc.paragraphs`` is read; text stored
            # elsewhere (e.g. in tables) is presumably not included - confirm.
            return "\n".join(para.text for para in doc.paragraphs)
        except Exception as e:
            # Best-effort boundary: report the problem and fall back to "".
            print(f"Error reading DOCX file: {e}")
            return ""

    def read_txt(self, file_path: str) -> str:
        """
        Reads a TXT file and returns its text content.

        The file is decoded as UTF-8. A leading byte-order mark, if present,
        is dropped so it does not leak into the returned text.

        :param file_path: Path to the TXT file.
        :return: The full decoded content of the file; an empty string if the
            file cannot be read or decoded.
        """
        try:
            # "utf-8-sig" decodes plain UTF-8 as well as UTF-8 with a BOM,
            # stripping the BOM instead of returning it as "\ufeff".
            with open(file_path, "r", encoding="utf-8-sig") as f:
                return f.read()
        except Exception as e:
            # Best-effort boundary: report the problem and fall back to "".
            print(f"Error reading TXT file: {e}")
            return ""