File size: 1,744 Bytes
a2ff264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from .IDataReader import IDataReader
from PyPDF2 import PdfReader
from docx import Document

class DataReader(IDataReader):
    """
    Reads PDF, DOCX and plain-text files and returns their text content.

    Every reader is best-effort: any exception raised while opening or
    parsing the file is caught, reported with ``print``, and an empty
    string is returned instead of propagating the error.
    """

    def read_pdf(self, file_path: str) -> str:
        """
        Reads a PDF file and returns its text content.

        The text of each page is followed by a newline; pages for which
        ``extract_text()`` returns nothing are skipped.

        :param file_path: Path to the PDF file.
        :return: Text content of the PDF file, or an empty string if the
            file could not be read.
        """
        try:
            with open(file_path, "rb") as f:
                reader = PdfReader(f)
                # Extract while the file is still open: the reader pulls
                # page data from the underlying stream.
                page_texts = [page.extract_text() for page in reader.pages]
            # Single join instead of repeated string concatenation.
            return "".join(
                page_text + "\n" for page_text in page_texts if page_text
            )
        except Exception as e:
            print(f"Error reading PDF file: {e}")
            return ""

    def read_docx(self, file_path: str) -> str:
        """
        Reads a DOCX file and returns its text content.

        Only the document's paragraphs are read; their texts are joined
        with newlines.

        :param file_path: Path to the DOCX file.
        :return: Text content of the DOCX file, or an empty string if the
            file could not be read.
        """
        try:
            doc = Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        except Exception as e:
            print(f"Error reading DOCX file: {e}")
            return ""

    def read_txt(self, file_path: str) -> str:
        """
        Reads a TXT file and returns its text content.

        The file is decoded as UTF-8. A leading byte-order mark, if
        present, is stripped so it does not leak into the returned text.

        :param file_path: Path to the TXT file.
        :return: Text content of the TXT file, or an empty string if the
            file could not be read.
        """
        try:
            # "utf-8-sig" decodes plain UTF-8 identically but drops a
            # leading BOM, which "utf-8" would return as "\ufeff".
            with open(file_path, "r", encoding="utf-8-sig") as f:
                return f.read()
        except Exception as e:
            print(f"Error reading TXT file: {e}")
            return ""