Spaces:
Running
Running
Update extract_text_from_pdf.py
Browse files
- extract_text_from_pdf.py (+4 -3)
extract_text_from_pdf.py
CHANGED
@@ -49,7 +49,7 @@ class PDFTextExtractor:
|
|
49 |
Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
|
50 |
Here is the text:
|
51 |
"""
|
52 |
-
|
53 |
def validate_pdf(self):
|
54 |
"""Check if the file exists and is a valid PDF."""
|
55 |
if not os.path.exists(self.pdf_path):
|
@@ -60,6 +60,7 @@ class PDFTextExtractor:
|
|
60 |
return False
|
61 |
return True
|
62 |
|
|
|
63 |
def extract_text(self):
|
64 |
"""Extract text from the PDF, limited by max_chars."""
|
65 |
if not self.validate_pdf():
|
@@ -90,7 +91,7 @@ class PDFTextExtractor:
|
|
90 |
final_text = '\n'.join(extracted_text)
|
91 |
print(f"Extraction complete! Total characters: {len(final_text)}")
|
92 |
return final_text
|
93 |
-
|
94 |
def create_word_bounded_chunks(self, text):
|
95 |
"""Split text into chunks around the target size."""
|
96 |
words = text.split()
|
@@ -129,7 +130,7 @@ class PDFTextExtractor:
|
|
129 |
|
130 |
processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
|
131 |
return processed_text
|
132 |
-
|
133 |
def clean_and_save_text(self):
|
134 |
"""Extract, clean, and save processed text to a file."""
|
135 |
extracted_text = self.extract_text()
|
|
|
49 |
Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
|
50 |
Here is the text:
|
51 |
"""
|
52 |
+
@spaces.GPU
|
53 |
def validate_pdf(self):
|
54 |
"""Check if the file exists and is a valid PDF."""
|
55 |
if not os.path.exists(self.pdf_path):
|
|
|
60 |
return False
|
61 |
return True
|
62 |
|
63 |
+
@spaces.GPU
|
64 |
def extract_text(self):
|
65 |
"""Extract text from the PDF, limited by max_chars."""
|
66 |
if not self.validate_pdf():
|
|
|
91 |
final_text = '\n'.join(extracted_text)
|
92 |
print(f"Extraction complete! Total characters: {len(final_text)}")
|
93 |
return final_text
|
94 |
+
@spaces.GPU
|
95 |
def create_word_bounded_chunks(self, text):
|
96 |
"""Split text into chunks around the target size."""
|
97 |
words = text.split()
|
|
|
130 |
|
131 |
processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
|
132 |
return processed_text
|
133 |
+
@spaces.GPU
|
134 |
def clean_and_save_text(self):
|
135 |
"""Extract, clean, and save processed text to a file."""
|
136 |
extracted_text = self.extract_text()
|