Update extract_text_from_pdf.py

extract_text_from_pdf.py (CHANGED, +7 -7)
@@ -2,7 +2,7 @@
 
 import os
 import torch
-import spaces
+#import spaces
 from PyPDF2 import PdfReader
 from accelerate import Accelerator
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -18,7 +18,7 @@ class PDFTextExtractor:
     """
     A class to handle PDF text extraction and preprocessing for podcast preparation.
     """
-
+    #@spaces.GPU
     def __init__(self, pdf_path, output_path):
         """
         Initialize the PDFTextExtractor with paths and model details.
@@ -49,7 +49,7 @@ class PDFTextExtractor:
         Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
         Here is the text:
         """
-
+    #@spaces.GPU
     def validate_pdf(self):
         """Check if the file exists and is a valid PDF."""
         if not os.path.exists(self.pdf_path):
@@ -60,7 +60,7 @@ class PDFTextExtractor:
             return False
         return True
 
-
+    #@spaces.GPU
    def extract_text(self):
        """Extract text from the PDF, limited by max_chars."""
        if not self.validate_pdf():
@@ -91,7 +91,7 @@ class PDFTextExtractor:
         final_text = '\n'.join(extracted_text)
         print(f"Extraction complete! Total characters: {len(final_text)}")
         return final_text
-
+    #@spaces.GPU
     def create_word_bounded_chunks(self, text):
         """Split text into chunks around the target size."""
         words = text.split()
@@ -114,7 +114,7 @@ class PDFTextExtractor:
 
         return chunks
 
-
+    #@spaces.GPU(duration=120)
     def process_chunk(self, text_chunk):
         """Process a text chunk with the model and return the cleaned text."""
         conversation = [
@@ -130,7 +130,7 @@ class PDFTextExtractor:
 
         processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
         return processed_text
-
+    #@spaces.GPU
     def clean_and_save_text(self):
         """Extract, clean, and save processed text to a file."""
         extracted_text = self.extract_text()
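For context on the commented-out lines above: on Hugging Face ZeroGPU Spaces, the spaces package attaches a GPU only while a function decorated with @spaces.GPU is executing, and duration requests a longer allocation window than the default. Below is a minimal sketch of that pattern with the decorators enabled; the helper functions, model handling, and generation arguments are illustrative assumptions, not code from this file.

import spaces
import torch

@spaces.GPU  # GPU is attached only while this function runs
def check_gpu():
    # trivial probe; illustrative only
    return torch.cuda.is_available()

@spaces.GPU(duration=120)  # request up to ~120 s, e.g. for longer generations
def generate(model, tokenizer, prompt):
    # assumes model/tokenizer come from transformers, as in this file's imports
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens=512)
    return tokenizer.decode(output[0], skip_special_tokens=True)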
|