Update extract_text_from_pdf.py
extract_text_from_pdf.py CHANGED (+4 -1)
@@ -2,6 +2,7 @@
 
 import os
 import torch
+import spaces
 from PyPDF2 import PdfReader
 from accelerate import Accelerator
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -9,6 +10,7 @@ from tqdm import tqdm
 import warnings
 
 
+
 warnings.filterwarnings('ignore')
 
 
@@ -16,7 +18,7 @@ class PDFTextExtractor:
     """
     A class to handle PDF text extraction and preprocessing for podcast preparation.
     """
-
+    @spaces.GPU
     def __init__(self, pdf_path, output_path, model_name="meta-llama/Llama-3.2-1B-Instruct"):
         """
         Initialize the PDFTextExtractor with paths and model details.
@@ -109,6 +111,7 @@ class PDFTextExtractor:
 
         return chunks
 
+    @spaces.GPU
     def process_chunk(self, text_chunk):
         """Process a text chunk with the model and return the cleaned text."""
         conversation = [
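
The commit imports the `spaces` package and adds the `@spaces.GPU` decorator to `__init__` and `process_chunk`, so a ZeroGPU device is requested only while those methods run. For context, here is a minimal sketch of how this ZeroGPU pattern is typically wired up. It assumes the Space runs on ZeroGPU hardware where the `spaces` package is available; the model name is taken from the diff above, while the `clean_chunk` function is illustrative and not part of extract_text_from_pdf.py.

# Minimal ZeroGPU sketch (assumptions noted above).
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model.to("cuda")  # under ZeroGPU, actual device placement is deferred until a GPU is attached

@spaces.GPU  # a GPU is attached only for the duration of each call
def clean_chunk(text_chunk: str) -> str:
    """Illustrative helper: run the model once over a chunk of extracted text."""
    inputs = tokenizer(text_chunk, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=512)
    return tokenizer.decode(output[0], skip_special_tokens=True)

If a decorated call needs more than the default allocation window, the decorator also accepts a duration argument, e.g. `@spaces.GPU(duration=120)`.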