yasserrmd committed on
Commit
2fd0e2b
·
verified ·
1 Parent(s): 534c98c

Update extract_text_from_pdf.py: import `spaces` and decorate `__init__` and `process_chunk` with `@spaces.GPU`

Browse files
Files changed (1) hide show
  1. extract_text_from_pdf.py +4 -1
extract_text_from_pdf.py CHANGED
@@ -2,6 +2,7 @@
2
 
3
  import os
4
  import torch
 
5
  from PyPDF2 import PdfReader
6
  from accelerate import Accelerator
7
  from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -9,6 +10,7 @@ from tqdm import tqdm
9
  import warnings
10
 
11
 
 
12
  warnings.filterwarnings('ignore')
13
 
14
 
@@ -16,7 +18,7 @@ class PDFTextExtractor:
16
  """
17
  A class to handle PDF text extraction and preprocessing for podcast preparation.
18
  """
19
-
20
  def __init__(self, pdf_path, output_path, model_name="meta-llama/Llama-3.2-1B-Instruct"):
21
  """
22
  Initialize the PDFTextExtractor with paths and model details.
@@ -109,6 +111,7 @@ class PDFTextExtractor:
109
 
110
  return chunks
111
 
 
112
  def process_chunk(self, text_chunk):
113
  """Process a text chunk with the model and return the cleaned text."""
114
  conversation = [
 
2
 
3
  import os
4
  import torch
5
+ import spaces
6
  from PyPDF2 import PdfReader
7
  from accelerate import Accelerator
8
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
10
  import warnings
11
 
12
 
13
+
14
  warnings.filterwarnings('ignore')
15
 
16
 
 
18
  """
19
  A class to handle PDF text extraction and preprocessing for podcast preparation.
20
  """
21
+ @spaces.GPU
22
  def __init__(self, pdf_path, output_path, model_name="meta-llama/Llama-3.2-1B-Instruct"):
23
  """
24
  Initialize the PDFTextExtractor with paths and model details.
 
111
 
112
  return chunks
113
 
114
+ @spaces.GPU
115
  def process_chunk(self, text_chunk):
116
  """Process a text chunk with the model and return the cleaned text."""
117
  conversation = [