yasserrmd committed
Commit 7db791c · verified · 1 Parent(s): 870f7a1

Update extract_text_from_pdf.py

Files changed (1)
  1. extract_text_from_pdf.py +7 -7
extract_text_from_pdf.py CHANGED
@@ -2,7 +2,7 @@
 
 import os
 import torch
-import spaces
+#import spaces
 from PyPDF2 import PdfReader
 from accelerate import Accelerator
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -18,7 +18,7 @@ class PDFTextExtractor:
     """
     A class to handle PDF text extraction and preprocessing for podcast preparation.
     """
-    @spaces.GPU
+    #@spaces.GPU
     def __init__(self, pdf_path, output_path):
         """
         Initialize the PDFTextExtractor with paths and model details.
@@ -49,7 +49,7 @@ class PDFTextExtractor:
         Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
         Here is the text:
         """
-    @spaces.GPU
+    #@spaces.GPU
     def validate_pdf(self):
         """Check if the file exists and is a valid PDF."""
         if not os.path.exists(self.pdf_path):
@@ -60,7 +60,7 @@
             return False
         return True
 
-    @spaces.GPU
+    #@spaces.GPU
     def extract_text(self):
         """Extract text from the PDF, limited by max_chars."""
         if not self.validate_pdf():
@@ -91,7 +91,7 @@
         final_text = '\n'.join(extracted_text)
         print(f"Extraction complete! Total characters: {len(final_text)}")
         return final_text
-    @spaces.GPU
+    #@spaces.GPU
     def create_word_bounded_chunks(self, text):
         """Split text into chunks around the target size."""
         words = text.split()
@@ -114,7 +114,7 @@
 
         return chunks
 
-    @spaces.GPU(duration=120)
+    #@spaces.GPU(duration=120)
     def process_chunk(self, text_chunk):
         """Process a text chunk with the model and return the cleaned text."""
         conversation = [
@@ -130,7 +130,7 @@
 
         processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
        return processed_text
-    @spaces.GPU
+    #@spaces.GPU
    def clean_and_save_text(self):
        """Extract, clean, and save processed text to a file."""
        extracted_text = self.extract_text()
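The commit disables the Hugging Face ZeroGPU hooks by commenting out the `spaces` import and every `@spaces.GPU` decorator, presumably so the script also runs where the `spaces` package is unavailable. A hedged alternative, not what this commit does, is a guarded import with a no-op stand-in so the decorators could stay in place and the same file would run both on a ZeroGPU Space and locally; the `_NoOpSpaces` name below is purely illustrative.

# Hypothetical alternative to commenting the decorators out: fall back to a
# no-op stand-in when the `spaces` package (Hugging Face ZeroGPU) is missing.
try:
    import spaces  # available on ZeroGPU Spaces
except ImportError:
    class _NoOpSpaces:
        """Illustrative stand-in supporting @spaces.GPU and @spaces.GPU(duration=...)."""
        def GPU(self, func=None, **kwargs):
            if callable(func):      # bare usage: @spaces.GPU
                return func
            return lambda f: f      # parameterized usage: @spaces.GPU(duration=120)
    spaces = _NoOpSpaces()

# With this guard in place, the original decorators could remain unchanged, e.g.:
#     @spaces.GPU(duration=120)
#     def process_chunk(self, text_chunk): ...

With the decorators simply commented out, as in this commit, no fallback is needed; the sketch is only relevant if GPU scheduling on Spaces should be kept as an option.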