yasserrmd committed on
Commit
ea4f634
·
verified ·
1 Parent(s): 65a0de7

Update extract_text_from_pdf.py

Browse files
Files changed (1) hide show
  1. extract_text_from_pdf.py +4 -3
extract_text_from_pdf.py CHANGED
@@ -49,7 +49,7 @@ class PDFTextExtractor:
49
  Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
50
  Here is the text:
51
  """
52
-
53
  def validate_pdf(self):
54
  """Check if the file exists and is a valid PDF."""
55
  if not os.path.exists(self.pdf_path):
@@ -60,6 +60,7 @@ class PDFTextExtractor:
60
  return False
61
  return True
62
 
 
63
  def extract_text(self):
64
  """Extract text from the PDF, limited by max_chars."""
65
  if not self.validate_pdf():
@@ -90,7 +91,7 @@ class PDFTextExtractor:
90
  final_text = '\n'.join(extracted_text)
91
  print(f"Extraction complete! Total characters: {len(final_text)}")
92
  return final_text
93
-
94
  def create_word_bounded_chunks(self, text):
95
  """Split text into chunks around the target size."""
96
  words = text.split()
@@ -129,7 +130,7 @@ class PDFTextExtractor:
129
 
130
  processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
131
  return processed_text
132
-
133
  def clean_and_save_text(self):
134
  """Extract, clean, and save processed text to a file."""
135
  extracted_text = self.extract_text()
 
49
  Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
50
  Here is the text:
51
  """
52
+ @spaces.GPU
53
  def validate_pdf(self):
54
  """Check if the file exists and is a valid PDF."""
55
  if not os.path.exists(self.pdf_path):
 
60
  return False
61
  return True
62
 
63
+ @spaces.GPU
64
  def extract_text(self):
65
  """Extract text from the PDF, limited by max_chars."""
66
  if not self.validate_pdf():
 
91
  final_text = '\n'.join(extracted_text)
92
  print(f"Extraction complete! Total characters: {len(final_text)}")
93
  return final_text
94
+ @spaces.GPU
95
  def create_word_bounded_chunks(self, text):
96
  """Split text into chunks around the target size."""
97
  words = text.split()
 
130
 
131
  processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
132
  return processed_text
133
+ @spaces.GPU
134
  def clean_and_save_text(self):
135
  """Extract, clean, and save processed text to a file."""
136
  extracted_text = self.extract_text()