yasserrmd committed
Commit ffbac09 · verified · 1 Parent(s): 64cf881

Update extract_text_from_pdf.py

Files changed (1)
  1. extract_text_from_pdf.py +22 -7
extract_text_from_pdf.py CHANGED
@@ -28,8 +28,8 @@ class PDFTextExtractor:
             model_name (str): Name of the model to use for text processing.
         """
 
-        model_name="bartowski/Llama-3.2-1B-Instruct-GGUF"
-        filename = "Llama-3.2-1B-Instruct-Q5_K_S.gguf"
+        model_name="meta-llama/Llama-3.2-1B-Instruct"
+
         self.pdf_path = pdf_path
         self.output_path = output_path
         self.max_chars = 100000
@@ -38,17 +38,30 @@ class PDFTextExtractor:
 
         # Initialize model and tokenizer
         self.accelerator = Accelerator()
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=filename, torch_dtype=torch.bfloat16).to(self.device)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name, gguf_file=filename)
+        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16,use_safetensors=True,device_map=device)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
         self.model, self.tokenizer = self.accelerator.prepare(self.model, self.tokenizer)
+
 
         # System prompt for text processing
         self.system_prompt = """
         You are a world class text pre-processor, here is the raw data from a PDF, please parse and return it in a way that is crispy and usable to send to a podcast writer.
+
+        The raw data is messed up with new lines, Latex math and you will see fluff that we can remove completely. Basically take away any details that you think might be useless in a podcast author's transcript.
 
-        Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
-        Here is the text:
-        """
+        Remember, the podcast could be on any topic whatsoever so the issues listed above are not exhaustive
+
+        Please be smart with what you remove and be creative ok?
+
+        Remember DO NOT START SUMMARIZING THIS, YOU ARE ONLY CLEANING UP THE TEXT AND RE-WRITING WHEN NEEDED
+
+        Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.
+
+        PLEASE DO NOT ADD MARKDOWN FORMATTING, STOP ADDING SPECIAL CHARACTERS THAT MARKDOWN CAPATILISATION ETC LIKES
+
+        ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?
+        Here is the text:"""
+
     #@spaces.GPU
     def validate_pdf(self):
         """Check if the file exists and is a valid PDF."""
@@ -130,6 +143,8 @@ class PDFTextExtractor:
 
         processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
         return processed_text
+
+
     #@spaces.GPU
     def clean_and_save_text(self):
         """Extract, clean, and save processed text to a file."""