yasserrmd committed (verified)
Commit 2a61b37 · 1 Parent(s): a11721e

Update extract_text_from_pdf.py

Files changed (1):
  extract_text_from_pdf.py (+25 −12)
extract_text_from_pdf.py CHANGED

@@ -12,7 +12,7 @@ import spaces
 
 warnings.filterwarnings('ignore')
 
-@spaces.GPU
+#@spaces.GPU
 class PDFTextExtractor:
     """
     A class to handle PDF text extraction and preprocessing for podcast preparation.
@@ -28,19 +28,21 @@ class PDFTextExtractor:
             model_name (str): Name of the model to use for text processing.
         """
 
-        model_name="meta-llama/Llama-3.2-1B-Instruct"
+        #model_name="meta-llama/Llama-3.2-1B-Instruct"
 
         self.pdf_path = pdf_path
         self.output_path = output_path
         self.max_chars = 100000
         self.chunk_size = 1000
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        #self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
         # Initialize model and tokenizer
-        self.accelerator = Accelerator()
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16,use_safetensors=True,device_map=self.device)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
-        self.model, self.tokenizer = self.accelerator.prepare(self.model, self.tokenizer)
+        # self.accelerator = Accelerator()
+        # self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16,use_safetensors=True,device_map=self.device)
+        # self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
+        # self.model, self.tokenizer = self.accelerator.prepare(self.model, self.tokenizer)
+        self.model_name="llama3-8b-8192"
 
 
         # System prompt for text processing
@@ -135,13 +137,24 @@ class PDFTextExtractor:
             {"role": "user", "content": text_chunk}
         ]
 
-        prompt = self.tokenizer.apply_chat_template(conversation, tokenize=False)
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+        # prompt = self.tokenizer.apply_chat_template(conversation, tokenize=False)
+        # inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+
+        # with torch.no_grad():
+        #     output = self.model.generate(**inputs, temperature=0.7, top_p=0.9, max_new_tokens=512)
+
+        # processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
+        client = Groq(
+            api_key=os.environ.get("GROQ_API_KEY"),
+        )
+
+        chat_completion = client.chat.completions.create(
+            messages=conversation,
+            model=self.model_name,
+        )
 
-        with torch.no_grad():
-            output = self.model.generate(**inputs, temperature=0.7, top_p=0.9, max_new_tokens=512)
+        processed_text = self.extract_tuple(chat_completion.choices[0].message.content)
 
-        processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
         return processed_text
 
 
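
Taken together, the change swaps the local inference stack for a hosted one: the @spaces.GPU decorator, the CUDA device selection, and the Transformers/Accelerate loading of meta-llama/Llama-3.2-1B-Instruct are commented out, and each text chunk is instead sent to Groq's chat completions endpoint with the llama3-8b-8192 model, reading the key from the GROQ_API_KEY environment variable. Below is a minimal, self-contained sketch of the new call path; the system prompt and chunk text are placeholders rather than the strings used in the repository, and it prints the raw reply instead of routing it through the file's extract_tuple helper.

    # Sketch of the Groq-based chunk processing this commit switches to.
    # Assumes the `groq` package is installed and GROQ_API_KEY is exported;
    # the prompt contents below are placeholders, not the repository's own.
    import os
    from groq import Groq

    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    conversation = [
        {"role": "system", "content": "Clean and rewrite this text for podcast preparation."},  # placeholder
        {"role": "user", "content": "Raw text extracted from one PDF chunk ..."},               # placeholder
    ]

    chat_completion = client.chat.completions.create(
        messages=conversation,
        model="llama3-8b-8192",
    )

    print(chat_completion.choices[0].message.content)

The committed code additionally passes the model reply through self.extract_tuple(...) before returning it; that helper is not part of this diff, so the sketch simply prints the completion text.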