File size: 5,905 Bytes
a7932b8
b432dd9
 
 
 
c0cb368
b432dd9
 
 
 
8f71aa4
c0cb368
8f71aa4
c0cb368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b432dd9
10e4113
c0cb368
 
 
 
 
 
 
 
 
 
 
 
 
 
10e4113
d06c0f4
302823e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import gradio as gr
from PyPDF2 import PdfReader
import requests
from dotenv import load_dotenv
from transformers import AutoTokenizer
# Load environment variables via python-dotenv before anything reads them
# (presumably from a local .env file -- confirm the deployment setup).
load_dotenv()
# Hugging Face API token used for the "Authorization: Bearer ..." header in
# summarize_text. os.getenv returns None when the variable is not set.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
# Tokenizer for the same model id that summarize_text calls remotely; it is
# only used locally (by count_tokens) to measure text length in tokens.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
def count_tokens(text):
    """Return the number of token ids the module-level tokenizer yields for *text*."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)
def summarize_text(text, instructions, agent_name, max_length, temperature, repetition_penalty, top_p):
    """Summarize *text* through the hosted Mistral-7B-Instruct inference endpoint.

    The remaining text is sent together with *instructions*; the last
    "\\n\\n"-separated part of the generated text is taken as a partial
    summary, and as many characters as that summary is long are dropped from
    the front of the remaining text. This repeats until no text is left, and
    the partial summaries are joined with blank lines.

    Args:
        text: Text to summarize. Empty or whitespace-only text yields "".
        instructions: Prompt placed in front of the text.
        agent_name: Label used as a prefix in progress/log output.
        max_length: Forwarded as the "max_length" generation parameter.
            NOTE(review): text-generation endpoints commonly expect
            "max_new_tokens" instead -- confirm against the API in use.
        temperature: Forwarded sampling temperature.
        repetition_penalty: Forwarded repetition penalty.
        top_p: Forwarded nucleus-sampling threshold.

    Returns:
        The partial summaries joined by "\\n\\n" (possibly an empty string).

    Raises:
        RuntimeError: If the API answers with an error status or with a
            payload that does not contain generated text.
        requests.RequestException: On network failures or timeout.
    """
    print(f"{agent_name}: Starting summarization")
    # Nothing to summarize: skip the network round-trip entirely.
    if not text or not text.strip():
        return ""

    API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
    # Generation settings are identical for every request, so build them once.
    parameters = {
        "max_length": max_length,
        "temperature": temperature,
        "repetition_penalty": repetition_penalty,
        "top_p": top_p,
    }
    # Without a timeout a stalled connection would block the UI forever.
    request_timeout = 120

    summaries = []
    current_text = text
    while current_text:
        payload = {
            "inputs": f"{instructions}\n\nText to summarize:\n{current_text}",
            "parameters": parameters,
        }
        print(f"{agent_name}: Sending request to API")
        response = requests.post(API_URL, headers=headers, json=payload, timeout=request_timeout)
        print(f"{agent_name}: Received response from API")
        if not response.ok:
            # Surface the server's message instead of failing later on an
            # unexpected JSON shape.
            raise RuntimeError(
                f"{agent_name}: API request failed ({response.status_code}): {response.text}"
            )
        result = response.json()
        try:
            generated_text = result[0]["generated_text"]
        except (KeyError, IndexError, TypeError) as err:
            raise RuntimeError(f"{agent_name}: unexpected API response: {result!r}") from err

        # Split the generated text by the delimiter "\n\n" and take the last part as the summary
        summary = generated_text.split("\n\n")[-1]
        if not summary:
            # An empty summary would consume zero characters below and the
            # loop would never terminate.
            break
        summaries.append(summary)
        # Remove the summarized part from the current text
        current_text = current_text[len(summary):].strip()

    # Join all partial summaries into a final summary
    return "\n\n".join(summaries)
def _token_windows(text, window_size, step_size):
    """Split *text* into overlapping windows whose size and stride are in tokens.

    The text is tokenized once with the module-level tokenizer; each window is
    a slice of at most *window_size* token ids decoded back to text, and
    consecutive windows start *step_size* tokens apart.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    windows = []
    for start in range(0, len(token_ids), step_size):
        window_ids = token_ids[start:start + window_size]
        windows.append(tokenizer.decode(window_ids, skip_special_tokens=True))
        if start + window_size >= len(token_ids):
            # This window already reaches the end of the text; any further
            # window would only repeat its tail.
            break
    return windows


def process_pdf(pdf_file, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p):
    """Run the multi-stage summarization pipeline over one PDF.

    Stages:
      1. Extract the text of every page and split it on blank lines into chunks.
      2. "Agent 1" summarizes each non-empty chunk with *chunk_instructions*.
      3. The chunk summaries are concatenated and cut into overlapping
         token windows, each summarized with *window_instructions*.
      4. "Agent 2" summarizes the joined window summaries with
         *final_instructions*.

    Args:
        pdf_file: Whatever PdfReader accepts (this file passes a path).
        chunk_instructions: Prompt for the per-chunk summaries.
        window_instructions: Prompt for the per-window summaries.
        final_instructions: Prompt for the final summary.
        max_length, temperature, repetition_penalty, top_p: Generation
            settings forwarded unchanged to every summarize_text call.

    Returns:
        The final summary string.
    """
    print("Starting PDF processing")
    # Read PDF
    reader = PdfReader(pdf_file)
    # Defensive "or ''": a page without extractable text must not break the
    # concatenation.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    text = "\n\n".join(page_texts)
    print(f"Extracted {len(page_texts)} pages from PDF")

    # Chunk the text on blank lines. Pages are joined with "\n\n", so page
    # boundaries always split; blank lines inside a page split as well.
    # Whitespace-only chunks carry nothing to summarize and are dropped.
    chunks = [chunk for chunk in text.split("\n\n") if chunk.strip()]
    print(f"Split text into {len(chunks)} chunks")

    # Agent 1: Summarize each chunk
    agent1_summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Agent 1: Processing chunk {i+1}/{len(chunks)}")
        summary = summarize_text(chunk, chunk_instructions, "Agent 1", max_length, temperature, repetition_penalty, top_p)
        agent1_summaries.append(summary)
    print("Agent 1: Finished processing all chunks")

    # Concatenate Agent 1 summaries
    concatenated_summary = "\n\n".join(agent1_summaries)
    print(f"Concatenated Agent 1 summaries (length: {count_tokens(concatenated_summary)} tokens)")
    print(f"Concatenated Summary: {concatenated_summary}")

    # Sliding window approach. Both values are in tokens, so the text is
    # tokenized once and sliced by token index rather than by character.
    window_size = 3500  # in tokens
    step_size = 3000  # overlap of 500 tokens
    windows = _token_windows(concatenated_summary, window_size, step_size)
    print(f"Created {len(windows)} windows for intermediate summarization")

    # Intermediate summarization
    intermediate_summaries = []
    for i, window in enumerate(windows):
        print(f"Processing window {i+1}/{len(windows)}")
        summary = summarize_text(window, window_instructions, f"Window {i+1}", max_length, temperature, repetition_penalty, top_p)
        intermediate_summaries.append(summary)

    # Final summarization
    final_input = "\n\n".join(intermediate_summaries)
    print(f"Final input length: {count_tokens(final_input)} tokens")
    final_summary = summarize_text(final_input, final_instructions, "Agent 2", max_length, temperature, repetition_penalty, top_p)
    print("Agent 2: Finished final summarization")
    return final_summary
def pdf_summarizer(pdf_file, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p):
    """Gradio callback: summarize the uploaded PDF and return text for the UI.

    Args:
        pdf_file: The upload from the File component, or None when nothing
            was uploaded. Either an object exposing the path as ``.name`` or
            a plain path string is accepted.
        chunk_instructions, window_instructions, final_instructions: Prompts
            forwarded to process_pdf.
        max_length, temperature, repetition_penalty, top_p: Generation
            settings forwarded to process_pdf.

    Returns:
        The summary, a prompt to upload a file, or an error message. This
        function never raises; failures are reported as text so the UI can
        display them.
    """
    if pdf_file is None:
        print("Error: No PDF file uploaded")
        return "Please upload a PDF file."

    # Depending on the File component's configuration the upload arrives as
    # a temp-file wrapper (path in `.name`) or directly as a path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    try:
        print(f"Starting summarization process for file: {pdf_path}")
        summary = process_pdf(pdf_path, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p)
        print("Summarization process completed successfully")
        return summary
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing the app.
        print(f"An error occurred: {str(e)}")
        return f"An error occurred: {str(e)}"
# Gradio interface
# Input widgets, in the same order as pdf_summarizer's positional parameters.
_input_components = [
    gr.File(label="Upload PDF"),
    gr.Textbox(
        placeholder="Instructions for summarizing each chunk",
        label="Chunk Instructions",
    ),
    gr.Textbox(
        placeholder="Instructions for summarizing each window",
        label="Window Instructions",
    ),
    gr.Textbox(
        placeholder="Instructions for final summarization",
        label="Final Instructions",
    ),
    gr.Slider(minimum=500, maximum=3500, value=2000, step=100, label="Max Length"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1, label="Repetition Penalty"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="Top P"),
]

# Single-function UI: one callback, the widgets above, one text output.
iface = gr.Interface(
    title="PDF Earnings Summary Generator",
    description="Upload a PDF of an earnings summary or transcript to generate a concise summary.",
    fn=pdf_summarizer,
    inputs=_input_components,
    outputs=gr.Textbox(label="Summary"),
)

print("Launching Gradio interface")
iface.launch()