Shreyas094 committed on
Commit c0cb368 · verified · 1 Parent(s): 5e2a934

Update app.py

Files changed (1)
  1. app.py +105 -116
app.py CHANGED
@@ -3,130 +3,119 @@ import gradio as gr
  from PyPDF2 import PdfReader
  import requests
  from dotenv import load_dotenv
- import tiktoken
-
+ from transformers import AutoTokenizer
  # Load environment variables
  load_dotenv()
-
  # Get the Hugging Face API token
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
-
  # Initialize the tokenizer
- tokenizer = tiktoken.get_encoding("cl100k_base")
-
+ tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
  def count_tokens(text):
-     return len(tokenizer.encode(text))
-
- def summarize_text(text, instructions, agent_name):
-     print(f"{agent_name}: Starting summarization")
-     API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
-     headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
-
-     payload = {
-         "inputs": f"{instructions}\n\nText to summarize:\n{text}",
-         "parameters": {"max_length": 2000}
-     }
-
-     print(f"{agent_name}: Sending request to API")
-     response = requests.post(API_URL, headers=headers, json=payload)
-     print(f"{agent_name}: Received response from API")
-
-     # Extracting only the generated summary from the response
-     generated_text = response.json()[0]["generated_text"]
-
-     # Assuming the model returns the entire input followed by the summary
-     # Split the generated text by the delimiter "\n\n" and take the last part as the summary
-     summary = generated_text.split("\n\n")[-1]
-
-     return summary
-
- def process_pdf(pdf_file, chunk_instructions, window_instructions, final_instructions):
-     print("Starting PDF processing")
-     # Read PDF
-     reader = PdfReader(pdf_file)
-     text = ""
-     for page in reader.pages:
-         text += page.extract_text() + "\n\n"
-
-     print(f"Extracted {len(reader.pages)} pages from PDF")
-
-     # Chunk the text (simple splitting by pages for this example)
-     chunks = text.split("\n\n")
-     print(f"Split text into {len(chunks)} chunks")
-
-     # Agent 1: Summarize each chunk
-     agent1_summaries = []
-     for i, chunk in enumerate(chunks):
-         print(f"Agent 1: Processing chunk {i+1}/{len(chunks)}")
-         summary = summarize_text(chunk, chunk_instructions, "Agent 1")
-         agent1_summaries.append(summary)
-
-     print("Agent 1: Finished processing all chunks")
-
-     # Concatenate Agent 1 summaries
-     concatenated_summary = "\n\n".join(agent1_summaries)
-     print(f"Concatenated Agent 1 summaries (length: {len(concatenated_summary)})")
-     print(f"Concatenated Summary: {concatenated_summary}")
-
-     # Sliding window approach
-     window_size = 3500 # in tokens
-     step_size = 3000 # overlap of 500 tokens
-     windows = []
-     current_position = 0
-
-     while current_position < len(concatenated_summary):
-         window_end = current_position
-         window_text = ""
-         while count_tokens(window_text) < window_size and window_end < len(concatenated_summary):
-             window_text += concatenated_summary[window_end]
-             window_end += 1
-         windows.append(window_text)
-         current_position += step_size
-
-     print(f"Created {len(windows)} windows for intermediate summarization")
-
-     # Intermediate summarization
-     intermediate_summaries = []
-     for i, window in enumerate(windows):
-         print(f"Processing window {i+1}/{len(windows)}")
-         summary = summarize_text(window, window_instructions, f"Window {i+1}")
-         intermediate_summaries.append(summary)
-
-     # Final summarization
-     final_input = "\n\n".join(intermediate_summaries)
-     print(f"Final input length: {count_tokens(final_input)} tokens")
-     final_summary = summarize_text(final_input, final_instructions, "Agent 2")
-     print("Agent 2: Finished final summarization")
-
-     return final_summary
-
- def pdf_summarizer(pdf_file, chunk_instructions, window_instructions, final_instructions):
-     if pdf_file is None:
-         print("Error: No PDF file uploaded")
-         return "Please upload a PDF file."
-
-     try:
-         print(f"Starting summarization process for file: {pdf_file.name}")
-         summary = process_pdf(pdf_file.name, chunk_instructions, window_instructions, final_instructions)
-         print("Summarization process completed successfully")
-         return summary
-     except Exception as e:
-         print(f"An error occurred: {str(e)}")
-         return f"An error occurred: {str(e)}"
-
+     return len(tokenizer.encode(text))
+ def summarize_text(text, instructions, agent_name, max_length, temperature, repetition_penalty, top_p):
+     print(f"{agent_name}: Starting summarization")
+     API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
+     headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
+     summaries = []
+     current_text = text
+     while len(current_text) > 0:
+         payload = {
+             "inputs": f"{instructions}\n\nText to summarize:\n{current_text}",
+             "parameters": {
+                 "max_length": max_length,
+                 "temperature": temperature,
+                 "repetition_penalty": repetition_penalty,
+                 "top_p": top_p
+             }
+         }
+         print(f"{agent_name}: Sending request to API")
+         response = requests.post(API_URL, headers=headers, json=payload)
+         print(f"{agent_name}: Received response from API")
+         generated_text = response.json()[0]["generated_text"]
+         # Split the generated text by the delimiter "\n\n" and take the last part as the summary
+         summary = generated_text.split("\n\n")[-1]
+         summaries.append(summary)
+         # Remove the summarized part from the current text
+         current_text = current_text[len(summary):].strip()
+     # Join all partial summaries into a final summary
+     final_summary = "\n\n".join(summaries)
+     return final_summary
+ def process_pdf(pdf_file, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p):
+     print("Starting PDF processing")
+     # Read PDF
+     reader = PdfReader(pdf_file)
+     text = ""
+     for page in reader.pages:
+         text += page.extract_text() + "\n\n"
+     print(f"Extracted {len(reader.pages)} pages from PDF")
+     # Chunk the text (simple splitting by pages for this example)
+     chunks = text.split("\n\n")
+     print(f"Split text into {len(chunks)} chunks")
+     # Agent 1: Summarize each chunk
+     agent1_summaries = []
+     for i, chunk in enumerate(chunks):
+         print(f"Agent 1: Processing chunk {i+1}/{len(chunks)}")
+         summary = summarize_text(chunk, chunk_instructions, "Agent 1", max_length, temperature, repetition_penalty, top_p)
+         agent1_summaries.append(summary)
+     print("Agent 1: Finished processing all chunks")
+     # Concatenate Agent 1 summaries
+     concatenated_summary = "\n\n".join(agent1_summaries)
+     print(f"Concatenated Agent 1 summaries (length: {count_tokens(concatenated_summary)} tokens)")
+     print(f"Concatenated Summary: {concatenated_summary}")
+     # Sliding window approach
+     window_size = 3500 # in tokens
+     step_size = 3000 # overlap of 500 tokens
+     windows = []
+     current_position = 0
+     while current_position < len(concatenated_summary):
+         window_end = current_position
+         window_text = ""
+         while count_tokens(window_text) < window_size and window_end < len(concatenated_summary):
+             window_text += concatenated_summary[window_end]
+             window_end += 1
+         windows.append(window_text)
+         current_position += step_size
+     print(f"Created {len(windows)} windows for intermediate summarization")
+     # Intermediate summarization
+     intermediate_summaries = []
+     for i, window in enumerate(windows):
+         print(f"Processing window {i+1}/{len(windows)}")
+         summary = summarize_text(window, window_instructions, f"Window {i+1}", max_length, temperature, repetition_penalty, top_p)
+         intermediate_summaries.append(summary)
+     # Final summarization
+     final_input = "\n\n".join(intermediate_summaries)
+     print(f"Final input length: {count_tokens(final_input)} tokens")
+     final_summary = summarize_text(final_input, final_instructions, "Agent 2", max_length, temperature, repetition_penalty, top_p)
+     print("Agent 2: Finished final summarization")
+     return final_summary
+ def pdf_summarizer(pdf_file, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p):
+     if pdf_file is None:
+         print("Error: No PDF file uploaded")
+         return "Please upload a PDF file."
+     try:
+         print(f"Starting summarization process for file: {pdf_file.name}")
+         summary = process_pdf(pdf_file.name, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p)
+         print("Summarization process completed successfully")
+         return summary
+     except Exception as e:
+         print(f"An error occurred: {str(e)}")
+         return f"An error occurred: {str(e)}"
  # Gradio interface
  iface = gr.Interface(
-     fn=pdf_summarizer,
-     inputs=[
-         gr.File(label="Upload PDF"),
-         gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
-         gr.Textbox(label="Window Instructions", placeholder="Instructions for summarizing each window"),
-         gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization")
-     ],
-     outputs=gr.Textbox(label="Summary"),
-     title="PDF Earnings Summary Generator",
-     description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
+     fn=pdf_summarizer,
+     inputs=[
+         gr.File(label="Upload PDF"),
+         gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
+         gr.Textbox(label="Window Instructions", placeholder="Instructions for summarizing each window"),
+         gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization"),
+         gr.Slider(label="Max Length", minimum=500, maximum=3500, step=100, value=2000),
+         gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.1, value=0.7),
+         gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.1, value=1.1),
+         gr.Slider(label="Top P", minimum=0.1, maximum=1.0, step=0.1, value=0.9)
+     ],
+     outputs=gr.Textbox(label="Summary"),
+     title="PDF Earnings Summary Generator",
+     description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
  )
-
  print("Launching Gradio interface")
  iface.launch()
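
Note on the sliding-window step in process_pdf: the committed loop grows each window one character at a time, calling count_tokens on every iteration, and advances current_position by step_size characters even though window_size and step_size are meant as token counts. Below is a minimal, illustrative sketch (not part of the commit) of the same 3500-token window with 500-token overlap built directly from token IDs; the helper name sliding_windows and the use of tokenizer.decode are assumptions for illustration only.

# Illustrative sketch only, not committed code: token-level sliding windows.
# Assumes the same tokenizer app.py loads; the Mistral repo is gated, so an
# authorized Hugging Face token may be needed to download it.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

def sliding_windows(text, window_size=3500, step_size=3000):
    # Encode once, slice the token IDs, and decode each slice back to text.
    ids = tokenizer.encode(text, add_special_tokens=False)
    windows = []
    for start in range(0, max(len(ids), 1), step_size):
        windows.append(tokenizer.decode(ids[start:start + window_size]))
        if start + window_size >= len(ids):
            break
    return windows

Decoding token slices can split words at window boundaries, so the output differs slightly from the character-based loop in the commit, but each window stays within the intended token budget without re-tokenizing on every character.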