Sentinel-AI-Beta-Test

Building

App Files Community

Shreyas094 committed on Jul 1, 2024

Commit

c0cb368

verified ·

1 Parent(s): 5e2a934

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -116

app.py CHANGED Viewed

@@ -3,130 +3,119 @@ import gradio as gr
 from PyPDF2 import PdfReader
 import requests
 from dotenv import load_dotenv
-import tiktoken
 # Load environment variables
 load_dotenv()
 # Get the Hugging Face API token
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 # Initialize the tokenizer
-tokenizer = tiktoken.get_encoding("cl100k_base")
 def count_tokens(text):
-    return len(tokenizer.encode(text))
-def summarize_text(text, instructions, agent_name):
-    print(f"{agent_name}: Starting summarization")
-    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
-    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
-    payload = {
-        "inputs": f"{instructions}\n\nText to summarize:\n{text}",
-        "parameters": {"max_length": 2000}
-    }
-    print(f"{agent_name}: Sending request to API")
-    response = requests.post(API_URL, headers=headers, json=payload)
-    print(f"{agent_name}: Received response from API")
-    # Extracting only the generated summary from the response
-    generated_text = response.json()[0]["generated_text"]
-    # Assuming the model returns the entire input followed by the summary
-    # Split the generated text by the delimiter "\n\n" and take the last part as the summary
-    summary = generated_text.split("\n\n")[-1]
-    return summary
-def process_pdf(pdf_file, chunk_instructions, window_instructions, final_instructions):
-    print("Starting PDF processing")
-    # Read PDF
-    reader = PdfReader(pdf_file)
-    text = ""
-    for page in reader.pages:
-        text += page.extract_text() + "\n\n"
-    print(f"Extracted {len(reader.pages)} pages from PDF")
-    # Chunk the text (simple splitting by pages for this example)
-    chunks = text.split("\n\n")
-    print(f"Split text into {len(chunks)} chunks")
-    # Agent 1: Summarize each chunk
-    agent1_summaries = []
-    for i, chunk in enumerate(chunks):
-        print(f"Agent 1: Processing chunk {i+1}/{len(chunks)}")
-        summary = summarize_text(chunk, chunk_instructions, "Agent 1")
-        agent1_summaries.append(summary)
-    print("Agent 1: Finished processing all chunks")
-    # Concatenate Agent 1 summaries
-    concatenated_summary = "\n\n".join(agent1_summaries)
-    print(f"Concatenated Agent 1 summaries (length: {len(concatenated_summary)})")
-    print(f"Concatenated Summary: {concatenated_summary}")
-    # Sliding window approach
-    window_size = 3500  # in tokens
-    step_size = 3000  # overlap of 500 tokens
-    windows = []
-    current_position = 0
-    while current_position < len(concatenated_summary):
-        window_end = current_position
-        window_text = ""
-        while count_tokens(window_text) < window_size and window_end < len(concatenated_summary):
-            window_text += concatenated_summary[window_end]
-            window_end += 1
-        windows.append(window_text)
-        current_position += step_size
-    print(f"Created {len(windows)} windows for intermediate summarization")
-    # Intermediate summarization
-    intermediate_summaries = []
-    for i, window in enumerate(windows):
-        print(f"Processing window {i+1}/{len(windows)}")
-        summary = summarize_text(window, window_instructions, f"Window {i+1}")
-        intermediate_summaries.append(summary)
-    # Final summarization
-    final_input = "\n\n".join(intermediate_summaries)
-    print(f"Final input length: {count_tokens(final_input)} tokens")
-    final_summary = summarize_text(final_input, final_instructions, "Agent 2")
-    print("Agent 2: Finished final summarization")
-    return final_summary
-def pdf_summarizer(pdf_file, chunk_instructions, window_instructions, final_instructions):
-    if pdf_file is None:
-        print("Error: No PDF file uploaded")
-        return "Please upload a PDF file."
-    try:
-        print(f"Starting summarization process for file: {pdf_file.name}")
-        summary = process_pdf(pdf_file.name, chunk_instructions, window_instructions, final_instructions)
-        print("Summarization process completed successfully")
-        return summary
-    except Exception as e:
-        print(f"An error occurred: {str(e)}")
-        return f"An error occurred: {str(e)}"
 # Gradio interface
 iface = gr.Interface(
-    fn=pdf_summarizer,
-    inputs=[
-        gr.File(label="Upload PDF"),
-        gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
-        gr.Textbox(label="Window Instructions", placeholder="Instructions for summarizing each window"),
-        gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization")
-    ],
-    outputs=gr.Textbox(label="Summary"),
-    title="PDF Earnings Summary Generator",
-    description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
 )
 print("Launching Gradio interface")
 iface.launch()

 from PyPDF2 import PdfReader
 import requests
 from dotenv import load_dotenv
+from transformers import AutoTokenizer
 # Load environment variables
 load_dotenv()
 # Get the Hugging Face API token (os.getenv returns None when the variable is unset)
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 # Initialize the tokenizer that count_tokens uses to measure text length in tokens.
 # NOTE(review): this runs at import time and is called without an explicit token;
 # confirm the model repository can be fetched without authentication.
+tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
 def count_tokens(text):
+   """Return the number of token ids the module-level tokenizer produces for `text`."""
+   token_ids = tokenizer.encode(text)
+   return len(token_ids)
+def summarize_text(text, instructions, agent_name, max_length, temperature, repetition_penalty, top_p):
+   """Summarize `text` with a single request to the hosted Mistral-7B-Instruct model.
+
+   Args:
+       text: Text to summarize. Blank input returns "" without calling the API.
+       instructions: Prompt placed ahead of the text in the request.
+       agent_name: Label used as a prefix in progress and error messages.
+       max_length, temperature, repetition_penalty, top_p: Generation settings
+           forwarded unchanged in the request's "parameters" object.
+
+   Returns:
+       The last blank-line-separated paragraph of the model's generated text.
+
+   Raises:
+       requests.RequestException: If the request cannot be sent or times out.
+       RuntimeError: If the API answers with an error status or unexpected JSON.
+   """
+   print(f"{agent_name}: Starting summarization")
+   # Nothing to summarize: skip the network round trip entirely.
+   if not text or not text.strip():
+       return ""
+   API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
+   headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
+   # NOTE(review): text-generation endpoints usually take "max_new_tokens" rather than
+   # "max_length"; the key is kept as-is here -- confirm against the Inference API docs.
+   payload = {
+       "inputs": f"{instructions}\n\nText to summarize:\n{text}",
+       "parameters": {
+           "max_length": max_length,
+           "temperature": temperature,
+           "repetition_penalty": repetition_penalty,
+           "top_p": top_p
+       }
+   }
+   print(f"{agent_name}: Sending request to API")
+   # A single request is sent per call: the response gives no indication of how much of
+   # the input it covered, so there is nothing meaningful to trim and re-submit.
+   # The timeout keeps a stalled endpoint from hanging the UI callback forever.
+   response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
+   print(f"{agent_name}: Received response from API")
+   if not response.ok:
+       raise RuntimeError(f"{agent_name}: API request failed with status {response.status_code}: {response.text}")
+   try:
+       result = response.json()
+   except ValueError as exc:
+       raise RuntimeError(f"{agent_name}: API returned a non-JSON response") from exc
+   # Error payloads arrive as a dict (e.g. {"error": ...}); fail with a readable message
+   # instead of an opaque KeyError/IndexError.
+   if not (isinstance(result, list) and result and isinstance(result[0], dict) and "generated_text" in result[0]):
+       raise RuntimeError(f"{agent_name}: Unexpected API response: {result!r}")
+   generated_text = result[0]["generated_text"]
+   # The generated text may echo the prompt; split on the delimiter "\n\n" and keep the
+   # last part as the summary.
+   return generated_text.split("\n\n")[-1]
+def _split_into_token_windows(text, window_size, step_size):
+   """Split `text` into windows of at most `window_size` tokens, advancing `step_size` tokens each time."""
+   # Tokenize once and slice the id list; special tokens are left out so that the
+   # window arithmetic counts text tokens only.
+   token_ids = tokenizer.encode(text, add_special_tokens=False)
+   windows = []
+   for start in range(0, len(token_ids), step_size):
+       windows.append(tokenizer.decode(token_ids[start:start + window_size], skip_special_tokens=True))
+       # This window already reaches the end of the text; another step would only
+       # repeat its tail.
+       if start + window_size >= len(token_ids):
+           break
+   return windows
+def process_pdf(pdf_file, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p):
+   """Summarize a PDF in three stages: per chunk, per token window, then once overall.
+
+   Args:
+       pdf_file: Path or file object accepted by PdfReader.
+       chunk_instructions: Prompt for summarizing each text chunk (Agent 1).
+       window_instructions: Prompt for summarizing each window of chunk summaries.
+       final_instructions: Prompt for the final summary (Agent 2).
+       max_length, temperature, repetition_penalty, top_p: Generation settings
+           passed unchanged to every summarize_text call.
+
+   Returns:
+       The final summary string returned by summarize_text.
+   """
+   print("Starting PDF processing")
+   # Read PDF; guard against a page that yields no text.
+   reader = PdfReader(pdf_file)
+   page_texts = [page.extract_text() or "" for page in reader.pages]
+   text = "\n\n".join(page_texts)
+   print(f"Extracted {len(reader.pages)} pages from PDF")
+   # Chunk the text on blank lines (page boundaries and blank lines inside a page),
+   # dropping blank chunks so no request is spent on them.
+   chunks = [chunk for chunk in text.split("\n\n") if chunk.strip()]
+   print(f"Split text into {len(chunks)} chunks")
+   generation_args = (max_length, temperature, repetition_penalty, top_p)
+   # Agent 1: Summarize each chunk
+   agent1_summaries = []
+   for i, chunk in enumerate(chunks):
+       print(f"Agent 1: Processing chunk {i+1}/{len(chunks)}")
+       agent1_summaries.append(summarize_text(chunk, chunk_instructions, "Agent 1", *generation_args))
+   print("Agent 1: Finished processing all chunks")
+   # Concatenate Agent 1 summaries
+   concatenated_summary = "\n\n".join(agent1_summaries)
+   print(f"Concatenated Agent 1 summaries (length: {count_tokens(concatenated_summary)} tokens)")
+   print(f"Concatenated Summary: {concatenated_summary}")
+   # Sliding window approach, measured consistently in tokens
+   window_size = 3500  # in tokens
+   step_size = 3000  # in tokens, so consecutive windows overlap by 500 tokens
+   windows = _split_into_token_windows(concatenated_summary, window_size, step_size)
+   print(f"Created {len(windows)} windows for intermediate summarization")
+   # Intermediate summarization
+   intermediate_summaries = []
+   for i, window in enumerate(windows):
+       print(f"Processing window {i+1}/{len(windows)}")
+       intermediate_summaries.append(summarize_text(window, window_instructions, f"Window {i+1}", *generation_args))
+   # Final summarization
+   final_input = "\n\n".join(intermediate_summaries)
+   print(f"Final input length: {count_tokens(final_input)} tokens")
+   final_summary = summarize_text(final_input, final_instructions, "Agent 2", *generation_args)
+   print("Agent 2: Finished final summarization")
+   return final_summary
+def pdf_summarizer(pdf_file, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p):
+   """Gradio callback: summarize the uploaded PDF and return text for the output box.
+
+   Returns a prompt to upload a file when `pdf_file` is None, the summary from
+   process_pdf on success, or an "An error occurred: ..." message if anything raises.
+   """
+   if pdf_file is None:
+       print("Error: No PDF file uploaded")
+       return "Please upload a PDF file."
+   generation_args = (max_length, temperature, repetition_penalty, top_p)
+   try:
+       print(f"Starting summarization process for file: {pdf_file.name}")
+       result = process_pdf(pdf_file.name, chunk_instructions, window_instructions, final_instructions, *generation_args)
+       print("Summarization process completed successfully")
+       return result
+   except Exception as e:
+       # UI boundary: report the failure in the output box instead of raising.
+       error_message = f"An error occurred: {str(e)}"
+       print(error_message)
+       return error_message
 # Gradio interface: one PDF upload, three instruction prompts and four generation
 # sliders, listed in the same order as pdf_summarizer's parameters.
 iface = gr.Interface(
+   fn=pdf_summarizer,
+   inputs=[
+       gr.File(label="Upload PDF"),
        # Prompts for the per-chunk, per-window and final summarization stages
+       gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
+       gr.Textbox(label="Window Instructions", placeholder="Instructions for summarizing each window"),
+       gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization"),
        # Generation settings handed on to every summarization request
+       gr.Slider(label="Max Length", minimum=500, maximum=3500, step=100, value=2000),
+       gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.1, value=0.7),
+       gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.1, value=1.1),
+       gr.Slider(label="Top P", minimum=0.1, maximum=1.0, step=0.1, value=0.9)
+   ],
+   outputs=gr.Textbox(label="Summary"),
+   title="PDF Earnings Summary Generator",
+   description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
 )
 print("Launching Gradio interface")
 # Launches the app as a module-level statement (no __main__ guard is visible here)
 iface.launch()