Tulika2000 committed on
Commit
3e07c38
·
verified ·
1 Parent(s): 0c9d696

Upload summarization.py

Browse files
Files changed (1) hide show
  1. summarization.py +75 -0
summarization.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Summarization.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1XblbxoRxB4XOHixjGij789FPD9KjKdhi
8
+ """
9
+
10
# Dependencies (install them outside this module, e.g. via requirements.txt):
#   groq PyPDF2 gradio langchain langchain-community langchain-groq
# NOTE: the notebook export had IPython "!pip install ..." shell escapes here.
# They are not Python syntax, so a plain .py file fails with SyntaxError.
13
+
14
+ import os
15
+ import PyPDF2
16
+ import gradio as gr
17
+ from langchain_groq.chat_models import ChatGroq
18
+
19
# Pull the Groq credential from the process environment
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Fail fast at import time when the key is missing or empty
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY is not set. Add it in Hugging Face Secrets.")

# Shared chat model used for summarization (Mixtral-8x7B served by Groq)
llm = ChatGroq(model_name="mixtral-8x7b-32768")
28
+
29
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, one newline after each page.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts as its source.

    Returns:
        The concatenated page texts. Pages for which ``extract_text()``
        yields nothing (``None`` or an empty string) are skipped.
    """
    pages = PyPDF2.PdfReader(pdf_file).pages
    extracted = (page.extract_text() for page in pages)
    return "".join(f"{chunk}\n" for chunk in extracted if chunk)
38
+
39
def summarize_text(text, max_chars=10000):
    """Summarize a document with the module-level chat model ``llm``.

    Args:
        text: Document text to summarize.
        max_chars: Number of leading characters of ``text`` included in the
            prompt. Defaults to 10000, the limit previously hard-coded here.

    Returns:
        The model's reply (the summary) as returned in the message content.
    """
    prompt = f"Summarize the following document:\n\n{text[:max_chars]}"
    # ``predict`` is deprecated in LangChain; ``invoke`` is the supported
    # entry point and returns a message whose ``content`` is the reply text.
    response = llm.invoke(prompt)
    return response.content
44
+
45
def process_pdf(file):
    """Extract the text of an uploaded PDF and return a summary of it.

    Args:
        file: The value delivered by the Gradio file input (passed straight
            to ``PyPDF2.PdfReader``), or ``None`` when nothing was uploaded.

    Returns:
        The summary text, or a short explanatory message when no file was
        uploaded or no text could be extracted from the PDF.
    """
    if file is None:
        return "No file uploaded."

    # Reuse the shared extraction helper instead of duplicating its loop.
    text = extract_text_from_pdf(file)

    # Image-only / scanned PDFs yield no text; don't send an empty prompt.
    if not text.strip():
        return "No extractable text found in the PDF."

    # summarize_text() already caps the prompt size, so no slicing here.
    return summarize_text(text)
64
+
65
# Build the Gradio UI: one PDF upload in, plain-text summary out
pdf_upload = gr.File(label="Upload a PDF")
interface = gr.Interface(
    fn=process_pdf,
    inputs=pdf_upload,
    outputs="text",
    title="📄 PDF Summarizer",
    description="Upload a PDF file and get a summary",
)

# Start serving the app
interface.launch()