Spaces:
Sleeping
Sleeping
File size: 4,307 Bytes
f1691d8 a43e6d5 f1691d8 c97532f 0dd7ae7 162dd8b f1691d8 bf3ac4c 367b557 e50a70a 0dd7ae7 e50a70a f1691d8 e292744 a1758a8 e50a70a 8639815 f6a07f3 f1691d8 9e89ef8 162dd8b 026783f a43e6d5 28f0884 c97532f 28f0884 c97532f a43e6d5 28f0884 a43e6d5 28f0884 a43e6d5 28f0884 a84c0e0 a43e6d5 a84c0e0 9e89ef8 b975282 162dd8b 026783f 241247a 026783f a84c0e0 dc3c24e a43e6d5 dc3c24e b975282 1dcae0c 65033bf a43e6d5 65033bf 162dd8b 8639815 162dd8b b9c9dac 162dd8b f1691d8 162dd8b f1691d8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
# FutureWarning output is silenced before the third-party imports below,
# presumably so that warnings raised while importing them are hidden as well.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import logging
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)
import fitz  # PyMuPDF
import gradio as gr
from langchain.prompts import PromptTemplate
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langdetect import detect
# Chunk size in characters (not tokens): the document text is sliced into
# pieces of this length before being sent to the model, and language
# detection only looks at the first CONTEXT_WINDOW characters.
CONTEXT_WINDOW = 50_000
from transformers import BitsAndBytesConfig
# 4-bit NF4 quantization settings (float16 compute dtype, double quantization)
# that are forwarded to the endpoint through ``model_kwargs`` below.
# NOTE(review): BitsAndBytesConfig is normally consumed when loading weights
# locally; confirm that the hosted endpoint accepts or ignores this object.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True
)
# Text-generation client used by ``summarize`` for every chunk of the document.
# NOTE(review): ``temperature`` is set while ``do_sample`` is False; confirm
# which decoding mode is actually intended.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-Nemo-Instruct-2407", #"mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    model_kwargs={"quantization_config": quantization_config},
    max_new_tokens=4096,
    temperature=0.5,
    do_sample=False,
)
#llm_engine_hf = ChatHuggingFace(llm=llm)
def read_pdf(file_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.

    Returns:
        The concatenated text of all pages. If the document yields no text,
        or if it cannot be read, a human-readable message is returned
        instead of raising (the same convention ``read_txt`` follows).
    """
    logger.info("Reading a PDF file")
    try:
        # The context manager closes the document even when a page fails to
        # parse, so the file handle is never leaked.
        with fitz.open(file_path) as pdf_document:
            text = "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        error_message = f"Error reading PDF file: {e}"
        logger.error(error_message)
        return error_message

    if not text.strip():
        message = "PDF contains no text. It may be due to the PDF being password-protected, collapsed, or full of images."
        logger.info(message)
        return message
    return text
def read_txt(file_path):
    """Return the contents of a UTF-8 text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents, or an error message string when reading fails.
    """
    logger.info("Reading a TXT file")
    try:
        with open(file_path, encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as exc:
        failure = f"Error reading TXT file: {exc}"
        logger.error(failure)
        return failure
    else:
        return contents
def summarize(file):
    """Summarize an uploaded PDF or text file as a bulleted list.

    The document is read, its language is detected from the first
    ``CONTEXT_WINDOW`` characters, and the text is split into
    ``CONTEXT_WINDOW``-character chunks. Each chunk is summarized separately
    by ``llm`` and the partial summaries are joined with blank lines.

    Args:
        file: The upload handed over by Gradio: either an object exposing
            its path as ``.name`` or a plain path string. ``None`` when
            nothing has been uploaded.

    Returns:
        The combined summary, or a short message when there is nothing to
        summarize.
    """
    if file is None:
        return "Please upload a file to summarize."

    # Imported here because only this function needs it; the package itself
    # is already a dependency of the module (``detect``).
    from langdetect import LangDetectException

    # Read the content of the uploaded file
    file_path = file if isinstance(file, str) else file.name
    # Compare case-insensitively so that "REPORT.PDF" is not read as text.
    if file_path.lower().endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        text = read_txt(file_path)
    # NOTE: both readers report failures by returning the error message as
    # their result, so such a message is summarized like regular content.
    logger.info("Length of text is %d", len(text))

    if not text.strip():
        # Language detection raises on empty input and there would be no
        # chunk to summarize anyway.
        return "The document contains no text to summarize."

    try:
        lang = detect(text[:CONTEXT_WINDOW])
    except LangDetectException:
        # Raised when the sample has no usable features (e.g. only digits).
        logger.warning("Could not detect the document language; defaulting to 'en'")
        lang = "en"

    template_summarize = '''
Please carefully read the following document:
<document>
{TEXT}
</document>
After reading through the document, pinpoint the key points and main ideas covered in the text.
Organize these key points into a concise bulleted list that summarizes the essential information from the document.
The summary should be in {LANG} language.
'''
    prompt_summarize = PromptTemplate(
        template=template_summarize,
        input_variables=["TEXT", "LANG"]
    )

    # One model call per chunk; chunks are consecutive, non-overlapping slices.
    summaries = [
        llm.invoke(
            prompt_summarize.format(
                TEXT=text[start:start + CONTEXT_WINDOW],
                LANG=lang,
            )
        )
        for start in range(0, len(text), CONTEXT_WINDOW)
    ]
    logger.info("Chunked into %d.", len(summaries))

    return "\n\n".join(summaries)
def download_summary(output_text):
    """Write the summary to ``summary.txt`` and return that path.

    Args:
        output_text: The summary text to save.

    Returns:
        ``Path('summary.txt')`` after writing, or ``None`` (without writing
        anything) when ``output_text`` is empty.
    """
    if not output_text:
        return None

    destination = Path('summary.txt')
    with destination.open('w', encoding='utf-8') as handle:
        handle.write(output_text)
    return destination
def create_download_file(summary_text):
    """Save the summary via ``download_summary`` and return the path as a string.

    Returns ``None`` when ``download_summary`` did not produce a file.
    """
    saved_path = download_summary(summary_text)
    if not saved_path:
        return None
    return str(saved_path)
# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Document Summarizer")

    with gr.Row():
        with gr.Column():
            file = gr.File(label="Submit a file")
        with gr.Column():
            output_text = gr.Textbox(label="Summary", lines=20)

    # Uploading a file and pressing the button fills the summary textbox.
    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file], outputs=output_text)

    # The current textbox value reaches the callback as an event input, so
    # no separate helper is needed to read it; the resulting file path is
    # shown in a File component created for the download.
    download_button = gr.Button("Download Summary")
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=gr.File(),
    )

# Run the Gradio app
demo.launch(share=True)