# pip install gradio transformers optimum onnxruntime onnx beautifulsoup4 langdetect deep-translator requests torch
"""Gradio app: scrape a web page, translate it to English, and summarize it.

The page is fetched with ``requests`` (optionally after a form login),
reduced to its visible text with BeautifulSoup, translated sentence by
sentence to English when needed, and summarized with a T5 model running on
ONNX Runtime (CPU).
"""

import re
from typing import Optional, Tuple

import gradio as gr
import onnxruntime as ort
import requests
import torch
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator
from langdetect import detect
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from optimum.pipelines import pipeline
from requests.sessions import Session
from transformers import AutoTokenizer

# --- ONNX CPU optimization setup ---
sess_options = ort.SessionOptions()
sess_options.intra_op_num_threads = min(4, torch.get_num_threads())
sess_options.inter_op_num_threads = 1
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

model_name = "Rahmat82/t5-small-finetuned-summarization-xsum"
model = ORTModelForSeq2SeqLM.from_pretrained(model_name, session_options=sess_options)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # CPU
    batch_size=8,
)

# --- Scraper configuration ---

# Seconds to wait for the server before giving up; without a timeout a
# stalled server would block the UI callback forever.
REQUEST_TIMEOUT_SECONDS = 30

# Prefix of the message returned by scrape_visible_text_from_url on failure.
# scrape_and_summarize relies on it to tell errors apart from page text.
SCRAPE_ERROR_PREFIX = "Error occurred"

# Tags whose content is never treated as visible article text.
_BOILERPLATE_TAGS = [
    "script", "style", "meta", "link", "noscript",
    "header", "footer", "aside", "nav", "img",
]

_WHITESPACE_RE = re.compile(r'\s+')
_SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?]) +')


def _fetch_page(url, email=None, password=None, login_url=None):
    """Return the raw body of *url*, logging in first when credentials are given.

    When ``email``, ``password`` and ``login_url`` are all truthy, the
    credentials are POSTed to ``login_url`` as form fields ``email`` and
    ``password``; the target page is then requested on the same session so
    any cookies set by the login are sent along.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            non-success HTTP status from either request.
    """
    with Session() as session:
        if email and password and login_url:
            login_data = {'email': email, 'password': password}
            login_response = session.post(
                login_url, data=login_data, timeout=REQUEST_TIMEOUT_SECONDS
            )
            login_response.raise_for_status()
        # Always fetch the requested page itself; the login response is only
        # needed for its cookies.
        response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        return response.content


def _extract_visible_text(html, query_selector=None):
    """Return the whitespace-normalized visible text of an HTML document.

    Boilerplate tags (scripts, styles, header/footer/nav, ...) are removed
    first. If ``query_selector`` is given, only the text of the elements
    matching that CSS selector is kept; otherwise the text of ``<body>`` is
    used (empty string when the document has no body).
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(_BOILERPLATE_TAGS):
        tag.extract()

    if query_selector:
        pieces = [element.get_text() for element in soup.select(query_selector)]
    else:
        # NOTE: <header> elements are stripped above, so no separate header
        # text is collected here; only the remaining body content is used.
        body = soup.body
        pieces = [child.get_text() for child in body] if body else []

    return _WHITESPACE_RE.sub(' ', " ".join(pieces)).strip()


def _translate_to_english(text):
    """Translate the non-English sentences of *text* to English, best effort.

    The text is split after ``.``, ``!`` or ``?`` followed by spaces. Each
    sentence whose detected language is not English is sent to the
    translator; a sentence is kept unchanged when detection or translation
    fails or yields nothing.
    """
    if not text:
        return ""

    translator = GoogleTranslator(source='auto', target='en')
    translated_sentences = []
    for sentence in _SENTENCE_SPLIT_RE.split(text):
        try:
            if detect(sentence) != 'en':
                # Fall back to the original if the translator returns None/"".
                translated_sentences.append(translator.translate(sentence) or sentence)
            else:
                translated_sentences.append(sentence)
        except Exception:
            # Deliberate best effort: one undetectable or untranslatable
            # sentence must not discard the rest of the page.
            translated_sentences.append(sentence)
    return ' '.join(translated_sentences)


# --- Scraper function ---
def scrape_visible_text_from_url(
    url: str,
    query_selector: Optional[str] = None,
    email: Optional[str] = None,
    password: Optional[str] = None,
    login_url: Optional[str] = None,
) -> str:
    """Scrape the visible text of *url* and return it in English.

    Args:
        url: Page to scrape.
        query_selector: Optional CSS selector restricting which elements
            contribute text.
        email: Login e-mail; used only together with ``password`` and
            ``login_url``.
        password: Login password.
        login_url: URL the login form is POSTed to before fetching ``url``.

    Returns:
        The translated, whitespace-normalized text. On any failure a message
        starting with ``"Error occurred while scraping: "`` is returned
        instead of raising.
    """
    try:
        html = _fetch_page(url, email, password, login_url)
        visible_text = _extract_visible_text(html, query_selector)
        return _translate_to_english(visible_text)
    except Exception as e:
        return f"Error occurred while scraping: {e}"


# --- Main function for Gradio ---
def scrape_and_summarize(url, query_selector, email, password, login_url) -> Tuple[str, str]:
    """Scrape a page and summarize its text; Gradio click handler.

    Returns:
        A ``(scraped_text, summary)`` pair. When scraping fails or finds no
        text, the first element carries the message and the summary is
        empty. When summarization fails, the scraped text is still returned
        and the second element carries the error message.
    """
    scraped_text = scrape_visible_text_from_url(url, query_selector, email, password, login_url)

    if scraped_text.startswith(SCRAPE_ERROR_PREFIX):
        return scraped_text, ""

    if not scraped_text.strip():
        return "No text found to summarize.", ""

    # Summarize scraped text
    try:
        # Round-trip through the tokenizer to cap the input at 1024 tokens
        # before handing it to the summarization pipeline.
        inputs = tokenizer.encode(scraped_text, max_length=1024, truncation=True, return_tensors="pt")
        input_text = tokenizer.decode(inputs[0], skip_special_tokens=True)

        summary = summarizer(
            input_text,
            min_length=90,
            max_length=120,
            do_sample=False,
        )
        return scraped_text, summary[0]["summary_text"]
    except Exception as e:
        return scraped_text, f"Error during summarization: {e}"


# --- Gradio Interface ---
with gr.Blocks() as app:
    gr.Markdown("# 🌐 Web Scraper + ⚙️ ONNX T5 Summarizer")

    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1)
            query_selector_input = gr.Textbox(label="CSS Query Selector (optional)", placeholder=".article p", lines=1)
            email_input = gr.Textbox(label="Email (if login required)", lines=1)
            password_input = gr.Textbox(label="Password (if login required)", type="password", lines=1)
            login_url_input = gr.Textbox(label="Login URL (if login required)", lines=1)
            submit_btn = gr.Button("Scrape & Summarize")

        with gr.Column():
            scraped_output = gr.Textbox(label="Scraped Text", lines=15)
            summary_output = gr.Textbox(label="Summary", lines=8)

    submit_btn.click(
        fn=scrape_and_summarize,
        inputs=[url_input, query_selector_input, email_input, password_input, login_url_input],
        outputs=[scraped_output, summary_output],
    )

if __name__ == "__main__":
    app.launch()