# Page residue from the hosting site's file view (not code):
# author "ar08" · commit db5824c "Update app.py" (verified)
# pip install gradio transformers optimum onnxruntime onnx beautifulsoup4 langdetect deep-translator requests torch
import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from requests.sessions import Session
from langdetect import detect
from deep_translator import GoogleTranslator
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from optimum.pipelines import pipeline
import onnxruntime as ort
import torch
# --- Summarization model: T5-small exported to ONNX, run on CPU ---
model_name = "Rahmat82/t5-small-finetuned-summarization-xsum"

# ONNX Runtime session settings: enable every graph optimization level,
# keep operators sequential (one inter-op thread) and cap the threads used
# inside a single operator at 4.
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.inter_op_num_threads = 1
sess_options.intra_op_num_threads = min(4, torch.get_num_threads())

# Tokenizer and model are loaded independently from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = ORTModelForSeq2SeqLM.from_pretrained(
    model_name,
    session_options=sess_options,
)

# Shared pipeline object used by scrape_and_summarize().
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # CPU
    batch_size=8,
)
# --- Scraper function ---
def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None, login_url=None, timeout=30):
    """Download a page, extract its visible text and translate it to English.

    Args:
        url: Address of the page whose text should be extracted.
        query_selector: Optional CSS selector. When given, only the text of
            the matching elements is returned; otherwise the text of the
            children of ``<body>`` is used.
        email: Optional login e-mail. Login is attempted only when ``email``,
            ``password`` and ``login_url`` are all non-empty.
        password: Optional login password.
        login_url: Optional URL that receives the login form POST
            (fields ``email`` and ``password``).
        timeout: Seconds to wait for each HTTP request before giving up
            (passed straight to ``requests``).

    Returns:
        The whitespace-normalised page text, with sentences that are detected
        as non-English replaced by their translation. On any failure the
        function does not raise; it returns a string starting with
        ``"Error occurred while scraping: "``.
    """
    try:
        # The context manager releases the session's connections on every path.
        with Session() as session:
            if email and password and login_url:
                login_response = session.post(
                    login_url,
                    data={'email': email, 'password': password},
                    timeout=timeout,
                )
                login_response.raise_for_status()
            # Always fetch the requested page. After a login the same session
            # is reused so cookies set by the login response are sent along;
            # previously the login response itself was parsed and `url` ignored.
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            page_bytes = response.content

        soup = BeautifulSoup(page_bytes, 'html.parser')
        # Drop non-content markup and page chrome before extracting text.
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
            tag.extract()

        if query_selector:
            text_content = " ".join(element.get_text() for element in soup.select(query_selector))
        else:
            # <header> was removed above, so only the body text remains to collect.
            body = soup.body
            text_content = " ".join(child.get_text() for child in body) if body is not None else ""

        visible_text = re.sub(r'\s+', ' ', text_content).strip()
        if not visible_text:
            return ""

        translator = GoogleTranslator(source='auto', target='en')
        translated_sentences = []
        for sentence in re.split(r'(?<=[.!?]) +', visible_text):
            try:
                if detect(sentence) != 'en':
                    translated_sentences.append(translator.translate(sentence))
                else:
                    translated_sentences.append(sentence)
            except Exception:
                # Best effort: when detection or translation fails for one
                # sentence, keep the original text instead of aborting.
                translated_sentences.append(sentence)
        return ' '.join(translated_sentences)
    except Exception as e:
        return f"Error occurred while scraping: {e}"
# --- Main function for Gradio ---
def scrape_and_summarize(url, query_selector, email, password, login_url):
    """Scrape a page and summarize its text; callback for the Gradio button.

    Args:
        url: Page to scrape.
        query_selector: Optional CSS selector restricting the scraped text.
        email: Optional login e-mail forwarded to the scraper.
        password: Optional login password forwarded to the scraper.
        login_url: Optional login form URL forwarded to the scraper.

    Returns:
        A ``(scraped_text, summary)`` tuple of strings. When scraping fails
        the first item is the scraper's error message and the second is empty;
        when no text is found the first item is ``"No text found to
        summarize."``; when summarization fails the second item is an error
        message starting with ``"Error during summarization: "``.
    """
    scraped_text = scrape_visible_text_from_url(url, query_selector, email, password, login_url)

    # The scraper reports failures in-band. Match its full message prefix so a
    # page whose text merely begins with "Error occurred" is still summarized.
    if scraped_text.startswith("Error occurred while scraping:"):
        return scraped_text, ""
    if not scraped_text.strip():
        return "No text found to summarize.", ""

    try:
        # Truncate on token boundaries, then hand the truncated text to the pipeline.
        token_ids = tokenizer.encode(scraped_text, max_length=1024, truncation=True)
        input_text = tokenizer.decode(token_ids, skip_special_tokens=True)

        # min_length forces the generator to keep producing tokens, so fixed
        # 90/120 bounds make short inputs yield padded, repetitive output.
        # Scale the bounds with the input; inputs of 120+ tokens keep 90/120.
        n_tokens = len(token_ids)
        max_length = min(120, max(n_tokens, 16))
        min_length = min(90, n_tokens * 3 // 4)

        summary = summarizer(
            input_text,
            min_length=min_length,
            max_length=max_length,
            do_sample=False,
        )
        return scraped_text, summary[0]["summary_text"]
    except Exception as e:
        return scraped_text, f"Error during summarization: {e}"
# --- Gradio UI: form on the left, results on the right ---
with gr.Blocks() as app:
    gr.Markdown("# 🌐 Web Scraper + ⚙️ ONNX T5 Summarizer")

    with gr.Row():
        # Input column: target page, optional selector and optional login details.
        with gr.Column():
            url_input = gr.Textbox(
                lines=1,
                label="Enter URL",
                placeholder="https://example.com",
            )
            query_selector_input = gr.Textbox(
                lines=1,
                label="CSS Query Selector (optional)",
                placeholder=".article p",
            )
            email_input = gr.Textbox(lines=1, label="Email (if login required)")
            password_input = gr.Textbox(
                lines=1,
                label="Password (if login required)",
                type="password",
            )
            login_url_input = gr.Textbox(lines=1, label="Login URL (if login required)")
            submit_btn = gr.Button("Scrape & Summarize")

        # Output column: raw scraped text and the generated summary.
        with gr.Column():
            scraped_output = gr.Textbox(lines=15, label="Scraped Text")
            summary_output = gr.Textbox(lines=8, label="Summary")

    # Wire the button to the scrape-and-summarize callback.
    submit_btn.click(
        fn=scrape_and_summarize,
        inputs=[
            url_input,
            query_selector_input,
            email_input,
            password_input,
            login_url_input,
        ],
        outputs=[scraped_output, summary_output],
    )

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    app.launch()