# pip install gradio transformers optimum onnxruntime onnx beautifulsoup4 langdetect deep-translator requests torch
import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from requests.sessions import Session
from langdetect import detect
from deep_translator import GoogleTranslator
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from optimum.pipelines import pipeline
import onnxruntime as ort
import torch
# --- ONNX CPU optimization setup ---
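# Cap intra-op threads to avoid oversubscription on small/shared CPU hosts;
# a single inter-op thread suits a sequential seq2seq graph, and
# ORT_ENABLE_ALL turns on all available graph optimizations/fusions.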
sess_options = ort.SessionOptions()
sess_options.intra_op_num_threads = min(4, torch.get_num_threads())
sess_options.inter_op_num_threads = 1
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
model_name = "Rahmat82/t5-small-finetuned-summarization-xsum"
model = ORTModelForSeq2SeqLM.from_pretrained(model_name, session_options=sess_options)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # CPU
    batch_size=8,
)
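# Quick smoke test for the summarizer (a sketch: the input below is a
# placeholder, and the output wording depends on the model weights):
#   summarizer("<a few paragraphs of article text>",
#              min_length=20, max_length=60, do_sample=False)
#   -> [{"summary_text": "..."}]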
# --- Scraper function ---
def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None, login_url=None):
    try:
        session = Session()
        # If credentials are supplied, authenticate first, then fetch the
        # target page with the same session so its cookies are reused.
        if email and password and login_url:
            login_data = {'email': email, 'password': password}
            login_response = session.post(login_url, data=login_data)
            login_response.raise_for_status()
        response = session.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Drop tags that never contain readable article text.
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
            tag.extract()
        if query_selector:
            elements = soup.select(query_selector)
            text_content = " ".join(element.get_text() for element in elements)
        else:
            # Header/footer/nav tags were already stripped above, so the
            # remaining body text is the readable page content.
            text_content = soup.body.get_text(separator=" ") if soup.body else ""
        visible_text = re.sub(r'\s+', ' ', text_content).strip()
        # Translate non-English sentences to English. Detection is done per
        # sentence because pages can mix languages; langdetect is unreliable
        # on very short strings, so failures fall back to the original text.
        translator = GoogleTranslator(source='auto', target='en')
        sentences = re.split(r'(?<=[.!?]) +', visible_text)
        translated_sentences = []
        for sentence in sentences:
            try:
                lang = detect(sentence)
                if lang != 'en':
                    translated_sentences.append(translator.translate(sentence))
                else:
                    translated_sentences.append(sentence)
            except Exception:
                # Detection or translation failed; keep the sentence as-is.
                translated_sentences.append(sentence)
        translated_text = ' '.join(translated_sentences)
        return translated_text
    except Exception as e:
        return f"Error occurred while scraping: {e}"
# --- Main function for Gradio ---
def scrape_and_summarize(url, query_selector, email, password, login_url):
    scraped_text = scrape_visible_text_from_url(url, query_selector, email, password, login_url)
    if scraped_text.startswith("Error occurred"):
        return scraped_text, ""
    if not scraped_text.strip():
        return "No text found to summarize.", ""
    # Summarize the scraped text
    try:
        # Cap the input length: encode with a hard token limit, then decode
        # back to text so the pipeline receives a bounded prompt.
        inputs = tokenizer.encode(scraped_text, max_length=1024, truncation=True, return_tensors="pt")
        input_text = tokenizer.decode(inputs[0], skip_special_tokens=True)
        summary = summarizer(
            input_text,
            min_length=90,
            max_length=120,
            do_sample=False,
        )
        return scraped_text, summary[0]["summary_text"]
    except Exception as e:
        return scraped_text, f"Error during summarization: {e}"
# --- Gradio Interface ---
with gr.Blocks() as app:
    gr.Markdown("# 🌐 Web Scraper + ⚙️ ONNX T5 Summarizer")
    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1)
            query_selector_input = gr.Textbox(label="CSS Query Selector (optional)", placeholder=".article p", lines=1)
            email_input = gr.Textbox(label="Email (if login required)", lines=1)
            password_input = gr.Textbox(label="Password (if login required)", type="password", lines=1)
            login_url_input = gr.Textbox(label="Login URL (if login required)", lines=1)
            submit_btn = gr.Button("Scrape & Summarize")
        with gr.Column():
            scraped_output = gr.Textbox(label="Scraped Text", lines=15)
            summary_output = gr.Textbox(label="Summary", lines=8)
    submit_btn.click(
        fn=scrape_and_summarize,
        inputs=[url_input, query_selector_input, email_input, password_input, login_url_input],
        outputs=[scraped_output, summary_output],
    )
if __name__ == "__main__":
    # Use app.launch(share=True) to expose a temporary public URL.
    app.launch()