import re

import gradio as gr
import requests
from bs4 import BeautifulSoup
from requests.sessions import Session
from langdetect import detect
from deep_translator import GoogleTranslator

import onnxruntime as ort
import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from optimum.pipelines import pipeline
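
# ONNX Runtime session tuning: a sensible CPU default rather than a
# benchmarked setting. Cap intra-op parallelism, run operators
# sequentially, and enable all graph optimizations.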
sess_options = ort.SessionOptions()
sess_options.intra_op_num_threads = min(4, torch.get_num_threads())
sess_options.inter_op_num_threads = 1
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

model_name = "Rahmat82/t5-small-finetuned-summarization-xsum"
model = ORTModelForSeq2SeqLM.from_pretrained(model_name, session_options=sess_options)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
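
# Wrap the ONNX model in a transformers-style summarization pipeline.
# device=-1 keeps inference on CPU, matching the ONNX Runtime session above.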
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=-1,
    batch_size=8,
)
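
# Example (hypothetical input): summarizer("Long article text ...")[0]["summary_text"]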


def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None, login_url=None):
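    """Fetch a page (optionally after a form login), strip non-visible
    markup, and return the visible text translated to English."""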
    try:
        session = Session()

        # If credentials are supplied, log in first, then fetch the target
        # page with the same (now authenticated) session. The 'email' and
        # 'password' field names must match the login form.
        if email and password and login_url:
            login_data = {'email': email, 'password': password}
            login_response = session.post(login_url, data=login_data)
            login_response.raise_for_status()

        response = session.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Capture header text before cleanup, since the loop below removes
        # <header> along with the other non-content tags.
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""

        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
            tag.extract()

        if query_selector:
            elements = soup.select(query_selector)
            text_content = " ".join([element.get_text() for element in elements])
        else:
            body_content = soup.body
            body_text = body_content.get_text(" ") if body_content else ""
            text_content = f"{header_text}\n\n{body_text}"

        visible_text = re.sub(r'\s+', ' ', text_content).strip()
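
        # Translate sentence by sentence: language detection is unreliable on
        # long mixed-language text, and short pieces stay within the
        # translator's per-request length limit.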
        translator = GoogleTranslator(source='auto', target='en')
        sentences = re.split(r'(?<=[.!?]) +', visible_text)
        translated_sentences = []
        for sentence in sentences:
            try:
                lang = detect(sentence)
                if lang != 'en':
                    translated_sentence = translator.translate(sentence)
                    translated_sentences.append(translated_sentence)
                else:
                    translated_sentences.append(sentence)
            except Exception:
                # Detection can fail on very short or symbol-only fragments;
                # keep the original text in that case.
                translated_sentences.append(sentence)
        translated_text = ' '.join(translated_sentences)

        return translated_text

    except Exception as e:
        return f"Error occurred while scraping: {e}"


def scrape_and_summarize(url, query_selector, email, password, login_url):
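    """Scrape visible text from `url`, then summarize it with the ONNX T5 pipeline."""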
    scraped_text = scrape_visible_text_from_url(url, query_selector, email, password, login_url)
    if scraped_text.startswith("Error occurred"):
        return scraped_text, ""
    if not scraped_text.strip():
        return "No text found to summarize.", ""

    try:
        # Truncate long pages before summarizing. T5 uses relative position
        # embeddings, so inputs beyond its 512-token training length are
        # accepted, though summary quality may degrade.
        inputs = tokenizer.encode(scraped_text, max_length=1024, truncation=True, return_tensors="pt")
        input_text = tokenizer.decode(inputs[0], skip_special_tokens=True)

        summary = summarizer(
            input_text,
            min_length=90,
            max_length=120,
            do_sample=False
        )
        return scraped_text, summary[0]["summary_text"]
    except Exception as e:
        return scraped_text, f"Error during summarization: {e}"
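

# Gradio UI: scraping options on the left; scraped text and summary on the right.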
with gr.Blocks() as app:
    gr.Markdown("# 🌐 Web Scraper + ⚙️ ONNX T5 Summarizer")

    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1)
            query_selector_input = gr.Textbox(label="CSS Query Selector (optional)", placeholder=".article p", lines=1)
            email_input = gr.Textbox(label="Email (if login required)", lines=1)
            password_input = gr.Textbox(label="Password (if login required)", type="password", lines=1)
            login_url_input = gr.Textbox(label="Login URL (if login required)", lines=1)
            submit_btn = gr.Button("Scrape & Summarize")

        with gr.Column():
            scraped_output = gr.Textbox(label="Scraped Text", lines=15)
            summary_output = gr.Textbox(label="Summary", lines=8)

    submit_btn.click(
        fn=scrape_and_summarize,
        inputs=[url_input, query_selector_input, email_input, password_input, login_url_input],
        outputs=[scraped_output, summary_output]
    )


if __name__ == "__main__":
    app.launch()
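    # Pass share=True to launch() to also expose a temporary public URL.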