# pip install gradio transformers optimum onnxruntime onnx beautifulsoup4 langdetect deep-translator requests torch sentencepiece
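"""
Gradio app: scrape the visible text of a web page (optionally behind a
simple form login), translate non-English sentences to English, and
summarize the result on CPU with an ONNX-exported T5 model.
"""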

import gradio as gr
from requests import Session
from bs4 import BeautifulSoup
import re
from langdetect import detect
from deep_translator import GoogleTranslator

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from optimum.pipelines import pipeline
import onnxruntime as ort
import torch

# --- ONNX CPU optimization setup ---
# intra_op threads parallelize work inside a single operator; capping at 4
# avoids oversubscribing small CPUs. inter_op stays at 1 (no cross-operator
# parallelism), and graph optimizations are enabled in full.
sess_options = ort.SessionOptions()
sess_options.intra_op_num_threads = min(4, torch.get_num_threads())
sess_options.inter_op_num_threads = 1
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

model_name = "Rahmat82/t5-small-finetuned-summarization-xsum"
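# NOTE: if the Hub repo ships only PyTorch weights, the model may need to be
# exported to ONNX at load time, i.e. from_pretrained(..., export=True)
# (an assumption about this particular checkpoint).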
model = ORTModelForSeq2SeqLM.from_pretrained(model_name, session_options=sess_options)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # CPU
    batch_size=8,
)
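# The pipeline returns a list of dicts; the summary string lives under
# "summary_text", e.g. summarizer(text)[0]["summary_text"].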

# --- Scraper function ---
def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None, login_url=None):
    try:
        session = Session()

        if email and password and login_url:
            # NOTE: the form field names 'email' and 'password' are assumptions
            # about the target site's login form and may need adjusting.
            login_data = {'email': email, 'password': password}
            login_response = session.post(login_url, data=login_data)
            login_response.raise_for_status()

        # Fetch the target page; session cookies from the login (if any) persist.
        response = session.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Capture the header text *before* boilerplate tags are stripped below;
        # otherwise soup.find("header") would always return None.
        header = soup.find("header")
        header_text = header.get_text(" ", strip=True) if header else ""

        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
            tag.extract()

        if query_selector:
            elements = soup.select(query_selector)
            text_content = " ".join(element.get_text(" ", strip=True) for element in elements)
        else:
            # Iterating soup.body yields raw child nodes (including bare strings),
            # so take the body's full text instead.
            body = soup.body
            body_text = body.get_text(" ", strip=True) if body else ""
            text_content = f"{header_text}\n\n{body_text}"

        visible_text = re.sub(r'\s+', ' ', text_content).strip()

        # Detect the language per sentence so mixed-language pages are handled;
        # only non-English sentences get translated.
        translator = GoogleTranslator(source='auto', target='en')
        sentences = re.split(r'(?<=[.!?]) +', visible_text)
        translated_sentences = []
        for sentence in sentences:
            try:
                lang = detect(sentence)
                if lang != 'en':
                    translated_sentence = translator.translate(sentence)
                    translated_sentences.append(translated_sentence)
                else:
                    translated_sentences.append(sentence)
            except Exception:
                # langdetect raises on very short or symbol-only strings;
                # fall back to the untranslated sentence.
                translated_sentences.append(sentence)
        translated_text = ' '.join(translated_sentences)

        return translated_text

    except Exception as e:
        return f"Error occurred while scraping: {e}"

# --- Main function for Gradio ---
def scrape_and_summarize(url, query_selector, email, password, login_url):
    scraped_text = scrape_visible_text_from_url(url, query_selector, email, password, login_url)
    # The scraper reports failures via its error-string sentinel.
    if scraped_text.startswith("Error occurred"):
        return scraped_text, ""
    if not scraped_text.strip():
        return "No text found to summarize.", ""

    # Summarize the scraped text. The encode/decode round-trip truncates the
    # input to a 1024-token budget before it reaches the pipeline.
    try:
        inputs = tokenizer.encode(scraped_text, max_length=1024, truncation=True, return_tensors="pt")
        input_text = tokenizer.decode(inputs[0], skip_special_tokens=True)

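        # do_sample=False keeps decoding deterministic (greedy/beam search
        # rather than sampling).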
        summary = summarizer(
            input_text,
            min_length=90,
            max_length=120,
            do_sample=False
        )
        return scraped_text, summary[0]["summary_text"]
    except Exception as e:
        return scraped_text, f"Error during summarization: {e}"

# --- Gradio Interface ---
with gr.Blocks() as app:
    gr.Markdown("# 🌐 Web Scraper + ⚙️ ONNX T5 Summarizer")

    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1)
            query_selector_input = gr.Textbox(label="CSS Query Selector (optional)", placeholder=".article p", lines=1)
            email_input = gr.Textbox(label="Email (if login required)", lines=1)
            password_input = gr.Textbox(label="Password (if login required)", type="password", lines=1)
            login_url_input = gr.Textbox(label="Login URL (if login required)", lines=1)
            submit_btn = gr.Button("Scrape & Summarize")

        with gr.Column():
            scraped_output = gr.Textbox(label="Scraped Text", lines=15)
            summary_output = gr.Textbox(label="Summary", lines=8)

    submit_btn.click(
        fn=scrape_and_summarize,
        inputs=[url_input, query_selector_input, email_input, password_input, login_url_input],
        outputs=[scraped_output, summary_output]
    )

if __name__ == "__main__":
    app.launch()
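    # Tip: app.launch(share=True) also serves a temporary public link.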