ar08 committed (verified)
Commit dd979da
1 Parent(s): ed4712c

Update app.py

Files changed (1)
  1. app.py +104 -29
app.py CHANGED
@@ -1,57 +1,132 @@
- # pip install gradio transformers optimum onnxruntime onnx

  import gradio as gr
  from transformers import AutoTokenizer
  from optimum.onnxruntime import ORTModelForSeq2SeqLM
  from optimum.pipelines import pipeline
  import onnxruntime as ort
  import torch

- # CPU optimization settings
  sess_options = ort.SessionOptions()
  sess_options.intra_op_num_threads = min(4, torch.get_num_threads())
  sess_options.inter_op_num_threads = 1
  sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

- # Load ONNX model and tokenizer
  model_name = "Rahmat82/t5-small-finetuned-summarization-xsum"
  model = ORTModelForSeq2SeqLM.from_pretrained(model_name, session_options=sess_options)
  tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

- # Build CPU pipeline
  summarizer = pipeline(
      "summarization",
      model=model,
      tokenizer=tokenizer,
-     device=-1,  # Force CPU
      batch_size=8,
  )

- # Summarization function
- def summarize_text(text):
-     text = text.strip()
-     if not text:
-         return "Please enter some text."
-
-     inputs = tokenizer.encode(text, max_length=1024, truncation=True, return_tensors="pt")
-     input_text = tokenizer.decode(inputs[0], skip_special_tokens=True)
-
-     summary = summarizer(
-         input_text,
-         min_length=90,
-         max_length=120,
-         do_sample=False
      )
-     return summary[0]["summary_text"]
-
- # Gradio UI
- app = gr.Interface(
-     fn=summarize_text,
-     inputs=gr.Textbox(lines=12, label="Input Text"),
-     outputs=gr.Textbox(label="Summary"),
-     title="⚙️ ONNX T5 Summarizer (CPU-Optimized)",
-     description="Fast and optimized ONNX model for summarization on CPU. No quantization warnings or deprecated cache used."
- )

  if __name__ == "__main__":
      app.launch()
 
+ # pip install gradio transformers optimum onnxruntime onnx beautifulsoup4 langdetect googletrans==4.0.0-rc1 requests

  import gradio as gr
+ import requests
+ from bs4 import BeautifulSoup
+ import re
+ from requests.sessions import Session
+ from langdetect import detect
+ from googletrans import Translator
+
  from transformers import AutoTokenizer
  from optimum.onnxruntime import ORTModelForSeq2SeqLM
  from optimum.pipelines import pipeline
  import onnxruntime as ort
  import torch

+ # --- ONNX CPU optimization setup ---
  sess_options = ort.SessionOptions()
  sess_options.intra_op_num_threads = min(4, torch.get_num_threads())
  sess_options.inter_op_num_threads = 1
  sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

  model_name = "Rahmat82/t5-small-finetuned-summarization-xsum"
  model = ORTModelForSeq2SeqLM.from_pretrained(model_name, session_options=sess_options)
  tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

  summarizer = pipeline(
      "summarization",
      model=model,
      tokenizer=tokenizer,
+     device=-1,  # CPU
      batch_size=8,
  )

+ # --- Scraper function ---
+ def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None, login_url=None):
+     try:
+         session = Session()
+
+         if email and password and login_url:
+             login_data = {'email': email, 'password': password}
+             response = session.post(login_url, data=login_data)
+             response.raise_for_status()
+         else:
+             response = session.get(url)
+             response.raise_for_status()
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
+             tag.extract()
+
+         if query_selector:
+             elements = soup.select(query_selector)
+             text_content = " ".join([element.get_text() for element in elements])
+         else:
+             header_content = soup.find("header")
+             header_text = header_content.get_text() if header_content else ""
+             paragraph_content = soup.body
+             paragraph_text = " ".join([p.get_text() for p in paragraph_content]) if paragraph_content else ""
+             text_content = f"{header_text}\n\n{paragraph_text}"
+
+         visible_text = re.sub(r'\s+', ' ', text_content).strip()
+
+         translator = Translator()
+         sentences = re.split(r'(?<=[.!?]) +', visible_text)
+         translated_sentences = []
+         for sentence in sentences:
+             try:
+                 lang = detect(sentence)
+                 if lang != 'en':
+                     translation = translator.translate(sentence, dest='en').text
+                     translated_sentences.append(translation)
+                 else:
+                     translated_sentences.append(sentence)
+             except Exception:
+                 translated_sentences.append(sentence)
+         translated_text = ' '.join(translated_sentences)
+
+         return translated_text
+
+     except Exception as e:
+         return f"Error occurred while scraping: {e}"
+
+ # --- Main function for Gradio ---
+ def scrape_and_summarize(url, query_selector, email, password, login_url):
+     scraped_text = scrape_visible_text_from_url(url, query_selector, email, password, login_url)
+     if scraped_text.startswith("Error occurred"):
+         return scraped_text, ""
+     if not scraped_text.strip():
+         return "No text found to summarize.", ""
+
+     # Summarize scraped text
+     try:
+         inputs = tokenizer.encode(scraped_text, max_length=1024, truncation=True, return_tensors="pt")
+         input_text = tokenizer.decode(inputs[0], skip_special_tokens=True)
+
+         summary = summarizer(
+             input_text,
+             min_length=90,
+             max_length=120,
+             do_sample=False
+         )
+         return scraped_text, summary[0]["summary_text"]
+     except Exception as e:
+         return scraped_text, f"Error during summarization: {e}"
+
+ # --- Gradio Interface ---
+ with gr.Blocks() as app:
+     gr.Markdown("# 🌐 Web Scraper + ⚙️ ONNX T5 Summarizer")
+
+     with gr.Row():
+         with gr.Column():
+             url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1)
+             query_selector_input = gr.Textbox(label="CSS Query Selector (optional)", placeholder=".article p", lines=1)
+             email_input = gr.Textbox(label="Email (if login required)", lines=1)
+             password_input = gr.Textbox(label="Password (if login required)", type="password", lines=1)
+             login_url_input = gr.Textbox(label="Login URL (if login required)", lines=1)
+             submit_btn = gr.Button("Scrape & Summarize")
+
+         with gr.Column():
+             scraped_output = gr.Textbox(label="Scraped Text", lines=15)
+             summary_output = gr.Textbox(label="Summary", lines=8)
+
+     submit_btn.click(
+         fn=scrape_and_summarize,
+         inputs=[url_input, query_selector_input, email_input, password_input, login_url_input],
+         outputs=[scraped_output, summary_output]
      )

  if __name__ == "__main__":
      app.launch()
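
For reference, the new scrape_and_summarize helper can also be exercised outside the Gradio UI. A minimal sketch, assuming the updated app.py is importable as a module named app; the module name and the example URL are illustrative and not part of this commit:

    from app import scrape_and_summarize

    # Scrape https://example.com anonymously (no CSS selector, no login), then summarize.
    # Empty strings fall through to the default extraction and plain GET code paths.
    scraped_text, summary = scrape_and_summarize(
        "https://example.com",  # url (illustrative)
        "",                     # query_selector: optional CSS selector, e.g. ".article p"
        "", "", "",             # email, password, login_url: leave empty when no login is needed
    )
    print(summary)

Note that importing app loads the ONNX model and builds the Blocks UI, but app.launch() only runs when the file is executed directly.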